In [None]:
import pandas as pd
import dotenv
import os

from libraries.client_stashapp import get_stashapp_client

# Load environment variables (including TPDB_API_KEY) via python-dotenv
dotenv.load_dotenv()

# Client object used for every read from and write to Stash in this notebook
stash = get_stashapp_client()

# HTTP headers sent with each TPDB API request; fails fast with KeyError
# when TPDB_API_KEY is not set
tpdb_headers = {
    "Content-Type": "application/json",
    "Authorization": "Bearer " + os.environ["TPDB_API_KEY"],
}

# Load data from Stash

In [None]:
# Stash studio IDs used to restrict the scene query below to these studios
studio_ids = [81]

In [None]:
# Ask Stash for scenes of the selected studios that have no stash ID for the
# TPDB endpoint yet, requesting only the fields needed for matching.
tpdb_unmatched_scenes = stash.find_scenes(
    {
        # Scene filter: no stash ID recorded for the TPDB endpoint ...
        "stash_id_endpoint": {
            "endpoint": "https://theporndb.net/graphql",
            "modifier": "IS_NULL"
        },
        # ... and the scene belongs to one of `studio_ids`
        "studios": {
            "value": studio_ids,
            "modifier": "INCLUDES"
        }
    },
    {
        # NOTE(review): presumably only one page of 100 scenes (ordered by date)
        # is returned per run -- confirm against find_scenes' paging behaviour
        "per_page": 100,
        "sort": "date"
    },
    fragment="""
        id
        stash_ids {
            endpoint
            stash_id
        }
        title
        date
        urls
        studio {
            name
            url
        }
        files {
            fingerprints {
                type
                value
            }
        }
    """
)
# One DataFrame row per returned scene; columns follow the fragment fields
df = pd.DataFrame(tpdb_unmatched_scenes)

def is_tpdb_endpoint_missing(stash_ids):
    """Return True when no entry in ``stash_ids`` points at the TPDB endpoint.

    Anything that is not a list is treated as "endpoint missing".
    """
    if not isinstance(stash_ids, list):
        return True

    for entry in stash_ids:
        if entry.get("endpoint") == "https://theporndb.net/graphql":
            return False
    return True

def get_single_phash_from_files(files):
    """Return the phash of a scene that has exactly one file.

    Returns None when the scene has zero or several files, when the file has
    no ``fingerprints`` list, or when none of its fingerprints is a phash.
    """
    if len(files) != 1:
        return None

    only_file = files[0]
    if "fingerprints" not in only_file:
        return None

    candidates = only_file["fingerprints"]
    if not isinstance(candidates, list):
        return None

    # First fingerprint of type "phash" wins; None when there is none
    return next((fp["value"] for fp in candidates if fp["type"] == "phash"), None)

def get_stashdb_uuid(stash_ids):
    """Return the StashDB stash ID found in ``stash_ids``, or None.

    ``stash_ids`` is expected to be a list of dicts with ``endpoint`` and
    ``stash_id`` keys. A value that is not a list yields None instead of
    raising, matching how ``is_tpdb_endpoint_missing`` treats the same column.
    """
    if not isinstance(stash_ids, list):
        return None

    for entry in stash_ids:
        if entry.get("endpoint") == "https://stashdb.org/graphql":
            return entry.get("stash_id")
    return None

# Filter dataframe to only include rows where the TPDB endpoint is missing.
# The explicit .copy() makes `df` an independent frame, so the column
# assignments below are not made on a filtered slice of the original
# (the pattern that triggers pandas' SettingWithCopyWarning).
df = df[df["stash_ids"].apply(is_tpdb_endpoint_missing)].copy()

# Derived columns used for TPDB lookup and for comparing match candidates
df["phash"] = df["files"].apply(get_single_phash_from_files)
df["stashdb_uuid"] = df["stash_ids"].apply(get_stashdb_uuid)
df["studio_name"] = df["studio"].apply(lambda x: x["name"])

# The nested studio dict is no longer needed once its name is extracted
df = df.drop(columns=["studio"])

In [None]:
# Match with TPDB

In [None]:
import pandas as pd
import requests

# Function to fetch scene data by phash and store directly in a list
def fetch_scene_data(df, headers):
    """Look up every scene of ``df`` on TPDB by perceptual hash.

    Parameters:
        df: DataFrame with at least the columns ``id`` and ``phash``.
        headers: HTTP headers (including authorization) sent with each request.

    Returns:
        DataFrame with one row per scene that produced at least one TPDB hit,
        with the columns ``id``, ``stash_scene`` (the source row as a dict) and
        ``matches`` (list of dicts with ``tpdb_*`` keys). Rows without a phash,
        non-200 responses and empty results contribute no row.
    """
    # List to store the scene data for DataFrame creation
    scenes_data_list = []

    for _, row in df.iterrows():
        phash = row["phash"]

        # Scenes without a single usable phash cannot be looked up; skipping
        # them avoids sending a literal "hash=None" query.
        if pd.isna(phash):
            continue

        response = requests.get(
            "https://api.theporndb.net/scenes",
            params={"hash": phash, "hashType": "PHASH"},
            headers=headers,
            timeout=30,  # do not let one stalled request hang the whole cell
        )
        if response.status_code != 200:
            continue

        response_json = response.json()
        if not response_json:
            continue

        scenes_data = response_json.get("data", [])
        if not scenes_data:
            continue

        matches = []
        for scene_data in scenes_data:
            # "site" may be present but null; fall back to an empty dict so
            # the nested lookups yield None instead of raising.
            site = scene_data.get("site") or {}
            matches.append({
                "tpdb_date": scene_data.get("date"),
                "tpdb_title": scene_data.get("title"),
                "tpdb_uuid": scene_data.get("id"),
                "tpdb_url": scene_data.get("url"),
                "tpdb_studio_id": site.get("uuid"),
                "tpdb_studio_name": site.get("name"),
                "tpdb_studio_url": site.get("url"),
            })

        scenes_data_list.append({
            "id": row["id"],
            "stash_scene": row.to_dict(),  # Store the whole row as a dict
            "matches": matches,
        })

    # Create a DataFrame from the list of scene data
    return pd.DataFrame(scenes_data_list)

# Query TPDB by phash for the rows of `df`. `scene_df` holds one row per Stash
# scene that got at least one TPDB hit, with columns `id`, `stash_scene`, `matches`.
scene_df = fetch_scene_data(df, tpdb_headers)

In [None]:
# Analyzing the results

In [None]:
import pandas as pd
from difflib import SequenceMatcher

# Function to evaluate a match
def evaluate_match(stash_scene, match):
    """Score how well one TPDB candidate agrees with a Stash scene.

    Parameters:
        stash_scene: mapping with the keys ``date``, ``title``, ``studio_name``
            and ``urls``.
        match: mapping with the keys ``tpdb_date``, ``tpdb_title``,
            ``tpdb_studio_name`` and ``tpdb_url``.

    Returns:
        Integer score: +1 for dates less than a day apart, +2 for an identical
        title (case/whitespace-insensitive) or +1 for a similarity ratio above
        0.9, +1 for an identical studio name, +1 for an identical URL
        (ignoring "www."). Fields missing on either side add nothing.
    """
    score = 0

    # Date: only comparable when both sides actually have one
    if pd.notna(match["tpdb_date"]) and pd.notna(stash_scene["date"]):
        date_gap = pd.to_datetime(stash_scene["date"]) - pd.to_datetime(match["tpdb_date"])
        if abs(date_gap) < pd.Timedelta(days=1):
            score += 1

    # Title: a scene without a title must not crash on .strip()
    stash_title = stash_scene["title"]
    tpdb_title = match["tpdb_title"]
    if isinstance(stash_title, str) and isinstance(tpdb_title, str):
        clean_title = stash_title.strip().upper()
        clean_tpdb_title = tpdb_title.strip().upper()
        if clean_title == clean_tpdb_title:
            score += 2  # Higher weight for exact match
        elif SequenceMatcher(None, clean_title, clean_tpdb_title).ratio() > 0.9:
            score += 1

    # Studio
    if pd.notna(match["tpdb_studio_name"]):
        if stash_scene["studio_name"] == match["tpdb_studio_name"]:
            score += 1

    # URL: counted once even if several Stash URLs are equal to the TPDB URL
    tpdb_url = match["tpdb_url"]
    if isinstance(tpdb_url, str) and stash_scene["urls"]:
        clean_tpdb_url = tpdb_url.replace("www.", "")
        if any(url.replace("www.", "") == clean_tpdb_url for url in stash_scene["urls"]):
            score += 1

    return score

# Function to select the best match
def select_best_match(row):
    """Pick the highest-scoring TPDB candidate for one ``scene_df`` row.

    Returns the first candidate with the top score, or None when the row has
    no candidates or none of them scores above zero.
    """
    stash_scene = row["stash_scene"]
    winner, top_score = None, 0

    for candidate in row["matches"]:
        candidate_score = evaluate_match(stash_scene, candidate)
        # Strict comparison: ties keep the earlier candidate, zero never wins
        if candidate_score > top_score:
            winner, top_score = candidate, candidate_score

    return winner

# Apply the selection function to get the best match for every looked-up scene
scene_df["best_match"] = scene_df.apply(select_best_match, axis=1)

# Flatten the best match into separate columns. The match dicts already use
# "tpdb_"-prefixed keys, so no further prefix is added (that would yield
# "tpdb_tpdb_*" columns).
best_match_df = pd.json_normalize(scene_df["best_match"])

# Carry the Stash scene id along so rows are joined by scene rather than by
# position: `df` keeps its filtered index while `best_match_df` is numbered
# from zero, so a positional concat would pair unrelated rows.
best_match_df["id"] = scene_df["id"].to_numpy()

# Keep every row of `df`; scenes without a TPDB match get NaN in the tpdb_* columns
merged_df = df.merge(best_match_df, on="id", how="left")


# NOTE: this part relies on `df` and `scene_df` from the earlier cells (`fetch_scene_data` must already have run).
# The two functions below re-define `evaluate_match` and `select_best_match` from earlier in this cell.

# Function to evaluate a match
def evaluate_match(stash_scene, match):
    """Score how well one TPDB candidate agrees with a Stash scene.

    Parameters:
        stash_scene: mapping with the keys ``date``, ``title``, ``studio_name``
            and ``urls``.
        match: mapping with the keys ``tpdb_date``, ``tpdb_title``,
            ``tpdb_studio_name`` and ``tpdb_url``.

    Returns:
        Integer score: +1 for dates less than a day apart, +2 for an identical
        title (case/whitespace-insensitive) or +1 for a similarity ratio above
        0.9, +1 for an identical studio name, +1 for an identical URL
        (ignoring "www."). Fields missing on either side add nothing.
    """
    score = 0

    # Check date similarity -- only when both sides actually have a date
    if pd.notna(match["tpdb_date"]) and pd.notna(stash_scene["date"]):
        row_date = pd.to_datetime(stash_scene["date"])
        if abs(row_date - pd.to_datetime(match["tpdb_date"])) < pd.Timedelta(days=1):
            score += 1

    # Check title similarity -- a scene without a title must not crash on .strip()
    if isinstance(stash_scene["title"], str) and isinstance(match["tpdb_title"], str):
        clean_title = stash_scene["title"].strip().upper()
        clean_tpdb_title = match["tpdb_title"].strip().upper()
        if clean_title == clean_tpdb_title:
            score += 2  # Higher weight for exact match
        elif SequenceMatcher(None, clean_title, clean_tpdb_title).ratio() > 0.9:
            score += 1

    # Check studio similarity
    if pd.notna(match["tpdb_studio_name"]):
        if stash_scene["studio_name"] == match["tpdb_studio_name"]:
            score += 1

    # Check URL similarity -- counted once, ignoring a "www." prefix
    if isinstance(match["tpdb_url"], str):
        clean_tpdb_url = match["tpdb_url"].replace("www.", "")
        if stash_scene["urls"]:
            for url in stash_scene["urls"]:
                if url.replace("www.", "") == clean_tpdb_url:
                    score += 1
                    break

    return score

# Function to select the best match
def select_best_match(row):
    """Return the TPDB candidate of ``row["matches"]`` with the highest score.

    Candidates are scored against ``row["stash_scene"]`` with
    ``evaluate_match``. The result is None when there are no candidates or
    every candidate scores zero; on ties the earliest candidate is kept.
    """
    scene = row["stash_scene"]
    chosen = None
    chosen_score = 0

    for option in row["matches"]:
        option_score = evaluate_match(scene, option)
        if option_score <= chosen_score:
            continue
        chosen_score = option_score
        chosen = option

    return chosen

# Choose the best TPDB candidate for each looked-up scene
scene_df["best_match"] = scene_df.apply(select_best_match, axis=1)

# Expand the chosen match dicts into one tpdb_* column per key
best_match_df = pd.json_normalize(scene_df["best_match"])

# Swap the raw candidate columns for the flattened best match
scene_without_candidates = scene_df.drop(columns=["best_match", "matches"])
scene_df = pd.concat([scene_without_candidates, best_match_df], axis=1)

# Join back onto the Stash rows by scene id; only scenes with a TPDB lookup
# result survive the inner join. The stash_scene snapshot is dropped afterwards.
merged_df = df.merge(scene_df, on="id", how="inner").drop(columns=["stash_scene"])

# Function to flag, field by field (date, title, studio, URL), whether a scene agrees with its best TPDB match
def analyze_scene(row):
    """Compare one merged row's Stash fields with its TPDB counterpart.

    Parameters:
        row: mapping/Series with the keys ``id``, ``date``, ``title``,
            ``studio_name``, ``urls`` and ``tpdb_date``, ``tpdb_title``,
            ``tpdb_studio_name``, ``tpdb_url``, ``tpdb_uuid``.

    Returns:
        pd.Series of booleans: ``date_match`` (less than a day apart),
        ``exact_title_match`` and ``near_title_match`` (similarity ratio above
        0.9) on case/whitespace-normalized titles, ``studio_match``,
        ``url_match`` (ignoring "www.") and ``all_match`` (all five are True).
        A field missing on either side counts as not matching.
    """
    # Initialize matching results
    date_match = False
    exact_title_match = False
    near_title_match = False
    studio_match = False
    url_match = False

    # Check date similarity -- only when both sides actually have a date
    if pd.notna(row["tpdb_date"]) and pd.notna(row["date"]):
        row_date = pd.to_datetime(row["date"])
        if abs(row_date - pd.to_datetime(row["tpdb_date"])) < pd.Timedelta(days=1):
            date_match = True

    # Check title similarity -- a scene without a title must not crash on .strip()
    if isinstance(row["title"], str) and isinstance(row["tpdb_title"], str):
        clean_title = row["title"].strip().upper()
        clean_tpdb_title = row["tpdb_title"].strip().upper()
        exact_title_match = clean_title == clean_tpdb_title
        near_title_match = SequenceMatcher(None, clean_title, clean_tpdb_title).ratio() > 0.9

    # Check studio similarity
    if pd.notna(row["tpdb_studio_name"]):
        if row["studio_name"] == row["tpdb_studio_name"]:
            studio_match = True

    # Check URL similarity
    if pd.notna(row["tpdb_url"]):
        if not isinstance(row["tpdb_url"], str):
            # Unexpected payload shape: report it and leave url_match False
            print(f"URL is not a string: {row['id']} {row['tpdb_uuid']} {row['tpdb_url']}")
        elif row["urls"]:
            clean_tpdb_url = row["tpdb_url"].replace("www.", "")
            url_match = any(url.replace("www.", "") == clean_tpdb_url for url in row["urls"])

    return pd.Series({
        "date_match": date_match,
        "exact_title_match": exact_title_match,
        "near_title_match": near_title_match,
        "studio_match": studio_match,
        "url_match": url_match,
        "all_match": all([date_match, exact_title_match, near_title_match, studio_match, url_match])
    })

# Run the field-by-field comparison on every row and store the six flags
merged_df[
    ["date_match", "exact_title_match", "near_title_match", "studio_match", "url_match", "all_match"]
] = merged_df.apply(analyze_scene, axis=1)

# Column layout for review: each Stash field sits next to its TPDB counterpart
new_column_order = [
    "id",
    # match flags
    "all_match", "date_match", "exact_title_match", "near_title_match", "studio_match", "url_match",
    # identifiers
    "stashdb_uuid", "tpdb_uuid",
    # compared fields
    "title", "tpdb_title",
    "date", "tpdb_date",
    "studio_name", "tpdb_studio_name", "tpdb_studio_id", "tpdb_studio_url",
    "urls", "tpdb_url",
    # raw Stash data
    "files",
    "phash",
    "stash_ids",
]

# Reorder the DataFrame columns
merged_df = merged_df.loc[:, new_column_order]


In [None]:
# Saving TPDB UUIDs to Stash

In [None]:
def save_tpdb_uuid(df):
    """Write the matched TPDB UUID of every row in ``df`` back to Stash.

    Parameters:
        df: DataFrame with the columns ``id``, ``stash_ids`` (list of
            endpoint/stash_id dicts) and ``tpdb_uuid``.

    For each row the scene is updated with its existing stash IDs plus one
    entry for the TPDB endpoint. Rows without a TPDB UUID are skipped. The
    lists stored in ``df`` are left untouched, so running this twice sends the
    same payload instead of accumulating duplicate entries.
    """
    for _, row in df.iterrows():
        tpdb_uuid = row["tpdb_uuid"]

        # Never send a missing value as a stash ID
        if pd.isna(tpdb_uuid):
            print(f"Skipped scene {row['id']}: no TPDB ID to save")
            continue

        existing_stash_ids = row["stash_ids"] if isinstance(row["stash_ids"], list) else []

        # Build a new list rather than appending to the one held by the DataFrame
        merged_stash_ids = [
            *existing_stash_ids,
            {
                "endpoint": "https://theporndb.net/graphql",
                "stash_id": tpdb_uuid,
            },
        ]

        stash.update_scene(
            {
                "id": row["id"],
                "stash_ids": merged_stash_ids
            },
            False
        )
        print(f"Updated scene {row['id']} with TPDB ID {tpdb_uuid}")

In [None]:
# Scenes where date, title, studio and URL all agree with the TPDB match
merged_df.loc[merged_df["all_match"]]

In [None]:
# Scenes where date, title and studio agree but the overall check failed,
# i.e. only the URL comparison did not match
df_partially_matched = merged_df[
    ~merged_df["all_match"]
    & merged_df["date_match"]
    & merged_df["exact_title_match"]
    & merged_df["near_title_match"]
    & merged_df["studio_match"]
]

In [None]:
# Writes to Stash: stores the TPDB stash ID on every scene in df_partially_matched
save_tpdb_uuid(df_partially_matched)