In [None]:
# Importing metadata from Culture Extractor to StashApp
# 1. Import metadata from Culture Extractor
# 2. Import metadata from StashApp by oshash
# 3. Join the two on oshash
# 4. Query metadata from StashDB by phash
# 5. Join the three on phash
# 6. Match performers between Culture Extractor, StashApp and StashDB
# 7. Set Culture Extractor UUIDs to performer custom fields in StashApp
# 8. Set metadata to StashApp scenes

In [None]:
import os
import sys


sys.path.append(os.path.dirname(os.path.abspath("")))

import os

import polars as pl
from dotenv import load_dotenv

import libraries.client_culture_extractor as client_culture_extractor


# Pull settings from the local .env file into the process environment.
load_dotenv()

# Culture Extractor: read the database connection settings.
db = os.getenv("CE_DB_NAME")
user = os.getenv("CE_DB_USERNAME")
pw = os.getenv("CE_DB_PASSWORD")
host = os.getenv("CE_DB_HOST")
port = os.getenv("CE_DB_PORT")

# Assemble the space-separated key=value connection string handed to the client.
connection_string = " ".join(
    f"{key}={value}"
    for key, value in (
        ("dbname", db),
        ("user", user),
        ("password", pw),
        ("host", host),
        ("port", port),
    )
)

culture_extractor_client = client_culture_extractor.ClientCultureExtractor(connection_string)


# StashApp
from libraries.client_stashapp import StashAppClient, get_stashapp_client


# Wrapper client; later cells call its DataFrame-returning helpers (get_studios, get_performers, find_scenes_by_oshash).
stash_client = StashAppClient()
# Second client from the factory; later cells use it for direct find_*/create_*/update_* calls.
stash_raw_client = get_stashapp_client()


# StashDB
import os

import dotenv

from libraries.StashDbClient import StashDbClient


# Load the .env file again (already loaded above) before reading the StashDB settings.
dotenv.load_dotenv()

# StashDB client built from the endpoint URL and API key found in the environment.
stashbox_client = StashDbClient(os.environ.get("STASHDB_ENDPOINT"), os.environ.get("STASHDB_API_KEY"))


# Functions
def hex_to_binary(hex_string):
    """Return a hexadecimal string as binary digits, zero-padded on the left to at least 64 characters."""
    numeric_value = int(hex_string, 16)
    # "#b" yields the same "0b..." text as bin(); drop the two-character prefix.
    binary_digits = format(numeric_value, "#b")[2:]
    return binary_digits.zfill(64)

def calculate_hamming_distance(phash1, phash2):
    """Count the bit positions at which two hexadecimal phashes differ.

    Raises:
        ValueError: if the two binary expansions do not have the same length.
    """
    bits_a, bits_b = hex_to_binary(phash1), hex_to_binary(phash2)

    # The position-by-position comparison below only makes sense for equal lengths.
    if len(bits_a) != len(bits_b):
        raise ValueError("Binary strings must be of equal length")

    mismatches = 0
    for bit_a, bit_b in zip(bits_a, bits_b):
        if bit_a != bit_b:
            mismatches += 1
    return mismatches

# Example usage:
# phash1 = "951428607cf7cb8f"
# phash2 = "951428607cf7cb8e"
# distance = calculate_hamming_distance(phash1, phash2)
# print(f"Hamming distance between {phash1} and {phash2}: {distance}")

def levenshtein(s1: str, s2: str):
    """Case-insensitive Levenshtein distance between two strings.

    Returns None when either argument is empty or None.
    """
    if not s1 or not s2:
        return None
    # Imported lazily, so the dependency is only required when a distance is actually computed.
    from Levenshtein import distance
    left, right = s1.lower(), s2.lower()
    return distance(left, right)


In [None]:
# Get database schema
# culture_extractor_client.get_database_schema().write_json()

In [None]:
# Reference data from StashApp and Culture Extractor.
all_tags = stash_raw_client.find_tags()
all_ce_sites = culture_extractor_client.get_sites()
all_ce_sub_sites = culture_extractor_client.get_sub_sites()
all_stash_studios = stash_client.get_studios()

# Pair every Culture Extractor site with the StashApp studio of the same name.
# A left join keeps sites without a studio; their stash_studios_* columns are null.
# Alternative (disabled): join on the stored Culture Extractor id instead of the name:
# all_ce_sites_stash_studios_joined = all_ce_sites.join(all_stash_studios, left_on="ce_sites_uuid", right_on="stash_studios_ce_id", how="left", coalesce=False)
all_ce_sites_stash_studios_joined = all_ce_sites.join(
    all_stash_studios,
    left_on="ce_sites_name",
    right_on="stash_studios_name",
    how="left",
    coalesce=False,
)

# Report the sites that found no studio by name (nothing is joined or changed here).
for row in all_ce_sites_stash_studios_joined.filter(
    pl.col("stash_studios_id").is_null()
).iter_rows(named=True):
    # Single quotes inside the f-string: reusing double quotes is a SyntaxError before Python 3.12.
    print(f"Unmatched studio {row['ce_sites_uuid']} {row['ce_sites_name']}")

In [None]:
# Link a Culture Extractor site to its StashApp studio by name.
site_name = "Braless Forever"
# literal=True: the site name is plain text, so regex metacharacters in it must not be interpreted.
rows = all_ce_sites_stash_studios_joined.filter(
    pl.col("stash_studios_name").str.contains(site_name, literal=True)
)
# Fail with a readable message instead of an IndexError when nothing matches.
if rows.is_empty():
    raise ValueError(f"No StashApp studio name contains {site_name!r}")
if rows.height > 1:
    print(f"Warning: {rows.height} studios match {site_name!r}; using the first one")
selected_studio = rows.to_dicts()[0]
# Record the Culture Extractor site UUID on the studio as a stash id for the CE endpoint.
stash_client.set_studio_stash_id_for_endpoint(
    selected_studio["stash_studios_id"],
    "https://culture.extractor/graphql",
    selected_studio["ce_sites_uuid"],
)
selected_studio

# Manual override
# stash_client.set_studio_stash_id_for_endpoint(306, "https://culture.extractor/graphql", "018b94b1-b5e9-71d7-ab70-8665111e8bd8")
# selected_studio = all_ce_sites_stash_studios_joined.filter(pl.col("ce_sites_uuid").eq("018b94b1-b5e9-71d7-ab70-8665111e8bd8")).to_dicts()[0]
# selected_studio

In [None]:
# Every Culture Extractor download belonging to the selected site.
# Optional narrowing steps, currently disabled:
#   .filter(pl.col("ce_downloads_file_type").str.contains("video"))
#   downloads = downloads.filter(~pl.col("ce_downloads_file_type").str.contains("image"))
#   Filter out duplicate variants for sites:
#   downloads = downloads.filter(~pl.col("ce_downloads_variant").str.contains("Best"))
downloads = culture_extractor_client.get_downloads(
    selected_studio["ce_sites_uuid"]
)
downloads

In [None]:
# Spot check: which releases do these two oshashes belong to?
oshashes = [
    "9ee341c717f845b6",
    "f05c541e68a83eb9",
]
(
    downloads
    .filter(pl.col("ce_downloads_hash_oshash").is_in(oshashes))
    .select("ce_downloads_hash_oshash", "ce_downloads_release_name")
)

# Matching scenes

In [None]:
# Look up StashApp scenes for every distinct oshash among the downloads.
oshashes = downloads.get_column("ce_downloads_hash_oshash").unique().to_list()
stash_app_scenes = stash_client.find_scenes_by_oshash(oshashes)
stash_app_scenes

In [None]:
# Attach the Culture Extractor download row to each StashApp scene via the file oshash.
joined_scenes = stash_app_scenes.join(
    downloads,
    how="left",
    left_on="stashapp_primary_file_oshash",
    right_on="ce_downloads_hash_oshash",
    coalesce=False,
)

In [None]:
# Scene data list (not used further in this cell).
scene_data = []

# One dict per scene carrying the fields sent with the StashDB phash query.
scene_objects = joined_scenes.select(
    basename=pl.col("stashapp_primary_file_basename"),
    title=pl.col("ce_downloads_release_name"),
    phash=pl.col("stashapp_primary_file_phash"),
    duration=pl.col("stashapp_primary_file_duration").dt.total_seconds(),
    stashdb_id=pl.col("stashapp_stashdb_id"),
).to_dicts()

# Query StashDB in slices of batch_size scenes and stack the per-batch results.
batch_size = 100

stashdb_scene_batches = [
    stashbox_client.query_scenes_by_phash(scene_objects[start:start + batch_size])
    for start in range(0, len(scene_objects), batch_size)
]

df_stashdb_scenes = pl.concat(stashdb_scene_batches)

In [None]:
# Add the StashDB result columns to each scene, matched on the queried phash.
joined_scenes = joined_scenes.join(
    df_stashdb_scenes,
    how="left",
    left_on="stashapp_primary_file_phash",
    right_on="queried_phash",
    coalesce=False,
)
joined_scenes

In [None]:
# StashApp scene ids that occur on more than one joined row.
duplicate_ids = (
    joined_scenes
    .group_by("stashapp_id")
    .agg(scene_count=pl.col("stashapp_id").count())
    .filter(pl.col("scene_count").gt(1))
    .get_column("stashapp_id")
)

# All rows of those scenes, grouped together by id for review.
scenes_with_duplicates = (
    joined_scenes
    .filter(pl.col("stashapp_id").is_in(duplicate_ids))
    .sort(by="stashapp_id")
)
scenes_with_duplicates

In [None]:
# Culture Extractor release UUIDs behind the duplicated scenes.
(
    joined_scenes
    .filter(pl.col("stashapp_id").is_in(duplicate_ids))
    .sort(by="stashapp_id")
    .select("ce_downloads_release_uuid")
)

In [None]:
# parquet_path = "joined_scenes_with_stashdb_scenes_20250105_1715.parquet"
# joined_galleries_with_stashdb_scenes.write_parquet(parquet_path)
# joined_scenes = pl.read_parquet(parquet_path)

In [None]:
def calculate_duration_difference(stashapp_duration, stashdb_duration):
    """Build an expression for the duration gap as a percentage of the longer duration.

    Both arguments are polars expressions. The result is null wherever either
    duration is null.
    NOTE(review): the two expressions must support subtraction and division
    with each other; confirm the dtypes passed by the caller are compatible.
    """
    both_present = stashapp_duration.is_not_null() & stashdb_duration.is_not_null()
    absolute_gap = (stashapp_duration - stashdb_duration).abs()
    longer_duration = pl.max_horizontal([stashapp_duration, stashdb_duration])
    return pl.when(both_present).then(absolute_gap / longer_duration * 100).otherwise(None)

def calculate_title_similarity(ce_title, stashdb_title):
    """Build an expression for the Levenshtein distance between two title expressions.

    Args:
        ce_title: polars expression yielding the Culture Extractor title.
        stashdb_title: polars expression yielding the StashDB title.

    Returns:
        An Int64 expression that is null wherever either title is null.
    """
    def _distance(row):
        # polars passes each struct element as a dict keyed by field name, so
        # positional indexing (row[0]) raises KeyError. Field order follows the
        # order given to pl.struct, so the values unpack as (CE, StashDB).
        ce_value, stashdb_value = tuple(row.values())
        # Skip missing titles rather than comparing against the text "None".
        if ce_value is None or stashdb_value is None:
            return None
        return levenshtein(str(ce_value), str(stashdb_value))

    return (
        pl.when(ce_title.is_not_null() & stashdb_title.is_not_null())
        .then(
            pl.struct([ce_title, stashdb_title])
            .map_elements(_distance, return_dtype=pl.Int64)
        )
        .otherwise(None)
    )

def get_date_difference_days(ce_date, stashdb_date):
    """Build an expression for the absolute difference in whole days between two date expressions.

    Both inputs are cast to Datetime before subtracting. The result is null
    wherever either date is null.
    """
    have_both = ce_date.is_not_null() & stashdb_date.is_not_null()
    delta = ce_date.cast(pl.Datetime) - stashdb_date.cast(pl.Datetime)
    return pl.when(have_both).then(delta.dt.total_days().abs()).otherwise(None)

# Step 1: derive the comparison metrics between the local/CE data and StashDB.
df_verification = joined_scenes.with_columns(
    duration_diff_pct=calculate_duration_difference(
        pl.col("stashapp_primary_file_duration"),
        pl.col("duration"),
    ),
    title_levenshtein=pl.struct("ce_downloads_release_name", "title").map_elements(
        lambda titles: levenshtein(titles["ce_downloads_release_name"], titles["title"]),
        return_dtype=pl.Int64,
    ),
    date_diff_days=get_date_difference_days(
        pl.col("ce_downloads_release_date"),
        pl.col("date"),
    ),
)

# Step 2: flag rows whose metrics exceed the review thresholds
# (more than 5 % duration gap, more than 5 title edits, more than 7 days apart).
df_verification = df_verification.with_columns(
    duration_warning=pl.col("duration_diff_pct").gt(5),
    title_warning=pl.col("title_levenshtein").gt(5),
    date_warning=pl.col("date_diff_days").gt(7),
)

# Step 3: keep only the flagged rows, with the columns needed to judge them.
df_verification_warnings = (
    df_verification
    .filter(pl.col("duration_warning") | pl.col("title_warning") | pl.col("date_warning"))
    .select(
        "stashapp_id",
        "id",
        "stashapp_title",
        "ce_downloads_release_name",
        "title",
        "title_levenshtein",
        "ce_downloads_release_date",
        "date",
        "date_diff_days",
        "stashapp_primary_file_duration",
        "duration",
        "duration_diff_pct",
    )
)
df_verification_warnings

In [None]:
# Compact overview: StashApp id, StashDB id, local file name and StashDB title.
foo = df_verification.select(
    "stashapp_id",
    "id",
    "stashapp_primary_file_basename",
    "title",
)
foo

In [None]:
# Distinct (uuid, name) pairs of Culture Extractor performers across the joined scenes.
joined_scenes_ce_unique_performers = (
    joined_scenes
    .select("ce_downloads_performers")
    .explode("ce_downloads_performers")
    .select(
        performer_uuid=pl.col("ce_downloads_performers").struct.field("uuid"),
        performer_name=pl.col("ce_downloads_performers").struct.field("name"),
    )
    .unique()
    .sort(by="performer_name")
)
joined_scenes_ce_unique_performers

In [None]:
# All StashApp performers plus the value of this site's "CultureExtractor.<short name>"
# custom field (null when the performer has no such field).
all_stashapp_performers = stash_client.get_performers()
all_stashapp_performers = all_stashapp_performers.with_columns(
    ce_custom_field_value=pl.col("stashapp_custom_fields")
    # Keep the value for the matching key, null for every other entry ...
    .list.eval(
        pl.when(
            pl.element().struct.field("key").eq(
                f"CultureExtractor.{selected_studio['ce_sites_short_name']}"
            )
        )
        .then(pl.element().struct.field("value"))
        .otherwise(None)
    )
    # ... then drop the nulls and take the first remaining value.
    .list.eval(pl.element().filter(pl.element().is_not_null()))
    .list.first()
)
all_stashapp_performers

In [None]:
# Culture Extractor performers with no StashApp performer carrying their UUID.
# Left join against the performers that have the custom field, then keep the
# rows where the right-hand side stayed null.
unmatched_performers_df = (
    joined_scenes_ce_unique_performers
    .join(
        all_stashapp_performers.filter(pl.col("ce_custom_field_value").is_not_null()),
        how="left",
        left_on="performer_uuid",
        right_on="ce_custom_field_value",
        coalesce=False,
    )
    .filter(pl.col("ce_custom_field_value").is_null())
)
unmatched_performers_df

In [None]:
# StashApp scene ids to leave out of the automatic update (none by default).
exclusion_list = []
joined_scenes.filter(
    pl.col("stashapp_id").is_in(exclusion_list).not_()
)


In [None]:
from libraries.performer_matcher import PerformerMatcher


# Matcher over the full StashApp performer table.
matcher = PerformerMatcher(all_stashapp_performers)

# Collected match records, one dict per (scene, performer match).
all_matches = []

for row in joined_scenes.iter_rows(named=True):
    # The matcher takes three columns, so wrap this scene's values in a one-row frame.
    data_frame = pl.DataFrame([
        {
            key: row[key]
            for key in ("ce_downloads_performers", "stashapp_performers", "performers")
        }
    ])

    matches = matcher.match_performers(
        data_frame["ce_downloads_performers"],
        data_frame["stashapp_performers"],
        data_frame["performers"],
    )

    # Store every match together with the scene it came from.
    all_matches.extend(
        {
            "scene_id": row["stashapp_id"],
            "scene_title": row["stashapp_title"],
            "ce_uuid": match.ce_uuid,
            "ce_name": match.ce_name,
            "stashapp_id": match.stashapp_id,
            "stashapp_name": match.stashapp_name,
            "stashdb_uuid": match.stashdb_uuid,
            "stashdb_name": match.stashdb_name,
            "confidence": match.confidence,
            "reason": match.reason,
        }
        for match in matches
    )

# NOTE(review): with no matches at all the frame has no columns and the sort below fails.
joined_performers_matches_df = pl.DataFrame(all_matches)
joined_performers_matches_df = joined_performers_matches_df.sort(by="stashapp_name")
joined_performers_matches_df

In [None]:
# Matches whose StashApp id is -1 (filtered here as the performers to create).
new_performers = joined_performers_matches_df.filter(
    pl.col("stashapp_id") == -1
)
new_performers

In [None]:
# Create one StashApp performer per distinct new match, linked to both StashDB
# (stash id) and Culture Extractor (site-specific custom field).
for row in (
    new_performers
    .select("ce_uuid", "stashdb_uuid", "stashdb_name")
    .unique()
    .iter_rows(named=True)
):
    stash_raw_client.create_performer(dict(
        name=row["stashdb_name"],
        stash_ids=[
            {"endpoint": "https://stashdb.org/graphql", "stash_id": row["stashdb_uuid"]},
        ],
        custom_fields={
            "CultureExtractor." + selected_studio["ce_sites_short_name"]: row["ce_uuid"],
        },
    ))
    print(f"Created performer {row['stashdb_name']}")

In [None]:
# Matches that resolved to a StashApp performer id (anything other than -1).
existing_performers = joined_performers_matches_df.filter(
    pl.col("stashapp_id") != -1
)
existing_performers

In [None]:
# Write the Culture Extractor UUID into the site-specific custom field of each matched performer.
custom_field_name = "CultureExtractor." + selected_studio["ce_sites_short_name"]
for row in (
    existing_performers
    .select("ce_uuid", "stashapp_id", "stashapp_name")
    .unique()
    .iter_rows(named=True)
):
    stash_client.update_performer_custom_fields(
        row["stashapp_id"],
        {custom_field_name: row["ce_uuid"]},
    )
    print(f"Setting custom field {custom_field_name} for {row['stashapp_name']} ({row['stashapp_id']}) to {row['ce_uuid']}")

In [None]:
# Mapping CE performer UUID -> StashApp performer, re-read from StashApp so it
# reflects the custom fields written above.
ce_performer_mapping = (
    stash_client.get_performers()
    .with_columns(
        # Value of this site's custom field; null when the performer lacks it.
        ce_custom_field_value=pl.col("stashapp_custom_fields")
        .list.eval(
            pl.element().struct.field("value").filter(
                pl.element().struct.field("key") == "CultureExtractor." + selected_studio["ce_sites_short_name"]
            )
        )
        .list.first()
    )
    .filter(pl.col("ce_custom_field_value").is_not_null())
    .select(
        pl.col("ce_custom_field_value").alias("ce_performer_uuid"),
        "stashapp_id",
        "stashapp_name",
    )
    .sort("stashapp_name")
)
ce_performer_mapping

In [None]:
# Distinct StashDB performer ids referenced by the joined scenes (column "performers").
unique_stashdb_performer_ids = (
    joined_scenes
    .select(
        pl.col("performers").list.eval(
            pl.element().struct.field("performer").struct.field("id")
        )
    )
    .explode("performers")
    .unique()
)

# StashApp performers holding a StashDB stash id that appears in those scenes.
stashdb_performer_mapping = (
    stash_client.get_performers()
    .with_columns(
        # Pick the stash_id registered for the StashDB endpoint, if any.
        stashdb_id=pl.col("stashapp_stash_ids")
        .list.eval(
            pl.when(pl.element().struct.field("endpoint") == "https://stashdb.org/graphql")
            .then(pl.element().struct.field("stash_id"))
            .otherwise(None)
        )
        .list.eval(pl.element().filter(pl.element().is_not_null()))
        .list.first()
    )
    .filter(pl.col("stashdb_id").is_not_null())
    .select("stashdb_id", "stashapp_id", "stashapp_name")
    .join(
        unique_stashdb_performer_ids,
        how="inner",
        left_on="stashdb_id",
        right_on="performers",
    )
    .sort("stashapp_name")
)
stashdb_performer_mapping

In [None]:
# Performers present in both mappings, side by side.
(
    ce_performer_mapping
    .join(stashdb_performer_mapping, how="inner", on="stashapp_id", coalesce=False)
    .sort(by="stashapp_name")
)

In [None]:
# Anti-joins in both directions: performers known through only one of the two sources.
names_only_in_ce = ce_performer_mapping.join(
    stashdb_performer_mapping, how="anti", on="stashapp_id"
)
names_only_in_stashdb = stashdb_performer_mapping.join(
    ce_performer_mapping, how="anti", on="stashapp_id"
)

# Print each difference only when it is non-empty.
if names_only_in_ce.height > 0:
    print("Names only in Culture Extractor:")
    print(names_only_in_ce)
if names_only_in_stashdb.height > 0:
    print("\nNames only in StashDB:")
    print(names_only_in_stashdb)

In [None]:
# Distinct (file type, content type, variant) combinations among the downloads
# of the releases that matched a scene.
(
    downloads
    .filter(
        pl.col("ce_downloads_release_uuid").is_in(
            joined_scenes["ce_downloads_release_uuid"].unique().to_list()
        )
    )
    .select(
        "ce_downloads_file_type",
        "ce_downloads_content_type",
        "ce_downloads_variant",
    )
    .unique()
    .sort("ce_downloads_file_type", "ce_downloads_content_type", "ce_downloads_variant")
)

In [None]:
# One image download per release (for matched scenes only), then attach the StashApp scene.
scene_images = (
    downloads
    .filter(
        pl.col("ce_downloads_release_uuid").is_in(
            joined_scenes.get_column("ce_downloads_release_uuid").unique().to_list()
        ),
        pl.col("ce_downloads_file_type") == "image",
        pl.col("ce_downloads_content_type").is_in(["scene", "wide", "poster", "cover"]),
    )
    .select(
        "ce_downloads_release_uuid",
        "ce_downloads_content_type",
        scene_image_filename=pl.col("ce_downloads_saved_filename"),
    )
    # Content type sorted descending within each release, so .first() below
    # picks the alphabetically last content type available.
    .sort(
        by=["ce_downloads_release_uuid", "ce_downloads_content_type"],
        descending=[False, True],
    )
    .group_by("ce_downloads_release_uuid")
    .agg(
        pl.col("scene_image_filename").first(),
        pl.col("ce_downloads_content_type").first(),
    )
)
scene_images = (
    scene_images
    .join(joined_scenes, how="left", on="ce_downloads_release_uuid")
    .select(
        "ce_downloads_release_uuid",
        "stashapp_id",
        "stashapp_title",
        "scene_image_filename",
        "ce_downloads_content_type",
    )
)
scene_images

In [None]:
# Create update inputs for reviewing and applying
import base64


def create_update_dataframe(joined_scenes, downloads, all_stashapp_performers, all_tags, stashapp_studio_id):
    """Assemble one row per joined scene with the values to write back to StashApp.

    Args:
        joined_scenes: frame of StashApp scenes joined with Culture Extractor
            downloads and StashDB results; the columns selected below must exist.
        downloads: Culture Extractor downloads, used here to find gallery files.
        all_stashapp_performers: not referenced in the body.
        all_tags: iterable of tag dicts with "name" and "id" keys.
        stashapp_studio_id: studio id written to every row as "studio_id".

    Returns:
        The update frame, including resolved performer ids/names, tag ids,
        the scene image columns and a "gallery_hash" column.

    NOTE(review): the body also reads the notebook-level variables
    ``ce_performer_mapping``, ``stashdb_performer_mapping`` and ``scene_images``,
    which are not parameters, so those cells must have been run first.
    """
    # Get all scene data ready for updates
    updates_df = joined_scenes.select([
        pl.col("ce_downloads_release_uuid").alias("ce_release_uuid"),
        pl.col("stashapp_id").alias("scene_id"),
        pl.col("stashapp_primary_file_basename").alias("primary_file_basename"),
        pl.col("ce_downloads_release_date").alias("date"),
        pl.col("ce_downloads_release_name").alias("title"),
        pl.col("ce_downloads_release_short_name").alias("code"),
        pl.col("ce_downloads_release_description").alias("details"),
        pl.lit(stashapp_studio_id).alias("studio_id"),
        pl.col("ce_downloads_release_url").alias("url"),
        # Kept under its original name as well: it is the join key used further down.
        pl.col("ce_downloads_release_uuid"),
        pl.col("id").alias("stashdb_id"),
        pl.col("ce_downloads_performers"),
        pl.col("performers"),
        pl.col("tags").alias("stashdb_tags")
    ])

    # Map performers - now with unique values
    updates_df = updates_df.with_columns([
        # Get Culture Extractor UUIDs
        pl.col("ce_downloads_performers").list.eval(
            pl.element().struct.field("uuid")
        ).list.unique().alias("ce_performer_uuids"),

        # Get StashDB IDs
        pl.col("performers").list.eval(
            pl.element().struct.field("performer").struct.field("id")
        ).list.unique().alias("stashdb_performer_ids")
    ])

    # Join performer IDs with unique values
    # Each lambda filters the notebook-level ce_performer_mapping frame once per row.
    updates_df = updates_df.with_columns([
        pl.when(pl.col("ce_performer_uuids").is_not_null())
        .then(
            pl.col("ce_performer_uuids").map_elements(
                lambda uuids: ce_performer_mapping.filter(
                    pl.col("ce_performer_uuid").is_in(uuids)
                ).get_column("stashapp_id").unique().to_list(),
                return_dtype=pl.List(pl.Int64)
            )
        )
        .otherwise(pl.Series([[]]))
        .alias("ce_performer_stashapp_ids"),

        pl.when(pl.col("ce_performer_uuids").is_not_null())
        .then(
            pl.col("ce_performer_uuids").map_elements(
                lambda uuids: ce_performer_mapping.filter(
                    pl.col("ce_performer_uuid").is_in(uuids)
                ).get_column("stashapp_name").unique().to_list(),
                return_dtype=pl.List(pl.Utf8)
            )
        )
        .otherwise(pl.Series([[]]))
        .alias("ce_performer_stashapp_names")
    ])

    # Same resolution for StashDB performer ids, via the notebook-level stashdb_performer_mapping frame.
    # NOTE(review): the first guard tests "performers" while the second tests
    # "stashdb_performer_ids"; confirm the difference is intentional.
    updates_df = updates_df.with_columns([
        pl.when(pl.col("performers").is_not_null())
        .then(
            pl.col("stashdb_performer_ids").map_elements(
                lambda uuids: stashdb_performer_mapping.filter(
                    pl.col("stashdb_id").is_in(uuids)
                ).get_column("stashapp_id").unique().to_list(),
                return_dtype=pl.List(pl.Int64)
            )
        )
        .otherwise(pl.Series([[]]))
        .alias("stashdb_performer_stashapp_ids"),

        pl.when(pl.col("stashdb_performer_ids").is_not_null())
        .then(
            pl.col("stashdb_performer_ids").map_elements(
                lambda uuids: stashdb_performer_mapping.filter(
                    pl.col("stashdb_id").is_in(uuids)
                ).get_column("stashapp_name").unique().to_list(),
                return_dtype=pl.List(pl.Utf8)
            )
        )
        .otherwise(pl.Series([[]]))
        .alias("stashdb_performer_stashapp_names")
    ])

    # Combine performer IDs with unique values
    updates_df = updates_df.with_columns([
        pl.concat_list([
            pl.col("ce_performer_stashapp_ids"),
            pl.col("stashdb_performer_stashapp_ids")
        ]).list.unique().alias("performer_ids"),

        pl.concat_list([
            pl.col("ce_performer_stashapp_names"),
            pl.col("stashdb_performer_stashapp_names")
        ]).list.unique().alias("performer_names")
    ])

    # Map tags
    # Lookup table from tag name to StashApp tag id; StashDB tag names are matched against it by exact name.
    tag_mapping = pl.DataFrame({
        "stashdb_name": [tag["name"] for tag in all_tags],
        "stashapp_id": [tag["id"] for tag in all_tags]
    })

    updates_df = updates_df.with_columns([
        pl.when(pl.col("stashdb_tags").is_not_null())
        .then(
            pl.col("stashdb_tags").map_elements(
                lambda tags: tag_mapping.filter(
                    pl.col("stashdb_name").is_in([t["name"] for t in tags])
                ).get_column("stashapp_id").to_list(),
                return_dtype=pl.List(pl.Utf8)
            )
        )
        .otherwise(pl.Series([[]]))
        .alias("tag_ids")
    ])

    # Get gallery info
    # Only gallery downloads whose variant is "Large", "high" or empty are considered.
    galleries = downloads.filter(
        (pl.col("ce_downloads_content_type") == "gallery") &
        (pl.col("ce_downloads_variant").is_in(["Large", "high", ""]))
    ).select([
        pl.col("ce_downloads_release_uuid"),
        pl.col("ce_downloads_hash_sha256").alias("gallery_hash")
    ])

    # Join images and galleries
    # NOTE(review): a release with several qualifying gallery downloads would yield several rows per scene here.
    updates_df = updates_df.join(
        scene_images,
        on="ce_downloads_release_uuid",
        how="left"
    ).join(
        galleries,
        on="ce_downloads_release_uuid",
        how="left"
    )

    return updates_df

def _merge_unique_urls(*url_groups):
    """Concatenate URL lists, skipping empty entries and duplicates, keeping first-seen order."""
    merged = []
    for group in url_groups:
        for url in group or []:
            if url and url not in merged:
                merged.append(url)
    return merged


def generate_update_inputs(updates_df, stash_raw_client):
    """Build one reviewable update record per scene by merging new values with current StashApp state.

    For every row the scene (and its gallery, when one is attached or found by
    hash) is re-read from StashApp, so that tags, performers, URLs and stash ids
    are merged with what is already stored rather than overwritten.

    Args:
        updates_df: frame produced by ``create_update_dataframe``.
        stash_raw_client: client providing ``find_scene``, ``find_galleries``
            and ``find_gallery``.

    Returns:
        A polars DataFrame with, per scene, the existing values ("existing_*")
        next to the values that will be written.

    Note:
        Reads the notebook-level ``selected_studio`` to locate cover images
        under a fixed local metadata directory.
    """
    updates = []

    for row in updates_df.iter_rows(named=True):
        # Get current scene data
        refreshed_scene = stash_raw_client.find_scene(row["scene_id"])
        existing_scene_galleries = refreshed_scene.get("galleries", [])
        existing_scene_gallery_id = existing_scene_galleries[0]["id"] if existing_scene_galleries else None

        # Resolve the cover image on disk; a missing file or a bad path only disables the cover.
        image_path = None
        try:
            if row["scene_image_filename"]:
                image_path = os.path.join(
                    "F:\\Ripping\\" + selected_studio["ce_sites_name"] + "\\Metadata",
                    row["ce_release_uuid"],
                    row["scene_image_filename"]
                )
                if not os.path.exists(image_path):
                    print(f"Image not found: {image_path}")
                    image_path = None
        except Exception as e:
            print(f"Exception occurred: {e}")
            image_path = None

        # Prefer the gallery already attached to the scene; otherwise look one up
        # by hash and accept it only when the search result is unambiguous.
        gallery_id = None
        refreshed_gallery = None
        existing_gallery_urls = []
        if existing_scene_gallery_id:
            gallery_id = existing_scene_gallery_id
        elif row["gallery_hash"]:
            found_galleries = stash_raw_client.find_galleries(q=row["gallery_hash"])
            if len(found_galleries) == 1:
                gallery_id = found_galleries[0]["id"]

        if gallery_id:
            refreshed_gallery = stash_raw_client.find_gallery(gallery_id)
            existing_gallery_urls = refreshed_gallery.get("urls", [])

        # Handle potentially null values
        existing_scene_tag_ids = sorted([tag["id"] for tag in refreshed_scene.get("tags", [])])
        existing_gallery_tag_ids = sorted([tag["id"] for tag in refreshed_gallery.get("tags", [])]) if refreshed_gallery else []
        new_tag_ids = sorted(row["tag_ids"]) if row["tag_ids"] is not None else []

        existing_performer_ids = [int(performer["id"]) for performer in refreshed_scene.get("performers", [])]
        new_performer_ids = row["performer_ids"] if row["performer_ids"] is not None else []

        existing_urls = refreshed_scene.get("urls", [])
        new_url = [row["url"]] if row["url"] is not None else []

        existing_stash_ids = refreshed_scene.get("stash_ids", [])

        new_stash_ids = []
        if row.get("stashdb_id"):
            new_stash_ids.append({
                "endpoint": "https://stashdb.org/graphql",
                "stash_id": row["stashdb_id"]
            })
        if row.get("ce_downloads_release_uuid"):
            new_stash_ids.append({
                "endpoint": "https://culture.extractor/graphql",
                "stash_id": row["ce_downloads_release_uuid"]
            })

        # De-duplicate on (endpoint, stash_id); a later duplicate replaces the earlier entry.
        scene_stash_ids = list({
            (stash_id["endpoint"], stash_id["stash_id"]): stash_id
            for stash_id in existing_stash_ids + new_stash_ids
        }.values())

        # URL lists are merged without duplicates so that re-running the notebook
        # does not append the same URL again; missing (None) URLs are skipped.
        scene_urls = _merge_unique_urls(existing_urls, new_url)
        gallery_urls = _merge_unique_urls(
            existing_gallery_urls,
            [
                row["url"],
                f"https://culture.extractor/galleries/{row['ce_downloads_release_uuid']}"
            ]
        ) if gallery_id else None

        update = {
            "ce_release_uuid": row["ce_release_uuid"],
            "scene_id": row["scene_id"],
            "primary_file_basename": row["primary_file_basename"],
            "existing_scene_gallery_id": existing_scene_gallery_id,
            "gallery_id": gallery_id if gallery_id else existing_scene_gallery_id,
            "existing_scene_date": refreshed_scene.get("date", None),
            "existing_gallery_date": refreshed_gallery.get("date", None) if refreshed_gallery else None,
            "date": row["date"].strftime("%Y-%m-%d") if row["date"] else None,
            "existing_scene_title": refreshed_scene.get("title", None),
            "existing_gallery_title": refreshed_gallery.get("title", None) if refreshed_gallery else None,
            "title": row["title"],
            "existing_scene_code": refreshed_scene.get("code", None),
            "existing_gallery_code": refreshed_gallery.get("code", None) if refreshed_gallery else None,
            "code": row["code"],
            "existing_scene_details": refreshed_scene.get("details", None),
            "existing_gallery_details": refreshed_gallery.get("details", None) if refreshed_gallery else None,
            "details": row["details"],
            "existing_scene_studio_id": refreshed_scene.get("studio", {}).get("id") if refreshed_scene.get("studio") else None,
            "existing_gallery_studio_id": refreshed_gallery.get("studio", {}).get("id") if refreshed_gallery and refreshed_gallery.get("studio") else None,
            "studio_id": row["studio_id"],
            "existing_scene_performers": refreshed_scene.get("performers", []),
            "existing_gallery_performers": refreshed_gallery.get("performers", []) if refreshed_gallery else [],
            # Sorted so the review output is stable from run to run.
            "performer_ids": sorted(set(existing_performer_ids + new_performer_ids)),
            "existing_scene_tags": existing_scene_tag_ids,
            "existing_gallery_tags": existing_gallery_tag_ids,
            "scene_tag_ids": sorted(set(existing_scene_tag_ids + new_tag_ids)),
            "gallery_tag_ids": sorted(set(existing_gallery_tag_ids + new_tag_ids)) if refreshed_gallery else [],
            "existing_scene_urls": refreshed_scene.get("urls", []),
            "scene_urls": scene_urls,
            "existing_gallery_urls": refreshed_gallery.get("urls", []) if refreshed_gallery else [],
            "gallery_urls": gallery_urls,
            "cover_image_path": image_path,
            "scene_stash_ids": scene_stash_ids
        }
        updates.append(update)

    return pl.DataFrame(updates)

# Build the per-scene update frame for the selected studio ...
updates_df = create_update_dataframe(
    joined_scenes,
    downloads,
    all_stashapp_performers,
    all_tags,
    selected_studio["stash_studios_id"],
)

# ... merge it with the current StashApp state and order the result by release date.
update_inputs_df = generate_update_inputs(updates_df, stash_raw_client)
update_inputs_df = update_inputs_df.sort("date")

# Review updates before applying
print("Updates to be applied:")
print(update_inputs_df)

In [None]:
# print(f"Filtered out scenes: {exclusion_list}")

# Apply updates if everything looks good.
# To skip excluded scenes, iterate over
# update_inputs_df.filter(~pl.col("scene_id").is_in(exclusion_list)) instead.
for update in update_inputs_df.iter_rows(named=True):
    # Encode the cover image as a data URL. The file is read inside a context
    # manager so its handle is closed right away, and the encoding happens
    # outside the f-string, which keeps the expression valid before Python 3.12.
    cover_image_base64 = None
    if update["cover_image_path"]:
        with open(update["cover_image_path"], "rb") as image_file:
            encoded_image = base64.b64encode(image_file.read()).decode("utf-8")
        cover_image_base64 = f"data:image/jpeg;base64,{encoded_image}"

    scene_input = {
        "id": update["scene_id"],
        "date": update["date"],
        "title": update["title"],
        "code": update["code"],
        "details": update["details"],
        "studio_id": update["studio_id"],
        "performer_ids": update["performer_ids"],
        "tag_ids": update["scene_tag_ids"],
        "urls": update["scene_urls"],
        "cover_image": cover_image_base64,
        "stash_ids": update["scene_stash_ids"]
    }
    if update["gallery_id"]:
        scene_input["gallery_ids"] = [update["gallery_id"]]

    try:
        stash_raw_client.update_scene(scene_input)
    except Exception as e:
        # Report and move on; the gallery is not touched when its scene update failed.
        print(f"Error updating scene for {update['scene_id']}: {e}")
        continue

    if update["gallery_id"]:
        gallery_input = {
            "id": update["gallery_id"],
            "date": update["date"],
            "title": update["title"],
            "code": update["code"],
            "details": update["details"],
            "studio_id": update["studio_id"],
            "performer_ids": update["performer_ids"],
            "tag_ids": update["gallery_tag_ids"],
            "urls": update["gallery_urls"]
        }
        try:
            stash_raw_client.update_gallery(gallery_input)
        except Exception as e:
            print(f"Error updating gallery for {update['scene_id']}: {e}")


# Setting metadata from Culture Extractor to unmatched scenes

This targets scenes which were not found in StashDB.


In [None]:
# Scenes on the exclusion list, joined with their Culture Extractor download row.
unmatched_joined_scenes = (
    stash_app_scenes
    .join(
        downloads,
        how="left",
        left_on="stashapp_primary_file_oshash",
        right_on="ce_downloads_hash_oshash",
        coalesce=False,
    )
    .filter(pl.col("stashapp_id").is_in(exclusion_list))
)
unmatched_joined_scenes


In [None]:
# Work on the first unmatched scene only (raises IndexError when there is none).
unmatched_scene = unmatched_joined_scenes.head(1).to_dicts()[0]
unmatched_scene

In [None]:
# Write the Culture Extractor metadata for that scene and tag it with the CE release UUID.
stash_raw_client.update_scene(dict(
    id=unmatched_scene["stashapp_id"],
    date=unmatched_scene["ce_downloads_release_date"].strftime("%Y-%m-%d"),
    title=unmatched_scene["ce_downloads_release_name"],
    code=unmatched_scene["ce_downloads_release_short_name"],
    details=unmatched_scene["ce_downloads_release_description"],
    urls=[unmatched_scene["ce_downloads_release_url"]],
    stash_ids=[
        dict(
            endpoint="https://culture.extractor/graphql",
            stash_id=unmatched_scene["ce_downloads_release_uuid"],
        ),
    ],
))

# Matching standalone galleries

In [None]:
# Look up StashApp galleries for every distinct SHA-256 among the downloads.
sha256_hashes = downloads.get_column("ce_downloads_hash_sha256").unique().to_list()
stash_app_galleries = stash_client.find_galleries_by_sha256(sha256_hashes)
stash_app_galleries

In [None]:
# Pair downloads with StashApp galleries on the file hash (default join type).
joined_galleries = downloads.join(
    stash_app_galleries,
    left_on="ce_downloads_hash_sha256",
    right_on="stashapp_primary_file_sha256",
    coalesce=False,
)
joined_galleries

In [None]:
# Quick visual check of release date/name against the gallery file name.
# Note: this rebinds df_verification, previously the scene verification frame.
df_verification = joined_galleries.select(
    "ce_downloads_release_date",
    "ce_downloads_release_name",
    "stashapp_primary_file_basename",
)
df_verification

In [None]:
# Re-read all StashApp performers and extract this site's "CultureExtractor.<short name>"
# custom field value (null when absent).
all_stashapp_performers = stash_client.get_performers()
all_stashapp_performers = all_stashapp_performers.with_columns(
    ce_custom_field_value=pl.col("stashapp_custom_fields")
    # Keep the value for the matching key, null for every other entry ...
    .list.eval(
        pl.when(
            pl.element().struct.field("key").eq(
                f"CultureExtractor.{selected_studio['ce_sites_short_name']}"
            )
        )
        .then(pl.element().struct.field("value"))
        .otherwise(None)
    )
    # ... then drop the nulls and take the first remaining value.
    .list.eval(pl.element().filter(pl.element().is_not_null()))
    .list.first()
)
print(f"Found {all_stashapp_performers.filter(pl.col('ce_custom_field_value').is_not_null()).height} performers with a custom field")

In [None]:
# Distinct (uuid, name) pairs of Culture Extractor performers across the joined galleries.
unique_gallery_performers = (
    joined_galleries
    .select("ce_downloads_performers")
    .explode("ce_downloads_performers")
    .select(
        performer_uuid=pl.col("ce_downloads_performers").struct.field("uuid"),
        performer_name=pl.col("ce_downloads_performers").struct.field("name"),
    )
    .unique()
    .sort(by="performer_name")
)
unique_gallery_performers

In [None]:
# Right join: every gallery performer is kept, with StashApp columns filled in
# when a performer carries the matching custom field value.
matching_performers = all_stashapp_performers.join(
    unique_gallery_performers,
    how="right",
    left_on="ce_custom_field_value",
    right_on="performer_uuid",
)

# Gallery performers without a StashApp counterpart (StashApp columns are null).
df_unmatched_performers = (
    matching_performers
    .filter(pl.col("stashapp_id").is_null())
    .select("performer_uuid", "performer_name")
)

print(f"Found {matching_performers.filter(pl.col('stashapp_id').is_not_null()).height} matching performers")
if df_unmatched_performers.height > 0:
    print(f"WARNING: {df_unmatched_performers.height} performers not found in Stash:")
    for row in df_unmatched_performers.iter_rows(named=True):
        print(f"  {row['performer_uuid']} {row['performer_name']}")

# Display the matched performers only.
matching_performers.filter(pl.col("stashapp_id").is_not_null())

In [None]:
# Manual one-off link: hard-coded StashApp performer id and Culture Extractor UUID.
stash_client.update_performer_custom_fields(
    4221,
    {
        "CultureExtractor." + selected_studio["ce_sites_short_name"]:
            "018b93bc-ae9b-7335-80e6-cdd24648a057",
    },
)


In [None]:
from libraries.performer_matcher import PerformerMatcher


# Create matcher instance over all Stash performers, then ask it for candidate
# matches for the gallery performers that had no Stash counterpart.
# NOTE(review): the following cell iterates the result and reads .stashapp_id,
# .stashapp_name and .ce_uuid on each candidate — confirm against PerformerMatcher.
matcher = PerformerMatcher(all_stashapp_performers)
gallery_performer_match_candidates = matcher.match_unmatched_performers(df_unmatched_performers)
gallery_performer_match_candidates

In [None]:
# Persist every candidate match: write the Culture Extractor UUID into the
# studio-specific custom field of the matched Stash performer.
candidate_field_key = "CultureExtractor." + selected_studio["ce_sites_short_name"]
for candidate in gallery_performer_match_candidates:
    stash_client.update_performer_custom_fields(
        candidate.stashapp_id,
        {candidate_field_key: candidate.ce_uuid},
    )
    print(f"Updated performer {candidate.stashapp_name} ({candidate.stashapp_id}) with {candidate.ce_uuid}")


In [None]:
# import psycopg2
#
# with psycopg2.connect(connection_string) as conn:
#     with conn.cursor() as cursor:
#         performer_id = "018b93bc-8f31-7449-849c-ec009f8064e3"
#         cursor.execute("DELETE FROM performers WHERE uuid = %s", (performer_id,))
#         conn.commit()
#         print(f"Deleted performer {performer_id}")

In [None]:
# Re-read the performers from Stash and build the lookup
# Culture Extractor performer UUID -> Stash performer id / name for this studio.
mapping_field_key = "CultureExtractor." + selected_studio["ce_sites_short_name"]

# Per list element: values of the custom fields whose key matches
matching_field_values = pl.element().struct.field("value").filter(
    pl.element().struct.field("key") == mapping_field_key
)

ce_performer_mapping = (
    stash_client.get_performers()
    .with_columns(
        pl.col("stashapp_custom_fields")
        .list.eval(matching_field_values)
        .list.first()
        .alias("ce_custom_field_value")
    )
    # Only performers that carry a UUID for this studio take part in the mapping
    .filter(pl.col("ce_custom_field_value").is_not_null())
    .select(
        pl.col("ce_custom_field_value").alias("ce_performer_uuid"),
        pl.col("stashapp_id"),
        pl.col("stashapp_name"),
    )
    .sort("stashapp_name")
)
ce_performer_mapping

In [None]:
# Create update inputs for reviewing and applying
def create_update_dataframe(joined_galleries, downloads, all_stashapp_performers, all_tags, stashapp_studio_id):
    """Build one row of proposed Stash metadata per joined gallery.

    Renames the Culture Extractor release columns of ``joined_galleries`` to
    the update field names, attaches ``stashapp_studio_id`` as a literal
    column and resolves each gallery's Culture Extractor performer UUIDs to
    Stash performer ids and names.

    ``downloads``, ``all_stashapp_performers`` and ``all_tags`` are accepted
    but not used. The performer lookup reads the notebook-level
    ``ce_performer_mapping`` frame, which must exist before this is called.

    Returns the resulting frame; the resolved Stash ids are in ``performer_ids``.
    """

    def mapped_performer_values(mapping_column, element_dtype, alias):
        # Expression yielding, per gallery, the unique `mapping_column` values of
        # the mapped performers; galleries without a UUID list get an empty list.
        def lookup(uuids):
            hits = ce_performer_mapping.filter(pl.col("ce_performer_uuid").is_in(uuids))
            return hits.get_column(mapping_column).unique().to_list()

        return (
            pl.when(pl.col("ce_performer_uuids").is_not_null())
            .then(
                pl.col("ce_performer_uuids").map_elements(
                    lookup,
                    return_dtype=pl.List(element_dtype),
                )
            )
            .otherwise(pl.Series([[]]))
            .alias(alias)
        )

    # Gallery fields renamed to the names used by the update records
    gallery_fields = joined_galleries.select(
        pl.col("ce_downloads_release_uuid").alias("ce_release_uuid"),
        pl.col("stashapp_id").alias("gallery_id"),
        pl.col("stashapp_primary_file_basename").alias("primary_file_basename"),
        pl.col("ce_downloads_release_date").alias("date"),
        pl.col("ce_downloads_release_name").alias("title"),
        pl.col("ce_downloads_release_short_name").alias("code"),
        pl.col("ce_downloads_release_description").alias("details"),
        pl.lit(stashapp_studio_id).alias("studio_id"),
        pl.col("ce_downloads_release_url").alias("url"),
        pl.col("ce_downloads_release_uuid"),
        pl.col("ce_downloads_performers"),
    )

    # Distinct Culture Extractor performer UUIDs per gallery
    with_performer_uuids = gallery_fields.with_columns(
        pl.col("ce_downloads_performers")
        .list.eval(pl.element().struct.field("uuid"))
        .list.unique()
        .alias("ce_performer_uuids")
    )

    # Stash ids and names of the performers that have a mapping
    with_stash_performers = with_performer_uuids.with_columns(
        mapped_performer_values("stashapp_id", pl.Int64, "ce_performer_stashapp_ids"),
        mapped_performer_values("stashapp_name", pl.Utf8, "ce_performer_stashapp_names"),
    )

    return with_stash_performers.with_columns(
        pl.concat_list([pl.col("ce_performer_stashapp_ids")]).list.unique().alias("performer_ids")
    )

def generate_update_inputs(updates_df, stash_raw_client):
    """Pair every proposed gallery update with the gallery's current Stash state.

    For each row of ``updates_df`` the gallery is re-fetched through
    ``stash_raw_client.find_gallery`` and a review record is built that holds
    the existing values (``existing_gallery_*``) next to the proposed ones.
    Performer ids and URLs are merged with what the gallery already has
    instead of replacing it.

    A gallery that cannot be found is treated as having no existing data, so a
    record is still produced for it. Missing (null/empty) URLs are left out of
    ``gallery_urls``, and merged lists keep first-seen order so repeated runs
    produce the same result.

    Args:
        updates_df: Frame with the columns produced by ``create_update_dataframe``.
        stash_raw_client: Client exposing ``find_gallery(gallery_id)``.

    Returns:
        A ``pl.DataFrame`` with one review record per input row.
    """
    updates = []

    for row in updates_df.iter_rows(named=True):
        # Current gallery state; an empty dict stands in for a missing gallery
        gallery = stash_raw_client.find_gallery(row["gallery_id"]) or {}

        existing_performer_ids = [int(performer["id"]) for performer in gallery.get("performers") or []]
        new_performer_ids = row["performer_ids"] if row["performer_ids"] is not None else []

        existing_urls = gallery.get("urls") or []
        proposed_urls = [
            row["url"],
            f"https://culture.extractor/galleries/{row['ce_downloads_release_uuid']}",
        ]

        updates.append({
            "ce_release_uuid": row["ce_release_uuid"],
            "gallery_id": row["gallery_id"],
            "primary_file_basename": row["primary_file_basename"],
            "existing_gallery_date": gallery.get("date", None),
            "date": row["date"].strftime("%Y-%m-%d") if row["date"] else None,
            "existing_gallery_title": gallery.get("title", None),
            "title": row["title"],
            "existing_gallery_code": gallery.get("code", None),
            "code": row["code"],
            "existing_gallery_details": gallery.get("details", None),
            "details": row["details"],
            "existing_gallery_studio_id": (gallery.get("studio") or {}).get("id"),
            "studio_id": row["studio_id"],
            "existing_gallery_performers": gallery.get("performers", []),
            # dict.fromkeys de-duplicates while keeping first-seen order
            "performer_ids": list(dict.fromkeys(existing_performer_ids + new_performer_ids)),
            "existing_gallery_urls": gallery.get("urls", []),
            "gallery_urls": list(dict.fromkeys(url for url in existing_urls + proposed_urls if url)),
        })

    return pl.DataFrame(updates)

# Build the proposed updates from the joined galleries
updates_df = create_update_dataframe(
    joined_galleries=joined_galleries,
    downloads=downloads,
    all_stashapp_performers=all_stashapp_performers,
    all_tags=all_tags,
    stashapp_studio_id=selected_studio["stash_studios_id"],
)

# Pair them with the current Stash state, ordered by release date for review
update_inputs_df = generate_update_inputs(updates_df, stash_raw_client).sort("date")

# Review updates before applying
print("Updates to be applied:")
print(update_inputs_df)

In [None]:
# Ad-hoc inspection: fetch one gallery by a hard-coded id and show what the client returns for it.
stash_raw_client.find_gallery(3768)

In [None]:
# Push every reviewed update to Stash; a failing gallery is reported and the
# loop carries on with the next one.
copied_fields = ("date", "title", "code", "details", "studio_id", "performer_ids")
for update in update_inputs_df.iter_rows(named=True):
    gallery_input = {"id": update["gallery_id"]}
    gallery_input.update((field, update[field]) for field in copied_fields)
    gallery_input["urls"] = update["gallery_urls"]
    try:
        stash_raw_client.update_gallery(gallery_input)
    except Exception as e:
        print(f"Error updating gallery for {update['gallery_id']}: {e}")

# Linking scenes and galleries by name

In [None]:
# Look up the studio by name; its "id" is used to filter the scene query in the next cell.
focus_studio = stash_raw_client.find_studio("SexArt")

In [None]:
# Both queries filter on the focus studio's id, so scenes and galleries always
# come from the same studio instead of relying on a hard-coded gallery studio id.
studio_id = focus_studio["id"]

stash_scenes = stash_raw_client.find_scenes(
    { "studios": { "value": [studio_id], "modifier": "INCLUDES" } },
    fragment="id title galleries { id title }"
)

# Convert to DataFrame
stash_scenes_df = pl.DataFrame(stash_scenes)

# Keep only the first gallery already linked to each scene (null when none)
stash_scenes_df = stash_scenes_df.select([
    pl.col("id").alias("scene_id"),
    pl.col("title").alias("scene_title"),
    pl.col("galleries").list.eval(
        pl.element().struct.field("id")
    ).list.first().alias("gallery_id"),
    pl.col("galleries").list.eval(
        pl.element().struct.field("title")
    ).list.first().alias("gallery_title")
])

stash_galleries = stash_raw_client.find_galleries(
    { "studios": { "value": [studio_id], "modifier": "INCLUDES" } },
    fragment="id title scenes { id title }"
)

# Convert to DataFrame
stash_galleries_df = pl.DataFrame(stash_galleries)

# Keep only the first scene already linked to each gallery (null when none)
stash_galleries_df = stash_galleries_df.select([
    pl.col("id").alias("gallery_id"),
    pl.col("title").alias("gallery_title"),
    pl.col("scenes").list.eval(
        pl.element().struct.field("id")
    ).list.first().alias("scene_id"),
    pl.col("scenes").list.eval(
        pl.element().struct.field("title")
    ).list.first().alias("scene_title")
])

# Pair scenes and galleries whose titles are identical
joined_scenes_and_galleries = stash_scenes_df.join(stash_galleries_df, left_on="scene_title", right_on="gallery_title", how="inner")
joined_scenes_and_galleries

# Performer checks

In [None]:
all_stashapp_performers = stash