In [1]:
# Importing metadata from Culture Extractor to StashApp
# 1. Import metadata from Culture Extractor
# 2. Import metadata from StashApp by oshash
# 3. Join the two on oshash
# 4. Query metadata from StashDB by phash
# 5. Join the three on phash
# 6. Match performers between Culture Extractor, StashApp and StashDB
# 7. Set Culture Extractor UUIDs to performer custom fields in StashApp
# 8. Set metadata to StashApp scenes

In [None]:
import libraries.client_culture_extractor as client_culture_extractor
import os
import polars as pl
from dotenv import load_dotenv

load_dotenv()

# Culture Extractor
# Connection settings are read from the environment (load_dotenv() above runs first).
user = os.environ.get("CE_DB_USERNAME")
pw = os.environ.get("CE_DB_PASSWORD")
host = os.environ.get("CE_DB_HOST")
port = os.environ.get("CE_DB_PORT")
db = os.environ.get("CE_DB_NAME")

# Fail fast: an unset variable would otherwise be formatted into the connection
# string as the literal text "None" and only surface later as a confusing
# connection or authentication error. Empty strings are still accepted.
missing_ce_settings = [
    name
    for name, value in {
        "CE_DB_USERNAME": user,
        "CE_DB_PASSWORD": pw,
        "CE_DB_HOST": host,
        "CE_DB_PORT": port,
        "CE_DB_NAME": db,
    }.items()
    if value is None
]
if missing_ce_settings:
    raise RuntimeError(
        "Missing Culture Extractor database settings: " + ", ".join(missing_ce_settings)
    )

connection_string = f"dbname={db} user={user} password={pw} host={host} port={port}"

culture_extractor_client = client_culture_extractor.ClientCultureExtractor(connection_string)


# StashApp
# Two client objects are created:
# - stash_client: the project's StashAppClient wrapper; later cells filter and join
#   its results with polars expressions.
# - stash_raw_client: whatever get_stashapp_client() returns; later cells call its
#   find_* / update_* methods directly.
from libraries.client_stashapp import StashAppClient, get_stashapp_client

stash_client = StashAppClient()
stash_raw_client = get_stashapp_client()


# StashDB
from libraries.StashDbClient import StashDbClient
import dotenv
import os

# `os` is already imported and load_dotenv() is already called earlier in this
# cell; repeating them here is redundant but harmless.
dotenv.load_dotenv()

# Endpoint and API key come from the environment; os.getenv returns None when a
# variable is unset, and that None is passed straight to StashDbClient.
stashbox_client = StashDbClient(
    os.getenv("STASHDB_ENDPOINT"),
    os.getenv("STASHDB_API_KEY"),
)


# Functions
def hex_to_binary(hex_string):
    """Render a hexadecimal string as a binary digit string, left-padded with zeros to at least 64 characters."""
    numeric_value = int(hex_string, 16)
    # Strip the two-character "0b" prefix that bin() puts in front of the digits.
    digits = bin(numeric_value)[2:]
    return digits.rjust(64, "0")

def calculate_hamming_distance(phash1, phash2):
    """Count the bit positions at which two hexadecimal phashes differ.

    Raises:
        ValueError: if the two binary expansions do not have the same length.
    """
    bits_a, bits_b = hex_to_binary(phash1), hex_to_binary(phash2)

    # The position-by-position comparison only makes sense for equal widths.
    if len(bits_a) != len(bits_b):
        raise ValueError("Binary strings must be of equal length")

    differing = 0
    for left_bit, right_bit in zip(bits_a, bits_b):
        if left_bit != right_bit:
            differing += 1
    return differing

# Example usage:
# phash1 = "951428607cf7cb8f"
# phash2 = "951428607cf7cb8e"
# distance = calculate_hamming_distance(phash1, phash2)
# print(f"Hamming distance between {phash1} and {phash2}: {distance}")

def levenshtein(s1, s2):
    """Return the Levenshtein distance between two strings, or None when either one is empty or None."""
    if not s1 or not s2:
        return None
    # Imported lazily, after the guard, so the package is only needed when a distance is computed.
    import Levenshtein
    return Levenshtein.distance(s1, s2)


In [2]:
# Fetch the StashApp tags once; a later cell reads each tag's "name" and "id"
# to translate tag names into StashApp tag IDs.
all_tags = stash_raw_client.find_tags()

In [None]:
# Load the Culture Extractor sites.
sites = culture_extractor_client.get_sites()
# Display the ce_sites_uuid of the rows whose name contains "Nubile Films" so it can be
# copied by hand (this cell does not write anything to the clipboard itself).
sites.filter(pl.col("ce_sites_name").str.contains("Nubile Films")).select(pl.col("ce_sites_uuid"))

In [4]:
# Attach a stash ID for the culture.extractor endpoint to StashApp studio 7.
# NOTE(review): the studio ID (7) and the UUID are hard-coded — presumably copied from the
# output of the neighbouring lookup cells; confirm both before running against another instance.
stash_client.set_studio_stash_id_for_endpoint(7, "https://culture.extractor/graphql", "018e8ed6-21fe-739d-8019-4203505a6f86")

In [None]:
# Load the StashApp studios and display the ones whose name contains "Nubile Films".
stash_studios = stash_client.get_studios()
stash_studios.filter(pl.col("stash_studios_name").str.contains("Nubile Films"))

In [6]:
# Left join on exact name equality: every Culture Extractor site is kept, and the StashApp
# studio columns are null where no studio has the same name. coalesce=False keeps both
# name columns in the result.
df_sites_joined = sites.join(stash_studios, left_on="ce_sites_name", right_on="stash_studios_name", how="left", coalesce=False)

In [None]:
# Look the studio up in StashApp, requesting its id, name, url and stash IDs.
refreshed_studio = stash_raw_client.find_studios(q="Nubile Films", fragment="id, name, url, stash_ids { endpoint, stash_id, updated_at }")
# The first result is taken as the studio to assign to scenes and galleries later on.
# NOTE(review): this raises IndexError when nothing is returned, and presumably `q` is a
# text search that can return several studios — verify that the first hit is the intended one.
stashapp_studio_id = refreshed_studio[0]["id"]
refreshed_studio

In [None]:
# Load the Culture Extractor downloads for the 'Nubile Films' site; later cells read the
# ce_downloads_* columns of this frame (oshash, release metadata, file and content type).
downloads = culture_extractor_client.get_downloads('Nubile Films')
downloads

In [None]:
# Collect the distinct oshash values of the downloads and pass them to
# find_scenes_by_oshash to fetch the corresponding StashApp scenes.
oshashes = downloads["ce_downloads_hash_oshash"].unique().to_list()
stash_app_scenes = stash_client.find_scenes_by_oshash(oshashes)
stash_app_scenes

In [10]:
# Left join on oshash: every StashApp scene is kept and gains the ce_downloads_* columns of the
# download with the same hash (null where none matches). coalesce=False keeps both key columns.
joined_scenes = stash_app_scenes.join(downloads, left_on="stashapp_primary_file_oshash", right_on="ce_downloads_hash_oshash", how="left", coalesce=False)

In [11]:
# List reserved for scene data (not used further in this cell)
scene_data = []

# One lookup record per joined scene: file path, phash and duration in whole seconds
scene_objects = joined_scenes.select(
    filename=pl.col("stashapp_primary_file_path"),
    phash=pl.col("stashapp_primary_file_phash"),
    duration=pl.col("stashapp_primary_file_duration").dt.total_seconds(),
).to_dicts()

batch_size = 100

# Query StashDB in slices of at most batch_size records, keeping one result per slice
stashdb_scene_batches = [
    stashbox_client.query_scenes_by_phash(scene_objects[start:start + batch_size])
    for start in range(0, len(scene_objects), batch_size)
]

# Stack the per-slice results into a single frame
df_stashdb_scenes = pl.concat(stashdb_scene_batches)

In [12]:
# Left join the StashDB query results onto the scenes by phash (queried_phash is the key on the
# StashDB side); scenes without a StashDB result keep nulls in the added columns.
# NOTE(review): this reassigns joined_scenes from itself, so re-running the cell joins the
# already-joined frame a second time — re-run the previous join cell first.
joined_scenes = joined_scenes.join(df_stashdb_scenes, left_on="stashapp_primary_file_phash", right_on="queried_phash", how="left", coalesce=False)

In [13]:
parquet_path = "joined_scenes_with_stashdb_scenes_20250105_1715.parquet"

# Snapshot cache for the joined scenes:
# - if the snapshot exists, load it (this replaces the joined_scenes computed above);
# - otherwise persist the freshly computed joined_scenes so the next run can reuse it,
#   instead of failing on a missing file.
if os.path.exists(parquet_path):
    joined_scenes = pl.read_parquet(parquet_path)
else:
    joined_scenes.write_parquet(parquet_path)

In [14]:
def calculate_duration_difference(stashapp_duration, stashdb_duration):
    """Build an expression for the absolute duration gap as a percentage of the larger duration.

    Both arguments are polars expressions. The result is null for rows where
    either duration is null.
    """
    both_present = stashapp_duration.is_not_null() & stashdb_duration.is_not_null()
    absolute_gap = (stashapp_duration - stashdb_duration).abs()
    larger_duration = pl.max_horizontal([stashapp_duration, stashdb_duration])
    return pl.when(both_present).then((absolute_gap / larger_duration) * 100).otherwise(None)

def calculate_title_similarity(ce_title, stashdb_title):
    """Build an expression for the Levenshtein distance between two title expressions.

    Args:
        ce_title: polars expression yielding the Culture Extractor title.
        stashdb_title: polars expression yielding the StashDB title.

    Returns:
        A polars expression of dtype Int64 holding the distance computed by
        `levenshtein`; null for rows where either title is null.
    """
    def _title_distance(row):
        # polars passes each struct row to map_elements as a dict keyed by field
        # name, so positional access (row[0]) raises KeyError. The two fields are
        # taken in struct order instead: CE title first, StashDB title second.
        left, right = row.values()
        if left is None or right is None:
            return None
        return levenshtein(str(left), str(right))

    return (
        pl.when(ce_title.is_not_null() & stashdb_title.is_not_null())
        .then(
            pl.struct([ce_title, stashdb_title])
            .map_elements(_title_distance, return_dtype=pl.Int64)
        )
        .otherwise(None)
    )

def get_date_difference_days(ce_date, stashdb_date):
    """Build an expression for the absolute gap in whole days between two date expressions.

    Both inputs are cast to Datetime before subtracting. The result is null
    for rows where either date is null.
    """
    ce_moment = ce_date.cast(pl.Datetime)
    stashdb_moment = stashdb_date.cast(pl.Datetime)
    gap_in_days = (ce_moment - stashdb_moment).dt.total_days().abs()

    both_present = ce_date.is_not_null() & stashdb_date.is_not_null()
    return pl.when(both_present).then(gap_in_days).otherwise(None)

# Comparison expressions: local file duration vs "duration", CE release name vs "title",
# CE release date vs "date"
duration_diff_expr = calculate_duration_difference(
    pl.col("stashapp_primary_file_duration"),
    pl.col("duration"),
)
title_distance_expr = pl.struct(["ce_downloads_release_name", "title"]).map_elements(
    lambda names: levenshtein(names["ce_downloads_release_name"], names["title"]),
    return_dtype=pl.Int64,
)
date_diff_expr = get_date_difference_days(
    pl.col("ce_downloads_release_date"),
    pl.col("date"),
)

# Step 1 adds the computed columns; step 2 derives the warning flags from them
# (over 5 % duration gap, over 5 title edits, over 7 days apart)
df_verification = joined_scenes.with_columns(
    duration_diff_pct=duration_diff_expr,
    title_levenshtein=title_distance_expr,
    date_diff_days=date_diff_expr,
).with_columns(
    duration_warning=pl.col("duration_diff_pct") > 5,
    title_warning=pl.col("title_levenshtein") > 5,
    date_warning=pl.col("date_diff_days") > 7,
)

In [None]:
# Keep scenes that have no stashapp_ce_id yet but do have a non-null "id" value
needs_ce_id = pl.col("stashapp_ce_id").is_null()
has_id = pl.col("id").is_not_null()
foo_filtered = joined_scenes.filter(needs_ce_id & has_id)
foo_filtered


In [20]:
# Load all StashApp performers and pull the value of the "CultureExtractor.nubilefilms"
# custom field into its own column (null when the performer has no such field).
all_stashapp_performers = stash_client.get_performers()
all_stashapp_performers = all_stashapp_performers.with_columns(
    pl.col("stashapp_custom_fields")
    .list.eval(
        # Map every custom field entry to its value if the key matches, else null.
        pl.when(pl.element().struct.field("key") == "CultureExtractor.nubilefilms")
        .then(pl.element().struct.field("value"))
        .otherwise(None)
    )
    # Drop the nulls produced for non-matching keys before taking the first element;
    # otherwise the value is only found when the CE field happens to be the first
    # custom field, and the performer is wrongly reported as unmatched.
    .list.drop_nulls()
    .list.first()
    .alias("ce_custom_field_value")
)

In [None]:
# Performers whose ce_custom_field_value is null, i.e. not yet linked to a Culture Extractor UUID.
unmatched_performers = all_stashapp_performers.filter(pl.col("ce_custom_field_value").is_null())
unmatched_performers

In [None]:
from libraries.performer_matcher import PerformerMatcher

# The matcher is constructed from the full StashApp performer frame
matcher = PerformerMatcher(all_stashapp_performers)

# Flat list of match records: one dict per match, tagged with the scene it came from
all_matches = []

for row in joined_scenes.iter_rows(named=True):
    # Wrap this scene's two performer lists in a one-row frame so they can be
    # handed to the matcher as columns
    scene_df = pl.DataFrame({
        'ce_downloads_performers': [row['ce_downloads_performers']],
        'stashapp_performers': [row['stashapp_performers']],
    })

    matches = matcher.match_performers(
        scene_df['ce_downloads_performers'],
        scene_df['stashapp_performers'],
    )

    # Record every match together with the scene's id and title
    all_matches.extend(
        {
            'scene_id': row['stashapp_id'],
            'scene_title': row['stashapp_title'],
            'ce_uuid': match.ce_uuid,
            'ce_name': match.ce_name,
            'stashapp_id': match.stashapp_id,
            'stashapp_name': match.stashapp_name,
            'stashdb_uuid': match.stashdb_uuid,
            'stashdb_name': match.stashdb_name,
            'confidence': match.confidence,
            'reason': match.reason,
        }
        for match in matches
    )

# Tabulate the collected matches for review
matches_df = pl.DataFrame(all_matches)
matches_df

In [68]:
# Write each distinct (ce_uuid, stashapp_id) pair to the performer's custom fields
performer_links = matches_df.select("ce_uuid", "stashapp_id").unique()
for ce_uuid, stashapp_id in performer_links.iter_rows():
    stash_client.update_performer_custom_fields(stashapp_id, {"CultureExtractor.nubilefilms": ce_uuid})


In [None]:
import base64

def create_update_dataframe(joined_scenes, downloads, all_stashapp_performers, all_tags, stashapp_studio_id):
    """Assemble one row of update data per scene.

    Args:
        joined_scenes: frame holding the stashapp_*, ce_downloads_* and StashDB
            ("id", "tags") columns selected below.
        downloads: Culture Extractor downloads, used to find scene images and galleries.
        all_stashapp_performers: performers with a "ce_custom_field_value" column.
        all_tags: iterable of StashApp tags, each with "name" and "id".
        stashapp_studio_id: studio ID written to every row.

    Returns:
        A polars DataFrame with the scene fields, the resolved "performer_ids"
        and "tag_ids", and the joined "scene_image_filename" / "gallery_hash".
    """
    # Lookup table: Culture Extractor performer UUID -> StashApp performer ID
    linked_performers = all_stashapp_performers.filter(
        pl.col("ce_custom_field_value").is_not_null()
    ).select(["ce_custom_field_value", "stashapp_id"])

    # Lookup table: tag name -> StashApp tag ID
    tag_lookup = pl.DataFrame({
        "stashdb_name": [tag["name"] for tag in all_tags],
        "stashapp_id": [tag["id"] for tag in all_tags],
    })

    def extract_uuids(performers):
        return [performer["uuid"] for performer in performers]

    def lookup_performer_ids(uuids):
        matching = linked_performers.filter(pl.col("ce_custom_field_value").is_in(uuids))
        return matching.select("stashapp_id").to_series().to_list()

    def lookup_tag_ids(tags):
        wanted_names = [tag["name"] for tag in tags]
        matching = tag_lookup.filter(pl.col("stashdb_name").is_in(wanted_names))
        return matching.select("stashapp_id").to_series().to_list()

    # Scene fields first, then performer UUIDs -> performer IDs -> tag IDs
    updates_df = (
        joined_scenes.select(
            pl.col("stashapp_id").alias("scene_id"),
            pl.col("stashapp_primary_file_path").alias("scene_name"),
            pl.col("ce_downloads_release_date").alias("date"),
            pl.col("ce_downloads_release_name").alias("title"),
            pl.col("ce_downloads_release_short_name").alias("code"),
            pl.col("ce_downloads_release_description").alias("details"),
            pl.lit(stashapp_studio_id).alias("studio_id"),
            pl.col("ce_downloads_release_url").alias("url"),
            pl.col("ce_downloads_release_uuid"),
            pl.col("id").alias("stashdb_id"),
            pl.col("ce_downloads_performers"),
            pl.col("tags").alias("stashdb_tags"),
        )
        .with_columns(
            pl.col("ce_downloads_performers")
            .map_elements(extract_uuids, return_dtype=pl.List(pl.Utf8))
            .alias("ce_performer_uuids")
        )
        .with_columns(
            pl.col("ce_performer_uuids")
            .map_elements(lookup_performer_ids, return_dtype=pl.List(pl.Int64))
            .alias("performer_ids")
        )
        .with_columns(
            pl.col("stashdb_tags")
            .map_elements(lookup_tag_ids, return_dtype=pl.List(pl.Utf8))
            .alias("tag_ids")
        )
    )

    # Image downloads supply the cover image filename per release
    is_image = pl.col("ce_downloads_file_type") == "image"
    scene_images = downloads.filter(is_image).select(
        pl.col("ce_downloads_release_uuid"),
        pl.col("ce_downloads_saved_filename").alias("scene_image_filename"),
    )

    # "Large" gallery downloads supply the gallery hash per release
    is_large_gallery = (pl.col("ce_downloads_content_type") == "gallery") & (
        pl.col("ce_downloads_variant") == "Large"
    )
    galleries = downloads.filter(is_large_gallery).select(
        pl.col("ce_downloads_release_uuid"),
        pl.col("ce_downloads_hash_sha256").alias("gallery_hash"),
    )

    # Attach image and gallery info by release UUID, keeping every scene row
    with_images = updates_df.join(scene_images, on="ce_downloads_release_uuid", how="left")
    return with_images.join(galleries, on="ce_downloads_release_uuid", how="left")

def _unique_in_order(values):
    """Return values without None entries or duplicates, keeping first-seen order."""
    return list(dict.fromkeys(value for value in values if value is not None))


def generate_update_inputs(updates_df, stash_raw_client, metadata_dir="F:\\Ripping\\Nubile Films\\Metadata"):
    """Build the update payload for every scene, merged with its current StashApp state.

    Args:
        updates_df: frame produced by create_update_dataframe.
        stash_raw_client: client used to read the current scene and gallery data.
        metadata_dir: directory holding one sub-directory per release UUID with
            the scene image files. Defaults to the previously hard-coded location.

    Returns:
        A polars DataFrame with one row per input row, ready to review and apply.

    Raises:
        OSError: if a scene image file cannot be read.
    """
    updates = []

    for row in updates_df.iter_rows(named=True):
        # Current scene state, so existing tags, URLs and stash IDs are preserved.
        refreshed_scene = stash_raw_client.find_scene(row["scene_id"])

        # Load the scene image; the context manager closes the file handle,
        # which the previous bare open(...).read() never did.
        image_path = os.path.join(
            metadata_dir,
            row["ce_downloads_release_uuid"],
            row["scene_image_filename"]
        )
        with open(image_path, "rb") as image_file:
            scene_image_base64 = base64.b64encode(image_file.read()).decode("utf-8")

        # A gallery is only linked when the hash search returns exactly one result.
        gallery_id = None
        gallery_urls = []
        if row["gallery_hash"]:
            found_galleries = stash_raw_client.find_galleries(q=row["gallery_hash"])
            if len(found_galleries) == 1:
                gallery_id = found_galleries[0]["id"]
                refreshed_gallery = stash_raw_client.find_gallery(gallery_id)
                gallery_urls = refreshed_gallery.get("urls") or []

        # Treat missing or null collections as empty.
        existing_tag_ids = [tag["id"] for tag in refreshed_scene.get("tags") or []]
        new_tag_ids = list(row["tag_ids"]) if row["tag_ids"] is not None else []
        existing_urls = refreshed_scene.get("urls") or []
        existing_stash_ids = refreshed_scene.get("stash_ids") or []

        # Only add stash IDs that actually have a value; a null StashDB id
        # must not become a {"stash_id": None} entry.
        new_stash_ids = [
            {"endpoint": endpoint, "stash_id": stash_id}
            for endpoint, stash_id in (
                ("https://stashdb.org/graphql", row["stashdb_id"]),
                ("https://culture.extractor/graphql", row["ce_downloads_release_uuid"]),
            )
            if stash_id is not None
        ]
        # Deduplicate by (endpoint, stash_id); a new entry replaces an existing one with the same key.
        scene_stash_ids = list({
            (stash_id["endpoint"], stash_id["stash_id"]): stash_id
            for stash_id in existing_stash_ids + new_stash_ids
        }.values())

        update = {
            "scene_id": row["scene_id"],
            "scene_name": row["scene_name"],
            "gallery_id": gallery_id,
            "date": row["date"].strftime("%Y-%m-%d") if row["date"] else None,
            "title": row["title"],
            "code": row["code"],
            "details": row["details"],
            "studio_id": row["studio_id"],
            "performer_ids": row["performer_ids"] if row["performer_ids"] is not None else [],
            # Order-preserving dedupe keeps the payload deterministic between runs.
            "tag_ids": _unique_in_order(existing_tag_ids + new_tag_ids),
            # Dedupe URLs so re-running the update does not append the same URL again,
            # and drop a null release URL instead of sending it.
            "scene_urls": _unique_in_order(existing_urls + [row["url"]]),
            "gallery_urls": _unique_in_order(gallery_urls + [
                row["url"],
                f"https://culture.extractor/galleries/{row['ce_downloads_release_uuid']}"
            ]) if gallery_id else None,
            "cover_image": f"data:image/jpeg;base64,{scene_image_base64}",
            "scene_stash_ids": scene_stash_ids
        }
        updates.append(update)

    return pl.DataFrame(updates)

# Usage
# Build the per-scene update rows from the filtered scenes (foo_filtered), the downloads,
# the performers, the tag list and the studio ID gathered in earlier cells.
updates_df = create_update_dataframe(
    foo_filtered,
    downloads,
    all_stashapp_performers,
    all_tags,
    stashapp_studio_id
)

# Turn each row into a full update payload; this reads the current scene and gallery state
# through stash_raw_client and loads each scene image from disk.
update_inputs_df = generate_update_inputs(updates_df, stash_raw_client)

# Review updates before applying
# Nothing is written to StashApp in this cell; the updates are applied in the next one.
print("Updates to be applied:")
print(update_inputs_df)


In [27]:
# Apply updates if everything looks good
# Fields that are written with the same value to the scene and to its gallery
shared_keys = ("date", "title", "code", "details", "studio_id", "performer_ids", "tag_ids")

for update in update_inputs_df.iter_rows(named=True):
    shared_fields = {key: update[key] for key in shared_keys}
    gallery_id = update["gallery_id"]

    # Scene payload: shared fields plus the scene-only URLs, cover image and stash IDs
    scene_input = {
        "id": update["scene_id"],
        **shared_fields,
        "urls": update["scene_urls"],
        "cover_image": update["cover_image"],
        "stash_ids": update["scene_stash_ids"],
    }
    if gallery_id:
        scene_input["gallery_ids"] = [gallery_id]

    stash_raw_client.update_scene(scene_input)

    # Gallery payload, only when a gallery was resolved for this scene
    if gallery_id:
        gallery_input = {
            "id": gallery_id,
            **shared_fields,
            "urls": update["gallery_urls"],
        }
        stash_raw_client.update_gallery(gallery_input)