In [1]:
# Importing metadata from Culture Extractor to StashApp
# 1. Import metadata from Culture Extractor
# 2. Import metadata from StashApp by oshash
# 3. Join the two on oshash
# 4. Query metadata from StashDB by phash
# 5. Join the three on phash
# 6. Match performers between Culture Extractor, StashApp and StashDB
# 7. Set Culture Extractor UUIDs to performer custom fields in StashApp
# 8. Set metadata to StashApp scenes

In [None]:
import libraries.client_culture_extractor as client_culture_extractor
import os
import polars as pl
from dotenv import load_dotenv

load_dotenv()

# --- Culture Extractor -------------------------------------------------------
# Database settings are read from the environment. os.environ.get returns None
# for an unset variable, and None would be interpolated into the connection
# string below as the text "None".
user = os.environ.get("CE_DB_USERNAME")
pw = os.environ.get("CE_DB_PASSWORD")
host = os.environ.get("CE_DB_HOST")
port = os.environ.get("CE_DB_PORT")
db = os.environ.get("CE_DB_NAME")

# Space-separated key=value connection string (contains the password, so avoid displaying it).
connection_string = f"dbname={db} user={user} password={pw} host={host} port={port}"

culture_extractor_client = client_culture_extractor.ClientCultureExtractor(connection_string)


# --- StashApp ----------------------------------------------------------------
from libraries.client_stashapp import StashAppClient, get_stashapp_client

# Two handles are used throughout the notebook: the project wrapper (stash_client)
# and the client returned by get_stashapp_client (stash_raw_client).
stash_client = StashAppClient()
stash_raw_client = get_stashapp_client()


# --- StashDB -----------------------------------------------------------------
from libraries.StashDbClient import StashDbClient
# NOTE(review): os and dotenv are already imported / loaded at the top of this cell;
# these repeated lines are redundant.
import dotenv
import os

dotenv.load_dotenv()

# Endpoint and API key come from the environment (None when unset).
stashbox_client = StashDbClient(
    os.getenv("STASHDB_ENDPOINT"),
    os.getenv("STASHDB_API_KEY"),
)


# Functions
def hex_to_binary(hex_string):
    """Convert a hexadecimal string to its binary digits as a string.

    The result is left-padded with zeros to a minimum width of 64 characters;
    values that need more than 64 bits are returned at their full length.
    """
    numeric_value = int(hex_string, 16)
    binary_digits = bin(numeric_value)[2:]  # drop the "0b" prefix added by bin()
    return binary_digits.zfill(64)

def calculate_hamming_distance(phash1, phash2):
    """Count the bit positions at which two hexadecimal phashes differ.

    Both hashes are expanded to binary strings with hex_to_binary and compared
    position by position.

    Raises:
        ValueError: if the two binary expansions differ in length.
    """
    bits_a, bits_b = hex_to_binary(phash1), hex_to_binary(phash2)

    # A positional comparison is only meaningful for equally long bit strings
    if len(bits_a) != len(bits_b):
        raise ValueError("Binary strings must be of equal length")

    differing_positions = 0
    for bit_a, bit_b in zip(bits_a, bits_b):
        if bit_a != bit_b:
            differing_positions += 1
    return differing_positions

# Example usage:
# phash1 = "951428607cf7cb8f"
# phash2 = "951428607cf7cb8e"
# distance = calculate_hamming_distance(phash1, phash2)
# print(f"Hamming distance between {phash1} and {phash2}: {distance}")

def levenshtein(s1, s2):
    """Return the Levenshtein distance between s1 and s2.

    Returns None when either argument is falsy (None or an empty string), so
    missing titles produce a missing distance instead of a number.
    """
    if not (s1 and s2):
        return None

    # Imported lazily so the third-party package is only needed when a distance is actually computed
    import Levenshtein
    return Levenshtein.distance(s1, s2)


In [181]:
# Fetch the StashApp tags; the scene-metadata cell further down looks up tag ids in this
# list by tag name, so this cell must have been run before it.
all_tags = stash_raw_client.find_tags()

In [None]:
# Load the Culture Extractor sites.
sites = culture_extractor_client.get_sites()
# Display the UUID column for sites whose name contains "Nubile Films".
# This only shows the value as cell output; nothing is copied to the clipboard automatically.
sites.filter(pl.col("ce_sites_name").str.contains("Nubile Films")).select(pl.col("ce_sites_uuid"))

In [3]:
# Attach a Culture Extractor stash id to StashApp studio 7.
# NOTE(review): the studio id and the UUID are hard-coded; presumably the UUID is the
# ce_sites_uuid shown by the sites lookup cell and 7 is the "Nubile Films" studio - confirm before re-running.
stash_client.set_studio_stash_id_for_endpoint(7, "https://culture.extractor/graphql", "018e8ed6-21fe-739d-8019-4203505a6f86")

In [None]:
# Load the StashApp studios and display the ones whose name contains "Nubile Films".
stash_studios = stash_client.get_studios()
stash_studios.filter(pl.col("stash_studios_name").str.contains("Nubile Films"))

In [5]:
# Left-join Culture Extractor sites to StashApp studios on exact name equality;
# sites without a same-named studio keep null studio columns.
df_sites_joined = sites.join(stash_studios, left_on="ce_sites_name", right_on="stash_studios_name", how="left", coalesce=False)

In [None]:
# Re-query the studio from StashApp, requesting its stash ids as well.
refreshed_studio = stash_raw_client.find_studios(q="Nubile Films", fragment="id, name, url, stash_ids { endpoint, stash_id, updated_at }")
# The first search hit is used as the studio for all later scene/gallery updates.
# NOTE(review): raises IndexError when the search returns nothing, and assumes the first
# hit is the intended studio - check the displayed result below.
stashapp_studio_id = refreshed_studio[0]["id"]
refreshed_studio

In [None]:
# Load the Culture Extractor downloads for the 'Nubile Films' site and display them.
downloads = culture_extractor_client.get_downloads('Nubile Films')
downloads

In [None]:
# Distinct oshash values of the downloads.
# NOTE(review): if the column contains nulls, None will be part of this list - confirm the client tolerates that.
oshashes = downloads["ce_downloads_hash_oshash"].unique().to_list()
# Look up the StashApp scenes that have one of these oshashes and display them.
stash_app_scenes = stash_client.find_scenes_by_oshash(oshashes)
stash_app_scenes

In [9]:
# Attach the Culture Extractor download columns to each StashApp scene by matching the
# scene's primary-file oshash against the download oshash (left join: every StashApp scene is kept).
joined_scenes = stash_app_scenes.join(downloads, left_on="stashapp_primary_file_oshash", right_on="ce_downloads_hash_oshash", how="left", coalesce=False)

In [10]:
# List reserved for scene data (nothing in this cell fills it)
scene_data = []

# One lookup record per joined scene: file path, phash and duration in seconds
scene_objects = (
    joined_scenes
    .select(
        pl.col("stashapp_primary_file_path").alias("filename"),
        pl.col("stashapp_primary_file_phash").alias("phash"),
        pl.col("stashapp_primary_file_duration").dt.total_seconds().alias("duration"),
    )
    .to_dicts()
)

# Number of scenes sent to StashDB per request
batch_size = 100

# Query StashDB slice by slice and keep each batch's result
stashdb_scene_batches = [
    stashbox_client.query_scenes_by_phash(scene_objects[start:start + batch_size])
    for start in range(0, len(scene_objects), batch_size)
]

# Stack the per-batch results into a single frame
df_stashdb_scenes = pl.concat(stashdb_scene_batches)

In [11]:
# Attach the StashDB results to each scene by phash. This is a left join, so scenes
# without a StashDB match are kept with null StashDB columns.
joined_scenes_with_stashdb_scenes = joined_scenes.join(df_stashdb_scenes, left_on="stashapp_primary_file_phash", right_on="queried_phash", how="left", coalesce=False)

In [24]:
# Snapshot of the joined Culture Extractor / StashApp / StashDB scene table.
parquet_path = "joined_scenes_with_stashdb_scenes_20250105_1029.parquet"

if os.path.exists(parquet_path):
    # A snapshot exists: load it, replacing whatever the cells above computed.
    joined_scenes_with_stashdb_scenes = pl.read_parquet(parquet_path)
else:
    # No snapshot yet: persist the freshly joined table so later sessions can reuse it
    # instead of this cell failing on the missing file.
    joined_scenes_with_stashdb_scenes.write_parquet(parquet_path)

In [12]:
def calculate_duration_difference(stashapp_duration, stashdb_duration):
    """Build an expression for the relative difference between two durations, in percent.

    The absolute difference is divided by the larger of the two values and
    multiplied by 100. Rows where either duration is null yield null.

    Args:
        stashapp_duration: polars expression for the StashApp duration.
        stashdb_duration: polars expression for the StashDB duration.
    """
    both_present = stashapp_duration.is_not_null() & stashdb_duration.is_not_null()
    absolute_gap = (stashapp_duration - stashdb_duration).abs()
    longer_duration = pl.max_horizontal([stashapp_duration, stashdb_duration])
    percent_gap = (absolute_gap / longer_duration) * 100
    return pl.when(both_present).then(percent_gap).otherwise(None)

def calculate_title_similarity(ce_title, stashdb_title):
    """Build an expression for the Levenshtein distance between two title expressions.

    Args:
        ce_title: polars expression for the Culture Extractor title.
        stashdb_title: polars expression for the StashDB title.

    Returns:
        An Int64 expression holding the edit distance; null where either title is null.
    """
    def _title_distance(row):
        # Struct elements reach map_elements as dicts keyed by field name, so integer
        # indexing (row[0]) raises KeyError. Read the two titles positionally from the
        # dict values instead; their order follows the pl.struct call below.
        first_title, second_title = row.values()
        return levenshtein(str(first_title), str(second_title))

    return (
        pl.when(ce_title.is_not_null() & stashdb_title.is_not_null())
        .then(
            pl.struct([ce_title, stashdb_title])
            .map_elements(_title_distance, return_dtype=pl.Int64)
        )
        .otherwise(None)
    )

def get_date_difference_days(ce_date, stashdb_date):
    """Build an expression for the absolute gap between two dates, in days.

    Both inputs are cast to pl.Datetime before subtraction and the gap is read
    with dt.total_days(). Rows where either date is null yield null.

    Args:
        ce_date: polars expression for the Culture Extractor release date.
        stashdb_date: polars expression for the StashDB date.
    """
    both_dates_known = ce_date.is_not_null() & stashdb_date.is_not_null()
    signed_gap = ce_date.cast(pl.Datetime) - stashdb_date.cast(pl.Datetime)
    absolute_days = signed_gap.dt.total_days().abs()
    return pl.when(both_dates_known).then(absolute_days).otherwise(None)

# Comparison metrics between the local scene data and the StashDB match
verification_metrics = [
    # Relative duration gap in percent
    calculate_duration_difference(
        pl.col("stashapp_primary_file_duration"),
        pl.col("duration"),
    ).alias("duration_diff_pct"),

    # Edit distance between the Culture Extractor release name and the StashDB title
    pl.struct(["ce_downloads_release_name", "title"])
    .map_elements(
        lambda titles: levenshtein(titles["ce_downloads_release_name"], titles["title"]),
        return_dtype=pl.Int64,
    )
    .alias("title_levenshtein"),

    # Gap in days between the two release dates
    get_date_difference_days(
        pl.col("ce_downloads_release_date"),
        pl.col("date"),
    ).alias("date_diff_days"),
]

# Boolean flags that mark a metric as suspicious once it exceeds its threshold
warning_flags = [
    (pl.col("duration_diff_pct") > 5).alias("duration_warning"),
    (pl.col("title_levenshtein") > 5).alias("title_warning"),
    (pl.col("date_diff_days") > 7).alias("date_warning"),
]

# The flags read the metric columns, so they are added in a second step
df_verification = (
    joined_scenes_with_stashdb_scenes
    .with_columns(verification_metrics)
    .with_columns(warning_flags)
)

In [66]:
# Load all StashApp performers and pull the Culture Extractor UUID out of their custom fields.
all_stashapp_performers = stash_client.get_performers()
all_stashapp_performers = all_stashapp_performers.with_columns(
    pl.col("stashapp_custom_fields")
    .list.eval(
        # Map every custom field to its value when it is the Culture Extractor key, else null ...
        pl.when(pl.element().struct.field("key") == "CultureExtractor.nubilefilms")
        .then(pl.element().struct.field("value"))
        .otherwise(None)
        # ... and drop the nulls, so the matching field is found wherever it sits in the
        # list. Without this, taking the first element returned null whenever another
        # custom field came before the Culture Extractor one.
        .drop_nulls()
    )
    .list.first()
    .alias("ce_custom_field_value")
)

In [None]:
# Performers that do not yet carry a Culture Extractor UUID in their custom fields.
unmatched_performers = all_stashapp_performers.filter(pl.col("ce_custom_field_value").is_null())
unmatched_performers

In [None]:
from libraries.performer_matcher import PerformerMatcher

# Build the matcher from the StashApp performer table loaded earlier.
# PerformerMatcher is project code; its matching rules are not visible in this notebook.
matcher = PerformerMatcher(all_stashapp_performers)

# Collects one flat dict per performer match, tagged with the scene it came from.
all_matches = []

# Match performers scene by scene.
for row in joined_scenes_with_stashdb_scenes.iter_rows(named=True):
    # Performer lists of this scene as recorded by Culture Extractor and by StashApp
    ce_performers = row['ce_downloads_performers']
    stashapp_performers = row['stashapp_performers']

    # Wrap both lists in a one-row DataFrame so they reach the matcher as polars Series
    scene_df = pl.DataFrame({
        'ce_downloads_performers': [ce_performers],
        'stashapp_performers': [stashapp_performers]
    })

    # Run matching for this scene
    matches = matcher.match_performers(
        scene_df['ce_downloads_performers'],
        scene_df['stashapp_performers']
    )

    # Flatten each match and add the scene id/title as context
    for match in matches:
        all_matches.append({
            'scene_id': row['stashapp_id'],
            'scene_title': row['stashapp_title'],
            'ce_uuid': match.ce_uuid,
            'ce_name': match.ce_name,
            'stashapp_id': match.stashapp_id,
            'stashapp_name': match.stashapp_name,
            'stashdb_uuid': match.stashdb_uuid,
            'stashdb_name': match.stashdb_name,
            'confidence': match.confidence,
            'reason': match.reason
        })

# Convert matches to a DataFrame for analysis.
# NOTE(review): with no matches at all this frame has no columns, and the column
# selection in the write-back cell would fail.
matches_df = pl.DataFrame(all_matches)
matches_df

In [68]:
# Write the Culture Extractor UUID into the custom fields of every matched StashApp
# performer, once per distinct (ce_uuid, stashapp_id) pair.
# NOTE(review): this modifies StashApp data, and no confidence filter or null check is
# applied to the matches - review matches_df before running.
for row in matches_df.select(pl.col(["ce_uuid", "stashapp_id"])).unique().iter_rows(named=True):
    stash_client.update_performer_custom_fields(row["stashapp_id"], {"CultureExtractor.nubilefilms": row["ce_uuid"]})


In [None]:
# Set metadata to StashApp scenes
import json
import base64

# StashApp id of the scene to update; its joined row is round-tripped through JSON to get plain Python values
selected_id = 25192
selected = json.loads(joined_scenes_with_stashdb_scenes.filter(pl.col("stashapp_id") == selected_id).write_json())[0]

# First, get all performer UUIDs from the selected download
ce_performer_uuids = [performer["uuid"] for performer in selected["ce_downloads_performers"]]

# StashApp performers whose Culture Extractor custom field matches one of those UUIDs
stashapp_performer_ids_for_scene_list = all_stashapp_performers.filter(
    pl.col("ce_custom_field_value").is_in(ce_performer_uuids)
).select(pl.col("stashapp_id")).to_series(0).to_list()

# Refresh the scene from StashApp
refreshed_scene = stash_raw_client.find_scene(selected["stashapp_id"])

# Scene image: the first "image" download of the release, stored under the release's metadata directory
# NOTE(review): the metadata root is a hard-coded local Windows path.
release_directory = os.path.join("F:\\Ripping\\Nubile Films\\Metadata", selected["ce_downloads_release_uuid"])
scene_image_path = os.path.join(release_directory, downloads.filter(pl.col("ce_downloads_release_uuid") == selected["ce_downloads_release_uuid"], pl.col("ce_downloads_file_type") == "image").select(pl.col("ce_downloads_saved_filename")).to_series(0).to_list()[0])

# Read the image through a context manager so the file handle is closed right away
with open(scene_image_path, "rb") as scene_image_file:
    scene_image_base64 = base64.b64encode(scene_image_file.read()).decode("utf-8")

# Find gallery download: the "Large" gallery variant of the same release, searched in StashApp by its SHA-256
galleries = downloads.filter(pl.col("ce_downloads_release_uuid") == selected["ce_downloads_release_uuid"], pl.col("ce_downloads_content_type") == "gallery", pl.col("ce_downloads_variant") == "Large")
gallery_sha256 = galleries.select(pl.col("ce_downloads_hash_sha256")).to_series(0).to_list()[0]

found_galleries = stash_raw_client.find_galleries(q=gallery_sha256)

# Only an unambiguous single hit is accepted as the scene's gallery
if len(found_galleries) == 1:
    selected_gallery = found_galleries[0]
    refreshed_gallery = stash_raw_client.find_gallery(selected_gallery["id"])
else:
    selected_gallery = None
    refreshed_gallery = None

# Map StashDB tag names to StashApp tag IDs.
# The StashDB columns come from a left join, so "tags" is null for a scene without a
# StashDB match; treat that as "no tags" instead of failing while iterating None.
stashdb_tag_names = {stashdb_tag["name"] for stashdb_tag in (selected["tags"] or [])}
selected_tag_ids = [tag["id"] for tag in all_tags if tag["name"] in stashdb_tag_names]
selected_tag_ids

In [None]:
# Fields shared by the scene update and the gallery update.
stash_update_input = {
    "date": selected["ce_downloads_release_date"],
    "title": selected["ce_downloads_release_name"],
    "code": selected["ce_downloads_release_short_name"],
    "details": selected["ce_downloads_release_description"],
    "studio_id": stashapp_studio_id,
    # Keep the tags already on the scene and add the ones matched from StashDB
    "tag_ids": list(set([tag["id"] for tag in refreshed_scene["tags"]] + selected_tag_ids)),
    "performer_ids": stashapp_performer_ids_for_scene_list,
}

# Stash ids to add. The StashDB columns come from a left join, so "id" is null when the
# scene has no StashDB match; in that case no StashDB stash id is added.
new_stash_ids = []
if selected["id"] is not None:
    new_stash_ids.append({
        "endpoint": "https://stashdb.org/graphql",
        "stash_id": selected["id"]
    })
new_stash_ids.append({
    "endpoint": "https://culture.extractor/graphql",
    "stash_id": selected["ce_downloads_release_uuid"]
})

stash_scene_update_input = {
    "id": selected["stashapp_id"],
    # dict.fromkeys de-duplicates while keeping order, so re-running the update does
    # not append the release URL a second time
    "urls": list(dict.fromkeys(refreshed_scene["urls"] + [
        selected["ce_downloads_release_url"],
    ])),
    "cover_image": f"data:image/jpeg;base64,{scene_image_base64}",
    # Existing and new stash ids, de-duplicated on (endpoint, stash_id)
    "stash_ids": list({
        (stash_id["endpoint"], stash_id["stash_id"]): stash_id
        for stash_id in refreshed_scene["stash_ids"] + new_stash_ids
    }.values()),
    **stash_update_input,
}

# Preview the payloads; the base64 cover image is left out of the preview to keep the cell output readable
print(json.dumps(stash_update_input, indent=4))
print(json.dumps({**stash_scene_update_input, "cover_image": "<base64 image omitted>"}, indent=4))

if selected_gallery:
    stash_gallery_update_input = {
        "id": selected_gallery["id"],
        # Same order-preserving de-duplication as for the scene URLs
        "urls": list(dict.fromkeys(refreshed_gallery["urls"] + [
            selected["ce_downloads_release_url"],
            "https://culture.extractor/galleries/" + selected["ce_downloads_release_uuid"]
        ])),
        **stash_update_input,
    }

    print(json.dumps(stash_gallery_update_input, indent=4))

In [None]:
# Apply the prepared scene update (this writes to StashApp).
stash_raw_client.update_scene(stash_scene_update_input)

if selected_gallery:
    # Link the matched gallery to the scene in a second scene update.
    # NOTE(review): gallery_ids contains only this gallery - confirm whether that
    # replaces galleries already linked to the scene.
    stash_raw_client.update_scene({
        "id": selected["stashapp_id"],
        "gallery_ids": [selected_gallery["id"]]
    })

    # Then apply the shared metadata and the URLs to the gallery itself.
    stash_raw_client.update_gallery(stash_gallery_update_input)