In [1]:
import psycopg
import pandas as pd

from dotenv import load_dotenv
import os

# Load the .env file
load_dotenv()

# Connection settings are read from the process environment (populated by the
# load_dotenv() call above). A variable that is not set comes back as None and
# is rendered literally as "None" in the string below.
db = os.environ.get("CE_DB_NAME")
user = os.environ.get("CE_DB_USERNAME")
pw = os.environ.get("CE_DB_PASSWORD")
host = os.environ.get("CE_DB_HOST")
port = os.environ.get("CE_DB_PORT")

# Space-separated key=value pairs, handed to psycopg.connect() by later cells.
connection_string = " ".join((
    f"dbname={db}",
    f"user={user}",
    f"password={pw}",
    f"host={host}",
    f"port={port}",
))

def hex_to_binary(hex_string):
    """Return the base-2 digits of a hexadecimal string, left-padded to 64 characters.

    Values that need more than 64 bits are returned at their natural length
    (padding never truncates).
    """
    value = int(hex_string, 16)
    digits = bin(value)[2:]  # drop the "0b" prefix
    return digits.zfill(64)

def calculate_hamming_distance(phash1, phash2):
    """Count the bit positions at which two hexadecimal perceptual hashes differ.

    Both hashes are expanded with hex_to_binary() and compared position by
    position.

    Raises:
        ValueError: if the two expanded bit strings have different lengths.
    """
    bits_a, bits_b = hex_to_binary(phash1), hex_to_binary(phash2)

    if len(bits_a) == len(bits_b):
        mismatches = 0
        for bit_a, bit_b in zip(bits_a, bits_b):
            if bit_a != bit_b:
                mismatches += 1
        return mismatches

    raise ValueError("Binary strings must be of equal length")

# Example usage:
# phash1 = "951428607cf7cb8f"
# phash2 = "951428607cf7cb8e"
# distance = calculate_hamming_distance(phash1, phash2)
# print(f"Hamming distance between {phash1} and {phash2}: {distance}")

def levenshtein(s1, s2):
    """Return the Levenshtein edit distance between s1 and s2.

    Thin wrapper around the `Levenshtein` package; the import is deferred
    until the first call.
    """
    import Levenshtein

    return Levenshtein.distance(s1, s2)


In [2]:
# Load every row of the `sites` table, ordered by name, into df_sites.
query = "SELECT * FROM sites ORDER BY name"

with psycopg.connect(connection_string) as conn:
    # The cursor context manager closes the cursor even when the query raises.
    with conn.cursor() as cursor:
        cursor.execute(query)

        # Generic result rows; the variable name is reused by every query cell
        # in this notebook and is kept for compatibility.
        matched_performers = cursor.fetchall()

        # Column names come from the cursor metadata so they follow SELECT *.
        column_names = [desc[0] for desc in cursor.description]

# Built after the connection is released: no database access is needed here.
df_sites = pd.DataFrame(matched_performers, columns=column_names)


In [24]:
# Creating scenes to Stash
# import base64
# 
# for _, release in df_tickling_submissions_joined.iterrows():
#     scene_data = {
#         "id": release["stash_scene"]["id"],
#         "code": release["culture_extractor_release_short_name"],
#         "title": release["culture_extractor_release_name"],
#         "date": release["culture_extractor_release_date"].strftime("%Y-%m-%d"),
#         "details": release["description"],
#         "performer_ids": release["stash_performer_ids"],
#         "urls": [release["culture_extractor_release_url"]],
#         "stash_ids": [{"endpoint": "https://culture.extractor/graphql", "stash_id": release["culture_extractor_release_uuid"]}]
#     }
#     
#     # Find jpg file in a directory like F:\Ripping\Tickling Submission\Metadata\0192221b-5f5b-75ac-a715-e8292b4262e7\
#     metadata_path = f"F:\\Ripping\\Tickling Submission\\Metadata\\{release['culture_extractor_release_uuid']}\\"
#     jpg_file = next((f for f in os.listdir(metadata_path) if f.endswith('.jpg')), None)
#     if jpg_file:
#         with open(f"F:\\Ripping\\Tickling Submission\\Metadata\\{release['culture_extractor_release_uuid']}\\" + jpg_file, "rb") as image_file:
#             encoded_image = base64.b64encode(image_file.read()).decode('utf-8')
#             scene_data["cover_image"] = f"data:image/jpeg;base64,{encoded_image}"
#     
#     stash.update_scene(scene_data)
# 

In [25]:
import json  # used to decode the file_metadata column (selected as text below)

# One row per (release, download) for the site, with performers aggregated into
# a JSON array and tags into a text array. '-infinity' release dates are mapped
# to 1970-01-01 by the COALESCE/NULLIF pair.
query = """
   SELECT 
       sites.name AS site_name,
       sub_sites.name AS sub_site_name,
       releases.uuid AS release_uuid,
       COALESCE(NULLIF(releases.release_date, '-infinity'), '1970-01-01'::date) as release_date,
       releases.short_name AS release_short_name, 
       releases.name AS release_name, 
       releases.url AS release_url,
       releases.description AS release_description,
       releases.json_document::text AS release_json_document,
       downloads.uuid AS downloads_uuid,
       downloads.file_type,
       downloads.content_type,
       downloads.saved_filename,
       downloads.file_metadata::text AS file_metadata,
       -- Aggregate performers into a JSON array of objects
       json_agg(
           DISTINCT jsonb_build_object(
               'uuid', performers.uuid,
               'name', performers.name,
               'short_name', performers.short_name,
               'url', performers.url
           ) 
       ) FILTER (WHERE performers.uuid IS NOT NULL) as performers,
       -- Aggregate tags into an array  
       array_agg(DISTINCT tags.name) FILTER (WHERE tags.name IS NOT NULL) as tag_names
   FROM releases
   JOIN sites ON releases.site_uuid = sites.uuid
   JOIN downloads ON releases.uuid = downloads.release_uuid
   LEFT JOIN sub_sites ON releases.sub_site_uuid = sub_sites.uuid
   -- Left join performers through junction table
   LEFT JOIN release_entity_site_performer_entity rep ON releases.uuid = rep.releases_uuid
   LEFT JOIN performers ON rep.performers_uuid = performers.uuid
   -- Left join tags through junction table
   LEFT JOIN release_entity_site_tag_entity ret ON releases.uuid = ret.releases_uuid
   LEFT JOIN tags ON ret.tags_uuid = tags.uuid
   WHERE sites.name = 'Nubile Films'
   GROUP BY
       sites.name,
       sub_sites.name,
       releases.uuid,
       releases.release_date,
       releases.short_name,
       releases.name,
       releases.url,
       releases.description,
       releases.json_document::text,
       downloads.uuid,
       downloads.file_type,
       downloads.content_type,
       downloads.saved_filename,
       downloads.file_metadata::text
    """

with psycopg.connect(connection_string) as conn:
    # The cursor context manager closes the cursor even when the query raises.
    with conn.cursor() as cursor:
        cursor.execute(query)

        # Generic result rows; the name is kept because every query cell reuses it.
        matched_performers = cursor.fetchall()

        # Get the column names from cursor.description
        column_names = [desc[0] for desc in cursor.description]

# Convert the results to a data frame with column names
df_releases = pd.DataFrame(matched_performers, columns=column_names)

# file_metadata arrives as JSON text. Decode it with json.loads rather than
# eval(): eval() raises NameError on the JSON literals true/false/null and
# would execute whatever expression the database happens to contain.
df_releases['file_metadata'] = df_releases['file_metadata'].apply(
    lambda x: json.loads(x) if isinstance(x, str) else x
)

# Lift the individual hashes out of the decoded metadata; None when the
# metadata is not a dict or the key is absent.
for _column, _key in (('sha256', 'sha256Sum'), ('phash', 'phash'), ('oshash', 'oshash')):
    df_releases[_column] = df_releases['file_metadata'].apply(
        lambda x, key=_key: x[key] if isinstance(x, dict) and key in x else None
    )

# Prefix every column so the origin stays visible after merges with Stash data.
df_releases = df_releases.copy().add_prefix('culture_extractor_')


In [None]:
# Create Stash app client

import pandas as pd
import dotenv
import os
from libraries.client_stashapp import get_stashapp_client
from libraries.StashDbClient import StashDbClient

# Environment first: the settings read below come from the .env file.
dotenv.load_dotenv()

# Client for the local Stash instance, plus the URL it is served from.
stash = get_stashapp_client()
STASHAPP_URL = os.environ.get("STASHAPP_URL")

# StashDB (stash-box) client, constructed from (endpoint, api key) in that order.
stashbox_client = StashDbClient(
    *(os.environ.get(variable) for variable in ("STASHDB_ENDPOINT", "STASHDB_API_KEY"))
)

In [28]:
# For every release that has an oshash, ask Stash for the scene with that hash
# and keep the release together with the scene when one is found.
releases_with_stash_scenes = []

for _, release in df_releases.iterrows():
    oshash = release["culture_extractor_oshash"]
    if oshash is not None:
        stash_scene = stash.find_scene_by_hash({ "oshash": oshash })
        if stash_scene:
            matched_release = release.to_dict()
            matched_release["stash_scene"] = stash_scene
            releases_with_stash_scenes.append(matched_release)

df_releases_with_stash_scenes = pd.DataFrame(releases_with_stash_scenes)

In [29]:
# Look up each matched release on StashDB by perceptual hash and collect the
# releases for which StashDB returned a scene.
scene_data = []

# Only rows that have a Stash scene, a phash and file metadata can be queried.
_lookup_columns = ['stash_scene', 'culture_extractor_phash', 'culture_extractor_file_metadata']
_release_rows = [
    row
    for _, row in df_releases_with_stash_scenes.dropna(subset=_lookup_columns).iterrows()
]

# Scene objects with filename, phash and duration. The list is index-aligned
# with _release_rows so each result can be traced back to the exact release
# row it came from. (Re-filtering the frame by phash always returned the first
# row with that hash, which duplicated one release and dropped the others
# whenever two releases shared a phash.)
scene_objects = [
    {
        'filename': row['stash_scene']['files'][0]['basename'],
        'phash': row['culture_extractor_phash'],
        'duration': row['culture_extractor_file_metadata'].get('duration')
    }
    for row in _release_rows
]

batch_size = 100
for i in range(0, len(scene_objects), batch_size):
    batch = scene_objects[i:i+batch_size]
    batch_rows = _release_rows[i:i+batch_size]

    # The result is used as a mapping keyed by phash (see the lookups below).
    stashdb_scenes = stashbox_client.query_scenes_by_phash(batch)

    for scene_obj, release_row in zip(batch, batch_rows):
        phash = scene_obj['phash']
        if phash in stashdb_scenes:
            stashdb_row_data = {
                **release_row.to_dict(),
                'stashdb_scene': stashdb_scenes[phash]
            }
            scene_data.append(stashdb_row_data)

df_scenes = pd.DataFrame(scene_data)

In [30]:
# Create verification columns
def extract_duration(file_metadata):
    """Return file_metadata['duration'], or None when it is not a dict or has no such key."""
    if not isinstance(file_metadata, dict):
        return None
    return file_metadata.get('duration')

def calculate_duration_difference(row):
    """Return the duration mismatch between Culture Extractor and StashDB, in percent.

    The absolute difference is expressed as a percentage of the longer of the
    two durations.

    Args:
        row: mapping-like row (dict or pandas Series) holding
            'culture_extractor_file_metadata' and, optionally, 'stashdb_scene'.

    Returns:
        The percentage as a float, or None when either duration is missing or
        zero, or when the row carries no usable StashDB scene.
    """
    stashdb_scene = row.get('stashdb_scene')
    # row.get(key, {}) only falls back when the key is absent; a key that is
    # present but holds None/NaN must be rejected explicitly.
    if not hasattr(stashdb_scene, 'get'):
        return None
    stashdb_duration = stashdb_scene.get('duration')

    ce_duration = extract_duration(row['culture_extractor_file_metadata'])

    if not (ce_duration and stashdb_duration):
        return None

    # Return difference as percentage of the longer duration
    longer_duration = max(ce_duration, stashdb_duration)
    return (abs(ce_duration - stashdb_duration) / longer_duration) * 100

def calculate_title_similarity(row):
    """Return the Levenshtein distance between the Culture Extractor and StashDB titles.

    Args:
        row: mapping-like row (dict or pandas Series) holding
            'culture_extractor_release_name' and, optionally, 'stashdb_scene'.

    Returns:
        The edit distance, or None when either title is missing/empty or the
        row carries no usable StashDB scene.
    """
    ce_title = row.get('culture_extractor_release_name')

    stashdb_scene = row.get('stashdb_scene')
    # row.get(key, {}) only falls back when the key is absent; a key that is
    # present but holds None/NaN must be rejected explicitly.
    stashdb_title = stashdb_scene.get('title') if hasattr(stashdb_scene, 'get') else None

    if ce_title and stashdb_title:
        return levenshtein(ce_title, stashdb_title)
    return None

def get_date_difference_days(row):
    """Return the absolute number of days between the two release dates.

    Args:
        row: mapping-like row (dict or pandas Series) holding
            'culture_extractor_release_date' and, optionally, 'stashdb_scene'.

    Returns:
        The difference in whole days, or None when either date is missing,
        cannot be parsed, or the row carries no usable StashDB scene.
    """
    ce_date = row.get('culture_extractor_release_date')

    stashdb_scene = row.get('stashdb_scene')
    # row.get(key, {}) only falls back when the key is absent; a key that is
    # present but holds None/NaN must be rejected explicitly.
    stashdb_date = stashdb_scene.get('date') if hasattr(stashdb_scene, 'get') else None

    if not (ce_date and stashdb_date):
        return None

    try:
        delta = pd.to_datetime(ce_date) - pd.to_datetime(stashdb_date)
    except (ValueError, TypeError, OverflowError):
        # Unparseable or incompatible dates are "no result", but unrelated
        # failures (KeyboardInterrupt, programming errors) are not swallowed.
        return None
    return abs(delta.days)

# Verification frame: only rows that actually carry a StashDB scene.
df_verification = df_scenes.loc[df_scenes['stashdb_scene'].notna()].copy()

_ce_metadata_column = df_verification['culture_extractor_file_metadata']
_stashdb_scene_column = df_verification['stashdb_scene']

# Side-by-side comparison values; the key order is the final column order.
verification_columns = dict(
    duration_diff_pct=df_verification.apply(calculate_duration_difference, axis=1),
    title_levenshtein=df_verification.apply(calculate_title_similarity, axis=1),
    date_diff_days=df_verification.apply(get_date_difference_days, axis=1),
    ce_duration=_ce_metadata_column.apply(extract_duration),
    stashdb_duration=_stashdb_scene_column.apply(lambda x: x.get('duration') if x else None),
    ce_title=df_verification['culture_extractor_release_name'],
    stashdb_title=_stashdb_scene_column.apply(lambda x: x.get('title') if x else None),
    ce_date=df_verification['culture_extractor_release_date'],
    stashdb_date=_stashdb_scene_column.apply(lambda x: x.get('date') if x else None),
)

# Verification columns go first, followed by every original column that does
# not clash with one of them.
_original_columns = df_verification.drop(columns=verification_columns.keys(), errors='ignore')
df_verification = pd.concat([pd.DataFrame(verification_columns), _original_columns], axis=1)

# Warning flags: >5% duration difference, Levenshtein distance >5, >7 days apart.
for _flag, _metric, _threshold in (
    ('duration_warning', 'duration_diff_pct', 5),
    ('title_warning', 'title_levenshtein', 5),
    ('date_warning', 'date_diff_days', 7),
):
    df_verification[_flag] = df_verification[_metric] > _threshold

# Potential mismatches first; within equal flags, largest duration gap first.
_sort_keys = ['duration_warning', 'title_warning', 'date_warning', 'duration_diff_pct']
df_verification = df_verification.sort_values(by=_sort_keys, ascending=[False] * len(_sort_keys))

In [40]:
# df_verification.iloc[1]['culture_extractor_performers']
import json
# Save the performers of one verified scene (second row) as a test fixture.
# The payload is built and serialized BEFORE the file is opened: opening with
# 'w' truncates immediately, so a lookup or serialization error raised inside
# the `with` block would otherwise leave an empty or partial fixture behind.
_sample_row = df_verification.iloc[1]
_sample_json = json.dumps({ 
    'culture_extractor_performers': _sample_row['culture_extractor_performers'],
    'stashdb_performers': _sample_row['stashdb_scene']['performers']
}, indent=4)

with open('tests/data/culture_extractor_stashdb_performers.sample02.json', 'w') as f:
    f.write(_sample_json)


In [None]:
from libraries.performer_matcher import PerformerMatcher

# Try the performer matcher on one verified scene (second row) and print each
# (Culture Extractor performer, StashDB performer) pair it returns.
df_sample_scene = df_verification.iloc[1]
matched_performers = PerformerMatcher.match_all_performers(
    df_sample_scene['culture_extractor_performers'],
    df_sample_scene['stashdb_scene']['performers'],
)

for ce_performer, stashdb_performer in matched_performers:
    print(ce_performer, stashdb_performer, sep="\n")
    
    # refreshed_performer = stash.find_performers({ 'stash_id_endpoint': { 'endpoint': 'https://stashdb.org/graphql', 'stash_id': stashdb_performer['performer']['id'], 'modifier': 'EQUALS' } })
    # if len(refreshed_performer) == 0:
    #     print(f"No performer found with id {stashdb_performer['performer']['id']}")
    #     continue
    # if len(refreshed_performer) > 1:
    #     print(f"Multiple performers found with id {stashdb_performer['performer']['id']}")
    #     continue
    # 
    # refreshed_performer = refreshed_performer[0]
    # print(refreshed_performer)
    # print(refreshed_performer['stash_ids'])
    # 
    # # Check if performer already has Culture Extractor stash ID
    # has_ce_stash_id = any(
    #     stash_id['endpoint'] == 'https://culture.extractor/graphql' and 
    #     stash_id['stash_id'] == ce_performer['uuid']
    #     for stash_id in refreshed_performer['stash_ids']
    # )
    # if not has_ce_stash_id:
    #     new_stash_ids = refreshed_performer['stash_ids'] + [{ 'endpoint': 'https://culture.extractor/graphql', 'stash_id': ce_performer['uuid'] }]
    #     stash.update_performer({ 'id': refreshed_performer['id'], 'stash_ids': new_stash_ids })
    #     print(f"Updated performer {refreshed_performer['name']} with Culture Extractor stash ID")


In [None]:
# Get database schema information: print every base table of the `public`
# schema together with its columns (type, length, nullability, default).

# Query to get table information
table_query = """
        SELECT DISTINCT table_name 
        FROM information_schema.tables 
        WHERE table_schema = 'public' 
        AND table_type = 'BASE TABLE'
        ORDER BY table_name;
    """

# Column information for one table. Defined once, outside the loop; the table
# name is passed as a bound parameter.
column_query = """
            SELECT 
                column_name,
                data_type,
                column_default,
                is_nullable,
                character_maximum_length
            FROM information_schema.columns
            WHERE table_schema = 'public'
            AND table_name = %s
            ORDER BY ordinal_position;
        """

with psycopg.connect(connection_string) as conn:
    # The cursor context manager closes the cursor even when a query raises.
    with conn.cursor() as cursor:
        cursor.execute(table_query)
        tables = cursor.fetchall()

        print("Database Schema:")
        for (table_name,) in tables:
            print(f"\nTable: {table_name}")

            cursor.execute(column_query, (table_name,))
            columns = cursor.fetchall()

            for column_name, data_type, default, nullable, max_length in columns:
                nullable_str = "NULL" if nullable == 'YES' else "NOT NULL"
                default_str = f"DEFAULT {default}" if default else ""
                length_str = f"({max_length})" if max_length else ""
                print(f"  {column_name}: {data_type}{length_str} {nullable_str} {default_str}")

In [44]:
# Gallery zip downloads of the 'Femjoy' site, one row per (release, download).
# '-infinity' release dates are mapped to 1970-01-01 by the COALESCE/NULLIF pair.
query = """
        SELECT sites.name AS site_name,
               releases.uuid AS release_uuid,
               COALESCE(NULLIF(releases.release_date, '-infinity'), '1970-01-01'::date) as release_date,
               releases.short_name AS release_short_name, 
               releases.name AS release_name, 
               releases.url AS release_url,
               releases.json_document AS release_json_document,
               downloads.uuid AS downloads_uuid,
               downloads.file_type,
               downloads.content_type,
               downloads.saved_filename,
               downloads.file_metadata
        FROM releases
        JOIN sites ON releases.site_uuid = sites.uuid
        JOIN downloads ON releases.uuid = downloads.release_uuid
        WHERE file_type = 'zip' AND content_type = 'gallery' AND sites.name = 'Femjoy'
    """

with psycopg.connect(connection_string) as conn:
    # The cursor context manager closes the cursor even when the query raises.
    with conn.cursor() as cursor:
        cursor.execute(query)

        # Generic result rows; the name is kept because every query cell reuses it.
        matched_performers = cursor.fetchall()

        # Get the column names from cursor.description
        column_names = [desc[0] for desc in cursor.description]

# Convert the results to a data frame with column names
df_releases = pd.DataFrame(matched_performers, columns=column_names)
# df_releases['sha256'] = df_releases['file_metadata'].apply(lambda x: x['sha256Sum'] if isinstance(x, dict) and 'sha256Sum' in x else None)
# df_releases['oshash'] = df_releases['file_metadata'].apply(lambda x: x['oshash'] if isinstance(x, dict) and 'oshash' in x else None)

# Prefix every column so the origin stays visible after merges with Stash data.
df_releases = df_releases.copy().add_prefix('culture_extractor_')

# Extract model names from json_document
def extract_model_names(json_doc):
    """Return the list of model names stored in a release's JSON document.

    Args:
        json_doc: the document as a dict, or as JSON text that is decoded first.

    Returns:
        [model['name'] for each entry of json_doc['models']]. An empty list
        when the document has no 'models' key, is not a dict, is not valid
        JSON, or has entries that are not mappings with a 'name'.
    """
    import json

    try:
        if isinstance(json_doc, str):
            json_doc = json.loads(json_doc)

        if isinstance(json_doc, dict) and 'models' in json_doc:
            return [model['name'] for model in json_doc['models']]
    except (ValueError, TypeError, KeyError):
        # ValueError: malformed JSON; TypeError/KeyError: unexpected document
        # shape. Other exceptions (e.g. KeyboardInterrupt) propagate.
        return []
    return []

# Model names per release, first as a list ...
df_releases['culture_extractor_models'] = (
    df_releases['culture_extractor_release_json_document'].apply(extract_model_names)
)

# ... and also as one comma-separated string ('' when the list is empty).
df_releases['culture_extractor_models_string'] = df_releases['culture_extractor_models'].apply(
    lambda names: '' if not names else ', '.join(names)
)

# Narrow the frame to the columns used by the cells below.
_kept_columns = [
    'culture_extractor_release_uuid',
    'culture_extractor_release_date',
    'culture_extractor_release_name',
    'culture_extractor_models',
    'culture_extractor_models_string',
]
df_releases = df_releases[_kept_columns]


In [None]:
# Show the release(s) with this uuid; the column is compared as text.
target_uuid = '01928448-5251-716f-8499-d5c7a99a1e48'
_release_uuid_text = df_releases['culture_extractor_release_uuid'].astype(str)
df_releases.loc[_release_uuid_text == target_uuid]

In [None]:
# Releases whose comma-separated model list mentions a specific model name.
model_name = "Marria Leeah"

# regex=False: the name is matched literally. With the default (regex=True) a
# name containing characters such as '.', '(' or '+' would be interpreted as a
# pattern and could match the wrong rows or raise.
_mentions_model = df_releases['culture_extractor_models_string'].str.contains(model_name, regex=False)

df_releases_with_model_name = (
    df_releases.loc[
        _mentions_model,
        ['culture_extractor_release_uuid', 'culture_extractor_release_name', 'culture_extractor_models_string'],
    ]
    .rename(columns={
        'culture_extractor_release_uuid': 'release_uuid',
        'culture_extractor_release_name': 'release_name',
        'culture_extractor_models_string': 'model_name',
    })
)

df_releases_with_model_name

In [311]:
# Put the matching release uuids on the clipboard, one per line, without the
# index or a header row.
df_releases_with_model_name["release_uuid"].to_clipboard(
    sep='|',
    header=False,
    index=False,
)

In [14]:
# Every sub-site together with the name of its parent site, ordered by
# sub-site name.
query = """
        SELECT sub_sites.*, sites.name AS site_name
        FROM sub_sites
        JOIN sites ON sub_sites.site_uuid = sites.uuid
        ORDER BY sub_sites.name
    """

with psycopg.connect(connection_string) as conn:
    # The cursor context manager closes the cursor even when the query raises.
    with conn.cursor() as cursor:
        cursor.execute(query)

        # Generic result rows; the name is kept because every query cell reuses it.
        matched_performers = cursor.fetchall()

        # Get the column names from cursor.description
        column_names = [desc[0] for desc in cursor.description]

# Built after the connection is released: no database access is needed here.
df_subsites = pd.DataFrame(matched_performers, columns=column_names)


In [None]:
from libraries.client_stashapp import get_stashapp_client

# Client for the Stash instance; used by the studio and scene cells below.
stash = get_stashapp_client()

def get_parent_studio_id(studio):
    """Return the id of a studio's parent studio, or None when it has no parent."""
    parent_studio = studio["parent_studio"]
    return None if parent_studio is None else parent_studio["id"]


In [8]:
# All Stash studios as a frame, with the parent studio's id as its own column.
studios = stash.find_studios({})
df_stash_studios = pd.DataFrame(studios)
df_stash_studios = df_stash_studios.assign(
    parent_studio_id=df_stash_studios.apply(get_parent_studio_id, axis=1)
)

# Lookup functions
def get_studio_by_id(studio_id):
    """Return the rows of df_stash_studios whose "id" equals studio_id (possibly empty)."""
    has_id = df_stash_studios["id"] == studio_id
    return df_stash_studios.loc[has_id]

def get_studio_by_name(studio_name):
    """Return the rows of df_stash_studios whose "name" equals studio_name (possibly empty)."""
    has_name = df_stash_studios["name"] == studio_name
    return df_stash_studios.loc[has_name]

In [17]:
# Prefix both frames so the origin of every column stays visible after the merge.
df_sites_prefixed = df_sites.add_prefix('culture_extractor_')
df_stash_studios_prefixed = df_stash_studios.add_prefix('stash_')

# Stash studios and Culture Extractor sites that share exactly the same name.
df_matched_studios = df_stash_studios_prefixed.merge(
    df_sites_prefixed,
    how='inner',
    left_on='stash_name',
    right_on='culture_extractor_name',
)

In [None]:
# Store Culture Extractor UUID in Stash studio: record the matching site's uuid
# as a stash_id on the studio, under the Culture Extractor endpoint.
name = "SexArt"

df_matched_studio = df_matched_studios.loc[df_matched_studios["stash_name"] == name]
if df_matched_studio.empty:
    print(f"No studio found with name: {name}")
    raise ValueError(f"No studio found with name: {name}")
df_matched_studio = df_matched_studio.iloc[0]

# Re-read the studio from Stash so the stash_ids being edited are current.
refreshed_studio = stash.find_studio(name)
assert refreshed_studio is not None, f"No studio found with name: {name}"

stashbox_ids = refreshed_studio["stash_ids"]
culture_extractor_endpoint = "https://culture.extractor/graphql"
culture_extractor_uuid = str(df_matched_studio["culture_extractor_uuid"])

# First stash_id entry already registered for the Culture Extractor endpoint.
_endpoint_entries = (
    stash_id for stash_id in stashbox_ids if stash_id["endpoint"] == culture_extractor_endpoint
)
existing_stash_id = next(_endpoint_entries, None)

if not existing_stash_id:
    # No entry yet: append one.
    stashbox_ids.append({"endpoint": culture_extractor_endpoint, "stash_id": culture_extractor_uuid})
    stash.update_studio({"id": refreshed_studio["id"], "stash_ids": stashbox_ids})
    print(f"Added new stash_id for {culture_extractor_endpoint}")
elif existing_stash_id["stash_id"] != culture_extractor_uuid:
    # Entry exists with a different uuid: overwrite it in place.
    existing_stash_id["stash_id"] = culture_extractor_uuid
    stash.update_studio({"id": refreshed_studio["id"], "stash_ids": stashbox_ids})
    print(f"Updated stash_id for {culture_extractor_endpoint}")


In [None]:
# Get studio for scene matching: resolve a Stash studio name to the Culture
# Extractor site uuid stored among the studio's stash_ids.
stash_site_name = 'Slayed'

current_studio = get_studio_by_name(stash_site_name)

# Collapse a non-empty lookup result to a plain dict of its first row.
if isinstance(current_studio, pd.DataFrame) and not current_studio.empty:
    current_studio = current_studio.iloc[0].to_dict()

culture_extractor_site_uuid = None
if isinstance(current_studio, dict) and "stash_ids" in current_studio:
    # stash_id of the first entry registered for the Culture Extractor endpoint.
    culture_extractor_site_uuid = next(
        (
            entry.get("stash_id")
            for entry in current_studio["stash_ids"]
            if isinstance(entry, dict) and entry.get("endpoint") == "https://culture.extractor/graphql"
        ),
        None,
    )

assert culture_extractor_site_uuid is not None, f"No Culture Extractor site uuid found for {stash_site_name}"
print(f"Matched Stash studio {stash_site_name} to Culture Extractor site uuid {culture_extractor_site_uuid}")


In [130]:
# DESTRUCTIVE: deletes one release row by uuid. Check delete_uuid before
# running this cell.
delete_uuid = "018b8e83-e2e3-718e-966d-c4f745149c79"

with psycopg.connect(connection_string) as conn:
    # The cursor context manager closes the cursor even when the statement raises.
    with conn.cursor() as cursor:
        # The uuid is passed as a bound parameter, not formatted into the SQL.
        cursor.execute("DELETE FROM releases WHERE uuid = %s", (delete_uuid,))


In [None]:
# Get scenes from Culture Extractor: video scene downloads of one site,
# excluding the low-resolution variants listed in the NOT IN clause.
#
# The site uuid is a bound parameter (%s, unquoted) rather than being formatted
# into the SQL text with the % operator, so quoting is handled by the driver
# and a malformed value cannot alter the statement.
query_template = """
    SELECT
        sites.uuid AS site_uuid,
        sites.short_name AS site_short_name,
        sites.name AS site_name,
        
        releases.uuid AS release_uuid,
        releases.release_date AS release_date,
        releases.short_name AS release_short_name,
        releases.name AS release_name,
        releases.url AS release_url,
        releases.json_document AS release_json_document,
        downloads.uuid AS downloads_uuid,
        downloads.downloaded_at AS downloads_downloaded_at,
        downloads.variant AS downloads_variant,
        downloads.file_type AS downloads_file_type,
        downloads.content_type AS downloads_content_type,
        downloads.file_metadata AS downloads_file_metadata
    FROM releases
    JOIN sites ON releases.site_uuid = sites.uuid
    JOIN downloads ON releases.uuid = downloads.release_uuid
    WHERE
        sites.uuid = %s AND
        downloads.file_type = 'video' AND
        downloads.content_type = 'scene' AND
        (downloads.variant NOT IN ('480x270', '270p', '320p', '360p', '270p MOBILE'));
    """

with psycopg.connect(connection_string) as conn:
    # The cursor context manager closes the cursor even when the query raises.
    with conn.cursor() as cursor:
        cursor.execute(query_template, (culture_extractor_site_uuid,))
        # Generic result rows; the name is kept because every query cell reuses it.
        matched_performers = cursor.fetchall()
        column_names = [desc[0] for desc in cursor.description]

df_culture_extractor_scenes = pd.DataFrame(matched_performers, columns=column_names)

# Lift duration and phash out of the metadata; None when the metadata is not a
# dict or the key is absent.
for _column, _key in (("culture_extractor_duration", "duration"), ("culture_extractor_phash", "phash")):
    df_culture_extractor_scenes[_column] = df_culture_extractor_scenes["downloads_file_metadata"].apply(
        lambda x, key=_key: x[key] if isinstance(x, dict) and key in x else None
    )

df_culture_extractor_scenes

In [None]:
# Scenes whose release_short_name occurs more than once (every occurrence is kept).
_has_repeated_short_name = df_culture_extractor_scenes.duplicated(
    subset=['release_short_name'],
    keep=False,
)
df_nonunique_release_short_name = df_culture_extractor_scenes.loc[_has_repeated_short_name]
df_nonunique_release_short_name


In [55]:
# Get scenes from Stash.
# The studio filter and the paging/sort options below are commented out, so no
# scene filter or paging argument is passed; only the fragment is given.
# NOTE(review): presumably this returns every scene in the library - confirm
# against the client before running on a large instance.
stash_scenes = stash.find_scenes(
    # {
    #     "studios": { "value": [current_studio["id"]], "excludes": [], "modifier": "INCLUDES", "depth": -1 }
    # },
    # filter={
    #     "per_page": 500, "page": 1, "sort": "path", "direction": "DESC"
    # },
    # Fields requested per scene; the cells below read files (duration,
    # fingerprints), stash_ids, urls, title and date from the result.
    fragment="id title code date files { id path basename fingerprints { type value } format width height video_codec frame_rate duration } studio { id name tags { id name } } performers { id name gender tags { id name} } stash_ids { endpoint stash_id } urls"
)
# One row per scene; nested objects (files, studio, performers, ...) stay as
# Python objects inside the cells.
df_stash_scenes = pd.DataFrame(stash_scenes)

def get_endpoint_stash_id(stash_ids, endpoint):
    """Return the "stash_id" of the first entry registered for `endpoint`, or None."""
    matching_ids = (entry["stash_id"] for entry in stash_ids if entry["endpoint"] == endpoint)
    return next(matching_ids, None)

def get_tpdb_id(stash_ids):
    """Return the stash_id registered for the ThePornDB endpoint, or None."""
    return get_endpoint_stash_id(stash_ids, endpoint="https://theporndb.net/graphql")

def get_stashdb_id(stash_ids):
    """Return the stash_id registered for the StashDB endpoint, or None."""
    return get_endpoint_stash_id(stash_ids, endpoint="https://stashdb.org/graphql")

def get_culture_extractor_id(stash_ids):
    """Return the stash_id registered for the Culture Extractor endpoint, or None."""
    return get_endpoint_stash_id(stash_ids, endpoint="https://culture.extractor/graphql")

def _first_file(files):
    """Return the first file of a scene, or None when the scene has no files."""
    return files[0] if isinstance(files, list) and files else None

def _first_file_fingerprint(files, fingerprint_type):
    """Return the value of the first file's fingerprint of the given type, or None."""
    first_file = _first_file(files)
    if first_file is None:
        return None
    return next((y["value"] for y in first_file["fingerprints"] if y["type"] == fingerprint_type), None)

df_stash_scenes["date"] = pd.to_datetime(df_stash_scenes["date"])

# Per-endpoint ids pulled out of the stash_ids list of each scene.
df_stash_scenes["stashdb_id"] = df_stash_scenes["stash_ids"].apply(get_stashdb_id)
df_stash_scenes["tpdb_id"] = df_stash_scenes["stash_ids"].apply(get_tpdb_id)
df_stash_scenes["culture_extractor_id"] = df_stash_scenes["stash_ids"].apply(get_culture_extractor_id)

# Duration and hashes come from the scene's first file. A scene with an empty
# files list yields None here instead of raising IndexError on files[0].
df_stash_scenes["stash_duration"] = df_stash_scenes["files"].apply(
    lambda x: _first_file(x)["duration"] if _first_file(x) is not None else None
)
df_stash_scenes["stash_phash"] = df_stash_scenes["files"].apply(lambda x: _first_file_fingerprint(x, "phash"))
df_stash_scenes["stash_oshash"] = df_stash_scenes["files"].apply(lambda x: _first_file_fingerprint(x, "oshash"))


In [None]:
# Check for duplicate scenes based on StashDB ID: scenes that have a StashDB id
# which is shared with at least one other scene (every occurrence is kept).
_has_stashdb_id = df_stash_scenes['stashdb_id'].notna()
_shares_stashdb_id = df_stash_scenes.duplicated(subset=['stashdb_id'], keep=False)

df_duplicate_stash_scenes = df_stash_scenes[_has_stashdb_id & _shares_stashdb_id]
df_duplicate_stash_scenes


In [56]:
# Add the duplicate tag to the scenes found by the StashDB-id duplicate check.
_duplicate_tag_name = "StashDB: Duplicate Scenes Based On ID"
duplicate_stashdb_ids_tag = stash.find_tag(_duplicate_tag_name)
if not duplicate_stashdb_ids_tag:
    # Fail before touching any scene, with a message that names the missing
    # tag, instead of an opaque TypeError from indexing None inside the loop.
    raise ValueError(f"Tag not found in Stash: {_duplicate_tag_name}")
_duplicate_tag_id = duplicate_stashdb_ids_tag["id"]

for index, row in df_duplicate_stash_scenes.iterrows():
    # Re-read the scene so the tag list being extended is current.
    refreshed_scene = stash.find_scene(row["id"])
    existing_tag_ids = [tag["id"] for tag in refreshed_scene["tags"]]
    if _duplicate_tag_id not in existing_tag_ids:
        updated_tag_ids = existing_tag_ids + [_duplicate_tag_id]
        stash.update_scene({ "id": row["id"], "tag_ids": updated_tag_ids })


# Matching existing scenes

In [None]:
# Match Stash and Culture Extractor scenes based on oshash.
# pandas matches null merge keys against each other, so releases without an
# oshash are removed first; otherwise every hash-less release would pair with
# every hash-less Stash scene.
_releases_with_oshash = df_releases[df_releases['culture_extractor_oshash'].notna()]
df_merged_by_oshash = pd.merge(df_stash_scenes, _releases_with_oshash, 
                               left_on='stash_oshash', right_on='culture_extractor_oshash', 
                               how='inner')

# Keep only the pairs whose Stash scene does not already carry this release's
# uuid as its Culture Extractor id (scenes without any such id are kept).
_stored_ce_id = df_merged_by_oshash['culture_extractor_id'].str.strip()
_release_uuid_text = df_merged_by_oshash['culture_extractor_release_uuid'].astype(str).str.strip()
df_merged_by_oshash = df_merged_by_oshash[_stored_ce_id != _release_uuid_text]
len(df_merged_by_oshash)


In [None]:
# For the oshash matches of one site, attach the Culture Extractor release uuid
# to the Stash scene as a stash_id, unless the scene already has a stash_id for
# the Culture Extractor endpoint.
_ce_endpoint = "https://culture.extractor/graphql"
_site_matches = df_merged_by_oshash[df_merged_by_oshash['culture_extractor_site_name'] == "It's POV"]

for index, row in _site_matches.iterrows():
    release_uuid = row["culture_extractor_release_uuid"]
    if pd.isnull(release_uuid):
        # Nothing to attach; checked before the API call so no request is wasted.
        continue

    # Named scene_id rather than id so the builtin id() is not shadowed for
    # the rest of the notebook session.
    scene_id = row["id"]

    # Re-read the scene so the stash_ids being extended are current.
    refreshed_scene = stash.find_scene(scene_id)
    existing_stash_ids = refreshed_scene["stash_ids"]
    if any(stash_id_obj["endpoint"] == _ce_endpoint for stash_id_obj in existing_stash_ids):
        continue

    existing_stash_ids.append({ "endpoint": _ce_endpoint, "stash_id": str(release_uuid) })
    stash.update_scene({ "id": scene_id, "stash_ids": existing_stash_ids })
    print(f"Updated scene {scene_id} with Culture Extractor ID {release_uuid}")


In [135]:
# Match Stash and Culture Extractor scenes based on phash
df_culture_extractor_scenes['release_date'] = pd.to_datetime(df_culture_extractor_scenes['release_date'])

# pandas matches null merge keys against each other, so Culture Extractor
# scenes without a phash are removed from the right side first. Otherwise every
# Stash scene without a phash would "match" every hash-less release and the
# Hamming distance below would be called with None.
_ce_scenes_with_phash = df_culture_extractor_scenes[
    df_culture_extractor_scenes['culture_extractor_phash'].notna()
]

# Merge the dataframes (left join: unmatched Stash scenes are kept, with nulls)
df_merged_scenes = pd.merge(df_stash_scenes, _ce_scenes_with_phash, 
                            left_on='stash_phash', right_on='culture_extractor_phash', 
                            how='left')

# Only the rows that found a Culture Extractor release.
df_merged_matched_scenes = df_merged_scenes[df_merged_scenes["release_uuid"].notnull()].copy()

# Comparison metrics used to sanity-check each match.
df_merged_matched_scenes["duration_difference"] = df_merged_matched_scenes["stash_duration"] - df_merged_matched_scenes["culture_extractor_duration"]
df_merged_matched_scenes["phash_distance"] = df_merged_matched_scenes.apply(lambda row: calculate_hamming_distance(row["stash_phash"], row["culture_extractor_phash"]), axis=1)
# Titles may be missing on either side; the distance is only defined for two strings.
df_merged_matched_scenes["title_levenshtein"] = df_merged_matched_scenes.apply(
    lambda row: levenshtein(row["title"], row["release_name"])
    if isinstance(row["title"], str) and isinstance(row["release_name"], str)
    else None,
    axis=1,
)

In [None]:
# Update Stash scenes with Culture Extractor ID: for every phash match, attach
# the release uuid as a stash_id and set the scene code to the release short
# name, unless the scene already has a stash_id for the Culture Extractor
# endpoint.
_ce_endpoint = "https://culture.extractor/graphql"

for index, row in df_merged_matched_scenes.iterrows():
    release_uuid = row["release_uuid"]
    if pd.isnull(release_uuid):
        # Nothing to attach; checked before the API call so no request is wasted.
        continue

    # Named scene_id rather than id so the builtin id() is not shadowed for
    # the rest of the notebook session.
    scene_id = row["id"]

    # Re-read the scene so the stash_ids being extended are current.
    refreshed_scene = stash.find_scene(scene_id)
    existing_stash_ids = refreshed_scene["stash_ids"]
    if any(stash_id_obj["endpoint"] == _ce_endpoint for stash_id_obj in existing_stash_ids):
        continue

    existing_stash_ids.append({ "endpoint": _ce_endpoint, "stash_id": str(release_uuid) })
    code = row["release_short_name"]
    stash.update_scene({ "id": scene_id, "code": code, "stash_ids": existing_stash_ids })
    print(f"Updated scene {scene_id} with Culture Extractor ID {release_uuid}")


In [None]:
# Create a new column 'new_urls' where 'release_url' is appended to 'urls' if not already present
def _urls_with_release_url(row):
    """Return row['urls'], extended by row['release_url'] when that adds a new, non-null url."""
    urls, release_url = row['urls'], row['release_url']
    # Rows without a Culture Extractor match have a null release_url; without
    # this check NaN would be appended to the url list.
    if pd.isnull(release_url) or release_url in urls:
        return urls
    return urls + [release_url]

df_merged_scenes['new_urls'] = df_merged_scenes.apply(_urls_with_release_url, axis=1)

df_merged_scenes[["id", "title", "new_urls"]]

In [None]:
# Create a new column 'new_stash_ids' where a Culture Extractor stash_id is
# appended to 'stash_ids' if the scene has none for that endpoint yet
def _stash_ids_with_culture_extractor(row):
    """Return row['stash_ids'], extended by a Culture Extractor entry when one is missing."""
    endpoint = "https://culture.extractor/graphql"
    stash_ids, release_uuid = row['stash_ids'], row["release_uuid"]
    # Rows without a Culture Extractor match have a null release_uuid; without
    # this check the literal string 'nan' would be stored as the stash_id.
    if pd.isnull(release_uuid):
        return stash_ids
    if endpoint in [stash_id_obj["endpoint"] for stash_id_obj in stash_ids]:
        return stash_ids
    return stash_ids + [{ "endpoint": endpoint, "stash_id": str(release_uuid) }]

df_merged_scenes['new_stash_ids'] = df_merged_scenes.apply(_stash_ids_with_culture_extractor, axis=1)
df_merged_scenes[["new_stash_ids"]].values


In [None]:
# Push the extended url list to Stash for every scene whose urls changed.
for index, row in df_merged_scenes.iterrows():
    # Named scene_id rather than id so the builtin id() is not shadowed for
    # the rest of the notebook session.
    scene_id = row["id"]
    old_urls = row["urls"]
    # Null entries (rows that had no Culture Extractor match) are never sent
    # to Stash and do not count as a change.
    new_urls = [url for url in row["new_urls"] if not pd.isnull(url)]
    old_urls_set = set(old_urls)
    new_urls_set = set(new_urls)
    if old_urls_set != new_urls_set:
        print(row["title"])
        print(old_urls_set)
        print(new_urls_set)
        stash.update_scene({ "id": scene_id, "urls": new_urls })

    # stash_ids = row["stash_ids"]
    # new_stash_ids = row["new_stash_ids"]
    # stash_ids_set = set(stash_id_obj["stash_id"] for stash_id_obj in row["stash_ids"])
    # new_stash_ids_set = set(stash_id_obj["stash_id"] for stash_id_obj in row["new_stash_ids"])
    # if stash_ids_set != new_stash_ids_set:
    #     print(row["title"])
    #     print(stash_ids_set)
    #     print(new_stash_ids_set)
    #     stash.update_scene({ "id": id, "stash_ids": new_stash_ids })

# Import new scenes

In [23]:
import os
import pandas as pd
from pathlib import Path
import re

# Define the root directory
root_dir = Path(r"F:\Ripping\Tickling Submission\Metadata")

# Initialize lists to store data
data = []

# Walk through the directory structure: each sub-directory is one release, named by its UUID
for release_dir in root_dir.iterdir():
    if not release_dir.is_dir():
        continue

    release_uuid = release_dir.name
    preview_image = None
    full_scene = None
    trailer = None

    # Listing the directory is the filesystem access in this loop body, so the OSError
    # handler wraps the listing. The per-file checks below only inspect the file name
    # and cannot raise OSError. A release that cannot be listed is reported and still
    # recorded with whatever was found before the error (None for anything missing).
    try:
        for file in release_dir.iterdir():
            suffix = file.suffix.lower()
            if suffix == '.jpg':
                preview_image = file.name
            elif suffix in ['.wmv', '.mp4']:
                file_stem = file.stem  # Get filename without extension
                if file_stem.endswith(release_uuid):
                    # A video whose stem ends with the release UUID is treated as the trailer
                    trailer = file.name
                elif re.search(r'- \d+x\d+$', file_stem):
                    # A stem ending in "- <digits>x<digits>" (presumably the resolution)
                    # is treated as the full scene
                    full_scene = file.name
    except OSError as e:
        print(f"Error accessing directory: {release_dir}. Error: {e}")

    # Append data to the list
    data.append({
        'release_uuid': release_uuid,
        'preview_image': preview_image,
        'full_scene': full_scene,
        'trailer': trailer
    })

# Create a DataFrame
df_files = pd.DataFrame(data)


In [24]:
# Left-join df_culture_extractor_scenes onto df_files using the shared 'release_uuid' column.
# how='left' keeps every row of df_files, including those without a matching scene row.
df_merged = df_files.merge(df_culture_extractor_scenes, on='release_uuid', how='left')


In [None]:
# Collect the release_uuid of every row that has no full_scene value
full_scene_is_missing = df_merged['full_scene'].isna()
missing_full_scene = df_merged.loc[full_scene_is_missing, 'release_uuid'].tolist()

# Render the list as an expression that can be pasted into a VS Code conditional breakpoint
breakpoint_condition = f"release_id in {missing_full_scene}"

print("VS Code breakpoint condition:", breakpoint_condition, sep="\n")


In [None]:
# Number of release_uuid values collected in missing_full_scene (rows with no full_scene)
len(missing_full_scene)

# Trailers

In [None]:
studio_name = "Vixen"

# Look up the studio by name, then fetch its scenes with the fields needed for trailer matching
# (files with fingerprints, stash_ids, urls, ...).
studio_for_trailers = stash.find_studio(studio_name)
scenes_for_trailers = stash.find_scenes(
    {
        "studios": {
            "value": [studio_for_trailers["id"]],
            "excludes": [],
            "modifier": "INCLUDES",
            # NOTE(review): depth -1 presumably also matches all sub-studios — confirm against the Stash API docs
            "depth": -1
        }
    },
    fragment="id title code date files { id path basename fingerprints { type value } format width height video_codec frame_rate duration } studio { id name tags { id name } } performers { id name gender tags { id name} } stash_ids { endpoint stash_id } urls"
)


def _first_file_oshash(files):
    """Return the oshash fingerprint value of a scene's first file, or None if there is none.

    ``files`` is the scene's ``files`` list as requested by the fragment above. An empty
    list (or None) yields None instead of raising IndexError on ``files[0]``.
    """
    if not files:
        return None
    return next((fingerprint["value"] for fingerprint in files[0]["fingerprints"] if fingerprint["type"] == "oshash"), None)


df_stash_scenes_for_trailers = pd.DataFrame(scenes_for_trailers)
df_stash_scenes_for_trailers["stash_oshash"] = df_stash_scenes_for_trailers["files"].apply(_first_file_oshash)
df_stash_scenes_for_trailers["culture_extractor_uuid"] = df_stash_scenes_for_trailers["stash_ids"].apply(get_culture_extractor_id)

In [None]:
# Tag that the trailer-copy loop below adds to scenes that have a trailer.
# NOTE(review): create=True presumably makes the client create the tag when it does not exist yet — confirm against the stash client docs
trailer_tag = stash.find_tag("Trailer Associated", create=True)


In [None]:
import shutil

# Source paths
culture_extractor_trailer_base_path_d = f"D:\\Ripping\\{studio_name}\\Metadata\\"
culture_extractor_trailer_base_path_f = f"F:\\Ripping\\{studio_name}\\Metadata\\"

# Target path
stash_trailer_base_path = "H:\\Stash\\generated\\trailers"

# Trailer file names to look for inside a release directory, in order of preference
trailer_filename_candidates = ["trailer_2160.mp4", "trailer_1080.mp4"]


def _ensure_trailer_tag(scene_id):
    """Add the trailer tag to the scene unless the scene already carries it.

    The scene is fetched again right before the update, and the update sends the
    scene's current tag ids plus the trailer tag id.
    """
    refreshed_scene = stash.find_scene(scene_id)
    existing_tag_ids = [tag["id"] for tag in refreshed_scene["tags"]]
    if trailer_tag["id"] not in existing_tag_ids:
        stash.update_scene({ "id": scene_id, "tag_ids": existing_tag_ids + [trailer_tag["id"]] })


for _, row in df_stash_scenes_for_trailers.iterrows():
    scene_id = row["id"]
    scene_oshash = row["stash_oshash"]
    culture_extractor_uuid = row["culture_extractor_uuid"]

    # stash_oshash is None when no oshash fingerprint was found. Formatting it into the
    # target name would give "None.mp4", a single file shared by all such scenes, so one
    # copied trailer would make every other such scene look like it already has one.
    if pd.isna(scene_oshash):
        print(f"No oshash for scene {scene_id}, skipping")
        continue

    stash_trailer_path = os.path.join(stash_trailer_base_path, f"{scene_oshash}.mp4")
    if os.path.exists(stash_trailer_path):
        # Trailer is already in place; only make sure the scene is tagged
        _ensure_trailer_tag(scene_id)
        print(f"Stash: Trailer already exists for scene {scene_id} at {stash_trailer_path}")
        continue

    # Candidates are checked on D: first, then F:, each in the file name order given above
    trailer_candidate_paths = [
        os.path.join(base_path, f"{culture_extractor_uuid}", filename)
        for base_path in (culture_extractor_trailer_base_path_d, culture_extractor_trailer_base_path_f)
        for filename in trailer_filename_candidates
    ]
    matching_trailer_path = next((path for path in trailer_candidate_paths if os.path.exists(path)), None)
    if matching_trailer_path:
        shutil.copy(matching_trailer_path, stash_trailer_path)
        print(f"Copied trailer for scene {scene_id} to {stash_trailer_path}")
        _ensure_trailer_tag(scene_id)
    else:
        print(f"No trailer found for scene {scene_id}")
