In [None]:
%pip install psycopg2-binary
%pip install sqlalchemy

In [2]:
import os

import pandas as pd
import psycopg
from dotenv import load_dotenv
from psycopg.conninfo import make_conninfo

# Load the .env file
load_dotenv()

# Database settings are read from the environment (populated from .env above).
user = os.environ.get("CE_DB_USERNAME")
pw = os.environ.get("CE_DB_PASSWORD")
host = os.environ.get("CE_DB_HOST")
port = os.environ.get("CE_DB_PORT")
db = os.environ.get("CE_DB_NAME")

# Let psycopg build the libpq connection string instead of formatting it by
# hand: values containing spaces or quotes are escaped, and settings that are
# unset (None) are omitted rather than embedded as the literal text "None".
connection_string = make_conninfo(dbname=db, user=user, password=pw, host=host, port=port)

def hex_to_binary(hex_string):
    """Convert a hexadecimal string to a string of binary digits.

    The result is left-padded with zeros to a minimum of 64 characters;
    longer values are returned at their full length.
    """
    value = int(hex_string, 16)
    digits = bin(value)[2:]  # strip the "0b" prefix
    return digits.rjust(64, "0")

def calculate_hamming_distance(phash1, phash2):
    """Count the bit positions at which two hexadecimal phashes differ.

    Both hashes are expanded to binary strings with hex_to_binary before
    being compared position by position.

    Raises:
        ValueError: if the two binary expansions have different lengths.
    """
    bits_a, bits_b = hex_to_binary(phash1), hex_to_binary(phash2)

    if len(bits_a) != len(bits_b):
        raise ValueError("Binary strings must be of equal length")

    mismatches = 0
    for bit_a, bit_b in zip(bits_a, bits_b):
        if bit_a != bit_b:
            mismatches += 1
    return mismatches

# Example usage:
# phash1 = "951428607cf7cb8f"
# phash2 = "951428607cf7cb8e"
# distance = calculate_hamming_distance(phash1, phash2)
# print(f"Hamming distance between {phash1} and {phash2}: {distance}")

def levenshtein(s1, s2):
    """Return the Levenshtein edit distance between s1 and s2.

    The Levenshtein package is imported on first use, so it is only
    required when this function is actually called.
    """
    import Levenshtein

    return Levenshtein.distance(s1, s2)


In [11]:
# Load every row of the `sites` table, ordered by name, into df_sites.
with psycopg.connect(connection_string) as conn:
    query = "SELECT * FROM sites ORDER BY name"

    # The cursor is released by its own context manager once the rows and
    # column metadata have been read.
    with conn.cursor() as cursor:
        cursor.execute(query)
        results = cursor.fetchall()
        column_names = [column[0] for column in cursor.description]

    # Build the data frame using the column names reported by the cursor.
    df_sites = pd.DataFrame(results, columns=column_names)


In [16]:
# Load every sub-site together with the name of its parent site, ordered by
# sub-site name, into df_subsites.
with psycopg.connect(connection_string) as conn:
    query = """
        SELECT sub_sites.*, sites.name AS site_name
        FROM sub_sites
        JOIN sites ON sub_sites.site_uuid = sites.uuid
        ORDER BY sub_sites.name
    """

    # The cursor is released by its own context manager once the rows and
    # column metadata have been read.
    with conn.cursor() as cursor:
        cursor.execute(query)
        results = cursor.fetchall()
        column_names = [column[0] for column in cursor.description]

    # Build the data frame using the column names reported by the cursor.
    df_subsites = pd.DataFrame(results, columns=column_names)


In [None]:
from libraries.client_stashapp import get_stashapp_client

# Stash API client used by the cells below. It is created by the project
# helper get_stashapp_client; how that helper locates and authenticates
# against the Stash server is not visible here (confirm in libraries.client_stashapp).
stash = get_stashapp_client()

def get_parent_studio_id(studio):
    """Return the id of a studio's parent studio, or None if it has no parent.

    `studio` is any mapping-like object (dict or DataFrame row) exposing a
    "parent_studio" entry that is either None or a mapping with an "id".
    """
    parent = studio["parent_studio"]
    if parent is None:
        return None
    return parent["id"]


In [116]:
# Fetch all studios from Stash and derive each studio's parent id as a column.
studios = stash.find_studios({})
df_stash_studios = pd.DataFrame(studios).assign(
    parent_studio_id=lambda frame: frame.apply(get_parent_studio_id, axis=1)
)

# Lookup functions
def get_studio_by_id(studio_id):
    """Return the rows of df_stash_studios whose "id" equals studio_id.

    The result is a DataFrame, which is empty when nothing matches.
    """
    id_matches = df_stash_studios["id"] == studio_id
    return df_stash_studios[id_matches]

def get_studio_by_name(studio_name):
    """Return the rows of df_stash_studios whose "name" equals studio_name.

    The result is a DataFrame, which is empty when nothing matches.
    """
    name_matches = df_stash_studios["name"] == studio_name
    return df_stash_studios[name_matches]

In [13]:
# Prefix the columns of both frames so their origin stays obvious after merging
df_sites_prefixed = df_sites.add_prefix('culture_extractor_')
df_stash_studios_prefixed = df_stash_studios.add_prefix('stash_')

# Keep only the studios whose Stash name equals a Culture Extractor site name
df_matched_studios = df_stash_studios_prefixed.merge(
    df_sites_prefixed,
    how='inner',
    left_on='stash_name',
    right_on='culture_extractor_name',
)

In [None]:
# Store Culture Extractor UUID in Stash studio
name = "SexArt"

# Pick the matched studio/site pair for this name; stop if there is none.
df_matched_studio = df_matched_studios[df_matched_studios["stash_name"] == name]
if df_matched_studio.empty:
    print(f"No studio found with name: {name}")
    raise ValueError(f"No studio found with name: {name}")
df_matched_studio = df_matched_studio.iloc[0]

# Fetch the studio from Stash by name to get its current stash_ids list.
refreshed_studio = stash.find_studio(name)
assert refreshed_studio is not None, f"No studio found with name: {name}"

stashbox_ids = refreshed_studio["stash_ids"]
culture_extractor_endpoint = "https://culture.extractor/graphql"
culture_extractor_uuid = str(df_matched_studio["culture_extractor_uuid"])

# First stash_ids entry registered for the Culture Extractor endpoint, if any.
existing_stash_id = next(
    (entry for entry in stashbox_ids if entry["endpoint"] == culture_extractor_endpoint),
    None,
)

if not existing_stash_id:
    # No entry for this endpoint yet: add one.
    stashbox_ids.append({"endpoint": culture_extractor_endpoint, "stash_id": culture_extractor_uuid})
    stash.update_studio({"id": refreshed_studio["id"], "stash_ids": stashbox_ids})
    print(f"Added new stash_id for {culture_extractor_endpoint}")
elif existing_stash_id["stash_id"] != culture_extractor_uuid:
    # An entry exists but points at a different uuid: overwrite it.
    existing_stash_id["stash_id"] = culture_extractor_uuid
    stash.update_studio({"id": refreshed_studio["id"], "stash_ids": stashbox_ids})
    print(f"Updated stash_id for {culture_extractor_endpoint}")


In [None]:
# Get studio for scene matching
stash_site_name = 'SexArt'

current_studio = get_studio_by_name(stash_site_name)
if isinstance(current_studio, pd.DataFrame) and not current_studio.empty:
    # Continue with the first matching row as a plain dict.
    current_studio = current_studio.iloc[0].to_dict()

# Take the stash_id of the first entry registered for the Culture Extractor
# endpoint; stays None when the studio or such an entry is missing.
culture_extractor_site_uuid = None
if isinstance(current_studio, dict) and "stash_ids" in current_studio:
    culture_extractor_site_uuid = next(
        (
            entry.get("stash_id")
            for entry in current_studio["stash_ids"]
            if isinstance(entry, dict) and entry.get("endpoint") == "https://culture.extractor/graphql"
        ),
        None,
    )

assert culture_extractor_site_uuid is not None, f"No Culture Extractor site uuid found for {stash_site_name}"
print(f"Matched Stash studio {stash_site_name} to Culture Extractor site uuid {culture_extractor_site_uuid}")


In [130]:
# Destructive: removes the release row with this uuid from the `releases` table.
delete_uuid = "018b8e83-e2e3-718e-966d-c4f745149c79"

with psycopg.connect(connection_string) as conn:
    # The uuid is passed as a bound parameter, not formatted into the SQL text.
    with conn.cursor() as cursor:
        cursor.execute("DELETE FROM releases WHERE uuid = %s", (delete_uuid,))


In [None]:
# Get scenes from Culture Extractor
#
# One row per (release, download) for the selected site, restricted to video
# scene downloads and excluding the listed low-resolution variants.
# The site uuid is supplied as a bound query parameter (the bare %s below)
# instead of being formatted into the SQL text, so it is quoted and escaped
# by the database driver.
query_template = """
    SELECT
        sites.uuid AS site_uuid,
        sites.short_name AS site_short_name,
        sites.name AS site_name,

        releases.uuid AS release_uuid,
        releases.release_date AS release_date,
        releases.short_name AS release_short_name,
        releases.name AS release_name,
        releases.url AS release_url,
        releases.json_document AS release_json_document,
        downloads.uuid AS downloads_uuid,
        downloads.downloaded_at AS downloads_downloaded_at,
        downloads.variant AS downloads_variant,
        downloads.file_type AS downloads_file_type,
        downloads.content_type AS downloads_content_type,
        downloads.file_metadata AS downloads_file_metadata
    FROM releases
    JOIN sites ON releases.site_uuid = sites.uuid
    JOIN downloads ON releases.uuid = downloads.release_uuid
    WHERE
        sites.uuid = %s AND
        downloads.file_type = 'video' AND
        downloads.content_type = 'scene' AND
        (downloads.variant NOT IN ('480x270', '270p', '320p', '360p', '270p MOBILE'));
    """


def _metadata_value(file_metadata, key):
    """Return file_metadata[key] when file_metadata is a dict containing key, else None."""
    if isinstance(file_metadata, dict) and key in file_metadata:
        return file_metadata[key]
    return None


with psycopg.connect(connection_string) as conn:
    with conn.cursor() as cursor:
        cursor.execute(query_template, (culture_extractor_site_uuid,))
        results = cursor.fetchall()
        column_names = [desc[0] for desc in cursor.description]

df_culture_extractor_scenes = pd.DataFrame(results, columns=column_names)

# Lift duration and phash out of the per-download metadata document.
df_culture_extractor_scenes["culture_extractor_duration"] = df_culture_extractor_scenes["downloads_file_metadata"].apply(
    lambda metadata: _metadata_value(metadata, "duration")
)
df_culture_extractor_scenes["culture_extractor_phash"] = df_culture_extractor_scenes["downloads_file_metadata"].apply(
    lambda metadata: _metadata_value(metadata, "phash")
)
df_culture_extractor_scenes

In [None]:
# Rows whose release_short_name occurs more than once (all occurrences are kept).
repeated_short_name = df_culture_extractor_scenes.duplicated(subset=['release_short_name'], keep=False)
df_nonunique_release_short_name = df_culture_extractor_scenes[repeated_short_name]
df_nonunique_release_short_name


In [None]:
# Get scenes from Stash
# Scene filter: scenes whose studio is the current studio (INCLUDES modifier, depth -1).
scene_filter = {
    "studios": {"value": [current_studio["id"]], "excludes": [], "modifier": "INCLUDES", "depth": -1}
}
# Fields requested for every scene.
scene_fragment = "id title code date files { id path basename fingerprints { type value } format width height video_codec frame_rate duration } studio { id name tags { id name } } performers { id name gender tags { id name} } stash_ids { endpoint stash_id } urls"

# Note: no paging/sort filter is passed; a previously tried one was
# filter={"per_page": 500, "page": 1, "sort": "path", "direction": "DESC"}.
stash_scenes = stash.find_scenes(scene_filter, fragment=scene_fragment)
df_stash_scenes = pd.DataFrame(stash_scenes)

def get_endpoint_stash_id(stash_ids, endpoint):
    """Return the "stash_id" of the first entry whose "endpoint" matches.

    `stash_ids` is an iterable of mappings with "endpoint" and "stash_id"
    keys. Returns None when no entry is registered for `endpoint`.
    """
    return next(
        (entry["stash_id"] for entry in stash_ids if entry["endpoint"] == endpoint),
        None,
    )

def get_tpdb_id(stash_ids):
    """Return the stash id registered for the ThePornDB endpoint, or None."""
    tpdb_endpoint = "https://theporndb.net/graphql"
    return get_endpoint_stash_id(stash_ids, tpdb_endpoint)

def get_stashdb_id(stash_ids):
    """Return the stash id registered for the StashDB endpoint, or None."""
    stashdb_endpoint = "https://stashdb.org/graphql"
    return get_endpoint_stash_id(stash_ids, stashdb_endpoint)

def get_culture_extractor_id(stash_ids):
    """Return the stash id registered for the Culture Extractor endpoint, or None."""
    culture_extractor_endpoint = "https://culture.extractor/graphql"
    return get_endpoint_stash_id(stash_ids, culture_extractor_endpoint)

def _first_file(files):
    """Return a scene's first file entry, or None when the scene has no files."""
    return files[0] if files else None


def _first_file_duration(files):
    """Return the duration of a scene's first file, or NaN when the scene has no files."""
    first_file = _first_file(files)
    return float("nan") if first_file is None else first_file["duration"]


def _first_file_fingerprint(files, fingerprint_type):
    """Return the first fingerprint value of the given type on a scene's first file, else None."""
    first_file = _first_file(files)
    if first_file is None:
        return None
    return next(
        (fingerprint["value"] for fingerprint in first_file["fingerprints"] if fingerprint["type"] == fingerprint_type),
        None,
    )


# Derived columns used for matching. Only the first file of each scene is
# considered; a scene with an empty file list yields NaN/None instead of
# raising IndexError.
df_stash_scenes["date"] = pd.to_datetime(df_stash_scenes["date"])
df_stash_scenes["stashdb_id"] = df_stash_scenes["stash_ids"].apply(get_stashdb_id)
df_stash_scenes["tpdb_id"] = df_stash_scenes["stash_ids"].apply(get_tpdb_id)
df_stash_scenes["culture_extractor_id"] = df_stash_scenes["stash_ids"].apply(get_culture_extractor_id)
df_stash_scenes["stash_duration"] = df_stash_scenes["files"].apply(_first_file_duration)
df_stash_scenes["stash_phash"] = df_stash_scenes["files"].apply(lambda files: _first_file_fingerprint(files, "phash"))
df_stash_scenes["stash_oshash"] = df_stash_scenes["files"].apply(lambda files: _first_file_fingerprint(files, "oshash"))

df_stash_scenes

In [None]:
# Check for duplicate scenes based on StashDB ID
# A scene counts as a duplicate when it has a StashDB id and at least one
# other scene carries the same id (every member of the group is kept).
has_stashdb_id = df_stash_scenes['stashdb_id'].notna()
shares_stashdb_id = df_stash_scenes.duplicated(subset=['stashdb_id'], keep=False)
df_duplicate_stash_scenes = df_stash_scenes[has_stashdb_id & shares_stashdb_id]
df_duplicate_stash_scenes


In [56]:
# Add the duplicate tag to the scenes
duplicate_stashdb_ids_tag = stash.find_tag("StashDB: Duplicate Scenes Based On ID")

for _, scene_row in df_duplicate_stash_scenes.iterrows():
    # Re-read the scene so the tag list being extended is the current one.
    refreshed_scene = stash.find_scene(scene_row["id"])
    existing_tag_ids = [tag["id"] for tag in refreshed_scene["tags"]]
    if duplicate_stashdb_ids_tag["id"] in existing_tag_ids:
        continue  # already tagged
    updated_tag_ids = existing_tag_ids + [duplicate_stashdb_ids_tag["id"]]
    stash.update_scene({ "id": scene_row["id"], "tag_ids": updated_tag_ids })


# Matching existing scenes

In [135]:
# Match Stash and Culture Extractor scenes based on phash
df_culture_extractor_scenes['release_date'] = pd.to_datetime(df_culture_extractor_scenes['release_date'])

# pandas matches null join keys against each other. Without this filter every
# Stash scene lacking a phash would be paired with every Culture Extractor
# download lacking one, producing bogus matches.
df_culture_extractor_with_phash = df_culture_extractor_scenes[
    df_culture_extractor_scenes["culture_extractor_phash"].notna()
]

# Left merge: every Stash scene is kept; unmatched ones get null release columns.
df_merged_scenes = pd.merge(df_stash_scenes, df_culture_extractor_with_phash,
                            left_on='stash_phash', right_on='culture_extractor_phash',
                            how='left')

# Only the rows that found a Culture Extractor release. Copy after filtering so
# the column assignments below operate on an independent frame.
df_merged_matched_scenes = df_merged_scenes[df_merged_scenes["release_uuid"].notnull()].copy()

df_merged_matched_scenes["duration_difference"] = df_merged_matched_scenes["stash_duration"] - df_merged_matched_scenes["culture_extractor_duration"]

# Plain list comprehensions (rather than a row-wise DataFrame.apply) so the
# assignments also work when there are no matched rows at all.
df_merged_matched_scenes["phash_distance"] = [
    calculate_hamming_distance(stash_phash, culture_extractor_phash)
    for stash_phash, culture_extractor_phash in zip(
        df_merged_matched_scenes["stash_phash"], df_merged_matched_scenes["culture_extractor_phash"]
    )
]
df_merged_matched_scenes["title_levenshtein"] = [
    levenshtein(title, release_name)
    for title, release_name in zip(
        df_merged_matched_scenes["title"], df_merged_matched_scenes["release_name"]
    )
]

In [None]:
# Update Stash scenes with Culture Extractor ID
for index, row in df_merged_matched_scenes.iterrows():
    release_uuid = row["release_uuid"]
    if pd.isnull(release_uuid):
        # Nothing to store; skip before spending an API call on the scene.
        continue

    # `scene_id` rather than `id`, so the builtin id() is not shadowed in the
    # notebook's global namespace.
    scene_id = row["id"]

    # Re-read the scene so the stash_ids list being extended is the current one.
    refreshed_scene = stash.find_scene(scene_id)
    existing_stash_ids = refreshed_scene["stash_ids"]

    already_linked = any(
        stash_id_obj["endpoint"] == "https://culture.extractor/graphql"
        for stash_id_obj in existing_stash_ids
    )
    if already_linked:
        continue

    existing_stash_ids.append({ "endpoint": "https://culture.extractor/graphql", "stash_id": str(release_uuid) })
    code = row["release_short_name"]
    stash.update_scene({ "id": scene_id, "code": code, "stash_ids": existing_stash_ids })
    print(f"Updated scene {scene_id} with Culture Extractor ID {release_uuid}")


In [None]:
def _with_release_url(urls, release_url):
    """Return urls with release_url appended, unless it is missing or already listed.

    Rows of the left merge that found no release have a null release_url;
    for those the original list is returned unchanged instead of gaining a
    NaN entry.
    """
    if pd.isnull(release_url) or release_url in urls:
        return urls
    return urls + [release_url]


# Create a new column 'new_urls' where 'release_url' is appended to 'urls' if not already present
df_merged_scenes['new_urls'] = df_merged_scenes.apply(
    lambda row: _with_release_url(row['urls'], row['release_url']),
    axis=1,
)

df_merged_scenes[["id", "title", "new_urls"]]

In [None]:
def _with_culture_extractor_stash_id(stash_ids, release_uuid):
    """Return stash_ids with a Culture Extractor entry for release_uuid appended.

    The list is returned unchanged when the row has no matched release
    (null release_uuid) or when an entry for the Culture Extractor endpoint
    is already present.
    """
    endpoint = "https://culture.extractor/graphql"
    if pd.isnull(release_uuid):
        return stash_ids
    if any(stash_id_obj["endpoint"] == endpoint for stash_id_obj in stash_ids):
        return stash_ids
    return stash_ids + [{ "endpoint": endpoint, "stash_id": str(release_uuid) }]


# Create a new column 'new_stash_ids' where a Culture Extractor stash id is
# appended to 'stash_ids' if that endpoint is not already present
df_merged_scenes['new_stash_ids'] = df_merged_scenes.apply(
    lambda row: _with_culture_extractor_stash_id(row['stash_ids'], row['release_uuid']),
    axis=1,
)
df_merged_scenes[["new_stash_ids"]].values


In [None]:
# Push the extended URL lists to Stash for scenes whose URL set changed.
for index, row in df_merged_scenes.iterrows():
    # Rows without a matched Culture Extractor release have no real release
    # URL to add (their release columns are null), so never write them back.
    if pd.isnull(row["release_uuid"]):
        continue

    # `scene_id` rather than `id`, so the builtin id() is not shadowed in the
    # notebook's global namespace.
    scene_id = row["id"]
    old_urls = row["urls"]
    new_urls = row["new_urls"]
    old_urls_set = set(old_urls)
    new_urls_set = set(new_urls)
    if old_urls_set != new_urls_set:
        print(row["title"])
        print(old_urls_set)
        print(new_urls_set)
        stash.update_scene({ "id": scene_id, "urls": new_urls })

    # Disabled: the equivalent write-back for stash_ids.
    # stash_ids = row["stash_ids"]
    # new_stash_ids = row["new_stash_ids"]
    # stash_ids_set = set(stash_id_obj["stash_id"] for stash_id_obj in row["stash_ids"])
    # new_stash_ids_set = set(stash_id_obj["stash_id"] for stash_id_obj in row["new_stash_ids"])
    # if stash_ids_set != new_stash_ids_set:
    #     print(row["title"])
    #     print(stash_ids_set)
    #     print(new_stash_ids_set)
    #     stash.update_scene({ "id": scene_id, "stash_ids": new_stash_ids })

# Import new scenes

In [23]:
import os
import pandas as pd
from pathlib import Path
import re

# Define the root directory
root_dir = Path(r"F:\Ripping\Tickling Submission\Metadata")

# Column order of df_files; also gives an empty scan a usable schema.
file_columns = ['release_uuid', 'preview_image', 'full_scene', 'trailer']


def _classify_release_files(release_dir):
    """Describe the files found in one release directory.

    The directory name is taken as the release uuid. Files are classified as:
      * preview_image - a .jpg file
      * trailer       - a .wmv/.mp4 file whose stem ends with the release uuid
      * full_scene    - a .wmv/.mp4 file whose stem ends with "- <width>x<height>"
    When several files fall into the same category, the last one in sorted
    order wins. Categories with no file stay None.

    A directory that cannot be listed is reported and returned with all
    categories set to None.
    """
    release_uuid = release_dir.name
    found = {
        'release_uuid': release_uuid,
        'preview_image': None,
        'full_scene': None,
        'trailer': None,
    }

    # Listing the directory is the operation that can actually raise OSError,
    # so that is what is guarded. Sorting makes the result reproducible.
    try:
        files = sorted(release_dir.iterdir())
    except OSError as e:
        print(f"Error accessing directory: {release_dir}. Error: {e}")
        return found

    for file in files:
        suffix = file.suffix.lower()
        if suffix == '.jpg':
            found['preview_image'] = file.name
        elif suffix in ('.wmv', '.mp4'):
            file_stem = file.stem  # Get filename without extension
            if file_stem.endswith(release_uuid):
                found['trailer'] = file.name
            elif re.search(r'- \d+x\d+$', file_stem):
                found['full_scene'] = file.name
    return found


# Walk through the directory structure: one entry per release directory
data = [
    _classify_release_files(release_dir)
    for release_dir in sorted(root_dir.iterdir())
    if release_dir.is_dir()
]

# Create a DataFrame (explicit columns so an empty scan still has the expected schema)
df_files = pd.DataFrame(data, columns=file_columns)


In [24]:
# Attach the Culture Extractor scene rows to each release directory found on
# disk; directories without a matching release keep null scene columns.
df_merged = df_files.merge(
    df_culture_extractor_scenes,
    how='left',
    left_on='release_uuid',
    right_on='release_uuid',
)


In [None]:
# Get release_uuid values where full_scene is missing
lacks_full_scene = df_merged['full_scene'].isnull()
missing_full_scene = df_merged.loc[lacks_full_scene, 'release_uuid'].tolist()

# Format the list for VS Code breakpoint condition
breakpoint_condition = "release_id in " + str(missing_full_scene)

print("VS Code breakpoint condition:", breakpoint_condition, sep="\n")


In [None]:
# Number of entries in missing_full_scene (shown as the cell output).
len(missing_full_scene)