In [None]:
import os
from pathlib import Path
import sys

import polars as pl
from dotenv import load_dotenv


load_dotenv()

from dotenv import load_dotenv


sys.path.append(str(Path.cwd().parent))

import libraries.client_culture_extractor as client_culture_extractor


# Culture Extractor
# Connection settings are read from the environment (load_dotenv() was called earlier in this cell).
user = os.environ.get("CE_DB_USERNAME")
pw = os.environ.get("CE_DB_PASSWORD")
host = os.environ.get("CE_DB_HOST")
port = os.environ.get("CE_DB_PORT")
db = os.environ.get("CE_DB_NAME")

# Fail fast on unset variables: os.environ.get() returns None for them, and the
# f-string below would otherwise embed the literal text "None" in the connection string.
# Only None is rejected; an explicitly empty value is passed through unchanged.
missing_ce_settings = [
    name
    for name, value in (
        ("CE_DB_USERNAME", user),
        ("CE_DB_PASSWORD", pw),
        ("CE_DB_HOST", host),
        ("CE_DB_PORT", port),
        ("CE_DB_NAME", db),
    )
    if value is None
]
if missing_ce_settings:
    raise RuntimeError(
        "Missing Culture Extractor database settings: "
        + ", ".join(missing_ce_settings)
    )

# NOTE(review): values are interpolated unquoted; if the client parses this as a
# libpq keyword/value string, a value containing spaces would need quoting — confirm.
connection_string = f"dbname={db} user={user} password={pw} host={host} port={port}"

culture_extractor_client = client_culture_extractor.ClientCultureExtractor(
    connection_string
)


# StashApp
from libraries.client_stashapp import StashAppClient, get_stashapp_client


# Two handles are created from libraries.client_stashapp.
# NOTE(review): going by the variable names, StashAppClient is presumably the project's
# wrapper and get_stashapp_client() the underlying raw API client — confirm in the library.
stash_client = StashAppClient()
stash_raw_client = get_stashapp_client()


# StashDB
import os

import dotenv

from libraries.StashDbClient import StashDbClient


dotenv.load_dotenv()

# Read the StashDB settings into named variables so the constructor call stays short.
# Both are passed positionally: endpoint first, API key second.
stashdb_endpoint = os.getenv("STASHDB_ENDPOINT")
stashdb_api_key = os.getenv("STASHDB_API_KEY")
stashbox_client = StashDbClient(stashdb_endpoint, stashdb_api_key)


# Functions
def hex_to_binary(hex_string):
    """Convert a hexadecimal string to its binary digits, left-padded with zeros to 64 characters.

    Values wider than 64 bits are returned at their natural (longer) length.
    """
    value = int(hex_string, 16)
    # bin() prefixes its result with "0b"; drop those two characters before padding.
    digits = bin(value)[2:]
    return digits.zfill(64)


def calculate_hamming_distance(phash1, phash2):
    """Count the bit positions at which two hexadecimal phashes differ.

    Args:
        phash1: First hash as a hexadecimal string.
        phash2: Second hash as a hexadecimal string.

    Returns:
        int: Number of differing bits.

    Raises:
        ValueError: If the two binary expansions have different lengths.
    """
    bits_a = hex_to_binary(phash1)
    bits_b = hex_to_binary(phash2)

    # A position-by-position comparison is only meaningful for equal-length strings.
    if len(bits_a) != len(bits_b):
        raise ValueError("Binary strings must be of equal length")

    differing = 0
    for bit_a, bit_b in zip(bits_a, bits_b):
        if bit_a != bit_b:
            differing += 1
    return differing


# Example usage:
# phash1 = "951428607cf7cb8f"
# phash2 = "951428607cf7cb8e"
# distance = calculate_hamming_distance(phash1, phash2)
# print(f"Hamming distance between {phash1} and {phash2}: {distance}")


def levenshtein(s1: str, s2: str):
    """Return the case-insensitive Levenshtein distance between two strings.

    Returns None when either argument is empty or None.
    """
    if not (s1 and s2):
        return None

    # Imported lazily, so the package is only needed once a real comparison is made.
    from Levenshtein import distance as edit_distance

    return edit_distance(s1.lower(), s2.lower())

In [None]:
# Fetch the full site and sub-site tables once. The selection cell further down filters
# them on the "ce_sites_name" / "ce_sub_sites_name" columns and reads the matching
# "ce_sites_uuid" / "ce_sub_sites_uuid" values.
all_ce_sites = culture_extractor_client.get_sites()
all_ce_sub_sites = culture_extractor_client.get_sub_sites()

In [None]:
# Bare last expression: renders the sites table as this cell's output.
all_ce_sites

In [None]:
# Function Definition
def list_files_in_directory(root_dir):
    """
    Recursively collect every file beneath a directory.

    Args:
        root_dir (str): Directory whose tree is walked with os.walk.

    Returns:
        polars.DataFrame: One row per file with two string columns:
            'file_path' (the directory path joined with the file name) and
            'extension' (the lower-cased os.path.splitext suffix, '' when the
            name has none). The frame has zero rows when no files are found.
    """
    paths = [
        os.path.join(folder, name)
        for folder, _, names in os.walk(root_dir)
        for name in names
    ]
    extensions = [os.path.splitext(path)[1].lower() for path in paths]

    # An explicit schema keeps both columns typed as strings even when the lists are empty.
    return pl.DataFrame(
        {"file_path": paths, "extension": extensions},
        schema={"file_path": pl.Utf8, "extension": pl.Utf8},
    )

In [None]:
# Main execution cell

site_name = "Sexy Hub"
sub_site_name = "Dane Jones"

# literal=True makes this a plain substring match; by default str.contains() treats the
# pattern as a regular expression, so names containing characters such as "(", "+" or "."
# would not match as written.
site_row = all_ce_sites.filter(
    pl.col("ce_sites_name").str.contains(site_name, literal=True)
)
sub_site_row = all_ce_sub_sites.filter(
    pl.col("ce_sub_sites_name").str.contains(sub_site_name, literal=True)
)
print(site_row)
print(sub_site_row)

# Report a missing match by name instead of an opaque IndexError from `[0]` below.
if site_row.height == 0:
    raise ValueError(f"No Culture Extractor site name contains {site_name!r}")
if sub_site_row.height == 0:
    raise ValueError(f"No Culture Extractor sub-site name contains {sub_site_name!r}")

# When several rows match, the first one is used.
# NOTE(review): the sub-site lookup is not restricted to the selected site — confirm that
# sub-site names are unique across sites.
site_uuid = site_row["ce_sites_uuid"].to_list()[0]
sub_site_uuid = sub_site_row["ce_sub_sites_uuid"].to_list()[0]
print(site_uuid)
print(sub_site_uuid)

In [None]:
# Fetch the download records for the site / sub-site selected in the previous cell.
downloads = culture_extractor_client.get_downloads(site_uuid, sub_site_uuid)
# Print the column schema, then show a single row as JSON to see the record shape.
print(downloads.schema)
downloads.head(1).write_json()

In [None]:
# Replace with the actual directory you want to scan
# For Windows paths, use raw strings (r"...") or escape backslashes (\\)
# The character after "Sexy Hub" is U+A789 (MODIFIER LETTER COLON), a colon look-alike;
# a real ":" is not allowed in Windows file names.
target_directory = r"F:\Ripping\Sexy Hub꞉ Dane Jones"

# os.walk() ignores errors by default, so a missing or mistyped directory would silently
# produce an empty listing (and an empty join later on). Stop here with a clear message.
if not os.path.isdir(target_directory):
    raise FileNotFoundError(f"Directory to scan does not exist: {target_directory}")

all_files_df = list_files_in_directory(target_directory)
print(all_files_df.schema)
all_files_df.head(1).write_json()

In [None]:
# Match download records to the files found on disk

# The downloads table stores bare file names, so derive one from each full path.
filename_expr = (
    pl.col("file_path")
    .map_elements(lambda path: os.path.basename(path), return_dtype=pl.Utf8)
    .alias("filename")
)
all_files_df_with_filename = all_files_df.with_columns(filename_expr)

# Inner join: keep only downloads whose saved file name has a matching file on disk.
joined_df = downloads.join(
    all_files_df_with_filename,
    how="inner",
    left_on="ce_downloads_saved_filename",
    right_on="filename",
)
joined_df

In [None]:
# Filtering: Video files

video_extensions = [".mp4", ".mkv", ".avi", ".mov", ".wmv", ".flv"]
# Entries carry the leading dot and are lower-case, matching how the "extension"
# column is produced by list_files_in_directory.
is_video_file = pl.col("extension").is_in(video_extensions)
joined_df_video_files_df = joined_df.filter(is_video_file)
joined_df_video_files_df

In [None]:
# Inspect the schema and a JSON sample of the performers column; the next cell explodes
# this column and reads the "name" struct field of each element.
print(joined_df_video_files_df.schema)
joined_df_video_files_df.head(5).select("ce_downloads_performers").write_json()

In [None]:
# Count files per performer
# Explode the performers list so each (file, performer) pair becomes its own row.
files_by_performer = joined_df_video_files_df.explode("ce_downloads_performers")
# Grouping on the struct field yields a key column called "name".
performer_name = pl.col("ce_downloads_performers").struct.field("name")
performer_counts = (
    files_by_performer.group_by(performer_name)
    .agg(pl.count().alias("file_count"))
    .sort("file_count", descending=True)
    .filter(pl.col("name").is_not_null())
)
performer_counts

In [None]:
# Filtering: Video and ZIP files

# Same extension list as the video filter, with ZIP archives added.
video_and_zip_extensions = [*video_extensions, ".zip"]
is_video_or_zip_file = pl.col("extension").is_in(video_and_zip_extensions)
joined_df_video_and_zip_files_df = joined_df.filter(is_video_or_zip_file)
joined_df_video_and_zip_files_df