In [None]:
import os
import sys
from difflib import get_close_matches
from pathlib import Path

import polars as pl


sys.path.append(str(Path.cwd().parent))

from libraries.client_stashapp import StashAppClient, get_stashapp_client


# Initialize Stash clients (both factories come from libraries.client_stashapp).
# NOTE(review): `stash_client` is not referenced by any cell visible in this
# notebook -- confirm whether it is still needed.
stash_client = StashAppClient()
# Client used by get_processed_scenes() for its find_tag / find_scenes queries.
stash_raw_client = get_stashapp_client()

def find_csv_files(base_paths):
    """Find all ``*.Scenes.csv`` files below the given base paths.

    Args:
        base_paths: Iterable of directory paths (str or Path) that are
            searched recursively. Paths that do not exist are reported with
            a warning and skipped.

    Returns:
        pl.DataFrame with one row per CSV file and the string columns
        ``csv_path``, ``video_name`` (file name without the trailing
        ``.Scenes.csv``), ``drive`` (upper-cased drive component from
        ``os.path.splitdrive``) and ``directory``. The columns are present
        even when no file was found, so downstream ``group_by``/``join``
        calls keep working on an empty result.
    """
    suffix = ".Scenes.csv"
    records = []

    for base_path in base_paths:
        root = Path(base_path)
        if not root.exists():
            print(f"Warning: Path {base_path} does not exist")
            continue

        # Find all .Scenes.csv files recursively
        for csv_file in root.rglob(f"*{suffix}"):
            records.append({
                "csv_path": str(csv_file),
                # Strip only the trailing marker. str.replace() would also
                # remove ".Scenes.csv" if it occurred earlier in the name.
                "video_name": csv_file.name[: -len(suffix)],
                "drive": os.path.splitdrive(str(csv_file))[0].upper(),
                "directory": str(csv_file.parent),
            })

    if not records:
        # pl.DataFrame([]) has no columns at all; give the empty result the
        # same schema as a populated one.
        return pl.DataFrame(
            schema={
                "csv_path": pl.String,
                "video_name": pl.String,
                "drive": pl.String,
                "directory": pl.String,
            }
        )

    return pl.DataFrame(records)

def get_processed_scenes(tag_name="Scenes: PySceneDetect: Processed"):
    """Return the Stash scenes that carry the tag ``tag_name``.

    Args:
        tag_name: Name of the tag to filter on. Defaults to the tag this
            notebook compares against the ``*.Scenes.csv`` files.

    Returns:
        Whatever ``stash_raw_client.find_scenes`` returns for the tag filter,
        requested with the fragment below (id, title, files, scene_markers).

    Raises:
        LookupError: If the tag lookup yields no result, instead of the
            opaque ``TypeError`` that subscripting an empty result produces.
    """
    # NOTE(review): assumes find_tag returns a falsy value (e.g. None) when the
    # tag does not exist -- confirm against the client library.
    tag = stash_raw_client.find_tag({"name": tag_name})
    if not tag:
        raise LookupError(f"Stash tag not found: {tag_name!r}")

    scene_fragment = (
        "id title files { id basename path }"
        " scene_markers { id primary_tag { id name }}"
    )
    return stash_raw_client.find_scenes(
        {"tags": {"value": [tag["id"]], "modifier": "INCLUDES_ALL"}},
        fragment=scene_fragment,
    )

# Status message printed once the setup cell has run to its end.
print("Utility functions loaded and Stash client initialized")

In [None]:
# Define the Culture directory paths
culture_paths = [
    "F:\\Culture",
    "W:\\Culture",
    "X:\\Culture",
    "Y:\\Culture",
    "Z:\\Culture"
]

print("Scanning for CSV files in Culture directories...")
csv_files_df = find_csv_files(culture_paths)

print(f"Found {len(csv_files_df)} CSV files")
if len(csv_files_df) > 0:
    print("CSV files by drive:")
    # Only the last expression of a cell is rendered, so the per-drive counts
    # have to be printed explicitly or they are silently discarded.
    print(csv_files_df.group_by("drive").agg(pl.len().alias("count")).sort("drive"))

csv_files_df


In [None]:
print("Getting processed scenes from Stash...")
processed_scenes_result = get_processed_scenes()

print(f"Found {len(processed_scenes_result)} processed scenes")

# Convert to DataFrame: one row per (scene, file) pair, so a scene with
# several files contributes several rows.
processed_scenes_data = []
for scene in processed_scenes_result:
    for file in scene["files"]:
        # Get the basename without extension for comparison
        basename_no_ext = Path(file["basename"]).stem
        processed_scenes_data.append({
            "scene_id": scene["id"],
            "title": scene["title"],
            "file_path": file["path"],
            "basename": file["basename"],
            "basename_no_ext": basename_no_ext,
            "drive": os.path.splitdrive(file["path"])[0].upper()
        })

processed_scenes_df = pl.DataFrame(processed_scenes_data)

if len(processed_scenes_df) > 0:
    print("Processed scenes by drive:")
    # Only the last expression of a cell is rendered, so the per-drive counts
    # have to be printed explicitly or they are silently discarded.
    print(processed_scenes_df.group_by("drive").agg(pl.len().alias("count")).sort("drive"))

processed_scenes_df

In [None]:
# Match every CSV file against the processed scenes by name.
print("Comparing CSV files with processed scenes...")

# After the left join, a null scene_id marks a CSV file for which no
# processed scene shares its name.
csv_without_processed = (
    csv_files_df
    .join(
        processed_scenes_df.select("basename_no_ext", "scene_id"),
        how="left",
        left_on="video_name",
        right_on="basename_no_ext",
    )
    .filter(pl.col("scene_id").is_null())
)

print(f"\nCSV files without corresponding processed scenes: {len(csv_without_processed)}")
if not csv_without_processed.is_empty():
    print("\nCSV files without processed scenes by drive:")
    print(
        csv_without_processed
        .group_by("drive")
        .agg(pl.len().alias("count"))
        .sort("drive")
    )

csv_without_processed


In [None]:
# Reverse direction: processed scene files for which no CSV shares the name.
# After the left join, a null csv_path marks a scene file without a CSV.
processed_without_csv = (
    processed_scenes_df
    .join(
        csv_files_df.select("video_name", "csv_path"),
        how="left",
        left_on="basename_no_ext",
        right_on="video_name",
    )
    .filter(pl.col("csv_path").is_null())
)

print(f"\nProcessed scenes without corresponding CSV files: {len(processed_without_csv)}")
if not processed_without_csv.is_empty():
    print("\nProcessed scenes without CSV files by drive:")
    print(
        processed_without_csv
        .group_by("drive")
        .agg(pl.len().alias("count"))
        .sort("drive")
    )

processed_without_csv


In [None]:
# Summary analysis
# Rows found on both sides: inner join of the CSV names against the
# processed basenames (computed up front, reported below).
exact_matches = csv_files_df.join(
    processed_scenes_df.select("basename_no_ext", "scene_id"),
    how="inner",
    left_on="video_name",
    right_on="basename_no_ext",
)

print("=== SUMMARY ===")
print(f"Total CSV files found: {len(csv_files_df)}")
print(f"Total processed scenes: {len(processed_scenes_df)}")
print(f"CSV files without processed scenes: {len(csv_without_processed)}")
print(f"Processed scenes without CSV files: {len(processed_without_csv)}")
print(f"Exact matches (CSV + Processed): {len(exact_matches)}")

# Plain row-count difference between the two frames.
print(f"\nDifference analysis:")
print(f"CSV files - Processed scenes = {len(csv_files_df)} - {len(processed_scenes_df)} = {len(csv_files_df) - len(processed_scenes_df)}")


In [None]:
# Look for potential filename mismatches
print("\n=== POTENTIAL FILENAME MISMATCHES ===")

# Get unique video names from CSV files
csv_video_names = set(csv_files_df["video_name"].to_list())
processed_basenames = set(processed_scenes_df["basename_no_ext"].to_list())

# Find similar names that might be mismatches.
# get_close_matches is imported in the notebook's top import cell so that all
# dependencies are declared in one place.
if len(csv_without_processed) > 0:
    print("\nLooking for similar filenames for unmatched CSV files:")
    for row in csv_without_processed.iter_rows(named=True):
        video_name = row["video_name"]
        # Up to 3 processed basenames with a similarity ratio of at least 0.8
        close_matches = get_close_matches(video_name, processed_basenames, n=3, cutoff=0.8)
        if close_matches:
            print(f"\nCSV: {video_name}")
            print(f"  Possible matches: {close_matches}")
            print(f"  CSV path: {row['csv_path']}")


In [None]:
# Fix the CSV video names by removing video extensions
video_extensions = [".mp4", ".mkv", ".avi", ".wmv", ".mov"]


def _strip_video_extension(name):
    """Return ``name`` without one trailing video extension.

    The comparison is case-insensitive (``clip.MP4`` is handled like
    ``clip.mp4``) to line up with ``Path(...).stem`` used for the processed
    scenes, which drops the extension regardless of case. Names without a
    known extension are returned unchanged.
    """
    for ext in video_extensions:
        if name[-len(ext):].lower() == ext:
            # Slice off the suffix only; str.replace() would also delete the
            # same text wherever else it occurs inside the name.
            return name[: -len(ext)]
    return name


csv_files_fixed = csv_files_df.with_columns(
    pl.col("video_name")
    # return_dtype makes the output type explicit instead of inferred
    .map_elements(_strip_video_extension, return_dtype=pl.String)
    .alias("video_name_fixed")
)

# Now redo the comparison with fixed names
# CSV files whose fixed name matches no processed scene (scene_id stays null)
csv_without_processed_fixed = csv_files_fixed.join(
    processed_scenes_df.select(["basename_no_ext", "scene_id"]),
    left_on="video_name_fixed",
    right_on="basename_no_ext",
    how="left"
).filter(pl.col("scene_id").is_null())

# Processed scene files for which no CSV has a matching fixed name
processed_without_csv_fixed = processed_scenes_df.join(
    csv_files_fixed.select(["video_name_fixed", "csv_path"]),
    left_on="basename_no_ext",
    right_on="video_name_fixed",
    how="left"
).filter(pl.col("csv_path").is_null())

# Rows present on both sides after the name fix
exact_matches_fixed = csv_files_fixed.join(
    processed_scenes_df.select(["basename_no_ext", "scene_id"]),
    left_on="video_name_fixed",
    right_on="basename_no_ext",
    how="inner"
)

print("=== FIXED SUMMARY ===")
print(f"Total CSV files found: {len(csv_files_fixed)}")
print(f"Total processed scenes: {len(processed_scenes_df)}")
print(f"CSV files without processed scenes: {len(csv_without_processed_fixed)}")
print(f"Processed scenes without CSV files: {len(processed_without_csv_fixed)}")
print(f"Exact matches (CSV + Processed): {len(exact_matches_fixed)}")

print("\nThe missing CSV file(s):")
processed_without_csv_fixed


In [None]:
# CSV files that still have no processed scene after the name fix,
# reduced to the columns needed to locate them.
unprocessed_csv_files = csv_without_processed_fixed.select(
    "csv_path", "video_name_fixed", "drive", "directory"
)

print(f"CSV files that need to be processed ({len(unprocessed_csv_files)}):")
unprocessed_csv_files
