# X-Art File Deduplication and Organization

This notebook implements a multi-stage matching and deduplication workflow for X-Art content:

1. **Phase 1**: Export Stashapp X-Art scenes without CE UUID pattern
2. **Phase 2**: Export Culture Extractor X-Art downloads with hashes
3. **Phase 3**: Multi-stage matching (oshash → phash → fuzzy title → partial title)
4. **Phase 4**: Review and approve matches
5. **Phase 5**: Execute approved file operations
6. **Phase 6**: Verify and report results

In [37]:
import json
import os
import re
import shutil
import sys
from dataclasses import asdict, dataclass
from pathlib import Path

import polars as pl
from dotenv import load_dotenv


# Load environment variables
load_dotenv()

# Add libraries to path (same pattern as other notebooks)
sys.path.append(str(Path.cwd().parent))

from culture_cli.modules.ce.utils.tag_matcher import calculate_similarity
from libraries.client_culture_extractor import ClientCultureExtractor
from libraries.client_stashapp import StashAppClient


def hamming_distance(hash1: str, hash2: str) -> int:
    """Return the number of differing bits between two hex-encoded hashes (phashes).

    The hashes are parsed as integers and XOR-ed; every set bit in the result is a
    position where the two hashes disagree. Unlike comparing zero-padded bit strings,
    this is correct for hashes of any width, including two hashes of different widths.

    Args:
        hash1: First hash as a hexadecimal string.
        hash2: Second hash as a hexadecimal string.

    Returns:
        The count of bit positions in which the two hashes differ.

    Raises:
        ValueError: If either argument is not valid hexadecimal.
        TypeError: If either argument is not a string.
    """
    return bin(int(hash1, 16) ^ int(hash2, 16)).count("1")

## Configuration

In [38]:
# Output directory for intermediate results
# NOTE(review): this is an absolute, machine-specific path — consider deriving it from
# the notebook location or an environment variable.
OUTPUT_DIR = Path("/Users/thardas/Private/Code/Culture/analysis/notebooks/stashapp/Stashapp/output")
# parents=True: also create missing intermediate folders instead of failing with
# FileNotFoundError when the parent of "output" does not exist yet
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# CE UUID pattern at end of filename (before extension): a space or hyphen, a
# 8-4-4-4-12 hex UUID, then the file extension
CE_UUID_PATTERN = re.compile(r"[ -][0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\.[^.]+$", re.IGNORECASE)

# Matching thresholds
PHASH_HAMMING_THRESHOLD = 16  # Maximum hamming distance for phash match
DURATION_DIFF_THRESHOLD = 60  # Maximum duration difference in seconds
TITLE_SIMILARITY_THRESHOLD = 0.85  # Minimum similarity score for fuzzy title match

# Dry run mode (set to False to execute actual file operations)
DRY_RUN = True

## Data Structures

In [39]:
from datetime import date, timedelta


@dataclass
class FileMatch:
    stashapp_scene_id: int
    stashapp_title: str
    stashapp_date: date | None
    stashapp_file_path: str
    stashapp_oshash: str | None
    stashapp_phash: str | None
    stashapp_duration: timedelta | None
    ce_release_uuid: str
    ce_release_name: str
    ce_release_date: date | None
    ce_saved_filename: str
    ce_oshash: str | None
    ce_phash: str | None
    match_type: str  # "oshash", "phash_duration", "date_title_fuzzy", "date_title_partial"
    confidence: float  # 0.0 to 1.0
    action: str  # "RENAME_DELETE", "MOVE_NEXT_TO", "FLAG_FOR_REVIEW"
    reason: str  # Human-readable explanation
    approved: bool = False  # User approval flag

    def to_dict(self):
        d = asdict(self)
        # Convert date and timedelta to strings for JSON serialization
        if d["stashapp_date"]:
            d["stashapp_date"] = str(d["stashapp_date"])
        if d["ce_release_date"]:
            d["ce_release_date"] = str(d["ce_release_date"])
        if d["stashapp_duration"]:
            d["stashapp_duration"] = d["stashapp_duration"].total_seconds()
        return d

## Phase 1: Export Stashapp X-Art Scenes Without CE UUID

In [40]:
# Connect to Stashapp and look up the studio row named "X-Art"
stash = StashAppClient()

studios_df = stash.get_studios()
xart_studio = studios_df.filter(pl.col("stash_studios_name") == "X-Art")

# Stop here when the studio is absent: the scene query below needs its ID
if not len(xart_studio):
    raise ValueError("X-Art studio not found in Stashapp")

xart_studio_id = xart_studio.get_column("stash_studios_id")[0]
print(f"Found X-Art studio with ID: {xart_studio_id}")

dUsing stash (v0.30.1-0) endpoint at https://stash.chiefsclub.com:443/graphql


Found X-Art studio with ID: 6


In [41]:
# Fetch every scene that belongs to the X-Art studio
print("Querying X-Art scenes from Stashapp...")
xart_scenes_df = stash.find_scenes_by_studio([xart_studio_id])
print(f"Found {len(xart_scenes_df)} X-Art scenes")

# Flag basenames that already end in a Culture Extractor UUID
xart_scenes_df = xart_scenes_df.with_columns(
    has_ce_uuid=pl.col("stashapp_primary_file_basename").map_elements(
        lambda basename: CE_UUID_PATTERN.search(basename) is not None if basename else False,
        return_dtype=pl.Boolean,
    )
)

# Keep only scenes lacking the UUID suffix, exposing the columns under short names
stashapp_df = xart_scenes_df.filter(~pl.col("has_ce_uuid")).select(
    scene_id=pl.col("stashapp_id"),
    title=pl.col("stashapp_title"),
    date=pl.col("stashapp_date"),
    file_path=pl.col("stashapp_primary_file_path"),
    basename=pl.col("stashapp_primary_file_basename"),
    oshash=pl.col("stashapp_primary_file_oshash"),
    phash=pl.col("stashapp_primary_file_phash"),
    duration=pl.col("stashapp_primary_file_duration"),
    has_ce_uuid=pl.col("has_ce_uuid"),
)

print(f"Found {len(stashapp_df)} scenes without CE UUID pattern")

Querying X-Art scenes from Stashapp...
Found 1021 X-Art scenes
Found 935 scenes without CE UUID pattern


In [42]:
# Print the (rows, columns) shape of the filtered Stashapp frame, then let the
# notebook render its first rows as the cell output
print(f"Stashapp DataFrame shape: {stashapp_df.shape}")
stashapp_df.head()

Stashapp DataFrame shape: (935, 9)


scene_id,title,date,file_path,basename,oshash,phash,duration,has_ce_uuid
i64,str,date,str,str,str,str,duration[ms],bool
158,"""A Long Time Cumming""",2018-05-24,"""Z:\Culture\Videos\Sites\Malibu…","""Malibu Media꞉ X-Art – 2018-05-…","""50b16d5b4cc22782""","""cac478f840e97ba5""",18m 30s 600ms,False
290,"""Blondes Have More Fun""",2016-06-08,"""Z:\Culture\Videos\Sites\Malibu…","""Malibu Media꞉ X-Art – 2016-06-…","""170166e84c21588d""","""fbda9da194b048c3""",17m 10s 260ms,False
415,"""Let Me Take Your Picture""",2013-09-14,"""Z:\Culture\Videos\Sites\Malibu…","""Malibu Media꞉ X-Art – 2013-09-…","""eb4174867c51b83e""","""ddc3094d991166ee""",17m 5s 700ms,False
624,"""Coming Late""",2014-06-19,"""Z:\Culture\Videos\Sites\Malibu…","""Malibu Media꞉ X-Art – 2014-06-…","""ea62317d7272a7a3""","""d4e0f233b4f2985c""",24m 45s 440ms,False
724,"""Czechmates""",2016-04-01,"""Z:\Culture\Videos\Sites\Malibu…","""Malibu Media꞉ X-Art – 2016-04-…","""105772790cc6245f""","""e5acd29cac964963""",28m 29s 190ms,False


In [43]:
# Write the filtered Stashapp scenes to a JSON file for inspection outside the notebook
output_file = OUTPUT_DIR.joinpath("stashapp_xart_scenes_without_ce_uuid.json")
stashapp_df.write_json(output_file)
print(f"Saved Stashapp scenes to: {output_file}")

Saved Stashapp scenes to: /Users/thardas/Private/Code/Culture/analysis/notebooks/stashapp/Stashapp/output/stashapp_xart_scenes_without_ce_uuid.json


## Phase 2: Export Culture Extractor X-Art Downloads

In [44]:
# Initialize CE client
# Connection option -> environment variable that supplies its value
ce_env_keys = {
    "dbname": "CE_DB_NAME",
    "user": "CE_DB_USERNAME",
    "password": "CE_DB_PASSWORD",
    "host": "CE_DB_HOST",
    "port": "CE_DB_PORT",
}

# Fail fast with a readable message: os.environ.get() would otherwise put the
# literal text "None" into the connection string for any unset variable
missing_ce_env = [env_key for env_key in ce_env_keys.values() if env_key not in os.environ]
if missing_ce_env:
    raise ValueError(f"Missing Culture Extractor database settings: {', '.join(missing_ce_env)}")

ce_connection_string = " ".join(
    f"{option}={os.environ[env_key]}" for option, env_key in ce_env_keys.items()
)
ce_client = ClientCultureExtractor(ce_connection_string)

# Get X-Art site UUID
sites_df = ce_client.get_sites()
xart_site = sites_df.filter(pl.col("ce_sites_name") == "X-Art")

if len(xart_site) == 0:
    raise ValueError("X-Art site not found in Culture Extractor database")

xart_site_uuid = xart_site["ce_sites_uuid"][0]
print(f"Found X-Art site with UUID: {xart_site_uuid}")

Found X-Art site with UUID: 019c4c0b-ceda-7706-8658-01ff934bced5


In [45]:
# Pull every download recorded for the X-Art site
print("Querying X-Art downloads from Culture Extractor...")
ce_downloads_df = ce_client.get_downloads(xart_site_uuid)
print(f"Found {len(ce_downloads_df)} total downloads")

# Keep only the rows whose file type is "video"
ce_videos_df = ce_downloads_df.filter(pl.col("ce_downloads_file_type").eq("video"))
print(f"Found {ce_videos_df.height} video downloads")

ce_videos_df.head()

Querying X-Art downloads from Culture Extractor...
Found 1915 total downloads
Found 514 video downloads


ce_downloads_site_uuid,ce_downloads_site_name,ce_downloads_sub_site_name,ce_downloads_release_uuid,ce_downloads_release_date,ce_downloads_release_short_name,ce_downloads_release_name,ce_downloads_release_url,ce_downloads_release_description,ce_downloads_release_created,ce_downloads_release_last_updated,ce_downloads_release_available_files,ce_downloads_release_json_document,ce_downloads_uuid,ce_downloads_downloaded_at,ce_downloads_file_type,ce_downloads_content_type,ce_downloads_variant,ce_downloads_available_file,ce_downloads_original_filename,ce_downloads_saved_filename,ce_downloads_file_metadata,ce_downloads_performers,ce_downloads_tags,ce_downloads_hash_oshash,ce_downloads_hash_phash,ce_downloads_hash_sha256
str,str,str,str,date,str,str,str,str,datetime[μs],datetime[μs],str,str,str,datetime[μs],str,str,str,str,str,str,str,list[struct[4]],list[struct[4]],str,str,str
"""019c4c0b-ceda-7706-8658-01ff93…","""X-Art""",,"""019c5161-9d4e-7103-986b-50d794…",2025-12-19,"""a-very-xart-xmas""","""A Very X-Art Xmas""","""https://www.x-art.com/members/…","""Set against the glow of twinkl…",2026-02-12 10:24:37.717489,2026-02-12 10:24:37.717501,"""[{""file_type"": ""video"", ""conte…","""{""slug"": ""a-very-xart-xmas"", ""…","""019c5162-fbad-7649-9075-53c14d…",2026-02-12 12:26:07.405065,"""video""","""scene""","""4k""","""{""file_type"": ""video"", ""conten…","""xart_novella_night_a_very_x-ar…","""X-Art - 2025-12-19 - a-very-xa…","""{""$type"": ""VideoFileMetadata"",…",[],[],,,
"""019c4c0b-ceda-7706-8658-01ff93…","""X-Art""",,"""019c5163-1d12-77df-b51b-7c450b…",2025-08-26,"""first-time-anal-adventure""","""First Time Anal Adventure""","""https://www.x-art.com/members/…","""The Adventure continues... in …",2026-02-12 10:26:15.959220,2026-02-12 10:26:15.959240,"""[{""file_type"": ""video"", ""conte…","""{""slug"": ""first-time-anal-adve…","""019c5164-3c0f-75bb-a64c-5d461e…",2026-02-12 12:27:29.424015,"""video""","""scene""","""4k""","""{""file_type"": ""video"", ""conten…","""xart_cherry_kiss_charles_first…","""X-Art - 2025-08-26 - first-tim…","""{""$type"": ""VideoFileMetadata"",…",[],[],,,
"""019c4c0b-ceda-7706-8658-01ff93…","""X-Art""",,"""019c5164-4069-730d-96e6-a0e7c7…",2025-08-28,"""xart-tryout""","""X-Art Tryout""","""https://www.x-art.com/members/…","""Gorgeous model Nikki Hill emai…",2026-02-12 10:27:30.541229,2026-02-12 10:27:30.541249,"""[{""file_type"": ""video"", ""conte…","""{""slug"": ""xart-tryout"", ""title…","""019c5165-039c-71c7-9835-bb4af0…",2026-02-12 12:28:20.508622,"""video""","""scene""","""4k""","""{""file_type"": ""video"", ""conten…","""xart_nikki_hill_tryout_solo_4k…","""X-Art - 2025-08-28 - xart-tryo…","""{""$type"": ""VideoFileMetadata"",…",[],[],,,
"""019c4c0b-ceda-7706-8658-01ff93…","""X-Art""",,"""019c5165-07c4-72ba-af12-73d5e8…",2025-09-12,"""morning-delight""","""Morning Delight""","""https://www.x-art.com/members/…","""""",2026-02-12 10:28:21.577122,2026-02-12 10:28:21.577169,"""[{""file_type"": ""video"", ""conte…","""{""slug"": ""morning-delight"", ""t…","""019c5166-1042-7740-87c1-1fa342…",2026-02-12 12:29:29.282546,"""video""","""scene""","""4k""","""{""file_type"": ""video"", ""conten…","""xart_sybil_sasha_alina_morning…","""X-Art - 2025-09-12 - morning-d…","""{""$type"": ""VideoFileMetadata"",…",[],[],,,
"""019c4c0b-ceda-7706-8658-01ff93…","""X-Art""",,"""019c5166-24dc-752e-835a-7b3c68…",2025-09-20,"""the-studio-part-4""","""The Studio Part 4""","""https://www.x-art.com/members/…","""What's your greatest wish? May…",2026-02-12 10:29:34.559336,2026-02-12 10:29:34.559349,"""[{""file_type"": ""video"", ""conte…","""{""slug"": ""the-studio-part-4"", …","""019c5167-13a0-7594-a2ec-9a1028…",2026-02-12 12:30:35.680461,"""video""","""scene""","""4k""","""{""file_type"": ""video"", ""conten…","""xart_angelica_red_fox_the_stud…","""X-Art - 2025-09-20 - the-studi…","""{""$type"": ""VideoFileMetadata"",…",[],[],,,


In [46]:
# Write the CE video downloads to a JSON file for inspection outside the notebook
output_file = OUTPUT_DIR.joinpath("ce_xart_downloads.json")
ce_videos_df.write_json(output_file)
print(f"Saved CE downloads to: {output_file}")

Saved CE downloads to: /Users/thardas/Private/Code/Culture/analysis/notebooks/stashapp/Stashapp/output/ce_xart_downloads.json


## Phase 3: Multi-Stage Matching

Priority order:
1. **OSHash match** (confidence: 1.0) → RENAME_DELETE
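2. **Phash + duration match** (confidence: 0.95) → MOVE_NEXT_TO — note: the duration comparison is currently skipped because CE does not store durations, so this stage matches on phash alone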
3. **Date + fuzzy title match** (confidence: 0.80) → MOVE_NEXT_TO
4. **Date + partial title match** (confidence: 0.70) → FLAG_FOR_REVIEW

In [47]:
# Helper function to normalize titles for comparison
def normalize_title(title: str) -> str:
    """Normalize a title for comparison.

    Lowercases the title and strips every character that is not ``a-z`` or ``0-9``.
    Returns an empty string for an empty or ``None`` title.
    """
    if not title:
        return ""
    return re.sub(r"[^a-z0-9]+", "", title.lower())


# Helper function to check if one title contains the other
def partial_title_match(title1: str, title2: str) -> bool:
    """Check whether one normalized title is a substring of the other.

    Returns False when either title is empty or normalizes to an empty string
    (for example a title made only of punctuation).
    """
    norm1 = normalize_title(title1)
    norm2 = normalize_title(title2)
    # The empty string is a substring of every string, so a title that normalizes
    # to "" must be rejected here or it would "match" any other title.
    if not norm1 or not norm2:
        return False
    return norm1 in norm2 or norm2 in norm1

### Stage 1: OSHash Exact Match

In [48]:
# Accumulates the matches of every stage; re-running this cell resets it
matches: list[FileMatch] = []

# Filter out null oshashes
stashapp_with_oshash = stashapp_df.filter(pl.col("oshash").is_not_null())
ce_with_oshash = ce_videos_df.filter(pl.col("ce_downloads_hash_oshash").is_not_null())

# Join on oshash
oshash_matches = stashapp_with_oshash.join(
    ce_with_oshash,
    left_on="oshash",
    right_on="ce_downloads_hash_oshash",
    how="inner"
)

print(f"Found {len(oshash_matches)} OSHash matches")

# Convert to FileMatch objects
for row in oshash_matches.iter_rows(named=True):
    matches.append(FileMatch(
        stashapp_scene_id=row["scene_id"],
        stashapp_title=row["title"],
        stashapp_date=row["date"],
        stashapp_file_path=row["file_path"],
        stashapp_oshash=row["oshash"],
        stashapp_phash=row["phash"],
        stashapp_duration=row["duration"],
        ce_release_uuid=row["ce_downloads_release_uuid"],
        ce_release_name=row["ce_downloads_release_name"],
        ce_release_date=row["ce_downloads_release_date"],
        ce_saved_filename=row["ce_downloads_saved_filename"],
        # An inner join with left_on/right_on coalesces the key columns, so the
        # right-hand key may be absent from the result; the values are equal by
        # construction, so fall back to the left-hand key instead of raising KeyError.
        ce_oshash=row.get("ce_downloads_hash_oshash", row["oshash"]),
        ce_phash=row["ce_downloads_hash_phash"],
        match_type="oshash",
        confidence=1.0,
        action="RENAME_DELETE",
        reason="Identical file (same OSHash) - rename Stashapp file to CE naming, delete CE duplicate"
    ))

print(f"Created {len(matches)} OSHash match records")

Found 0 OSHash matches
Created 0 OSHash match records


### Stage 2: Phash + Duration Match

In [49]:
# Get already matched scene IDs and CE release UUIDs
matched_scene_ids = {m.stashapp_scene_id for m in matches}
matched_ce_uuids = {m.ce_release_uuid for m in matches}

# Filter out already matched records
remaining_stashapp = stashapp_df.filter(
    ~pl.col("scene_id").is_in(matched_scene_ids) &
    pl.col("phash").is_not_null() &
    pl.col("duration").is_not_null()
)

remaining_ce = ce_videos_df.filter(
    ~pl.col("ce_downloads_release_uuid").is_in(matched_ce_uuids) &
    pl.col("ce_downloads_hash_phash").is_not_null()
)

print(f"Remaining Stashapp scenes: {len(remaining_stashapp)}")
print(f"Remaining CE downloads: {len(remaining_ce)}")

# Materialize the CE rows once instead of rebuilding them for every Stashapp scene
ce_candidates = list(remaining_ce.iter_rows(named=True))
# CE releases already paired in this stage; keeps the pairing one-to-one so the same
# CE file is never scheduled to be moved twice
claimed_ce_uuids = set()

# Manual phash comparison (Polars doesn't have built-in hamming distance)
phash_matches = []
for stash_row in remaining_stashapp.iter_rows(named=True):
    stash_phash = stash_row["phash"]
    stash_duration = stash_row["duration"]

    if not stash_phash or not stash_duration:
        continue

    # Track the closest candidate rather than stopping at the first one under the threshold
    best_ce_row = None
    best_dist = None
    for ce_row in ce_candidates:
        ce_phash = ce_row["ce_downloads_hash_phash"]

        if not ce_phash or ce_row["ce_downloads_release_uuid"] in claimed_ce_uuids:
            continue

        # Calculate hamming distance
        try:
            hamming_dist = hamming_distance(stash_phash, ce_phash)
        except (TypeError, ValueError):
            # Malformed (non-hex) hash: skip this candidate only; any other
            # exception is a real bug and is allowed to surface
            continue

        if hamming_dist <= PHASH_HAMMING_THRESHOLD and (best_dist is None or hamming_dist < best_dist):
            # Check duration difference (CE doesn't store duration, so skip this check)
            # Note: CE database doesn't have duration in downloads table
            best_ce_row, best_dist = ce_row, hamming_dist
            if best_dist == 0:
                break  # Cannot get closer than an identical phash

    if best_ce_row is not None:
        claimed_ce_uuids.add(best_ce_row["ce_downloads_release_uuid"])
        phash_matches.append((stash_row, best_ce_row, best_dist))

print(f"Found {len(phash_matches)} Phash matches")

# Convert to FileMatch objects
for stash_row, ce_row, hamming_dist in phash_matches:
    matches.append(FileMatch(
        stashapp_scene_id=stash_row["scene_id"],
        stashapp_title=stash_row["title"],
        stashapp_date=stash_row["date"],
        stashapp_file_path=stash_row["file_path"],
        stashapp_oshash=stash_row["oshash"],
        stashapp_phash=stash_row["phash"],
        stashapp_duration=stash_row["duration"],
        ce_release_uuid=ce_row["ce_downloads_release_uuid"],
        ce_release_name=ce_row["ce_downloads_release_name"],
        ce_release_date=ce_row["ce_downloads_release_date"],
        ce_saved_filename=ce_row["ce_downloads_saved_filename"],
        ce_oshash=ce_row["ce_downloads_hash_oshash"],
        ce_phash=ce_row["ce_downloads_hash_phash"],
        match_type="phash_duration",
        confidence=0.95,
        action="MOVE_NEXT_TO",
        reason=f"Similar content (Phash hamming distance: {hamming_dist}) - likely different quality/encoding"
    ))

print(f"Total matches so far: {len(matches)}")

Remaining Stashapp scenes: 935
Remaining CE downloads: 0
Found 0 Phash matches
Total matches so far: 0


### Stage 3: Date + Fuzzy Title Match

In [None]:
# Update matched IDs
matched_scene_ids = {m.stashapp_scene_id for m in matches}
matched_ce_uuids = {m.ce_release_uuid for m in matches}

# Filter out already matched records
remaining_stashapp = stashapp_df.filter(
    ~pl.col("scene_id").is_in(matched_scene_ids) &
    pl.col("date").is_not_null() &
    pl.col("title").is_not_null()
)

remaining_ce = ce_videos_df.filter(
    ~pl.col("ce_downloads_release_uuid").is_in(matched_ce_uuids) &
    pl.col("ce_downloads_release_date").is_not_null() &
    pl.col("ce_downloads_release_name").is_not_null()
)

print(f"Remaining Stashapp scenes: {len(remaining_stashapp)}")
print(f"Remaining CE downloads: {len(remaining_ce)}")

# Index the CE rows by release date once: only same-date rows can match, so each
# Stashapp scene then inspects just its own date bucket instead of every CE row
ce_rows_by_date = {}
for ce_row in remaining_ce.iter_rows(named=True):
    if ce_row["ce_downloads_release_date"] and ce_row["ce_downloads_release_name"]:
        ce_rows_by_date.setdefault(ce_row["ce_downloads_release_date"], []).append(ce_row)

# CE releases already paired in this stage; keeps the pairing one-to-one so the same
# CE file is never scheduled to be moved twice
claimed_ce_uuids = set()

# Manual fuzzy title matching
fuzzy_matches = []
for stash_row in remaining_stashapp.iter_rows(named=True):
    stash_date = stash_row["date"]
    stash_title = stash_row["title"]

    if not stash_date or not stash_title:
        continue

    # Keep the most similar same-date candidate rather than the first one over the threshold
    best_ce_row = None
    best_similarity = 0.0
    for ce_row in ce_rows_by_date.get(stash_date, []):
        if ce_row["ce_downloads_release_uuid"] in claimed_ce_uuids:
            continue

        # Calculate title similarity
        similarity = calculate_similarity(stash_title, ce_row["ce_downloads_release_name"])

        if similarity >= TITLE_SIMILARITY_THRESHOLD and (best_ce_row is None or similarity > best_similarity):
            best_ce_row, best_similarity = ce_row, similarity

    if best_ce_row is not None:
        claimed_ce_uuids.add(best_ce_row["ce_downloads_release_uuid"])
        fuzzy_matches.append((stash_row, best_ce_row, best_similarity))

print(f"Found {len(fuzzy_matches)} fuzzy title matches")

# Convert to FileMatch objects
for stash_row, ce_row, similarity in fuzzy_matches:
    matches.append(FileMatch(
        stashapp_scene_id=stash_row["scene_id"],
        stashapp_title=stash_row["title"],
        stashapp_date=stash_row["date"],
        stashapp_file_path=stash_row["file_path"],
        stashapp_oshash=stash_row["oshash"],
        stashapp_phash=stash_row["phash"],
        stashapp_duration=stash_row["duration"],
        ce_release_uuid=ce_row["ce_downloads_release_uuid"],
        ce_release_name=ce_row["ce_downloads_release_name"],
        ce_release_date=ce_row["ce_downloads_release_date"],
        ce_saved_filename=ce_row["ce_downloads_saved_filename"],
        ce_oshash=ce_row["ce_downloads_hash_oshash"],
        ce_phash=ce_row["ce_downloads_hash_phash"],
        match_type="date_title_fuzzy",
        confidence=0.80,
        action="MOVE_NEXT_TO",
        reason=f"Same date and similar title (similarity: {similarity:.2f})"
    ))

print(f"Total matches so far: {len(matches)}")

### Stage 4: Date + Partial Title Match

In [None]:
# Update matched IDs
matched_scene_ids = {m.stashapp_scene_id for m in matches}
matched_ce_uuids = {m.ce_release_uuid for m in matches}

# Filter out already matched records
remaining_stashapp = stashapp_df.filter(
    ~pl.col("scene_id").is_in(matched_scene_ids) &
    pl.col("date").is_not_null() &
    pl.col("title").is_not_null()
)

remaining_ce = ce_videos_df.filter(
    ~pl.col("ce_downloads_release_uuid").is_in(matched_ce_uuids) &
    pl.col("ce_downloads_release_date").is_not_null() &
    pl.col("ce_downloads_release_name").is_not_null()
)

print(f"Remaining Stashapp scenes: {len(remaining_stashapp)}")
print(f"Remaining CE downloads: {len(remaining_ce)}")

# Index the CE rows by release date once: only same-date rows can match, so each
# Stashapp scene then inspects just its own date bucket instead of every CE row
ce_rows_by_date = {}
for ce_row in remaining_ce.iter_rows(named=True):
    if ce_row["ce_downloads_release_date"] and ce_row["ce_downloads_release_name"]:
        ce_rows_by_date.setdefault(ce_row["ce_downloads_release_date"], []).append(ce_row)

# CE releases already paired in this stage; keeps the pairing one-to-one so a single
# CE download is not flagged against several Stashapp scenes
claimed_ce_uuids = set()

# Manual partial title matching
partial_matches = []
for stash_row in remaining_stashapp.iter_rows(named=True):
    stash_date = stash_row["date"]
    stash_title = stash_row["title"]

    if not stash_date or not stash_title:
        continue

    for ce_row in ce_rows_by_date.get(stash_date, []):
        if ce_row["ce_downloads_release_uuid"] in claimed_ce_uuids:
            continue

        # Check partial title match
        if partial_title_match(stash_title, ce_row["ce_downloads_release_name"]):
            claimed_ce_uuids.add(ce_row["ce_downloads_release_uuid"])
            partial_matches.append((stash_row, ce_row))
            break  # Match found, move to next Stashapp scene

print(f"Found {len(partial_matches)} partial title matches")

# Convert to FileMatch objects
for stash_row, ce_row in partial_matches:
    matches.append(FileMatch(
        stashapp_scene_id=stash_row["scene_id"],
        stashapp_title=stash_row["title"],
        stashapp_date=stash_row["date"],
        stashapp_file_path=stash_row["file_path"],
        stashapp_oshash=stash_row["oshash"],
        stashapp_phash=stash_row["phash"],
        stashapp_duration=stash_row["duration"],
        ce_release_uuid=ce_row["ce_downloads_release_uuid"],
        ce_release_name=ce_row["ce_downloads_release_name"],
        ce_release_date=ce_row["ce_downloads_release_date"],
        ce_saved_filename=ce_row["ce_downloads_saved_filename"],
        ce_oshash=ce_row["ce_downloads_hash_oshash"],
        ce_phash=ce_row["ce_downloads_hash_phash"],
        match_type="date_title_partial",
        confidence=0.70,
        action="FLAG_FOR_REVIEW",
        reason="Same date and partial title match - requires manual verification"
    ))

print(f"Total matches: {len(matches)}")

### Summary of Matches

In [None]:
# Tally the matches per match type, per action, and per confidence band
matches_by_type = {}
matches_by_action = {}
matches_by_confidence = {"high": 0, "medium": 0, "low": 0}

for match in matches:
    matches_by_type.setdefault(match.match_type, 0)
    matches_by_type[match.match_type] += 1
    matches_by_action.setdefault(match.action, 0)
    matches_by_action[match.action] += 1

    # Bands: high >= 0.95, medium >= 0.75, low otherwise
    level = "high" if match.confidence >= 0.95 else "medium" if match.confidence >= 0.75 else "low"
    matches_by_confidence[level] += 1

print("\n=== MATCHING SUMMARY ===")
print(f"\nTotal matches: {len(matches)}")

# Each section prints its heading followed by one indented "label: count" line per entry
summary_sections = (
    ("\nBy match type:", matches_by_type),
    ("\nBy action:", matches_by_action),
    ("\nBy confidence:", matches_by_confidence),
)
for heading, counts in summary_sections:
    print(heading)
    for label, count in counts.items():
        print(f"  {label}: {count}")

In [None]:
# Write every match, converted to a plain dict, to a JSON file
output_file = OUTPUT_DIR.joinpath("xart_matches.json")
with open(output_file, "w") as matches_handle:
    json.dump(list(map(FileMatch.to_dict, matches)), matches_handle, indent=2)
print(f"\nSaved matches to: {output_file}")

## Phase 4: Review and Approve Matches

Display matches grouped by confidence for user review.

In [None]:
def display_match(match: FileMatch, index: int):
    """Print a review summary of one match.

    ``index`` is the zero-based position of the match; it is shown one-based.
    """
    report_lines = [
        f"\n--- Match #{index + 1} ---",
        f"Match Type: {match.match_type}",
        f"Confidence: {match.confidence:.2f}",
        f"Action: {match.action}",
        f"Reason: {match.reason}",
        "\nStashapp Scene:",
        f"  ID: {match.stashapp_scene_id}",
        f"  Title: {match.stashapp_title}",
        f"  Date: {match.stashapp_date}",
        f"  File: {match.stashapp_file_path}",
        f"  OSHash: {match.stashapp_oshash}",
        "\nCE Download:",
        f"  UUID: {match.ce_release_uuid}",
        f"  Title: {match.ce_release_name}",
        f"  Date: {match.ce_release_date}",
        f"  File: {match.ce_saved_filename}",
        f"  OSHash: {match.ce_oshash}",
    ]
    # A single print of the joined lines emits the same text as one print per line
    print("\n".join(report_lines))

In [None]:
# Collect the matches with confidence >= 0.95 and preview at most five of them
high_confidence = list(filter(lambda m: m.confidence >= 0.95, matches))
print(f"\n=== HIGH CONFIDENCE MATCHES ({len(high_confidence)}) ===")
print("These matches can typically be approved in bulk.\n")

for position in range(min(5, len(high_confidence))):
    display_match(high_confidence[position], position)

# Mention how many matches were left out of the preview
hidden_count = len(high_confidence) - 5
if hidden_count > 0:
    print(f"\n... and {hidden_count} more high-confidence matches")

In [None]:
# Collect the matches with 0.75 <= confidence < 0.95 and preview at most five of them
medium_confidence = list(filter(lambda m: 0.75 <= m.confidence < 0.95, matches))
print(f"\n=== MEDIUM CONFIDENCE MATCHES ({len(medium_confidence)}) ===")
print("These matches should be reviewed individually.\n")

for position in range(min(5, len(medium_confidence))):
    display_match(medium_confidence[position], position)

# Mention how many matches were left out of the preview
hidden_count = len(medium_confidence) - 5
if hidden_count > 0:
    print(f"\n... and {hidden_count} more medium-confidence matches")

In [None]:
# Collect the matches with confidence < 0.75 and preview at most five of them
low_confidence = list(filter(lambda m: m.confidence < 0.75, matches))
print(f"\n=== LOW CONFIDENCE MATCHES ({len(low_confidence)}) ===")
print("These matches require careful manual verification.\n")

for position in range(min(5, len(low_confidence))):
    display_match(low_confidence[position], position)

# Mention how many matches were left out of the preview
hidden_count = len(low_confidence) - 5
if hidden_count > 0:
    print(f"\n... and {hidden_count} more low-confidence matches")

### Approval Interface

Manually set `approved=True` for matches you want to process.

In [None]:
# To approve every high-confidence match, uncomment:
# for match in high_confidence:
#     match.approved = True

# To approve individual matches by their position in `matches`, uncomment:
# matches[0].approved = True
# matches[5].approved = True

# Only matches whose `approved` flag is set are passed on to Phase 5
approved_matches = list(filter(lambda candidate: candidate.approved, matches))
print(f"\nApproved matches: {len(approved_matches)} / {len(matches)}")

## Phase 5: Execute Approved File Operations

**WARNING**: With `DRY_RUN = False`, this phase modifies files on disk. Leave `DRY_RUN = True` (the default) to only preview the planned operations.

In [None]:
from pathlib import Path


def execute_rename_delete(match: FileMatch, dry_run: bool = True) -> dict:
    """Execute RENAME_DELETE action for exact duplicates.

    Steps: rename the Stashapp file to the CE basename (same directory), ask Stashapp
    to scan that directory, then delete the CE video file.

    Args:
        match: The approved match to act on.
        dry_run: When True, only record what would be done; nothing is renamed,
            scanned or deleted.

    Returns:
        A dict with ``success``, ``action``, ``match`` (the serialized match),
        ``operations`` (messages for steps performed or planned) and ``errors``.
        Exceptions are caught and reported in ``errors`` instead of propagating;
        ``operations`` then still lists the steps completed before the failure.
    """
    result = {
        "success": False,
        "action": "RENAME_DELETE",
        "match": match.to_dict(),
        "operations": [],
        "errors": []
    }

    try:
        # NOTE(review): the exported Stashapp paths look like Windows paths ("Z:\...").
        # On a POSIX host pathlib does not split on backslashes, so .parent/.name
        # would be wrong there — confirm the platform or path mapping before a real run.
        stash_path = Path(match.stashapp_file_path)
        ce_path = Path(match.ce_saved_filename)

        # Get CE basename for new Stashapp filename
        stash_dir = str(stash_path.parent)
        new_stash_path = stash_path.parent / ce_path.name

        # Refuse to clobber an unrelated file: Path.rename() silently replaces an
        # existing target on POSIX. Checked in dry-run too so the preview is truthful.
        if new_stash_path != stash_path and new_stash_path.exists():
            raise FileExistsError(f"Rename target already exists: {new_stash_path}")

        # Operation 1: Rename Stashapp file
        if dry_run:
            result["operations"].append(f"[DRY RUN] Rename: {match.stashapp_file_path} -> {new_stash_path}")
        else:
            stash_path.rename(new_stash_path)
            result["operations"].append(f"Renamed: {match.stashapp_file_path} -> {new_stash_path}")

        # Operation 2: Trigger Stashapp scan
        if dry_run:
            result["operations"].append(f"[DRY RUN] Scan directory: {stash_dir}")
        else:
            stash.metadata_scan(paths=[stash_dir])
            result["operations"].append(f"Scanned directory: {stash_dir}")

        # Operation 3: Delete CE video file (keep images)
        if dry_run:
            result["operations"].append(f"[DRY RUN] Delete CE video: {match.ce_saved_filename}")
        elif ce_path.exists():
            ce_path.unlink()
            result["operations"].append(f"Deleted CE video: {match.ce_saved_filename}")
        else:
            # Record the no-op so the log shows that nothing was deleted
            result["operations"].append(f"CE video not found, nothing deleted: {match.ce_saved_filename}")

        result["success"] = True

    except Exception as e:
        result["errors"].append(str(e))

    return result

In [None]:
from pathlib import Path


def execute_move_next_to(match: FileMatch, dry_run: bool = True) -> dict:
    """Execute MOVE_NEXT_TO action for similar files.

    Steps: move the CE video into the directory of the Stashapp file, move every
    ``*.jpg`` found in the CE video's directory to the same place, then ask Stashapp
    to scan the target directory.

    Args:
        match: The approved match to act on.
        dry_run: When True, only record what would be done; nothing is moved or scanned.

    Returns:
        A dict with ``success``, ``action``, ``match`` (the serialized match),
        ``operations`` (messages for steps performed or planned) and ``errors``.
        Exceptions are caught and reported in ``errors`` instead of propagating;
        ``operations`` then still lists the steps completed before the failure.
    """
    result = {
        "success": False,
        "action": "MOVE_NEXT_TO",
        "match": match.to_dict(),
        "operations": [],
        "errors": []
    }

    try:
        # NOTE(review): the exported Stashapp paths look like Windows paths ("Z:\...").
        # On a POSIX host pathlib does not split on backslashes, so .parent would be
        # wrong there — confirm the platform or path mapping before a real run.
        ce_video = Path(match.ce_saved_filename)

        # Get target directory from Stashapp file
        target_dir = str(Path(match.stashapp_file_path).parent)
        new_ce_path = Path(target_dir) / ce_video.name

        # Refuse to clobber a different file that already has the target name:
        # shutil.move() can silently replace an existing file.
        if new_ce_path != ce_video and new_ce_path.exists():
            raise FileExistsError(f"Move target already exists: {new_ce_path}")

        # Operation 1: Move CE video file
        if dry_run:
            result["operations"].append(f"[DRY RUN] Move: {match.ce_saved_filename} -> {new_ce_path}")
        else:
            shutil.move(match.ce_saved_filename, str(new_ce_path))
            result["operations"].append(f"Moved: {match.ce_saved_filename} -> {new_ce_path}")

        # Operation 2: Move preview images from same release directory
        # NOTE(review): this takes every *.jpg in the CE video's directory; if
        # ce_saved_filename carries no directory part, that is the current working
        # directory — verify the CE layout.
        # sorted() snapshots the listing so moving files out of the directory cannot
        # disturb the iteration, and gives a deterministic order.
        for img_file in sorted(ce_video.parent.glob("*.jpg")):
            img_target = Path(target_dir) / img_file.name
            if img_target != img_file and img_target.exists():
                # Keep the existing image rather than overwriting it
                result["operations"].append(f"Skipped image (target exists): {img_file} -> {img_target}")
                continue
            if dry_run:
                result["operations"].append(f"[DRY RUN] Move image: {img_file} -> {img_target}")
            else:
                shutil.move(str(img_file), str(img_target))
                result["operations"].append(f"Moved image: {img_file} -> {img_target}")

        # Operation 3: Trigger Stashapp scan
        if dry_run:
            result["operations"].append(f"[DRY RUN] Scan directory: {target_dir}")
        else:
            stash.metadata_scan(paths=[target_dir])
            result["operations"].append(f"Scanned directory: {target_dir}")

        result["success"] = True

    except Exception as e:
        result["errors"].append(str(e))

    return result

In [None]:
# Execute approved operations
execution_results = []

# Action name -> function that carries it out; any other action (including
# FLAG_FOR_REVIEW) has no handler and is reported as an error below
action_handlers = {
    "RENAME_DELETE": execute_rename_delete,
    "MOVE_NEXT_TO": execute_move_next_to,
}

print(f"\n=== EXECUTING APPROVED OPERATIONS (DRY_RUN={DRY_RUN}) ===")
print(f"Processing {len(approved_matches)} approved matches...\n")

for i, match in enumerate(approved_matches):
    print(f"\nProcessing match #{i + 1}/{len(approved_matches)}...")
    print(f"  Stashapp: {match.stashapp_title}")
    print(f"  CE: {match.ce_release_name}")
    print(f"  Action: {match.action}")

    handler = action_handlers.get(match.action)
    if handler is not None:
        result = handler(match, dry_run=DRY_RUN)
    else:
        result = {
            "success": False,
            "action": match.action,
            "match": match.to_dict(),
            "operations": [],
            "errors": [f"Unknown action: {match.action}"]
        }

    execution_results.append(result)

    print("  ✓ Success" if result["success"] else "  ✗ Failed")
    # Always list the operations: on a failure they show which steps had already
    # taken effect (e.g. a rename completed before a later step failed)
    for op in result["operations"]:
        print(f"    - {op}")
    for error in result["errors"]:
        print(f"    - ERROR: {error}")

print("\n=== EXECUTION COMPLETE ===")

## Phase 6: Verify and Report Results

In [None]:
# Count the executed operations overall, by outcome, and by action type
total_operations = len(execution_results)
successful_operations = len([r for r in execution_results if r["success"]])
failed_operations = total_operations - successful_operations

executed_actions = [r["action"] for r in execution_results]
rename_delete_count = executed_actions.count("RENAME_DELETE")
move_next_to_count = executed_actions.count("MOVE_NEXT_TO")

print("\n=== EXECUTION SUMMARY ===")
print(f"\nTotal operations: {total_operations}")
print(f"Successful: {successful_operations}")
print(f"Failed: {failed_operations}")
print("\nBy action type:")
print(f"  RENAME_DELETE: {rename_delete_count}")
print(f"  MOVE_NEXT_TO: {move_next_to_count}")

# Remind the reader whether anything on disk was actually touched
if not DRY_RUN:
    print("\n✓ File operations executed")
else:
    print("\n⚠️  DRY RUN MODE - No files were actually modified")
    print("Set DRY_RUN = False to execute actual file operations")

In [None]:
# Write the run mode, the outcome counters and the per-match results to a JSON log
log_file = OUTPUT_DIR.joinpath("execution_log.json")
with open(log_file, "w") as log_handle:
    execution_log = dict(
        dry_run=DRY_RUN,
        total_operations=total_operations,
        successful=successful_operations,
        failed=failed_operations,
        results=execution_results,
    )
    json.dump(execution_log, log_handle, indent=2)

print(f"\nExecution log saved to: {log_file}")

In [None]:
# List every operation that did not succeed, together with its recorded errors
failed = [r for r in execution_results if not r["success"]]
if not failed:
    print("\n✓ No failed operations")
else:
    print("\n=== FAILED OPERATIONS ===")
    for number, failure in enumerate(failed, start=1):
        print(f"\n#{number}:")
        print(f"  Stashapp: {failure['match']['stashapp_title']}")
        print(f"  CE: {failure['match']['ce_release_name']}")
        print("  Errors:")
        for error in failure["errors"]:
            print(f"    - {error}")