In [None]:
import json
import sys
import os
from pathlib import Path

import polars as pl


sys.path.append(str(Path.cwd().parent))

from libraries.client_stashapp import StashAppClient, get_stashapp_client
from libraries.FaceDatasetBuilder import FaceDatasetBuilder
from libraries.scene_states import DatasetStructure, SceneState


def get_stashdb_performer(performer):
    """Return the StashDB identity of a performer record, if it has one.

    Looks through ``performer["stashapp_performers_stash_ids"]`` for the first
    entry whose ``endpoint`` is the StashDB GraphQL URL.

    Returns:
        ``{"stash_id": ..., "name": ...}`` built from that entry and the
        performer's ``stashapp_performers_name``, or ``None`` when no entry
        points at StashDB.
    """
    stashdb_endpoint = "https://stashdb.org/graphql"
    match = next(
        (
            entry
            for entry in performer["stashapp_performers_stash_ids"]
            if entry["endpoint"] == stashdb_endpoint
        ),
        None,
    )
    if match is None:
        return None
    return {
        "stash_id": match["stash_id"],
        "name": performer["stashapp_performers_name"],
    }

# Process a scene
def process_scene(scene):
    """Hand one scene to the module-level ``builder``.

    Each performer in ``scene["stashapp_performers"]`` that resolves to a
    StashDB identity is labelled ``"<stash_id> - <name>"``. If at least one
    label exists, the scene's primary file path, StashDB id and labels are
    passed to ``builder.process_scene`` and its result is returned.

    Returns:
        Whatever ``builder.process_scene`` returns, or ``0`` when no performer
        of the scene has a StashDB identity.
    """
    resolved = (get_stashdb_performer(entry) for entry in scene["stashapp_performers"])
    performers = [
        f"{info['stash_id']} - {info['name']}"
        for info in resolved
        if info
    ]

    # Nothing to label the faces with, so the scene is skipped.
    if not performers:
        return 0

    return builder.process_scene(
        video_path=scene["stashapp_primary_file_path"],
        scene_id=scene["stashapp_stashdb_id"],
        performers=performers
    )

# Initialize builder (process_scene() above reads this module-level name)
builder = FaceDatasetBuilder()

# Two Stash handles are kept side by side:
#   stash        - used below for find_tag() and find_performers()
#   stash_client - used below for get_performers() and find_scenes(), whose
#                  results are filtered with polars expressions
stash = get_stashapp_client()
stash_client = StashAppClient()

In [50]:
# Load every performer once; the frames below are all derived from it.
all_performers = stash_client.get_performers()
# Single-performer lookup by name substring.
# NOTE(review): [0] raises IndexError when nothing matches, and `performer` is
# rebound by a loop variable in the batching cell further down — confirm it is still needed.
performer = all_performers.filter(pl.col("stashapp_name").str.contains("Tori Black")).to_dicts()[0]
# Keep only the rows flagged as favorites.
favorite_performers = all_performers.filter(pl.col("stashapp_favorite") == True)
# NOTE(review): a bare expression that is not the last statement of a cell is
# normally not displayed — move it to its own cell if the preview is wanted.
favorite_performers

# Plain Python list of ids, used as the performer filter for the scene query.
favorite_performer_ids = favorite_performers.select(pl.col("stashapp_id")).to_series().to_list()


In [66]:
# Id of the "Face Hidden" tag, used as an exclusion in the performer query below.
# NOTE(review): the subscript fails if find_tag() returns nothing for this name — confirm the tag exists.
face_hidden_tag = stash.find_tag("Face Hidden")["id"]

In [None]:
# Performer query: gender value list MALE, with the "Face Hidden" tag id in the
# tag excludes, sorted by scenes_count descending, 200 per page, requesting
# the "id name" fragment. The result is not assigned; as the cell's last
# expression it is what the cell displays.
stash.find_performers({
    "gender": {"value_list": ["MALE"], "modifier": "INCLUDES" },
    "tags": {"value": [], "excludes": [face_hidden_tag], "modifier": "INCLUDES" }
}, { "sort": "scenes_count", "direction": "DESC", "per_page": 200 }, fragment="id name")

In [68]:
# Id of the "Virtual Reality" tag, used as an exclusion in the scene query.
# NOTE(review): the subscript fails if find_tag() returns nothing for this name.
virtual_reality_tag = stash.find_tag("Virtual Reality")["id"]

# Candidate scenes: filtered on the favorite performer ids (INCLUDES modifier)
# with the Virtual Reality tag id in the tag excludes. Later cells treat the
# result as a polars frame (.filter / len / .to_dicts).
sample_scenes = stash_client.find_scenes({
    "performers": {"value": favorite_performer_ids, "excludes": [], "modifier": "INCLUDES" },
    "tags": {"value": [], "excludes": [virtual_reality_tag], "modifier": "INCLUDES" }
})

In [None]:
# Find the processed scene IDs from file system
# NOTE(review): this dataset root is repeated as a literal in several cells;
# a single config constant would keep them in sync.
dataset = DatasetStructure("H:\\Faces\\dataset")

# Get processed scenes from dataset info
# A scene counts as processed once its recorded state is FACES_EXTRACTED or VERIFIED.
processed_scenes = {
    scene_id for scene_id, state in dataset.info["processed_scenes"].items()
    if state in [SceneState.FACES_EXTRACTED.value, SceneState.VERIFIED.value]
}

# Filter out processed scenes from sample_scenes
# Matching is done on the scene's StashDB id column.
unprocessed_scenes = sample_scenes.filter(
    ~pl.col("stashapp_stashdb_id").is_in(processed_scenes)
)

# "Already processed" is the size of the whole processed set in the dataset,
# not only the part overlapping sample_scenes, so the three numbers need not add up.
print(f"Total scenes: {len(sample_scenes)}")
print(f"Already processed: {len(processed_scenes)}")
print(f"Remaining to process: {len(unprocessed_scenes)}")

In [None]:
# Count images per performer and create a DataFrame
from pathlib import Path

import polars as pl


def get_performer_stats(dataset_dir="H:\\Faces\\dataset"):
    """Count deduplicated face images per performer.

    Scans every sub-directory of ``<dataset_dir>/performers/deduplicated``.
    A directory named ``"<id> - <name>"`` is split into id and name; any other
    directory name is used as the id with the name ``"Unknown"``. Directories
    without ``*.jpg`` files are left out.

    Args:
        dataset_dir: Root of the face dataset. Defaults to the location the
            notebook uses elsewhere.

    Returns:
        A polars DataFrame with columns ``performer_id``, ``name`` and
        ``image_count``, sorted by ``image_count`` descending. When the
        directory is missing or no performer has images the frame is empty
        but still carries those three columns.
    """
    performer_dir = Path(dataset_dir) / "performers" / "deduplicated"

    # Collect performer statistics
    stats = []
    if performer_dir.exists():
        for perf_dir in performer_dir.iterdir():
            if not perf_dir.is_dir():
                continue

            # Split performer ID and name on the first " - " only, so names
            # that themselves contain " - " stay intact.
            performer_id, separator, performer_name = perf_dir.name.partition(" - ")
            if not separator:
                performer_name = "Unknown"

            # Count images without materialising the full list
            image_count = sum(1 for _ in perf_dir.glob("*.jpg"))

            if image_count > 0:  # Only include performers with images
                stats.append({
                    "performer_id": performer_id,
                    "name": performer_name,
                    "image_count": image_count
                })

    # An explicit schema keeps the columns present even when `stats` is empty;
    # without it the sort below fails because "image_count" does not exist.
    schema = {
        "performer_id": pl.Utf8,
        "name": pl.Utf8,
        "image_count": pl.Int64,
    }
    return pl.DataFrame(stats, schema=schema).sort("image_count", descending=True)

# Build the per-performer image-count table; the trailing bare name makes it
# the cell's displayed output.
performer_stats = get_performer_stats()
performer_stats

In [55]:
# NOTE(review): no filter or sort argument is passed here, unlike the earlier
# find_performers() call that sorts by scenes_count — confirm the result
# ordering actually matches this variable's name.
performers_by_scene_count = stash.find_performers()

In [None]:
# Get a balanced batch of scenes to process
MAX_SCENES = 512  # Total scenes to process
# NOTE(review): MAX_PER_DRIVE is not referenced anywhere in the visible cells.
MAX_PER_DRIVE = 1  # Maximum concurrent scenes per HDD
STORAGE_DRIVES = ["X:", "Y:", "Z:", "W:"]  # Available drives for processing

# First, group all unprocessed scenes by drive
drive_scenes = {drive: [] for drive in STORAGE_DRIVES}  # Initialize all drives

# Group scenes by drive
# Note: the loop variables below (scene, performer, performers, ...) are kernel
# globals; `performer` in particular replaces the value assigned in an earlier cell.
for scene in unprocessed_scenes.to_dicts():
    # Same "<stash_id> - <name>" labelling as process_scene()
    performers = []
    for performer in scene["stashapp_performers"]:
        stashdb_info = get_stashdb_performer(performer)
        if stashdb_info:
            performers.append(f"{stashdb_info['stash_id']} - {stashdb_info['name']}")

    # Scenes without any StashDB-identified performer are dropped here.
    if performers:
        video_path = scene["stashapp_primary_file_path"]
        # Drive letter of the file, upper-cased to match STORAGE_DRIVES
        drive = os.path.splitdrive(video_path)[0].upper()

        if drive in STORAGE_DRIVES:  # Only process from our storage drives
            # The derived fields are stored on the scene dict for the later cells
            scene["video_path"] = video_path
            scene["performers"] = performers
            drive_scenes[drive].append(scene)

# Round-robin selection: take one scene per drive in turn so the batch stays
# spread across the drives. A drive that has run out is skipped without using
# up one of the MAX_SCENES slots, so the batch only falls short of MAX_SCENES
# when every drive is exhausted.
scenes_to_process = []
while len(scenes_to_process) < MAX_SCENES:
    picked_this_round = False
    for drive in STORAGE_DRIVES:
        if len(scenes_to_process) >= MAX_SCENES:
            break
        if drive_scenes[drive]:  # If this drive has any scenes left
            scenes_to_process.append(drive_scenes[drive].pop(0))
            picked_this_round = True
    if not picked_this_round:
        break  # No drive had anything left to give

# Print scene distribution
print("Scenes per drive:")
drive_counts = {}
for scene in scenes_to_process:
    drive = os.path.splitdrive(scene["video_path"])[0].upper()
    drive_counts[drive] = drive_counts.get(drive, 0) + 1

# Report every configured drive, including those that contributed nothing
for drive in STORAGE_DRIVES:
    print(f"{drive}: {drive_counts.get(drive, 0)} scenes")

# Create JSON files in pending directory
# One file per selected scene, named after the scene's StashDB id and holding
# the video path plus the "<stash_id> - <name>" performer labels.
pending_dir = Path("H:\\Faces\\dataset") / "scenes" / SceneState.PENDING.value
pending_dir.mkdir(parents=True, exist_ok=True)

for scene in scenes_to_process:
    # NOTE(review): if stashapp_stashdb_id can be missing/None, all such scenes
    # would be written to the same "None.json" — confirm an id is always present.
    scene_id = scene["stashapp_stashdb_id"]
    scene_data = {
        "video_path": scene["video_path"],
        "performers": scene["performers"]
    }

    json_path = pending_dir / f"{scene_id}.json"
    # Mode "w": an existing file for the same scene id is overwritten.
    with open(json_path, "w") as f:
        json.dump(scene_data, f, indent=2)

print(f"\nCreated {len(scenes_to_process)} JSON files in {pending_dir}")

# Diversifying selected images

!pip install imagehash scikit-learn


In [None]:
import shutil

import imagehash
import numpy as np
from PIL import Image
from sklearn.cluster import KMeans


def deduplicate_faces(face_dir, output_dir, max_faces=100, random_state=0):
    """
    Pick up to ``max_faces`` representative ``.jpg`` images from ``face_dir``.

    Every image is described by the bits of its average hash followed by the
    bits of its perceptual hash. The bit vectors are clustered with KMeans and,
    for each non-empty cluster, the image closest to the cluster center is
    kept. Nothing is copied or moved here; the caller decides what to do with
    the returned paths.

    Args:
        face_dir: Directory containing the face images.
        output_dir: Not used by this function; kept so existing calls keep working.
        max_faces: Upper bound on the number of images returned.
        random_state: Seed passed to KMeans so repeated runs over the same
            files select the same images.

    Returns:
        List of image paths (as strings), one per non-empty cluster. Empty when
        ``max_faces`` is below 1 or no image could be hashed.
    """
    if max_faces < 1:
        return []

    # Calculate perceptual hashes for all images
    hashes = []
    paths = []
    # Sorted so a fixed seed always sees the images in the same order.
    for img_path in sorted(Path(face_dir).iterdir()):
        # Case-insensitive so ".JPG" files are not silently skipped.
        if img_path.suffix.lower() != ".jpg":
            continue

        full_path = str(img_path)
        try:
            # The context manager closes the file handle once both hashes are computed.
            with Image.open(full_path) as img:
                # Use both average and perceptual hash for better similarity detection
                avg_hash = imagehash.average_hash(img)
                phash = imagehash.phash(img)
        except Exception as e:
            # Best effort: an unreadable image is reported and skipped.
            print(f"Error processing {img_path.name}: {e}")
            continue

        # Each hash exposes its bits as a boolean array; flattened and joined
        # they give one 0/1 feature per bit.
        hash_array = np.concatenate((avg_hash.hash.ravel(), phash.hash.ravel())).astype(int)
        hashes.append(hash_array)
        paths.append(full_path)

    if not hashes:
        return []

    # Convert list of hash arrays to 2D numpy array
    hash_vectors = np.array(hashes)

    # Cluster similar images; never ask for more clusters than there are images
    n_clusters = min(max_faces, len(paths))
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
    clusters = kmeans.fit_predict(hash_vectors)

    # Keep images closest to cluster centers
    kept_images = []
    for i in range(n_clusters):
        cluster_indices = np.where(clusters == i)[0]
        # A cluster can end up empty, e.g. when several images share a hash.
        if len(cluster_indices) > 0:
            distances = np.linalg.norm(
                hash_vectors[cluster_indices] - kmeans.cluster_centers_[i],
                axis=1
            )
            closest_idx = cluster_indices[np.argmin(distances)]
            kept_images.append(paths[closest_idx])

    return kept_images

# Resolve the input and output locations under the dataset root
base_dir = Path("H:\\Faces\\dataset")
source_dir = base_dir / "performers" / "verified"
output_dir = base_dir / "performers" / "deduplicated"

# Make sure the destination exists before anything is copied
output_dir.mkdir(parents=True, exist_ok=True)

print("\nDeduplication Summary:")
print("-" * 50)

if source_dir.exists():
    for perf_dir in source_dir.iterdir():
        # Only performer folders are of interest
        if not perf_dir.is_dir():
            continue

        print(f"\nProcessing {perf_dir.name}")

        # Mirror the performer folder under the output directory
        perf_output_dir = output_dir / perf_dir.name
        perf_output_dir.mkdir(exist_ok=True)

        # Nothing to select from when the folder holds no images
        original_count = len(list(perf_dir.glob("*.jpg")))
        if original_count <= 0:
            continue

        # Pick the most representative images, at most 100 per performer
        kept_images = deduplicate_faces(
            perf_dir,
            perf_output_dir,
            max_faces=min(100, original_count),
        )

        # Copy the selection; the originals stay where they are
        for img_path in kept_images:
            shutil.copy2(img_path, perf_output_dir)

        final_count = len(kept_images)
        print(f"Original: {original_count}")
        print(f"Kept: {final_count}")
        if final_count < original_count:
            print(f"Reduction: {((original_count - final_count) / original_count * 100):.1f}%")

# Print warning for performers with few images
print("\nPerformers with fewer than 20 images:")
print("-" * 50)
# Guarded like the deduplication loop: iterdir() raises FileNotFoundError when
# the verified directory does not exist.
if source_dir.exists():
    for perf_dir in source_dir.iterdir():
        if perf_dir.is_dir():
            image_count = len(list(perf_dir.glob("*.jpg")))
            if image_count < 20:
                print(f"{perf_dir.name}: {image_count} images")