In [6]:
import polars as pl
import sys
import os
import json
from pathlib import Path

sys.path.append(os.path.dirname(os.path.abspath('')))

from libraries.scene_states import SceneState  # Changed from FaceDatasetBuilder to scene_states
from libraries.FaceDatasetBuilder import FaceDatasetBuilder
from libraries.client_stashapp import get_stashapp_client, StashAppClient

def get_stashdb_performer(performer, endpoint="https://stashdb.org/graphql"):
    """Extract the stash ID and name of a performer for one endpoint.

    Args:
        performer: Mapping with a ``stashapp_performers_stash_ids`` entry
            (an iterable of mappings holding ``endpoint`` and ``stash_id``,
            or ``None``) and a ``stashapp_performers_name`` entry.
        endpoint: GraphQL endpoint whose stash ID is wanted. Defaults to the
            StashDB endpoint.

    Returns:
        ``{'stash_id': ..., 'name': ...}`` for the first stash ID registered
        under ``endpoint``, or ``None`` when the performer has none.
    """
    # The ID list may be None instead of an empty list; both mean "no IDs".
    for stash_id in performer["stashapp_performers_stash_ids"] or []:
        if stash_id["endpoint"] == endpoint:
            return {
                'stash_id': stash_id["stash_id"],
                'name': performer["stashapp_performers_name"]
            }
    return None

# Process a scene
def process_scene(scene):
    """Run the notebook-level ``builder`` on one scene record.

    Each performer of the scene that has a StashDB entry is turned into a
    ``"<stash_id> - <name>"`` label. When at least one label exists the scene
    is handed to ``builder.process_scene`` and its result is returned;
    otherwise nothing is processed and 0 is returned.
    """
    lookups = [get_stashdb_performer(entry) for entry in scene["stashapp_performers"]]
    labels = [f"{found['stash_id']} - {found['name']}" for found in lookups if found]

    # Nothing to attribute faces to: skip the scene entirely.
    if not labels:
        return 0

    return builder.process_scene(
        video_path=scene["stashapp_primary_file_path"],
        scene_id=scene["stashapp_stashdb_id"],
        performers=labels
    )

# Initialize builder
# Shared FaceDatasetBuilder instance; process_scene() and later cells read it
# as a notebook-level global.
builder = FaceDatasetBuilder()

# Two Stash handles are created: `stash` is used for the tag lookup and
# `stash_client` for the performer and scene queries in the cells below.
stash = get_stashapp_client()
stash_client = StashAppClient()

dUsing stash (v0.27.2-37-g0621d871) endpoint at http://localhost:6969/graphql
dPersisting Connection to Stash with ApiKey...
dUsing stash (v0.27.2-37-g0621d871) endpoint at http://localhost:6969/graphql
dPersisting Connection to Stash with ApiKey...


In [7]:
# Look up the performer whose scenes will be queued.
PERFORMER_NAME = "Alexis Crystal"

all_performers = stash_client.get_performers()

# literal=True matches the name as plain text instead of as a regex.
matching_performers = all_performers.filter(
    pl.col("stashapp_name").str.contains(PERFORMER_NAME, literal=True)
)
if len(matching_performers) == 0:
    raise ValueError(f"No performer matching {PERFORMER_NAME!r} found in Stash")

# Substring matching can return several rows; the first one is used.
performer = matching_performers.to_dicts()[0]

In [8]:
# Resolve the tag that marks VR scenes so they can be excluded from the query.
virtual_reality_tag = stash.find_tag("Virtual Reality")["id"]

# Scenes featuring the selected performer ...
performer_filter = {"value": [performer["stashapp_id"]], "excludes": [], "modifier": "INCLUDES" }
# ... that do not carry the Virtual Reality tag.
tag_filter = {"value": [], "excludes": [virtual_reality_tag], "modifier": "INCLUDES" }

sample_scenes = stash_client.find_scenes({
    "performers": performer_filter,
    "tags": tag_filter
})

In [9]:
# Find the processed scene IDs from file system
# builder.structure['scenes'] is indexed by state value; the same lookup is
# used for both the existence check and directory creation.
faces_extracted_dir = builder.structure['scenes'][SceneState.FACES_EXTRACTED.value]

# Create the directory on first use so the listing below always succeeds;
# a freshly created directory simply yields an empty set.
os.makedirs(faces_extracted_dir, exist_ok=True)
# NOTE(review): entries are compared to stashapp_stashdb_id verbatim, so this
# assumes they are named exactly by scene ID (no file extension) — confirm.
processed_scenes = set(os.listdir(faces_extracted_dir))

# Filter out processed scenes from sample_scenes
# NOTE(review): rows with a null stashapp_stashdb_id are presumably dropped by
# this filter as well (a null comparison is not truthy), which would explain
# "remaining" being lower than "total" with nothing processed — confirm.
unprocessed_scenes = sample_scenes.filter(
    ~pl.col("stashapp_stashdb_id").is_in(processed_scenes)
)

print(f"Total scenes: {len(sample_scenes)}")
print(f"Already processed: {len(processed_scenes)}")
print(f"Remaining to process: {len(unprocessed_scenes)}")

Total scenes: 1143
Already processed: 0
Remaining to process: 964


In [10]:
# Get a balanced batch of scenes to process
MAX_SCENES = 1  # Total scenes to process
MAX_PER_DRIVE = 1  # Maximum concurrent scenes per HDD (not consulted by the selection below)
STORAGE_DRIVES = ['X:', 'Y:', 'Z:', 'W:']  # Available drives for processing

# First, group all unprocessed scenes by drive
drive_scenes = {drive: [] for drive in STORAGE_DRIVES}  # Initialize all drives

# Group scenes by drive
for scene in unprocessed_scenes.to_dicts():
    performers = []
    # The loop variable is deliberately not called `performer`: that name holds
    # the performer record selected earlier and is read by the scene query cell.
    for scene_performer in scene["stashapp_performers"]:
        stashdb_info = get_stashdb_performer(scene_performer)
        if stashdb_info:
            performers.append(f"{stashdb_info['stash_id']} - {stashdb_info['name']}")

    if performers:
        video_path = scene["stashapp_primary_file_path"]
        drive = os.path.splitdrive(video_path)[0].upper()

        if drive in STORAGE_DRIVES:  # Only process from our storage drives
            scene['video_path'] = video_path
            scene['performers'] = performers
            drive_scenes[drive].append(scene)

# Round-robin selection: take one scene per drive per pass, and keep cycling
# until MAX_SCENES are selected or every drive is exhausted. A drive without
# scenes is skipped rather than consuming one of the MAX_SCENES slots.
scenes_to_process = []
while len(scenes_to_process) < MAX_SCENES and any(drive_scenes.values()):
    for drive in STORAGE_DRIVES:
        if len(scenes_to_process) >= MAX_SCENES:
            break
        if drive_scenes[drive]:  # If this drive has any scenes left
            scenes_to_process.append(drive_scenes[drive].pop(0))

# Print scene distribution
print("Scenes per drive:")
drive_counts = {}
for scene in scenes_to_process:
    drive = os.path.splitdrive(scene['video_path'])[0].upper()
    drive_counts[drive] = drive_counts.get(drive, 0) + 1

for drive in STORAGE_DRIVES:
    print(f"{drive}: {drive_counts.get(drive, 0)} scenes")

# Create JSON files in pending directory
pending_dir = Path("H:\\Faces\\dataset") / 'scenes' / SceneState.PENDING.value
os.makedirs(pending_dir, exist_ok=True)

# One "<scene id>.json" per selected scene, holding the video path and the
# performer labels.
for scene in scenes_to_process:
    scene_id = scene['stashapp_stashdb_id']
    scene_data = {
        'video_path': scene['video_path'],
        'performers': scene['performers']
    }

    json_path = pending_dir / f"{scene_id}.json"
    # Explicit encoding so the file does not depend on the platform default.
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(scene_data, f, indent=2)

print(f"\nCreated {len(scenes_to_process)} JSON files in {pending_dir}")

Scenes per drive:
X:: 1 scenes
Y:: 0 scenes
Z:: 0 scenes
W:: 0 scenes

Created 1 JSON files in H:\Faces\dataset\scenes\1_pending


In [None]:
from scripts.prepare_scenes import prepare_scenes_for_performer

# Queue scenes for processing
# NOTE(review): this looks like a scripted counterpart of the manual
# selection/queueing cells above — confirm before running both.
base_dir = "H:\\Faces\\dataset"  # same dataset root the pending-JSON cell writes under
stats = prepare_scenes_for_performer("Alexis Crystal", base_dir)

# The returned mapping is read for three counters.
print(f"Total scenes found: {stats['total_scenes']}")
print(f"Already processed: {stats['already_processed']}")
print(f"Newly queued: {stats['newly_queued']}")

In [None]:
# Get a batch of scenes to process
BATCH_SIZE = 12  # Number of unprocessed scenes considered for this batch

scenes_to_process = []
drive_counts = {}  # Track number of scenes per drive

# Get scenes and count per drive
for scene in unprocessed_scenes.head(BATCH_SIZE).to_dicts():
    performers = []
    # The loop variable is deliberately not called `performer`: that name holds
    # the performer record selected earlier and is read by the scene query cell.
    for scene_performer in scene["stashapp_performers"]:
        stashdb_info = get_stashdb_performer(scene_performer)
        if stashdb_info:
            performers.append(f"{stashdb_info['stash_id']} - {stashdb_info['name']}")

    # Scenes without any StashDB performer are left out of the batch.
    if performers:
        video_path = scene["stashapp_primary_file_path"]
        drive = os.path.splitdrive(video_path)[0].upper()

        drive_counts[drive] = drive_counts.get(drive, 0) + 1

        scene['video_path'] = video_path
        scene['performers'] = performers
        scenes_to_process.append(scene)

print("Scenes per drive:")
for drive, count in drive_counts.items():
    print(f"{drive}: {count} scenes")

# Process multiple scenes in parallel
# Note: this rebinds the notebook-level `builder` used by the other cells.
builder = FaceDatasetBuilder(max_concurrent_scenes=8)  # Total concurrent scenes
results = builder.process_multiple_scenes(scenes_to_process)

# Print results with more detailed error information
for result in results:
    if result['status'] == 'success':
        print(f"Scene {result['scene_id']}: Extracted {result['faces_extracted']} faces")
    else:
        print(f"Scene {result['scene_id']}: Error - {result['error']}")
        if 'stderr' in result:
            print(f"ffmpeg stderr:\n{result['stderr']}")

In [None]:
from line_profiler import LineProfiler

# Create line profiler
lp = LineProfiler()

# Profile specific methods
lp.add_function(builder.process_scene)
lp_wrapper = lp(builder.process_scene)

# Pick the scene to profile explicitly instead of relying on variables left
# behind by earlier cells.
# NOTE(review): the first unprocessed scene is used as the sample — swap in a
# specific scene if a particular one should be profiled.
sample_scene = unprocessed_scenes.head(1).to_dicts()[0]
sample_performers = [
    f"{info['stash_id']} - {info['name']}"
    for info in map(get_stashdb_performer, sample_scene["stashapp_performers"])
    if info
]

# Run the profiled code
faces_extracted = lp_wrapper(
    sample_scene["stashapp_primary_file_path"],
    sample_scene['stashapp_stashdb_id'],
    sample_performers
)

# Print the line-by-line stats
lp.print_stats()

In [None]:
# Process your sample scene
# The sample is selected here so the cell runs on a fresh kernel.
# NOTE(review): the first unprocessed scene is used — swap in a specific scene
# if a particular one is wanted.
sample_scene = unprocessed_scenes.head(1).to_dicts()[0]
faces_extracted = process_scene(sample_scene)
print(f"Extracted {faces_extracted} faces")

# Verify directories exist
def print_directory_tree(base_dir, max_depth=2):
    """Print the directories under ``base_dir`` as an indented tree.

    Args:
        base_dir: Root directory to list. A missing directory prints nothing.
        max_depth: Deepest level (relative to ``base_dir``, which is level 0)
            that is printed; the walk does not descend any further.
    """
    base_dir = os.path.normpath(base_dir)
    for root, dirs, _files in os.walk(base_dir):
        # Depth is derived from the relative path rather than string
        # replacement, so a repeated path fragment cannot skew it.
        rel = os.path.relpath(root, base_dir)
        level = 0 if rel == os.curdir else rel.count(os.sep) + 1
        indent = ' ' * 4 * level
        print(f"{indent}{os.path.basename(root)}/")
        if level >= max_depth:
            # Pruning in place stops os.walk from visiting deeper levels.
            dirs[:] = []
        else:
            dirs.sort()  # stable, readable ordering


base_dir = "H:\\Faces\\dataset"
print("\nDirectory structure:")
print_directory_tree(base_dir)  # Only show first two levels below the root

In [None]:
# Note: this rebinds the notebook-level `builder` used by the other cells.
builder = FaceDatasetBuilder(max_concurrent_scenes=4)  # Adjust based on your storage devices

# After manually moving faces to correct performer directories in Windows Explorer
scene_stashdb_id = "7a39d783-a458-4c6d-8407-7565e09a3c12"
builder.verify_scene(scene_stashdb_id)

# Check how many faces we have for each performer, largest count first.
# The loop variable is deliberately not called `performer`: that name holds the
# performer record selected earlier and is read by the scene query cell.
face_counts = builder.get_performer_face_count()
for performer_label, count in sorted(face_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"{performer_label}: {count} faces")

# Diversifying selected images

In [None]:
from PIL import Image
import imagehash
import numpy as np
from sklearn.cluster import KMeans

def deduplicate_faces(face_dir, max_faces=100, random_state=0):
    """Cluster similar faces and keep the most representative ones.

    Every ``.jpg`` in ``face_dir`` is reduced to an average hash, the hash
    bits are clustered with KMeans, and for each cluster the image whose hash
    lies closest to the cluster center is kept.

    Args:
        face_dir: Directory containing the face images.
        max_faces: Upper bound on the number of images returned.
        random_state: Seed passed to KMeans so repeated runs select the same
            images.

    Returns:
        List of full paths of the kept images; empty when the directory holds
        no readable ``.jpg`` files or ``max_faces`` is not positive.
    """
    # Calculate perceptual hashes for all images
    hashes = []
    paths = []
    # Sorted so the result does not depend on directory listing order.
    for img_path in sorted(os.listdir(face_dir)):
        if img_path.endswith('.jpg'):
            full_path = os.path.join(face_dir, img_path)
            try:
                # Context manager closes the file handle once hashed.
                with Image.open(full_path) as img:
                    img_hash = imagehash.average_hash(img)
                hashes.append(img_hash)
                paths.append(full_path)
            except Exception as e:
                # Best effort: report unreadable images and carry on.
                print(f"Error processing {img_path}: {e}")

    # KMeans cannot be fitted with zero samples or zero clusters.
    if not paths or max_faces <= 0:
        return []

    # Convert hashes to feature vectors. The hash bits are read from the
    # underlying boolean array; the string form is hexadecimal and cannot be
    # parsed digit by digit with int().
    hash_vectors = np.array([h.hash.flatten() for h in hashes], dtype=float)

    # Cluster similar images. Identical hashes cannot be split into separate
    # clusters, so the cluster count is capped by the number of distinct ones.
    n_distinct = len(np.unique(hash_vectors, axis=0))
    n_clusters = min(max_faces, n_distinct)
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    clusters = kmeans.fit_predict(hash_vectors)

    # Keep images closest to cluster centers
    kept_images = []
    for i in range(n_clusters):
        cluster_indices = np.where(clusters == i)[0]
        if len(cluster_indices) > 0:
            distances = np.linalg.norm(
                hash_vectors[cluster_indices] - kmeans.cluster_centers_[i], axis=1
            )
            center_idx = cluster_indices[int(np.argmin(distances))]
            kept_images.append(paths[center_idx])

    return kept_images

In [None]:
import face_recognition
import numpy as np

def get_diverse_faces(face_dir, max_faces=100, random_state=0):
    """Select faces with diverse angles and expressions.

    Every ``.jpg`` in ``face_dir`` in which face landmarks are found is turned
    into a feature vector, the vectors are clustered with KMeans, and for each
    cluster the image closest to the cluster center is selected.

    NOTE(review): ``extract_features_from_landmarks`` is not defined in the
    visible notebook cells; it must exist in the kernel before this is called.

    Args:
        face_dir: Directory containing the face images.
        max_faces: Upper bound on the number of images returned.
        random_state: Seed passed to KMeans so repeated runs select the same
            images.

    Returns:
        List of full paths of the selected images; empty when no usable face
        was found or ``max_faces`` is not positive.
    """
    faces_data = []
    # Sorted so the result does not depend on directory listing order.
    for img_path in sorted(os.listdir(face_dir)):
        if img_path.endswith('.jpg'):
            full_path = os.path.join(face_dir, img_path)
            try:
                # Get face landmarks
                image = face_recognition.load_image_file(full_path)
                landmarks = face_recognition.face_landmarks(image)
                if landmarks:
                    # Calculate features from landmarks (e.g., eye distance, mouth openness)
                    # Only the first detected face of the image is used.
                    features = extract_features_from_landmarks(landmarks[0])
                    faces_data.append((full_path, features))
            except Exception as e:
                # Best effort: report unreadable images and carry on.
                print(f"Error processing {img_path}: {e}")

    # KMeans cannot be fitted with zero samples or zero clusters.
    if not faces_data or max_faces <= 0:
        return []

    # Cluster faces based on features
    features_array = np.array([f for _, f in faces_data])
    n_clusters = min(max_faces, len(faces_data))
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    clusters = kmeans.fit_predict(features_array)

    # Select the face nearest to each cluster center as its representative
    diverse_faces = []
    for i in range(n_clusters):
        cluster_indices = np.where(clusters == i)[0]
        if len(cluster_indices) > 0:
            distances = np.linalg.norm(
                features_array[cluster_indices] - kmeans.cluster_centers_[i], axis=1
            )
            nearest_idx = cluster_indices[int(np.argmin(distances))]
            diverse_faces.append(faces_data[nearest_idx][0])

    return diverse_faces