In [None]:
import polars as pl
import sys
import os

# Put the parent of the notebook's working directory on the import path;
# presumably this is what lets the `libraries.*` imports below resolve.
_notebook_dir = os.path.abspath('')
sys.path.append(os.path.dirname(_notebook_dir))

from libraries.client_stashapp import get_stashapp_client, StashAppClient
from libraries.StashDbClient import StashDbClient

# Handles for the local Stash instance: one from the factory helper and one
# from the project's wrapper class (the latter is what the cells below query).
stash = get_stashapp_client()
stash_client = StashAppClient()

# StashDB client; endpoint and API key are read from the environment so that
# no credentials are stored in the notebook. Either value is None when unset.
stashbox_client = StashDbClient(
    os.environ.get("STASHDB_ENDPOINT"),
    os.environ.get("STASHDB_API_KEY"),
)

In [None]:
# Keep only the identifying columns, and only the performers that carry a
# StashDB id (the image lookup further down is keyed on that id).
all_performers = (
    stash_client.get_performers()
    .select("stashapp_id", "stashapp_name", "stashapp_stashdb_id")
    .filter(pl.col("stashapp_stashdb_id").is_not_null())
)

In [None]:
import requests
from pathlib import Path
import mimetypes
import os
import sys
from tqdm import tqdm

class StashDBImageDownloader:
    """Download performer images into one folder per performer.

    Images are stored as
    ``<output_dir>/<performer_id> - <performer_name>/<image name><ext>``,
    where ``<ext>`` is derived from the Content-Type header of the response
    and characters that are not valid in file names are replaced in the
    folder name.
    """

    # Checked before the ``mimetypes`` database so the common image types map
    # to the same extension regardless of interpreter version or host
    # configuration.
    _MIME_TO_EXT = {
        'image/jpeg': '.jpg',
        'image/jpg': '.jpg',
        'image/png': '.png',
        'image/webp': '.webp',
    }

    # Characters that are not allowed in a Windows file name (plus ASCII
    # control characters); each one is replaced by "_" in folder names.
    _UNSAFE_NAME_CHARS = str.maketrans(
        {ch: '_' for ch in '<>:"/\\|?*' + ''.join(map(chr, range(32)))}
    )

    def __init__(self, output_dir: Path, timeout: float = 30.0):
        """Create the downloader and make sure the output directory exists.

        Args:
            output_dir: Root directory (``str`` or ``Path``) that receives the
                per-performer folders; created if missing.
            timeout: Seconds to wait for the server before a request is
                abandoned. Passed straight to ``requests.get``.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.timeout = timeout

        # Ensure we have valid MIME type mappings
        mimetypes.init()
        # Add common image MIME types if not present
        if not mimetypes.types_map.get('.jpg'):
            mimetypes.add_type('image/jpeg', '.jpg')
        if not mimetypes.types_map.get('.png'):
            mimetypes.add_type('image/png', '.png')

    def get_file_extension(self, mime_type: str) -> str:
        """Return the file extension (with leading dot) for a MIME type.

        Parameters such as ``; charset=...``, surrounding whitespace and
        letter case are ignored. An empty/missing or unrecognised MIME type
        yields ``'.jpg'``.
        """
        normalized = (mime_type or '').split(';')[0].strip().lower()
        if not normalized:
            return '.jpg'  # Default to jpg if no MIME type

        known = self._MIME_TO_EXT.get(normalized)
        if known:
            return known

        # Anything else is delegated to the standard MIME database.
        return mimetypes.guess_extension(normalized) or '.jpg'

    def _performer_dir(self, performer_id: str, performer_name: str) -> Path:
        """Return the folder for a performer, with unsafe characters replaced."""
        folder_name = f"{performer_id} - {performer_name}".translate(self._UNSAFE_NAME_CHARS)
        return self.output_dir / folder_name

    def download_image(self, url: str, performer_id: str, performer_name: str) -> bool:
        """Download a single image into the performer's folder.

        Args:
            url: Image URL; its last path segment (without query string)
                becomes the file name.
            performer_id: Identifier used as the first part of the folder name.
            performer_name: Name used as the second part of the folder name.

        Returns:
            True if a new file was written; False if the file already existed
            or the download failed (the error is printed, not raised).
        """
        try:
            # Make request with proper headers
            headers = {
                'User-Agent': 'Mozilla/5.0',
                'Accept': 'image/jpeg,image/png,image/*'
            }
            # The context manager releases the streamed connection on every
            # exit path; the timeout keeps a stalled server from blocking
            # the whole run.
            with requests.get(url, headers=headers, stream=True, timeout=self.timeout) as response:
                response.raise_for_status()

                ext = self.get_file_extension(response.headers.get('content-type', ''))

                performer_dir = self._performer_dir(performer_id, performer_name)
                performer_dir.mkdir(parents=True, exist_ok=True)

                image_name = url.split('/')[-1].split('?')[0]  # Remove query params
                image_path = performer_dir / f"{image_name}{ext}"

                if image_path.exists():
                    return False  # Image already exists

                # Stream into a temporary file and rename once complete, so an
                # interrupted transfer never leaves a truncated file that a
                # later run would mistake for a finished download.
                temp_path = image_path.with_name(image_path.name + '.part')
                try:
                    with open(temp_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            if chunk:
                                f.write(chunk)
                    temp_path.replace(image_path)
                except BaseException:
                    # Also covers KeyboardInterrupt (e.g. kernel interrupt).
                    if temp_path.exists():
                        temp_path.unlink()
                    raise
                return True

        except Exception as e:
            # Deliberately best-effort: one bad image must not abort the batch.
            print(f"Error downloading {url}: {str(e)}")
            return False

    def download_performer_images(self, performer_id: str, performer_name: str, image_urls: list):
        """Download all images for a performer.

        The performer is skipped entirely when their folder already exists,
        and nothing is created when ``image_urls`` is empty. Progress and a
        final summary are printed; nothing is returned.
        """
        performer_dir = self._performer_dir(performer_id, performer_name)

        if performer_dir.exists():
            print(f"Skipping {performer_name} - directory already exists")
            return

        if not image_urls:
            print(f"No images found for {performer_name}")
            return

        successful = 0
        for url in tqdm(image_urls, desc="Downloading"):
            if self.download_image(url, performer_id, performer_name):
                successful += 1

        print(f"Successfully downloaded {successful}/{len(image_urls)} images")


In [None]:
import time

# Root folder that receives one sub-folder per performer.
output_directory = "H:\\Faces\\StashDB\\"
downloader = StashDBImageDownloader(output_directory)

# Performers are handled in batches with a pause after each batch that
# fetched something (presumably to go easy on the remote API).
BATCH_SIZE = 100
BATCH_PAUSE_SECONDS = 120

total_performers = len(all_performers)

for i in range(0, total_performers, BATCH_SIZE):
    downloaded = 0
    for performer in all_performers.slice(i, BATCH_SIZE).iter_rows(named=True):
        stashdb_id = performer['stashapp_stashdb_id']
        performer_name = performer['stashapp_name']

        # Derived from output_directory so the root path is written down in
        # exactly one place. Checking here first avoids querying the API for
        # performers whose folder is already on disk.
        path = os.path.join(output_directory, f"{stashdb_id} - {performer_name}")
        if os.path.exists(path):
            continue

        print(f"Downloading images for {performer_name}")
        performer_image_urls = stashbox_client.query_performer_images(stashdb_id)

        if performer_image_urls:  # Check if we got any URLs
            downloader.download_performer_images(
                stashdb_id,
                performer_name,
                performer_image_urls
            )
            downloaded += 1
        else:
            print(f"No images found for {performer_name}")

    # Sleep between batches if we downloaded anything; there is nothing to
    # wait for once the final batch has been processed.
    is_last_batch = i + BATCH_SIZE >= total_performers
    if downloaded > 0 and not is_last_batch:
        print(f"Sleeping for {BATCH_PAUSE_SECONDS / 60:g} minutes after downloading {downloaded} performers...")
        time.sleep(BATCH_PAUSE_SECONDS)