In [2]:
import pandas as pd
import dotenv
import os
from libraries.client_stashapp import get_stashapp_client
from libraries.StashDbClient import StashDbClient

# Load variables from a .env file into the process environment before they are read below.
dotenv.load_dotenv()

# Client for the Stash instance; used by later cells for scene/tag/performer queries and updates.
stash = get_stashapp_client()

# Client for StashDB; endpoint and API key are taken from the environment.
# NOTE(review): os.getenv returns None for an unset variable — confirm StashDbClient
# rejects a missing endpoint/key instead of failing later on the first query.
stashbox_client = StashDbClient(
    os.getenv("STASHDB_ENDPOINT"),
    os.getenv("STASHDB_API_KEY"),
)

dUsing stash (v0.27.1-0) endpoint at http://localhost:6969/graphql
dPersisting Connection to Stash with ApiKey...


In [10]:
def get_scenes_with_stashdb_id():
    """Query Stash for all scenes that have a stash ID on the StashDB endpoint.

    Returns:
        Whatever ``stash.find_scenes`` returns for the filter; each scene is
        requested with its id, title, details, performers (with their stash
        IDs), its own stash IDs and its tags.
    """
    scene_filter = {
        "stash_id_endpoint": {
            "endpoint": "https://stashdb.org/graphql",
            "modifier": "NOT_NULL",
        }
    }
    # Fields requested for every scene.
    scene_fragment = "id title details performers { id name stash_ids { stash_id endpoint } } stash_ids { stash_id endpoint } tags { id name }"
    return stash.find_scenes(scene_filter, fragment=scene_fragment)

# Fetch the matching scenes and load them into a DataFrame
# (presumably one row per scene dict — verify against the find_scenes return shape).
scenes_with_stashdb_id = get_scenes_with_stashdb_id()
df_scenes = pd.DataFrame(scenes_with_stashdb_id)


In [11]:
# Step 2: Derive the StashDB scene ID, the sorted tag names and the performers'
# StashDB IDs for every scene.
def _first_stashdb_id(stash_ids):
    """Return the first stash_id registered for the StashDB endpoint, or None."""
    for entry in stash_ids:
        if entry['endpoint'] == 'https://stashdb.org/graphql':
            return entry['stash_id']
    return None


def _sorted_tag_names(tags):
    """Return the names of the given tags in sorted order."""
    return sorted(tag['name'] for tag in tags)


def _performer_stashdb_ids(performers):
    """Return the set of StashDB IDs of the performers (None for a performer without one)."""
    return {
        _first_stashdb_id(performer.get('stash_ids', []))
        for performer in performers
    }


df_scenes['stashdb_id'] = df_scenes['stash_ids'].apply(_first_stashdb_id)
df_scenes['existing_tags'] = df_scenes['tags'].apply(_sorted_tag_names)
df_scenes['existing_performers'] = df_scenes['performers'].apply(_performer_stashdb_ids)

In [12]:
# Step 3: Fetch StashDB data for each scene
import concurrent.futures

def get_stashdb_data(stashdb_id):
    """Fetch the StashDB payload for a single scene.

    Args:
        stashdb_id: StashDB scene ID. May be falsy (None / "") when the local
            scene has no stash ID for the StashDB endpoint.

    Returns:
        The result of ``stashbox_client.query_scenes(stashdb_id)``, or None
        when no ID was supplied (no request is made in that case).
    """
    if not stashdb_id:
        # The ID extraction falls back to None; there is nothing to look up.
        return None
    return stashbox_client.query_scenes(stashdb_id)


def parallel_fetch_stashdb_data(stashdb_ids, max_workers=16, fetch=None):
    """Fetch StashDB data for many scene IDs concurrently.

    Args:
        stashdb_ids: Iterable of StashDB scene IDs (entries may be None).
        max_workers: Size of the thread pool. Defaults to 16.
        fetch: Callable taking one ID and returning its payload. Defaults to
            ``get_stashdb_data`` (resolved at call time so a redefined
            function is picked up).

    Returns:
        list: One result per input ID, in the same order as ``stashdb_ids``.

    Raises:
        Any exception raised by ``fetch`` propagates to the caller.
    """
    if fetch is None:
        fetch = get_stashdb_data
    ids = list(stashdb_ids)
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in input order, keeping them aligned with the IDs.
        return list(executor.map(fetch, ids))

# Fetch the StashDB payload for every scene. The fetch helper returns results in the
# order of `stashdb_ids`, so the list lines up with the rows of df_scenes.
stashdb_ids = df_scenes['stashdb_id'].tolist()
df_scenes['stashdb_data'] = parallel_fetch_stashdb_data(stashdb_ids)


In [14]:
import os
import json
import requests
from urllib.parse import urlparse
from pathlib import Path

# Directory that receives the StashDB JSON responses and the downloaded images.
# It can be overridden with the STASHDB_JSON_DIR environment variable (e.g. in .env)
# so the notebook is not tied to one machine's drive layout; the previous
# hard-coded path remains the default. An empty value falls back to the default.
json_dir = os.getenv("STASHDB_JSON_DIR") or r"X:\StashDB JSON"

# Create the directory if it doesn't exist
os.makedirs(json_dir, exist_ok=True)

# Persist one StashDB response as a JSON file
def save_json(stashdb_id, data):
    """Write ``data`` as 2-space-indented JSON to ``<json_dir>/<stashdb_id>.json``.

    Args:
        stashdb_id: Used as the file name (without extension).
        data: JSON-serialisable object to store.
    """
    target_path = os.path.join(json_dir, f"{stashdb_id}.json")
    with open(target_path, 'w') as out_file:
        json.dump(data, out_file, indent=2)

# Function to download and save the first image
def _extension_from_content_type(content_type):
    """Return the media subtype of a Content-Type value (e.g. 'jpeg'), or '' if unusable."""
    if not content_type:
        return ""
    # Drop parameters such as "; charset=binary" before taking the subtype.
    media_type = content_type.split(';', 1)[0].strip()
    subtype = media_type.split('/')[-1].strip()
    # The header comes from a remote server and ends up in a file name, so only
    # accept characters that are harmless there.
    if not subtype or not all(ch.isalnum() or ch in '+-.' for ch in subtype):
        return ""
    return subtype


def save_first_image(stashdb_id, image_url, timeout=30):
    """Download ``image_url`` and store it as ``<json_dir>/<stashdb_id>.<ext>``.

    The extension is the subtype of the response's Content-Type header
    (``image/jpeg`` -> ``jpeg``). Every failure is reported with ``print`` and
    never raised, so one bad image does not abort a batch run.

    Args:
        stashdb_id: Used as the file name (without extension).
        image_url: URL of the image to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled connection.
    """
    try:
        response = requests.get(image_url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to download image: {image_url} ({exc})")
        return

    if response.status_code != 200:
        print(f"Failed to download image: {image_url}")
        return

    ext = _extension_from_content_type(response.headers.get('content-type'))
    if not ext:
        print(f"Could not determine file type for image: {image_url}")
        return

    filename = f"{stashdb_id}.{ext}"
    filepath = os.path.join(json_dir, filename)
    with open(filepath, 'wb') as f:
        f.write(response.content)
    print(f"Saved image: {filename}")

# Save StashDB data and first image for each scene
for stashdb_id, stashdb_data in zip(df_scenes['stashdb_id'], df_scenes['stashdb_data']):
    if not (stashdb_id and stashdb_data):
        continue
    save_json(stashdb_id, stashdb_data)

    # A key can be present with a null value (e.g. `findScene: null`), so treat
    # None the same as "missing" instead of calling .get on it.
    # NOTE(review): assumes stashdb_data is a dict, as the original 'data' lookup did.
    find_scene = (stashdb_data.get('data') or {}).get('findScene') or {}
    images = find_scene.get('images') or []

    # Download only the first image, if it has a URL.
    if images:
        first_image_url = images[0].get('url')
        if first_image_url:
            save_first_image(stashdb_id, first_image_url)

print(f"Saved StashDB JSON responses and first images to {json_dir}")


Saved image: 76911cf0-07ca-4fca-8765-53b5bb64243d.jpeg
Saved image: 47abd0ba-d2a5-4147-9b3e-757908551a4c.jpeg
Saved image: 98ee1214-ed1e-4665-b7c2-a77a6b887f86.jpeg
Saved image: 38368c65-7137-4814-9076-c56b6b249f47.jpeg
Saved image: 5ea75f22-3f3a-4d63-9013-a0165e7080ff.jpeg
Saved image: 00f83726-2e1c-480c-b5c2-64353ea2ecf2.jpeg
Saved image: 6b375722-1988-4126-bf23-cabf4bb2b218.jpeg
Saved image: 9617ac3d-c8c3-4988-9db4-0c4c698ed924.jpeg
Saved image: ca7889e6-0399-48c5-95e1-fd7b8fc45834.jpeg
Saved image: 2ab79c7f-bc59-4aee-9c76-99cce61ccddc.jpeg
Saved image: d46c66c4-da90-496e-a847-017e2d816ca2.jpeg
Saved image: 5fdd0e5a-85e5-44d4-b297-01f398a5451f.jpeg
Saved image: e6b8ccd4-da7d-42fc-a2de-1dd8428e2fc7.jpeg
Saved image: 419558e4-341e-4bee-bbea-407a8edd61d6.jpeg
Saved image: 693721b1-6326-479e-8439-f8b728a87221.jpeg
Saved image: 0e3b3943-b916-4371-abfd-de8938cb426c.jpeg
Saved image: 2635b20e-e321-465d-be10-1246c111089a.jpeg
Saved image: 6e1a9973-2b19-4bee-acb6-4a9c5d6803d0.jpeg
Saved imag

In [15]:
# Create a backup copy of the original df_scenes.
# A later cell restores df_scenes from df_scenes_original, so this assignment must
# actually run; while commented out, that cell fails with a NameError on a fresh kernel.
df_scenes_original = df_scenes.copy()

In [32]:


# Reset df_scenes to a fresh copy of the backup frame.
# Note: no subsetting happens here — the copy has the same number of rows as the backup.
df_scenes = df_scenes_original.copy()

print(f"Original number of scenes: {len(df_scenes_original)}")
print(f"New number of scenes: {len(df_scenes)}")


Original number of scenes: 15583
New number of scenes: 15583


In [22]:
# Step 4: Extract performers from StashDB data
def extract_performers(stashdb_data):
    """Return the set of StashDB performer IDs listed on a fetched scene.

    Args:
        stashdb_data: Response for one scene, shaped like
            ``{'data': {'findScene': {'performers': [{'performer': {'id': ...}}]}}}``,
            or a falsy / non-dict value when nothing usable was fetched.

    Returns:
        set: The performer IDs. Always a set; it is empty when the response is
        missing or when 'data', 'findScene' or 'performers' is absent or null.
    """
    if not isinstance(stashdb_data, dict):
        return set()
    # A key may be present with a null value, so fall back to an empty container
    # rather than indexing into None.
    find_scene = (stashdb_data.get('data') or {}).get('findScene') or {}
    performers = find_scene.get('performers') or []
    return {entry['performer']['id'] for entry in performers}

def _performers_only_in_stashdb(row):
    """Performer IDs StashDB lists for the scene that the local scene does not have."""
    return set(row['stashdb_performers']) - set(row['existing_performers'])


def _performers_only_in_local(row):
    """Performer IDs on the local scene that StashDB does not list for it."""
    return set(row['existing_performers']) - set(row['stashdb_performers'])


df_scenes['stashdb_performers'] = df_scenes['stashdb_data'].apply(extract_performers)

# Step 5: Build a per-scene frame holding both performer sets side by side
df_performers = pd.DataFrame({
    'scene_id': df_scenes['id'],
    'existing_performers': df_scenes['existing_performers'],
    'stashdb_performers': df_scenes['stashdb_performers'],
})

# Step 6: Compute the set difference in each direction
df_performers['missing_performers'] = df_performers.apply(_performers_only_in_stashdb, axis=1)
df_performers['extra_performers'] = df_performers.apply(_performers_only_in_local, axis=1)

# Step 7: Look up the matching performers in the local Stash
def find_stash_performers(performer_ids):
    """Query Stash once per StashDB performer ID and collect the results.

    Args:
        performer_ids: Iterable of StashDB performer IDs; falsy entries are skipped.

    Returns:
        list: One ``stash.find_performers`` result per non-falsy ID, in iteration order.
    """
    lookups = []
    for performer_id in performer_ids:
        if not performer_id:
            continue
        performer_filter = {
            "stash_id_endpoint": {
                "endpoint": "https://stashdb.org/graphql",
                "stash_id": performer_id,
                "modifier": "EQUALS",
            }
        }
        lookups.append(stash.find_performers(performer_filter))
    return lookups

def _has_any_performer(performer_ids):
    """True when the collection of performer IDs is non-empty."""
    return len(performer_ids) > 0


df_performers['stash_performers'] = df_performers['stashdb_performers'].apply(find_stash_performers)

# Step 8: Flag scenes that differ from StashDB in either direction
df_performers['has_local_performers_not_in_stashdb'] = df_performers['extra_performers'].apply(_has_any_performer)
df_performers['has_stashdb_performers_not_in_local'] = df_performers['missing_performers'].apply(_has_any_performer)

# Step 9: Keep the columns used by the following cells, in a fixed order
df_performers_selected = df_performers[[
    'scene_id',
    'existing_performers',
    'stashdb_performers',
    'missing_performers',
    'extra_performers',
    'stash_performers',
    'has_local_performers_not_in_stashdb',
    'has_stashdb_performers_not_in_local',
]]

In [29]:
# Look up the two marker tags by name; the following cells attach them to scenes.
# NOTE(review): those cells index the result with ['id'] — confirm find_tag cannot
# return None here (i.e. both tags already exist in Stash).
tag_incoming_performers = stash.find_tag("StashDB: Incoming Performers")
tag_outgoing_performers = stash.find_tag("StashDB: Outgoing Performers")

In [27]:
# Scenes for which StashDB lists performers that the local scene does not have.
df_scenes_with_incoming_stashdb_performers = df_performers_selected[df_performers_selected['has_stashdb_performers_not_in_local']][['scene_id']]

# Prepend the "incoming performers" marker tag to each of those scenes.
for scene_id in df_scenes_with_incoming_stashdb_performers['scene_id']:
    # Re-read the scene so the update is based on its current tag list.
    scene = stash.find_scene(scene_id)
    current_tag_ids = [tag['id'] for tag in scene['tags']]
    if tag_incoming_performers['id'] in current_tag_ids:
        # Already tagged (e.g. the cell was run before): skip, so re-running the
        # cell never sends a tag list containing the marker twice.
        continue
    updated_tag_ids = [tag_incoming_performers['id']] + current_tag_ids
    stash.update_scene({
        "id": scene_id,
        "tag_ids": updated_tag_ids
    })

In [31]:
# Scenes whose local performer set contains entries that StashDB does not list.
df_scenes_with_outgoing_stashdb_performers = df_performers_selected[df_performers_selected['has_local_performers_not_in_stashdb']][['scene_id']]

# Prepend the "outgoing performers" marker tag to each of those scenes.
for scene_id in df_scenes_with_outgoing_stashdb_performers['scene_id']:
    # Re-read the scene so the update is based on its current tag list.
    scene = stash.find_scene(scene_id)
    current_tag_ids = [tag['id'] for tag in scene['tags']]
    if tag_outgoing_performers['id'] in current_tag_ids:
        # Already tagged (e.g. the cell was run before): skip, so re-running the
        # cell never sends a tag list containing the marker twice.
        continue
    updated_tag_ids = [tag_outgoing_performers['id']] + current_tag_ids
    stash.update_scene({
        "id": scene_id,
        "tag_ids": updated_tag_ids
    })


In [33]:
# Step 4: Extract tags from StashDB data
def extract_tags(stashdb_data):
    """Return the sorted tag names listed on a fetched StashDB scene.

    Args:
        stashdb_data: Response for one scene, shaped like
            ``{'data': {'findScene': {'tags': [{'name': ...}, ...]}}}``,
            or a falsy / non-dict value when nothing usable was fetched.

    Returns:
        list: Tag names in sorted order; empty when the response is missing or
        when 'data', 'findScene' or 'tags' is absent or null.
    """
    if not isinstance(stashdb_data, dict):
        return []
    # A key may be present with a null value, so fall back to an empty container
    # rather than indexing into None.
    find_scene = (stashdb_data.get('data') or {}).get('findScene') or {}
    tags = find_scene.get('tags') or []
    return sorted(tag['name'] for tag in tags)

def _tags_only_in_stashdb(row):
    """Sorted tag names StashDB lists for the scene that the local scene does not have."""
    return sorted(set(row['stashdb_tags']) - set(row['existing_tags']))


def _tags_only_in_local(row):
    """Sorted tag names on the local scene that StashDB does not list for it."""
    return sorted(set(row['existing_tags']) - set(row['stashdb_tags']))


df_scenes['stashdb_tags'] = df_scenes['stashdb_data'].apply(extract_tags)

# Step 5: Build a per-scene frame holding both tag lists side by side
df_tags = pd.DataFrame({
    'scene_id': df_scenes['id'],
    'existing_tags': df_scenes['existing_tags'],
    'stashdb_tags': df_scenes['stashdb_tags'],
})

# Step 6: Compute the tag difference in each direction
df_tags['missing_tags'] = df_tags.apply(_tags_only_in_stashdb, axis=1)
df_tags['extra_tags'] = df_tags.apply(_tags_only_in_local, axis=1)

# Step 7: Look up the matching tags in the local Stash
def find_stash_tags(tag_names):
    """Look up each tag name with ``stash.find_tag`` and return the results sorted by name.

    Args:
        tag_names: Iterable of tag names to look up.

    Returns:
        list: One lookup result per name, ordered by the result's 'name';
        falsy results sort as if their name were the empty string.
    """
    def _name_or_blank(tag):
        return tag['name'] if tag else ''

    found = [stash.find_tag(tag_name) for tag_name in tag_names]
    found.sort(key=_name_or_blank)
    return found

def _has_any_tag(tag_names):
    """True when the collection of tag names is non-empty."""
    return len(tag_names) > 0


df_tags['stash_tags'] = df_tags['stashdb_tags'].apply(find_stash_tags)

# Step 8: Flag scenes whose tags differ from StashDB in either direction
df_tags['has_local_tags_not_in_stashdb'] = df_tags['extra_tags'].apply(_has_any_tag)
df_tags['has_stashdb_tags_not_in_local'] = df_tags['missing_tags'].apply(_has_any_tag)

# Step 9: Keep the columns of interest, in a fixed order
df_tags_selected = df_tags[[
    'scene_id',
    'existing_tags',
    'stashdb_tags',
    'missing_tags',
    'extra_tags',
    'stash_tags',
    'has_local_tags_not_in_stashdb',
    'has_stashdb_tags_not_in_local',
]]


dMatched multiple tags with name='18+' ['5030', '7304'] returning first match
dMatched multiple tags with name='18+' ['5030', '7304'] returning first match


In [None]:
# Update tags for scenes where tags are not identical
# NOTE(review): `df_selected` and the 'tags_identical' / 'id' columns are not defined in
# any cell visible above (the tag frame built earlier is `df_tags_selected`, keyed by
# 'scene_id') — confirm where they come from before running this cell.
# NOTE(review): "tag_ids" is built only from row['stash_tags'], so this looks like it
# replaces the scene's tag list rather than extending it — confirm that is intended.
for index, row in df_selected.iterrows():
    if not row['tags_identical'] and row['stash_tags']:
        stash.update_scene({
            "id": row['id'],
            # Falsy entries in stash_tags are skipped.
            "tag_ids": [tag['id'] for tag in row['stash_tags'] if tag]
        })