In [None]:
import os
from pathlib import Path
import sys

import dotenv
import polars as pl


# Put the parent of the current working directory on the import path; the
# `libraries.*` imports below presumably rely on this — TODO confirm the
# notebook is always started from its own folder.
sys.path.append(str(Path.cwd().parent))

from libraries.client_stashapp import StashAppClient, get_stashapp_client
from libraries.StashDbClient import StashDbClient


# Endpoint string this notebook uses to recognise (and write) StashDB entries
# inside a tag's `stash_ids` list.
STASHDB_ENDPOINT = "https://stashdb.org/graphql"


def contains_cjk(text):
    """Report whether ``text`` holds at least one CJK character.

    A character counts as CJK when its code point falls inside one of the
    inclusive ranges listed below (unified ideographs, extension A,
    compatibility ideographs, Hiragana, Katakana, Hangul syllables).

    Returns ``True`` on the first hit, ``False`` otherwise (including for an
    empty string).
    """
    # (first, last) code points, both inclusive.
    cjk_blocks = (
        (0x3040, 0x309F),  # Hiragana
        (0x30A0, 0x30FF),  # Katakana
        (0x3400, 0x4DBF),  # CJK Unified Ideographs Extension A
        (0x4E00, 0x9FFF),  # CJK Unified Ideographs
        (0xAC00, 0xD7AF),  # Korean Hangul Syllables
        (0xF900, 0xFAFF),  # CJK Compatibility Ideographs
    )

    for character in text:
        code_point = ord(character)
        for first, last in cjk_blocks:
            if first <= code_point <= last:
                return True
    return False


def extract_stashdb_id(stash_ids):
    """Return the ``stash_id`` of the first entry whose endpoint is StashDB.

    ``stash_ids`` is a sequence of mapping-like entries with ``endpoint`` and
    ``stash_id`` keys. ``None`` is returned when the sequence is ``None`` or
    empty, or when no entry points at ``STASHDB_ENDPOINT``.
    """
    # Explicit length check (not truthiness) so any sized container works.
    if stash_ids is None or len(stash_ids) == 0:
        return None

    stashdb_matches = (
        entry.get("stash_id")
        for entry in stash_ids
        if entry.get("endpoint") == STASHDB_ENDPOINT
    )
    return next(stashdb_matches, None)


# Load variables from a local .env file (if present) into the environment so
# the os.getenv() lookups below can see them.
dotenv.load_dotenv()

# Two handles onto the local Stash instance: `stash` is used for the
# find/create/update/destroy tag calls, `stash_client` for get_tags_by_names().
stash = get_stashapp_client()
stash_client = StashAppClient()

# Stash-box client for StashDB. When STASHDB_ENDPOINT is not set in the
# environment, fall back to the notebook's own constant instead of passing
# None, so the client and the stash_ids matching agree on the endpoint.
# NOTE(review): if the environment points at a different stash-box, the
# stash_ids matching in this notebook still uses the hardcoded constant.
stashbox_client = StashDbClient(
    os.getenv("STASHDB_ENDPOINT", STASHDB_ENDPOINT),
    os.getenv("STASHDB_API_KEY"),
)

In [None]:
# Fetch the tag list from StashDB through the stash-box client; the result is
# turned into a polars DataFrame further down.
stashdb_tags = stashbox_client.query_tags()
# NOTE(review): this displays the whole result — consider previewing a slice.
stashdb_tags

In [None]:
def _category_field(field_name):
    """Expression that lifts ``category[field_name]`` into ``category_<field_name>``.

    Rows whose ``category`` value is falsy yield ``None``.
    """
    return (
        pl.col("category")
        .map_elements(
            lambda category: category[field_name] if category else None,
            return_dtype=pl.Utf8,
        )
        .alias(f"category_{field_name}")
    )


# Flatten the nested `category` value into four string columns, then drop the
# nested column itself.
df_stashdb_tags = (
    pl.DataFrame(stashdb_tags)
    .with_columns(
        [
            _category_field(field_name)
            for field_name in ("id", "name", "description", "group")
        ]
    )
    .drop("category")
)

df_stashdb_tags

In [None]:
# Persist the flattened StashDB tag table as JSON.
# NOTE(review): absolute, machine-specific Windows path — consider a
# configurable data directory so the notebook runs on other machines.
df_stashdb_tags.write_json("H:\\Parquet Data\\StashDB\\stashdb_tags.json")

In [None]:
def _ids_of(*tags):
    """Collect the ``id`` of every tag that was found (``None`` entries are skipped)."""
    return [tag["id"] for tag in tags if tag is not None]


# Organisational tags, looked up by name. The first group is passed as
# `excludes` to the parent filter below; both groups are removed from the
# resulting frame by id.
excluded_parent_names = [
    "My Very Own Tags", "AI", "AI Group Makeup", "AI Scene Metadata",
    "Performer Metadata", "Marker Metadata", "Missing From Stashbox",
    "Custom Category", "Source", "Purchased",
]
single_tag_names = [
    "Category Groups", "StashDB Tags", "Locations: Generic",
    "Locations: Specific",
]
stash_tags = stash_client.get_tags_by_names(excluded_parent_names + single_tag_names)

excluded_ids = _ids_of(
    stash_tags.ai,
    stash_tags.my_very_own_tags,
    stash_tags.ai_group_makeup,
    stash_tags.ai_scene_metadata,
    stash_tags.performer_metadata,
    stash_tags.marker_metadata,
    stash_tags.missing_from_stashbox,
    stash_tags.custom_category,
    stash_tags.source,
    stash_tags.purchased,
)

single_ids = _ids_of(
    stash_tags.category_groups,
    stash_tags.stashdb_tags,
    stash_tags.locations_generic,
    stash_tags.locations_specific,
)

excluded_names = ["Custom Category: "]

# Pull tags (including their stash_ids) from Stash.
# NOTE(review): presumably the parent filter (empty INCLUDES, depth -1,
# excludes=excluded_ids) leaves out every descendant of the excluded tags —
# confirm against Stash's hierarchical filter semantics.
parent_filter = {
    "value": [],
    "modifier": "INCLUDES",
    "depth": -1,
    "excludes": excluded_ids,
}
tag_fragment = "id name aliases description scene_count parents { id name } stash_ids { endpoint stash_id }"
tags_from_stash = stash.find_tags({"parents": parent_filter}, fragment=tag_fragment)

# Drop the organisational tags themselves and derive each tag's StashDB id
# from its stash_ids list.
ids_to_drop = excluded_ids + single_ids
df_stash_tags = (
    pl.DataFrame(tags_from_stash)
    .filter(~pl.col("id").is_in(ids_to_drop))
    .with_columns(
        pl.col("stash_ids")
        .map_elements(extract_stashdb_id, return_dtype=pl.Utf8)
        .alias("stashdb_id")
    )
)
df_stash_tags

In [None]:
# Full join of the StashDB tags with the local Stash tags, keyed on the StashDB
# tag id (StashDB `id` against Stash `stashdb_id`). Stash-side columns whose
# names collide with StashDB columns receive the `_stash` suffix.
merged_df = df_stashdb_tags.join(
    df_stash_tags,
    how="full",
    left_on="id",
    right_on="stashdb_id",
    suffix="_stash",
)

# A row is "matching" when both sides are present; otherwise it belongs to
# exactly one side.
present_in_stashdb = pl.col("id").is_not_null()
present_in_stash = pl.col("id_stash").is_not_null()

matching_tags = merged_df.filter(present_in_stashdb & present_in_stash)
stashdb_only_tags = merged_df.filter(pl.col("id_stash").is_null())
stash_only_tags = merged_df.filter(pl.col("id").is_null())

# Summary counts
print(f"Total matching tags: {matching_tags.height}")
print(f"Tags only in StashDB: {stashdb_only_tags.height}")
print(f"Tags only in Stash: {stash_only_tags.height}")

merged_df

In [None]:
# Second pass: tags left unmatched by the id join are paired up by exact name.
unlinked_stash_ids = stash_only_tags["id_stash"].unique().to_list()
unlinked_stashdb_ids = stashdb_only_tags["id"].unique().to_list()

stash_only_df = df_stash_tags.filter(pl.col("id").is_in(unlinked_stash_ids))
stashdb_only_df = df_stashdb_tags.filter(pl.col("id").is_in(unlinked_stashdb_ids))

# Inner join on the tag name; Stash-side columns get the `_stash` suffix.
name_matches = stashdb_only_df.join(
    stash_only_df,
    how="inner",
    left_on="name",
    right_on="name",
    suffix="_stash",
)

print(f"Tags that match by name but are missing StashDB ID linking: {name_matches.height}")
name_matches

In [None]:
# Link every name-matched Stash tag to its StashDB tag and, when the matching
# "Category: <name>" tag exists locally, make that category its parent.
for row in name_matches.iter_rows(named=True):
    stash_tag_id = row["id_stash"]
    stashdb_id = row["id"]
    stashdb_category_name = row["category_name"]

    # Resolve the local category tag only when the StashDB tag has a category.
    category_tag = (
        stash.find_tag(f"Category: {stashdb_category_name}")
        if stashdb_category_name
        else None
    )

    # The update replaces the tag's stash_ids list, so carry over entries that
    # point at other stash-box endpoints; only the StashDB entry is (re)written.
    other_stash_ids = [
        {"endpoint": entry.get("endpoint"), "stash_id": entry.get("stash_id")}
        for entry in (row.get("stash_ids") or [])
        if entry.get("endpoint") != STASHDB_ENDPOINT
    ]

    updates = {
        "id": stash_tag_id,
        "stash_ids": [
            {"endpoint": STASHDB_ENDPOINT, "stash_id": stashdb_id},
            *other_stash_ids,
        ],
    }

    # Only touch the parents when a category tag was actually found.
    # NOTE(review): this replaces any parents the tag already has — confirm
    # that is intended.
    if category_tag:
        updates["parent_ids"] = [category_tag["id"]]

    stash.update_tag(updates)

    # Report the parent only when one was really set (previously this was
    # printed whenever the StashDB tag had a category, found or not).
    parent_info = f" (parent: Category: {stashdb_category_name})" if category_tag else ""
    print(f"Updated tag '{row['name']}' with StashDB ID {stashdb_id}{parent_info}")

print(f"\nCompleted updating {len(name_matches)} tags with StashDB ID linking and parent categories")

In [None]:
# Stash-only tags left for manual review: not a child of "My Very Own Tags" or
# "AI", and not one of the bookkeeping tags identified by name prefix/suffix.

# Parent ids to ignore, built once instead of per parent per row. Tags that
# were not found (None) are skipped, the same way `excluded_ids` is built,
# instead of raising a TypeError on `None["id"]`.
ignored_parent_ids = {
    tag["id"]
    for tag in (stash_tags.my_very_own_tags, stash_tags.ai)
    if tag is not None
}

df_stash_only_tags = (
    df_stash_tags.filter(
        pl.col("id").is_in(stash_only_tags.select("id_stash").unique().to_series().to_list())
    )
    .filter(
        # Keep the tag only when none of its parents is an ignored parent.
        pl.col("parents").map_elements(
            lambda parents: not any(
                parent.get("id") in ignored_parent_ids for parent in parents
            ),
            return_dtype=pl.Boolean,
        )
    )
    .filter(
        ~pl.col("name").str.starts_with("Category:")
        & ~pl.col("name").str.starts_with("Category Group:")
        & ~pl.col("name").str.starts_with("AI_")
        & ~pl.col("name").str.ends_with("_AI")
        & ~pl.col("name").str.starts_with("Data Quality Issue")
        & ~pl.col("name").str.starts_with("Duplicate")
        & ~pl.col("name").str.starts_with("Galleries")
        & ~pl.col("name").str.starts_with("Group Makeup")
    )
    .select("id", "name", "aliases", "scene_count")
)
df_stash_only_tags

# Check stash-only tags with StashDB IDs for deleted status

Tags that exist only in Stash but have a StashDB ID might have been deleted from StashDB. Query each one individually to check.

In [None]:
# Stash-only tags that nevertheless carry a StashDB id. "Category: X" tags are
# left out because they map to StashDB tag categories, not tags.
stash_only_id_list = stash_only_tags.select("id_stash").unique().to_series().to_list()
stash_only_with_stashdb_id = df_stash_tags.filter(
    pl.col("id").is_in(stash_only_id_list)
    & pl.col("stashdb_id").is_not_null()
    & ~pl.col("name").str.starts_with("Category: ")
)

# Look each tag up on StashDB individually and sort it into one of three buckets.
deleted_tags = []
not_found_tags = []
active_tags = []

for row in stash_only_with_stashdb_id.iter_rows(named=True):
    stashdb_tag = stashbox_client.query_tag(row["stashdb_id"])
    if stashdb_tag is None:
        not_found_tags.append(row)
        continue

    # Found on StashDB: keep the remote record alongside the local row.
    bucket = deleted_tags if stashdb_tag.get("deleted") else active_tags
    bucket.append({**row, "stashdb_tag": stashdb_tag})

print(f"Tags with StashDB ID in stash-only (excluding categories): {len(stash_only_with_stashdb_id)}")
print(f"  - Deleted from StashDB: {len(deleted_tags)}")
print(f"  - Not found in StashDB: {len(not_found_tags)}")
print(f"  - Still active in StashDB: {len(active_tags)}")

if len(deleted_tags) > 0:
    print("\nDeleted tags:")
    for tag in deleted_tags:
        print(f"  - {tag['name']} (stash id: {tag['id']}, stashdb id: {tag['stashdb_id']}, scenes: {tag['scene_count']})")

In [None]:
# Rows for which `stashbox_client.query_tag` returned None in the check above.
not_found_tags

# Delete tags which originated from StashDB but have no scenes

In [None]:
# Zero-scene Stash tags that carry a StashDB id but matched no row of the
# StashDB tag list in the join; "Category: " tags are left out.
has_stashdb_id = pl.col("stashdb_id").is_not_null()  # has a StashDB entry in stash_ids
absent_from_stashdb = pl.col("id").is_null()  # no StashDB row joined to it
is_category_tag = pl.col("name_stash").str.starts_with("Category: ")

unused_deleted_tags_via_stashdb = merged_df.filter(
    has_stashdb_id
    & (pl.col("scene_count") == 0)
    & absent_from_stashdb
    & ~is_category_tag
)
unused_deleted_tags_via_stashdb

In [None]:
# Delete the zero-scene tags selected in `unused_deleted_tags_via_stashdb`.
tag_ids = sorted(unused_deleted_tags_via_stashdb.select("id_stash").unique().to_series().to_list())

# Only a cell's last expression is displayed, so a bare `tag_ids` in the middle
# of the cell shows nothing; print the ids explicitly before deleting.
print(f"Deleting {len(tag_ids)} unused tags: {tag_ids}")

# Skip the API call when there is nothing to delete.
if tag_ids:
    stash.destroy_tags(tag_ids)

# Confirm before deleting tags which originated from StashDB and have scenes

In [None]:
# Stash tags that still have scenes, carry a StashDB id, but matched no row of
# the StashDB tag list in the join; "Category: " tags are left out.
has_stashdb_id = pl.col("stashdb_id").is_not_null()  # has a StashDB entry in stash_ids
absent_from_stashdb = pl.col("id").is_null()  # no StashDB row joined to it
is_category_tag = pl.col("name_stash").str.starts_with("Category: ")

used_deleted_tags_via_stashdb = merged_df.filter(
    has_stashdb_id
    & (pl.col("scene_count") > 0)
    & absent_from_stashdb
    & ~is_category_tag
)
used_deleted_tags_via_stashdb

In [None]:
# Delete the tags selected in `used_deleted_tags_via_stashdb`. These tags are
# still attached to scenes, so the deletion only happens after an explicit
# confirmation instead of running unconditionally (e.g. during "Run All").
tag_ids = sorted(used_deleted_tags_via_stashdb.select("id_stash").unique().to_series().to_list())

# Only a cell's last expression is displayed, so print the ids explicitly.
print(f"{len(tag_ids)} tags that still have scenes are selected for deletion: {tag_ids}")

if not tag_ids:
    print("Nothing to delete")
elif input("Type 'yes' to delete these tags: ").strip().lower() == "yes":
    stash.destroy_tags(tag_ids)
    print(f"Deleted {len(tag_ids)} tags")
else:
    print("Deletion skipped")

# Category groups

In [None]:
# Distinct, non-null category groups referenced by the StashDB tags.
category_groups = df_stashdb_tags["category_group"].drop_nulls().unique().to_list()

# Make sure a "Category Group: <group>" tag exists in Stash for each of them,
# reporting what was done along the way.
print("Unique category groups in StashDB:")
for group in sorted(category_groups):
    print(f"- {group}")
    tag_name = f"Category Group: {group}"

    if stash.find_tag(tag_name) is not None:
        print(f"Tag already exists: {tag_name}")
        continue

    new_tag = {
        "name": tag_name,
        "description": f"StashDB category group: {group}",
    }
    stash.create_tag(new_tag)
    print(f"Created tag: {tag_name}")

In [None]:
# Distinct categories referenced by the StashDB tags. Only rows without a
# category id/name are dropped: a category with a missing description or group
# is still a category (the 'N/A' fallbacks below exist for exactly that case),
# whereas a bare drop_nulls() would silently skip it.
unique_categories = (
    df_stashdb_tags.select(
        ["category_id", "category_name", "category_group", "category_description"]
    )
    .drop_nulls(subset=["category_id", "category_name"])
    .unique()
)

# Display the unique categories
print("Unique categories in StashDB:")
for category in unique_categories.iter_rows(named=True):
    print(f"- Name: {category['category_name'] or 'N/A'}")
    print(f"  ID: {category['category_id']}")
    print(f"  Group: {category['category_group'] or 'N/A'}")
    print(f"  Description: {category['category_description'] or 'N/A'}")
    print()

# Ensure a "Category: <name>" tag exists in Stash for every category and that
# it carries the StashDB category id in its stash_ids.
for category in unique_categories.iter_rows(named=True):
    name = category["category_name"]
    group = category["category_group"]
    category_stash_ids = [
        {"endpoint": STASHDB_ENDPOINT, "stash_id": category["category_id"]}
    ]

    category_tag = stash.find_tag(f"Category: {name}")
    if category_tag is None:
        # Parent the new tag under its "Category Group: <group>" tag when the
        # category has a group and that tag exists.
        category_group_tag = (
            stash.find_tag(f"Category Group: {group}") if group else None
        )

        # Set stash_ids at creation time so a second run is not needed just to
        # link the freshly created tag.
        category_tag = stash.create_tag(
            {
                "name": f"Category: {name}",
                "description": f"StashDB category: {name}",
                "parent_ids": (
                    [category_group_tag["id"]] if category_group_tag else None
                ),
                "stash_ids": category_stash_ids,
            }
        )
        print(f"Created category tag: {name}")
    else:
        # Existing tag: (re)write its stash_ids link.
        stash.update_tag({
            "id": category_tag["id"],
            "stash_ids": category_stash_ids,
        })
        print(f"Updated category tag: {name}")

# Update tag names, aliases and descriptions

In [None]:
# Ad-hoc spot check: show the merged row(s) whose StashDB-side `name` is this tag.
merged_df.filter(pl.col("name") == "Strap-on Blowjob")

In [None]:
# Bring name, description and aliases of every linked Stash tag in line with
# its StashDB counterpart, printing the before/after values of what changes.
linked_rows = merged_df.filter(
    pl.col("id").is_not_null() & pl.col("id_stash").is_not_null()
)

for row in linked_rows.iter_rows(named=True):
    # Each entry: (field, current Stash value, desired StashDB value).
    changes = []

    stashdb_name = row["name"]
    stash_name = row["name_stash"]
    if stashdb_name and stash_name and stashdb_name != stash_name:
        changes.append(("name", stash_name, stashdb_name))

    # Missing descriptions are compared as empty strings.
    stashdb_description = row["description"] or ""
    stash_description = row["description_stash"] or ""
    if stashdb_description != stash_description:
        changes.append(("description", stash_description, stashdb_description))

    # StashDB aliases containing CJK characters are not carried over; the
    # comparison ignores order and duplicates.
    stashdb_aliases = [
        alias for alias in (row["aliases"] or []) if not contains_cjk(alias)
    ]
    stash_aliases = row["aliases_stash"] or []
    if set(stashdb_aliases) != set(stash_aliases):
        changes.append(("aliases", stash_aliases, stashdb_aliases))

    if not changes:
        continue

    originals = {field: current for field, current, _ in changes}
    updates = {field: desired for field, _, desired in changes}

    print(row["id_stash"] + ": " + row["name_stash"])
    print(originals)
    print(updates)
    print()

    # The id is added after printing so the log shows only the changed fields.
    updates["id"] = row["id_stash"]
    stash.update_tag(updates)