In [None]:
import polars as pl
import dotenv
import os
import sys

sys.path.append(os.path.dirname(os.path.abspath("")))

from libraries.client_stashapp import get_stashapp_client, StashAppClient
from libraries.StashDbClient import StashDbClient


# Prefix that marks a Stash tag alias as carrying the tag's StashDB ID
stashdb_id_alias_prefix = "StashDB ID: "


def format_stashdb_id(id):
    """Format a StashDB ID for use as an alias in Stash.

    Returns the alias prefix immediately followed by ``id``.
    """
    return "{}{}".format(stashdb_id_alias_prefix, id)


def contains_cjk(text):
    """Return True if any character of ``text`` lies in a listed CJK block.

    Only the Unicode ranges enumerated below are checked; text made up
    entirely of other characters, including an empty string, yields False.
    """
    # Inclusive (first, last) code points of each recognised block
    cjk_blocks = (
        (0x4E00, 0x9FFF),  # CJK Unified Ideographs
        (0x3040, 0x309F),  # Hiragana
        (0x30A0, 0x30FF),  # Katakana
        (0x3400, 0x4DBF),  # CJK Unified Ideographs Extension A
        (0xF900, 0xFAFF),  # CJK Compatibility Ideographs
        (0xAC00, 0xD7AF),  # Korean Hangul Syllables
    )

    for character in text:
        code_point = ord(character)
        for first, last in cjk_blocks:
            if first <= code_point <= last:
                return True
    return False


# Load variables from a .env file into the process environment so that the
# os.getenv() lookups below can see them. This must run before the clients
# are created.
dotenv.load_dotenv()

# Two handles created from the project's client_stashapp module: the object
# returned by get_stashapp_client() and a StashAppClient instance.
stash = get_stashapp_client()
stash_client = StashAppClient()

# StashDB client configured from STASHDB_ENDPOINT / STASHDB_API_KEY.
# NOTE(review): os.getenv() yields None for an unset variable - confirm that
# StashDbClient rejects that up front rather than failing on the first query.
stashbox_client = StashDbClient(
    os.getenv("STASHDB_ENDPOINT"),
    os.getenv("STASHDB_API_KEY"),
)

In [None]:
# Query tags through the StashDB client; the result is turned into a polars
# DataFrame in the next cell.
stashdb_tags = stashbox_client.query_tags()

In [None]:
# Flatten the nested "category" value of every StashDB tag into four string
# columns (category_id, category_name, category_description, category_group)
# and drop the nested column afterwards. Tags without a category get nulls.
df_stashdb_tags = (
    pl.DataFrame(stashdb_tags)
    .with_columns(
        [
            pl.col("category")
            .map_elements(
                # key=key binds the current field name to this lambda
                lambda category, key=key: category[key] if category else None,
                return_dtype=pl.Utf8,
            )
            .alias(f"category_{key}")
            for key in ("id", "name", "description", "group")
        ]
    )
    .drop("category")
)

df_stashdb_tags

In [None]:
# Dump the flattened StashDB tags to a JSON file.
# NOTE(review): the destination is a hard-coded absolute path on drive H:, so
# this cell only works on a machine that has that drive and folder - consider
# making the location configurable.
df_stashdb_tags.write_json("H:\\Parquet Data\\StashDB\\stashdb_tags.json")

In [None]:
def _stashdb_id_from_aliases(aliases):
    """Return the StashDB ID carried by the first prefixed alias, or None."""
    for alias in aliases:
        if isinstance(alias, str) and alias.startswith(stashdb_id_alias_prefix):
            # Strip the prefix, keeping only the ID that follows it
            return alias[len(stashdb_id_alias_prefix) :]
    return None


# Fetch the tags from Stash and derive a "stashdb_id" column from their aliases
stash_tags = stash.find_tags()
df_stash_tags = pl.DataFrame(stash_tags).with_columns(
    pl.col("aliases")
    .map_elements(_stashdb_id_from_aliases, return_dtype=pl.Utf8)
    .alias("stashdb_id")
)
df_stash_tags

In [None]:
# Full join of the StashDB tags with the Stash tags: a Stash tag matches a
# StashDB tag when its "stashdb_id" (taken from the alias) equals the StashDB
# tag's "id". Stash columns whose names clash with StashDB columns get a
# "_stash" suffix, so "id" is the StashDB ID and "id_stash" the Stash tag ID.
merged_df = df_stashdb_tags.join(
    df_stash_tags,
    how="full",
    left_on="id",
    right_on="stashdb_id",
    suffix="_stash",
)

# A null "id" means the row has no StashDB tag; a null "id_stash" means it has
# no Stash tag.
in_stashdb = pl.col("id").is_not_null()
in_stash = pl.col("id_stash").is_not_null()

matching_tags = merged_df.filter(in_stashdb & in_stash)
stashdb_only_tags = merged_df.filter(~in_stash)
stash_only_tags = merged_df.filter(~in_stashdb)

# Summarise how the two tag sets overlap
print(f"Total matching tags: {matching_tags.height}")
print(f"Tags only in StashDB: {stashdb_only_tags.height}")
print(f"Tags only in Stash: {stash_only_tags.height}")

merged_df

In [None]:
# Parent tag named "My Very Own Tags"; Stash-only tags filed under it are left
# out of the report below.
# NOTE(review): assumes the lookup succeeds - if find_tag() returns nothing,
# the parent check fails when it subscripts the result.
my_very_own_tags_parent_tag = stash.find_tag({"name": "My Very Own Tags"})


def _outside_my_very_own_tags(parents):
    """Return True when none of ``parents`` is the "My Very Own Tags" tag."""
    return all(
        parent.get("id") != my_very_own_tags_parent_tag["id"] for parent in parents
    )


# Stash tag IDs of the rows that had no StashDB counterpart in the join
stash_only_tag_ids = stash_only_tags.select("id_stash").unique().to_series().to_list()

# Tags whose name matches any of these patterns are excluded from the report
tag_name = pl.col("name")
has_excluded_name = (
    tag_name.str.starts_with("Category:")
    | tag_name.str.starts_with("Category Group:")
    | tag_name.str.starts_with("AI_")
    | tag_name.str.ends_with("_AI")
    | tag_name.str.starts_with("Data Quality Issue")
    | tag_name.str.starts_with("Duplicate")
    | tag_name.str.starts_with("Galleries")
    | tag_name.str.starts_with("Group Makeup")
)

df_stash_only_tags = (
    df_stash_tags.filter(pl.col("id").is_in(stash_only_tag_ids))
    .filter(
        pl.col("parents").map_elements(
            _outside_my_very_own_tags, return_dtype=pl.Boolean
        )
    )
    .filter(~has_excluded_name)
    .select("id", "name", "aliases", "scene_count")
)
df_stash_only_tags

# Delete tags that originated from StashDB, are marked as deleted there, and have no scenes

In [None]:
# Tags whose Stash entry carries a StashDB ID alias, whose "deleted" column
# (from the StashDB side of the join) is true, and which are not used by any
# scene. explode() yields one row per Stash alias so the prefix test is applied
# to each alias individually; the shared prefix constant keeps this in sync
# with the alias parsing done when df_stash_tags was built.
unused_deleted_tags_via_stashdb = merged_df.explode("aliases_stash").filter(
    pl.col("aliases_stash").str.starts_with(stashdb_id_alias_prefix),
    pl.col("scene_count") == 0,
    pl.col("deleted") == True,
)
unused_deleted_tags_via_stashdb

In [None]:
# Collect the Stash-side IDs of the unused tags selected in the previous cell.
# In merged_df the "id" column is the StashDB ID; the Stash tag ID that
# destroy_tags() operates on is "id_stash".
tag_ids = sorted(
    unused_deleted_tags_via_stashdb.select("id_stash").unique().to_series().to_list()
)
# Show the count explicitly: a bare expression in the middle of a cell is not displayed
print(f"Deleting {len(tag_ids)} unused tags that are marked as deleted on StashDB")

stash.destroy_tags(tag_ids)

# Review and confirm before deleting tags that originated from StashDB, are marked as deleted there, and still have scenes

In [None]:
# Tags whose Stash entry carries a StashDB ID alias, whose "deleted" column
# (from the StashDB side of the join) is true, and which are still attached to
# at least one scene. explode() yields one row per Stash alias so the prefix
# test is applied to each alias individually; the shared prefix constant keeps
# this in sync with the alias parsing done when df_stash_tags was built.
used_deleted_tags_via_stashdb = merged_df.explode("aliases_stash").filter(
    pl.col("aliases_stash").str.starts_with(stashdb_id_alias_prefix),
    pl.col("scene_count") > 0,
    pl.col("deleted") == True,
)
used_deleted_tags_via_stashdb

In [None]:
# Collect the Stash-side IDs of the in-use tags selected in the previous cell.
# In merged_df the "id" column is the StashDB ID; the Stash tag ID that
# destroy_tags() operates on is "id_stash".
# NOTE(review): these tags are still attached to scenes and this cell deletes
# them without prompting - inspect used_deleted_tags_via_stashdb before running.
tag_ids = sorted(
    used_deleted_tags_via_stashdb.select("id_stash").unique().to_series().to_list()
)
stash.destroy_tags(tag_ids)