In [1]:
import polars as pl
import dotenv
import os
import sys

sys.path.append(os.path.dirname(os.path.abspath("")))

from libraries.client_stashapp import get_stashapp_client, StashAppClient
from libraries.StashDbClient import StashDbClient


# Marker that prefixes the Stash alias used to carry a tag's StashDB ID.
stashdb_id_alias_prefix = "StashDB ID: "


def format_stashdb_id(id):
    """Format a StashDB ID for use as an alias in Stash.

    Returns the shared alias prefix immediately followed by ``id``.
    """
    return "{}{}".format(stashdb_id_alias_prefix, id)


def contains_cjk(text):
    """Check if text contains CJK (Chinese, Japanese, Korean) characters.

    Args:
        text: The string to scan.

    Returns:
        True if at least one character's code point falls inside one of the
        Unicode ranges listed below; False otherwise (including for an empty
        string).
    """
    # Inclusive (first, last) code point ranges, sorted by start.
    cjk_ranges = (
        (0x1100, 0x11FF),  # Hangul Jamo
        (0x3040, 0x309F),  # Hiragana
        (0x30A0, 0x30FF),  # Katakana
        (0x3130, 0x318F),  # Hangul Compatibility Jamo
        (0x31F0, 0x31FF),  # Katakana Phonetic Extensions
        (0x3400, 0x4DBF),  # CJK Unified Ideographs Extension A
        (0x4E00, 0x9FFF),  # CJK Unified Ideographs
        (0xAC00, 0xD7AF),  # Korean Hangul Syllables
        (0xF900, 0xFAFF),  # CJK Compatibility Ideographs
        (0xFF66, 0xFF9F),  # Halfwidth Katakana
        (0x20000, 0x2EBEF),  # CJK Unified Ideographs Extensions B-F
        (0x2F800, 0x2FA1F),  # CJK Compatibility Ideographs Supplement
    )

    # Convert each character to its code point once, then test it against every range.
    for code_point in map(ord, text):
        if any(first <= code_point <= last for first, last in cjk_ranges):
            return True
    return False


# Load variables from a .env file (if any) before the environment is read below.
dotenv.load_dotenv()

# Two handles for the Stash side: the client returned by the project helper,
# and an instance of the project's StashAppClient wrapper.
stash = get_stashapp_client()
stash_client = StashAppClient()

# StashDB client built from STASHDB_ENDPOINT / STASHDB_API_KEY.
# Either value is None when the corresponding variable is not set.
stashbox_client = StashDbClient(
    os.environ.get("STASHDB_ENDPOINT"),
    os.environ.get("STASHDB_API_KEY"),
)

dUsing stash (v0.28.1-89-g642b0f22) endpoint at http://localhost:6969/graphql
dUsing stash (v0.28.1-89-g642b0f22) endpoint at http://localhost:6969/graphql


In [2]:
# Fetch the tags from StashDB through the client created in the setup cell.
# NOTE(review): presumably a list of dicts, each with a nested "category" dict
# (id / name / description / group) -- inferred from how the next cell consumes
# the result; confirm against StashDbClient.query_tags.
stashdb_tags = stashbox_client.query_tags()

In [57]:
# Build a DataFrame from the raw StashDB tags and flatten the nested "category"
# struct into one string column per field (category_id, category_name,
# category_description, category_group), then drop the struct itself.
#
# Native struct field access replaces four copy-pasted per-row Python lambdas:
# it yields null for a missing category just like the `if x else None` guard did,
# and the cast keeps the columns as Utf8 as the original `return_dtype` enforced.
category_fields = ("id", "name", "description", "group")

df_stashdb_tags = (
    pl.DataFrame(stashdb_tags)
    .with_columns(
        [
            pl.col("category")
            .struct.field(field)
            .cast(pl.Utf8)
            .alias(f"category_{field}")
            for field in category_fields
        ]
    )
    .drop("category")
)

df_stashdb_tags

id,name,description,aliases,deleted,created,updated,category_id,category_name,category_description,category_group
str,str,str,list[str],bool,str,str,str,str,str,str
"""9441c3ad-41d2-4d6e-bc97-54ad8c…","""120 FPS""","""Scenes offered at 120 frames p…","[""120帧""]",false,"""2022-04-05T20:28:06Z""","""2024-02-17T18:36:12.991842Z""","""ef4ae6d1-d13c-4195-b47e-f245e4…","""Shot Type""","""Technical details of how a vid…","""SCENE"""
"""42d9e5c4-1a1d-4c93-bf47-9086f2…","""12K Available""","""Scenes offered in a resolution…","[""12K"", ""12K Shemale VR Porn"", … ""True 12K""]",false,"""2024-12-03T05:31:48.278753Z""","""2024-12-03T05:31:48.278753Z""","""7f4ddc1b-8169-4d5b-b764-04ad07…","""Misc""","""Information about the video it…","""SCENE"""
"""8534d108-1f4c-42f9-8caa-5ca906…","""18+""","""Primary performer (not charact…","[""18 Plus"", ""Over 18""]",false,"""2024-03-30T04:09:32.347616Z""","""2024-03-30T04:09:32.347616Z""","""b40e08dd-314e-40ca-8fdb-bf7541…","""Age Group""","""Implied age ranges for charact…","""PEOPLE"""
"""103a1f16-83e1-4b9f-ab14-e85e04…","""180°""","""Virtual reality scenes with a …","[""180"", ""180 FOV"", … ""VR180""]",false,"""2020-04-27T18:59:52Z""","""2023-05-25T09:25:21.314083Z""","""ef4ae6d1-d13c-4195-b47e-f245e4…","""Shot Type""","""Technical details of how a vid…","""SCENE"""
"""6cd87d98-eea8-4b97-9db9-aa38a9…","""1800s""","""Inspired by the history and cu…","[""1800's"", ""19th Century"", … ""Victorian""]",false,"""2024-02-15T10:25:01.839985Z""","""2024-02-15T10:25:01.839985Z""","""0319d5d6-a07f-4e0d-809d-c09fb1…","""Themes""","""Events, contexts, or fetishes …","""SCENE"""
…,…,…,…,…,…,…,…,…,…,…
"""e7f1f848-4350-4bda-925c-b01235…","""Young Man (22–30)""","""Male presented as generally yo…","[""Young Guy"", ""Young Male"", … ""青年男子 (22–30)""]",false,"""2020-04-27T18:59:52Z""","""2024-11-12T06:27:11.374593Z""","""b40e08dd-314e-40ca-8fdb-bf7541…","""Age Group""","""Implied age ranges for charact…","""PEOPLE"""
"""84ba8ef1-084c-46f8-b352-31154f…","""Young Woman (22–30)""","""Female character presented as …","[""Chick"", ""Woman (20-29)"", … ""Youthful Woman""]",false,"""2020-04-27T18:59:52Z""","""2024-11-14T00:15:52.456833Z""","""b40e08dd-314e-40ca-8fdb-bf7541…","""Age Group""","""Implied age ranges for charact…","""PEOPLE"""
"""6c0a2824-acd2-4b64-9a2c-634bd9…","""Zentai""","""Skin-tight garment that covers…","[""Zentai Suit""]",false,"""2022-07-10T22:17:47.537338Z""","""2022-07-10T22:17:47.537338Z""","""dc566ccc-0584-41d8-b9f5-4d8680…","""Clothing""","""Articles or styles of clothing…","""PEOPLE"""
"""bed78871-9bb8-40c2-97b1-347c43…","""Zip Front Dress""","""A dress where the zipper runs …","[""Zipper Dress""]",false,"""2023-10-23T23:37:34.546141Z""","""2023-10-23T23:37:34.546141Z""","""dc566ccc-0584-41d8-b9f5-4d8680…","""Clothing""","""Articles or styles of clothing…","""PEOPLE"""


In [58]:
# Persist the flattened StashDB tags as JSON.
# The destination defaults to the original location; set STASHDB_TAGS_JSON_PATH
# (for example in the .env file loaded by the setup cell) to write somewhere else
# without editing the notebook.
stashdb_tags_json_path = os.getenv(
    "STASHDB_TAGS_JSON_PATH", "H:\\Parquet Data\\StashDB\\stashdb_tags.json"
)
df_stashdb_tags.write_json(stashdb_tags_json_path)

In [59]:
def _stashdb_id_from_aliases(aliases):
    """Return the StashDB ID carried by the first prefixed alias, or None.

    Only string aliases that start with ``stashdb_id_alias_prefix`` are
    considered; the prefix is stripped from the returned value.
    """
    for alias in aliases:
        if isinstance(alias, str) and alias.startswith(stashdb_id_alias_prefix):
            return alias[len(stashdb_id_alias_prefix) :]
    return None


# Get tags from Stash and derive a `stashdb_id` column from each tag's aliases.
stash_tags = stash.find_tags()
df_stash_tags = pl.DataFrame(stash_tags).with_columns(
    pl.col("aliases")
    .map_elements(_stashdb_id_from_aliases, return_dtype=pl.Utf8)
    .alias("stashdb_id")
)
df_stash_tags

id,name,sort_name,description,aliases,ignore_auto_tag,created_at,updated_at,favorite,image_path,scene_count,scene_marker_count,image_count,gallery_count,performer_count,studio_count,group_count,parents,children,parent_count,child_count,stashdb_id
str,str,str,str,list[str],bool,str,str,bool,str,i64,i64,i64,i64,i64,i64,i64,list[struct[1]],list[struct[1]],i64,i64,str
"""5045""","""2D Available""","""""","""3D or VR scenes that offer a m…","[""2-D"", ""2D"", … ""Two-Dimensional""]",false,"""2024-04-23T12:50:49Z""","""2025-01-26T10:32:59+02:00""",false,"""http://localhost:6969/tag/5045…",2,0,0,0,0,0,0,"[{""7752""}]",[],1,0,"""1257be8b-d1ec-4cb1-bb22-beeb89…"
"""5049""","""3D Available""","""""","""Offered in a format with a thr…","[""3-D"", ""3D"", … ""Three-Dimensional""]",false,"""2024-04-23T12:50:49Z""","""2025-01-26T10:32:59+02:00""",false,"""http://localhost:6969/tag/5049…",66,0,0,0,0,0,0,"[{""7752""}]",[],1,0,"""52992c2c-4617-4540-8ca4-291e9c…"
"""5050""","""3K Available""","""""","""Scenes offered in a resolution…","[""1600p"", ""3K VP9"", … ""StashDB ID: c3794d99-1b5b-47b3-86f7-75ff2de748b8""]",false,"""2024-04-23T12:50:49Z""","""2025-01-26T10:32:59+02:00""",false,"""http://localhost:6969/tag/5050…",14,0,0,0,0,0,0,"[{""7752""}]",[],1,0,"""c3794d99-1b5b-47b3-86f7-75ff2d…"
"""5051""","""3rd Person Narrative""","""""","""Features a storyline with fict…","[""3rd Person Perspective"", ""StashDB ID: f562975c-e209-464c-83ed-8ac18eb3a2e8"", ""Third Person Perspective""]",false,"""2024-04-23T12:50:49Z""","""2025-01-26T10:32:59+02:00""",false,"""http://localhost:6969/tag/5051…",65,0,0,15,0,0,0,"[{""7751""}]",[],1,0,"""f562975c-e209-464c-83ed-8ac18e…"
"""5053""","""4:3 Aspect Ratio""","""""","""Footage shot in a 4:3 (1.33:1)…","[""1.33:1"", ""1.33:1 Aspect Ratio"", … ""StashDB ID: 6958c8ed-1948-46d2-89e0-cb48919bf8f1""]",false,"""2024-04-23T12:50:49Z""","""2025-01-26T10:32:59+02:00""",false,"""http://localhost:6969/tag/5053…",0,0,0,0,0,0,0,"[{""7749""}]",[],1,0,"""6958c8ed-1948-46d2-89e0-cb4891…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""7559""","""Young Man (22–30)""","""""","""Male presented as generally yo…","[""StashDB ID: e7f1f848-4350-4bda-925c-b0123521b4de"", ""Young Guy"", … ""Youthful Man""]",false,"""2024-04-23T12:52:46Z""","""2025-01-26T10:33:27+02:00""",false,"""http://localhost:6969/tag/7559…",129,0,0,2,0,0,0,"[{""7750""}]",[],1,0,"""e7f1f848-4350-4bda-925c-b01235…"
"""7560""","""Young Woman (22–30)""","""""","""Female character presented as …","[""Chick"", ""StashDB ID: 84ba8ef1-084c-46f8-b352-31154f5bfbbc"", … ""Youthful Woman""]",false,"""2024-04-23T12:52:46Z""","""2025-01-26T10:33:27+02:00""",false,"""http://localhost:6969/tag/7560…",1216,0,0,48,0,0,0,"[{""7750""}]",[],1,0,"""84ba8ef1-084c-46f8-b352-31154f…"
"""7563""","""Zentai""","""""","""Skin-tight garment that covers…","[""StashDB ID: 6c0a2824-acd2-4b64-9a2c-634bd9e4d0d0"", ""Zentai Suit""]",false,"""2024-04-23T12:52:46Z""","""2025-01-26T10:33:27+02:00""",false,"""http://localhost:6969/tag/7563…",0,0,0,0,0,0,0,"[{""7762""}]",[],1,0,"""6c0a2824-acd2-4b64-9a2c-634bd9…"
"""7564""","""Zip Front Dress""","""""","""A dress where the zipper runs …","[""StashDB ID: bed78871-9bb8-40c2-97b1-347c43ca7113"", ""Zipper Dress""]",false,"""2024-04-23T12:52:46Z""","""2025-01-26T10:33:27+02:00""",false,"""http://localhost:6969/tag/7564…",0,0,0,0,0,0,0,"[{""7762""}]",[],1,0,"""bed78871-9bb8-40c2-97b1-347c43…"


In [60]:
# Full outer join of StashDB tags with Stash tags, keyed on the StashDB tag id
# (left: `id`, right: the `stashdb_id` extracted from the Stash aliases).
# Right-hand columns that clash with left-hand names get the "_stash" suffix.
merged_df = df_stashdb_tags.join(
    df_stash_tags,
    left_on="id",
    right_on="stashdb_id",
    how="full",
    suffix="_stash",
)

# A row has a StashDB side when `id` is set and a Stash side when `id_stash` is set.
in_stashdb = pl.col("id").is_not_null()
in_stash = pl.col("id_stash").is_not_null()

# Partition the joined rows into matched / StashDB-only / Stash-only.
matching_tags = merged_df.filter(in_stashdb & in_stash)
stashdb_only_tags = merged_df.filter(~in_stash)
stash_only_tags = merged_df.filter(~in_stashdb)

# Summary counts
print(f"Total matching tags: {matching_tags.height}")
print(f"Tags only in StashDB: {stashdb_only_tags.height}")
print(f"Tags only in Stash: {stash_only_tags.height}")

merged_df

Total matching tags: 2773
Tags only in StashDB: 77
Tags only in Stash: 478


id,name,description,aliases,deleted,created,updated,category_id,category_name,category_description,category_group,id_stash,name_stash,sort_name,description_stash,aliases_stash,ignore_auto_tag,created_at,updated_at,favorite,image_path,scene_count,scene_marker_count,image_count,gallery_count,performer_count,studio_count,group_count,parents,children,parent_count,child_count,stashdb_id
str,str,str,list[str],bool,str,str,str,str,str,str,str,str,str,str,list[str],bool,str,str,bool,str,i64,i64,i64,i64,i64,i64,i64,list[struct[1]],list[struct[1]],i64,i64,str
"""1257be8b-d1ec-4cb1-bb22-beeb89…","""2D Available""","""3D or VR scenes that offer a m…","[""2-D"", ""2D"", … ""Two-Dimensional""]",false,"""2020-05-01T09:37:09Z""","""2022-02-22T21:51:53Z""","""7f4ddc1b-8169-4d5b-b764-04ad07…","""Misc""","""Information about the video it…","""SCENE""","""5045""","""2D Available""","""""","""3D or VR scenes that offer a m…","[""2-D"", ""2D"", … ""Two-Dimensional""]",false,"""2024-04-23T12:50:49Z""","""2025-01-26T10:32:59+02:00""",false,"""http://localhost:6969/tag/5045…",2,0,0,0,0,0,0,"[{""7752""}]",[],1,0,"""1257be8b-d1ec-4cb1-bb22-beeb89…"
"""52992c2c-4617-4540-8ca4-291e9c…","""3D Available""","""Offered in a format with a thr…","[""3-D"", ""3D"", … ""Three-Dimensional""]",false,"""2020-04-27T18:59:52Z""","""2022-02-22T21:52:15Z""","""7f4ddc1b-8169-4d5b-b764-04ad07…","""Misc""","""Information about the video it…","""SCENE""","""5049""","""3D Available""","""""","""Offered in a format with a thr…","[""3-D"", ""3D"", … ""Three-Dimensional""]",false,"""2024-04-23T12:50:49Z""","""2025-01-26T10:32:59+02:00""",false,"""http://localhost:6969/tag/5049…",66,0,0,0,0,0,0,"[{""7752""}]",[],1,0,"""52992c2c-4617-4540-8ca4-291e9c…"
"""c3794d99-1b5b-47b3-86f7-75ff2d…","""3K Available""","""Scenes offered in a resolution…","[""1600p"", ""3K VP9"", … ""3KVR""]",false,"""2023-02-06T23:54:49.304855Z""","""2023-02-06T23:54:49.304855Z""","""7f4ddc1b-8169-4d5b-b764-04ad07…","""Misc""","""Information about the video it…","""SCENE""","""5050""","""3K Available""","""""","""Scenes offered in a resolution…","[""1600p"", ""3K VP9"", … ""StashDB ID: c3794d99-1b5b-47b3-86f7-75ff2de748b8""]",false,"""2024-04-23T12:50:49Z""","""2025-01-26T10:32:59+02:00""",false,"""http://localhost:6969/tag/5050…",14,0,0,0,0,0,0,"[{""7752""}]",[],1,0,"""c3794d99-1b5b-47b3-86f7-75ff2d…"
"""f562975c-e209-464c-83ed-8ac18e…","""3rd Person Narrative""","""Features a storyline with fict…","[""3rd Person Perspective"", ""Third Person Perspective"", ""第三者撮り""]",false,"""2021-02-02T16:02:36Z""","""2024-12-01T16:57:51.952057Z""","""0319d5d6-a07f-4e0d-809d-c09fb1…","""Themes""","""Events, contexts, or fetishes …","""SCENE""","""5051""","""3rd Person Narrative""","""""","""Features a storyline with fict…","[""3rd Person Perspective"", ""StashDB ID: f562975c-e209-464c-83ed-8ac18eb3a2e8"", ""Third Person Perspective""]",false,"""2024-04-23T12:50:49Z""","""2025-01-26T10:32:59+02:00""",false,"""http://localhost:6969/tag/5051…",65,0,0,15,0,0,0,"[{""7751""}]",[],1,0,"""f562975c-e209-464c-83ed-8ac18e…"
"""6958c8ed-1948-46d2-89e0-cb4891…","""4:3 Aspect Ratio""","""Footage shot in a 4:3 (1.33:1)…","[""1.33:1"", ""1.33:1 Aspect Ratio"", … ""Fullscreen""]",false,"""2022-08-08T00:33:32.647805Z""","""2022-08-26T06:14:19.530426Z""","""ef4ae6d1-d13c-4195-b47e-f245e4…","""Shot Type""","""Technical details of how a vid…","""SCENE""","""5053""","""4:3 Aspect Ratio""","""""","""Footage shot in a 4:3 (1.33:1)…","[""1.33:1"", ""1.33:1 Aspect Ratio"", … ""StashDB ID: 6958c8ed-1948-46d2-89e0-cb48919bf8f1""]",false,"""2024-04-23T12:50:49Z""","""2025-01-26T10:32:59+02:00""",false,"""http://localhost:6969/tag/5053…",0,0,0,0,0,0,0,"[{""7749""}]",[],1,0,"""6958c8ed-1948-46d2-89e0-cb4891…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""c17d8324-03c4-46ab-a0e8-a4a7b0…","""Blowjob Machine""","""A fucking machine modified to …","[""automatic blowjob machine"", ""blow jobs machine""]",false,"""2025-05-06T22:21:56.270533Z""","""2025-05-06T22:21:56.270533Z""","""bc5f4d70-c5f3-4cd6-bc60-d44700…","""Accessories""","""Notable devices or materials d…","""ACTION""",,,,,,,,,,,,,,,,,,,,,,
"""d0503307-4f04-41f3-bb52-0a288f…","""Cum in Chastity""","""Cuming while wearing a chastit…","[""Cum in Cage""]",false,"""2025-05-13T14:46:24.457677Z""","""2025-05-17T03:50:02.94602Z""","""939c769b-7c41-4a04-983e-5a30a8…","""Finishers""","""Acts that typically end a scen…","""ACTION""",,,,,,,,,,,,,,,,,,,,,,
"""cfc9d4d5-2003-4d43-99eb-bc72f1…","""Fangs""","""Performer has elongated inciso…","[""Fang""]",false,"""2025-03-03T14:23:51.401577Z""","""2025-03-03T14:23:51.401577Z""","""ce09c0f1-06e3-4f13-a5f6-7c5f93…","""Face""","""Various descriptions of a perf…","""PEOPLE""",,,,,,,,,,,,,,,,,,,,,,
"""468fa590-e8ee-4b5c-91bd-8efa48…","""Forced Feminization""","""Forced feminization is a fetis…","[""forced fem""]",false,"""2025-04-17T18:57:12.746423Z""","""2025-04-17T18:57:12.746423Z""","""feca7511-ac91-42c0-a032-8fb8f3…","""Acts""","""Various sexual acts or positio…","""ACTION""",,,,,,,,,,,,,,,,,,,,,,


In [61]:
# Parent tag whose children are excluded from the Stash-only list below.
my_very_own_tags_parent_tag = stash.find_tag({"name": "My Very Own Tags"})
if not my_very_own_tags_parent_tag:
    # Fail here with a clear message instead of a TypeError raised per row
    # inside the `parents` filter further down.
    raise ValueError('Parent tag "My Very Own Tags" was not found in Stash')
my_very_own_tags_parent_id = my_very_own_tags_parent_tag["id"]

# Stash ids of the tags that had no StashDB match in the join.
stash_only_tag_ids = (
    stash_only_tags.select("id_stash").unique().to_series().to_list()
)

# Tag names to leave out: anything ending in "_AI" or starting with one of these prefixes.
excluded_name_prefixes = (
    "Category:",
    "Category Group:",
    "AI_",
    "Data Quality Issue",
    "Duplicate",
    "Galleries",
    "Group Makeup",
)
has_excluded_name = pl.col("name").str.ends_with("_AI")
for prefix in excluded_name_prefixes:
    has_excluded_name = has_excluded_name | pl.col("name").str.starts_with(prefix)

df_stash_only_tags = (
    df_stash_tags.filter(pl.col("id").is_in(stash_only_tag_ids))
    .filter(
        # Keep only tags that do NOT have "My Very Own Tags" as a parent
        pl.col("parents").map_elements(
            lambda parents: all(
                parent.get("id") != my_very_own_tags_parent_id for parent in parents
            ),
            return_dtype=pl.Boolean,
        )
    )
    .filter(~has_excluded_name)
    .select("id", "name", "aliases", "scene_count")
)
df_stash_only_tags

id,name,aliases,scene_count
str,str,list[str],i64
"""8792""","""Accidental Nipple Exposure""",[],1
"""8552""","""AI""",[],0
"""8800""","""Armpit Job""",[],1
"""8793""","""Background Moaning""",[],1
"""8788""","""Boobs Jiggling Against Boobs""",[],1
…,…,…,…
"""8796""","""Vaginal Insertion""",[],5
"""7967""","""Verified: Locations""",[],21
"""8797""","""Vibrators""",[],7
"""7666""","""Wet (Genitals)""",[],5


# Delete tags which originated from StashDB but have no scenes

In [47]:
# Candidates for deletion: Stash-only tags that still carry a StashDB ID alias
# (so they originated from StashDB) and are not used by any scene.
# One row per alias after the explode; the shared prefix constant keeps this
# filter in sync with the alias format used elsewhere in the notebook.
df_previous_stashdb_tags = df_stash_only_tags.explode("aliases").filter(
    pl.col("aliases").str.starts_with(stashdb_id_alias_prefix)
    & (pl.col("scene_count") == 0)
)
df_previous_stashdb_tags

id,name,aliases,scene_count
str,str,str,i64
"""5756""","""Edgy""","""StashDB ID: 9673720a-3ae0-4cfb…",0
"""6110""","""Hand Rail""","""StashDB ID: 3d58274a-6a82-4fb3…",0
"""6134""","""Headstand""","""StashDB ID: 0fe40fde-ebd2-4f3b…",0
"""6258""","""Kink-POV-Gay""","""StashDB ID: b1760d3d-e3ef-4054…",0
"""6492""","""Motorcycle Fucking""","""StashDB ID: fcab90fb-529c-4419…",0
…,…,…,…
"""7163""","""Squats""","""StashDB ID: 25047e52-b834-4888…",0
"""7254""","""Svelte""","""StashDB ID: a222d4a8-d9a7-4c5e…",0
"""7262""","""Sweeping""","""StashDB ID: 988ffc2c-9a03-42df…",0
"""7370""","""Totem""","""StashDB ID: 8ebdccc4-bf1a-4408…",0


In [56]:
# De-duplicate the candidate ids (a tag can appear once per matching alias) and
# sort them so the deletion order is deterministic.
tag_ids = sorted(df_previous_stashdb_tags.select("id").unique().to_series().to_list())

# Show how many tags are about to be removed; a bare `len(tag_ids)` in the middle
# of a cell is evaluated and discarded, so it never appeared in the output.
print(f"Deleting {len(tag_ids)} tags from Stash")

# Destructive: removes the tags from Stash. Skipped when there is nothing to delete.
if tag_ids:
    stash.destroy_tags(tag_ids)