In [5]:
from libraries.client_stashapp import get_stashapp_client

# Stash API client shared by the cells below. It is built by the project's
# get_stashapp_client() helper with no arguments, so the endpoint and API key
# are presumably resolved inside that helper — confirm there.
stash = get_stashapp_client()

dUsing stash (v0.27.2-7-ge642cf32) endpoint at http://localhost:6969/graphql
dPersisting Connection to Stash with ApiKey...


In [13]:
import pandas as pd

# Open parquet file holding the scraped X-Art metadata. Later cells read the
# columns type, url, title, date, performers, description and image_count.
# NOTE(review): this is an absolute path on one machine's H: drive; the
# notebook will not run elsewhere until it is made configurable.
df = pd.read_parquet("H:\\Git\\scrapymetadata\\scrapymetadata\\xart_data.parquet")

In [15]:
# Flatten the per-row performer lists (rows without one are skipped) into a
# single alphabetically sorted list of distinct names.
unique_performers = sorted({
    performer_name
    for row_performers in df['performers'].dropna()
    for performer_name in row_performers
})


In [19]:
# Create one Stash performer per scraped name, stored as "X-Art: <name>".
# A name that Stash already returns for an exact-match lookup is skipped, so
# re-running this cell does not issue a second create for the same performer.
for performer in unique_performers:
    stash_name = f"X-Art: {performer}"
    already_in_stash = stash.find_performers({"name": {"value": stash_name, "modifier": "EQUALS"}})
    if already_in_stash:
        continue
    stash.create_performer({"name": stash_name})


In [21]:
# Map every scraped performer name to the id of its "X-Art: <name>" performer
# in Stash. Names with no exact match are reported and left out of the map.
performer_lookup = {}
for scraped_name in unique_performers:
    stash_name = f"X-Art: {scraped_name}"
    name_filter = {"name": {"value": stash_name, "modifier": "EQUALS"}}
    matches = stash.find_performers(name_filter)
    if not matches:
        print(f"Warning: Could not find Stash performer '{stash_name}'")
        continue
    # When several performers match, the first one returned wins.
    performer_lookup[scraped_name] = matches[0]["id"]


In [113]:
# Pull galleries from Stash with just the fields needed for matching and for
# the later update (urls, performers, image_count, file basenames).
x_art_galleries = stash.find_galleries(fragment="id title date details performers { id name } urls files { id basename } image_count")
df_x_art_galleries = pd.DataFrame(x_art_galleries)

# Basename of each gallery's first file; None when the gallery has no files.
df_x_art_galleries['basename'] = df_x_art_galleries['files'].apply(
    lambda files: files[0]['basename'] if files else None
)

# Keep only galleries whose file name mentions "x-art" (case-insensitive).
# na=False makes galleries without a basename count as "no match" instead of
# producing NaN in the mask, which boolean indexing rejects. regex=False
# treats the needle as a plain substring.
is_x_art = df_x_art_galleries['basename'].str.lower().str.contains("x-art", na=False, regex=False)
df_x_art_galleries = df_x_art_galleries[is_x_art].copy()


In [77]:
from openai import OpenAI

# OpenAI-compatible client pointed at a server on localhost:1234 instead of
# the hosted OpenAI endpoint.
# NOTE(review): the api_key looks like a placeholder for a local server that
# does not validate it — confirm before pointing base_url anywhere else.
client = OpenAI(
    base_url="http://localhost:1234/v1",
    api_key="lum-studio"
)

# System prompt sent with every get_release_name() request: it tells the model
# to reply with the release title only and gives few-shot examples.
# NOTE(review): the examples use two different shapes ("file => title" and
# "file: text => title"); consider making them uniform if extraction quality
# is poor.
system_prompt = """
Extract a release name from these poorly named zip files. Zip names may contain studio name X-Art in some variations. It might contain quality like lrg which means large. File names can contain performer names like silvie here. There can be multiple performers. Performer names and qualities and similar things should be removed. Only answer with the likely release name, no other text.

Examples:
X-Art - 2013-01-09 - Susie & Clover - Warm Inside.zip => Warm Inside
x-art hayden h the dressing room-lrg.zip: hayden h the dressing room => The Dressing Room
TayTO-X-Art.13.12.02.Scarlet.Lucky.Man.IMAGESET.zip: TayTO-X-Art.13.12.02 IMAGESET => Lucky Man
x-art_leila_carmen_christmas_vacation-lrg.zip => Christmas Vacation
"""

def get_release_name(zip_file_name, *, model="meta-llama-3-8b-instruct",
                     temperature=0, llm_client=None, prompt=None):
    """Ask the chat model for the release title hidden in a zip file name.

    Args:
        zip_file_name: File name of the gallery archive, sent as the user
            message.
        model: Model identifier passed to the chat-completions endpoint.
        temperature: Sampling temperature. Defaults to 0 so repeated runs over
            the same file names are as repeatable as the server allows.
        llm_client: Object exposing ``chat.completions.create``; falls back to
            the notebook-level ``client`` when omitted.
        prompt: System prompt text; falls back to the notebook-level
            ``system_prompt`` when omitted.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string when the reply carries no content.
    """
    active_client = client if llm_client is None else llm_client
    active_prompt = system_prompt if prompt is None else prompt

    completion = active_client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=[
            {"role": "system", "content": active_prompt},
            {"role": "user", "content": zip_file_name}
        ]
    )

    # The reply is later used as a join key, so stray whitespace/newlines
    # around it would prevent matches; a missing reply becomes ''.
    reply = completion.choices[0].message.content
    return (reply or "").strip()


In [114]:
# Make sure the target column exists; a re-run keeps names already extracted.
if 'extracted_release_name' not in df_x_art_galleries.columns:
    df_x_art_galleries['extracted_release_name'] = ''

# Only galleries whose Stash title is empty get a name derived from the
# archive file name via the language model.
untitled = df_x_art_galleries['title'] == ''
for row_label, basename in df_x_art_galleries.loc[untitled, 'basename'].items():
    extracted_name = get_release_name(basename)
    df_x_art_galleries.loc[row_label, 'extracted_release_name'] = extracted_name
    print(f"Extracted name for {basename}: {extracted_name}")


Extracted name for x-art brynn lexi three sisters-lrg.zip: Three Sisters
Extracted name for x-art hayden h summer plaything-lrg.zip: Summer Plaything
Extracted name for x-art hayden h the dressing room-lrg.zip: The Dressing Room
Extracted name for x-art hayden pink and tight-lrg.zip: Pink And Tight
Extracted name for x-art tori beach beauty-lrg.zip: Tori Beach Beauty
Extracted name for x-art victoria exotic angel-lrg.zip: Exotic Angel
Extracted name for x-art victoria melanie every mans desire-lrg.zip: Every Mans Desire
Extracted name for x-art victoria melanie the juicer-lrg.zip: The Juicer
Extracted name for x-art victoria olivia pink-lrg.zip: Pink
Extracted name for x-art_abby_the_rich_girl-lrg.zip: The Rich Girl
Extracted name for x-art_faye_leila_awesome_threesome-lrg.zip: Awesome Threesome
Extracted name for x-art_francesca_caprice_tiffany_suite_19-lrg.zip: Suite 19
Extracted name for x-art_gigi_r_hayden_play_time-lrg.zip: Play Time
Extracted name for x-art good night kiss.zip: G

In [115]:
# Only gallery records from the scrape are candidates for matching.
df_galleries = df.loc[df['type'] == "gallery"].copy()

# Normalised join keys: nulls become '', whitespace is trimmed, case is folded.
stash_keys = df_x_art_galleries['extracted_release_name'].fillna('').str.strip().str.lower()
scraped_keys = df_galleries['title'].fillna('').str.strip().str.lower()

# Rows with a blank key are excluded from the join. '' is the placeholder for
# Stash galleries that never had a name extracted; left in, those galleries
# would pair with any scraped record whose title is blank.
stash_has_key = stash_keys != ''
scraped_has_key = scraped_keys != ''

# Inner join of Stash galleries with scraped galleries on the normalised
# title. Columns present on both sides get the _x (Stash) / _y (scrape) suffix.
df_merged = pd.merge(df_x_art_galleries[stash_has_key], df_galleries[scraped_has_key],
                     left_on=stash_keys[stash_has_key],
                     right_on=scraped_keys[scraped_has_key],
                     how='inner')

In [117]:
# Image count in Stash (_x) minus the count in the scraped record (_y);
# 0 means both sides report the same number of images.
df_merged['image_count_diff'] = df_merged['image_count_x'].sub(df_merged['image_count_y'])


In [120]:
# Write the scraped metadata of every matched row back to its Stash gallery:
# title, date, details, performers, and the source URL appended to the
# gallery's existing URLs.
for _, row in df_merged.iterrows():
    stash_id = row['id']
    url = row['url']

    # Re-read the gallery so the URL list reflects what Stash holds right now.
    refreshed_gallery = stash.find_gallery(stash_id)
    if not refreshed_gallery:
        print(f"Warning: Gallery {stash_id} not found in Stash, skipping")
        continue

    existing_urls = list(refreshed_gallery["urls"] or [])
    if url not in existing_urls:
        existing_urls.append(url)

    # A scraped row may carry no performer list at all (None / NaN).
    scraped_performers = row["performers_y"]
    if scraped_performers is None or isinstance(scraped_performers, float):
        scraped_performers = []

    # Performers that were never resolved to a Stash id are reported and left
    # out instead of aborting the loop halfway through the updates.
    unresolved = [name for name in scraped_performers if name not in performer_lookup]
    if unresolved:
        print(f"Warning: Gallery {stash_id} has performers without a Stash id: {unresolved}")
    performer_ids = [performer_lookup[name] for name in scraped_performers if name in performer_lookup]

    updated_gallery = {
        "id": stash_id,
        "title": row["title_y"],
        "date": row["date_y"],
        "details": row["description"],
        "performer_ids": performer_ids,
        "urls": existing_urls
    }

    stash.update_gallery(updated_gallery)

In [101]:
# Ad-hoc check: scraped gallery records whose title contains "our style"
# (case-insensitive). The bare last expression renders the matching rows.
is_gallery = df['type'].eq("gallery")
has_title = df['title'].notnull()
mentions_our_style = df['title'].str.lower().str.contains("our style")
df[is_gallery & has_title & mentions_our_style]


Unnamed: 0,type,url,title,date,performers,description,video_resolution,html,image_count,image_resolution
328,gallery,https://www.x-art.com/galleries/Aperitif%20Our...,Aperitif Our Style,2013-12-15,[Adriana],Enjoy this delicious kitchen romp between the ...,,"<!doctype html>\n<html class=""no-js"" lang=""en""...",69.0,4000 pixels
