In [None]:
import polars as pl

from libraries.client_stashapp import StashAppClient, get_stashapp_client


# Two handles from the project-local client_stashapp module:
# `stash` is used below for find_gallery/update_gallery, `stash_client` for get_performers.
stash = get_stashapp_client()
stash_client = StashAppClient()

In [None]:

from openai import OpenAI


# OpenAI-compatible client pointed at a server listening on localhost:1234.
# NOTE(review): the api_key looks like a placeholder for a local server — confirm it is not a real secret.
client = OpenAI(
    base_url="http://localhost:1234/v1",
    api_key="lum-studio"
)

# System prompt used by get_release_name: instructs the model to reply with
# only the release title, followed by few-shot "file name => title" examples.
system_prompt_release_name = """
Extract a release name from these poorly named zip files.
Zip names may contain studio name X-Art in some variations.
It might contain quality like lrg which means large.
File names can contain performer names like silvie here.
There can be multiple performers.
Performer names and qualities and similar things should be removed.
Only answer with the likely release name, no other text.

Examples:
X-Art - 2013-01-09 - Susie & Clover - Warm Inside.zip => Warm Inside
x-art hayden h the dressing room-lrg.zip => The Dressing Room
TayTO-X-Art.13.12.02.Scarlet.Lucky.Man.IMAGESET.zip => Lucky Man
x-art_leila_carmen_christmas_vacation-lrg.zip => Christmas Vacation
X-Art.com_12.05.07.Carla.Golden.Blonde.XXX.IMAGESET-FuGLi.zip => Golden Blonde
"""

# System prompt used by get_release_date: instructs the model to reply with a
# YYYY-MM-DD date, or the literal text "None" when the file name has no date.
system_prompt_release_date = """
Extract a release date from these poorly named zip files.
Zip names may contain studio name X-Art in some variations.
It might contain quality like lrg which means large.
File names can contain performer names like silvie here.
There can be multiple performers.
Performer names and qualities and similar things should be removed.
Only answer with the likely release date in YYYY-MM-DD format,
no other text. If there is no date in the zip file name, return None.

Examples:
X-Art - 2013-01-09 - Susie & Clover - Warm Inside.zip => 2013-01-09
x-art hayden h the dressing room-lrg.zip => None
TayTO-X-Art.13.12.02.Scarlet.Lucky.Man.IMAGESET.zip => 2013-12-02
x-art_leila_carmen_christmas_vacation-lrg.zip => None
X-Art com 17 11 29 Susie 5 Reasons To Love Sex With Blondes XXX IMAGESET-FuGLi.zip => 2017-11-29
X-Art.com_12.05.07.Carla.Golden.Blonde.XXX.IMAGESET-FuGLi.zip => 2012-05-07
X-Art.com_12.08.30.Leila.Side.By.Side.XXX.IMAGESET-FuGLi.zip => 2012-08-30
x-art hayden h summer plaything-lrg.zip => None
"""


def get_release_name(zip_file_name):
    """Ask the chat model for the release title contained in a zip file name.

    Args:
        zip_file_name: File name of the archive, sent verbatim as the user message.

    Returns:
        The content of the first completion choice, exactly as the model returned it.
    """
    conversation = [
        {"role": "system", "content": system_prompt_release_name},
        {"role": "user", "content": zip_file_name},
    ]
    response = client.chat.completions.create(
        model="meta-llama-3-8b-instruct",
        messages=conversation,
    )
    first_choice = response.choices[0]
    return first_choice.message.content

def get_release_date(zip_file_name):
    """Ask the chat model for the release date contained in a zip file name.

    The system prompt tells the model to answer with a ``YYYY-MM-DD`` date, or
    with the text ``None`` when the file name carries no date. That textual
    answer is converted into a real ``None`` so callers can test
    ``gallery_date is None``.

    Args:
        zip_file_name: File name of the archive, sent verbatim as the user message.

    Returns:
        The model's answer with surrounding whitespace removed, or ``None``
        when the model returned no content, an empty answer, or "None".
    """
    completion = client.chat.completions.create(
        model="meta-llama-3-8b-instruct",
        messages=[
            {"role": "system", "content": system_prompt_release_date},
            {"role": "user", "content": zip_file_name}
        ]
    )

    answer = completion.choices[0].message.content
    if answer is None:
        return None

    answer = answer.strip()
    # The prompt asks for the word "None" when there is no date; map it to a real None
    if not answer or answer.lower() == "none":
        return None
    return answer

In [None]:
# Load the X-Art model table (its "name" column is used as a join key below) and show its schema.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
xart_models_df = pl.read_parquet("H:\\Parquet Data\\X-Art\\xart_models.parquet")
xart_models_df.schema

In [None]:
# Load the Indexxx model table ("site_name" and "general_name" are used as join keys below) and show its schema.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
xart_indexxx_models_df = pl.read_parquet("H:\\Parquet Data\\X-Art\\xart_indexxx_models.parquet")
xart_indexxx_models_df.schema

In [None]:
# Load the StashDB performer table ("name", "aliases" and "id" are used for matching below) and show its schema.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
xart_stashdb_models_df = pl.read_parquet("H:\\Parquet Data\\X-Art\\xart_stashdb_models.parquet")
xart_stashdb_models_df.schema

In [None]:
# Left-join the X-Art models onto the Indexxx models using a normalised key:
# lower-cased name with every "." removed. `replace_all` with literal=True is
# used because `str.replace` only removes the first match, which leaves names
# with several dots (e.g. initials) only partially normalised.
xart_indexxx_joined_models_df = (
    xart_models_df.join(
        xart_indexxx_models_df,
        how="left",
        left_on=pl.col("name").str.to_lowercase().str.replace_all(".", "", literal=True),
        right_on=pl.col("site_name").str.to_lowercase().str.replace_all(".", "", literal=True),
        coalesce=False
    )
)

# Rows without a site_name found no Indexxx counterpart; print them for manual review
unmatched_with_indexxx = xart_indexxx_joined_models_df.filter(pl.col("site_name").is_null())
if unmatched_with_indexxx.height > 0:
    print(unmatched_with_indexxx)

xart_indexxx_joined_models_df

In [None]:
def match_models_with_aliases(indexxx_df: pl.DataFrame, stashdb_df: pl.DataFrame) -> pl.DataFrame:
    """
    Match models between indexxx and stashdb dataframes, using both name and aliases.

    Matching runs in two passes:

    1. ``indexxx_df.general_name`` is left-joined against ``stashdb_df.name``.
    2. Rows still without a StashDB ``id`` are joined against the exploded
       ``aliases`` list of ``stashdb_df``; for every alias hit the StashDB
       columns of the owning performer are filled in.

    Rows that match in neither pass are kept in the result with null StashDB
    columns, so callers can list them by filtering on ``id`` being null.

    Args:
        indexxx_df: Frame with at least a ``general_name`` column.
        stashdb_df: Frame with ``name``, ``aliases`` (list column) and the
            StashDB columns listed in ``stashdb_columns`` below.

    Returns:
        A frame with the columns of the first join: direct matches first,
        then alias matches, then unmatched rows.
    """
    # StashDB columns that are null after a failed name match and must be
    # refilled from the performer found through the alias
    stashdb_columns = ["id", "disambiguation", "deleted", "aliases", "gender", "birth_date", "is_favorite", "images", "scenes"]

    # Pass 1: direct name matching
    result_df = indexxx_df.join(
        stashdb_df,
        left_on="general_name",
        right_on="name",
        how="left"
    )

    # Rows without an id found no performer by name
    unmatched_mask = result_df["id"].is_null()
    unmatched_df = result_df.filter(unmatched_mask)

    # One row per (performer name, alias) pair
    stashdb_aliases = stashdb_df.explode("aliases").select(["name", "aliases"])

    # Pass 2: match the leftovers against aliases; "name" is the performer owning the alias
    alias_matches = unmatched_df.join(
        stashdb_aliases,
        left_on="general_name",
        right_on="aliases",
        how="left"
    )
    alias_hit = pl.col("name").is_not_null()

    # Pull the full StashDB record of each alias owner; clashing columns arrive as "<col>_right"
    alias_matched = alias_matches.filter(alias_hit).join(
        stashdb_df,
        left_on="name",
        right_on="name",
        how="left",
        suffix="_right"
    )

    # Rebuild the result_df layout, taking StashDB columns from the "_right" versions
    alias_matched = alias_matched.select([
        pl.col(f"{col}_right").alias(col) if col in stashdb_columns else pl.col(col)
        for col in result_df.columns
    ])

    # Rows that matched neither by name nor by alias: keep them (StashDB columns stay null)
    still_unmatched = alias_matches.filter(~alias_hit).select(result_df.columns)

    result_df = pl.concat([
        result_df.filter(~unmatched_mask),  # Direct matches
        alias_matched,  # Alias matches
        still_unmatched  # No match at all
    ])

    return result_df

# Usage
xart_indexxx_stashdb_joined_models_df = match_models_with_aliases(
    xart_indexxx_models_df,
    xart_stashdb_models_df
)

# Check match statistics
total_models = xart_indexxx_stashdb_joined_models_df.height
matched_models = xart_indexxx_stashdb_joined_models_df.filter(
    pl.col("id").is_not_null()  # Use id to check for matches
).height
# Guard the ratio: an empty result would otherwise raise ZeroDivisionError
match_rate = matched_models / total_models if total_models else 0.0

print("\nMatch Statistics:")
print(f"Total models: {total_models}")
print(f"Matched models: {matched_models}")
print(f"Match rate: {match_rate:.1%}")

# See which models weren't matched
unmatched = xart_indexxx_stashdb_joined_models_df.filter(
    pl.col("id").is_null()  # Use id to check for matches
).select(["site_name", "general_name"])

# See which models were matched via aliases: a matched row whose general_name
# appears in the performer's alias list. Checking only for non-null aliases
# would also list every directly matched performer that happens to have aliases.
alias_matches = xart_indexxx_stashdb_joined_models_df.filter(
    pl.col("id").is_not_null() &
    pl.col("aliases").list.contains(pl.col("general_name"))
).select(["site_name", "general_name", "aliases"])

if unmatched.height > 0:
    print("\nUnmatched models:")
    print(unmatched)
if alias_matches.height > 0:
    print("\nAlias matches:")
    print(alias_matches)

xart_indexxx_stashdb_joined_models_df

In [None]:
# Attach the local Stash performers to the matched models: StashDB "id" on the
# left is joined against "stashapp_stashdb_id" on the right.
# NOTE(review): `suffix` is appended to clashing right-hand column names, so
# "stashapp_" yields names such as "<col>stashapp_" — confirm this is intended.
stash_performers = stash_client.get_performers()
xart_indexxx_stashdb_stashapp_joined_models_df = (
    xart_indexxx_stashdb_joined_models_df.join(
        stash_performers,
        how="left",
        left_on="id",
        right_on="stashapp_stashdb_id",
        suffix="stashapp_",
    )
)
xart_indexxx_stashdb_stashapp_joined_models_df

In [None]:
# Load the scraped X-Art scenes; later cells read its title, type, date,
# description, url and performers columns.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
xart_scenes = pl.read_parquet("H:\\Parquet Data\\X-Art\\xart_scenes.parquet")
xart_scenes


In [None]:
# Manual narrowing step: keep only the row at offset 2 of the current selection.
# NOTE(review): `selected_scenes` is first assigned in a later cell, so this cell
# fails on a fresh kernel and shrinks the selection again on every re-run.
selected_scenes = selected_scenes.slice(2, 1)
selected_scenes


In [None]:
# Manual override of the gallery name used by the scene filter.
# NOTE(review): the next cell assigns gallery_name again from the LLM answer — presumably only one of the two cells is meant to run.
gallery_name = "moving"

In [None]:
gallery_id = 7479
# Look the gallery up once and reuse its first file's name for both the log line and the LLM call
gallery_basename = stash.find_gallery(gallery_id)["files"][0]["basename"]
print(gallery_basename)
gallery_name = get_release_name(gallery_basename)
print(gallery_name)
# gallery_date = get_release_date(gallery_basename)
# print(gallery_date)

In [None]:
# Filter scenes by title and type.
# The gallery name is matched as a literal substring (not a regex), so titles
# containing characters such as "?", "(" or "+" cannot break or distort the
# match; surrounding whitespace from the LLM answer is stripped first.
gallery_title_key = gallery_name.strip().lower()
selected_scenes = xart_scenes.sort(by="date").filter(
    pl.col("title").str.to_lowercase().str.contains(gallery_title_key, literal=True) &
    pl.col("type").str.contains("gallery")
    # Optional date filter, currently disabled:
    # & (gallery_date is None or pl.col("date") == gallery_date)
)
selected_scenes

In [None]:
# Collect the distinct, non-null performer names of the selected scenes, sorted by name
performer_names = selected_scenes.select(pl.col("performers")).explode("performers")
scene_performers = (
    performer_names
    .unique()
    .filter(pl.col("performers").is_not_null())
    .sort(by="performers")
)

# Case-insensitive lookup of each performer in the combined model table
performer_key = pl.col("performers").str.to_lowercase()
site_name_key = pl.col("site_name").str.to_lowercase()
scene_performers_joined = scene_performers.join(
    xart_indexxx_stashdb_stashapp_joined_models_df,
    left_on=performer_key,
    right_on=site_name_key,
    how="left",
    suffix="metadata_",
    coalesce=False,
)

scene_performers_joined.select("general_name", "stashapp_id")

In [None]:
performer_ids = scene_performers_joined.select("stashapp_id").drop_nulls().to_series().to_list()

# Fail with a clear message instead of an IndexError when the filter found nothing
if selected_scenes.height == 0:
    raise ValueError(
        f"No scene matched gallery name {gallery_name!r}; refusing to update gallery {gallery_id}"
    )

# The first selected scene supplies the gallery metadata
scene = selected_scenes.row(0, named=True)

refreshed_gallery = stash.find_gallery(gallery_id)
stash.update_gallery({
    "id": gallery_id,
    "title": scene["title"],
    "date": scene["date"],
    "details": scene["description"],
    # dict.fromkeys de-duplicates while keeping the existing URL order stable
    "urls": list(dict.fromkeys([*refreshed_gallery["urls"], scene["url"]])),
    "performer_ids": performer_ids
})

# Extracting metadata from Indexxx

In [None]:
import json
import time
from pathlib import Path

import requests
from bs4 import BeautifulSoup


class Release:
    """Plain record describing one scraped release.

    Attributes:
        url: Address of the release page.
        title: Release title.
        date: Release date as scraped.
        models: Sequence of (official_name, alias) pairs.
    """

    def __init__(self, url, title, date, models):
        self.url, self.title, self.date = url, title, date
        self.models = models  # List of (official_name, alias) tuples

    def to_dict(self):
        """Return the release as a JSON-serialisable dictionary."""
        model_entries = []
        for pair in self.models:
            model_entries.append({"official_name": pair[0], "alias": pair[1]})

        as_dict = {"url": self.url, "title": self.title, "date": self.date}
        as_dict["models"] = model_entries
        return as_dict

class Scraper:
    """Scrape the release listing of one website from an indexxx-style site.

    The scraper walks the paginated "sets" listing of a site, opens every
    release page linked from it and collects title, date and models as
    ``Release`` objects. Requests go through one ``requests.Session`` that
    carries browser-like headers and the cookies set in ``__init__``.

    Args:
        base_url: Root URL of the index site.
        site_id: Site identifier used in the listing URL.
        site_slug: Site slug used in the listing URL.
        cf_clearance: Value for the ``cf_clearance`` cookie. When omitted,
            ``DEFAULT_CF_CLEARANCE`` is used.
        timeout: Timeout in seconds applied to every HTTP request, so a
            stalled connection cannot hang the scrape indefinitely.
    """

    # Cookie value used when the caller does not pass cf_clearance.
    # NOTE(review): this looks like a value captured from a browser session and
    # will presumably expire — prefer passing a fresh value via cf_clearance.
    DEFAULT_CF_CLEARANCE = (
        "gws_iACWaFPjTCrarTjqpMa82rEgRmXK45l4ndArCEY"
        "-1738553527-1.2.1.1-Fn3Z5R5yOSuklpSjU6IAWyjX"
        "MtrIzGuJ1Ex9VzAjwSrfoqlXCqxjnJ2JPjxqdbg34iLD"
        "ziG0KE4EnkIv1PH3g6ydFu77SzoUo0IL04ojYme2kHf7u"
        "czVsRTjk1MVaRBbX17a7SV2j2LyawuCMWIn3a2dqH0agR"
        "ql0stNYOTT6MDHEpKkw8jA9_vP0L8hSeYcJ1YlD8g.tYm"
        "SZmBYsWZMhiS6Ip7VQhqsMiNY3iHI8Be30kWItk6FMTw2"
        "HcwMw4SeUkgOLv.GVjievGbzr4X4359LGWzold4LK1c9v"
        ".B0myA"
    )

    def __init__(self, base_url="https://www.indexxx.com", site_id=None, site_slug=None,
                 cf_clearance=None, timeout=30):
        self.base_url = base_url
        self.site_id = site_id
        self.site_slug = site_slug
        self.timeout = timeout
        self.session = requests.Session()

        # Add comprehensive headers
        self.session.headers.update({
            "authority": "www.indexxx.com",
            "accept": (
                "text/html,application/xhtml+xml,application/xml;q=0.9,"
                "image/avif,image/webp,image/apng,*/*;q=0.8,"
                "application/signed-exchange;v=b3;q=0.7"
            ),
            "accept-encoding": "gzip, deflate, br, zstd",
            "accept-language": "en-US,en;q=0.9,fi;q=0.8",
            "cache-control": "no-cache",
            "pragma": "no-cache",
            "sec-ch-ua": '"Not(A:Brand";v="99", "Microsoft Edge";v="133", "Chromium";v="133"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"Windows"',
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0"
            ),
        })

        # Add cookies
        if cf_clearance is None:
            cf_clearance = self.DEFAULT_CF_CLEARANCE
        self.session.cookies.update({
            "ageVerified": "true",
            "cf_clearance": cf_clearance,
        })

    def get_site_url(self, page=None):
        """Get URL for site's releases page.

        Args:
            page: Optional page number appended as ``?page=<page>``.

        Raises:
            ValueError: If ``site_id`` or ``site_slug`` is not set.
        """
        if not self.site_id or not self.site_slug:
            raise ValueError("site_id and site_slug must be set")

        url = f"{self.base_url}/websites/{self.site_id}/{self.site_slug}/sets/"
        if page is not None:
            url += f"?page={page}"
        return url

    def get_soup(self, url, delay=1):
        """Get BeautifulSoup object for a URL with rate limiting.

        Args:
            url: Page to fetch.
            delay: Seconds to sleep before the request is sent.

        Raises:
            Exception: If the response status code is not 200.
        """
        time.sleep(delay)  # Rate limiting

        # Update referer for each request: the URL with its last path segment(s) removed
        self.session.headers.update({
            "referer": "/".join(url.split("/")[:-1]) if url.endswith("/") else "/".join(url.split("/")[:-2])
        })

        response = self.session.get(url, timeout=self.timeout)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch {url}: Status code {response.status_code}")

        return BeautifulSoup(response.text, "html.parser")

    def parse_release_page(self, url):
        """Parse individual release page into a ``Release``.

        Raises:
            AttributeError: If the date, description or a model name element
                is missing from the page.
        """
        soup = self.get_soup(url)

        # Extract basic info
        date = soup.find("span", {"itemprop": "datePublished"}).text
        title = soup.find("span", {"itemprop": "description"}).text

        # Extract models
        models = []
        model_section = soup.find("div", {"itemprop": "about"})
        if model_section:
            for model in model_section.find_all("div", {"itemtype": "http://schema.org/Person"}):
                official_name = model.find("span", {"itemprop": "name"}).text
                # `string=` replaces the deprecated `text=` keyword of BeautifulSoup
                alias_div = model.find("div", string=lambda t: t and "as:" in t)
                # Without an "as:" line the model is credited under the official name
                alias = alias_div.text.replace("as:", "").strip() if alias_div else official_name
                models.append((official_name, alias))

        return Release(url, title, date, models)

    def scrape_list_page(self, page_num):
        """Scrape a single list page and every release page it links to.

        Entries without a link container or link are skipped; a failure while
        parsing one release is printed and does not stop the page.
        """
        url = self.get_site_url(page_num)
        soup = self.get_soup(url)

        releases = []
        for pset in soup.find_all("div", {"class": "pset"}):
            # A missing container must not abort the whole page with AttributeError
            link_container = pset.find("div", {"class": "my-2"})
            link = link_container.find("a") if link_container is not None else None
            if not link:
                continue

            release_url = self.base_url + link["href"]
            try:
                release = self.parse_release_page(release_url)
                releases.append(release)
                print(f"Scraped: {release.title}")
            except Exception as e:
                print(f"Error scraping {release_url}: {e}")

        return releases

    def get_last_page(self):
        """Get the last page number from the pagination.

        Returns the number shown by the first enabled pagination item whose
        link text parses as an integer.

        Raises:
            ValueError: If no such pagination item is found.
        """
        url = self.get_site_url()
        soup = self.get_soup(url)

        # Find the first page number in pagination (which is the last page)
        pagination = soup.find("ul", {"class": "pagination"})
        if pagination:
            # Skip the "newer" button and get the first actual page number
            pages = pagination.find_all("li", {"class": "page-item"})
            for page in pages:
                if "disabled" not in page.get("class", []):
                    try:
                        return int(page.find("a").text)
                    except (ValueError, AttributeError):
                        continue

        raise ValueError("Could not determine last page number")

    def scrape_all_pages(self, start_page=1, end_page=None):
        """Scrape all pages in range.

        Args:
            start_page: First page number to scrape.
            end_page: Last page number (inclusive); detected from the
                pagination when omitted.

        Returns:
            All releases collected across the pages.
        """
        if end_page is None:
            end_page = self.get_last_page()
            print(f"Detected {end_page} total pages")

        all_releases = []

        for page in range(start_page, end_page + 1):
            print(f"\nScraping page {page}/{end_page}...")
            releases = self.scrape_list_page(page)
            all_releases.extend(releases)

            # Save progress after each page (each file holds everything scraped so far)
            self.save_releases(all_releases, f"releases_progress_p{page}.json")

        return all_releases

    def save_releases(self, releases, filename):
        """Save releases to JSON file (UTF-8, indented)."""
        with Path(filename).open("w", encoding="utf-8") as f:
            json.dump([r.to_dict() for r in releases], f, indent=2)

# Example usage for x-art: scrape every listing page (a progress file is written
# after each page by scrape_all_pages) and store the combined result.
scraper = Scraper(
    site_id="293",
    site_slug="x-art"
)
releases = scraper.scrape_all_pages()
scraper.save_releases(releases, "releases_final.json")

# Could also scrape other sites
# scraper2 = Scraper(site_id="123", site_slug="other-site")
# releases2 = scraper2.scrape_all_pages()

# Extracting models from StashDB

In [None]:
import os

import dotenv
import polars as pl
import requests


dotenv.load_dotenv()

# Define the headers, including any necessary authentication.
# NOTE(review): nothing in the visible code reads this module-level dict —
# gql_query builds its own headers from STASHDB_API_KEY. Confirm whether it is still needed.
headers = {
    "Content-Type": "application/json",
    "ApiKey": os.getenv("ApiKey"),
}

def gql_query(query, variables=None, timeout=30):
    """Send a GraphQL query to StashDB and return the decoded JSON response.

    Args:
        query: GraphQL document.
        variables: Optional mapping of GraphQL variables.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        The decoded JSON response body.

    Raises:
        Exception: If the HTTP status is not 200, or if the server answered
            with GraphQL errors and no data.
    """
    request_headers = {"Content-Type": "application/json"}
    # The API key header is only sent when STASHDB_API_KEY is set
    api_key = os.getenv("STASHDB_API_KEY")
    if api_key:
        request_headers["Apikey"] = api_key

    response = requests.post(
        "https://stashdb.org/graphql",
        json={"query": query, "variables": variables},
        headers=request_headers,
        timeout=timeout,
    )
    if response.status_code != 200:
        raise Exception(
            f"Query failed with status code {response.status_code}: {response.text}"
        )

    payload = response.json()
    # GraphQL reports query-level failures with HTTP 200; surface them here
    # instead of letting callers fail later on a missing "data" entry
    if isinstance(payload, dict) and payload.get("errors") and payload.get("data") is None:
        raise Exception(f"Query returned GraphQL errors: {payload['errors']}")
    return payload

In [None]:
# Define the GraphQL query: a paginated performer search filtered by studio.
# For every performer it requests the identity fields, images, and the
# performer's scenes restricted to the same studio.
query = """
    query StudioPerformers(
        $studioId: ID!, 
        $gender: GenderFilterEnum, 
        $favorite: Boolean, 
        $names: String, 
        $page: Int! = 1, 
        $per_page: Int! = 25, 
        $direction: SortDirectionEnum!, 
        $sort: PerformerSortEnum!
    ) {
        queryPerformers(
            input: {
                studio_id: $studioId,
                gender: $gender,
                is_favorite: $favorite,
                names: $names,
                page: $page,
                per_page: $per_page,
                direction: $direction,
                sort: $sort
            }
        ) {
            count
            performers {
                id
                name
                disambiguation
                deleted
                aliases
                gender
                birth_date
                is_favorite
                images {
                    ...ImageFragment
                }
                scenes(input: {studio_id: $studioId}) {
                    id
                    title
                    duration
                    release_date
                    production_date
                    studio {
                        id
                        name
                    }
                    images {
                        ...ImageFragment
                    }
                }
            }
        }
    }

    fragment ImageFragment on Image {
        id
        url
        width
        height
    }
"""

studio_id = "d22943f5-9bb5-495e-8b01-6e12d2fffc80"
PER_PAGE = 25

# Page through the studio's performers until the total reported by the API
# ("count") has been collected, instead of assuming a fixed number of pages.
all_performers = []
page = 1
while True:
    variables = {
        "studioId": studio_id,
        "page": page,
        "per_page": PER_PAGE,
        "sort": "LAST_SCENE",
        "direction": "DESC"
    }

    data = gql_query(
        query,
        variables
    )

    result = data["data"]["queryPerformers"]
    performers = result["performers"]
    all_performers.extend(performers)

    # Stop on an empty page as well, so a wrong count can never loop forever
    if not performers or len(all_performers) >= result["count"]:
        break
    page += 1

all_performers_df = pl.DataFrame(all_performers)
all_performers_df

In [None]:
# Persist the performers fetched from StashDB.
# NOTE(review): the matching cells read "X-Art\\xart_stashdb_models.parquet", not this file — confirm which file is the intended hand-off.
all_performers_df.write_parquet("H:\\Parquet Data\\xart_stashdb_all_performers.parquet")