In [1]:
import os
import re
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import concurrent.futures

SRC_DIR = Path('/Volumes/external_drive')
ACRONYMS = ['69', 'BBC', 'BBW', 'BDSM', 'CFNM', 'DP', 'GILF', 'MILF', 'PAWG', 'POV']

In [None]:
def process_string(input_string):
    word_list = re.findall('[A-Z][^A-Z]*', input_string)

    joined_words = []
    temp_word = ""
    for word in word_list:
        if word.endswith(" "):
            temp_word += word
        else:
            if temp_word:
                joined_words.append(temp_word + word)
                temp_word = ""
            else:
                joined_words.append(word)

    final_words = []
    temp_word = ""
    for word in joined_words:
        if len(word) == 1 and word.isupper():
            temp_word += word
        elif len(word) > 1 and word[1] == " ":
            temp_word += word
        elif len(word) > 1 and word[-2] == " ":
            final_words.append(temp_word)
            temp_word = ""
            temp_word += word

        else:
            if temp_word:
                final_words.append(temp_word)
                temp_word = ""
            final_words.append(word)
    if temp_word:
        final_words.append(temp_word)

    word_list = [w for w in final_words if w != ""]

    new_list = []
    for word in word_list:
        match = re.match(r"([a-zA-Z]+)([0-9]+)", word)
        if match:
            new_list.append(match.group(1))
            new_list.append(int(match.group(2)))
        else:
            new_list.append(word)
    return new_list


def split_and_verify_acronyms(words_list, acronyms):
    out = []

    for word in words_list:
        word = str(word)
        if word.isupper():
            for acronym in acronyms:
                if acronym in word:
                    out.append(acronym)
                    word = word.replace(acronym, "")
        out.append(word)

    return [w for w in out if w != ""]


def split_and_unify_tags(tags_list: list[str]):
    out = []
    acronyms = []
    for tag in tags_list:
        for acronym in ACRONYMS:
            if acronym in tag:
                tag = tag.replace(acronym, "").strip()
                if acronym not in acronyms:
                    acronyms.append(acronym)
        out.append(tag)

    result = list(set(out + acronyms))
    for tag in result:
        if tag in out:
            for other_tag in out:
                if tag in other_tag and tag != other_tag:
                    result.remove(tag)
                    break

    return result


def process_line(line_string: str) -> list[str]:
    words_list = process_string(line_string)
    words_list = split_and_verify_acronyms(words_list, ACRONYMS)
    return [word.replace("  ", " ").strip() for word in words_list]

In [None]:
def parse_gallery_info(filepath: Path):
    # Get all the filenames as list
    filenames = []
    for f in os.listdir(filepath.parent):
        if not f.startswith(".") and not f.endswith(".txt"):
            filenames.append(f)

    with open(filepath, 'r') as file:
        content = file.readlines()

    info = {}
    spl = filepath.as_posix().split("/")
    info['category'] = spl[3]
    info['gallery_name'] = spl[4]
    info['filenames'] = filenames
    for line in content:
        if line.startswith("Channel"):
            info['channel'] = line.removeprefix("Channel: : ").removesuffix("\n")
        elif line.startswith("Models"):
            models = process_line(line.removeprefix("Models:: ").removesuffix(" + Suggest\n"))
            if "Suggest" in models:
                models.remove("Suggest")
            info['models'] = models
        elif line.startswith("Categories"):
            info['categories'] = process_line(line.removeprefix("Categories:: ").removesuffix(" + Suggest\n"))
        elif line.startswith("Tags List"):
            info['tags_list'] = split_and_unify_tags(process_line(line.removeprefix("Tags List:: ").removesuffix("\n")))
        elif line.startswith("Stats"):
            splitted = line.removeprefix("Stats:: ").removesuffix("\n").split("; ")

            try:
                info['rating'] = splitted[0].removeprefix("Rating: ")
            except:
                info['rating'] = None

            try:
                info['views'] = splitted[1].removeprefix("Views: ")
            except:
                info['views'] = None

    return info

In [None]:
def process_gallery(category_name: str, gallery: str):
    gallery_info_path = SRC_DIR / category_name / gallery / 'gallery_info.txt'
    if gallery_info_path.exists():
        gallery_info = parse_gallery_info(gallery_info_path)
        return gallery_info

In [None]:
def process_category(category_name: str):
    category_path = SRC_DIR / category_name
    category_galleries = [
        gallery
        for gallery in os.listdir(category_path)
        if (category_path / gallery).is_dir()
    ]

    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
        future_to_gallery = [
            executor.submit(process_gallery, category_name, gallery)
            for gallery in category_galleries
        ]

        for future in tqdm(
                concurrent.futures.as_completed(future_to_gallery),
                total=len(future_to_gallery),
                desc=f"Processing {category_name}",
                unit="gallery"
        ):
            gallery_info = future.result()
            if gallery_info:
                results.append(gallery_info)

    return results

In [None]:
def crawl_images_folder():
    categories = [
        category for category in os.listdir(SRC_DIR)
        if (SRC_DIR / category).is_dir() and not category.startswith(".")
    ]

    all_galleries_info = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_category = [
            executor.submit(process_category, category)
            for category in categories
        ]
        for future in tqdm(
                concurrent.futures.as_completed(future_to_category),
                total=len(future_to_category),
                desc="Processing categories", unit="category"
        ):
            category_galleries_info = future.result()
            all_galleries_info.extend(category_galleries_info)

    return all_galleries_info

In [None]:
all_galleries_info = crawl_images_folder()
df = pd.DataFrame(all_galleries_info)

In [None]:
df.shape

# ENHANCED DATAFRAME

In [None]:
from PIL import Image


def process_filenames_to_mini_df(row):
    category = row['category']
    gallery_name = row['gallery_name']
    gallery_dicts = []

    def _process_image(category: str, gallery_name: str, filename: str) -> None:
        file_path = os.path.join(SRC_DIR, category, gallery_name, filename)
        try:
            with Image.open(file_path) as img:
                height, width = img.size
                new_row = row.to_dict()
                new_row['filename'] = filename
                new_row['width'] = width
                new_row['height'] = height
                gallery_dicts.append(new_row)
        except Exception:
            print(f"Error processing {file_path}")
            os.remove(file_path)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(_process_image, category, gallery_name, filename)
            for filename in row['filenames']
        ]
        for future in concurrent.futures.as_completed(futures):
            future.result()

    return pd.DataFrame(gallery_dicts)

In [None]:
futures = []

mini_dfs = []
for _, row in tqdm(df.iterrows()):
    mini_dfs.append(process_filenames_to_mini_df(row))

In [None]:
enhanced_df = pd.concat(mini_dfs)

In [None]:
enhanced_df.shape

In [None]:
enhanced_df.to_csv('filenames_dataset.csv', index=False)

In [None]:
all_tags = [sublist for sublist in df['tags_list'].values if sublist != [] and not isinstance(sublist, float)]
merged_list = sorted([item for sublist in all_tags for item in sublist])

# Dataset inspection

In [3]:
def strip_channel_name(channel_name: str):
    if isinstance(channel_name, str):
        return channel_name.split("Pics")[0].strip()
    return channel_name

In [4]:
enhanced_df = pd.read_csv('filenames_dataset.csv', index_col=0)
enhanced_df.drop(['filenames'], axis=1, inplace=True)
enhanced_df['channel'] = enhanced_df.channel.apply(strip_channel_name)

In [5]:
enhanced_df.head()

Unnamed: 0_level_0,gallery_name,channel,models,categories,tags_list,rating,views,filename,width,height
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
thong,slutty-blonde-poses-in-sheer-lingerie-with-her...,Strictly Glamour,['Mary Queen'],"['Stockings', 'Lingerie', 'Babe', 'High Heels'...","['Big Tits Babe', 'Blonde Stockings', 'Perfect...",94%,9792,94680408_002_9900.jpg,1280,853
thong,slutty-blonde-poses-in-sheer-lingerie-with-her...,Strictly Glamour,['Mary Queen'],"['Stockings', 'Lingerie', 'Babe', 'High Heels'...","['Big Tits Babe', 'Blonde Stockings', 'Perfect...",94%,9792,94680408_008_cd7f.jpg,1280,853
thong,slutty-blonde-poses-in-sheer-lingerie-with-her...,Strictly Glamour,['Mary Queen'],"['Stockings', 'Lingerie', 'Babe', 'High Heels'...","['Big Tits Babe', 'Blonde Stockings', 'Perfect...",94%,9792,94680408_009_1f22.jpg,1280,853
thong,slutty-blonde-poses-in-sheer-lingerie-with-her...,Strictly Glamour,['Mary Queen'],"['Stockings', 'Lingerie', 'Babe', 'High Heels'...","['Big Tits Babe', 'Blonde Stockings', 'Perfect...",94%,9792,94680408_011_5291.jpg,1280,853
thong,slutty-blonde-poses-in-sheer-lingerie-with-her...,Strictly Glamour,['Mary Queen'],"['Stockings', 'Lingerie', 'Babe', 'High Heels'...","['Big Tits Babe', 'Blonde Stockings', 'Perfect...",94%,9792,94680408_006_6674.jpg,1280,853


In [6]:
# filter out rows where width and height are less than 768
enhanced_df = enhanced_df[(enhanced_df['width'] >= 768) & (enhanced_df['height'] >= 768)]
enhanced_df

Unnamed: 0_level_0,gallery_name,channel,models,categories,tags_list,rating,views,filename,width,height
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
thong,slutty-blonde-poses-in-sheer-lingerie-with-her...,Strictly Glamour,['Mary Queen'],"['Stockings', 'Lingerie', 'Babe', 'High Heels'...","['Big Tits Babe', 'Blonde Stockings', 'Perfect...",94%,9792,94680408_002_9900.jpg,1280,853
thong,slutty-blonde-poses-in-sheer-lingerie-with-her...,Strictly Glamour,['Mary Queen'],"['Stockings', 'Lingerie', 'Babe', 'High Heels'...","['Big Tits Babe', 'Blonde Stockings', 'Perfect...",94%,9792,94680408_008_cd7f.jpg,1280,853
thong,slutty-blonde-poses-in-sheer-lingerie-with-her...,Strictly Glamour,['Mary Queen'],"['Stockings', 'Lingerie', 'Babe', 'High Heels'...","['Big Tits Babe', 'Blonde Stockings', 'Perfect...",94%,9792,94680408_009_1f22.jpg,1280,853
thong,slutty-blonde-poses-in-sheer-lingerie-with-her...,Strictly Glamour,['Mary Queen'],"['Stockings', 'Lingerie', 'Babe', 'High Heels'...","['Big Tits Babe', 'Blonde Stockings', 'Perfect...",94%,9792,94680408_011_5291.jpg,1280,853
thong,slutty-blonde-poses-in-sheer-lingerie-with-her...,Strictly Glamour,['Mary Queen'],"['Stockings', 'Lingerie', 'Babe', 'High Heels'...","['Big Tits Babe', 'Blonde Stockings', 'Perfect...",94%,9792,94680408_006_6674.jpg,1280,853
...,...,...,...,...,...,...,...,...,...,...
cougar,busty-milf-alura-jenson-enjoying-a-hard-dickin...,Brazzers Network,"['Alura Jenson', 'Robby Echo']","['Cougar', 'Big Tits', 'BBW', 'MILF', 'Thick',...","['Cougar Seduction', 'Mega Boobs', 'Big Tits F...",94%,257802,13945022_101_24fa.jpg,853,1280
cougar,busty-milf-alura-jenson-enjoying-a-hard-dickin...,Brazzers Network,"['Alura Jenson', 'Robby Echo']","['Cougar', 'Big Tits', 'BBW', 'MILF', 'Thick',...","['Cougar Seduction', 'Mega Boobs', 'Big Tits F...",94%,257802,13945022_067_b810.jpg,1280,853
cougar,busty-milf-alura-jenson-enjoying-a-hard-dickin...,Brazzers Network,"['Alura Jenson', 'Robby Echo']","['Cougar', 'Big Tits', 'BBW', 'MILF', 'Thick',...","['Cougar Seduction', 'Mega Boobs', 'Big Tits F...",94%,257802,13945022_243_f12d.jpg,1280,853
cougar,busty-milf-alura-jenson-enjoying-a-hard-dickin...,Brazzers Network,"['Alura Jenson', 'Robby Echo']","['Cougar', 'Big Tits', 'BBW', 'MILF', 'Thick',...","['Cougar Seduction', 'Mega Boobs', 'Big Tits F...",94%,257802,13945022_324_65b2.jpg,853,1280


In [7]:
enhanced_df.to_csv('image_high_res.csv', index=True)