In [1]:
import os
import re
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import concurrent.futures

SRC_DIR = Path('/Volumes/external_drive')
ACRONYMS = ['69', 'BBC', 'BBW', 'BDSM', 'CFNM', 'DP', 'GILF', 'MILF', 'PAWG', 'POV']

In [2]:
def process_string(input_string):
    word_list = re.findall('[A-Z][^A-Z]*', input_string)

    joined_words = []
    temp_word = ""
    for word in word_list:
        if word.endswith(" "):
            temp_word += word
        else:
            if temp_word:
                joined_words.append(temp_word + word)
                temp_word = ""
            else:
                joined_words.append(word)

    final_words = []
    temp_word = ""
    for word in joined_words:
        if len(word) == 1 and word.isupper():
            temp_word += word
        elif len(word) > 1 and word[1] == " ":
            temp_word += word
        elif len(word) > 1 and word[-2] == " ":
            final_words.append(temp_word)
            temp_word = ""
            temp_word += word

        else:
            if temp_word:
                final_words.append(temp_word)
                temp_word = ""
            final_words.append(word)
    if temp_word:
        final_words.append(temp_word)

    word_list = [w for w in final_words if w != ""]

    new_list = []
    for word in word_list:
        match = re.match(r"([a-zA-Z]+)([0-9]+)", word)
        if match:
            new_list.append(match.group(1))
            new_list.append(int(match.group(2)))
        else:
            new_list.append(word)
    return new_list


def split_and_verify_acronyms(words_list, acronyms):
    out = []

    for word in words_list:
        word = str(word)
        if word.isupper():
            for acronym in acronyms:
                if acronym in word:
                    out.append(acronym)
                    word = word.replace(acronym, "")
        out.append(word)

    return [w for w in out if w != ""]


def split_and_unify_tags(tags_list: list[str]):
    out = []
    acronyms = []
    for tag in tags_list:
        for acronym in ACRONYMS:
            if acronym in tag:
                tag = tag.replace(acronym, "").strip()
                if acronym not in acronyms:
                    acronyms.append(acronym)
        out.append(tag)

    result = list(set(out + acronyms))
    for tag in result:
        if tag in out:
            for other_tag in out:
                if tag in other_tag and tag != other_tag:
                    result.remove(tag)
                    break

    return result


def process_line(line_string: str) -> list[str]:
    words_list = process_string(line_string)
    words_list = split_and_verify_acronyms(words_list, ACRONYMS)
    return [word.replace("  ", " ").strip() for word in words_list]

In [3]:
def parse_gallery_info(filepath: Path):
    # Get all the filenames as list
    filenames = []
    for f in os.listdir(filepath.parent):
        if not f.startswith(".") and not f.endswith(".txt"):
            filenames.append(f)

    with open(filepath, 'r') as file:
        content = file.readlines()

    info = {}
    spl = filepath.as_posix().split("/")
    info['category'] = spl[3]
    info['gallery_name'] = spl[4]
    info['filenames'] = filenames
    for line in content:
        if line.startswith("Channel"):
            info['channel'] = line.removeprefix("Channel: : ").removesuffix("\n")
        elif line.startswith("Models"):
            models = process_line(line.removeprefix("Models:: ").removesuffix(" + Suggest\n"))
            if "Suggest" in models:
                models.remove("Suggest")
            info['models'] = models
        elif line.startswith("Categories"):
            info['categories'] = process_line(line.removeprefix("Categories:: ").removesuffix(" + Suggest\n"))
        elif line.startswith("Tags List"):
            info['tags_list'] = split_and_unify_tags(process_line(line.removeprefix("Tags List:: ").removesuffix("\n")))
        elif line.startswith("Stats"):
            splitted = line.removeprefix("Stats:: ").removesuffix("\n").split("; ")

            try:
                info['rating'] = splitted[0].removeprefix("Rating: ")
            except:
                info['rating'] = None

            try:
                info['views'] = splitted[1].removeprefix("Views: ")
            except:
                info['views'] = None

    return info

In [4]:
def process_gallery(category_name: str, gallery: str):
    gallery_info_path = SRC_DIR / category_name / gallery / 'gallery_info.txt'
    if gallery_info_path.exists():
        gallery_info = parse_gallery_info(gallery_info_path)
        return gallery_info

In [5]:
def process_category(category_name: str):
    category_path = SRC_DIR / category_name
    category_galleries = [
        gallery
        for gallery in os.listdir(category_path)
        if (category_path / gallery).is_dir()
    ]

    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
        future_to_gallery = [
            executor.submit(process_gallery, category_name, gallery)
            for gallery in category_galleries
        ]

        for future in tqdm(
                concurrent.futures.as_completed(future_to_gallery),
                total=len(future_to_gallery),
                desc=f"Processing {category_name}",
                unit="gallery"
        ):
            gallery_info = future.result()
            if gallery_info:
                results.append(gallery_info)

    return results

In [6]:
def crawl_images_folder():
    categories = [
        category for category in os.listdir(SRC_DIR)
        if (SRC_DIR / category).is_dir() and not category.startswith(".")
    ]

    all_galleries_info = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_category = [
            executor.submit(process_category, category)
            for category in categories
        ]
        for future in tqdm(
                concurrent.futures.as_completed(future_to_category),
                total=len(future_to_category),
                desc="Processing categories", unit="category"
        ):
            category_galleries_info = future.result()
            all_galleries_info.extend(category_galleries_info)

    return all_galleries_info

In [7]:
all_galleries_info = crawl_images_folder()
df = pd.DataFrame(all_galleries_info)

Processing categories:   0%|          | 0/83 [00:00<?, ?category/s]
Processing selfie:   0%|          | 0/998 [00:00<?, ?gallery/s][A

Processing thong:   0%|          | 0/1000 [00:00<?, ?gallery/s][A[A
Processing selfie:  19%|█▊        | 185/998 [00:00<00:00, 1841.27gallery/s][A

Processing thong:  15%|█▌        | 154/1000 [00:00<00:00, 1458.79gallery/s][A[A


Processing fake-tits:   0%|          | 0/1000 [00:00<?, ?gallery/s][A[A[A



Processing skirt:   0%|          | 0/1000 [00:00<?, ?gallery/s][A[A[A[A
Processing selfie:  37%|███▋      | 370/998 [00:00<00:00, 1270.24gallery/s][A


Processing fake-tits:  12%|█▏        | 121/1000 [00:00<00:00, 1200.38gallery/s][A[A[A

Processing thong:  30%|███       | 300/1000 [00:00<00:00, 1002.22gallery/s][A[A



Processing skirt:  12%|█▏        | 119/1000 [00:00<00:00, 1170.06gallery/s][A[A[A[A
Processing selfie:  51%|█████     | 508/998 [00:00<00:00, 1087.95gallery/s][A

Processing thong:  41%|████      | 408/1000 [00:00

In [8]:
df.shape

(82588, 9)

In [9]:
# remove gallery name duplicates
df.drop_duplicates(subset=['gallery_name'], inplace=True)
df.shape

(71989, 9)

# ENHANCED DATAFRAME

In [10]:
from PIL import Image


def process_filenames_to_mini_df(row):
    category = row['category']
    gallery_name = row['gallery_name']
    gallery_dicts = []

    def _process_image(category: str, gallery_name: str, filename: str) -> None:
        file_path = os.path.join(SRC_DIR, category, gallery_name, filename)
        try:
            with Image.open(file_path) as img:
                height, width = img.size
                new_row = row.to_dict()
                new_row['filename'] = filename
                new_row['width'] = width
                new_row['height'] = height
                gallery_dicts.append(new_row)
        except Exception:
            print(f"Error processing {file_path}")
            os.remove(file_path)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(_process_image, category, gallery_name, filename)
            for filename in row['filenames']
        ]
        for future in concurrent.futures.as_completed(futures):
            future.result()

    return pd.DataFrame(gallery_dicts)

In [11]:
futures = []

mini_dfs = []
for _, row in tqdm(df.iterrows()):
    mini_dfs.append(process_filenames_to_mini_df(row))

71989it [04:08, 289.24it/s]


In [12]:
enhanced_df = pd.concat(mini_dfs)

In [13]:
enhanced_df.shape

(1346913, 12)

In [14]:
enhanced_df.to_csv('filenames_dataset.csv', index=False)

In [15]:
all_tags = [sublist for sublist in df['tags_list'].values if sublist != [] and not isinstance(sublist, float)]
merged_list = sorted([item for sublist in all_tags for item in sublist])

# Dataset inspection

In [16]:
def strip_channel_name(channel_name: str):
    if isinstance(channel_name, str):
        return channel_name.split("Pics")[0].strip()
    return channel_name

In [17]:
enhanced_df = pd.read_csv('filenames_dataset.csv', index_col=0)
enhanced_df.drop(['filenames'], axis=1, inplace=True)
enhanced_df['channel'] = enhanced_df.channel.apply(strip_channel_name)

In [18]:
enhanced_df.head()

Unnamed: 0_level_0,gallery_name,channel,models,categories,tags_list,rating,views,filename,width,height
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
bondage,blonde-chastity-lynn-gets-bound-and-toyed-to-o...,Kink,['Chastity Lynn'],"['Socks', 'Emo', 'BDSM', 'Shorts', 'Clothed', ...","['Teen Bondage', 'Sneakers', 'BDSM', 'Big Ass ...",100%,11710,15595757_087_fa5c.jpg,675,1200
bondage,blonde-chastity-lynn-gets-bound-and-toyed-to-o...,Kink,['Chastity Lynn'],"['Socks', 'Emo', 'BDSM', 'Shorts', 'Clothed', ...","['Teen Bondage', 'Sneakers', 'BDSM', 'Big Ass ...",100%,11710,15595757_101_b4af.jpg,675,1200
bondage,blonde-chastity-lynn-gets-bound-and-toyed-to-o...,Kink,['Chastity Lynn'],"['Socks', 'Emo', 'BDSM', 'Shorts', 'Clothed', ...","['Teen Bondage', 'Sneakers', 'BDSM', 'Big Ass ...",100%,11710,15595757_013_b31a.jpg,1200,800
bondage,blonde-chastity-lynn-gets-bound-and-toyed-to-o...,Kink,['Chastity Lynn'],"['Socks', 'Emo', 'BDSM', 'Shorts', 'Clothed', ...","['Teen Bondage', 'Sneakers', 'BDSM', 'Big Ass ...",100%,11710,15595757_042_a9d1.jpg,1200,800
bondage,blonde-chastity-lynn-gets-bound-and-toyed-to-o...,Kink,['Chastity Lynn'],"['Socks', 'Emo', 'BDSM', 'Shorts', 'Clothed', ...","['Teen Bondage', 'Sneakers', 'BDSM', 'Big Ass ...",100%,11710,15595757_067_19dc.jpg,675,1200


In [19]:
# filter out rows where width and height are less than 768
enhanced_df = enhanced_df[(enhanced_df['width'] >= 768) & (enhanced_df['height'] >= 768)]
enhanced_df

Unnamed: 0_level_0,gallery_name,channel,models,categories,tags_list,rating,views,filename,width,height
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
bondage,blonde-chastity-lynn-gets-bound-and-toyed-to-o...,Kink,['Chastity Lynn'],"['Socks', 'Emo', 'BDSM', 'Shorts', 'Clothed', ...","['Teen Bondage', 'Sneakers', 'BDSM', 'Big Ass ...",100%,11710,15595757_013_b31a.jpg,1200,800
bondage,blonde-chastity-lynn-gets-bound-and-toyed-to-o...,Kink,['Chastity Lynn'],"['Socks', 'Emo', 'BDSM', 'Shorts', 'Clothed', ...","['Teen Bondage', 'Sneakers', 'BDSM', 'Big Ass ...",100%,11710,15595757_042_a9d1.jpg,1200,800
bondage,blonde-chastity-lynn-gets-bound-and-toyed-to-o...,Kink,['Chastity Lynn'],"['Socks', 'Emo', 'BDSM', 'Shorts', 'Clothed', ...","['Teen Bondage', 'Sneakers', 'BDSM', 'Big Ass ...",100%,11710,15595757_020_0776.jpg,800,1200
bondage,blonde-chastity-lynn-gets-bound-and-toyed-to-o...,Kink,['Chastity Lynn'],"['Socks', 'Emo', 'BDSM', 'Shorts', 'Clothed', ...","['Teen Bondage', 'Sneakers', 'BDSM', 'Big Ass ...",100%,11710,15595757_003_7522.jpg,1200,800
bondage,blonde-chastity-lynn-gets-bound-and-toyed-to-o...,Kink,['Chastity Lynn'],"['Socks', 'Emo', 'BDSM', 'Shorts', 'Clothed', ...","['Teen Bondage', 'Sneakers', 'BDSM', 'Big Ass ...",100%,11710,15595757_011_873b.jpg,1200,800
...,...,...,...,...,...,...,...,...,...,...
cosplay,redhead-cosplay-girl-kayla-kiss-showing-bare-b...,Kayla Kiss Official,['Kayla Kiss'],"['Cosplay', 'Amateur', 'Big Tits', 'Redhead', ...","['Big Tits Amateur', 'Redhead Big Tits', 'Upsk...",92%,48707,88810420_004_2fdc.jpg,1200,800
cosplay,redhead-cosplay-girl-kayla-kiss-showing-bare-b...,Kayla Kiss Official,['Kayla Kiss'],"['Cosplay', 'Amateur', 'Big Tits', 'Redhead', ...","['Big Tits Amateur', 'Redhead Big Tits', 'Upsk...",92%,48707,88810420_005_9580.jpg,1200,800
cosplay,redhead-cosplay-girl-kayla-kiss-showing-bare-b...,Kayla Kiss Official,['Kayla Kiss'],"['Cosplay', 'Amateur', 'Big Tits', 'Redhead', ...","['Big Tits Amateur', 'Redhead Big Tits', 'Upsk...",92%,48707,88810420_010_d053.jpg,1200,800
cosplay,redhead-cosplay-girl-kayla-kiss-showing-bare-b...,Kayla Kiss Official,['Kayla Kiss'],"['Cosplay', 'Amateur', 'Big Tits', 'Redhead', ...","['Big Tits Amateur', 'Redhead Big Tits', 'Upsk...",92%,48707,88810420_011_ba1b.jpg,1200,800


In [20]:
enhanced_df.to_csv('image_high_res.csv', index=True)