In [3]:
import os
import re
import cv2
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import concurrent.futures

SRC_DIR = Path('/Volumes/external_drive')
ACRONYMS = ['69', 'BBC', 'BBW', 'BDSM', 'CFNM', 'DP', 'GILF', 'MILF', 'PAWG', 'POV']

In [4]:
def process_string(input_string):
    word_list = re.findall('[A-Z][^A-Z]*', input_string)

    joined_words = []
    temp_word = ""
    for word in word_list:
        if word.endswith(" "):
            temp_word += word
        else:
            if temp_word:
                joined_words.append(temp_word + word)
                temp_word = ""
            else:
                joined_words.append(word)

    final_words = []
    temp_word = ""
    for word in joined_words:
        if len(word) == 1 and word.isupper():
            temp_word += word
        elif len(word) > 1 and word[1] == " ":
            temp_word += word
        elif len(word) > 1 and word[-2] == " ":
            final_words.append(temp_word)
            temp_word = ""
            temp_word += word

        else:
            if temp_word:
                final_words.append(temp_word)
                temp_word = ""
            final_words.append(word)
    if temp_word:
        final_words.append(temp_word)

    word_list = [w for w in final_words if w != ""]

    new_list = []
    for word in word_list:
        match = re.match(r"([a-zA-Z]+)([0-9]+)", word)
        if match:
            new_list.append(match.group(1))
            new_list.append(int(match.group(2)))
        else:
            new_list.append(word)
    return new_list


def split_and_verify_acronyms(words_list, acronyms):
    out = []

    for word in words_list:
        word = str(word)
        if word.isupper():
            for acronym in acronyms:
                if acronym in word:
                    out.append(acronym)
                    word = word.replace(acronym, "")
        out.append(word)

    return [w for w in out if w != ""]


def split_and_unify_tags(tags_list: list[str]):
    out = []
    acronyms = []
    for tag in tags_list:
        for acronym in ACRONYMS:
            if acronym in tag:
                tag = tag.replace(acronym, "").strip()
                if acronym not in acronyms:
                    acronyms.append(acronym)
        out.append(tag)

    result = list(set(out + acronyms))
    for tag in result:
        if tag in out:
            for other_tag in out:
                if tag in other_tag and tag != other_tag:
                    result.remove(tag)
                    break

    return result


def process_line(line_string: str) -> list[str]:
    words_list = process_string(line_string)
    words_list = split_and_verify_acronyms(words_list, ACRONYMS)
    return [word.replace("  ", " ").strip() for word in words_list]

In [5]:
def parse_gallery_info(filepath: Path):
    # Get all the filenames as list
    filenames = []
    for f in os.listdir(filepath.parent):
        if not f.startswith(".") and not f.endswith(".txt"):
            filenames.append(f)

    with open(filepath, 'r') as file:
        content = file.readlines()

    info = {}
    spl = filepath.as_posix().split("/")
    info['category'] = spl[3]
    info['gallery_name'] = spl[4]
    info['filenames'] = filenames
    for line in content:
        if line.startswith("Channel"):
            info['channel'] = line.removeprefix("Channel: : ").removesuffix("\n")
        elif line.startswith("Models"):
            models = process_line(line.removeprefix("Models:: ").removesuffix(" + Suggest\n"))
            if "Suggest" in models:
                models.remove("Suggest")
            info['models'] = models
        elif line.startswith("Categories"):
            info['categories'] = process_line(line.removeprefix("Categories:: ").removesuffix(" + Suggest\n"))
        elif line.startswith("Tags List"):
            info['tags_list'] = split_and_unify_tags(process_line(line.removeprefix("Tags List:: ").removesuffix("\n")))
        elif line.startswith("Stats"):
            splitted = line.removeprefix("Stats:: ").removesuffix("\n").split("; ")

            try:
                info['rating'] = splitted[0].removeprefix("Rating: ")
            except:
                info['rating'] = None

            try:
                info['views'] = splitted[1].removeprefix("Views: ")
            except:
                info['views'] = None

    return info

In [6]:
def process_gallery(category_name: str, gallery: str):
    gallery_info_path = SRC_DIR / category_name / gallery / 'gallery_info.txt'
    if gallery_info_path.exists():
        gallery_info = parse_gallery_info(gallery_info_path)
        return gallery_info

In [7]:
def process_category(category_name: str):
    category_path = SRC_DIR / category_name
    category_galleries = [
        gallery
        for gallery in os.listdir(category_path)
        if (category_path / gallery).is_dir()
    ]

    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
        future_to_gallery = [
            executor.submit(process_gallery, category_name, gallery)
            for gallery in category_galleries
        ]

        for future in tqdm(
                concurrent.futures.as_completed(future_to_gallery),
                total=len(future_to_gallery),
                desc=f"Processing {category_name}",
                unit="gallery"
        ):
            gallery_info = future.result()
            if gallery_info:
                results.append(gallery_info)

    return results

In [8]:
def crawl_images_folder():
    categories = [
        category for category in os.listdir(SRC_DIR)
        if (SRC_DIR / category).is_dir() and not category.startswith(".")
    ]

    all_galleries_info = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_category = [
            executor.submit(process_category, category)
            for category in categories
        ]
        for future in tqdm(
                concurrent.futures.as_completed(future_to_category),
                total=len(future_to_category),
                desc="Processing categories", unit="category"
        ):
            category_galleries_info = future.result()
            all_galleries_info.extend(category_galleries_info)

    return all_galleries_info

In [9]:
all_galleries_info = crawl_images_folder()
df = pd.DataFrame(all_galleries_info)

Processing categories:   0%|          | 0/83 [00:00<?, ?category/s]
Processing non-nude:   0%|          | 0/1000 [00:00<?, ?gallery/s][A

Processing bondage:   0%|          | 0/1000 [00:00<?, ?gallery/s][A[A


Processing panties:   0%|          | 0/1000 [00:00<?, ?gallery/s][A[A[A
Processing non-nude:  20%|██        | 204/1000 [00:00<00:00, 2035.01gallery/s][A

Processing bondage:  12%|█▏        | 121/1000 [00:00<00:00, 1137.30gallery/s][A[A


Processing panties:  10%|▉         | 99/1000 [00:00<00:00, 980.45gallery/s][A[A[A



Processing skinny:   0%|          | 0/995 [00:00<?, ?gallery/s][A[A[A[A




Processing nurse:   0%|          | 0/1000 [00:00<?, ?gallery/s][A[A[A[A[A
Processing non-nude:  41%|████      | 408/1000 [00:00<00:00, 1123.09gallery/s][A


Processing panties:  20%|█▉        | 198/1000 [00:00<00:01, 797.26gallery/s][A[A[A



Processing skinny:  22%|██▏       | 223/995 [00:00<00:00, 2221.45gallery/s][A[A[A[A

Processing bondage:  24%|██▎      

In [10]:
# def strip_channel_name(channel_name: str):
#     if isinstance(channel_name, str):
#         return channel_name.split("Pics")[0].strip()
#     return channel_name
# 
# 
# df['channel'] = df.channel.apply(strip_channel_name)

In [11]:
# df

In [12]:
df.shape

(82588, 9)

In [15]:
df.head()

Unnamed: 0,category,gallery_name,filenames,channel,models,categories,tags_list,rating,views
0,girlfriend,atk-girlfriends-riley-star-90474599,[],ATK Girlfriends Pics,[Riley Star],"[Girlfriend, Cute, Amateur, Skinny, Teen, Whit...","[Skinny Petite Teen, Amateur Girlfriend, Vacat...",100%,40267
1,girlfriend,little-russian-girlfriend-sasha-paige-blows-a-...,[],Team Skeet Pics,"[Andrew Marshall, Sasha Paige]","[Skinny, Girlfriend, Russian, White, Teen, Cut...","[Tall Skinny Brunette, Teen Babe, Tall Skinny ...",97%,110885
2,girlfriend,amateur-honeys-kadence-and-pinky-lee-undress-b...,[],Naughty America PicsNeighbor Affair Pics,"[Kadence, Mikey Butders, Pinky Lee]","[Girlfriend, Reality, Amateur, Blonde, White, ...",,83%,12189
3,girlfriend,curvy-teen-leana-lovings-shows-off-her-big-ass...,[],ATK Petites Pics,[Leana Lovings],"[Petite, Girlfriend, Teen, Cute, Step Sister, ...","[Amateur Girlfriend, Petite Teen, Cute Amateur...",100%,48878
4,girlfriend,big-titted-french-milf-katsuni-engages-in-sexu...,[],Naughty America PicsMy Dads Hot Girlfriend Pics,"[Bill Bailey, Katsuni]","[Girlfriend, French, MILF, Mature, Ebony, Asia...","[Mature Massage, Girlfriend Pussy, French Matu...",100%,68103


# ENHANCED DATAFRAME

In [13]:
def process_filenames_to_mini_df(row):
    category = row['category']
    gallery_name = row['gallery_name']
    gallery_dicts = []

    def _process_image(category: str, gallery_name: str, filename: str) -> None:
        file_path = os.path.join(SRC_DIR, category, gallery_name, filename)
        img = cv2.imread(file_path)
        height, width, _ = img.shape
        new_row = row.to_dict()
        new_row['filename'] = filename
        new_row['width'] = width
        new_row['height'] = height
        gallery_dicts.append(new_row)
        img.close()

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(_process_image, category, gallery_name, filename)
            for filename in row['filenames']
        ]
        for future in concurrent.futures.as_completed(futures):
            future.result()

    return pd.DataFrame(gallery_dicts).drop(columns=['filenames'])

In [14]:
futures = []

mini_dfs = []
for _, row in tqdm(df.iloc[10000:10010].iterrows()):
    mini_dfs.append(process_filenames_to_mini_df(row))

0it [00:00, ?it/s]


KeyError: "['filenames'] not found in axis"

In [None]:
enhanced_df = pd.concat(mini_dfs)

In [None]:
enhanced_df.shape

In [None]:
enhanced_df.to_csv('filenames_dataset.csv', index=False)

In [None]:
all_tags = [sublist for sublist in df['tags_list'].values if sublist != [] and not isinstance(sublist, float)]
merged_list = sorted([item for sublist in all_tags for item in sublist])

# Training pipeline

In [None]:
!pip install transformers datasets torch torchvision accelerate

# Load the dataset

In [None]:
from datasets import load_dataset
from torchvision import transforms

# Load your dataset (replace 'your_dataset' with the actual dataset)
dataset = load_dataset('your_dataset')

# Define the preprocessing function for higher resolution
preprocess = transforms.Compose([
    transforms.Resize((384, 384)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])


# Apply the preprocessing function to the dataset
def preprocess_images(examples):
    examples['pixel_values'] = [preprocess(image.convert("RGB")) for image in examples['image']]
    return examples


# Preprocess the train and test sets
dataset = dataset.map(preprocess_images, batched=True)

# Load the model

In [None]:
from transformers import ViTForImageClassification, ViTFeatureExtractor

# Load the feature extractor for higher resolution
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-large-patch16-384')

# Number of classes in your dataset
num_classes = len(dataset['train'].features['label'].names)

# Load the ViT model for higher resolution image classification
model = ViTForImageClassification.from_pretrained('google/vit-large-patch16-384', num_labels=num_classes)

# Training the model

In [None]:
import torch
from torch.utils.data import DataLoader


# Convert the dataset to a format compatible with PyTorch DataLoader
def collate_fn(batch):
    return {
        'pixel_values': torch.stack([item['pixel_values'] for item in batch]),
        'labels': torch.tensor([item['label'] for item in batch])
    }


# Create DataLoader instances
train_loader = DataLoader(dataset['train'], batch_size=16, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(dataset['test'], batch_size=16, shuffle=False, collate_fn=collate_fn)

# Accelerate the training

In [None]:
from accelerate import Accelerator

# Initialize the Accelerator
accelerator = Accelerator()

# Prepare the model, optimizer, and dataloaders
model, train_loader, test_loader = accelerator.prepare(model, train_loader, test_loader)

# Define the optimizer and training loop

In [None]:
from transformers import AdamW
from tqdm.auto import tqdm

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()

    # Evaluation loop
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in test_loader:
            outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            correct += (predictions == batch['labels']).sum().item()
            total += batch['labels'].size(0)
    accuracy = correct / total
    print(f'Epoch {epoch + 1}/{num_epochs}, Accuracy: {accuracy:.4f}')