In [None]:
import pandas as pd
from pathlib import Path

from classifier.training.utils import set_seed

# NOTE(review): presumably seeds the random number generators used later in the
# notebook (e.g. by DataFrame.sample) — confirm in classifier.training.utils.
set_seed(42)

# Root directory of the image dataset; the 'positions' image folder lives under it.
SRC_DATASET_ROOT = Path('./fn-content-dataset')
# Parquet file holding the per-image label table.
SRC_DATASET_NAME = '../images__positions.parquet'

In [45]:
# Read the label table from the configured path instead of repeating the literal,
# so the location only has to be changed in the configuration cell.
df = pd.read_parquet(SRC_DATASET_NAME)
df

Unnamed: 0,file_name,1F1M_AI,1F2M_AI,1F_AI,1M_AI,2F1M_AI,2F_AI,2M_AI,3F_AI,3M_AI,...,Kneeling_AI,Laying Down_AI,MPOV_AI,Missionary_AI,Orgy_AI,Rev Cowgirl_AI,Sitting_AI,Squatting_AI,Stand Cradle_AI,Upskirt_AI
0,0.jpg,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,1.jpg,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,10.jpg,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,100.jpg,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,1000.jpg,1,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
625134,99995.jpg,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
625135,99996.jpg,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
625136,99997.jpg,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
625137,99998.jpg,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Raw `*_AI` column name -> label name applied by the rename step.
# Entries are ordered by source column name; the mapping itself is order-independent.
column = {
    '1F1M_AI': '1Female1Male',
    '1F2M_AI': '1Female2Male',
    '2F1M_AI': '2Female1Male',
    '2F_AI': '2Female',
    '3F_AI': '3Female',
    'Bent Over_AI': 'Bent_over',
    'CloseUp_AI': 'Closeup',
    'Cowgirl_AI': 'Cowgirl',
    'Doggystyle_AI': 'Doggystyle',
    'FPOV_AI': 'FemalePOV',
    'Facesitting_AI': 'Facesitting',
    'Flexible_AI': 'Flexible',
    'G Missionary_AI': 'Missionary_g',
    'Kneeling_AI': 'Kneeling',
    'Laying Down_AI': 'Laying_down',
    'MPOV_AI': 'MalePOV',
    'Missionary_AI': 'Missionary',
    'Rev Cowgirl_AI': 'Reverse_cowgirl',
    'Sitting_AI': 'Sitting',
    'Squatting_AI': 'Squatting',
    'Upskirt_AI': 'Upskirt',
}
# Remove the label columns that are excluded from this dataset.
# NOTE(review): `label_df` is not assigned in any visible cell before this one —
# confirm where it is created so the notebook survives Restart & Run All.
excluded_columns = [
    # Not dropped (left commented out):
    # '1F_AI',  # no females
    # '69_Position_AI',  # other models
    # '1M_AI',  # no males
    'Orgy_AI',
    '2M_AI',
    'Stand Cradle_AI',
    '3M_AI',
]
label_df = label_df.drop(columns=excluded_columns)
label_df

In [None]:
# Apply the `column` mapping to the column labels; names missing from the mapping are kept.
label_df = label_df.rename(columns=column)

In [None]:
# plot label distribution
import matplotlib.pyplot as plt


def plot_label_dist(label_dataframe: pd.DataFrame, name: str = '') -> pd.Series:
    """Draw a bar chart of the per-column sums and return those sums.

    Args:
        label_dataframe: Frame whose columns are summed (``sum(axis=0)``).
        name: Text appended to the plot title.

    Returns:
        The column sums, sorted in descending order.
    """
    totals = label_dataframe.sum(axis=0).sort_values(ascending=False)

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(totals.index.astype(str), totals.values)
    # Column names are long, so rotate the tick labels to keep them readable.
    ax.tick_params(axis='x', labelrotation=90)
    ax.set_xlabel('Labels')
    ax.set_ylabel('Count')
    ax.set_title(f'Label Distribution {name}')
    fig.tight_layout()
    plt.show()

    return totals


# Plot the label distribution and keep the per-label totals (sorted descending);
# later cells iterate over `filtered_counts` to decide the processing order.
filtered_counts = plot_label_dist(label_df)

# Reduce TOP N labels

In [None]:
# top_n_labels = filtered_counts[filtered_counts > 25650]
# labels_sum = label_df.sum(axis=1)
# top_n_labels_sum = label_df[top_n_labels.index].sum(axis=1)
# to_delete = labels_sum == top_n_labels_sum
# reduced_df = label_df[~to_delete]

In [None]:
# filtered_counts = plot_label_dist(reduced_df)

# Build balanced dataset column by column

In [None]:
# Labels ordered from the smallest to the largest total — the order the next cell iterates in.
filtered_counts.sort_values().index

In [None]:
import pandas as pd

# Build a label-balanced subset of `label_df`, visiting labels from rarest to most common.
#
# Selected rows are tracked by index and the complete label rows are taken from
# `reduced_df` at the end, so every selected row keeps ALL of its labels. Carrying only
# the single column being processed would leave a row's other labels as NaN (turned
# into 0 by a later fillna) and would make the "already enough of this label" check
# below unreachable.
threshold = 10000  # rows sampled per label; labels already above it are skipped
reduced_df = label_df.copy()
selected_index = reduced_df.index[:0]  # empty index of the same type as the source

for col in filtered_counts.sort_values().index:
    # Count how many of the rows selected so far already carry this label.
    col_count = int(reduced_df.loc[selected_index, col].sum())
    print(f"{col}: {col_count} out of {threshold}")
    if col_count > threshold:
        continue

    print(f"Processing {col}")

    positive_index = reduced_df.index[reduced_df[col] == 1]
    sample_size = min(len(positive_index), threshold)
    proposed_index = reduced_df.loc[positive_index, [col]].sample(sample_size).index

    # Rows picked while processing an earlier label must not be added twice.
    new_index = proposed_index.difference(selected_index)
    already_selected = len(proposed_index) - len(new_index)
    print(f"Existing indexes {already_selected}")
    print(
        f"Proposed rows reduced from {len(proposed_index)} to {len(new_index)}, "
        f"delta {already_selected}"
    )

    previous_size = len(selected_index)
    selected_index = selected_index.union(new_index)
    current_size = len(selected_index)
    print(f"Output increased from {previous_size} to {current_size}, delta {current_size - previous_size}")
    print()

# One row per selected index, with the full set of label columns.
out: pd.DataFrame = reduced_df.loc[selected_index]

In [None]:
# Display the frame produced by the sampling loop.
out

In [None]:
# Label distribution of the sampled frame.
plot_label_dist(out)

# Get only the labels below a certain threshold

In [None]:
# Attach the sampled labels to the file list (index-on-index join); missing labels become 0.
# NOTE(review): `file_names` is not assigned in any visible cell — it is expected to
# carry an `image_path` column, presumably with backslash-separated paths; confirm.
merged_ = (
    file_names
    .merge(out, left_index=True, right_index=True)
    .reset_index(drop=True)
    .fillna(0)
)
# Replace the path column with the bare file name (the part after the last backslash).
merged_ = (
    merged_
    .assign(file_name=merged_['image_path'].apply(lambda path: path.rsplit('\\', 1)[-1]))
    .drop(columns='image_path')
)
# Put `file_name` first, followed by every other column in its existing order.
labels = [name for name in merged_.columns if name != 'file_name']
merged_ = merged_[['file_name', *labels]]
merged_

In [None]:
# Label distribution of the final table, without the non-numeric file-name column.
plot_label_dist(merged_.drop(columns='file_name'))

In [None]:
# Write the final table to the working directory.
# NOTE(review): with the default index=True the integer index is written as an
# unnamed first column — confirm downstream readers expect that.
merged_.to_csv('positions_labels.csv')

In [None]:
from tqdm import tqdm
import shutil
from concurrent.futures import ThreadPoolExecutor

COMPILED_DATASET = SRC_DATASET_ROOT / 'positions'
# parents=True so the cell also works when the dataset root does not exist yet.
COMPILED_DATASET.mkdir(parents=True, exist_ok=True)


def copy_the_image(image_path: str) -> None:
    """Copy one image into ``COMPILED_DATASET`` unless it is already present.

    Args:
        image_path: Image path relative to ``SRC_DATASET_ROOT / 'positions'``; the
            same relative path is used below ``COMPILED_DATASET``.
    """
    src = SRC_DATASET_ROOT / 'positions' / image_path
    dst = COMPILED_DATASET / image_path
    # NOTE(review): COMPILED_DATASET is the same directory as the source folder, so
    # `src` and `dst` are identical and this function can never copy anything — one
    # of the two directories is most likely meant to be different. TODO confirm.
    if dst.exists():
        return

    shutil.copy(src, dst)


# `image_path` is dropped from `merged_` when `file_name` is derived, so indexing it
# here raised a KeyError; iterate over the `file_name` column instead.
with ThreadPoolExecutor(max_workers=16) as executor:
    futures = [
        executor.submit(copy_the_image, fp) for fp in merged_['file_name'].tolist()
    ]

    # result() re-raises any exception that occurred inside a worker thread.
    for future in tqdm(futures, total=len(futures)):
        _ = future.result()