In [1]:
import pandas as pd
import random

In [2]:
def is_a_full_combination(combination):
    """Tell whether a collection of category labels forms a complete outfit.

    A combination counts as complete when either
      * it contains 'shirts_tops' together with at least one of 'pants',
        'skirts' or 'shorts', or
      * it contains 'dresses' and none of 'shirts_tops', 'pants', 'shorts'
        or 'skirts'.
    The 'shirts_tops' rule is evaluated first, so a combination holding both
    'shirts_tops' and 'dresses' is judged by the 'shirts_tops' rule alone.

    Args:
        combination: Container of category labels (membership is tested
            with ``in``).

    Returns:
        True if the combination is complete, False otherwise.
    """
    bottoms = ('pants', 'skirts', 'shorts')

    # A top needs a matching bottom.
    if 'shirts_tops' in combination:
        return any(bottom in combination for bottom in bottoms)

    # Without a top, only a dress can make the outfit complete.
    if 'dresses' not in combination:
        return False

    # A dress must not be paired with a separate top or bottom.
    clashing = ('shirts_tops',) + bottoms
    return not any(piece in combination for piece in clashing)

In [3]:
# Function to extract category from an image path
def extract_category(image_path):
    """Return the category folder that directly contains the image file.

    Image paths in this notebook are assembled as
    ``<parent_dir>/<sub_dir>/<category>/<image_signature>.jpg``, so the
    category is the second-to-last ``/``-separated component. Indexing from
    the end keeps the result correct however deep (or absolute) the parent
    directory is; a fixed index counted from the front returns a directory
    of the prefix instead (e.g. ``kaggle`` for ``/kaggle/input/...``), which
    makes every item appear to share one category.

    Args:
        image_path: Image path using ``/`` as the separator.

    Returns:
        The name of the directory the image file sits in.

    Raises:
        IndexError: If ``image_path`` contains no ``/``.
    """
    return image_path.split('/')[-2]

# Function to replace only one item with an item from another list with the same category
def replace_one_item_with_another_list(df):
    """Return a copy of ``df`` in which each row has one image path swapped.

    For every row, one position of its ``image_paths`` list is picked at
    random and replaced by a randomly chosen item of the same category (as
    reported by ``extract_category``) taken from a different row. Rows whose
    list is empty, or whose picked category occurs in no other row, are left
    unchanged. ``df`` and the lists it holds are not modified.

    Args:
        df: DataFrame with an ``image_paths`` column holding one list of
            image paths per row.

    Returns:
        A new DataFrame with the same index and columns as ``df``.
    """
    replaced_df = df.copy()
    all_lists = list(df['image_paths'])

    # Group every row's items by category once, instead of rescanning all
    # rows (and re-extracting every category) for each row being processed.
    items_by_category = []      # row position -> {category: [paths in row order]}
    positions_by_category = {}  # category -> [row positions, in row order]
    for position, image_paths in enumerate(all_lists):
        grouped = {}
        for image_path in image_paths:
            grouped.setdefault(extract_category(image_path), []).append(image_path)
        items_by_category.append(grouped)
        for category in grouped:
            positions_by_category.setdefault(category, []).append(position)

    for position, (index, image_paths) in enumerate(zip(df.index, all_lists)):
        # Nothing to swap in an empty list (randint(0, -1) would raise).
        if len(image_paths) == 0:
            continue

        # Randomly select the slot to overwrite and look up its category.
        index_to_replace = random.randint(0, len(image_paths) - 1)
        category_to_replace = extract_category(image_paths[index_to_replace])

        # Candidate donor rows: every *other* row holding that category.
        # Rows are told apart by position, so this also works when the
        # index labels are not 0..n-1.
        other_positions = [
            p for p in positions_by_category[category_to_replace] if p != position
        ]

        # If no other row has the same category, skip replacement.
        if not other_positions:
            continue

        # Choose a donor row, then one of its items of the same category.
        donor_items = items_by_category[random.choice(other_positions)]
        replacement_item = random.choice(donor_items[category_to_replace])

        # Build a fresh list so the list stored in ``df`` stays untouched.
        replaced_list = list(image_paths)
        replaced_list[index_to_replace] = replacement_item
        # NOTE(review): the write-back is by index label, so labels are
        # presumably unique here - confirm against the callers.
        replaced_df.at[index, 'image_paths'] = replaced_list

    return replaced_df

In [4]:
def process_dataframe(df_path, parent_dir="../complete-the-look-dataset/items", sub_dir="train"):
    """Load an item CSV and group its image paths per ``image_signature``.

    Steps:
      1. Read the CSV and build one image path per row as
         ``<parent_dir>/<sub_dir>/<label>/<image_signature>.jpg``.
      2. Drop every ``image_signature`` that has at least one row whose
         label is in the excluded list.
      3. Keep only signatures with 3 to 5 rows whose label list passes
         ``is_a_full_combination``.
      4. Collect the remaining image paths into one list per signature.

    Args:
        df_path: Path of a CSV file with ``label`` and ``image_signature``
            columns.
        parent_dir: Directory prefix of the generated image paths.
        sub_dir: Sub-directory placed between ``parent_dir`` and the label.

    Returns:
        DataFrame with the columns ``image_signature`` and ``image_paths``
        (a list of paths per signature).
    """
    items = pd.read_csv(df_path, engine='c')
    path_prefix = parent_dir + "/" + sub_dir + "/"
    items['image_paths'] = path_prefix + items['label'] + "/" + items['image_signature'] + ".jpg"

    # A signature is discarded entirely as soon as one of its rows carries
    # an excluded label.
    excluded_labels = ['belts', 'gloves_mittens', 'jumpsuits_rompers', 'neckties', 'rings', 'socks', 'stockings', 'swimwear']
    has_excluded_label = items['label'].isin(excluded_labels)
    tainted_signatures = items[has_excluded_label]['image_signature']
    items = items[~items['image_signature'].isin(tainted_signatures)]

    # Keep signatures with 3 to 5 rows that also form a full combination.
    labels_per_signature = items.groupby('image_signature')['label'].apply(list).reset_index()
    group_sizes = labels_per_signature['label'].apply(len)
    is_full = labels_per_signature['label'].apply(is_a_full_combination)
    kept = labels_per_signature[(group_sizes >= 3) & (group_sizes <= 5) & is_full]
    items = items[items['image_signature'].isin(kept['image_signature'])]

    # One row per signature, holding the list of its image paths.
    paths_per_signature = items.groupby('image_signature')['image_paths'].apply(list)
    return paths_per_signature.reset_index()

In [5]:
# Genuine training outfits: build the per-signature path lists, tag them as
# positives (label 1) and discard the grouping key when it is present.
true_train_df = (
    process_dataframe(
        df_path="../complete-the-look-dataset/datasets/preprocessed/true_train.csv",
        parent_dir="/kaggle/input/ctl-items/items",
        sub_dir="train",
    )
    .assign(label=1)
    .drop(columns='image_signature', errors='ignore')
)

In [6]:
# Seed the RNG so the sampled negatives are the same on every
# Restart & Run All; unseeded, each run would write a different dataset.
random.seed(42)
# Negative training samples: every genuine outfit with one item swapped,
# tagged with label 0.
fake_train_df = replace_one_item_with_another_list(true_train_df)
fake_train_df['label'] = 0

In [7]:
# Stack 2 dataframes vertically
train_df = pd.concat([true_train_df, fake_train_df])
train_df.to_csv("grouped_train.csv", index=False)
train_df.head()

Unnamed: 0,image_paths,label
0,[/kaggle/input/ctl-items/items/train/handbags/...,1
1,[/kaggle/input/ctl-items/items/train/pants/04f...,1
2,[/kaggle/input/ctl-items/items/train/coats_jac...,1
3,[/kaggle/input/ctl-items/items/train/handbags/...,1
4,[/kaggle/input/ctl-items/items/train/dresses/0...,1


In [8]:
# Genuine test outfits: build the per-signature path lists, tag them as
# positives (label 1) and discard the grouping key when it is present.
true_test_df = (
    process_dataframe(
        df_path="../complete-the-look-dataset/datasets/preprocessed/true_test.csv",
        parent_dir="/kaggle/input/ctl-items/items",
        sub_dir="test",
    )
    .assign(label=1)
    .drop(columns='image_signature', errors='ignore')
)

In [9]:
# Seed the RNG so the sampled negatives are the same on every
# Restart & Run All; unseeded, each run would write a different dataset.
random.seed(42)
# Negative test samples: every genuine outfit with one item swapped,
# tagged with label 0.
fake_test_df = replace_one_item_with_another_list(true_test_df)
fake_test_df['label'] = 0

In [10]:
# Stack 2 dataframes vertically
test_df = pd.concat([true_test_df, fake_test_df])
test_df.to_csv("grouped_test.csv", index=False)
test_df.head()

Unnamed: 0,image_paths,label
0,[/kaggle/input/ctl-items/items/test/shirts_top...,1
1,[/kaggle/input/ctl-items/items/test/shirts_top...,1
2,[/kaggle/input/ctl-items/items/test/shoes/0001...,1
3,[/kaggle/input/ctl-items/items/test/sunglasses...,1
4,[/kaggle/input/ctl-items/items/test/coats_jack...,1
