### Purpose of the notebook: 
- In this notebook I prepare the data for the InterfaceGAN approach. 
- Since this approach only allows for binary manipulations (e.g. day dress - evening dress), I will prepare the data for all combinations of a specific attribute.


In [1]:
import platform
import os
# Select machine-specific project locations from the operating system name
# reported by platform.system() ('Darwin' is what macOS reports).
if platform.system() == 'Darwin':
    DATA_PATH = "/Users/maltegenschow/Documents/Uni/Thesis/Data.nosync"
    ROOT_PATH = "/Users/maltegenschow/Documents/Uni/Thesis/Thesis"
elif platform.system() == 'Linux':
    DATA_PATH = "/pfs/work7/workspace/scratch/tu_zxmav84-thesis/Data.nosync"
    ROOT_PATH = "/pfs/work7/workspace/scratch/tu_zxmav84-thesis/Thesis"
else:
    # Fail here with a clear message instead of letting a later cell die with
    # a NameError on DATA_PATH / ROOT_PATH.
    raise RuntimeError(
        f"Unsupported platform {platform.system()!r}: "
        "add DATA_PATH and ROOT_PATH for this machine."
    )

# Remember the directory the kernel was started in.
current_wd = os.getcwd()

In [2]:
from glob import glob 
import pandas as pd
import torch
import numpy as np
import itertools

In [3]:
# Root directory that prepare_data() writes its per-attribute outputs into.
# NOTE: the value ends with "/" and consumers append "/<attribute>/...", so the
# resulting paths contain a doubled slash.
save_path = f"{DATA_PATH}/Models/InterfaceGAN/Inputs/e4e_00005/"
# Serialized latents that are read with torch.load() in the next cell.
latents_path = f"{DATA_PATH}/Models/e4e/00005_snapshot_1200/inversions/latents_dict.pt"

In [4]:
# Load in Latents (looked up by SKU in prepare_data via latents[sku]).
# map_location="cpu" lets a file that was saved from a GPU session be loaded on
# a machine without CUDA, and keeps the tensors on the CPU, which is what the
# later conversion to NumPy arrays requires.
# NOTE(review): torch.load unpickles the file - only load latents files that
# come from a trusted source.
latents = torch.load(latents_path, map_location="cpu")

In [5]:
# Import metadata: transpose the parsed JSON so that its former column labels
# become the rows, then expose that index as a regular `sku` column.
meta = (
    pd.read_json(f'{DATA_PATH}/Zalando_Germany_Dataset/dresses/metadata/dresses_metadata.json')
    .T
    .reset_index()
    .rename(columns={'index': 'sku'})
)

# The two values below contain "/", which would cause problems once attribute
# values are used inside output paths, so they are replaced by slash-free names.
meta['category'] = meta['category'].replace({'Cocktail dress / Party dress': 'Cocktail dress'})
meta['sleeve_length'] = meta['sleeve_length'].replace({'3/4 length': '34 length'})

# Show the first rows as a quick sanity check.
meta.head(3)

Unnamed: 0,sku,name,sku_base,sku_color_code,url,brand,original_price,current_price,brand_url,category,...,fabric,fit,neckline,pattern,collar,length,shape,sleeve_length,thumbnail_url,packshot_url
0,AN621C22S-O11,Jersey dress - brown,AN621C22S,O11,https://en.zalando.de/anna-field-shift-dress-b...,Anna Field,39.99,39.99,https://en.zalando.de/anna-field/,Shift dress,...,Jersey,Slim Fit,,Plain,Standing collar,Calf-length,Body-hugging,Short,https://img01.ztat.net/article/spp-media-p1/fb...,https://img01.ztat.net/article/spp-media-p1/c8...
1,BU321C01G-K11,Jersey dress - marine/bedruckt,BU321C01G,K11,https://en.zalando.de/buffalo-jersey-dress-mar...,Buffalo,39.99,39.99,https://en.zalando.de/buffalo/,Jersey dress,...,Jersey,Regular Fit,Low-cut v-neck,Print,,Knee-length,Fitted,Sleeveless,https://img01.ztat.net/article/spp-media-p1/50...,https://img01.ztat.net/article/spp-media-p1/17...
2,JY121C0TB-A11,JDYCARLA CATHINKA DRESS - Jersey dress - cloud...,JY121C0TB,A11,https://en.zalando.de/jdy-carla-cathinka-dress...,JDY,34.99,34.99,https://en.zalando.de/jacqueline-de-yong/,Jersey dress,...,,Regular Fit,Crew neck,Plain,Standing collar,Knee-length,Flared,Short,https://img01.ztat.net/article/spp-media-p1/20...,https://img01.ztat.net/article/spp-media-p1/20...


In [6]:
def prepare_data(attribute: str, meta_df=None, latents_dict=None, out_dir=None) -> None:
    """Write one binary InterfaceGAN dataset per pair of values of `attribute`.

    For every unordered pair of distinct, non-null values of `attribute`, the
    rows carrying one of the two values are selected. The first value of the
    pair is labelled 0 and the second 1. Into
    `<out_dir>/<attribute>/<value0>_<value1>/` the function writes:

    - `latents_dim_<i>.npy`: one array per index `i` along the first axis of a
      squeezed latent, holding that slice for every selected row
    - `target.npy`: the labels as a column vector of shape (n_rows, 1)
    - `metadata.csv`: the `sku`, attribute and `label` columns

    A `summary_stats.csv` with sample counts per pair is written to
    `<out_dir>/<attribute>/`.

    Parameters
    ----------
    attribute : str
        Name of the metadata column to build the pairs from.
    meta_df : pandas.DataFrame, optional
        Metadata with a `sku` column and the `attribute` column. Defaults to
        the notebook-level `meta`.
    latents_dict : mapping, optional
        Maps each SKU to its latent tensor. Defaults to the notebook-level
        `latents`.
        # assumes each latent has a leading singleton axis, i.e. shape
        # (1, n_layers, dim) - TODO confirm
    out_dir : str, optional
        Root output directory. Defaults to the notebook-level `save_path`.

    Raises
    ------
    KeyError
        If a selected SKU has no entry in `latents_dict`.
    """
    # Fall back to the notebook-level objects so existing calls keep working.
    meta_df = meta if meta_df is None else meta_df
    latents_dict = latents if latents_dict is None else latents_dict
    out_dir = save_path if out_dir is None else out_dir

    # Create the attribute directory up front: the summary file is written
    # there even when no pair exists (fewer than two distinct values).
    attribute_dir = os.path.join(out_dir, attribute)
    os.makedirs(attribute_dir, exist_ok=True)

    # Subset to rows where the attribute is not none
    meta_sub = meta_df[meta_df[attribute].notna()]

    # Get all combinations of the attribute
    all_combinations = list(itertools.combinations(meta_sub[attribute].unique(), 2))
    print(f"Found {len(all_combinations)} combinations of {attribute} attributes")

    summary_stats = {}
    for combination in all_combinations:
        # Subset to correct attribute values and create target data
        subset = meta_sub[meta_sub[attribute].isin(combination)][['sku', attribute]].copy()
        labels = {combination[0]: 0, combination[1]: 1}
        subset['label'] = subset[attribute].map(labels)
        target = subset['label'].to_numpy().reshape(-1, 1)

        # Stack the latents of all selected rows into (n_rows, n_layers, dim).
        # detach()/cpu() make the later NumPy conversion safe for tensors that
        # track gradients or live on a GPU.
        latents_subset = torch.stack(
            [latents_dict[sku].squeeze(0) for sku in subset['sku']]
        ).detach().cpu()
        assert latents_subset.shape[0] == target.shape[0]

        # NOTE(review): a value containing "/" produces a nested directory here.
        combination_dir = os.path.join(attribute_dir, f"{combination[0]}_{combination[1]}")
        os.makedirs(combination_dir, exist_ok=True)

        # Split dimensions and save; the layer count is taken from the data
        # instead of being fixed to 16.
        for i in range(latents_subset.shape[1]):
            np.save(
                os.path.join(combination_dir, f"latents_dim_{i}.npy"),
                latents_subset[:, i, :].numpy(),
            )

        np.save(os.path.join(combination_dir, "target.npy"), target)
        subset.to_csv(os.path.join(combination_dir, "metadata.csv"), index=False)

        summary_stats[combination] = {
            'num_samples': target.shape[0],
            'num_positives': target.sum(),
            'num_negatives': (1 - target).sum()
        }

    # Save summary stats
    stats = pd.DataFrame(summary_stats).T
    stats.to_csv(os.path.join(attribute_dir, "summary_stats.csv"))

In [7]:
# attributes = ['category', 'sleeve_length', 'color']

# for attribute in attributes:
#     prepare_data(attribute)

Found 28 combinations of category attributes
Found 55 combinations of sleeve_length attributes
Found 91 combinations of color attributes


### Brands
- Subset metadata to include only brands that have more than 80 images in the dataset.


In [7]:
# Number of metadata rows per brand, ordered from most to least frequent.
counts = meta['brand'].value_counts().sort_values(ascending=False)
# Brands with more than 80 rows.
subset_brands = counts[counts > 80].index.tolist()
# Overwrite `meta` with the filtered rows, so everything that reads `meta`
# afterwards only sees these brands.
meta = meta.loc[meta['brand'].isin(subset_brands)]

In [9]:
# Build one binary dataset per pair of the brands kept by the filter above.
prepare_data(attribute='brand')

Found 703 combinations of brand attributes
