In [1]:
import platform
import os
# Select the machine-specific data and repository roots from the host OS.
# Two machines are known: a macOS laptop and a Linux cluster workspace.
system_name = platform.system()
if system_name == 'Darwin':
    DATA_PATH = "/Users/maltegenschow/Documents/Uni/Thesis/Data.nosync"
    ROOT_PATH = "/Users/maltegenschow/Documents/Uni/Thesis/Thesis"
elif system_name == 'Linux':
    DATA_PATH = "/pfs/work7/workspace/scratch/tu_zxmav84-thesis/Data.nosync"
    ROOT_PATH = "/pfs/work7/workspace/scratch/tu_zxmav84-thesis/Thesis"
else:
    # Fail here with a clear message instead of a NameError in a later cell.
    raise RuntimeError(
        f"Unsupported platform {system_name!r}: define DATA_PATH and ROOT_PATH for this machine"
    )

# Directory the kernel was started in (not used by the cells shown here).
current_wd = os.getcwd()

In [2]:
import pandas as pd
import torch
import numpy as np
from tqdm import tqdm
import itertools
import pickle

### Load Metadata and Latents

In [None]:
# Load the saved e4e inversion latents. map_location="cpu" lets a file written
# on a GPU machine load on a CPU-only one, and the later `.numpy()` calls
# require CPU tensors anyway.
# NOTE(review): later cells treat this object as a dict keyed by sku - confirm.
latents = torch.load(
    f"{DATA_PATH}/Models/e4e/00005_snapshot_1200/inversions/latents_dict.pt",
    map_location="cpu",
)

# The JSON is transposed after reading; the resulting index is named 'sku'
# and moved into a regular column.
meta = (
    pd.read_json(f"{DATA_PATH}/Zalando_Germany_Dataset/dresses/metadata/dresses_metadata.json")
    .T
    .rename_axis('sku')
    .reset_index()
)

### Single-Attribute Data Prep

In [None]:
# Attribute (metadata column) to build one-hot targets for.
attribute = 'category'
meta = meta[['sku', attribute]]

# Drop observations with a missing attribute value. drop=True keeps the old
# index from being inserted as an extra 'index' column.
meta = meta[meta[attribute].notna()].reset_index(drop=True)

# Subset latents to the cleaned metadata. The set is built once so each
# membership test is O(1) instead of rebuilding a list for every key.
kept_skus = set(meta['sku'])
latents = {sku: latent for sku, latent in latents.items() if sku in kept_skus}

# Every remaining metadata row must have exactly one latent, and vice versa.
assert len(latents) == len(meta), "Mismatch between latents shape and metadata shape"

In [None]:
# Get one-hot encoded array of targets. The column order is kept so a target
# position can be mapped back to its label.
dummies = pd.get_dummies(meta[attribute])
attribute_order = list(dummies.columns)
dummies = np.array(dummies).reshape(len(meta), -1, 1)

# (n_rows, n_classes, 1) as float64 - the same values and dtype the previous
# zero-initialised buffer ended up with, without the row-by-row copy.
targets = dummies.astype(np.float64)

# Stack the latent of every row, in metadata order. Each latent must fit
# (broadcast into) a (1, 16, 512) slot.
latents_out = np.zeros([len(meta), 1, 16, 512])
for i, sku in enumerate(tqdm(meta['sku'])):
    latents_out[i, :, :, :] = latents[sku].numpy()

print(f"Latents out shape: {latents_out.shape}")
print(f"Target shape: {targets.shape}")

In [None]:
# Save Data
base_save_dir = f"{DATA_PATH}/Models/StyleFlow/Inputs/"
save_dir = f"{base_save_dir}{attribute}"
os.makedirs(save_dir, exist_ok=True)

np.save(f"{save_dir}/latents.npy", latents_out)
np.save(f"{save_dir}/targets.npy", targets)

# Persist the column order of the one-hot targets as well, mirroring the
# attributes_list.pkl written by the multi-attribute export; without it the
# saved targets cannot be mapped back to labels.
with open(f"{save_dir}/attributes_list.pkl", 'wb') as f:
    pickle.dump(attribute_order, f)

## Multi-Attribute Data Prep

In [3]:
# Metadata columns that are one-hot encoded and concatenated into one target.
cats = [
    'category',
    'sleeve_length',
    'color',
    'pattern',
]

In [4]:
# Reload both inputs from disk so this section does not depend on what the
# single-attribute cells did to `latents` and `meta`.
# map_location="cpu" lets a file written on a GPU machine load on a CPU-only
# one; the later `.numpy()` calls require CPU tensors anyway.
latents = torch.load(
    f"{DATA_PATH}/Models/e4e/00005_snapshot_1200/inversions/latents_dict.pt",
    map_location="cpu",
)
meta = (
    pd.read_json(f"{DATA_PATH}/Zalando_Germany_Dataset/dresses/metadata/dresses_metadata.json")
    .T
    .rename_axis('sku')
    .reset_index()
)
meta = meta[['sku'] + cats]

# Check how many rows have a missing value in any of the selected columns
n_incomplete = meta.isna().any(axis = 1).sum()
print(f"There are {n_incomplete} of {meta.shape[0]} rows with any missing value")
# Drop non-complete rows and renumber the index from 0
meta = meta.dropna().reset_index(drop=True)

There are 650 of 14060 rows with any missing value


In [5]:
# Get binary targets for all categories
dummies = {cat:pd.get_dummies(meta[cat]) for cat in cats}
attributes_list = [list(v.columns) for k,v in dummies.items()]
attributes_list = list(itertools.chain(*attributes_list))
print(f"Length of attribute list: {len(attributes_list)}")

# Create target matrix from dummies
dummies = [np.array(v).reshape(len(meta), -1, 1) for k,v in dummies.items()]
targets = np.concatenate(dummies, axis = 1)
print(f"Shape of targets: {targets.shape}")

# Create Latents
latents_out = np.zeros([len(meta), 1, 16, 512])
for i in tqdm(range(len(meta)), leave = False):
    sku = meta.iloc[i].sku
    latents_out[i, :,:,:] = latents[sku].numpy()
print(f"Shape of latents_out: {latents_out.shape}")

Length of attribute list: 48
Shape of targets: (13410, 48, 1)


                                                        

Shape of latents_out: (13410, 1, 16, 512)




In [6]:
# Write the multi-attribute inputs to their own sub-folder.
name = 'multiple'
base_save_dir = f"{DATA_PATH}/Models/StyleFlow/Inputs/"
out_dir = f"{base_save_dir}{name}"
os.makedirs(out_dir, exist_ok=True)

# Numeric arrays
for filename, array in (("latents.npy", latents_out), ("targets.npy", targets)):
    np.save(f"{out_dir}/{filename}", array)

# Label order of the target columns
with open(f"{out_dir}/attributes_list.pkl", 'wb') as handle:
    pickle.dump(attributes_list, handle)

# Metadata rows in the same order as the arrays
meta.to_csv(f"{out_dir}/meta.csv")

### Check Data Correctness

In [7]:
def target_to_label(target, attributes=None):
    """Translate one multi-hot target vector into its attribute labels.

    Parameters
    ----------
    target : np.ndarray
        Array whose axis 1 has length one (it is squeezed away), e.g. a
        single row of ``targets``.
    attributes : sequence, optional
        Labels aligned with the positions of ``target``. Defaults to the
        notebook-level ``attributes_list``.

    Returns
    -------
    list
        The labels at every position where ``target`` equals 1, in position
        order. Elements are taken from ``attributes`` unchanged.
    """
    if attributes is None:
        # Fall back to the label order produced by the multi-attribute prep.
        attributes = attributes_list
    active_positions = np.flatnonzero(target.squeeze(1) == 1)
    return [attributes[pos] for pos in active_positions]

In [8]:
# Row to inspect by hand.
idx = 2345

# Show the metadata row next to the labels decoded from its target vector;
# the two should name the same attribute values.
row = meta.iloc[idx]
display(row)
display(target_to_label(targets[idx]))

# True when the stored latent of this sku equals row `idx` of latents_out.
(latents[row.sku].numpy() == latents_out[idx]).all()

sku              CO121C1D5-B11
category           Shift dress
sleeve_length      Extra short
color                    beige
pattern                 Floral
Name: 2345, dtype: object

['Shift dress', 'Extra short', 'beige', 'Floral']

True