In [1]:
import platform
import os
# Resolve the machine-specific locations of the data directory (DATA_PATH)
# and the project root (ROOT_PATH).
#
# Built-in defaults exist for the two known machines (macOS and Linux).
# The optional environment variables THESIS_DATA_PATH / THESIS_ROOT_PATH
# take precedence, so the notebook can also run elsewhere without editing
# this cell.
_system = platform.system()
if _system == 'Darwin':
    _default_data_path = "/Users/maltegenschow/Documents/Uni/Thesis/Data.nosync"
    _default_root_path = "/Users/maltegenschow/Documents/Uni/Thesis/Thesis"
elif _system == 'Linux':
    _default_data_path = "/pfs/work7/workspace/scratch/tu_zxmav84-thesis/Data.nosync"
    _default_root_path = "/pfs/work7/workspace/scratch/tu_zxmav84-thesis/Thesis"
else:
    # No built-in default for this platform; an override is required.
    _default_data_path = None
    _default_root_path = None

DATA_PATH = os.environ.get('THESIS_DATA_PATH') or _default_data_path
ROOT_PATH = os.environ.get('THESIS_ROOT_PATH') or _default_root_path

# Fail here with a clear message instead of a NameError in a later cell.
if DATA_PATH is None or ROOT_PATH is None:
    raise RuntimeError(
        f"No default paths configured for platform {_system!r}; "
        "set THESIS_DATA_PATH and THESIS_ROOT_PATH."
    )

current_wd = os.getcwd()

In [2]:
import pandas as pd
from pprint import pprint
import numpy as np

In [3]:
# Import metadata
#
# The JSON file is read and transposed; the resulting index is exposed as the
# `article_id` column. Only the identifier columns and the attribute columns
# are kept, and the attribute columns are renamed to title case
# (e.g. 'sleeve_length' -> 'Sleeve_Length').
id_cols = ['article_id', 'brand', 'category', 'garment_type']
attribute_cols = ['color', 'fabric', 'fit', 'neckline', 'pattern', 'collar', 'length', 'shape', 'sleeve_length']

# `cols` holds the title-cased attribute column names used by later cells.
cols = [name.title() for name in attribute_cols]

meta = (
    pd.read_json(f'{DATA_PATH}/Zalando_Germany_Dataset/dresses/metadata/dresses_metadata.json')
    .T
    .reset_index()
    .rename(columns={'index': 'article_id'})
    # Drop the slash from the '3/4 length' label
    # (presumably to keep the value path-safe -- TODO confirm).
    .assign(sleeve_length=lambda frame: frame['sleeve_length'].replace('3/4 length', '34 length'))
    .loc[:, id_cols + attribute_cols]
    .rename(columns=dict(zip(attribute_cols, cols)))
)
meta

Unnamed: 0,article_id,brand,category,garment_type,Color,Fabric,Fit,Neckline,Pattern,Collar,Length,Shape,Sleeve_Length
0,AN621C22S-O11,Anna Field,Shift dress,jersey_dresses,brown,Jersey,Slim Fit,,Plain,Standing collar,Calf-length,Body-hugging,Short
1,BU321C01G-K11,Buffalo,Jersey dress,jersey_dresses,brown,Jersey,Regular Fit,Low-cut v-neck,Print,,Knee-length,Fitted,Sleeveless
2,JY121C0TB-A11,JDY,Jersey dress,jersey_dresses,beige,,Regular Fit,Crew neck,Plain,Standing collar,Knee-length,Flared,Short
3,AN621C1UQ-Q11,Anna Field,Jersey dress,jersey_dresses,brown,Jersey,Slim Fit,Crew neck,Floral,,Short,Fitted,34 length
4,SHI21C0KH-K11,Sheego,Jersey dress,jersey_dresses,blue,Chiffon,Regular Fit,V-neck,Plain,,Knee-length,Flared,34 length
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14055,VE121C4BL-Q11,Vero Moda,Maxi dress,maxi_dresses,yellow,,Regular Fit,Backless,Floral,,Long,Flared,Sleeveless
14056,TO221C0LB-G11,TOM TAILOR,Maxi dress,maxi_dresses,pink,,Regular Fit,,Plain,,Long,Fitted,Spaghetti straps
14057,UP121C0TZ-Q11,Ulla Popken,Maxi dress,maxi_dresses,black,Jersey,Regular Fit,Square neck,Plain,,Long,Flared,Extra short
14058,I0621C03C-B11,INFLUENCER,Maxi dress,maxi_dresses,beige,,Regular Fit,Low-cut v-neck,Plain,,Long,Flared,Sleeveless


In [4]:
# Count the missing entries of every attribute column in one vectorised pass,
# then print one "<column>: <count>" line per column.
na_counts = meta[cols].isna().sum()
for col, na in na_counts.items():
    print(f'{col}: {na}')

Color: 0
Fabric: 6416
Fit: 115
Neckline: 2359
Pattern: 640
Collar: 11081
Length: 15
Shape: 340
Sleeve_Length: 12


In [5]:
# Replace every missing value in the frame with the explicit label 'Missing'.
meta = meta.fillna('Missing')

In [6]:
# For every attribute column, map each distinct value to an integer id.
# Ids are assigned in the order values are returned by Series.unique(),
# and the plain dicts preserve that insertion order.
# (`pprint` comes from the notebook's import cell, so it is not re-imported here.)
dict_out = {
    col: {value: idx for idx, value in enumerate(meta[col].unique())}
    for col in cols
}

# sort_dicts=False keeps the id order instead of sorting keys alphabetically.
pprint(dict_out, sort_dicts=False)

{'Color': {'brown': 0,
           'beige': 1,
           'blue': 2,
           'black': 3,
           'grey': 4,
           'green': 5,
           'red': 6,
           'orange': 7,
           'pink': 8,
           'purple': 9,
           'yellow': 10,
           'gold': 11,
           'silver': 12,
           'white': 13},
 'Fabric': {'Jersey': 0,
            'Missing': 1,
            'Chiffon': 2,
            'Knit': 3,
            'Rib': 4,
            'Sweat': 5,
            'Mixed': 6,
            'Satin': 7,
            'Lace': 8,
            'Velvet/velour': 9,
            'Cord': 10,
            'Crocheted': 11,
            'Mesh': 12,
            'Piqué': 13,
            'Tulle': 14,
            'Other': 15,
            'Denim': 16,
            'Faux leather': 17},
 'Fit': {'Slim Fit': 0,
         'Regular Fit': 1,
         'Loose Fit': 2,
         'Skinny Fit': 3,
         'Fitted waist': 4,
         'Missing': 5,
         'Oversized': 6,
         'Normal fit': 7},
 'Neckline'

### Sample Split

In [7]:
# Seed NumPy's global RNG so the random draw below is reproducible.
np.random.seed(0)

# Draw 70% of the rows and keep their article ids as the training set.
train_ids = meta.sample(frac=0.7).article_id

# Label each row 'train' if its article id was drawn, otherwise 'val'.
is_train = meta.article_id.isin(train_ids)
meta['sample'] = np.where(is_train, 'train', 'val')

In [8]:
# Display the frame to inspect the newly added 'sample' column.
meta

Unnamed: 0,article_id,brand,category,garment_type,Color,Fabric,Fit,Neckline,Pattern,Collar,Length,Shape,Sleeve_Length,sample
0,AN621C22S-O11,Anna Field,Shift dress,jersey_dresses,brown,Jersey,Slim Fit,Missing,Plain,Standing collar,Calf-length,Body-hugging,Short,val
1,BU321C01G-K11,Buffalo,Jersey dress,jersey_dresses,brown,Jersey,Regular Fit,Low-cut v-neck,Print,Missing,Knee-length,Fitted,Sleeveless,train
2,JY121C0TB-A11,JDY,Jersey dress,jersey_dresses,beige,Missing,Regular Fit,Crew neck,Plain,Standing collar,Knee-length,Flared,Short,val
3,AN621C1UQ-Q11,Anna Field,Jersey dress,jersey_dresses,brown,Jersey,Slim Fit,Crew neck,Floral,Missing,Short,Fitted,34 length,val
4,SHI21C0KH-K11,Sheego,Jersey dress,jersey_dresses,blue,Chiffon,Regular Fit,V-neck,Plain,Missing,Knee-length,Flared,34 length,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14055,VE121C4BL-Q11,Vero Moda,Maxi dress,maxi_dresses,yellow,Missing,Regular Fit,Backless,Floral,Missing,Long,Flared,Sleeveless,val
14056,TO221C0LB-G11,TOM TAILOR,Maxi dress,maxi_dresses,pink,Missing,Regular Fit,Missing,Plain,Missing,Long,Fitted,Spaghetti straps,train
14057,UP121C0TZ-Q11,Ulla Popken,Maxi dress,maxi_dresses,black,Jersey,Regular Fit,Square neck,Plain,Missing,Long,Flared,Extra short,val
14058,I0621C03C-B11,INFLUENCER,Maxi dress,maxi_dresses,beige,Missing,Regular Fit,Low-cut v-neck,Plain,Missing,Long,Flared,Sleeveless,train


In [11]:
# Write the labelled metadata (attributes plus the train/val 'sample' column)
# to labels.csv, without the positional index.
labels_dir = f'{DATA_PATH}/Models/disentangled_representations/zalando_germany'
# to_csv does not create missing parent directories, so make sure the target exists.
os.makedirs(labels_dir, exist_ok=True)
meta.to_csv(f'{labels_dir}/labels.csv', index=False)