In [2]:
import pandas as pd
import json
import numpy as np
from pprint import pprint
from glob import glob
from PIL import Image 
from IPython.display import display

## Read in Data

In [3]:
# Metadata file
path = '../../../Data.nosync/Zalando_UK_Dataset/dresses/metadata/images_dict.json'

# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding: the attribute labels handled below include non-ASCII text
# (e.g. 'Piqué'), which a non-UTF-8 locale default would mis-decode.
with open(path, encoding='utf-8') as f:
    meta_data = json.load(f)


In [4]:
# Flatten the nested metadata into one long-format table with one row per
# (garment, attribute) pair. Each metadata entry is read positionally:
# element 0 is the garment type, element 1 is a dict holding 'url',
# 'brand' -> 'name' and the list of 'attributeCategories'.
columns = ['id', 'url', 'brand', 'garment_type', 'category_name', 'attribute_name', 'attribute_value']
relevant_attributes = ['Fabric', 'Pattern', 'Collar', 'Neckline', 'Length', 'Fit']

rows = []
for item_id, entry in meta_data.items():
    garment_type = entry[0]
    details = entry[1]
    url = details['url']
    brand = details['brand']['name']

    for category in details['attributeCategories']:
        for attribute in category['attributes']:
            rows.append({
                'id': item_id,
                'url': url,
                'brand': brand,
                'garment_type': garment_type,
                'category_name': category['categoryName'],
                'attribute_name': attribute['key'],
                'attribute_value': attribute['value'],
            })

# Build the frame once from the flat record list. Passing `columns` keeps the
# schema stable even when an item (or the whole file) has no attributes, which
# previously raised a KeyError on the column selection.
df = pd.DataFrame(rows, columns=columns)

# Subset to only relevant attributes. Resetting the index gives every row a
# unique label, so the label-based `.loc` assignments in the cleaning cells
# below never have to align on duplicated index values.
df = df[df.attribute_name.isin(relevant_attributes)].reset_index(drop=True)

df

Unnamed: 0,id,url,brand,garment_type,category_name,attribute_name,attribute_value
1,GE221C0GK-K11,https://www.zalando.co.uk/gestuz-frilly-long-d...,Gestuz,denim_dress,Material & care,Fabric,Denim
2,GE221C0GK-K11,https://www.zalando.co.uk/gestuz-frilly-long-d...,Gestuz,denim_dress,Details,Neckline,Backless
3,GE221C0GK-K11,https://www.zalando.co.uk/gestuz-frilly-long-d...,Gestuz,denim_dress,Details,Collar,Standing collar
7,GE221C0GK-K11,https://www.zalando.co.uk/gestuz-frilly-long-d...,Gestuz,denim_dress,Size & fit,Fit,Slim Fit
9,GE221C0GK-K11,https://www.zalando.co.uk/gestuz-frilly-long-d...,Gestuz,denim_dress,Size & fit,Length,Long
...,...,...,...,...,...,...,...
1,6CA21C09N-B11,https://www.zalando.co.uk/calvin-klein-mini-ta...,Calvin Klein,work_dress,Material & care,Fabric,Jersey
3,6CA21C09N-B11,https://www.zalando.co.uk/calvin-klein-mini-ta...,Calvin Klein,work_dress,Details,Neckline,Crew neck
5,6CA21C09N-B11,https://www.zalando.co.uk/calvin-klein-mini-ta...,Calvin Klein,work_dress,Details,Pattern,Plain
9,6CA21C09N-B11,https://www.zalando.co.uk/calvin-klein-mini-ta...,Calvin Klein,work_dress,Size & fit,Fit,Slim Fit


## Clean Attributes and Correct Mapping

- Most of the attribute corrections are taken from the desentangling-aesthetics repo by Aseem Behl

In [4]:
def map_value(df, attribute_name, mapping):
    """Recode the values of one attribute through a lookup table, in place.

    Only rows whose ``attribute_name`` equals ``attribute_name`` are touched.
    Values that have no key in ``mapping`` come back from ``Series.map`` as
    NaN, so they end up as missing values in ``attribute_value``.

    Parameters:
        df: long-format frame with ``attribute_name`` and ``attribute_value``
            columns; it is modified in place.
        attribute_name: the attribute whose values should be recoded.
        mapping: dict from raw value to cleaned value.

    Returns:
        The same ``df`` object, to allow ``df = map_value(df, ...)``.
    """
    rows = df['attribute_name'] == attribute_name
    recoded = df.loc[rows, 'attribute_value'].map(mapping)
    df.loc[rows, 'attribute_value'] = recoded
    return df

def trancate_mapping_with_other(df, attribute_name, threshold=10):
    """Collapse rare values of one attribute into a single 'Other' label, in place.

    Counts how often each value occurs among the rows of ``attribute_name``
    and replaces every value seen fewer than ``threshold`` times with
    'Other'. Values at or above the threshold are kept unchanged; missing
    values are not counted and stay missing.

    Parameters:
        df: long-format frame with ``attribute_name`` and ``attribute_value``
            columns; it is modified in place.
        attribute_name: the attribute whose rare values should be merged.
        threshold: minimum number of occurrences a value needs to be kept.

    Returns:
        The same ``df`` object.
    """
    selected = df.attribute_name == attribute_name
    frequency = df.loc[selected, 'attribute_value'].value_counts()
    # One lookup covering every observed value: rare ones go to 'Other',
    # frequent ones map to themselves.
    relabel = {
        value: ('Other' if count < threshold else value)
        for value, count in frequency.items()
    }
    df.loc[selected, 'attribute_value'] = df.loc[selected, 'attribute_value'].map(relabel)
    return df

### Fabric

In [5]:
# Canonical fabric label -> raw labels that are folded into it.
# Blended labels ("X, y") are assigned to the fabric that is listed first.
fabric_groups = {
    'Jersey': [
        'Jersey', 'Jersey, tulle', 'Jersey, Lace', 'Jersey, mesh', 'Jersey, rib',
        'Jersey, chiffon', 'Jersey, faux leather', 'Jersey, satin',
    ],
    'Knit': ['Knit', 'Knit, rib'],
    'Rib': ['Rib', 'Rib, knit', 'Rib, mesh'],
    'Satin': ['Satin', 'Satin, Lace', 'Satin, tulle', 'Satin, Jersey', 'Satin, chiffon'],
    'Chiffon': [
        'Chiffon', 'Chiffon, Lace', 'Chiffon, Jersey', 'Chiffon, knit',
        'Chiffon, tulle', 'Chiffon, rib',
    ],
    'Tulle': ['Tulle', 'Tulle, satin', 'Tulle, Lace', 'Tulle, chiffon', 'Tulle, Jersey'],
    'Lace': ['Lace', 'Lace, chiffon', 'Lace, satin', 'Lace, tulle'],
    'Mesh': ['Mesh', 'Mesh, Lace'],
    'Sweat': ['Sweat', 'Sweat, tulle'],
    'Faux Leather': ['Faux leather', 'Faux leather, Jersey', 'Faux leather, satin'],
    'Denim': ['Denim'],
    'Velvet/Velour': ['Velvet/velour'],
    'Piqué': ['Piqué'],
    'Crocheted': ['Crocheted'],
    'Cord': ['Cord'],
    'Canvas': ['Canvas'],
    'Flannel': ['Flannel'],
    'Fleece': ['Fleece'],
    'Braided': ['Braided'],
    'Faux Fur': ['Faux fur'],
}

# Invert the groups into the raw -> canonical lookup that map_value expects.
fabric_mapping = {
    raw_label: canonical
    for canonical, raw_labels in fabric_groups.items()
    for raw_label in raw_labels
}

# Recode the raw labels first, then merge the labels that remain rare
# (fewer than 10 occurrences) into 'Other'.
df = (
    df
    .pipe(map_value, 'Fabric', fabric_mapping)
    .pipe(trancate_mapping_with_other, 'Fabric', threshold=10)
)
df.loc[df['attribute_name'] == 'Fabric', 'attribute_value'].value_counts()

attribute_value
Jersey           618
Knit             370
Rib              302
Satin            253
Chiffon          228
Denim             63
Tulle             61
Lace              56
Sweat             55
Velvet/Velour     51
Mesh              31
Other             18
Faux Leather      16
Piqué             14
Crocheted         10
Name: count, dtype: int64

### Fit

In [6]:
# Canonical fit label -> raw labels that are folded into it.
fit_groups = {
    'Regular Fit': ['Regular Fit', 'Normal fit'],
    'Slim Fit': ['Slim Fit', 'Bodycon'],
    'Skinny Fit': ['Skinny Fit'],
    'Loose Fit': ['Loose Fit', 'Oversized', 'Relaxed Fit'],
    'Other': ['Small', 'Fitted waist'],
}

# Invert the groups into the raw -> canonical lookup that map_value expects.
fit_mapping = {
    raw_label: canonical
    for canonical, raw_labels in fit_groups.items()
    for raw_label in raw_labels
}

# Recode the raw labels first, then merge the labels that remain rare
# (fewer than 10 occurrences) into 'Other'.
df = (
    df
    .pipe(map_value, 'Fit', fit_mapping)
    .pipe(trancate_mapping_with_other, 'Fit', threshold=10)
)

df.loc[df['attribute_name'] == 'Fit', 'attribute_value'].value_counts()

attribute_value
Regular Fit    1471
Slim Fit        847
Skinny Fit      480
Loose Fit       396
Other            13
Name: count, dtype: int64

### Neckline

In [7]:
# Neckline labels are used as-is; only values seen fewer than 10 times are
# merged into 'Other'.
df = df.pipe(trancate_mapping_with_other, 'Neckline', threshold=10)
df.loc[df['attribute_name'] == 'Neckline', 'attribute_value'].value_counts()

attribute_value
Crew neck           741
Low-cut v-neck      491
Backless            386
Cache-coeur         264
V-neck              176
Off-the-shoulder     77
Square neck          71
Scoop neck           64
Boat neck            52
Henley               20
Cowl neck            19
Name: count, dtype: int64

### Pattern

In [8]:
# Pattern labels are used as-is; only values seen fewer than 10 times are
# merged into 'Other'.
df = df.pipe(trancate_mapping_with_other, 'Pattern', threshold=10)
df.loc[df['attribute_name'] == 'Pattern', 'attribute_value'].value_counts()

attribute_value
Plain              1914
Floral              360
Print               313
Marl                213
Animal print         99
Striped              93
Checked              43
Polka dot            34
Colourful            32
Paisley              17
Other                16
Colour gradient      12
Name: count, dtype: int64

### Collar

In [9]:
# Canonical collar label -> raw labels that are folded into it.
collar_groups = {
    'Standing Collar': ['Standing collar', 'High collar'],
    'Turn-down Collar': [
        'Turn-down collar', 'Shirt collar', 'Contrasting collar',
        'Button down', 'Peter Pan collar',
    ],
    'Polo Neck': ['Polo neck', 'Polo shirt'],
    'Mandarin Collar': ['Mandarin collar'],
    'Hooded Collar': ['Hood'],
    'Lapel Collar': ['Lapel collar'],
    'Shawl Collar': ['Shawl collar'],
    'Volant Collar': ['Volant collar'],
    'Zip Neck Collar': ['Zip neck'],
}

# Invert the groups into the raw -> canonical lookup that map_value expects.
collar_mapping = {
    raw_label: canonical
    for canonical, raw_labels in collar_groups.items()
    for raw_label in raw_labels
}

# Recode the raw labels first, then merge the labels that remain rare
# (fewer than 10 occurrences) into 'Other'.
df = (
    df
    .pipe(map_value, 'Collar', collar_mapping)
    .pipe(trancate_mapping_with_other, 'Collar', threshold=10)
)
df.loc[df['attribute_name'] == 'Collar', 'attribute_value'].value_counts()

attribute_value
Standing Collar     385
Turn-down Collar    313
Polo Neck           102
Mandarin Collar      42
Hooded Collar        40
Lapel Collar         31
Shawl Collar         21
Volant Collar        20
Zip Neck Collar      17
Name: count, dtype: int64

### Length

In [10]:
# Length labels are used as-is; only values seen fewer than 10 times are
# merged into 'Other'.
df = df.pipe(trancate_mapping_with_other, 'Length', threshold=10)
df.loc[df['attribute_name'] == 'Length', 'attribute_value'].value_counts()

attribute_value
Calf-length    893
Short          847
Knee-length    668
Long           520
Extra short    285
Other           14
Normal          11
Name: count, dtype: int64

## Color
Code adapted from ``/attribute_driven_representations/infer_colors.py`` from the Desentangling Aesthetics repo by Aseem Behl

In [11]:
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
from torch.utils.data import DataLoader
from torchvision import transforms
import torch
import os
import pandas as pd
from tqdm import tqdm

# Candidate colour names; each becomes the text prompt "a <color> dress".
colors = [
    'red',
    'green',
    'blue',
    'pink',
    'orange',
    'yellow',
    'purple',
    'beige',
    'brown',
    'grey',
    'black',
    'white',
    'gold',
    'silver',
]
color_texts = ('a ' + pd.Series(colors) + ' dress').tolist()

# Pick the best available accelerator instead of hard-coding Apple's MPS
# backend, so the cell also runs on CUDA or CPU-only machines.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32').eval()
model = model.to(device)
processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')

ids = df.id.unique()
image_path = '../../../Data.nosync/Zalando_UK_Dataset/dresses/images/raw_images/'

# Zero-shot colour classification: score every image against all colour
# prompts and keep the colour whose prompt has the highest image-text logit.
predicted_colors = {}
for image_id in tqdm(ids, desc='Predicting colors'):
    # The context manager closes the file handle once the pixels have been
    # turned into tensors, rather than leaving thousands of images open.
    with Image.open(f'{image_path}{image_id}.jpg') as image:
        inputs = processor(text=color_texts, images=image, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    # A single image is scored per call, so argmax yields one element;
    # .item() turns it into a plain int for indexing the Python list.
    best_index = outputs.logits_per_image.argmax(dim=-1).item()
    predicted_colors[image_id] = colors[best_index]

# Map the predicted colors to the dataframe
df['color'] = df.id.map(predicted_colors)

Predicting colors: 100%|██████████| 3238/3238 [05:17<00:00, 10.21it/s]


## Create Final Metadata Dict

In [12]:
# Function to safely retrieve attribute values
def get_attribute_value(data, attribute_name, default_value=None):
    """Return the first non-missing value recorded for ``attribute_name``.

    Parameters:
        data: frame with ``attribute_name`` and ``attribute_value`` columns
            (the rows belonging to one garment).
        attribute_name: the attribute to look up, e.g. ``'Fabric'``.
        default_value: returned when the attribute has no usable value.

    Returns:
        The first non-null ``attribute_value`` among the matching rows, or
        ``default_value`` when there is no matching row or every match is
        missing.
    """
    matches = data.loc[data['attribute_name'] == attribute_name, 'attribute_value']
    # Values that map_value could not translate are stored as NaN. Treat them
    # like an absent attribute so callers get the default instead of a float
    # NaN, which json.dump would serialise as the non-standard token `NaN`.
    present = matches.dropna()
    if present.empty:
        return default_value
    return present.iloc[0]
def create_data_dict(data):
    """Build the flat metadata record for one garment.

    Parameters:
        data: the rows of the long-format frame that belong to one garment.

    Returns:
        A dict with the keys 'url', 'brand', 'garment_type', 'color',
        'fabric', 'fit', 'neckline', 'pattern', 'collar' and 'length', in
        that order. Garment-level fields take the value of the first row (or
        None when the column does not exist); attribute fields are looked up
        with ``get_attribute_value`` and are None when absent.
    """
    record = {}

    # Garment-level columns repeat on every row, so the first row is enough.
    for column in ('url', 'brand', 'garment_type', 'color'):
        if column in data.columns:
            record[column] = data[column].values[0]
        else:
            record[column] = None

    # Attribute rows are keyed by their capitalised name; the output key is
    # the lower-case form.
    for attribute in ('Fabric', 'Fit', 'Neckline', 'Pattern', 'Collar', 'Length'):
        record[attribute.lower()] = get_attribute_value(data, attribute)

    return record

# Build one flat record per garment. A single groupby pass replaces the
# per-id boolean filter, which rescanned the whole frame for every garment;
# sort=False keeps the garments in order of first appearance, matching the
# order produced by df.id.unique().
new_metadata = {
    item_id: create_data_dict(item_rows)
    for item_id, item_rows in df.groupby('id', sort=False)
}


# Dump to JSON
path = '../../../Data.nosync/Zalando_UK_Dataset/dresses/metadata/dresses_metadata.json'
with open(path, 'w', encoding='utf-8') as f:
    json.dump(new_metadata, f)
    