In [1]:
import pandas as pd
import json
from glob import glob
from tqdm import tqdm
import numpy as np

### Read in All the Metadata Dicts

In [2]:
DATA_PATH = '../../Data.nosync/'

In [3]:
# Metadata files
path = f"{DATA_PATH}Zalando_Germany_Dataset/metadata_dicts/dresses/*"
meta_files = glob(path)
meta_files

['../../Data.nosync/Zalando_Germany_Dataset/metadata_dicts/dresses/jersey_dresses.json',
 '../../Data.nosync/Zalando_Germany_Dataset/metadata_dicts/dresses/dirndl_dresses.json',
 '../../Data.nosync/Zalando_Germany_Dataset/metadata_dicts/dresses/shift_dresses.json',
 '../../Data.nosync/Zalando_Germany_Dataset/metadata_dicts/dresses/occasion_dresses.json',
 '../../Data.nosync/Zalando_Germany_Dataset/metadata_dicts/dresses/shirt_dresses.json',
 '../../Data.nosync/Zalando_Germany_Dataset/metadata_dicts/dresses/evening_dresses.json',
 '../../Data.nosync/Zalando_Germany_Dataset/metadata_dicts/dresses/knitted_dresses.json',
 '../../Data.nosync/Zalando_Germany_Dataset/metadata_dicts/dresses/casual_dresses.json',
 '../../Data.nosync/Zalando_Germany_Dataset/metadata_dicts/dresses/denim_dresses.json',
 '../../Data.nosync/Zalando_Germany_Dataset/metadata_dicts/dresses/maxi_dresses.json']

In [4]:
def read_element(elem):
    attribute_data = elem['attributeCategories']
    rows = []
    for category in attribute_data:
        for attribute in category['attributes']:
            row = {
                'category_name': category['categoryName'],
                'attribute_name': attribute['key'],
                'attribute_value': attribute['value']
            }
            rows.append(row)

    df = pd.DataFrame(rows)

    df['name'] = elem['name']
    df['url'] = elem['url']
    df['sku'] = elem['sku']
    df['sku_base'] = elem['sku'].split('-')[0]
    df['sku_color_code'] = elem['sku'].split('-')[1]
    df['brand'] = elem['brand']['name']
    df['brand_url'] = elem['brand']['uri']
    df['category'] = elem['category']
    df['color_name'] = elem['color']['name']
    df['color_label'] = elem['color']['label']
    df['original_price'] = elem['price']['original']
    df['current_price'] = elem['price']['current']
    df['thumbnail_url'] = elem['thumbnail']

    df = df[['name', 'url', 'sku', 'sku_base', 'sku_color_code', 'brand', 'brand_url', 'category', 'color_name', 'color_label', 'original_price', 'current_price', 'category_name',
             'attribute_name', 'attribute_value', 'thumbnail_url']]
    
    return df

In [5]:
dfs = []
for path in tqdm(meta_files):
    with open(path, 'r') as f:
        meta_data = json.load(f)
    articles = [read_element(elem) for elem in meta_data]
    if len(articles) == 0:
        continue
    else:
        df = pd.concat(articles, ignore_index=True)
    df['source_file'] = path.split('/')[-1]
    dfs.append(df)
df = pd.concat(dfs, ignore_index=True).reset_index(drop=True)

100%|██████████| 10/10 [00:21<00:00,  2.12s/it]


### Identify Duplicates

In [6]:
print(f"Number of unique articles: {df['sku'].nunique()}")

Number of unique articles: 18106


In [7]:
# Create article dataframe without the attribute columns and duplicates
article_df = df.drop(columns=['category_name', 'attribute_name', 'attribute_value']).drop_duplicates()
print(article_df.shape)
article_df.head(2)

(19967, 14)


Unnamed: 0,name,url,sku,sku_base,sku_color_code,brand,brand_url,category,color_name,color_label,original_price,current_price,thumbnail_url,source_file
0,Jersey dress - brown,https://en.zalando.de/anna-field-shift-dress-b...,AN621C22S-O11,AN621C22S,O11,Anna Field,https://en.zalando.de/anna-field/,Shift dress,brown,brown,39.99,39.99,https://img01.ztat.net/article/spp-media-p1/fb...,jersey_dresses.json
13,Jersey dress - marine/bedruckt,https://en.zalando.de/buffalo-jersey-dress-mar...,BU321C01G-K11,BU321C01G,K11,Buffalo,https://en.zalando.de/buffalo/,Jersey dress,marine/bedruckt,blue,39.99,39.99,https://img01.ztat.net/article/spp-media-p1/50...,jersey_dresses.json


In [8]:
# How many products appear in more than one category metadata file?
article_df[['name', 'sku', 'source_file']].drop_duplicates().groupby('sku').source_file.count().value_counts()

source_file
1    16296
2     1772
3       25
4       13
Name: count, dtype: int64

In [9]:
# Complete duplicates including attribute columns
df[df.drop(columns = ['source_file']).duplicated(keep = False)].sort_values(['name', 'url', 'sku', 'sku_base', 'sku_color_code', 'brand',
       'brand_url', 'category', 'color_name', 'color_label', 'original_price',
       'current_price', 'category_name', 'attribute_name', 'attribute_value',
       'thumbnail_url']).head(6)

Unnamed: 0,name,url,sku,sku_base,sku_color_code,brand,brand_url,category,color_name,color_label,original_price,current_price,category_name,attribute_name,attribute_value,thumbnail_url,source_file
753,2 PACK - Jersey dress - black/mottled grey,https://en.zalando.de/evenandodd-2-pack-shift-...,EV421C131-Q11,EV421C131,Q11,Even&Odd,https://en.zalando.de/even-odd/,Shift dress,black/mottled grey,black,32.99,32.99,Details,Article number,EV421C131-Q11,https://img01.ztat.net/article/spp-media-p1/13...,jersey_dresses.json
32490,2 PACK - Jersey dress - black/mottled grey,https://en.zalando.de/evenandodd-2-pack-shift-...,EV421C131-Q11,EV421C131,Q11,Even&Odd,https://en.zalando.de/even-odd/,Shift dress,black/mottled grey,black,32.99,32.99,Details,Article number,EV421C131-Q11,https://img01.ztat.net/article/spp-media-p1/13...,shift_dresses.json
751,2 PACK - Jersey dress - black/mottled grey,https://en.zalando.de/evenandodd-2-pack-shift-...,EV421C131-Q11,EV421C131,Q11,Even&Odd,https://en.zalando.de/even-odd/,Shift dress,black/mottled grey,black,32.99,32.99,Details,Neckline,Scoop neck,https://img01.ztat.net/article/spp-media-p1/13...,jersey_dresses.json
32488,2 PACK - Jersey dress - black/mottled grey,https://en.zalando.de/evenandodd-2-pack-shift-...,EV421C131-Q11,EV421C131,Q11,Even&Odd,https://en.zalando.de/even-odd/,Shift dress,black/mottled grey,black,32.99,32.99,Details,Neckline,Scoop neck,https://img01.ztat.net/article/spp-media-p1/13...,shift_dresses.json
752,2 PACK - Jersey dress - black/mottled grey,https://en.zalando.de/evenandodd-2-pack-shift-...,EV421C131-Q11,EV421C131,Q11,Even&Odd,https://en.zalando.de/even-odd/,Shift dress,black/mottled grey,black,32.99,32.99,Details,Pattern,Marl,https://img01.ztat.net/article/spp-media-p1/13...,jersey_dresses.json
32489,2 PACK - Jersey dress - black/mottled grey,https://en.zalando.de/evenandodd-2-pack-shift-...,EV421C131-Q11,EV421C131,Q11,Even&Odd,https://en.zalando.de/even-odd/,Shift dress,black/mottled grey,black,32.99,32.99,Details,Pattern,Marl,https://img01.ztat.net/article/spp-media-p1/13...,shift_dresses.json


In [10]:
print(f"Dropping {df.drop(columns = ['source_file']).duplicated(keep = 'first').sum()} rows of duplicates")
print(f"This corersponds to {df[df.drop(columns = ['source_file']).duplicated(keep = 'first')].sku.nunique()} SKUs")
df = df[df.drop(columns = ['source_file']).duplicated(keep = 'first') == False]
print("Number of unique SKUs after dropping:", df.sku.nunique())
article_df = df.drop(columns=['category_name', 'attribute_name', 'attribute_value']).drop_duplicates()

Dropping 22104 rows of duplicates
This corersponds to 1777 SKUs
Number of unique SKUs after dropping: 18106


### Highlevel Overview of the Data

In [11]:
print(f"Number of unique SKUs: {article_df.sku.nunique()}")
print(f"Number of unique article names: {article_df.name.nunique()}")
print(f"Number of unique brands: {article_df.brand.nunique()}")
print(f"Number of unique categories: {article_df.category.nunique()}")
print(f"Number of unique colors: {article_df.color_name.nunique()}")
print(f"Number of unique color labels: {article_df.color_label.nunique()}")

Number of unique SKUs: 18106
Number of unique article names: 15115
Number of unique brands: 756
Number of unique categories: 40
Number of unique colors: 4644
Number of unique color labels: 118


- There are too many differnt categories for dresses -> Needs cleaning
- There are too many color names -> Not usable
- There are a lot color labels -> May be usable after cleaning, otherwise color must be infered from image

### Categories cleaning

In [12]:
article_df.category.value_counts()

category
Day dress                       6835
Jumper dress                    2511
Maxi dress                      2338
Jersey dress                    2069
Cocktail dress / Party dress    2023
Occasion wear                   1022
Shift dress                      917
Denim dress                      351
Shirt dress                       10
Long sleeved top                   5
Relaxed fit jeans                  5
Sports shorts                      4
Denim jacket                       4
Jumper                             4
Top                                4
Basic T-shirt                      4
Jeans Skinny Fit                   3
Trousers                           3
Beach accessory                    3
Waistcoat                          3
Button-down blouse                 2
Leggings                           2
Tracksuit bottoms                  2
Shirt                              1
Pencil skirt                       1
Bootcut jeans                      1
Cardigan                     

In [13]:
# Remove all articles that are from a cetagory with less than 100 articles. Those are wrongly labeled and should not be part of the dresses dataset
counts = article_df.category.value_counts()
cats_to_remove = counts[counts < 100].index.to_list()
print(f"Removing {len(cats_to_remove)} categories")
df = df[~df.category.isin(cats_to_remove)]
print(f"Number of categories after removal: {df.category.nunique()}")
article_df = df.drop(columns=['category_name', 'attribute_name', 'attribute_value']).drop_duplicates()
article_df.category.value_counts()

Removing 32 categories
Number of categories after removal: 8


category
Day dress                       6835
Jumper dress                    2511
Maxi dress                      2338
Jersey dress                    2069
Cocktail dress / Party dress    2023
Occasion wear                   1022
Shift dress                      917
Denim dress                      351
Name: count, dtype: int64

In [14]:
article_df.source_file.value_counts()

source_file
casual_dresses.json      6776
jersey_dresses.json      2414
maxi_dresses.json        2280
knitted_dresses.json     1909
evening_dresses.json     1885
shift_dresses.json       1490
occasion_dresses.json     959
denim_dresses.json        353
Name: count, dtype: int64

## Attributes Cleaning

1. Identify for which attributes we have a sufficient amount of data available
2. Identify attributes for which the number of unique values is not too high, i.e. a sensible usage of this attribute is possible

In [15]:
# Number of highlevel attribute categories
df.category_name.value_counts()

category_name
Size & fit         92079
Details            76559
Material & care    51242
Name: count, dtype: int64

In [16]:
# Number of attributes in each category
display(df.groupby('category_name').attribute_name.nunique())

category_name
Details            17
Material & care    13
Size & fit         11
Name: attribute_name, dtype: int64

In [17]:
for cat in df.category_name.unique():
    print(f"Category: {cat}")
    print(df[df.category_name == cat].attribute_name.value_counts())
    print('-'*50)

Category: Material & care
attribute_name
Outer fabric material                          18066
Care instructions                              10033
Fabric                                          9103
Lining                                          6713
Back material                                   1535
Insert material                                 1199
Bottom part material                            1134
Contains non-textile parts of animal origin     1000
Sleeves material                                 721
Front material                                   666
Top part material                                585
Filling                                          442
Middle part material                              45
Name: count, dtype: int64
--------------------------------------------------
Category: Details
attribute_name
Article number                   18066
Pattern                          17212
Neckline                         15016
Details                          10827
Fast

#### Attributes with sufficient data: 
##### Material & Care: 
    - Outer fabric material 
    - Fabric
    - Lining
##### Details: 
    - Pattern
    - Neckine
    - Details
    - Fastening
    - Collar
##### Size & fit
    - Sleeve length
    - Length
    - Fit
    - Shape
    - Total length


-> Now check how many unique values each of these attributes has. If there are too many unique values, this is not a meaningful attribute to consider in this application

In [18]:
print('Material & Care:')
for attr in ['Outer fabric material', 'Fabric', 'Lining']:
    print(f"\t{attr}: {df[df.attribute_name == attr].attribute_value.nunique()}")
print('Details:')
for attr in ['Pattern', 'Neckline', 'Details', 'Fasteing', 'Collar']:
    print(f"\t{attr}: {df[df.attribute_name == attr].attribute_value.nunique()}")
print('Size & Fit:')    
for attr in ['Sleeve length', 'Length', 'Fit', 'Shape', 'Total length']:
    print(f"\t{attr}: {df[df.attribute_name == attr].attribute_value.nunique()}")


Material & Care:
	Outer fabric material: 1889
	Fabric: 78
	Lining: 326
Details:
	Pattern: 16
	Neckline: 12
	Details: 789
	Fasteing: 0
	Collar: 18
Size & Fit:
	Sleeve length: 448
	Length: 11
	Fit: 14
	Shape: 6
	Total length: 1004


### Closer Look: Sleeve Length

In [19]:
sleeve = df[df.attribute_name == 'Sleeve length']
print(f"Number of unique sleeve lengths: {sleeve.attribute_value.nunique()}")
sleeve.attribute_value.value_counts()

Number of unique sleeve lengths: 448


attribute_value
Long                  4729
Short                 3321
Sleeveless            3256
3/4 length            1857
Spaghetti straps      1566
                      ... 
62.23 cm (Size M)        1
60.69 cm (Size M)        1
73 cm (Size 34)          1
67 cm (Size 36/38)       1
70.48 cm (Size 40)       1
Name: count, Length: 448, dtype: int64

### Correcting Sleeve Length Attribute Values: 
- Remove all values that have "cm" in their string -> Those cannot be used in a consistent way
- Combine "Sleeveless, strapless" and "Strapless, sleeveless" into one category
- Recode all other mixed values into "mixed" 

In [20]:
df['attribute_value'] = np.where((df.attribute_name == 'Sleeve length') & (df.attribute_value.str.contains('cm')), 
                                 None, 
                                 df.attribute_value)
df['attribute_value'] = np.where((df.attribute_name == 'Sleeve length') & (df.attribute_value.str.contains('Strapless, sleeveless')), 
                                 'Sleeveless, strapless', 
                                 df.attribute_value)
df['attribute_value'] = np.where((df.attribute_name == 'Sleeve length') & 
                                 (df.attribute_value.str.contains(',')) & 
                                 (df.attribute_value.str.contains('Sleeveless, strapless') == False), 
                                 'Mixed', 
                                 df.attribute_value)


In [21]:
df[df.attribute_name == 'Sleeve length'].attribute_value.value_counts()

attribute_value
Long                     4729
Short                    3321
Sleeveless               3256
3/4 length               1857
Spaghetti straps         1566
Elbow length              935
Extra short               914
Extra long                836
Mixed                     232
Strapless                 231
Sleeveless, strapless     170
Name: count, dtype: int64

### Result: Attributes to Keep:
Suitable attributes:
- Fabric
- Pattern
- Neckline
- Collar
- Length
- Fit
- Shape
- Sleeve length

In [22]:
attrs_to_keep = ['Fabric', 'Pattern', 'Neckline', 'Collar', 'Length', 'Fit', 'Shape', 'Sleeve length']
print(f"Number of rows before filtering: {df.shape[0]}")
df = df[df.attribute_name.isin(attrs_to_keep)]
print(f"Number of rows after filtering: {df.shape[0]}")

Number of rows before filtering: 219880
Number of rows after filtering: 118868


### Clean each attribute

- For each attribute, the unique values are checked and a corrected mapping is introduced if necessary
- Values with very low frequency are mapped to a common value ('Other')

In [23]:
def map_value(df, attribute_name, mapping):
    df.loc[df.attribute_name == attribute_name, 'attribute_value'] = (
        df.loc[df.attribute_name == attribute_name, 'attribute_value']
        .map(mapping)
    )
    return df

def trancate_mapping_with_other(df, attribute_name, threshold=10):
    counts = df[df.attribute_name == attribute_name].attribute_value.value_counts()
    other_mapping = {elem:'Other' for elem in counts[counts < threshold].index.to_list()}
    mapping = {elem:elem for elem in counts.index.to_list()}
    mapping.update(other_mapping)
    df.loc[df.attribute_name == attribute_name, 'attribute_value'] = (
        df.loc[df.attribute_name == attribute_name, 'attribute_value']
        .map(mapping)
    )
    return df

def print_summary(attribute_name):
    print(f"{attribute_name}")
    print(f"Unique values: {df[df.attribute_name == attribute_name].attribute_value.nunique()}")
    print(df[df.attribute_name == attribute_name].attribute_value.value_counts())
    

#### Fabric

In [24]:
mapping = {
'Jersey': 'Jersey',
 'Knit': 'Knit',
 'Chiffon': 'Chiffon',
 'Rib': 'Rib',
 'Satin': 'Satin',
 'Tulle': 'Tulle',
 'Lace': 'Lace',
 'Denim': 'Denim',
 'Sweat': 'Sweat',
 'Mesh': 'Mesh',
 'Velvet/velour': 'Velvet/velour',
 'Piqué': 'Piqué',
 'Crocheted': 'Crocheted',
 'Rib, knit': 'Rib',
 'Knit, rib': 'Knit',
 'Faux leather': 'Faux leather',
 'Lace, chiffon': 'Mixed',
 'Jersey, Lace': 'Mixed',
 'Chiffon, Lace': 'Mixed',
 'Cord': 'Cord',
 'Lace, tulle': 'Mixed',
 'Tulle, Lace': 'Mixed',
 'Jersey, tulle': 'Mixed',
 'Jersey, chiffon': 'Mixed',
 'Jersey, rib': 'Mixed',
 'Satin, Lace': 'Mixed',
 'Chiffon, satin': 'Mixed',
 'Canvas': 'Canvas',
 'Jersey, satin': 'Mixed',
 'Satin, Jersey': 'Mixed',
 'Satin, tulle': 'Mixed',
 'Flannel': 'Flannel',
 'Fleece': 'Fleece',
 'Satin, knit': 'Mixed',
 'Mesh, Lace': 'Mixed',
 'Satin, mesh': 'Mixed',
 'Crocheted, Lace': 'Mixed',
 'Chiffon, Jersey': 'Mixed',
 'Rib, chiffon': 'Mixed',
 'Lace, satin': 'Mixed',
 'Tulle, Lace, knit': 'Mixed',
 'Faux fur': 'Faux fur',
 'Knit, Lace': 'Mixed',
 'Rib, satin': 'Mixed',
 'Knit, Crocheted': 'Mixed',
 'Jersey, faux leather': 'Mixed',
 'Velvet/velour, tulle': 'Mixed',
 'Tulle, Jersey': 'Mixed',
 'Tulle, satin': 'Mixed',
 'Jersey, knit': 'Mixed',
 'Lace, Crocheted': 'Mixed',
 'Rib, Jersey': 'Mixed',
 'Rib, tulle': 'Mixed',
 'Terry towelling': 'Mixed',
 'Knit, Jersey': 'Mixed',
 'Rib, mesh': 'Mixed',
 'Softshell': 'Softshell',
 'Faux leather, rib': 'Mixed',
 'Knit, mesh': 'Mixed',
 'Braided': 'Braided',
 'Velvet/velour, satin': 'Mixed',
 'Rib, velvet/velour': 'Mixed',
 'Mesh, chiffon': 'Mixed',
 'Lace, Denim': 'Mixed',
 'Satin, chiffon': 'Mixed',
 'Softshell, mesh': 'Mixed',
 'Mesh, Jersey':'Mixed',
 'Chiffon, velvet/velour': 'Mixed',
 'Knit, tulle': 'Mixed',
 'Mesh, satin': 'Mixed',
 'Mesh, tulle': 'Mixed',
 'Hardshell': 'Hardshell',
 'Chiffon, tulle': 'Mixed',
 'Lace, Jersey': 'Mixed',
 'Tulle, chiffon':'Mixed',
 'Satin, Lace, mesh': 'Mixed',
 'Jersey, Crocheted': 'Mixed',
 'Knit, satin': 'Mixed',
}


In [25]:
df = map_value(df, 'Fabric', mapping)
df = trancate_mapping_with_other(df, 'Fabric', threshold=10)
print_summary('Fabric')

Fabric
Unique values: 17
attribute_value
Jersey           2271
Knit             1691
Chiffon          1189
Rib              1107
Satin             835
Tulle             388
Lace              346
Denim             340
Sweat             247
Mixed             208
Mesh              185
Velvet/velour     113
Piqué              59
Crocheted          58
Faux leather       30
Other              22
Cord               14
Name: count, dtype: int64


#### Pattern

In [26]:
print_summary('Pattern')
df = trancate_mapping_with_other(df, 'Pattern', threshold=10)

Pattern
Unique values: 16
attribute_value
Plain              9505
Floral             2332
Print              2312
Marl               1253
Striped             585
Animal print        436
Polka dot           248
Checked             157
Paisley             145
Colourful           118
Colour gradient      40
Herringbone          38
Pinstriped           22
Camouflage           11
Photo print           6
Burnout               4
Name: count, dtype: int64


#### Neckline

In [27]:
print_summary('Neckline')
df = trancate_mapping_with_other(df, 'Neckline', threshold=10)

Neckline
Unique values: 12
attribute_value
Crew neck           5010
Low-cut v-neck      3226
Backless            2511
Cache-coeur         1527
V-neck               940
Scoop neck           410
Square neck          385
Boat neck            366
Off-the-shoulder     330
Henley               210
Cowl neck             99
Envelope               2
Name: count, dtype: int64


#### Collar

In [28]:
print_summary('Collar')
df = trancate_mapping_with_other(df, 'Collar', threshold=10)

Collar
Unique values: 18
attribute_value
Standing collar       1822
Turn-down collar       677
Polo neck              287
Hood                   201
Volant collar          119
Lapel collar           107
Polo shirt             102
High collar             83
Mandarin collar         68
Shawl collar            65
Zip neck                55
Shirt collar            43
Peter Pan collar        16
Button down             15
Contrasting collar      13
Cup-shaped collar        3
Kent collar              2
V-neck                   1
Name: count, dtype: int64


#### Length

In [29]:
print_summary('Length')
df = trancate_mapping_with_other(df, 'Length', threshold=10)

Length
Unique values: 11
attribute_value
Calf-length        4777
Short              4446
Long               4037
Knee-length        3697
Extra short         990
Thigh-length         51
Extra long           18
Mid-length            9
7/8 length            8
Normal                8
81 cm (Size 38)       1
Name: count, dtype: int64


### Fit

In [30]:
print_summary('Fit')
fit_mapping = {
'Regular Fit': 'Regular Fit',
 'Slim Fit': 'Slim Fit',
 'Loose Fit': 'Loose Fit',
 'Skinny Fit': 'Skinny Fit',
 'Oversized': 'Oversized',
 'Fitted waist': 'Fitted waist',
 'Normal fit': 'Normal fit',
 'Small': 'Slim Fit',
 'Bodycon': 'Skinny Fit',
 'Regular fit': 'Regular Fit',
 'Loose fit': 'Loose Fit',
 'Tight fit': 'Skinny Fit',
 'Wide cut': 'Loose Fit',
 'Slim fit': 'Slim Fit'
 }
df = map_value(df, 'Fit', fit_mapping)
print_summary('Fit')

Fit
Unique values: 14
attribute_value
Regular Fit     10877
Slim Fit         4003
Loose Fit        1797
Skinny Fit       1064
Oversized          82
Fitted waist       39
Normal fit         25
Small               5
Bodycon             5
Regular fit         3
Loose fit           2
Tight fit           2
Wide cut            1
Slim fit            1
Name: count, dtype: int64
Fit
Unique values: 7
attribute_value
Regular Fit     10880
Slim Fit         4009
Loose Fit        1800
Skinny Fit       1071
Oversized          82
Fitted waist       39
Normal fit         25
Name: count, dtype: int64


#### Shape

In [31]:
print_summary('Shape')

Shape
Unique values: 6
attribute_value
Fitted          7539
Flared          4494
Straight        4332
Body-hugging    1229
Cocoon            60
Tapered           24
Name: count, dtype: int64


## Read in Image Dicts

In [32]:
# Metadata files
path = f"{DATA_PATH}Zalando_Germany_Dataset/image_dicts/dresses/*"
image_dicts = glob(path)
image_dicts

['../../Data.nosync/Zalando_Germany_Dataset/image_dicts/dresses/jersey_dresses.json',
 '../../Data.nosync/Zalando_Germany_Dataset/image_dicts/dresses/dirndl_dresses.json',
 '../../Data.nosync/Zalando_Germany_Dataset/image_dicts/dresses/shift_dresses.json',
 '../../Data.nosync/Zalando_Germany_Dataset/image_dicts/dresses/occasion_dresses.json',
 '../../Data.nosync/Zalando_Germany_Dataset/image_dicts/dresses/shirt_dresses.json',
 '../../Data.nosync/Zalando_Germany_Dataset/image_dicts/dresses/evening_dresses.json',
 '../../Data.nosync/Zalando_Germany_Dataset/image_dicts/dresses/knitted_dresses.json',
 '../../Data.nosync/Zalando_Germany_Dataset/image_dicts/dresses/casual_dresses.json',
 '../../Data.nosync/Zalando_Germany_Dataset/image_dicts/dresses/denim_dresses.json',
 '../../Data.nosync/Zalando_Germany_Dataset/image_dicts/dresses/maxi_dresses.json']

In [33]:
images_dict = {}
for path in tqdm(image_dicts):
    with open(path, 'r') as f:
        image_data = json.load(f)
    images_dict = {**images_dict, **image_data}
percent_missing = len([k for k,v in images_dict.items() if v['packshot_link'] == None])/len(images_dict)*100
print(f"{np.round(percent_missing, 2)}% of the images are missing packshot images")

# Create dict with only packshot images
packshot_images = {key:value['packshot_link'] for key, value in images_dict.items()}
# Map packshot images to the article dataframe
df['packshot_url'] = df.sku.map(packshot_images)


100%|██████████| 10/10 [00:00<00:00, 174.19it/s]

22.04% of the images are missing packshot images





### Parse everything into dictionary and Save

- Subset to only articles with packshot images
- Parse everything into single dictionary which will be used for all the other steps in the thesis
- Save the dictionary

In [34]:
# Subset to only articles with packshot images
print(f"Number of articles with packshot images: {df[df.packshot_url.notnull()].sku.nunique()}")
print(f"Number of articles without packshot images: {df[df.packshot_url.isnull()].sku.nunique()}")
df = df[df.packshot_url.notnull()]

Number of articles with packshot images: 14060
Number of articles without packshot images: 3971


In [35]:
# Function to safely retrieve attribute values
def get_attribute_value(data, attribute_name, default_value=None):
    try:
        # Attempt to return the first value that matches the attribute name
        return data.loc[data['attribute_name'] == attribute_name, 'attribute_value'].values[0]
    except IndexError:
        # Return the default value if the attribute doesn't exist
        return default_value
def create_data_dict(data):
    data_dict = {
        'name': data['name'].values[0],
        'sku_base': data['sku_base'].values[0],
        'sku_color_code': data['sku_color_code'].values[0],
        'url': data['url'].values[0] if 'url' in data.columns else None, 
        'brand': data['brand'].values[0] if 'brand' in data.columns else None, 
        'original_price': data['original_price'].values[0] if 'original_price' in data.columns else None,
        'current_price': data['current_price'].values[0] if 'current_price' in data.columns else None,
        'brand_url': data['brand_url'].values[0] if 'brand_url' in data.columns else None,
        'category': data['category'].values[0] if 'category' in data.columns else None,
        'garment_type': data['source_file'].values[0].split('.')[0],
        #'color': data['color'].values[0] if 'color' in data.columns else None, 
        'fabric': get_attribute_value(data, 'Fabric'), 
        'fit': get_attribute_value(data, 'Fit'), 
        'neckline': get_attribute_value(data, 'Neckline'), 
        'pattern': get_attribute_value(data, 'Pattern'), 
        'collar': get_attribute_value(data, 'Collar'), 
        'length': get_attribute_value(data, 'Length'),
        'shape': get_attribute_value(data, 'Shape'),
        'thumbnail_url': data['thumbnail_url'].values[0],
        'packshot_url': data['packshot_url'].values[0],
    }
    return data_dict

new_metadata = {}
for sku in tqdm(df.sku.unique()):
    data = df[df.sku == sku]
    new_metadata[sku] = create_data_dict(data)


# Dump to JSON
path = f'{DATA_PATH}Zalando_Germany_Dataset/dresses/metadata/dresses_metadata.json'
with open(path, 'w') as f:
    json.dump(new_metadata, f)

100%|██████████| 14060/14060 [00:47<00:00, 295.34it/s]


In [36]:
len(new_metadata)

14060

### Remove all Remaining Images in the Image Folder that do not correspond to the Articles in the final metadata


In [37]:
import os

existing_images = glob(f"{DATA_PATH}Zalando_Germany_Dataset/dresses/images/raw_images/*.jpg")
for file in existing_images:
    sku = file.split('/')[-1].split('.')[0]
    if sku not in new_metadata:
        print(f"Removing {file}")
        os.remove(file)

Removing ../../Data.nosync/Zalando_Germany_Dataset/dresses/images/raw_images/LE221T01Q-K14.jpg
Removing ../../Data.nosync/Zalando_Germany_Dataset/dresses/images/raw_images/GS121G0BS-K11.jpg
Removing ../../Data.nosync/Zalando_Germany_Dataset/dresses/images/raw_images/GS121E0FA-K11.jpg
Removing ../../Data.nosync/Zalando_Germany_Dataset/dresses/images/raw_images/HU721B0C4-D11.jpg
Removing ../../Data.nosync/Zalando_Germany_Dataset/dresses/images/raw_images/GS121N0RX-K11.jpg
Removing ../../Data.nosync/Zalando_Germany_Dataset/dresses/images/raw_images/GS121G0BR-K11.jpg
Removing ../../Data.nosync/Zalando_Germany_Dataset/dresses/images/raw_images/HU721I0CW-E11.jpg
Removing ../../Data.nosync/Zalando_Germany_Dataset/dresses/images/raw_images/M1X21C04P-A11.jpg
Removing ../../Data.nosync/Zalando_Germany_Dataset/dresses/images/raw_images/P1421C0X7-T11.jpg
Removing ../../Data.nosync/Zalando_Germany_Dataset/dresses/images/raw_images/M7521C08T-Q11.jpg
Removing ../../Data.nosync/Zalando_Germany_Dataset