In [1]:
import pandas as pd

In [2]:
import os
import platform

# Default dataset locations, keyed by the value of platform.system().
_DEFAULT_DATA_PATHS = {
    'Darwin': "/Users/maltegenschow/Documents/Uni/Thesis/Data.nosync",
    'Linux': "/pfs/work7/workspace/scratch/tu_zxmav84-thesis/Data.nosync",
}

# A non-empty THESIS_DATA_PATH environment variable takes precedence, so the notebook
# can be run on another machine without editing this cell.
DATA_PATH = os.environ.get("THESIS_DATA_PATH") or _DEFAULT_DATA_PATHS.get(platform.system())
if DATA_PATH is None:
    # Fail here with a clear message instead of a NameError in a later cell.
    raise RuntimeError(
        f"No default DATA_PATH for platform {platform.system()!r}; "
        "set the THESIS_DATA_PATH environment variable to the data directory."
    )

### Load in Metadata

In [3]:
# The JSON file is transposed after loading and its index becomes the "sku" column;
# only the columns needed for the category analysis are kept.
metadata_file = f"{DATA_PATH}/Zalando_Germany_Dataset/dresses/metadata/dresses_metadata.json"
df = (
    pd.read_json(metadata_file)
    .T
    .reset_index()
    .rename(columns={"index": "sku"})
)[['sku', 'name', 'url', 'category', 'garment_type']]
df.head(3)

Unnamed: 0,sku,name,url,category,garment_type
0,AN621C22S-O11,Jersey dress - brown,https://en.zalando.de/anna-field-shift-dress-b...,Shift dress,jersey_dresses
1,BU321C01G-K11,Jersey dress - marine/bedruckt,https://en.zalando.de/buffalo-jersey-dress-mar...,Jersey dress,jersey_dresses
2,JY121C0TB-A11,JDYCARLA CATHINKA DRESS - Jersey dress - cloud...,https://en.zalando.de/jdy-carla-cathinka-dress...,Jersey dress,jersey_dresses


### Check most common words in article names


In [4]:
# Join every article name into one lowercase, space-separated string
names = df["name"].str.cat(sep=' ').lower()


In [5]:
# Count whitespace-separated tokens and show the 30 most frequent ones
name_tokens = names.split()
pd.Series(name_tokens).value_counts().head(30)

-           24907
dress       17995
day          5328
black        2919
maxi         1778
jersey       1774
/            1683
cocktail     1679
party        1678
jumper       1477
blue         1289
shift        1247
occasion      763
wear          761
green         696
mit           658
schwarz       619
midi          567
white         511
navy          501
pink          483
denim         475
dark          426
sleeve        400
mini          396
neck          367
short         353
print         348
fit           343
red           315
Name: count, dtype: int64

### Develop simple heuristic to identify category from name


In [6]:
# Keyword rules, checked in this order: a category is assigned when any of its
# substrings occurs in the lowercased article name.
_CATEGORY_KEYWORDS = (
    ('Shift dress', ('shift dress',)),
    ('Jersey dress', ('jersey dress',)),
    ('Cocktail dress / Party dress', ('cocktail', 'party')),
    ('Day dress', ('day dress',)),
    ('Maxi dress', ('maxi dress',)),
    ('Occasion wear', ('occasion',)),
    ('Denim dress', ('denim dress',)),
    ('Jumper dress', ('jumper dress',)),
)


def infer_category(name):
    """Infer the dress category of an article from substrings of its name.

    Args:
        name: Article name. Matching is case-insensitive. Non-string values
            (e.g. ``None`` or ``NaN`` for a missing name) are treated as
            having no match.

    Returns:
        ``None`` if no rule matches, the category label (``str``) if exactly
        one rule matches, or a ``list`` of labels (in rule order) if several
        rules match.
    """
    # Missing names arrive as None/NaN; a substring test on them would raise.
    if not isinstance(name, str):
        return None

    lowered = name.lower()
    cats_found = [
        label
        for label, keywords in _CATEGORY_KEYWORDS
        if any(keyword in lowered for keyword in keywords)
    ]

    if not cats_found:
        return None
    if len(cats_found) == 1:
        return cats_found[0]
    return cats_found
    
# Run the heuristic on the lowercased article names and store one result per row
lowercase_names = df['name'].str.lower()
df['category_inferred'] = lowercase_names.apply(infer_category)
df['category_inferred'].value_counts()

category_inferred
Day dress                                                   5182
Jersey dress                                                1700
Cocktail dress / Party dress                                1656
Maxi dress                                                  1607
Jumper dress                                                1458
Shift dress                                                 1225
Occasion wear                                                702
Denim dress                                                  303
[Maxi dress, Occasion wear]                                   59
[Jersey dress, Day dress]                                     49
[Day dress, Maxi dress]                                       19
[Jersey dress, Maxi dress]                                    14
[Shift dress, Day dress]                                      13
[Day dress, Jumper dress]                                     10
[Cocktail dress / Party dress, Day dress]                     10
[Cockta

In [7]:
# Rows without a single clear category: no rule matched (None) or several matched (list)
is_unclear = df['category_inferred'].apply(lambda cat: type(cat) is list or cat is None)
df[is_unclear]


Unnamed: 0,sku,name,url,category,garment_type,category_inferred
61,WG021C0UI-K11,ANNA V NECK MAXI DRESS - Jersey dress - navy blue,https://en.zalando.de/wal-g-anna-v-neck-dress-...,Occasion wear,jersey_dresses,"[Jersey dress, Maxi dress]"
190,6CA21C0B7-Q11,SHIFT DRESS - Jersey dress - black,https://en.zalando.de/calvin-klein-shift-dress...,Shift dress,jersey_dresses,"[Shift dress, Jersey dress]"
215,WAH21C02A-Q11,HALTER NECK MAXI DRESS - Jersey dress - black,https://en.zalando.de/wal-g-tall-halter-neck-m...,Occasion wear,jersey_dresses,"[Jersey dress, Maxi dress]"
222,WG021C0W6-M11,TILLY RUFFLE HALTER NECK MAXI DRESS - Jersey d...,https://en.zalando.de/wal-g-tilly-ruffle-halte...,Occasion wear,jersey_dresses,"[Jersey dress, Maxi dress]"
237,LAR21C04H-K11,CHARLEY SLEEVELESS DAY DRESS - Jersey dress - ...,https://en.zalando.de/lauren-ralph-lauren-peti...,Jersey dress,jersey_dresses,"[Jersey dress, Day dress]"
...,...,...,...,...,...,...
13777,P1Y21C02P-Q11,CLARA DRESS - Shirt dress - black,https://en.zalando.de/proenza-schouler-white-l...,Maxi dress,maxi_dresses,
13813,SU221C0XU-O11,HALTER - Jumper - dark oak brown,https://en.zalando.de/superdry-halter-jumper-d...,Maxi dress,maxi_dresses,
13830,M8321C00P-A11,EMILIE NAROW LINES DRESS - Shirt dress - white,https://en.zalando.de/miista-shirt-dress-white...,Maxi dress,maxi_dresses,
13951,PO221C0CY-K11,ZAHA SLEEVELESS DAY DRESS - Maxi dress - cruis...,https://en.zalando.de/polo-ralph-lauren-zaha-s...,Maxi dress,maxi_dresses,"[Day dress, Maxi dress]"


In [8]:
# Subset to rows where the inferred category is a single label (neither a list nor None)
sub = df[df.category_inferred.apply(lambda x: type(x) != list and x is not None)]
print(f"Number of rows with clearly inferred category: {sub.shape[0]}")
# This line reports a fraction of all rows, so it is labelled as a share rather than a count.
print(f"Share of rows with clearly inferred category: {sub.shape[0]/df.shape[0]:.2%}")
# The remainder contains both unmatched rows (None) and ambiguous rows (list of candidates).
print(f"Number of rows without clearly inferred category (none or ambiguous): {df.shape[0] - sub.shape[0]}")

Number of rows with clearly inferred category: 13833
Number of rows with clearly inferred category: 98.39%
Number of rows without inferred category: 227


In [9]:
# Agreement between inferred and actual (scraper) category, on clearly inferred rows only
incorrect_classified = sub[sub.category != sub.category_inferred]
print(f"Number of rows where inferred category does not match scraper category: {incorrect_classified.shape[0]}")
# This line reports a fraction of `sub`, so it is labelled as a share rather than a count.
print(f"Share of rows where inferred category does not match scraper category: {incorrect_classified.shape[0]/sub.shape[0]:.2%}")

Number of rows where inferred category does not match scraper category: 1647
Number of rows where inferred category does not match scraper category: 11.91%


In [10]:
# Cross-tabulate inferred category (rows) against scraper category (columns)
confusion_matrix = pd.crosstab(index=sub["category_inferred"], columns=sub["category"])
confusion_matrix

category,Cocktail dress / Party dress,Day dress,Denim dress,Jersey dress,Jumper dress,Maxi dress,Occasion wear,Shift dress
category_inferred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Cocktail dress / Party dress,1607,3,0,41,0,1,3,1
Day dress,2,5165,0,0,0,2,0,13
Denim dress,0,0,297,0,0,5,0,1
Jersey dress,141,17,0,1316,0,69,60,97
Jumper dress,0,0,0,0,1334,28,0,96
Maxi dress,2,5,6,131,111,1345,7,0
Occasion wear,0,0,0,27,0,1,674,0
Shift dress,0,35,2,222,518,0,0,448


In [11]:
# Cross-tabulate inferred category (rows) against garment_type (columns)
confusion_matrix = pd.crosstab(index=sub["category_inferred"], columns=sub["garment_type"])
confusion_matrix

garment_type,casual_dresses,denim_dresses,evening_dresses,jersey_dresses,knitted_dresses,maxi_dresses,occasion_dresses,shift_dresses
category_inferred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Cocktail dress / Party dress,2,2,1612,35,1,0,3,1
Day dress,5163,2,2,0,2,0,0,13
Denim dress,1,298,0,1,0,2,0,1
Jersey dress,0,0,6,1668,0,12,8,6
Jumper dress,0,0,0,0,1368,2,0,88
Maxi dress,8,5,2,115,103,1368,6,0
Occasion wear,0,0,0,21,0,0,681,0
Shift dress,3,0,0,212,26,0,0,984
