# H&M dataset preprocessing

This notebook is based on another notebook by Radek: https://github.com/radekosmulski/personalized_fashion_recs/blob/main/01_Solution_warmup.ipynb

The goal of this notebook is to clean up the H&M dataset and make it less memory intensive. It does so by converting textual columns into numerical ones using ordinal encoding. It also converts the customer and article IDs to integers. Finally, it adds a week column to the transactions dataset.

A secondary objective of the notebook is to create a 5% sample of the data to use while testing and doing quick iterations.

In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd

In [2]:
# helper functions

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    """Convert a Series of hexadecimal customer ids into integers.

    Only the last 16 characters of each id are kept, and each of those
    substrings is parsed by ``hex_id_to_int``.
    """
    last_16_chars = series.str[-16:]
    return last_16_chars.apply(hex_id_to_int)

def hex_id_to_int(str):
    """Parse the last 16 characters of a hexadecimal string as an integer.

    The parameter name shadows the built-in ``str``; it is kept unchanged so
    that existing keyword callers keep working.
    """
    tail = str[-16:]
    return int(tail, base=16)

def article_id_str_to_int(series):
    """Cast a Series of article ids to the ``int32`` dtype."""
    target_dtype = 'int32'
    return series.astype(target_dtype)

def article_id_int_to_str(series):
    """Render article ids as strings with a single ``'0'`` prepended.

    This adds exactly one leading zero; it does not pad to a fixed width.
    """
    as_text = series.astype('str')
    return '0' + as_text

class Categorize(BaseEstimator, TransformerMixin):
    """Ordinal-encode every column of a DataFrame.

    For each column, ``fit`` records the distinct values in the order returned
    by ``Series.value_counts()`` (most frequent first by default). ``transform``
    replaces each value by its position in that list. Values that are not in
    the fitted list (rare values filtered out by ``min_examples``, unseen
    values, missing values) are encoded as ``-1`` by ``pd.Categorical``.

    Parameters
    ----------
    min_examples : int, default 0
        A value is kept as a category only if it occurs strictly more than
        ``min_examples`` times in the fitted column.

    Attributes
    ----------
    categories : list of list
        One list of category values per fitted column, in column order.
        Empty until ``fit`` is called.
    """

    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []

    def fit(self, X, y=None):
        """Learn the category list of every column of ``X``.

        ``y`` is accepted for scikit-learn API compatibility and ignored.
        Returns ``self``.
        """
        # Rebuild the list from scratch: appending to the existing one would
        # make a second call to fit() keep the stale categories of the first
        # call at the positions transform() reads.
        self.categories = [
            self._frequent_values(X.iloc[:, i]) for i in range(X.shape[1])
        ]
        return self

    def _frequent_values(self, column):
        """Return the values of ``column`` seen more than ``min_examples`` times."""
        counts = column.value_counts()
        return counts[counts > self.min_examples].index.tolist()

    def transform(self, X):
        """Return a DataFrame of integer codes with the columns and index of ``X``."""
        data = {
            X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes
            for i in range(X.shape[1])
        }
        # Keep X's index so that assigning a result column back to the source
        # frame aligns row by row even when the index is not a default RangeIndex.
        return pd.DataFrame(data=data, index=X.index)

In [9]:
%%time

BASE_PATH = '../../../data/'

transactions = pd.read_csv(BASE_PATH + 'original/transactions_train.csv')
customers = pd.read_csv(BASE_PATH + 'original/customers.csv')
articles = pd.read_csv(BASE_PATH + 'original/articles.csv')

CPU times: user 18.8 s, sys: 2.8 s, total: 21.6 s
Wall time: 22.6 s


In [8]:
# Work on copies so the raw frames stay untouched and this cell can be re-run.
transactions2 = transactions.copy()
customers2 = customers.copy()
articles2 = articles.copy()

# --- transactions -----------------------------------------------------------
transactions2['customer_id'] = customer_hex_id_to_int(transactions2['customer_id'])
transactions2['t_dat'] = pd.to_datetime(transactions2['t_dat'], format='%Y-%m-%d')
# Week index counted backwards from the latest transaction date: the most
# recent 7 days get week 104, the 7 days before that week 103, and so on.
days_before_last = (transactions2['t_dat'].max() - transactions2['t_dat']).dt.days
transactions2['week'] = (104 - days_before_last // 7).astype('int8')
transactions2['article_id'] = article_id_str_to_int(transactions2['article_id'])
transactions2['sales_channel_id'] = transactions2['sales_channel_id'].astype('int8')
# NOTE(review): 590 is an unexplained price scaling factor -- confirm its meaning.
transactions2['price'] = transactions2['price'].astype('float32') * 590

# --- customers --------------------------------------------------------------
customers2['customer_id'] = customer_hex_id_to_int(customers2['customer_id'])
# Assign the filled result back instead of calling fillna(..., inplace=True) on
# a column selection: with pandas Copy-on-Write the in-place call only changes
# a temporary Series, leaving the NaNs in place (NaN casts to True as bool and
# cannot be cast to int32).
for col in ['FN', 'Active']:
    customers2[col] = customers2[col].fillna(0).astype('bool')
customers2['age'] = customers2['age'].fillna(-1).astype('int32')
# Each column gets its own freshly fitted Categorize instance.
for col in ['club_member_status', 'postal_code', 'fashion_news_frequency']:
    customers2[col] = Categorize().fit_transform(customers2[[col]])[col]

# --- articles ---------------------------------------------------------------
articles2['article_id'] = article_id_str_to_int(articles2['article_id'])
for col in articles2.columns:
    # Ordinal-encode every text column except the free-text description.
    if articles2[col].dtype == 'object':
        if col != 'detail_desc':
            articles2[col] = Categorize().fit_transform(articles2[[col]])[col]
    # Downcast the remaining 64-bit integer columns.
    if articles2[col].dtype == 'int64':
        articles2[col] = articles2[col].astype('int32')

transactions2 = transactions2.sort_values(['t_dat', 'customer_id'])

In [11]:
%%time

transactions2.to_parquet(BASE_PATH + 'parquet/transactions_train.parquet')
customers2.to_parquet(BASE_PATH + 'parquet/customers.parquet')
articles2.to_parquet(BASE_PATH + 'parquet/articles.parquet')

# let's create a 5% sample of the entiriety of the data to speed up dev
sample = 0.05
customers_sample = customers2.sample(frac=sample, replace=False)
customers_sample_ids = set(customers_sample['customer_id'])
transactions_sample = transactions2[transactions2["customer_id"].isin(customers_sample_ids)]
articles_sample_ids = set(transactions_sample["article_id"])
articles_sample = articles2[articles2["article_id"].isin(articles_sample_ids)]

customers_sample.to_parquet(BASE_PATH + f'customers_sample_{sample}.parquet', index=False)
transactions_sample.to_parquet(BASE_PATH + f'transactions_train_sample_{sample}.parquet', index=False)
articles_sample.to_parquet(BASE_PATH + f'articles_train_sample_{sample}.parquet', index=False)

CPU times: user 5.1 s, sys: 417 ms, total: 5.52 s
Wall time: 5.45 s


In [12]:
# Memory usage is much lower after the conversion; uncomment the lines below to compare.
# transactions.info(memory_usage='deep')
# customers.info(memory_usage='deep')
# articles.info(memory_usage='deep')
# transactions2.info(memory_usage='deep')
# customers2.info(memory_usage='deep')
# articles2.info(memory_usage='deep')

## Get conversions of Categorized columns

In [47]:
# Names of the object-dtype columns of the raw articles table, leaving out
# the free-text 'detail_desc' column.
cols = [
    name
    for name, dtype in articles.dtypes.items()
    if dtype == 'object' and name != 'detail_desc'
]

# Fit one Categorize on those columns and pair each column name with its
# category list; a value's position in the list is its integer code.
c = Categorize().fit(articles[cols])
translation = dict(zip(cols, c.categories))

def print_translation(col):
    """Print every category of ``col`` next to its integer code, one per line.

    The categories are looked up in the module-level ``translation`` dict.
    """
    labels = translation[col]
    for code, label in enumerate(labels):
        print(code, label)

In [62]:
# Show the names of the article columns covered by `translation`.
cols

['prod_name',
 'product_type_name',
 'product_group_name',
 'graphical_appearance_name',
 'colour_group_name',
 'perceived_colour_value_name',
 'perceived_colour_master_name',
 'department_name',
 'index_code',
 'index_name',
 'index_group_name',
 'section_name',
 'garment_group_name']

In [48]:
# Integer code -> label mapping for the 'index_group_name' column.
print_translation('index_group_name')

0 Ladieswear
1 Baby/Children
2 Divided
3 Menswear
4 Sport


In [49]:
# Integer code -> label mapping for the 'index_name' column.
print_translation('index_name')

0 Ladieswear
1 Divided
2 Menswear
3 Children Sizes 92-140
4 Children Sizes 134-170
5 Baby Sizes 50-98
6 Ladies Accessories
7 Lingeries/Tights
8 Children Accessories, Swimwear
9 Sport


In [51]:
# Integer code -> label mapping for the 'garment_group_name' column.
print_translation('garment_group_name')

0 Jersey Fancy
1 Accessories
2 Jersey Basic
3 Knitwear
4 Under-, Nightwear
5 Trousers
6 Blouses
7 Shoes
8 Dresses Ladies
9 Outdoor
10 Unknown
11 Trousers Denim
12 Swimwear
13 Socks and Tights
14 Shirts
15 Woven/Jersey/Knitted mix Baby
16 Shorts
17 Dresses/Skirts girls
18 Skirts
19 Special Offers
20 Dressed


In [52]:
# Integer code -> label mapping for the 'section_name' column.
print_translation('section_name')

0 Womens Everyday Collection
1 Divided Collection
2 Baby Essentials & Complements
3 Kids Girl
4 Young Girl
5 Womens Lingerie
6 Girls Underwear & Basics
7 Womens Tailoring
8 Kids Boy
9 Womens Small accessories
10 Womens Casual
11 Kids Outerwear
12 Womens Trend
13 Divided Projects
14 Young Boy
15 H&M+
16 Men Underwear
17 Mama
18 Kids & Baby Shoes
19 Boys Underwear & Basics
20 Womens Shoes
21 Ladies H&M Sport
22 Womens Swimwear, beachwear
23 Contemporary Smart
24 Baby Girl
25 Divided Accessories
26 Kids Accessories, Swimwear & D
27 Divided Basics
28 Baby Boy
29 Womens Big accessories
30 Womens Everyday Basics
31 Womens Nightwear, Socks & Tigh
32 Contemporary Casual
33 Contemporary Street
34 Men Suits & Tailoring
35 Men Accessories
36 Womens Premium
37 Ladies Denim
38 Divided Selected
39 Men H&M Sport
40 Womens Jackets
41 Special Collections
42 Men Shoes
43 Mens Outerwear
44 Kids Sports
45 Collaborations
46 Denim Men
47 Men Edition
48 Men Project
49 Divided Asia keys
50 Kids Local Relevanc

In [53]:
# Integer code -> label mapping for the 'department_name' column.
print_translation('department_name')

0 Jersey
1 Knitwear
2 Trouser
3 Blouse
4 Dress
5 Swimwear
6 Kids Girl Jersey Fancy
7 Expressive Lingerie
8 Young Girl Jersey Fancy
9 Jersey Fancy
10 Jersey Basic
11 Kids Boy Jersey Fancy
12 Tops Fancy Jersey
13 Jersey fancy
14 Dresses
15 Basic 1
16 Jewellery
17 Shirt
18 Baby basics
19 Tops Knitwear
20 Young Boy Jersey Fancy
21 Kids Girl Jersey Basic
22 Baby Toys/Acc
23 Outwear
24 Tops Woven
25 Ladies Sport Bras
26 Baby Exclusive
27 Blouse & Dress
28 Kids Girl Dresses
29 Small Accessories
30 Baby Boy Jersey Fancy
31 Casual Lingerie
32 Baby Girl Jersey Fancy
33 Newborn
34 Shorts
35 Bags
36 Young Girl Jersey Basic
37 Trousers
38 Skirt
39 Hair Accessories
40 Accessories
41 Woven bottoms
42 Nightwear
43 Woven top
44 Jewellery Extended
45 Kids Boy Jersey Basic
46 Outdoor/Blazers
47 Young Girl Dresses
48 Kids Girl Knitwear
49 Girls Small Acc/Bags
50 Kids Girl Shoes
51 Divided+
52 Projects Dresses
53 Woven Tops
54 Kids Girl UW/NW
55 Flats
56 Light Basic Jersey
57 Baby Girl Woven
58 Young Girl 

In [55]:
# Integer code -> label mapping for the 'product_group_name' column.
print_translation('product_group_name')

0 Garment Upper body
1 Garment Lower body
2 Garment Full body
3 Accessories
4 Underwear
5 Shoes
6 Swimwear
7 Socks & Tights
8 Nightwear
9 Unknown
10 Underwear/nightwear
11 Cosmetic
12 Bags
13 Items
14 Furniture
15 Garment and Shoe care
16 Stationery
17 Interior textile
18 Fun
