Based on https://github.com/radekosmulski/personalized_fashion_recs/blob/main/01_Solution_warmup.ipynb

In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd

In [2]:
# helper functions

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

def article_id_str_to_int(series):
    return series.astype('int32')

def article_id_int_to_str(series):
    return '0' + series.astype('str')

class Categorize(BaseEstimator, TransformerMixin):
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []
        
    def fit(self, X):
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            self.categories.append(vc[vc > self.min_examples].index.tolist())
        return self

    def transform(self, X):
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)

In [7]:
%%time

BASE_PATH = '../../data/'

transactions = pd.read_csv(BASE_PATH + 'transactions_train.csv')
customers = pd.read_csv(BASE_PATH + 'customers.csv')
articles = pd.read_csv(BASE_PATH + 'articles.csv')

CPU times: user 18.5 s, sys: 2.41 s, total: 20.9 s
Wall time: 21.7 s


In [4]:
transactions.info(memory_usage='deep')
customers.info(memory_usage='deep')
articles.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31788324 entries, 0 to 31788323
Data columns (total 5 columns):
 #   Column            Dtype  
---  ------            -----  
 0   t_dat             object 
 1   customer_id       object 
 2   article_id        int64  
 3   price             float64
 4   sales_channel_id  int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 6.3 GB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1371980 entries, 0 to 1371979
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   customer_id             1371980 non-null  object 
 1   FN                      476930 non-null   float64
 2   Active                  464404 non-null   float64
 3   club_member_status      1365918 non-null  object 
 4   fashion_news_frequency  1355969 non-null  object 
 5   age                     1356119 non-null  float64
 6   postal_code             1371980 non-null  object 
d

In [8]:
transactions2 = transactions.copy()
customers2 = customers.copy()
articles2 = articles.copy()

transactions2['customer_id'] = customer_hex_id_to_int(transactions2['customer_id'])
transactions2.t_dat = pd.to_datetime(transactions2.t_dat, format='%Y-%m-%d')
transactions2['week'] = 104 - (transactions2.t_dat.max() - transactions2.t_dat).dt.days // 7
transactions2.article_id = article_id_str_to_int(transactions2.article_id)
articles2.article_id = article_id_str_to_int(articles2.article_id)
transactions2.week = transactions2.week.astype('int8')
transactions2.sales_channel_id = transactions2.sales_channel_id.astype('int8')
transactions2.price = transactions2.price.astype('float32')

customers2.customer_id = customer_hex_id_to_int(customers2.customer_id)
for col in ['FN', 'Active', 'age']:
    customers2[col].fillna(-1, inplace=True)
    customers2[col] = customers2[col].astype('int8')
customers2.club_member_status = Categorize().fit_transform(customers2[['club_member_status']]).club_member_status
customers2.postal_code = Categorize().fit_transform(customers2[['postal_code']]).postal_code
customers2.fashion_news_frequency = Categorize().fit_transform(customers2[['fashion_news_frequency']]).fashion_news_frequency

for col in articles2.columns:
    if articles2[col].dtype == 'object':
        articles2[col] = Categorize().fit_transform(articles[[col]])[col]
for col in articles2.columns:
    if articles2[col].dtype == 'int64':
        articles2[col] = articles2[col].astype('int32')

transactions2.sort_values(['t_dat', 'customer_id'], inplace=True)

In [12]:
c = Categorize()
c.fit(articles[['index_group_name']])
c.categories

[['Ladieswear', 'Baby/Children', 'Divided', 'Menswear', 'Sport']]

In [17]:
articles[articles.index_group_name == 'Baby/Children'].article_id

11        112679048
12        112679052
37        141661022
38        141661025
46        146706001
            ...    
105522    948152001
105523    948152002
105524    948758001
105526    948997001
105533    950449002
Name: article_id, Length: 34711, dtype: int64

In [19]:
articles2[articles2.index_group_name == 1].article_id

11        112679048
12        112679052
37        141661022
38        141661025
46        146706001
            ...    
105522    948152001
105523    948152002
105524    948758001
105526    948997001
105533    950449002
Name: article_id, Length: 34711, dtype: int32

In [9]:
transactions.info(memory_usage='deep')
customers.info(memory_usage='deep')
articles.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31788324 entries, 0 to 31788323
Data columns (total 6 columns):
 #   Column            Dtype         
---  ------            -----         
 0   t_dat             datetime64[ns]
 1   customer_id       uint64        
 2   article_id        int32         
 3   price             float32       
 4   sales_channel_id  int8          
 5   week              int8          
dtypes: datetime64[ns](1), float32(1), int32(1), int8(2), uint64(1)
memory usage: 788.2 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1371980 entries, 0 to 1371979
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype 
---  ------                  --------------    ----- 
 0   customer_id             1371980 non-null  uint64
 1   FN                      1371980 non-null  int8  
 2   Active                  1371980 non-null  int8  
 3   club_member_status      1371980 non-null  int8  
 4   fashion_news_frequency  1371980 non-null  int8  
 

In [13]:
%%time

transactions.to_parquet(BASE_PATH + 'transactions_train.parquet')
customers.to_parquet(BASE_PATH + 'customers.parquet')
articles.to_parquet(BASE_PATH + 'articles.parquet')

CPU times: user 3.39 s, sys: 131 ms, total: 3.52 s
Wall time: 5.01 s


In [16]:
%%time
# let's create a 5% sample of the entiriety of the data to speed up dev

sample = 0.05
customers_sample = customers.sample(frac=sample, replace=False)
customers_sample_ids = set(customers_sample['customer_id'])
transactions_sample = transactions[transactions["customer_id"].isin(customers_sample_ids)]
articles_sample_ids = set(transactions_sample["article_id"])
articles_sample = articles[articles["article_id"].isin(articles_sample_ids)]

customers_sample.to_parquet(BASE_PATH + f'customers_sample_{sample}.parquet', index=False)
transactions_sample.to_parquet(BASE_PATH + f'transactions_train_sample_{sample}.parquet', index=False)
articles_sample.to_parquet(BASE_PATH + f'articles_train_sample_{sample}.parquet', index=False)

CPU times: user 1.72 s, sys: 206 ms, total: 1.93 s
Wall time: 1.93 s
