Based on https://github.com/radekosmulski/personalized_fashion_recs/blob/main/01_Solution_warmup.ipynb

In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd

In [2]:
# helper functions

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    """Convert a Series of hexadecimal customer-id strings to integers.

    Only the last 16 characters of each string are kept; that tail is then
    parsed element-wise by ``hex_id_to_int``.
    """
    last_16_chars = series.str[-16:]
    return last_16_chars.apply(hex_id_to_int)

def hex_id_to_int(str):
    """Parse the last 16 characters of ``str`` as a base-16 integer."""
    # NOTE: the parameter name shadows the built-in ``str`` inside this function.
    hex_tail = str[-16:]
    return int(hex_tail, base=16)

def article_id_str_to_int(series):
    """Cast a Series of article ids to the 32-bit integer dtype."""
    return series.astype(np.int32)

def article_id_int_to_str(series):
    """Render article ids as strings with a single '0' prepended.

    Exactly one '0' character is added in front of each value's string form,
    whatever its length.
    """
    as_text = series.astype(str)
    return '0' + as_text

class Categorize(BaseEstimator, TransformerMixin):
    """Encode every column of a DataFrame as integer category codes.

    ``fit`` records, per column, the values that occur strictly more than
    ``min_examples`` times, ordered as returned by ``value_counts`` (most
    frequent first by default). ``transform`` replaces each value by its
    position in that list; values that are not in the list (including missing
    values, which ``value_counts`` does not count) are encoded as -1.

    Parameters
    ----------
    min_examples : int, default 0
        A value is kept as a category only if its count is greater than this.

    Attributes
    ----------
    categories : list of list
        One list of kept values per fitted column, in column order. Empty
        until ``fit`` has been called.
    """

    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        # Present (and empty) before fitting so existing readers of the attribute keep working.
        self.categories = []

    def fit(self, X, y=None):
        """Learn the category list of every column of ``X``.

        ``y`` is accepted and ignored so the transformer can be called as
        ``fit(X, y)`` / ``fit_transform(X, y)``. Returns ``self``.
        """
        # Rebuild the list from scratch: appending to the list created in
        # __init__ would leave stale entries in front when the same instance
        # is fitted more than once, and transform would then use the wrong
        # categories for each column position.
        self.categories = [
            self._frequent_values(X.iloc[:, i]) for i in range(X.shape[1])
        ]
        return self

    def _frequent_values(self, column):
        """Return the values of ``column`` seen more than ``min_examples`` times."""
        counts = column.value_counts()
        return counts[counts > self.min_examples].index.tolist()

    def transform(self, X):
        """Return a DataFrame of category codes with the columns and index of ``X``."""
        data = {
            X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes
            for i in range(X.shape[1])
        }
        # Keep X's index so the result aligns row-for-row when it is assigned
        # back to the frame it was computed from.
        return pd.DataFrame(data=data, index=X.index)

In [3]:
%%time

BASE_PATH = '../data/'

transactions = pd.read_csv(BASE_PATH + 'csv/transactions_train.csv', dtype={'customer_id': str, 'article_id': str})
customers = pd.read_csv(BASE_PATH + 'csv/customers.csv', dtype={'customer_id': str, 'postal_code': str})
articles = pd.read_csv(BASE_PATH + 'csv/articles.csv', dtype={'article_id': str, 'product_code': str})

CPU times: user 13.8 s, sys: 2.47 s, total: 16.2 s
Wall time: 16.4 s


In [4]:
# DataFrame.info() prints its report itself and returns None, so it is called
# on its own instead of being passed to print(), which appended a stray "None".
transactions.info(memory_usage='deep')
# Per-column memory in bytes; deep=True also counts the contents of object columns.
print(transactions.memory_usage(deep=True))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31788324 entries, 0 to 31788323
Data columns (total 5 columns):
 #   Column            Dtype  
---  ------            -----  
 0   t_dat             object 
 1   customer_id       object 
 2   article_id        object 
 3   price             float64
 4   sales_channel_id  int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 8.0 GB
Index                      132
t_dat               2129817708
customer_id         3846387204
article_id          2129817708
price                254306592
sales_channel_id     254306592
dtype: int64 None


In [5]:
# Number of distinct customer ids in the transactions table.
transactions.customer_id.nunique()

1362281

In [6]:
# Replace the hexadecimal customer-id strings with integers, then recount the
# distinct ids so the figure can be compared with the count taken before.
transactions['customer_id'] = transactions['customer_id'].pipe(customer_hex_id_to_int)
transactions.customer_id.nunique()

1362281

In [7]:
# Parse the transaction date strings into datetimes.
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'], format='%Y-%m-%d')
# Number each row by how many whole 7-day periods separate it from the latest
# transaction date: the latest period gets 104 and earlier periods count down.
transactions['week'] = 104 - (transactions['t_dat'].max() - transactions['t_dat']).dt.days.floordiv(7)

In [8]:
# Article ids become 32-bit integers in both tables.
transactions['article_id'] = transactions['article_id'].pipe(article_id_str_to_int)
articles['article_id'] = articles['article_id'].pipe(article_id_str_to_int)

# Narrow the remaining numeric transaction columns to smaller dtypes.
for col_name, narrow_dtype in [('week', 'int8'), ('sales_channel_id', 'int8'), ('price', 'float32')]:
    transactions[col_name] = transactions[col_name].astype(narrow_dtype)

In [9]:
# Replace the hexadecimal customer-id strings with integers.
customers.customer_id = customer_hex_id_to_int(customers.customer_id)
for col in ['FN', 'Active', 'age']:
    # Fill missing values with -1 and assign the result back to the frame.
    # The previous `customers[col].fillna(-1, inplace=True)` acted on a column
    # selection; with pandas Copy-on-Write that never updates `customers`, so
    # the int8 cast would then fail on the remaining NaNs.
    customers[col] = customers[col].fillna(-1).astype('int8')

In [10]:
# Replace each of these customer columns by the integer codes produced by a
# Categorize transformer fitted on that column alone.
for col in ['club_member_status', 'postal_code', 'fashion_news_frequency']:
    customers[col] = Categorize().fit_transform(customers[[col]])[col]

In [11]:
for col in articles.columns:
    # Object-dtype columns are replaced by Categorize integer codes.
    if articles.dtypes[col] == 'object':
        articles[col] = Categorize().fit_transform(articles[[col]]).loc[:, col]
    # The dtype is looked up again so this check sees any replacement made above;
    # 64-bit integer columns are narrowed to 32 bits.
    if articles.dtypes[col] == 'int64':
        articles[col] = articles[col].astype('int32')

In [12]:
# Order the rows by transaction date, then by customer id, modifying the frame in place.
transactions.sort_values(by=['t_dat', 'customer_id'], inplace=True)

In [13]:
%%time

transactions.to_parquet(BASE_PATH + 'parquet/transactions_train.parquet')
customers.to_parquet(BASE_PATH + 'parquet/customers.parquet')
articles.to_parquet(BASE_PATH + 'parquet/articles.parquet')

# let's create a 5% sample of all the data to speed up dev
sample = 0.05
customers_sample = customers.sample(frac=sample, replace=False)
customers_sample_ids = set(customers_sample['customer_id'])
transactions_sample = transactions[transactions["customer_id"].isin(customers_sample_ids)]
articles_sample_ids = set(transactions_sample["article_id"])
articles_sample = articles[articles["article_id"].isin(articles_sample_ids)]

customers_sample.to_parquet(BASE_PATH + f'parquet/customers_sample_{sample}.parquet', index=False)
transactions_sample.to_parquet(BASE_PATH + f'parquet/transactions_train_sample_{sample}.parquet', index=False)
articles_sample.to_parquet(BASE_PATH + f'parquet/articles_train_sample_{sample}.parquet', index=False)

CPU times: user 3.63 s, sys: 810 ms, total: 4.44 s
Wall time: 4.45 s


In [14]:
# memory usage is a lot lower
# DataFrame.info() prints its report itself and returns None, so it is called
# on its own instead of being passed to print(), which appended a stray "None".
transactions.info(memory_usage='deep')
print(transactions.memory_usage(deep=True))

<class 'pandas.core.frame.DataFrame'>
Index: 31788324 entries, 25784 to 31780475
Data columns (total 6 columns):
 #   Column            Dtype         
---  ------            -----         
 0   t_dat             datetime64[ns]
 1   customer_id       uint64        
 2   article_id        int32         
 3   price             float32       
 4   sales_channel_id  int8          
 5   week              int8          
dtypes: datetime64[ns](1), float32(1), int32(1), int8(2), uint64(1)
memory usage: 1.0 GB
Index               254306592
t_dat               254306592
customer_id         254306592
article_id          127153296
price               127153296
sales_channel_id     31788324
week                 31788324
dtype: int64 None
