# Lecture 4: Algorithms and Hyperparameter tuning

In [1]:
import numpy as np 
import pandas as pd
import random
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
transactions = pd.read_csv('../data/transactions_train.csv', dtype={"article_id": "str"})
customers = pd.read_csv('../data/customers.csv')
articles = pd.read_csv('../data/articles.csv', dtype={"article_id": "str"})

### MAP @ k
Reliable MAP@k function from https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py, also discussed in the Kaggle discussion stated above.

In [3]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

## Preprocessing data
In the end we need to create predictions for every user. To do this, a lot of data will be needed, more than a random sample. My local machine only has 16 GB of RAM available, and the csv's together already add up to 4+ GB. Loading this in pandas DataFrames only expands this number, because some of the types are not ideal. 

Besides that, other preprocessing can be done on the data that can be usefull. Instead of finding the best methods myself, one of the top discussions on Kaggle has created a good starting point: https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/309220
His code is bundled in a repo on GitHub: https://github.com/radekosmulski/personalized_fashion_recs

### Transactions

In [4]:
# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

def article_id_str_to_int(series):
    return series.astype('int32')

def article_id_int_to_str(series):
    return '0' + series.astype('str')

class Categorize(BaseEstimator, TransformerMixin):
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []
        
    def fit(self, X):
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            self.categories.append(vc[vc > self.min_examples].index.tolist())
        return self

    def transform(self, X):
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)

In [5]:
transactions.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31788324 entries, 0 to 31788323
Data columns (total 5 columns):
 #   Column            Dtype  
---  ------            -----  
 0   t_dat             object 
 1   customer_id       object 
 2   article_id        object 
 3   price             float64
 4   sales_channel_id  int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 8.0 GB


In [6]:
# transform customer ids to ints
transactions['customer_id'] = customer_hex_id_to_int(transactions['customer_id'])

# read transaction dates as datetime format, add column that describes week
transactions.t_dat = pd.to_datetime(transactions.t_dat, format='%Y-%m-%d')
transactions['week'] = 104 - (transactions.t_dat.max() - transactions.t_dat).dt.days // 7

# article ids stored as int
transactions.article_id = article_id_str_to_int(transactions.article_id)
articles.article_id = article_id_str_to_int(articles.article_id)

# store other columns in smaller datatypes
transactions.week = transactions.week.astype('int8')
transactions.sales_channel_id = transactions.sales_channel_id.astype('int8')
transactions.price = transactions.price.astype('float32')

transactions.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31788324 entries, 0 to 31788323
Data columns (total 6 columns):
 #   Column            Dtype         
---  ------            -----         
 0   t_dat             datetime64[ns]
 1   customer_id       uint64        
 2   article_id        int32         
 3   price             float32       
 4   sales_channel_id  int8          
 5   week              int8          
dtypes: datetime64[ns](1), float32(1), int32(1), int8(2), uint64(1)
memory usage: 788.2 MB


### Customers

In [7]:
customers.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1371980 entries, 0 to 1371979
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   customer_id             1371980 non-null  object 
 1   FN                      476930 non-null   float64
 2   Active                  464404 non-null   float64
 3   club_member_status      1365918 non-null  object 
 4   fashion_news_frequency  1355971 non-null  object 
 5   age                     1356119 non-null  float64
 6   postal_code             1371980 non-null  object 
dtypes: float64(3), object(4)
memory usage: 512.3 MB


In [8]:
# customer ids as int
customers.customer_id = customer_hex_id_to_int(customers.customer_id)

# fill columns that have NaN entries, store the values in small ints
for col in ['FN', 'Active', 'age']:
    customers[col].fillna(-1, inplace=True)
    customers[col] = customers[col].astype('int8')

In [9]:
# transform objects to integer value
customers.club_member_status = Categorize().fit_transform(customers[['club_member_status']]).club_member_status
customers.postal_code = Categorize().fit_transform(customers[['postal_code']]).postal_code
customers.fashion_news_frequency = Categorize().fit_transform(customers[['fashion_news_frequency']]).fashion_news_frequency

In [10]:
customers.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1371980 entries, 0 to 1371979
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype 
---  ------                  --------------    ----- 
 0   customer_id             1371980 non-null  uint64
 1   FN                      1371980 non-null  int8  
 2   Active                  1371980 non-null  int8  
 3   club_member_status      1371980 non-null  int8  
 4   fashion_news_frequency  1371980 non-null  int8  
 5   age                     1371980 non-null  int8  
 6   postal_code             1371980 non-null  int32 
dtypes: int32(1), int8(5), uint64(1)
memory usage: 22.2 MB


### Articles

In [11]:
articles.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105542 entries, 0 to 105541
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   article_id                    105542 non-null  int32 
 1   product_code                  105542 non-null  int64 
 2   prod_name                     105542 non-null  object
 3   product_type_no               105542 non-null  int64 
 4   product_type_name             105542 non-null  object
 5   product_group_name            105542 non-null  object
 6   graphical_appearance_no       105542 non-null  int64 
 7   graphical_appearance_name     105542 non-null  object
 8   colour_group_code             105542 non-null  int64 
 9   colour_group_name             105542 non-null  object
 10  perceived_colour_value_id     105542 non-null  int64 
 11  perceived_colour_value_name   105542 non-null  object
 12  perceived_colour_master_id    105542 non-null  int64 
 13 

In [12]:
# convert columns that have object dtype to int using categorizer
for col in articles.columns:
    if articles[col].dtype == 'object':
        articles[col] = Categorize().fit_transform(articles[[col]])[col]

In [13]:
# smaller ints can be used as well
for col in articles.columns:
    if articles[col].dtype == 'int64':
        articles[col] = articles[col].astype('int32')

In [14]:
articles.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105542 entries, 0 to 105541
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype
---  ------                        --------------   -----
 0   article_id                    105542 non-null  int32
 1   product_code                  105542 non-null  int32
 2   prod_name                     105542 non-null  int32
 3   product_type_no               105542 non-null  int32
 4   product_type_name             105542 non-null  int16
 5   product_group_name            105542 non-null  int8 
 6   graphical_appearance_no       105542 non-null  int32
 7   graphical_appearance_name     105542 non-null  int8 
 8   colour_group_code             105542 non-null  int32
 9   colour_group_name             105542 non-null  int8 
 10  perceived_colour_value_id     105542 non-null  int32
 11  perceived_colour_value_name   105542 non-null  int8 
 12  perceived_colour_master_id    105542 non-null  int32
 13  perceived_colo

## Saving preprocessed data

In [15]:
transactions.sort_values(['t_dat', 'customer_id'], inplace=True)

In [16]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31788324 entries, 25784 to 31780475
Data columns (total 6 columns):
 #   Column            Dtype         
---  ------            -----         
 0   t_dat             datetime64[ns]
 1   customer_id       uint64        
 2   article_id        int32         
 3   price             float32       
 4   sales_channel_id  int8          
 5   week              int8          
dtypes: datetime64[ns](1), float32(1), int32(1), int8(2), uint64(1)
memory usage: 1.0 GB


Parquet is a column oriented file format for efficient storage and retrieval: https://parquet.apache.org/

In [17]:
transactions.to_parquet('../data/transactions_train.parquet')
customers.to_parquet('../data/customers.parquet')
articles.to_parquet('../data/articles.parquet')

For experimenting it is faster and easier to just use a sample.

In [18]:
sample = 0.05
customers_sample = customers.sample(frac=sample, replace=False)
customers_sample_ids = set(customers_sample['customer_id'])
transactions_sample = transactions[transactions["customer_id"].isin(customers_sample_ids)]
articles_sample_ids = set(transactions_sample["article_id"])
articles_sample = articles[articles["article_id"].isin(articles_sample_ids)]

customers_sample.to_parquet(f'../data/customers_sample_{sample}.parquet', index=False)
transactions_sample.to_parquet(f'../data/transactions_train_sample_{sample}.parquet', index=False)
articles_sample.to_parquet(f'../data/articles_train_sample_{sample}.parquet', index=False)

## First experiment
The first experiment will be very basic, it will only look at popularity and use random negative samples.

In [19]:
# load compressed tables from parquet files
transactions = pd.read_parquet('../data/transactions_train.parquet')
customers = pd.read_parquet('../data/customers.parquet')
articles = pd.read_parquet('../data/articles.parquet')

In [20]:
# use last 8 weeks (+- 2 months) as data
transactions = transactions[transactions.week > transactions.week.max() - 8]

### Generate negative samples

In [21]:
transactions['ordered'] = 1

# drop weeks, add them again after negative sampling
transactions = transactions.drop(columns="week").reindex()

In [22]:
positive_pairs = list(map(tuple, transactions[['customer_id', 'article_id']].drop_duplicates().values))

In [23]:
real_dates = transactions["t_dat"].unique()
real_customers = transactions["customer_id"].unique()
real_articles = transactions["article_id"].unique()
real_channels = transactions["sales_channel_id"].unique()
article_and_price = transactions[["article_id","price"]].drop_duplicates("article_id").set_index("article_id").squeeze()

In [24]:
num_neg_pos = transactions.shape[0]
print(num_neg_pos)

2173875


In [25]:
# Sampling negatives by selecting random users, articles, dates and sales channel:
# Note: This is quite naive. Some articles may not even have been available at the date we are sampling.
random.seed(42)

# Afterwards, we need to remove potential duplicates, so we'll sample too many.
num_neg_samples = int(num_neg_pos * 1.1)

# Sample each of the independent attributes.
neg_dates = np.random.choice(real_dates, size=num_neg_samples)
neg_articles = np.random.choice(real_articles, size=num_neg_samples)
neg_customers = np.random.choice(real_customers, size=num_neg_samples)
neg_channels = np.random.choice(real_channels, size=num_neg_samples)
ordered = np.array([0] * num_neg_samples)
# Assign to every article a real price.
neg_prices = article_and_price[neg_articles].values

In [26]:
neg_transactions = pd.DataFrame([neg_dates, neg_customers, neg_articles, neg_prices, neg_channels, ordered], index=transactions.columns).T

In [27]:
# Remove random negative samples that actually coincide with positives
df = neg_transactions[
    ~neg_transactions.set_index(["customer_id", "article_id"]).index.isin(positive_pairs)
]

# Remove any excess
chosen_neg_transactions = df.sample(num_neg_pos)

In [28]:
transactions = pd.concat([transactions, chosen_neg_transactions])
transactions = transactions.merge(customers, how="inner", on='customer_id')
transactions = transactions.merge(articles, how="inner", on='article_id')

In [29]:
# reinstall weeks
transactions['week'] = 104 - (transactions.t_dat.max() - transactions.t_dat).dt.days // 7

In [30]:
# last week in transactions as validation
test_week = transactions.week.max()

In [31]:
transactions.isnull().values.any()

False

## Adding popularity

In [32]:
sales = transactions[transactions["ordered"] == 1]
sales = sales["article_id"].value_counts().rename_axis("article_id").reset_index(name="sales")

In [33]:
sales.head()

Unnamed: 0,article_id,sales
0,751471001,4223
1,706016001,3719
2,918292001,3681
3,448509014,3035
4,751471043,2939


In [34]:
transactions = transactions.merge(sales, how="inner", on='article_id')
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,ordered,FN,Active,club_member_status,fashion_news_frequency,...,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,week,sales
0,2020-07-29,1152192358796555,720125042,0.018627,1,1,-1,-1,0,0,...,9,26,4,5,21,1005,0,313,97,274
1,2020-08-01,110467586107431407,720125042,0.018627,1,1,-1,-1,0,0,...,9,26,4,5,21,1005,0,313,97,274
2,2020-07-29,165750356110826866,720125042,0.018627,1,1,1,1,0,1,...,9,26,4,5,21,1005,0,313,97,274
3,2020-07-29,506862778488049559,720125042,0.017712,1,1,1,1,0,1,...,9,26,4,5,21,1005,0,313,97,274
4,2020-08-01,1477115507390776780,720125042,0.018627,1,1,1,1,0,1,...,9,26,4,5,21,1005,0,313,97,274


In [35]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4347750 entries, 0 to 4347749
Data columns (total 38 columns):
 #   Column                        Dtype         
---  ------                        -----         
 0   t_dat                         datetime64[ns]
 1   customer_id                   object        
 2   article_id                    object        
 3   price                         object        
 4   sales_channel_id              object        
 5   ordered                       object        
 6   FN                            int8          
 7   Active                        int8          
 8   club_member_status            int8          
 9   fashion_news_frequency        int8          
 10  age                           int8          
 11  postal_code                   int32         
 12  product_code                  int32         
 13  prod_name                     int32         
 14  product_type_no               int32         
 15  product_type_name             in

We need to recast the dtypes for the first columns

In [36]:
# article ids stored as int
transactions.article_id = article_id_str_to_int(transactions.article_id)
transactions.ordered = transactions.ordered.astype('int8')

# store other columns in smaller datatypes
transactions.week = transactions.week.astype('int8')
transactions.sales_channel_id = transactions.sales_channel_id.astype('int8')
transactions.price = transactions.price.astype('float32')

In [37]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4347750 entries, 0 to 4347749
Data columns (total 38 columns):
 #   Column                        Dtype         
---  ------                        -----         
 0   t_dat                         datetime64[ns]
 1   customer_id                   object        
 2   article_id                    int32         
 3   price                         float32       
 4   sales_channel_id              int8          
 5   ordered                       int8          
 6   FN                            int8          
 7   Active                        int8          
 8   club_member_status            int8          
 9   fashion_news_frequency        int8          
 10  age                           int8          
 11  postal_code                   int32         
 12  product_code                  int32         
 13  prod_name                     int32         
 14  product_type_no               int32         
 15  product_type_name             in

In [38]:
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,ordered,FN,Active,club_member_status,fashion_news_frequency,...,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,week,sales
0,2020-07-29,1152192358796555,720125042,0.018627,1,1,-1,-1,0,0,...,9,26,4,5,21,1005,0,313,97,274
1,2020-08-01,110467586107431407,720125042,0.018627,1,1,-1,-1,0,0,...,9,26,4,5,21,1005,0,313,97,274
2,2020-07-29,165750356110826866,720125042,0.018627,1,1,1,1,0,1,...,9,26,4,5,21,1005,0,313,97,274
3,2020-07-29,506862778488049559,720125042,0.017712,1,1,1,1,0,1,...,9,26,4,5,21,1005,0,313,97,274
4,2020-08-01,1477115507390776780,720125042,0.018627,1,1,1,1,0,1,...,9,26,4,5,21,1005,0,313,97,274


## Ranking with LightGBM
Using the notebook linked before

In [39]:
train = transactions[transactions.week != test_week]
test = transactions[transactions.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

In [40]:
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [41]:
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'sales']

In [42]:
train_X = train[columns_to_use]
train_y = train["ordered"]

In [43]:
test_X = test[columns_to_use]

In [44]:
from lightgbm.sklearn import LGBMRanker

In [45]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)

In [46]:
ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.107649
[LightGBM] [Debug] init for col-wise cost 0.000048 seconds, init for row-wise cost 0.071825 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1383
[LightGBM] [Info] Number of data points in the train set: 3834644, number of used features: 18
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8


In [47]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

age 0.5301524925443378
section_no 0.3390833933632354
index_group_no 0.069997281983742
product_type_no 0.020967243183457215
club_member_status 0.012541724514553869
Active 0.010002704398664426
department_no 0.008918801745346384
index_code 0.008336358266662907
perceived_colour_master_id 0.0
graphical_appearance_no 0.0
colour_group_code 0.0
perceived_colour_value_id 0.0
sales 0.0
postal_code 0.0
garment_group_no 0.0
FN 0.0
fashion_news_frequency 0.0
article_id 0.0


In [48]:
test['preds'] = ranker.predict(test_X)

predictions = test.groupby('customer_id').article_id.apply(list).reset_index()
predictions = predictions.rename({'article_id':'prediction'},axis=1)
# predictions['prediction'] =\
#     predictions.prediction.apply(lambda x: ' '.join(['0'+str(k) for k in x]))

In [49]:
predictions

Unnamed: 0,customer_id,prediction
0,28847241659200,"[517729044, 399201002]"
1,41318098387474,[599580041]
2,690285180337957,[864371001]
3,745180086074610,"[896897001, 762741001]"
4,805095543045062,"[636207010, 762287009]"
...,...,...
227922,18446566209623725451,"[831446001, 859399001]"
227923,18446571879212697038,[761018017]
227924,18446662237889060501,"[887181002, 300024006]"
227925,18446705133201055310,[762015008]


In [50]:
all_predictions = customers.copy().merge(predictions, how="left", on="customer_id")[["customer_id", "prediction"]]

In [51]:
all_predictions['prediction'] = all_predictions['prediction'].apply(lambda d: d if isinstance(d, list) else [])

In [52]:
all_predictions.head()

Unnamed: 0,customer_id,prediction
0,6883939031699146327,"[636935006, 676990004, 795675005]"
1,11246327431398957306,[]
2,18439897732908966680,[]
3,18352672461570950206,[]
4,18162778555210377306,"[723926002, 586796003, 634591002, 861167003]"


In [53]:
fill = random.sample(articles.article_id.tolist(), 12)

In [54]:
all_predictions['prediction'] =\
    all_predictions.prediction.apply(lambda x: (x + fill)[:12])

In [55]:
all_predictions['prediction'] =\
    all_predictions.prediction.apply(lambda x: ' '.join(['0'+str(k) for k in x]))

In [56]:
all_predictions["customer_id"] = pd.read_csv('../data/customers.csv')["customer_id"]

In [57]:
all_predictions

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0636935006 0676990004 0795675005 0813476001 05...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0813476001 0563577007 0452717006 0873108001 06...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0813476001 0563577007 0452717006 0873108001 06...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0813476001 0563577007 0452717006 0873108001 06...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0723926002 0586796003 0634591002 0861167003 08...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0897080002 0911880002 0697082013 0813476001 05...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0813476001 0563577007 0452717006 0873108001 06...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0822895001 0813476001 0563577007 0452717006 08...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0813476001 0563577007 0452717006 0873108001 06...


In [60]:
all_predictions.to_csv("../data/sub_experiment1.csv", index=False)