# Comparing classifier implementations
Objectives:
- Compare multiple classifiers and see which one gets the best leaderboard score when all else is equal
- Refactor code from the previous experiment (Unsold items test) to be more usable going forward


In [1]:
import numpy as np 
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import preprocessing
import time

In [2]:
transactions = pd.read_csv('./data/transactions_train.csv')

Replace customer ids with label encoding for space preservation purposes.

In [3]:
def encode_transactions_customers(transactions_df):
    customers = pd.read_csv('./data/customers.csv')
    customer_encoder = preprocessing.LabelEncoder()
    customer_encoder.fit(customers['customer_id'])
    transactions['customer_id'] = customer_encoder.transform(transactions['customer_id'])
    np.save('customer_ids.npy', customer_encoder.classes_) 
    return customer_encoder

In [4]:
customer_encoder = encode_transactions_customers(transactions)

In [5]:
transactions['purchased'] = 1

Transform string dates into weeks with the start of week 0 being the week for the final calculation (meaning everything in the data becomes a negative week)

In [6]:
def transform_string_dates_to_int(transactions_df):
    import datetime

    lookup = dict()
    def str_dat_to_weeks_int(datestring):
        return lookup.setdefault(datestring, (datetime.datetime.strptime(datestring, "%Y-%m-%d") - datetime.datetime(2020, 9, 23)).days//7)
    
    transactions_df["t_dat"] = transactions_df["t_dat"].map(str_dat_to_weeks_int)


In [7]:
transform_string_dates_to_int(transactions)

Drop all transactions which happened more than 20 weeks before the end of the data collection period

In [8]:
transactions.drop(transactions[transactions["t_dat"] < -20].index, inplace=True)

Perform random negative sampling, most of this code is copied from the 2nd lecture

In [9]:
def generate_negative_samples_np_version(transactions_df, num_neg_pos):
    real_dates = transactions["t_dat"].unique()
    real_customers = transactions["customer_id"].unique()
    real_articles = transactions["article_id"].unique()
    real_channels = transactions["sales_channel_id"].unique()
    article_and_price = transactions[["article_id","price"]].drop_duplicates("article_id").set_index("article_id").squeeze()

    random.seed(42)
    num_neg_samples = int(num_neg_pos * 1.1)

    neg_dates = np.random.choice(real_dates, size=num_neg_samples)
    neg_articles = np.random.choice(real_articles, size=num_neg_samples)
    neg_customers = np.random.choice(real_customers, size=num_neg_samples)
    neg_channels = np.random.choice(real_channels, size=num_neg_samples)
    ordered = np.array([0] * num_neg_samples)

    neg_prices = article_and_price[neg_articles].values
    
    return np.column_stack((neg_dates, neg_customers, neg_articles, neg_prices, neg_channels, ordered))
    
def generate_negative_samples(transactions_df):
    num_neg_pos = transactions_df.shape[0]
    positive_pairs = list(map(tuple, transactions_df[['customer_id', 'article_id']].drop_duplicates().values))
    neg_transactions = pd.DataFrame(generate_negative_samples_np_version(transactions_df, num_neg_pos), columns=transactions_df.columns)
    duplicate_indexes = neg_transactions[["customer_id", "article_id"]].apply(tuple, 1).isin(positive_pairs)
    neg_transactions = neg_transactions[~duplicate_indexes]
    return neg_transactions.sample(num_neg_pos)

def negative_sampling(transactions_df):
    transactions_df = pd.concat([transactions_df, generate_negative_samples(transactions_df)])
    transactions_df.reset_index(drop=True, inplace=True)
    return transactions_df

In [10]:
transactions = negative_sampling(transactions)

In [11]:
transactions.to_feather('./data/negativesampled.feather')

This is intended as a checkpoint for if I need to reset the jupyter notebook kernel be it due to crashing or due to running out of RAM.

In [28]:
articles = pd.read_csv('./data/articles.csv')

In [4]:
transactions = pd.read_feather("./data/negativesampled.feather")

In [29]:
customers = pd.read_csv('./data/customers.csv')

In [30]:
customer_encoder = preprocessing.LabelEncoder()
customer_encoder.classes_ = np.load("customer_ids.npy", allow_pickle=True)

Apply the label encoding to the customer table so it can be joined with transactions

In [32]:
customers['customer_id'] = customer_encoder.transform(customers['customer_id'])

In [33]:
zip_encoder = preprocessing.LabelEncoder()
customers["postal_code"] = zip_encoder.fit_transform(customers["postal_code"])

In [37]:
def correct_types_and_merge_transactions(transactions_df, customers_df, articles_df):
    customers_df["age"] = customers_df["age"].fillna(25)
    customers_df["age"] = customers_df["age"].astype(int)
    articles_df[['article_id', 'product_code', 'product_type_no','graphical_appearance_no','colour_group_code',
       'perceived_colour_value_id', 'perceived_colour_master_id', 'department_no',
       'index_group_no', 'section_no', 'garment_group_no']] = articles_df[['article_id', 'product_code',
       'product_type_no','graphical_appearance_no','colour_group_code',
       'perceived_colour_value_id', 'perceived_colour_master_id', 'department_no',
       'index_group_no', 'section_no', 'garment_group_no']].astype(int)
    transactions_df[['t_dat', 'customer_id', 'article_id', 'sales_channel_id', 'purchased']] = transactions_df[['t_dat', 'customer_id', 'article_id', 'sales_channel_id', 'purchased']].astype(int)

    transactions_df = transactions_df.merge(customers_df[["customer_id", "age", "postal_code"]], how="inner", on='customer_id')
    transactions_df = transactions_df.merge(articles_df[["article_id", "product_code", "product_type_no", "graphical_appearance_no", "colour_group_code", "department_no", "index_group_no", "section_no", "garment_group_no"]], how="inner", on='article_id')

In [36]:
correct_types_and_merge_transactions(transactions, customers, articles)

In [40]:
X_train, X_test, y_train, y_test = train_test_split(transactions.drop(['purchased', "price", 'sales_channel_id'], axis=1), transactions['purchased'], test_size=0.10, random_state=42)

Calculate the popular items which I will be using as candidates for the submission. 24 popular items in the past 20 weeks and 24 popular items in the past 4 weeks. If there is overlap, continue down the rank of the past 4 weeks. In extreme cases this results in 48 items that were popular in the past 4 weeks.

In [41]:
def get_popular_items(transaction_df):
    popular_all_time = transaction_df[transaction_df["purchased"] == 1][["article_id", "purchased"]].groupby("article_id").count().sort_values(ascending=False, by="purchased").head(24).index.to_series().reset_index(drop=True)
    popular_by_month = transaction_df[(transaction_df["purchased"] == 1) & (transaction_df["t_dat"] >= -4)][["article_id", "purchased"]].groupby("article_id").count().sort_values(ascending=False, by="purchased").head(48).index.to_series().reset_index(drop=True)
    popular_by_month2 = popular_by_month[~popular_by_month.isin(popular_all_time)]
    popular_by_month2 = popular_by_month2.reset_index(drop=True).head(24)
    popular_candidates = pd.DataFrame(pd.concat([popular_all_time, popular_by_month2])).astype(int).reset_index(drop=True)
    return popular_candidates, popular_by_month

popular_candidates, popular_by_month = get_popular_items(transactions)

0     706016001
1     372860002
2     751471001
3     599580038
4     610776002
5     759871002
6     372860001
7     610776001
8     841383002
9     599580052
10    448509014
11    783346001
12    806225002
13    749699002
14    800691007
15    817472002
16    739590032
17    806388002
18    850917001
19    688537004
20    811925009
21    827968001
22    759871025
23    760084003
Name: article_id, dtype: int64

Generate dataframe with all active customers and the candidates

In [46]:
def generate_ranker_input_df(transactions_df, candidates, customers_df, articles_df):
    return pd.DataFrame(data={"t_dat": 0}, index=[0]).merge(transactions_df[["customer_id"]].drop_duplicates(subset="customer_id"), how="cross").merge(candidates, how="cross").merge(customers_df[["customer_id", "age", "postal_code"]], how="inner", on="customer_id").merge(articles_df[["article_id", "product_code", "product_type_no", "graphical_appearance_no", "colour_group_code", "department_no", "index_group_no", "section_no", "garment_group_no"]], how="inner", on="article_id")

ranker_input = generate_ranker_input_df(transactions, popular_candidates, customers, articles)

Functions to perform predictions and write to file

In [61]:
# Copied from the radekosmulski notebook
def generate_ranking(df):
    return df.sort_values(by=["customer_id", "p1"], ascending=[True, False]).groupby("customer_id")["article_id"].apply(list).to_dict()

In [70]:
def output_preds(df, classifier, filename, filler_candidates):
    predictions = classifier.predict_proba(df)
    df[["p0", "p1"]] = predictions
    ranking = generate_ranking(df)
    
    # Copied from the radekosmulski notebook
    submission_df = pd.read_csv('data/sample_submission.csv')
    preds = []
    for c_id in customer_encoder.transform(submission_df.customer_id):
        pred = ranking.get(c_id, filler_candidates)
        preds.append(pred[:12])
        
    preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
    submission_df.prediction = preds
    
    submission_df.to_csv('./data/{}.csv'.format(filename), index=False)
    df.drop(['p0', 'p1'], inplace=True, axis=1)

LightGBMClassifier implementation

In [19]:
# copying from https://github.com/microsoft/LightGBM/blob/master/examples/python-guide/simple_example.py
# combined with https://github.com/angelotc/LightGBM-binary-classification-example/blob/master/CCData.ipynb

import lightgbm as lgb
print('Starting training...')

gbm = lgb.LGBMClassifier(learning_rate = 0.1, metric = 'l1', 
                        n_estimators = 20)
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric=['auc', 'binary_logloss'],
        callbacks=[lgb.early_stopping(stopping_rounds=5)])

Starting training...
Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[20]	valid_0's auc: 0.813333	valid_0's binary_logloss: 0.541055	valid_0's l1: 0.387631
Saving model...


In [37]:
output_preds(ranker_input, gbm, "model_lgbm_binary", popular_by_month)

LightGBMRanker implementation

In [41]:
# copying from https://github.com/microsoft/LightGBM/blob/master/examples/python-guide/simple_example.py
# combined with https://github.com/angelotc/LightGBM-binary-classification-example/blob/master/CCData.ipynb
# and this one https://github.com/radekosmulski/personalized_fashion_recs/blob/main/03c_Basic_Model_Submission.ipynb



train_baskets = X_train.groupby(['t_dat', 'customer_id'])['article_id'].count().values
train_baskets_test = X_test.groupby(['t_dat', 'customer_id'])['article_id'].count().values

print('Starting training...')

gbm = lgb.LGBMRanker(learning_rate = 0.1, metric = 'l1', 
                        n_estimators = 20)
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        group=train_baskets,
        eval_group=[train_baskets_test],
        eval_metric=['auc', 'binary_logloss'],
        callbacks=[lgb.early_stopping(stopping_rounds=5)])

Starting training...
Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.778104	valid_0's binary_logloss: 5.37591	valid_0's l1: 0.522243
Saving model...


In [55]:
output_preds(ranker_input, gbm, "model_lgbm_ranker", popular_by_month)

Logistic regression implementation

In [81]:
from sklearn.linear_model import LogisticRegression

LG = LogisticRegression(random_state=42)
LG = LG.fit(X_train, y_train)

In [164]:
output_preds(ranker_input, LG, "model_logistic_regression", popular_by_month)

Naive Bayes implementation

In [170]:
from sklearn.naive_bayes import GaussianNB

GNB = GaussianNB()
GNB = GNB.fit(X_train, y_train)

In [197]:
output_preds(ranker_input, GNB, "model_naive_bayes", popular_by_month)

Random Forest implementation

In [23]:
from sklearn.ensemble import RandomForestClassifier

RFC = RandomForestClassifier(n_estimators=20, max_depth=10, random_state=42)
RFC = RFC.fit(X_train, y_train)

In [56]:
output_preds(ranker_input, RFC, "model_random_forest", popular_by_month)

Gradient boosting implementation via scikit learn

In [26]:
from sklearn.ensemble import GradientBoostingClassifier

GBC = GradientBoostingClassifier(n_estimators=20, random_state=42, n_iter_no_change=5)
GBC = GBC.fit(X_train, y_train)

In [42]:
output_preds(ranker_input, GBC, "model_sklearn_gradient", popular_by_month)

XGBoost implementation

In [55]:
from xgboost import XGBClassifier

XGBC = XGBClassifier(n_estimators=20, random_state=42, early_stopping_rounds=5, eval_metric=['aucpr', 'logloss'])
XGBC = XGBC.fit(X_train, y_train,
                eval_set=[(X_test, y_test)])

[0]	validation_0-aucpr:0.74081	validation_0-logloss:0.63255
[1]	validation_0-aucpr:0.75226	validation_0-logloss:0.60041
[2]	validation_0-aucpr:0.75789	validation_0-logloss:0.57958
[3]	validation_0-aucpr:0.76363	validation_0-logloss:0.56570
[4]	validation_0-aucpr:0.77414	validation_0-logloss:0.55265
[5]	validation_0-aucpr:0.78220	validation_0-logloss:0.54431
[6]	validation_0-aucpr:0.78674	validation_0-logloss:0.53890
[7]	validation_0-aucpr:0.79343	validation_0-logloss:0.53155
[8]	validation_0-aucpr:0.79587	validation_0-logloss:0.52812
[9]	validation_0-aucpr:0.79840	validation_0-logloss:0.52495
[10]	validation_0-aucpr:0.80207	validation_0-logloss:0.52111
[11]	validation_0-aucpr:0.80297	validation_0-logloss:0.51902
[12]	validation_0-aucpr:0.80440	validation_0-logloss:0.51692
[13]	validation_0-aucpr:0.80507	validation_0-logloss:0.51592
[14]	validation_0-aucpr:0.80664	validation_0-logloss:0.51364
[15]	validation_0-aucpr:0.80770	validation_0-logloss:0.51170
[16]	validation_0-aucpr:0.80990	va

In [72]:
output_preds(ranker_input, XGBC, "model_xgboost", popular_by_month)

Catboost implementation

In [78]:
from catboost import CatBoostClassifier

CBC = CatBoostClassifier(n_estimators=20, random_state=42, early_stopping_rounds=5, custom_metric=['AUC', 'Logloss'])
CBC = CBC.fit(X_train, y_train,
                eval_set=[(X_test, y_test)])

Learning rate set to 0.5
0:	learn: 0.6243122	test: 0.6251442	best: 0.6251442 (0)	total: 997ms	remaining: 18.9s
1:	learn: 0.5950279	test: 0.5953459	best: 0.5953459 (1)	total: 1.95s	remaining: 17.5s
2:	learn: 0.5785664	test: 0.5793541	best: 0.5793541 (2)	total: 2.92s	remaining: 16.5s
3:	learn: 0.5677026	test: 0.5685620	best: 0.5685620 (3)	total: 3.92s	remaining: 15.7s
4:	learn: 0.5575830	test: 0.5583476	best: 0.5583476 (4)	total: 4.96s	remaining: 14.9s
5:	learn: 0.5516687	test: 0.5524927	best: 0.5524927 (5)	total: 5.96s	remaining: 13.9s
6:	learn: 0.5478969	test: 0.5487378	best: 0.5487378 (6)	total: 6.88s	remaining: 12.8s
7:	learn: 0.5432536	test: 0.5440636	best: 0.5440636 (7)	total: 7.86s	remaining: 11.8s
8:	learn: 0.5401012	test: 0.5408300	best: 0.5408300 (8)	total: 8.81s	remaining: 10.8s
9:	learn: 0.5374814	test: 0.5382113	best: 0.5382113 (9)	total: 9.76s	remaining: 9.76s
10:	learn: 0.5339554	test: 0.5347174	best: 0.5347174 (10)	total: 10.9s	remaining: 8.91s
11:	learn: 0.5313766	test: 

In [79]:
output_preds(ranker_input, CBC, "model_catboost", popular_by_month)