In [1]:
import numpy as np 
import pandas as pd
import random
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import preprocessing

In [2]:
import time

In [3]:
transactions = pd.read_csv('./data/transactions_train.csv')
articles = pd.read_csv('./data/articles.csv')
customers = pd.read_csv('./data/customers.csv')

In [4]:
transactions['purchased'] = 1

Transform string dates into weeks with the start of week 0 being the week for the final calculation (meaning everything in the data becomes a negative week)

In [9]:
import datetime

lookup = dict()
def str_dat_to_weeks_int(datestring):
    return lookup.setdefault(datestring, (datetime.datetime.strptime(datestring, "%Y-%m-%d") - datetime.datetime(2020, 9, 23)).days//7)

print(str_dat_to_weeks_int("2020-05-10"))
print(lookup)

-20
{'2020-05-10': -20}


In [10]:
transactions["t_dat"] = transactions["t_dat"].map(str_dat_to_weeks_int)

In [16]:
del lookup

Replace customer and article ids with label encoding. Article id is perhaps not necessary but I like it being parallel

In [14]:
customer_encoder = preprocessing.LabelEncoder()
article_encoder = preprocessing.LabelEncoder()

customer_encoder.fit(customers['customer_id'])
article_encoder.fit(articles['article_id'])

transactions['customer_id'] = customer_encoder.transform(transactions['customer_id'])
transactions['article_id'] = article_encoder.transform(transactions['article_id'])

del articles
del customers

In [29]:
np.save('customer_ids.npy', customer_encoder.classes_)
np.save('article_ids.npy', article_encoder.classes_)

Drop all transactions which happened more than 20 weeks before the end of the data collection period

In [11]:
transactions.drop(transactions[transactions["t_dat"] < -20].index, inplace=True)

Perform random negative sampling, most of this code is copied from the 2nd lecture

In [15]:
positive_pairs = list(map(tuple, transactions[['customer_id', 'article_id']].drop_duplicates().values))

In [17]:
real_dates = transactions["t_dat"].unique()
real_customers = transactions["customer_id"].unique()
real_articles = transactions["article_id"].unique()
real_channels = transactions["sales_channel_id"].unique()
article_and_price = transactions[["article_id","price"]].drop_duplicates("article_id").set_index("article_id").squeeze()

In [18]:
num_neg_pos = transactions.shape[0]

In [19]:
random.seed(42)
num_neg_samples = int(num_neg_pos * 1.1)

neg_dates = np.random.choice(real_dates, size=num_neg_samples)
neg_articles = np.random.choice(real_articles, size=num_neg_samples)
neg_customers = np.random.choice(real_customers, size=num_neg_samples)
neg_channels = np.random.choice(real_channels, size=num_neg_samples)
ordered = np.array([0] * num_neg_samples)

neg_prices = article_and_price[neg_articles].values

In [20]:
del real_dates
del real_customers
del real_articles
del real_channels
del article_and_price
del num_neg_samples

In [21]:
t_columns = transactions.columns

In [22]:
np_frame = np.column_stack((neg_dates, neg_customers, neg_articles, neg_prices, neg_channels, ordered))

In [23]:
neg_transactions = pd.DataFrame(np_frame, columns=t_columns)

In [24]:
del t_columns
del np_frame

In [25]:
duplicate_indexes = neg_transactions[["customer_id", "article_id"]].apply(tuple, 1).isin(positive_pairs)

In [26]:
neg_transactions = neg_transactions[~duplicate_indexes]

chosen_neg_transactions = neg_transactions.sample(num_neg_pos)
del neg_transactions

In [27]:
transactions = pd.concat([transactions, chosen_neg_transactions])

In [28]:
transactions

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,purchased
25414130,-20.0,47.0,70099.0,0.050831,2.0,1.0
25414131,-20.0,47.0,70976.0,0.025407,2.0,1.0
25414132,-20.0,66.0,91569.0,0.027441,2.0,1.0
25414133,-20.0,66.0,47295.0,0.015237,2.0,1.0
25414134,-20.0,66.0,92621.0,0.019814,2.0,1.0
...,...,...,...,...,...,...
1270157,-8.0,761104.0,71645.0,0.015237,1.0,0.0
3259023,-1.0,113194.0,86596.0,0.009458,2.0,0.0
6720473,-15.0,1113267.0,86411.0,0.042356,2.0,0.0
6960047,-12.0,748058.0,101298.0,0.025407,2.0,0.0


In [30]:
transactions.reset_index(inplace=True)
transactions.drop("index", axis=1, inplace=True)

In [31]:
print(transactions)

          t_dat  customer_id  article_id     price  sales_channel_id  \
0         -20.0         47.0     70099.0  0.050831               2.0   
1         -20.0         47.0     70976.0  0.025407               2.0   
2         -20.0         66.0     91569.0  0.027441               2.0   
3         -20.0         66.0     47295.0  0.015237               2.0   
4         -20.0         66.0     92621.0  0.019814               2.0   
...         ...          ...         ...       ...               ...   
12748383   -8.0     761104.0     71645.0  0.015237               1.0   
12748384   -1.0     113194.0     86596.0  0.009458               2.0   
12748385  -15.0    1113267.0     86411.0  0.042356               2.0   
12748386  -12.0     748058.0    101298.0  0.025407               2.0   
12748387   -8.0      47524.0     58329.0  0.012915               1.0   

          purchased  
0               1.0  
1               1.0  
2               1.0  
3               1.0  
4               1.0  
...

In [32]:
transactions.to_feather('./data/negativesampled.feather')

In [33]:
del num_neg_pos
del neg_dates
del neg_articles
del neg_customers
del neg_channels
del ordered
del neg_prices
del chosen_neg_transactions
del duplicate_indexes

This is intended as a checkpoint for if I need to reset the kernel for whatever reason. 

In [3]:
articles = pd.read_csv('./data/articles.csv')

In [4]:
transactions = pd.read_feather("./data/negativesampled.feather")

In [5]:
customers = pd.read_csv('./data/customers.csv')

In [6]:
customer_encoder = preprocessing.LabelEncoder()
customer_encoder.classes_ = np.load("customer_ids.npy", allow_pickle=True)

In [7]:
article_encoder = preprocessing.LabelEncoder()
article_encoder.classes_ = np.load("article_ids.npy", allow_pickle=True)

I decided to use an age of 25 as a default for those users without one

In [8]:
customers["age"] = customers["age"].fillna(25)

Apply the label encoding to the customer and article tables so they can be joined with transactions

In [9]:
customers['customer_id'] = customer_encoder.transform(customers['customer_id'])

In [10]:
articles['article_id'] = article_encoder.transform(articles['article_id'])

In [11]:
zip_encoder = preprocessing.LabelEncoder()
customers["postal_code"] = zip_encoder.fit_transform(customers["postal_code"])

In [14]:
customers.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,0,,,ACTIVE,NONE,49.0,112978
1,1,,,ACTIVE,NONE,25.0,57312
2,2,,,ACTIVE,NONE,24.0,139156
3,3,,,ACTIVE,NONE,54.0,128529
4,4,1.0,1.0,ACTIVE,Regularly,52.0,52371


In [12]:
customers["age"] = customers["age"].astype(int)

In [13]:
articles[['article_id', 'product_code', 'product_type_no','graphical_appearance_no','colour_group_code',
       'perceived_colour_value_id', 'perceived_colour_master_id', 'department_no',
       'index_group_no', 'section_no', 'garment_group_no']] = articles[['article_id', 'product_code', 'product_type_no','graphical_appearance_no','colour_group_code',
       'perceived_colour_value_id', 'perceived_colour_master_id', 'department_no',
       'index_group_no', 'section_no', 'garment_group_no']].astype(int)

In [19]:
articles.dtypes

article_id                       int32
product_code                     int32
prod_name                       object
product_type_no                  int32
product_type_name               object
product_group_name              object
graphical_appearance_no          int32
graphical_appearance_name       object
colour_group_code                int32
colour_group_name               object
perceived_colour_value_id        int32
perceived_colour_value_name     object
perceived_colour_master_id       int32
perceived_colour_master_name    object
department_no                    int32
department_name                 object
index_code                      object
index_name                      object
index_group_no                   int32
index_group_name                object
section_no                       int32
section_name                    object
garment_group_no                 int32
garment_group_name              object
detail_desc                     object
dtype: object

In [14]:
transactions[['t_dat', 'customer_id', 'article_id', 'sales_channel_id', 'purchased']] = transactions[['t_dat', 'customer_id', 'article_id', 'sales_channel_id', 'purchased']].astype(int)

In [18]:
transactions.dtypes

t_dat                 int32
customer_id           int32
article_id            int32
price               float64
sales_channel_id      int32
purchased             int32
dtype: object

In [15]:
transactions = transactions.merge(customers[["customer_id", "age", "postal_code"]], how="inner", on='customer_id')

In [16]:
transactions = transactions.merge(articles[["article_id", "product_code", "product_type_no", "graphical_appearance_no", "colour_group_code", "department_no", "index_group_no", "section_no", "garment_group_no"]], how="inner", on='article_id')

In [17]:
transactions

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,purchased,age,postal_code,product_code,product_type_no,graphical_appearance_no,colour_group_code,department_no,index_group_no,section_no,garment_group_no
0,-20,47,70099,0.050831,2,1,25,220995,759423,265,1010016,10,1313,1,11,1013
1,-7,32476,70099,0.044051,1,1,57,6312,759423,265,1010016,10,1313,1,11,1013
2,-20,65542,70099,0.045746,2,1,50,322223,759423,265,1010016,10,1313,1,11,1013
3,-10,164841,70099,0.050831,2,1,22,182018,759423,265,1010016,10,1313,1,11,1013
4,-20,282648,70099,0.045746,2,1,61,283888,759423,265,1010016,10,1313,1,11,1013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12748383,-15,526329,96221,0.022864,2,0,62,61034,869489,275,1010016,43,8713,4,77,1014
12748384,-2,100674,96221,0.022864,2,0,71,149415,869489,275,1010016,43,8713,4,77,1014
12748385,-2,439208,96221,0.022864,1,0,25,289245,869489,275,1010016,43,8713,4,77,1014
12748386,-19,732170,96221,0.022864,2,0,50,130375,869489,275,1010016,43,8713,4,77,1014


The first time I wrote and ran this notebook, somehow a bunch of ids I would prefer to be integers for data usage reasons were turned into strings, so the cell below was a stopgap fix.

This time they turned into floats which is probably fine because I don't think the numbers get large enough for the floating point precision to go lower than 1

In [108]:
transactions["customer_id"].dtype

dtype('int32')

In [109]:
transactions["customer_id"] = transactions["customer_id"].astype(int)
transactions["article_id"] = transactions["article_id"].astype(int)
transactions["t_dat"] = transactions["t_dat"].astype(int)
transactions["sales_channel_id"] = transactions["sales_channel_id"].astype(int)
transactions["purchased"] = transactions["purchased"].astype(int)
transactions["product_code"] = transactions["product_code"].astype(int)
transactions["product_type_no"] = transactions["product_type_no"].astype(int)
transactions["graphical_appearance_no"] = transactions["graphical_appearance_no"].astype(int)
transactions["colour_group_code"] = transactions["colour_group_code"].astype(int)
transactions["department_no"] = transactions["department_no"].astype(int)
transactions["index_group_no"] = transactions["index_group_no"].astype(int)
transactions["section_no"] = transactions["section_no"].astype(int)
transactions["garment_group_no"] = transactions["garment_group_no"].astype(int)
transactions["age"] = transactions["age"].astype(int)
transactions["postal_code"] = transactions["postal_code"].astype(int)
transactions["price"] = transactions["price"].astype(float)

I'm not entirely sure why I split the data into training and validation sets. I suppose it just feels like a good habit to have even if I'm very much not actually using the validation data since it's easier to upload the output to kaggle

In [17]:
X_train, X_test, y_train, y_test = train_test_split(transactions.drop(['purchased', "price", 'sales_channel_id'], axis=1), transactions['purchased'], test_size=0.10, random_state=42)

Let's start with lightgbm binary classifier

In [19]:
# copying from https://github.com/microsoft/LightGBM/blob/master/examples/python-guide/simple_example.py
# combined with https://github.com/angelotc/LightGBM-binary-classification-example/blob/master/CCData.ipynb

import lightgbm as lgb
print('Starting training...')

gbm = lgb.LGBMClassifier(learning_rate = 0.1, metric = 'l1', 
                        n_estimators = 20)
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric=['auc', 'binary_logloss'],
        callbacks=[lgb.early_stopping(stopping_rounds=5)])

print('Saving model...')

Starting training...
Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[20]	valid_0's auc: 0.813333	valid_0's binary_logloss: 0.541055	valid_0's l1: 0.387631
Saving model...


In [20]:
# save model to file
gbm.booster_.save_model('./data/model_lgbm_binary.txt')

<lightgbm.basic.Booster at 0x18494972970>

I somewhat stupidly forgot to calculate the most popular items before appending the random negative samples to my transactions table, so this is a silly workaround

In [18]:
popular_all_time = transactions[transactions["purchased"] == 1][["article_id", "purchased"]].groupby("article_id").count().sort_values(ascending=False, by="purchased").head(24).index.to_series().reset_index(drop=True)
popular_all_time

0     53892
1      1714
2     67522
3     22648
4     24837
5     70221
6      1713
7     24836
8     90082
9     22655
10     3091
11    75912
12    81746
13    67052
14    80056
15    84990
16    63596
17    81825
18    91737
19    47295
20    83396
21    87467
22    70234
23    70320
Name: article_id, dtype: int64

Generating twice as many popular items for the month as I want so I have spares in case there are duplicates

In [19]:
popular_by_month = transactions[(transactions["purchased"] == 1) & (transactions["t_dat"] >= -4)][["article_id", "purchased"]].groupby("article_id").count().sort_values(ascending=False, by="purchased").head(48).index.to_series().reset_index(drop=True)
popular_by_month

0      67522
1     103793
2     103796
3     104045
4      67543
5      53892
6     101718
7       3091
8     103108
9      94674
10    104553
11    104072
12     91737
13     56694
14     42626
15     57063
16    101367
17    103795
18     94656
19     92135
20     94657
21     95217
22    103794
23     71110
24     97518
25     95251
26     75912
27     71101
28     71107
29    105146
30        73
31    101729
32    103668
33     79488
34     53894
35     95789
36     95499
37    100937
38     67539
39     99394
40     46382
41     72734
42    104157
43    104947
44     18586
45      3520
46     75438
47    100938
Name: article_id, dtype: int64

In [20]:
popular_by_month2 = popular_by_month[~popular_by_month.isin(popular_all_time)]
popular_by_month2 = popular_by_month2.reset_index(drop=True).head(24)

Merge popular by month and popular "all time"

In [21]:
popular_candidates = pd.DataFrame(pd.concat([popular_all_time, popular_by_month2])).astype(int).reset_index(drop=True)

Put everything that needs to be ranked into a single dataframe and hope it fits into memory

In [32]:
transactions[["customer_id"]].drop_duplicates(subset="customer_id")

Unnamed: 0,customer_id
0,47
1,32476
2,65542
3,164841
4,282648
...,...
10394747,418992
10406633,184898
10800687,934227
10858768,1075374


In [22]:
ranker_input = pd.DataFrame(data={"t_dat": 0}, index=[0]).merge(transactions[["customer_id"]].drop_duplicates(subset="customer_id"), how="cross").merge(popular_candidates, how="cross").merge(customers[["customer_id", "age", "postal_code"]], how="inner", on="customer_id").merge(articles[["article_id", "product_code", "product_type_no", "graphical_appearance_no", "colour_group_code", "department_no", "index_group_no", "section_no", "garment_group_no"]], how="inner", on="article_id")

In [35]:
ranker_input

Unnamed: 0,t_dat,customer_id,article_id,age,postal_code,product_code,product_type_no,graphical_appearance_no,colour_group_code,department_no,index_group_no,section_no,garment_group_no
0,0,47,53892,25,220995,706016,272,1010016,9,1747,2,53,1009
1,0,32476,53892,57,6312,706016,272,1010016,9,1747,2,53,1009
2,0,65542,53892,50,322223,706016,272,1010016,9,1747,2,53,1009
3,0,164841,53892,22,182018,706016,272,1010016,9,1747,2,53,1009
4,0,282648,53892,61,283888,706016,272,1010016,9,1747,2,53,1009
...,...,...,...,...,...,...,...,...,...,...,...,...,...
31632379,0,418992,71107,35,40269,762846,259,1010016,13,1515,1,11,1010
31632380,0,184898,71107,38,126518,762846,259,1010016,13,1515,1,11,1010
31632381,0,934227,71107,41,130191,762846,259,1010016,13,1515,1,11,1010
31632382,0,1075374,71107,27,223897,762846,259,1010016,13,1515,1,11,1010


This is from when I messed up and my equivalent to ranker_input was too big for my memory but I'm leaving it here in case I need it again at some point

In [262]:
f = open("./data/test_output.csv", "w")
f.write("customer_id,prediction\n")
f.close()

for index, row in customers.groupby(np.arange(len(customers)) // 100):
    curr_row_in = unsold_article_id_labels.merge(row[["customer_id", "age", "postal_code"]], how="cross")
    curr_row_probs = gbm.predict_proba(curr_row_in[["customer_id", "article_id", "age", "postal_code"]], num_iteration=gbm.best_iteration_)
    curr_row_in[["probability_0", "probability_1"]] = curr_row_probs
    curr_row_in["customer_id"] = customer_encoder.inverse_transform(curr_row_in["customer_id"])
    curr_row_in["article_id"] = article_encoder.inverse_transform(curr_row_in["article_id"])
    ordering = curr_row_in.sort_values(by=["customer_id", "probability_1"], ascending=[True, False]).set_index("customer_id").groupby("customer_id")["article_id"].apply(lambda x : " ".join(["0" + str(i) for i in x[:12]]))
    ordering.to_csv("./data/test_output.csv", mode="a", header=False)

In [252]:
f.close()

Perform the predictions

In [26]:
predictions = gbm.predict_proba(ranker_input, num_iteration=gbm.best_iteration_)

In [27]:
output_test = ranker_input
output_test[["probability_0", "probability_1"]] = predictions

In [28]:
output_test['article_id'] = article_encoder.inverse_transform(output_test["article_id"])

Copying this from the radekosmulski notebook

In [29]:
ranking_dict = output_test.sort_values(by=["customer_id", "probability_1"], ascending=[True, False]).groupby("customer_id")["article_id"].apply(list).to_dict()

In [30]:
sub = pd.read_csv('data/sample_submission.csv')

In [32]:
pop_candidate_array = article_encoder.inverse_transform(popular_by_month.values)

In [34]:
preds = []
for c_id in customer_encoder.transform(sub.customer_id):
    pred = ranking_dict.get(c_id, pop_candidate_array)
    preds.append(pred[:12])

In [36]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds

In [37]:
sub.to_csv('./data/model_lgbm_binary.csv', index=False)

In [38]:
ranker_input['article_id'] = article_encoder.transform(ranker_input["article_id"])

Next up is the ranker

In [39]:
ranker_input.drop(['probability_0', 'probability_1'], inplace=True, axis=1)

In [40]:
ranker_input

Unnamed: 0,t_dat,customer_id,article_id,age,postal_code,product_code,product_type_no,graphical_appearance_no,colour_group_code,department_no,index_group_no,section_no,garment_group_no
0,0,47,53892,25,220995,706016,272,1010016,9,1747,2,53,1009
1,0,32476,53892,57,6312,706016,272,1010016,9,1747,2,53,1009
2,0,65542,53892,50,322223,706016,272,1010016,9,1747,2,53,1009
3,0,164841,53892,22,182018,706016,272,1010016,9,1747,2,53,1009
4,0,282648,53892,61,283888,706016,272,1010016,9,1747,2,53,1009
...,...,...,...,...,...,...,...,...,...,...,...,...,...
31632379,0,418992,71107,35,40269,762846,259,1010016,13,1515,1,11,1010
31632380,0,184898,71107,38,126518,762846,259,1010016,13,1515,1,11,1010
31632381,0,934227,71107,41,130191,762846,259,1010016,13,1515,1,11,1010
31632382,0,1075374,71107,27,223897,762846,259,1010016,13,1515,1,11,1010


In [41]:
# copying from https://github.com/microsoft/LightGBM/blob/master/examples/python-guide/simple_example.py
# combined with https://github.com/angelotc/LightGBM-binary-classification-example/blob/master/CCData.ipynb
# and this one https://github.com/radekosmulski/personalized_fashion_recs/blob/main/03c_Basic_Model_Submission.ipynb



train_baskets = X_train.groupby(['t_dat', 'customer_id'])['article_id'].count().values
train_baskets_test = X_test.groupby(['t_dat', 'customer_id'])['article_id'].count().values

print('Starting training...')

gbm = lgb.LGBMRanker(learning_rate = 0.1, metric = 'l1', 
                        n_estimators = 20)
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        group=train_baskets,
        eval_group=[train_baskets_test],
        eval_metric=['auc', 'binary_logloss'],
        callbacks=[lgb.early_stopping(stopping_rounds=5)])

print('Saving model...')

Starting training...
Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.778104	valid_0's binary_logloss: 5.37591	valid_0's l1: 0.522243
Saving model...


In [42]:
# save model to file
gbm.booster_.save_model('./data/model_lgbm_ranker.txt')

<lightgbm.basic.Booster at 0x184dfe85070>

In [43]:
predictions = gbm.predict(ranker_input, num_iteration=gbm.best_iteration_)

In [46]:
output_test = ranker_input
output_test["probability_1"] = predictions

In [47]:
output_test['article_id'] = article_encoder.inverse_transform(output_test["article_id"])

Copying this from the radekosmulski notebook

In [48]:
ranking_dict = output_test.sort_values(by=["customer_id", "probability_1"], ascending=[True, False]).groupby("customer_id")["article_id"].apply(list).to_dict()

In [49]:
sub = pd.read_csv('data/sample_submission.csv')

In [50]:
pop_candidate_array = article_encoder.inverse_transform(popular_by_month.values)

In [51]:
preds = []
for c_id in customer_encoder.transform(sub.customer_id):
    pred = ranking_dict.get(c_id, pop_candidate_array)
    preds.append(pred[:12])

In [53]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds

In [55]:
sub.to_csv('./data/model_lgbm_ranker.csv', index=False)

In [146]:
ranker_input['article_id'] = article_encoder.transform(ranker_input["article_id"])

Logistic regression time, referencing https://towardsdatascience.com/logistic-regression-using-python-sklearn-numpy-mnist-handwriting-recognition-matplotlib-a6b31e2b166a

In [69]:
ranker_input.drop(['probability_1'], inplace=True, axis=1)

In [81]:
LG = LogisticRegression(random_state=42)

In [153]:
LG = LG.fit(X_train, y_train)

In [154]:
predictions = LG.predict_proba(ranker_input)

In [157]:
output_test = ranker_input
output_test[["probability0", "probability_1"]] = predictions

In [158]:
output_test['article_id'] = article_encoder.inverse_transform(output_test["article_id"])

Copying this from the radekosmulski notebook

In [159]:
ranking_dict = output_test.sort_values(by=["customer_id", "probability_1"], ascending=[True, False]).groupby("customer_id")["article_id"].apply(list).to_dict()

In [160]:
sub = pd.read_csv('data/sample_submission.csv')

In [161]:
pop_candidate_array = article_encoder.inverse_transform(popular_by_month.values)

In [162]:
preds = []
for c_id in customer_encoder.transform(sub.customer_id):
    pred = ranking_dict.get(c_id, pop_candidate_array)
    preds.append(pred[:12])

In [163]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds

In [164]:
sub.to_csv('./data/model_logistic_regression.csv', index=False)

In [165]:
ranker_input['article_id'] = article_encoder.transform(ranker_input["article_id"])

Naive Bayes time

In [169]:
ranker_input.drop(['probability0', 'probability_1'], inplace=True, axis=1)

In [170]:
from sklearn.naive_bayes import GaussianNB

In [186]:
GNB = GaussianNB()

In [187]:
GNB = GNB.fit(X_train, y_train)

In [188]:
predictions = GNB.predict_proba(ranker_input)

In [189]:
predictions

array([[0.09133772, 0.90866228],
       [0.09943939, 0.90056061],
       [0.09170467, 0.90829533],
       ...,
       [0.05959371, 0.94040629],
       [0.05870154, 0.94129846],
       [0.05997718, 0.94002282]])

In [190]:
output_test = ranker_input
output_test[["probability0", "probability_1"]] = predictions

In [191]:
output_test['article_id'] = article_encoder.inverse_transform(output_test["article_id"])

Copying this from the radekosmulski notebook

In [192]:
ranking_dict = output_test.sort_values(by=["customer_id", "probability_1"], ascending=[True, False]).groupby("customer_id")["article_id"].apply(list).to_dict()

In [193]:
sub = pd.read_csv('data/sample_submission.csv')

In [194]:
pop_candidate_array = article_encoder.inverse_transform(popular_by_month.values)

In [195]:
preds = []
for c_id in customer_encoder.transform(sub.customer_id):
    pred = ranking_dict.get(c_id, pop_candidate_array)
    preds.append(pred[:12])

In [196]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds

In [197]:
sub.to_csv('./data/model_naive_bayes.csv', index=False)

In [198]:
ranker_input['article_id'] = article_encoder.transform(ranker_input["article_id"])

Random Forest time

In [44]:
ranker_input.drop(['probability0', 'probability_1'], inplace=True, axis=1)

In [23]:
from sklearn.ensemble import RandomForestClassifier

In [45]:
RFC = RandomForestClassifier(n_estimators=20, max_depth=10, random_state=42)

In [46]:
RFC = RFC.fit(X_train, y_train)

In [47]:
predictions = RFC.predict_proba(ranker_input)

In [48]:
predictions

array([[0.38622072, 0.61377928],
       [0.40414821, 0.59585179],
       [0.38622072, 0.61377928],
       ...,
       [0.50358896, 0.49641104],
       [0.50358896, 0.49641104],
       [0.50358896, 0.49641104]])

In [49]:
output_test = ranker_input
output_test[["probability0", "probability_1"]] = predictions

In [50]:
output_test['article_id'] = article_encoder.inverse_transform(output_test["article_id"])

Copying this from the radekosmulski notebook

In [51]:
ranking_dict = output_test.sort_values(by=["customer_id", "probability_1"], ascending=[True, False]).groupby("customer_id")["article_id"].apply(list).to_dict()

In [52]:
sub = pd.read_csv('data/sample_submission.csv')

In [53]:
pop_candidate_array = article_encoder.inverse_transform(popular_by_month.values)

In [54]:
preds = []
for c_id in customer_encoder.transform(sub.customer_id):
    pred = ranking_dict.get(c_id, pop_candidate_array)
    preds.append(pred[:12])

In [55]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds

In [56]:
sub.to_csv('./data/model_random_forest.csv', index=False)

In [57]:
ranker_input['article_id'] = article_encoder.transform(ranker_input["article_id"])

Finally Scikit implementation of gradient boosting

In [None]:
ranker_input.drop(['probability0', 'probability_1'], inplace=True, axis=1)

In [26]:
from sklearn.ensemble import GradientBoostingClassifier

In [31]:
GBC = GradientBoostingClassifier(n_estimators=20, random_state=42, n_iter_no_change=5)

In [32]:
GBC = GBC.fit(X_train, y_train)

In [33]:
predictions = GBC.predict_proba(ranker_input)

In [34]:
predictions

array([[0.44175174, 0.55824826],
       [0.44175174, 0.55824826],
       [0.44175174, 0.55824826],
       ...,
       [0.54499219, 0.45500781],
       [0.54499219, 0.45500781],
       [0.54499219, 0.45500781]])

In [35]:
output_test = ranker_input
output_test[["probability0", "probability_1"]] = predictions

In [36]:
output_test['article_id'] = article_encoder.inverse_transform(output_test["article_id"])

Copying this from the radekosmulski notebook

In [37]:
ranking_dict = output_test.sort_values(by=["customer_id", "probability_1"], ascending=[True, False]).groupby("customer_id")["article_id"].apply(list).to_dict()

In [38]:
sub = pd.read_csv('data/sample_submission.csv')

In [39]:
pop_candidate_array = article_encoder.inverse_transform(popular_by_month.values)

In [40]:
preds = []
for c_id in customer_encoder.transform(sub.customer_id):
    pred = ranking_dict.get(c_id, pop_candidate_array)
    preds.append(pred[:12])

In [41]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds

In [42]:
sub.to_csv('./data/model_sklearn_gradient.csv', index=False)

In [43]:
ranker_input['article_id'] = article_encoder.transform(ranker_input["article_id"])