In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [3]:
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

# Clustering

In [4]:
features = pd.read_csv('item_features.csv')

In [5]:
n_pca_comp = 16
number_of_clusters = 4

In [8]:
cluster_assign = pd.read_csv('cluster_assign.csv')
cluster_assign = cluster_assign[['item_id2', 'Cluster_pred']]
cluster_assign

Unnamed: 0,item_id2,Cluster_pred
0,2,4
1,3,0
2,4,0
3,7,1
4,8,0
...,...,...
23686,28139,0
23687,28140,3
23688,28141,0
23689,28142,1


# Feature Engineering

- First seen product
- Instant when the first product was seen
- Last seen product
- Instant when the last product was seen
- Total session duration
- Mean time spent in each product
- Time of the day when the session started

In [9]:
df_sessions = pd.read_csv('train_sessions.csv')

In [10]:
df_sessions.head()

Unnamed: 0,session_id,item_id,date
0,3,9655,2020-12-18 21:25:00.373
1,3,9655,2020-12-18 21:19:48.093
2,13,15654,2020-03-13 19:35:27.136
3,18,18316,2020-08-26 19:18:30.833
4,18,2507,2020-08-26 19:16:31.211


In [11]:
session_sorted = df_sessions.sort_values(by=['date'])

# First date of session
begin_df = session_sorted.groupby('session_id')['date'].first().rename('start_date')

# First product of session
first_prod = session_sorted.groupby('session_id')['item_id'].first().rename('first_prod')

# Number of products seen in the session
prod_count = session_sorted.groupby('session_id')['item_id'].count().rename('prod_count')

# Last date of the session
end_df = session_sorted.groupby('session_id')['date'].last().rename('end_date')

# Last product of the session
last_prod = session_sorted.groupby('session_id')['item_id'].last().rename('last_prod')

# most common category in these session
join_cat = session_sorted.merge(features, on='item_id', how='left').sort_values(by=['feature_category_id'])
common_cat = join_cat.groupby(['session_id'])['feature_category_id'].value_counts().rename('most_common_cat')
common_cat_df = common_cat.to_frame(name = None).reset_index()
common_cat = common_cat_df.groupby(['session_id'])['feature_category_id'].first().rename('most_common_cat')

# counting number of unique categories for session
count_num_unique_cat = common_cat_df.groupby(['session_id'])['feature_category_id'].count().rename('count_num_unique_cat')


# Concat the datasets
times_df = pd.concat([begin_df, end_df, first_prod, last_prod, 
                      prod_count, common_cat, count_num_unique_cat
                     ],axis=1)

# Proper time format
times_df['start_date'] = pd.to_datetime(times_df['start_date'])
times_df['end_date'] = pd.to_datetime(times_df['end_date'])

# Calculatind the date difference
times_df['time_diff'] = (times_df.end_date-times_df.start_date).astype('timedelta64[s]')
times_df['time_diff'] = times_df['time_diff'].astype(int)

# Calculating the time per product
times_df['time_per_prod'] = (times_df['time_diff']/times_df['prod_count']).astype(int)

# Adding the time of the day
mask=(times_df.start_date.dt.hour>=0) & (times_df.start_date.dt.hour<7)
times_df.loc[mask,'time_first_prod'] = 'madrugada'
mask=(times_df.start_date.dt.hour>=7) & (times_df.start_date.dt.hour<12)
times_df.loc[mask,'time_first_prod'] = 'día'
mask=(times_df.start_date.dt.hour>=12) & (times_df.start_date.dt.hour<18)
times_df.loc[mask,'time_first_prod'] = 'tarde'
mask=(times_df.start_date.dt.hour>=18) & (times_df.start_date.dt.hour<24)
times_df.loc[mask,'time_first_prod'] = 'noche'


In [12]:
sessions=times_df
sessions

Unnamed: 0_level_0,start_date,end_date,first_prod,last_prod,prod_count,most_common_cat,count_num_unique_cat,time_diff,time_per_prod,time_first_prod
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3,2020-12-18 21:19:48.093,2020-12-18 21:25:00.373,9655,9655,2,2,18,312,156,noche
13,2020-03-13 19:35:27.136,2020-03-13 19:35:27.136,15654,15654,1,4,20,0,0,noche
18,2020-08-26 19:15:47.232,2020-08-26 19:18:30.833,4026,18316,3,1,25,163,54,noche
19,2020-11-02 16:30:36.378,2020-11-02 16:48:39.343,19896,8268,17,4,28,1082,63,tarde
24,2020-02-26 17:22:48.903,2020-02-26 18:24:32.770,2927,18476,9,30,43,3703,411,tarde
...,...,...,...,...,...,...,...,...,...,...
4439986,2021-05-13 11:46:56.606,2021-05-13 11:55:50.151,23502,27733,6,4,18,533,88,día
4439990,2020-08-22 11:38:48.785,2020-08-22 12:36:27.326,22093,22551,11,4,46,3458,314,día
4439994,2020-11-27 20:08:37.262,2020-11-27 20:08:37.262,25357,25357,1,3,25,0,0,noche
4439999,2020-11-27 10:52:12.577,2020-11-27 10:59:28.653,6007,15853,7,47,46,436,62,día


# Purchases

In [13]:
purchases = pd.read_csv('train_purchases.csv').set_index('session_id')
purchases['date']=pd.to_datetime(purchases['date'])
purchases.rename(columns={'item_id':'item_purch','date':'purch_date'}, inplace=True)
purchases.head()

Unnamed: 0_level_0,item_purch,purch_date
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3,15085,2020-12-18 21:26:47.986
13,18626,2020-03-13 19:36:15.507
18,24911,2020-08-26 19:20:32.049
19,12534,2020-11-02 17:16:45.920
24,13226,2020-02-26 18:27:44.114


In [14]:
purchases.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 3 to 4440001
Data columns (total 2 columns):
 #   Column      Non-Null Count    Dtype         
---  ------      --------------    -----         
 0   item_purch  1000000 non-null  int64         
 1   purch_date  1000000 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(1)
memory usage: 22.9 MB


# Features
- Tabla con sesiones y veces que se ha visto cada clúster

In [15]:
item_clust = cluster_assign[['item_id2', 'Cluster_pred']].set_index('item_id2')
item_clust

Unnamed: 0_level_0,Cluster_pred
item_id2,Unnamed: 1_level_1
2,4
3,0
4,0
7,1
8,0
...,...
28139,0
28140,3
28141,0
28142,1


In [16]:
df_sess_clust=df_sessions.merge(item_clust, left_on='item_id', right_index=True)
df_sess_clust

Unnamed: 0,session_id,item_id,date,Cluster_pred
0,3,9655,2020-12-18 21:25:00.373,3
1,3,9655,2020-12-18 21:19:48.093,3
103605,97085,9655,2020-12-27 20:22:13.011,3
162423,152155,9655,2020-11-19 16:26:18.468,3
225780,210806,9655,2021-01-02 05:37:37.183,3
...,...,...,...,...
4737630,4434058,3416,2020-11-14 23:33:46.655,0
4737631,4434058,15964,2020-11-14 23:33:58.652,0
4739963,4436170,27863,2020-07-20 06:36:30.611,1
4741053,4437232,7735,2020-01-02 20:07:06.367,0


In [17]:
df_clust_count = df_sess_clust.groupby(['session_id', 'Cluster_pred'])['session_id'].aggregate('count').unstack().fillna(0)


In [18]:
df_clust_count

Cluster_pred,0,1,2,3,4
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3,0.0,0.0,0.0,2.0,0.0
13,0.0,0.0,0.0,0.0,1.0
18,3.0,0.0,0.0,0.0,0.0
19,17.0,0.0,0.0,0.0,0.0
24,0.0,3.0,2.0,1.0,3.0
...,...,...,...,...,...
4439986,0.0,0.0,0.0,0.0,6.0
4439990,8.0,0.0,0.0,1.0,2.0
4439994,1.0,0.0,0.0,0.0,0.0
4439999,6.0,0.0,0.0,1.0,0.0


What is the most seen cluster of each session?

In [19]:
df_clust_count["most_seen_cluster"] = df_clust_count.idxmax(axis=1)
df_clust_count

Cluster_pred,0,1,2,3,4,most_seen_cluster
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,0.0,0.0,0.0,2.0,0.0,3
13,0.0,0.0,0.0,0.0,1.0,4
18,3.0,0.0,0.0,0.0,0.0,0
19,17.0,0.0,0.0,0.0,0.0,0
24,0.0,3.0,2.0,1.0,3.0,1
...,...,...,...,...,...,...
4439986,0.0,0.0,0.0,0.0,6.0,4
4439990,8.0,0.0,0.0,1.0,2.0,0
4439994,1.0,0.0,0.0,0.0,0.0,0
4439999,6.0,0.0,0.0,1.0,0.0,0


# Combine all datasets

In [20]:
df=pd.concat([sessions, purchases, df_clust_count], axis=1)
df

Unnamed: 0_level_0,start_date,end_date,first_prod,last_prod,prod_count,most_common_cat,count_num_unique_cat,time_diff,time_per_prod,time_first_prod,item_purch,purch_date,0,1,2,3,4,most_seen_cluster
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
3,2020-12-18 21:19:48.093,2020-12-18 21:25:00.373,9655,9655,2,2,18,312,156,noche,15085,2020-12-18 21:26:47.986,0.0,0.0,0.0,2.0,0.0,3
13,2020-03-13 19:35:27.136,2020-03-13 19:35:27.136,15654,15654,1,4,20,0,0,noche,18626,2020-03-13 19:36:15.507,0.0,0.0,0.0,0.0,1.0,4
18,2020-08-26 19:15:47.232,2020-08-26 19:18:30.833,4026,18316,3,1,25,163,54,noche,24911,2020-08-26 19:20:32.049,3.0,0.0,0.0,0.0,0.0,0
19,2020-11-02 16:30:36.378,2020-11-02 16:48:39.343,19896,8268,17,4,28,1082,63,tarde,12534,2020-11-02 17:16:45.920,17.0,0.0,0.0,0.0,0.0,0
24,2020-02-26 17:22:48.903,2020-02-26 18:24:32.770,2927,18476,9,30,43,3703,411,tarde,13226,2020-02-26 18:27:44.114,0.0,3.0,2.0,1.0,3.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4439986,2021-05-13 11:46:56.606,2021-05-13 11:55:50.151,23502,27733,6,4,18,533,88,día,2915,2021-05-13 11:56:37.464,0.0,0.0,0.0,0.0,6.0,4
4439990,2020-08-22 11:38:48.785,2020-08-22 12:36:27.326,22093,22551,11,4,46,3458,314,día,8786,2020-08-22 14:28:22.382,8.0,0.0,0.0,1.0,2.0,0
4439994,2020-11-27 20:08:37.262,2020-11-27 20:08:37.262,25357,25357,1,3,25,0,0,noche,21630,2020-11-27 20:10:28.961,1.0,0.0,0.0,0.0,0.0,0
4439999,2020-11-27 10:52:12.577,2020-11-27 10:59:28.653,6007,15853,7,47,46,436,62,día,16962,2020-11-27 11:01:41.356,6.0,0.0,0.0,1.0,0.0,0


What was the cluster of the first and the last seen item?

In [21]:
df = df.reset_index().merge(item_clust, how='left', left_on='first_prod', right_on=item_clust.index).set_index('session_id')
df = df.rename(columns = {'Cluster_pred':'first_item_cluster'})
df = df.reset_index().merge(item_clust, how='left', left_on='last_prod', right_on=item_clust.index).set_index('session_id')
df = df.rename(columns = {'Cluster_pred':'last_item_cluster'})

In [22]:
df

Unnamed: 0_level_0,start_date,end_date,first_prod,last_prod,prod_count,most_common_cat,count_num_unique_cat,time_diff,time_per_prod,time_first_prod,item_purch,purch_date,0,1,2,3,4,most_seen_cluster,first_item_cluster,last_item_cluster
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
3,2020-12-18 21:19:48.093,2020-12-18 21:25:00.373,9655,9655,2,2,18,312,156,noche,15085,2020-12-18 21:26:47.986,0.0,0.0,0.0,2.0,0.0,3,3,3
13,2020-03-13 19:35:27.136,2020-03-13 19:35:27.136,15654,15654,1,4,20,0,0,noche,18626,2020-03-13 19:36:15.507,0.0,0.0,0.0,0.0,1.0,4,4,4
18,2020-08-26 19:15:47.232,2020-08-26 19:18:30.833,4026,18316,3,1,25,163,54,noche,24911,2020-08-26 19:20:32.049,3.0,0.0,0.0,0.0,0.0,0,0,0
19,2020-11-02 16:30:36.378,2020-11-02 16:48:39.343,19896,8268,17,4,28,1082,63,tarde,12534,2020-11-02 17:16:45.920,17.0,0.0,0.0,0.0,0.0,0,0,0
24,2020-02-26 17:22:48.903,2020-02-26 18:24:32.770,2927,18476,9,30,43,3703,411,tarde,13226,2020-02-26 18:27:44.114,0.0,3.0,2.0,1.0,3.0,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4439986,2021-05-13 11:46:56.606,2021-05-13 11:55:50.151,23502,27733,6,4,18,533,88,día,2915,2021-05-13 11:56:37.464,0.0,0.0,0.0,0.0,6.0,4,4,4
4439990,2020-08-22 11:38:48.785,2020-08-22 12:36:27.326,22093,22551,11,4,46,3458,314,día,8786,2020-08-22 14:28:22.382,8.0,0.0,0.0,1.0,2.0,0,0,0
4439994,2020-11-27 20:08:37.262,2020-11-27 20:08:37.262,25357,25357,1,3,25,0,0,noche,21630,2020-11-27 20:10:28.961,1.0,0.0,0.0,0.0,0.0,0,0,0
4439999,2020-11-27 10:52:12.577,2020-11-27 10:59:28.653,6007,15853,7,47,46,436,62,día,16962,2020-11-27 11:01:41.356,6.0,0.0,0.0,1.0,0.0,0,0,0


In order to reduce the training time, we will need to reduce the number of sessions to 112055 (id of the session must be less or equal than 500000).

In [23]:
df = df[df.index<=50000]

df

Unnamed: 0_level_0,start_date,end_date,first_prod,last_prod,prod_count,most_common_cat,count_num_unique_cat,time_diff,time_per_prod,time_first_prod,item_purch,purch_date,0,1,2,3,4,most_seen_cluster,first_item_cluster,last_item_cluster
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
3,2020-12-18 21:19:48.093,2020-12-18 21:25:00.373,9655,9655,2,2,18,312,156,noche,15085,2020-12-18 21:26:47.986,0.0,0.0,0.0,2.0,0.0,3,3,3
13,2020-03-13 19:35:27.136,2020-03-13 19:35:27.136,15654,15654,1,4,20,0,0,noche,18626,2020-03-13 19:36:15.507,0.0,0.0,0.0,0.0,1.0,4,4,4
18,2020-08-26 19:15:47.232,2020-08-26 19:18:30.833,4026,18316,3,1,25,163,54,noche,24911,2020-08-26 19:20:32.049,3.0,0.0,0.0,0.0,0.0,0,0,0
19,2020-11-02 16:30:36.378,2020-11-02 16:48:39.343,19896,8268,17,4,28,1082,63,tarde,12534,2020-11-02 17:16:45.920,17.0,0.0,0.0,0.0,0.0,0,0,0
24,2020-02-26 17:22:48.903,2020-02-26 18:24:32.770,2927,18476,9,30,43,3703,411,tarde,13226,2020-02-26 18:27:44.114,0.0,3.0,2.0,1.0,3.0,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49976,2021-01-03 15:10:27.864,2021-01-03 15:27:03.675,8114,23287,2,3,28,995,497,tarde,24900,2021-01-03 15:28:38.196,2.0,0.0,0.0,0.0,0.0,0,0,0
49980,2021-03-31 20:09:31.139,2021-03-31 20:12:58.553,23034,2881,3,4,37,207,69,noche,3057,2021-03-31 20:14:19.965,2.0,0.0,0.0,0.0,1.0,0,0,0
49991,2020-08-29 16:59:45.686,2020-08-29 16:59:45.686,7490,7490,1,4,19,0,0,tarde,7590,2020-08-29 17:01:09.711,0.0,0.0,0.0,0.0,1.0,4,4,4
49993,2021-03-01 15:12:42.574,2021-03-01 15:12:42.574,19048,19048,1,3,24,0,0,tarde,26853,2021-03-01 15:16:11.443,1.0,0.0,0.0,0.0,0.0,0,0,0


In [24]:
df.to_csv('dataset_merge.csv')

### How many type of items are seen in each session?

This gives us valuable insights on whether the clustering is giving us logical information. If the clustering has been made correctly, it would be normal to expect that in the majority of sessions only one type of products has been seen. If the clustering returned random results, not many sessions would have only one type of product seen because the products would have been assigned to clusters randomly, and each session would normally have more than one type of cluster.

In [26]:
df_clusts = df[range(0,number_of_clusters)]
df_clusts = df_clusts.replace(0, np.nan)
df_clusts["distinct"] = df_clusts.count(1)
#df_clusts.head()
item_counts = df_clusts["distinct"].value_counts()
print(item_counts)

1    6104
0    2719
2    1854
3     425
4      70
Name: distinct, dtype: int64


In [27]:
df_clusts = df[range(0,number_of_clusters)]
df_clusts = df_clusts.replace(0, np.nan)
df_clusts["sum"] = df_clusts.sum(axis=1)
df_clusts = df_clusts[df_clusts["sum"] > 4.0]
df_clusts["distinct"] = df_clusts.count(1)
df_clusts.head()
item_counts = df_clusts["distinct"].value_counts()
print(item_counts)

3    1044
2    1035
4     363
5      69
Name: distinct, dtype: int64


## Divide into train - validation

In [28]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(df, test_size=0.2, random_state=40)
len(train_set),len(test_set)

(8937, 2235)

In [29]:
cat_vars = ["first_prod", "last_prod", "time_first_prod", "most_common_cat", "count_num_unique_cat", "most_seen_cluster", "first_item_cluster", "last_item_cluster"]
num_vars = ["prod_count", "time_diff", "time_per_prod"] + list(range(0,number_of_clusters))

## Prepare the data for Machine Learning algorithms

In [30]:
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder(handle_unknown='ignore')
train_set_1hot = cat_encoder.fit_transform(train_set.fillna('Unknown'))

In [31]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")

In [32]:
train_set_num = train_set[num_vars]

In [33]:
imputer.fit(train_set_num)
imputer.statistics_

array([  3., 108.,  30.,   1.,   0.,   0.,   0.])

In [35]:
X = imputer.transform(train_set_num)

In [36]:
train_set_tr = pd.DataFrame(X, columns=train_set_num.columns,
                          index=train_set.index)

In [67]:
train_set_tr

Unnamed: 0_level_0,prod_count,time_diff,time_per_prod,0,1,2,3
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
29421,2.0,87.0,43.0,0.0,0.0,2.0,0.0
49205,4.0,155.0,38.0,2.0,0.0,0.0,0.0
38886,4.0,212.0,53.0,1.0,3.0,0.0,0.0
36511,2.0,31692.0,15846.0,2.0,0.0,0.0,0.0
5287,2.0,5741.0,2870.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
35188,1.0,0.0,0.0,1.0,0.0,0.0,0.0
16046,1.0,0.0,0.0,0.0,0.0,1.0,0.0
26765,2.0,74.0,37.0,0.0,0.0,0.0,0.0
24395,4.0,58470.0,14617.0,0.0,0.0,4.0,0.0


### Categorical data

Now let's preprocess the categorical input:

In [68]:
train_set_cat = train_set[cat_vars]
# There is missing values in this category, we will create a new cateogry for those.
train_set_cat.head(10)

Unnamed: 0_level_0,first_prod,last_prod,time_first_prod,most_common_cat,count_num_unique_cat,most_seen_cluster,first_item_cluster,last_item_cluster
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
29421,3710,5547,tarde,4,20,2,2,2
49205,17370,19885,día,4,31,0,4,4
38886,11117,5217,día,30,26,1,0,1
36511,19638,16667,día,3,24,0,0,0
5287,9910,27391,tarde,4,31,0,0,4
40450,6692,8622,día,47,29,4,3,4
45297,2098,2098,noche,4,18,4,4,4
28768,26257,22886,tarde,2,18,3,3,3
23819,6651,26656,noche,30,38,0,0,2
22112,7784,7784,noche,7,18,4,4,4


In [69]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
train_set_cat_encoded = ordinal_encoder.fit_transform(train_set_cat)
train_set_cat_encoded[:10]

array([[6.670e+02, 1.073e+03, 3.000e+00, 3.000e+00, 1.200e+01, 2.000e+00,
        2.000e+00, 2.000e+00],
       [3.203e+03, 3.914e+03, 0.000e+00, 3.000e+00, 2.300e+01, 0.000e+00,
        4.000e+00, 4.000e+00],
       [2.060e+03, 1.023e+03, 0.000e+00, 9.000e+00, 1.800e+01, 1.000e+00,
        0.000e+00, 1.000e+00],
       [3.645e+03, 3.259e+03, 0.000e+00, 2.000e+00, 1.600e+01, 0.000e+00,
        0.000e+00, 0.000e+00],
       [1.835e+03, 5.361e+03, 3.000e+00, 3.000e+00, 2.300e+01, 0.000e+00,
        0.000e+00, 4.000e+00],
       [1.244e+03, 1.672e+03, 0.000e+00, 1.200e+01, 2.100e+01, 4.000e+00,
        3.000e+00, 4.000e+00],
       [3.690e+02, 3.970e+02, 2.000e+00, 3.000e+00, 1.000e+01, 4.000e+00,
        4.000e+00, 4.000e+00],
       [4.845e+03, 4.480e+03, 3.000e+00, 1.000e+00, 1.000e+01, 3.000e+00,
        3.000e+00, 3.000e+00],
       [1.236e+03, 5.214e+03, 2.000e+00, 9.000e+00, 3.000e+01, 0.000e+00,
        0.000e+00, 2.000e+00],
       [1.452e+03, 1.505e+03, 2.000e+00, 6.000e+00, 1.0

In [70]:
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder(handle_unknown='ignore')
train_set_cat_1hot = cat_encoder.fit_transform(train_set_cat)
train_set_cat_1hot

<8937x10805 sparse matrix of type '<class 'numpy.float64'>'
	with 71496 stored elements in Compressed Sparse Row format>

In [71]:
train_set_cat_1hot.toarray()

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [43]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

train_set_num_tr = num_pipeline.fit_transform(train_set_num)

In [44]:
cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="constant",fill_value='Unknown')),
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
    ])

In [45]:
from sklearn.compose import ColumnTransformer

num_attribs1 = train_set_num.select_dtypes(include="object").columns
cat_attribs = train_set_cat.select_dtypes(include="object").columns


full_pipeline = ColumnTransformer([
        ("num1", num_pipeline, num_attribs1),
        ("cat", cat_pipeline, cat_attribs),
    ])

train_set_prepared = full_pipeline.fit_transform(train_set)

train_set_prepared

<8937x4 sparse matrix of type '<class 'numpy.float64'>'
	with 8937 stored elements in Compressed Sparse Row format>

## Training and model selection

### SGD Classifier

In [46]:
from sklearn.linear_model import SGDClassifier

#sgd_class = SGDClassifier(loss='modified_huber', random_state=42)
#sgd_class.fit(train_set_prepared, train_set[["item_purch"]].to_numpy().ravel())

#pickle.dump(sgd_class, open('models/sgd_model.sav', 'wb'))

### Decision Tree

In [47]:
from sklearn.tree import DecisionTreeClassifier

tree_class = DecisionTreeClassifier(random_state=42)
tree_class.fit(train_set_prepared, train_set[["item_purch"]].to_numpy().ravel())

pickle.dump(tree_class, open('models/tree_model.sav', 'wb'))

### K-Nearest Neighbour

In [48]:
from sklearn import neighbors

n_neighbors = 3
knn_class = neighbors.KNeighborsClassifier(n_neighbors)
knn_class.fit(train_set_prepared, train_set[["item_purch"]].to_numpy().ravel())

pickle.dump(knn_class, open('models/knn_model.sav', 'wb'))

### SVC

In [49]:
from sklearn import svm

#SVC_class = svm.SVC(kernel='linear', probability=True)
#SVC_class.fit(train_set_prepared, train_set[["item_purch"]].to_numpy().ravel())

#pickle.dump(SVC_class, open('models/svc_model.sav', 'wb'))

### Random Forest Classifier

In [50]:
from sklearn.ensemble import RandomForestClassifier

forest_class = RandomForestClassifier(n_estimators=20, random_state=42)
forest_class.fit(train_set_prepared, train_set[["item_purch"]].to_numpy().ravel())

pickle.dump(forest_class, open('models/random_model.sav', 'wb'))

## Model testing

#### Test each model using the mean reciprocal rank method. That is the one that is used in the RecSys Challenge competition. This will help us choose which model we are going to end up using. 

In [51]:
test_set_num = test_set[num_vars]
test_set_cat = test_set[cat_vars]

num_attribs2 = test_set_num.select_dtypes(include="object").columns
cat_attribs2 = test_set_cat.select_dtypes(include="object").columns


full_pipeline = ColumnTransformer([
        ("num1", num_pipeline, num_attribs2),
        ("cat", cat_pipeline, cat_attribs2),
    ])

test_set_prepared = full_pipeline.fit_transform(test_set)

In [52]:
def mean_reciprocal_rank(model, set_to_test, rr = 0, cont = 0):

    predictions = model.predict_proba(set_to_test)

    pred_df_test = pd.DataFrame(predictions)
    pred_df_test.columns = model.classes_

    pred_df_test["session_id"] = test_set.index
    pred_df_test = pred_df_test.merge(df[["item_purch"]], how='inner', on='session_id')
    pred_df_test = pred_df_test[["session_id", "item_purch"] + list(pred_df_test.columns[:-2])]

    for index, row in pred_df_test.iterrows():
        item_purch_act = int(row.iloc[1])
        row = row.iloc[2:]
        row_sorted = row.sort_values(ascending=False)
        items = row_sorted.index.to_list()
        if item_purch_act in items:
            rank = int(items.index(item_purch_act)) + 1
            if rank <= 100:
                rr += 1/rank
        cont += 1

    mrr = rr/cont
    print("The mean reciprocal rank for the " + str(model) + " is " + str(mrr))

In [53]:
SVC_class = pickle.load(open('models/svc_model.sav', 'rb'))
#mean_reciprocal_rank(sgd_class, test_set_prepared)
mean_reciprocal_rank(knn_class, test_set_prepared)
#mean_reciprocal_rank(SVC_class, test_set_prepared)
mean_reciprocal_rank(tree_class, test_set_prepared)
mean_reciprocal_rank(forest_class, test_set_prepared)

The mean reciprocal rank for the SGDClassifier(loss='modified_huber', random_state=42) is 0.001205949176443282
The mean reciprocal rank for the KNeighborsClassifier(n_neighbors=3) is 0.0022989210718511836
The mean reciprocal rank for the SVC(kernel='linear', probability=True) is 0.012714121903862513
The mean reciprocal rank for the DecisionTreeClassifier(random_state=42) is 0.011661608415718349
The mean reciprocal rank for the RandomForestClassifier(n_estimators=20, random_state=42) is 0.011327447878240757


Classifier | Number of Sessions | Mean Reciprocal Rank | Training Time |
| --- | --- | --- | --- |
SGD | 50000 | 0.001205949176443282 | 6s 900ms |
Decision Tree | 50000 | 0.011661608415718349 | 300ms |
KNN | 50000 | 0.0022989210718511836 | 200ms |
SVC | 50000 | 0.012714121903862513 | 2min 51s 800ms |
Random Forest | 50000 | 0.011327447878240757 | 100ms |

Based on the results we got for each classifier, we can see that the ones that got the best results were the Decision Tree, the Random Forest and the SVC classifier. We are going to have to rule out the SVC classifier, due to its long training time. Between the Decision Tree and the Random Forest classifier, since they have very similar MRR, we are going to choose the one with the shortest training time. 

In conclusion, the Random Forest is the classifier we will use.

## Generate TEST Submission

In [54]:
df_leader = pd.read_csv('test_leaderboard_sessions.csv')

In [55]:
leader_session_sorted = df_leader.sort_values(by=['date'])

# First date of session
leader_begin_df = leader_session_sorted.groupby('session_id')['date'].first().rename('start_date')

# First product of session
leader_first_prod=leader_session_sorted.groupby('session_id')['item_id'].first().rename('first_prod')

# Number of products seen in the session
leader_prod_count=leader_session_sorted.groupby('session_id')['item_id'].count().rename('prod_count')

# Last date of the session
leader_end_df = leader_session_sorted.groupby('session_id')['date'].last().rename('end_date')

# Last product of the session
leader_last_prod=leader_session_sorted.groupby('session_id')['item_id'].last().rename('last_prod')

# most common category in these session
leader_join_cat = leader_session_sorted.merge(features, on='item_id', how='left').sort_values(by=['feature_category_id'])
leader_common_cat = leader_join_cat.groupby(['session_id'])['feature_category_id'].value_counts().rename('most_common_cat')
leader_common_cat_df = leader_common_cat.to_frame(name = None).reset_index()
leader_common_cat = leader_common_cat_df.groupby(['session_id'])['feature_category_id'].first().rename('most_common_cat')

# counting number of unique categories for session
leader_count_num_unique_cat = leader_common_cat_df.groupby(['session_id'])['feature_category_id'].count().rename('count_num_unique_cat')


# Concat the datasets
leader_times_df = pd.concat([leader_begin_df, leader_end_df, leader_first_prod, leader_last_prod, 
                      leader_prod_count, leader_common_cat, leader_count_num_unique_cat
                     ],axis=1)

# Proper time format
leader_times_df['start_date']=pd.to_datetime(leader_times_df['start_date'])
leader_times_df['end_date']=pd.to_datetime(leader_times_df['end_date'])

# Calculatind the date difference
leader_times_df['time_diff']=(leader_times_df.end_date-leader_times_df.start_date).astype('timedelta64[s]')

# Calculating the time per product
leader_times_df['time_per_prod']=leader_times_df['time_diff']/leader_times_df['prod_count']

# Adding the time of the day
mask=(leader_times_df.start_date.dt.hour>=0) & (leader_times_df.start_date.dt.hour<7)
leader_times_df.loc[mask,'time_first_prod']='madrugada'
mask=(leader_times_df.start_date.dt.hour>=7) & (leader_times_df.start_date.dt.hour<12)
leader_times_df.loc[mask,'time_first_prod']='día'
mask=(leader_times_df.start_date.dt.hour>=12) & (leader_times_df.start_date.dt.hour<18)
leader_times_df.loc[mask,'time_first_prod']='tarde'
mask=(leader_times_df.start_date.dt.hour>=18) & (leader_times_df.start_date.dt.hour<24)
leader_times_df.loc[mask,'time_first_prod']='noche'

In [56]:
leader_times_df

Unnamed: 0_level_0,start_date,end_date,first_prod,last_prod,prod_count,most_common_cat,count_num_unique_cat,time_diff,time_per_prod,time_first_prod
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
26,2021-06-16 09:53:54.158,2021-06-16 09:53:54.158,19185,19185,1,3,24,0.0,0.000000,día
200,2021-06-25 12:23:40.811,2021-06-25 12:24:50.692,17089,4758,4,30,19,69.0,17.250000,tarde
205,2021-06-11 00:28:07.058,2021-06-11 00:28:07.058,8194,8194,1,3,25,0.0,0.000000,madrugada
495,2021-06-14 22:13:06.741,2021-06-14 22:13:06.741,6853,6853,1,4,20,0.0,0.000000,noche
521,2021-06-19 13:50:03.090,2021-06-19 13:50:03.090,26471,26471,1,3,25,0.0,0.000000,tarde
...,...,...,...,...,...,...,...,...,...,...
4439446,2021-06-11 17:16:12.243,2021-06-11 17:17:11.565,20770,21396,3,4,19,59.0,19.666667,tarde
4439458,2021-06-09 04:52:02.785,2021-06-09 04:54:45.378,11715,26499,2,4,29,162.0,81.000000,madrugada
4439550,2021-06-02 17:42:40.481,2021-06-02 17:42:40.481,19086,19086,1,28,18,0.0,0.000000,tarde
4439653,2021-06-11 10:19:35.472,2021-06-11 10:23:00.663,4813,12179,10,4,30,205.0,20.500000,día


In [57]:
df_sess_clust_leader=df_leader.merge(item_clust, left_on='item_id', right_index=True)
df_sess_clust_leader

Unnamed: 0,session_id,item_id,date,Cluster_pred
0,26,19185,2021-06-16 09:53:54.158,0
13240,239993,19185,2021-06-21 11:22:20.096,0
34682,661695,19185,2021-06-27 09:31:21.179,0
36264,699226,19185,2021-06-07 09:58:16.167,0
42810,830528,19185,2021-06-14 15:26:44.282,0
...,...,...,...,...
226231,4382253,7523,2021-06-18 14:05:33.111,4
227011,4397116,6327,2021-06-23 21:59:23.999,2
227515,4408745,10278,2021-06-06 11:23:56.511,3
228684,4428903,27394,2021-06-16 17:03:01.114,0


In [58]:
df_sess_clust

Unnamed: 0,session_id,item_id,date,Cluster_pred
0,3,9655,2020-12-18 21:25:00.373,3
1,3,9655,2020-12-18 21:19:48.093,3
103605,97085,9655,2020-12-27 20:22:13.011,3
162423,152155,9655,2020-11-19 16:26:18.468,3
225780,210806,9655,2021-01-02 05:37:37.183,3
...,...,...,...,...
4737630,4434058,3416,2020-11-14 23:33:46.655,0
4737631,4434058,15964,2020-11-14 23:33:58.652,0
4739963,4436170,27863,2020-07-20 06:36:30.611,1
4741053,4437232,7735,2020-01-02 20:07:06.367,0


In [59]:
df_clust_count_leader = df_sess_clust_leader.groupby(['session_id', 'Cluster_pred'])['session_id'].aggregate('count').unstack().fillna(0)

In [60]:
df_clust_count_leader["most_seen_cluster"] = df_clust_count_leader.idxmax(axis=1)

In [61]:
df_leader=pd.concat([leader_times_df, df_clust_count_leader], axis=1)

In [62]:
df_leader = df_leader.reset_index().merge(item_clust, how='left', left_on='first_prod', right_on=item_clust.index).set_index('session_id')
df_leader = df_leader.rename(columns = {'Cluster_pred':'first_item_cluster'})
df_leader = df_leader.reset_index().merge(item_clust, how='left', left_on='last_prod', right_on=item_clust.index).set_index('session_id')
df_leader = df_leader.rename(columns = {'Cluster_pred':'last_item_cluster'})

In [63]:
from sklearn.compose import ColumnTransformer

leader_set_num = df_leader[num_vars]
leader_set_cat = df_leader[cat_vars]

num_attribs_leader = leader_set_num.select_dtypes(include="object").columns
cat_attribs_leader = leader_set_cat.select_dtypes(include="object").columns


full_pipeline = ColumnTransformer([
        ("num1", num_pipeline, num_attribs_leader),
        ("cat", cat_pipeline, cat_attribs_leader),
    ])

leader_set_prepared = full_pipeline.fit_transform(df_leader)

leader_set_prepared

<50000x4 sparse matrix of type '<class 'numpy.float64'>'
	with 50000 stored elements in Compressed Sparse Row format>

In [65]:
predictions = forest_class.predict_proba(leader_set_prepared)
#print(predictions)

In [66]:
pred_df = pd.DataFrame(predictions)
pred_df.columns = forest_class.classes_
pred_df

NameError: name 'predictions' is not defined

In [None]:
candidate_items_df = pd.read_csv("candidate_items.csv")
columns = candidate_items_df["item_id"].to_list()
real_columns = [b for b in pred_df.columns if b in columns]

In [None]:
pred_df = pred_df[real_columns]
pred_df["session_id"] = df_leader.index
pred_df = pred_df[["session_id"] + list(pred_df.columns[:-1])]
pred_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_df["session_id"] = df_leader.index


Unnamed: 0,session_id,4,8,19,20,26,33,40,54,64,...,28079,28093,28101,28112,28122,28128,28131,28132,28133,28137
0,26,0.000036,0.000000,0.00003,0.000000,0.000112,0.000174,0.000000,0.000018,0.000122,...,0.00000,0.000084,0.000074,0.000214,0.000030,0.000096,0.000000,0.000066,0.000457,0.000026
1,200,0.000000,0.000035,0.00000,0.000033,0.000131,0.000186,0.000035,0.000025,0.000045,...,0.00004,0.000000,0.000100,0.000061,0.000154,0.000070,0.000061,0.000032,0.000327,0.000000
2,205,0.000144,0.000000,0.00000,0.000112,0.000219,0.000000,0.000000,0.000138,0.000000,...,0.00000,0.000157,0.000000,0.000000,0.000000,0.000000,0.000000,0.000187,0.000621,0.000113
3,495,0.000000,0.000000,0.00000,0.000102,0.000062,0.000177,0.000034,0.000000,0.000081,...,0.00000,0.000000,0.000068,0.000103,0.000032,0.000128,0.000038,0.000068,0.000591,0.000124
4,521,0.000000,0.000035,0.00000,0.000033,0.000131,0.000186,0.000035,0.000025,0.000045,...,0.00004,0.000000,0.000100,0.000061,0.000154,0.000070,0.000061,0.000032,0.000327,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,4439446,0.000000,0.000035,0.00000,0.000033,0.000131,0.000186,0.000035,0.000025,0.000045,...,0.00004,0.000000,0.000100,0.000061,0.000154,0.000070,0.000061,0.000032,0.000327,0.000000
49996,4439458,0.000144,0.000000,0.00000,0.000112,0.000219,0.000000,0.000000,0.000138,0.000000,...,0.00000,0.000157,0.000000,0.000000,0.000000,0.000000,0.000000,0.000187,0.000621,0.000113
49997,4439550,0.000000,0.000035,0.00000,0.000033,0.000131,0.000186,0.000035,0.000025,0.000045,...,0.00004,0.000000,0.000100,0.000061,0.000154,0.000070,0.000061,0.000032,0.000327,0.000000
49998,4439653,0.000036,0.000000,0.00003,0.000000,0.000112,0.000174,0.000000,0.000018,0.000122,...,0.00000,0.000084,0.000074,0.000214,0.000030,0.000096,0.000000,0.000066,0.000457,0.000026


In [None]:
f = open('final_submission2.csv', 'w')
f.write("session_id,item_id,rank" + '\n')

for index, row in pred_df.iterrows():
    #print(index,row)
    session_id_act = int(row.iloc[0])
    row = row.iloc[1:]
    row_sorted = row.sort_values(ascending=False)
    items = row_sorted.index.to_list()
    #print(items[0])
    #break
    for i in range(1,101):
        f.write(str(int(session_id_act)) + ',' + str(items[i-1]) + ',' + str(i) + '\n')