In [364]:
import pandas as pd 
import numpy as np 
import datetime as dt 

from scipy import sparse 
from sklearn import preprocessing
!pip install lightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k
from lightfm import LightFM


from sklearn.model_selection import GridSearchCV



In [382]:
tran_df = pd.read_csv('../archive/Processed Data/processed_transactions_data.csv')
cust_df = pd.read_csv('../archive/Processed Data/processed_customer_data.csv')
prod_df = pd.read_csv('../archive/Processed Data/processed_product_data.csv')
rfm = pd.read_csv('../archive/Processed Data/cluster_data.csv')


In [383]:
tran_df

Unnamed: 0,Transaction ID,Customer ID,Transaction Date,Prod Subcat Code,Prod Cat Code,Qty,Rate,Tax,Total Amt,Store Type,Net_Sales
0,29258453508,270384,2014-02-20,5,3,5,1497,785.925,8270.925,e-Shop,7485
1,25455265351,267750,2014-02-20,12,6,3,1360,428.400,4508.400,e-Shop,4080
2,1571002198,275023,2014-02-20,6,5,4,587,246.540,2594.540,e-Shop,2348
3,36554696014,269345,2014-02-20,3,5,3,1253,394.695,4153.695,e-Shop,3759
4,56814940239,268799,2014-02-20,7,5,5,368,193.200,2033.200,e-Shop,1840
...,...,...,...,...,...,...,...,...,...,...,...
20871,94340757522,274550,2011-01-25,12,5,1,1264,132.720,1396.720,e-Shop,1264
20872,89780862956,270022,2011-01-25,4,1,1,677,71.085,748.085,e-Shop,677
20873,85115299378,271020,2011-01-25,2,6,4,1052,441.840,4649.840,MBR,4208
20874,72870271171,270911,2011-01-25,11,5,3,1142,359.730,3785.730,TeleShop,3426


In [384]:
prod_df.head()

Unnamed: 0,Prod Cat Code,Prod Cat,Prod Subcat Code,Prod Subcat,Product Info,Product Code
0,1,Clothing,4,Mens,Clothing_Mens,1_4
1,1,Clothing,1,Women,Clothing_Women,1_1
2,1,Clothing,3,Kids,Clothing_Kids,1_3
3,2,Footwear,1,Mens,Footwear_Mens,2_1
4,2,Footwear,3,Women,Footwear_Women,2_3


In [368]:
cust_df.head()

Unnamed: 0,Customer ID,DOB,Gender,City Code
0,268408,02-01-1970,M,4.0
1,269696,07-01-1970,F,8.0
2,268159,08-01-1970,F,8.0
3,270181,10-01-1970,F,2.0
4,268073,11-01-1970,M,1.0


In [386]:
# Extract Age from DOB 

# Convert the DOB column to datetime
cust_df['DOB'] = pd.to_datetime(cust_df['DOB'])
Now = dt.datetime(2014,2,20)

# Calculate the age
cust_df['Age'] = (Now - cust_df['DOB']).astype('<m8[Y]')

cust_df['Age'] = cust_df['Age'].astype(int)

In [387]:
cust_df

Unnamed: 0,Customer ID,DOB,Gender,City Code,Age
0,268408,1970-02-01,M,4.0,44
1,269696,1970-07-01,F,8.0,43
2,268159,1970-08-01,F,8.0,43
3,270181,1970-10-01,F,2.0,43
4,268073,1970-11-01,M,1.0,43
...,...,...,...,...,...
5642,274474,1992-12-19,M,2.0,21
5643,267666,1992-12-24,M,6.0,21
5644,270476,1992-12-25,F,3.0,21
5645,269626,1992-12-27,F,5.0,21


In [388]:
print(cust_df['Age'].max())
print(cust_df['Age'].min())

44
21


In [389]:
# Define the bin edges
bin_edges = [0, 25, 30, 35, 40, 50]

# Create the bin labels
bin_labels = ['<25', '25-30', '30-35', '35-40', '>40']

# Bin the age column
cust_df['Age'] = pd.cut(cust_df['Age'], bin_edges, labels=bin_labels)

In [390]:
cust_df['Age'].value_counts() 

30-35    1247
35-40    1228
<25      1213
25-30    1210
>40       749
Name: Age, dtype: int64

## i) Merge Cluster, Customer Features and Product Features
1. Merge cluster from RFM data to transactional data
2. Merge Gender, City from Customer data to transactional data
3. Merge Product info from Customer data to transactional data (No common feature available so, first merge prod cat code, prod subcat code in transactional data then merge from product data) 

In [391]:
rfm.set_index('Customer ID', inplace =True) 

In [400]:
rm_df = rfm.copy() 

In [401]:
# merge cluster
rm_df2 = tran_df.merge(rm_df['Cluster_name'], how = 'left', on = 'Customer ID')


#merge customer's features [Gender and City]
#rm_df2 = cust_df.merge(rm_df2, how='left', on='Customer Id')

rm_df2 = rm_df2.merge(cust_df[['Customer ID','City Code', 'Gender', 'Age']], how='left', on = 'Customer ID')




In [402]:
rm_df2.head()

Unnamed: 0,Transaction ID,Customer ID,Transaction Date,Prod Subcat Code,Prod Cat Code,Qty,Rate,Tax,Total Amt,Store Type,Net_Sales,Cluster_name,City Code,Gender,Age
0,29258453508,270384,2014-02-20,5,3,5,1497,785.925,8270.925,e-Shop,7485,Champions,8.0,F,35-40
1,25455265351,267750,2014-02-20,12,6,3,1360,428.4,4508.4,e-Shop,4080,Champions,1.0,M,25-30
2,1571002198,275023,2014-02-20,6,5,4,587,246.54,2594.54,e-Shop,2348,Champions,6.0,M,>40
3,36554696014,269345,2014-02-20,3,5,3,1253,394.695,4153.695,e-Shop,3759,Champions,10.0,F,>40
4,56814940239,268799,2014-02-20,7,5,5,368,193.2,2033.2,e-Shop,1840,Champions,9.0,M,30-35


In [403]:
#merge 'Prod Cat Code' and 'Prod Subcat Code' of transactional data
rm_df2['Product Code'] = rm_df2['Prod Cat Code'].astype(str) + '_'+ rm_df2['Prod Subcat Code'].astype(str)



In [404]:
#merge product features [Product Info]
rm_df2 = rm_df2.merge(prod_df[['Product Code','Product Info']], how='left', on = 'Product Code')



In [405]:
# check the unique items of each feature
for i in rm_df2.columns: 
  print(i, ':', rm_df2[i].nunique()) 

Transaction ID : 20876
Customer ID : 5506
Transaction Date : 1123
Prod Subcat Code : 12
Prod Cat Code : 6
Qty : 5
Rate : 1431
Tax : 4194
Total Amt : 4194
Store Type : 4
Net_Sales : 4194
Cluster_name : 5
City Code : 10
Gender : 2
Age : 5
Product Code : 23
Product Info : 23


In [397]:
rm_df2.isnull().sum()

Transaction ID      0
Customer ID         0
Transaction Date    0
Prod Subcat Code    0
Prod Cat Code       0
Qty                 0
Rate                0
Tax                 0
Total Amt           0
Store Type          0
Net_Sales           0
Cluster_name        0
Product Code        0
Product Info        0
dtype: int64

In [406]:
# Now we will remove the missing elements. We didn't remove before because removing at the very beginning could loss some data while we merge them. 

rm_df2 = rm_df2.dropna()

## ii) Interaction Matrix
Getting the customer's transaction for every material in terms of sales quantity

In [407]:
#Keep only essential columns
final_rm_df = rm_df2.copy() 
final_rm_df = final_rm_df[['Product Info','Product Code','Store Type', 'Qty', 'Customer ID', 'Gender', 'Age', 'City Code', 'Cluster_name']]

In [295]:
#merge product info and store type for differenciating materials
#final_rm_df['Material'] = final_rm_df['Product Info'].astype(str) + '_'+ final_rm_df['Store Type'].astype(str)



In [408]:
#Interacion Matrix

interactions = final_rm_df.groupby(['Customer ID', 'Product Info'])['Qty'].sum().unstack()  

In [409]:
interactions = interactions.fillna(0) 

In [410]:
#feature transformation (Min-max-scaler)
minmaxscaler = preprocessing.MinMaxScaler() 
interactions_scaled = minmaxscaler.fit_transform(interactions) 
interactions_scaled = pd.DataFrame(interactions_scaled) 

In [411]:
## Re-append the customer ID and the material into the DataFrame's index and column respectively
interactions_scaled.index = interactions.index 
interactions_scaled.columns = interactions.columns 

In [412]:
interactions_scaled 

Product Info,Bags_Mens,Bags_Women,Books_Academic,Books_Children,Books_Comics,Books_DIY,Books_Fiction,Books_Non-Fiction,Clothing_Kids,Clothing_Mens,...,Electronics_Computers,Electronics_Mobiles,Electronics_Personal Appliances,Footwear_Kids,Footwear_Mens,Footwear_Women,Home and kitchen_Bath,Home and kitchen_Furnishing,Home and kitchen_Kitchen,Home and kitchen_Tools
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
266783,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.2,0.000000,0.333333,...,0.0,0.000000,0.000000,0.0,0.333333,0.000000,0.0,0.0,0.000000,0.000000
266784,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.384615,0.3,0.000000,0.000000,...,0.0,0.166667,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000
266785,0.3,0.000000,0.0,0.454545,0.000000,0.0,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.7,0.666667,0.000000,0.0,0.0,0.416667,0.000000
266788,0.2,0.000000,0.0,0.000000,0.000000,0.0,0.076923,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.4,0.083333,0.000000,0.0,0.0,0.000000,0.000000
266794,0.2,0.272727,0.4,0.272727,0.000000,0.0,0.000000,0.0,0.583333,0.000000,...,0.0,0.000000,0.363636,0.0,0.000000,0.416667,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275257,0.0,0.000000,0.0,0.272727,0.000000,0.0,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.416667,0.000000,0.0,0.3,0.000000,0.000000
275261,0.0,0.090909,0.0,0.454545,0.272727,0.0,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000
275262,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.250000,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000
275264,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.5,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.333333


## iii) User Features Matrix
There are three features merged into building the user features matrix
1. Customer's unique categories they buy 
2. Customer info (Gender, City Code) 
3. Cluster they belong to  

In [413]:
# User Features Matrix 

# Getting the unique categories they buy, grouped them by the quantity per category
cust_qty = final_rm_df.groupby(['Customer ID', 'Product Info'])['Qty'].sum().unstack().fillna(0)



In [414]:
# Scale with minmaxscaler in the same way as previous
minmaxscaler = preprocessing.MinMaxScaler()
cust_qty_scaled = minmaxscaler.fit_transform(cust_qty)
cust_qty_scaled = pd.DataFrame(cust_qty_scaled)
cust_qty_scaled.index = cust_qty.index
cust_qty_scaled.columns = cust_qty.columns 

In [415]:
final_rm_df['City Code'].unique()

array([ 8.,  1.,  6., 10.,  9.,  2.,  7.,  5.,  4.,  3.])

In [416]:
#Getting customer cluster 
cust_clus = final_rm_df.groupby(['Customer ID', 'Cluster_name'])['Cluster_name'].nunique().unstack().fillna(0)

#Getting customer Gender
cust_gender = final_rm_df.groupby(['Customer ID', 'Gender'])['Gender'].nunique().unstack().fillna(0)

#Getting customer Age
cust_age = final_rm_df.groupby(['Customer ID', 'Age'])['Age'].nunique().unstack().fillna(0)


#Getting customer city code
cust_city = final_rm_df.groupby(['Customer ID', 'City Code'])['City Code'].nunique().unstack().fillna(0)

print(cust_gender.shape)
print(cust_age.shape)
print(cust_clus.shape)
print(cust_city.shape)

(5502, 2)
(5502, 5)
(5502, 5)
(5502, 10)


In [417]:
#merge cluster dataframe with gender dataframe
cluster_gender = pd.merge(cust_clus, cust_gender, left_index=True, right_index=True, how='inner')
print(cluster_gender)


#merge cluster_gender dataframe with city dataframe
cluster_gender_city = pd.merge(cluster_gender, cust_city, left_index=True, right_index=True, how='inner')
print(cluster_gender_city)


#merge cluster_gender dataframe with city dataframe
cluster_gender_city_age = pd.merge(cluster_gender_city, cust_age, left_index=True, right_index=True, how='inner')
print(cluster_gender_city_age)

             About to Sleep  Champions  Hibernating  Loyal Customer  \
Customer ID                                                           
266783                  0.0        0.0          0.0             1.0   
266784                  1.0        0.0          0.0             0.0   
266785                  0.0        1.0          0.0             0.0   
266788                  1.0        0.0          0.0             0.0   
266794                  0.0        1.0          0.0             0.0   
...                     ...        ...          ...             ...   
275257                  0.0        0.0          0.0             0.0   
275261                  0.0        0.0          0.0             1.0   
275262                  0.0        0.0          1.0             0.0   
275264                  0.0        0.0          1.0             0.0   
275265                  0.0        0.0          1.0             0.0   

             Potential Loyalist    F    M  
Customer ID                     

In [418]:
# Merge the two features to create the user feature matrix
#customer_features = pd.merge(cust_qty_scaled, cust_gender, left_index=True, right_index=True, how='inner')
#print(customer_features)
customer_features = pd.merge(cust_qty_scaled, cluster_gender_city_age, left_index=True, right_index=True, how='inner')
print(customer_features)

             Bags_Mens  Bags_Women  Books_Academic  Books_Children  \
Customer ID                                                          
266783             0.0    0.000000             0.0        0.000000   
266784             0.0    0.000000             0.0        0.000000   
266785             0.3    0.000000             0.0        0.454545   
266788             0.2    0.000000             0.0        0.000000   
266794             0.2    0.272727             0.4        0.272727   
...                ...         ...             ...             ...   
275257             0.0    0.000000             0.0        0.272727   
275261             0.0    0.090909             0.0        0.454545   
275262             0.0    0.000000             0.0        0.000000   
275264             0.0    0.000000             0.0        0.000000   
275265             0.1    0.000000             0.3        0.000000   

             Books_Comics  Books_DIY  Books_Fiction  Books_Non-Fiction  \
Customer ID    

In [307]:
customer_features 

Unnamed: 0_level_0,Bags_Mens,Bags_Women,Books_Academic,Books_Children,Books_Comics,Books_DIY,Books_Fiction,Books_Non-Fiction,Clothing_Kids,Clothing_Mens,...,6.0,7.0,8.0,9.0,10.0,<25,25-30,30-35,35-40,>40
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
266783,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.2,0.000000,0.333333,...,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0
266784,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.384615,0.3,0.000000,0.000000,...,0.0,0.0,0.0,0.0,1.0,1,0,0,0,0
266785,0.3,0.000000,0.0,0.454545,0.000000,0.0,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0
266788,0.2,0.000000,0.0,0.000000,0.000000,0.0,0.076923,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1
266794,0.2,0.272727,0.4,0.272727,0.000000,0.0,0.000000,0.0,0.583333,0.000000,...,0.0,0.0,0.0,1.0,0.0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275257,0.0,0.000000,0.0,0.272727,0.000000,0.0,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0
275261,0.0,0.090909,0.0,0.454545,0.272727,0.0,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,1.0,0,0,0,1,0
275262,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.250000,...,0.0,0.0,0.0,0.0,1.0,0,0,0,1,0
275264,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.5,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0


## iv) Item Feature Matrix
We use the different categories of the items as the feature to build the item feature matrix

In [424]:
### Item Features Matrix

item_category = pd.DataFrame(final_rm_df.groupby(['Product Info', 
                                              'Product Code'])['Qty'].sum().unstack().fillna(0).reset_index().set_index('Product Info'))

# again the minmax scaling
minmaxscaler = preprocessing.MinMaxScaler()
item_category_scaled = minmaxscaler.fit_transform(item_category)
item_category_scaled = pd.DataFrame(item_category_scaled)
item_category_scaled.index = item_category.index 
item_category_scaled.columns = item_category.columns 

In [425]:
item_category_scaled

Product Code,1_1,1_3,1_4,2_1,2_3,2_4,3_10,3_4,3_5,3_8,...,5_10,5_11,5_12,5_3,5_6,5_7,6_10,6_11,6_12,6_2
Product Info,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bags_Mens,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bags_Women,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Books_Academic,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Books_Children,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Books_Comics,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
Books_DIY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
Books_Fiction,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
Books_Non-Fiction,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Clothing_Kids,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Clothing_Mens,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## v) Sparse matrix tranformation and train_test_splitting

In [426]:
### We have three features as required by LightFM 
## Create COOMatrices using Scipy function 
## COOMatrices are sparse matrices, mostly filled with zero
## It speeds up the processes and saves a lot of memory

interaction_f = sparse.coo_matrix(interactions_scaled)   
user_features  = sparse.coo_matrix(customer_features) 
item_features  = sparse.coo_matrix(item_category_scaled) 



In [427]:
### Jesse Steinweg-Woods provide a great elaboration on Recommendation metrics
### The following code is borrowed from his work
### https://jessesw.com/Rec-System/

def interaction_masking(interactions):
    '''
    This function will "mask" (a.k.a "hide") 20% of original interactions
    Masked items wil be considered not purchased
    '''
    mask_size = len(interactions.data)
    mask = np.random.choice(a=[False, True], size=mask_size, p=[.2, .8])
    not_mask = np.invert(mask)
    
    train_interactions = sparse.coo_matrix((interactions.data[mask],
                                        (interactions.row[mask],
                                         interactions.col[mask])),
                                       shape=interactions.shape)

    test_interactions = sparse.coo_matrix((interactions.data[not_mask], 
                                       (interactions.row[not_mask], 
                                        interactions.col[not_mask])), 
                                      shape=interactions.shape)

    return train_interactions, test_interactions

In [428]:
mask_size = len(interaction_f.data)

### use numpy.random.choice to generate a random True/False array, where False denotes the "masked" interactions (hidden)
## Ref: https://docs.scipy.org/doc/numpy-1.15.0/reference/generated/numpy.random.choice.html
np.random.choice(a=[False, True], 
                 size=mask_size, 
                 p=[.2, .8]) 

array([False,  True,  True, ...,  True,  True,  True])

In [429]:
# Create a masked train and test dataset of the interaction features
train_interactions, test_interactions = interaction_masking(interaction_f)



## vi) LightFM Model
Here's some references we've explored, also you can explore: 

1] https://making.lyst.com/lightfm/docs/home.html# 

2] https://www.ethanrosenthal.com/2016/11/07/implicit-mf-part-2/ 

3] https://www.kaggle.com/niyamatalmass/lightfm-hybrid-recommendation-system

In [430]:
#define auc score
def calculate_auc_score(lightfm_model, interactions_matrix, 
                        item_features, product_features): 
    """ 
    Measure the ROC AUC metric for a model running the auc_score function of LightFM algorithm. 

    Returns
    -------
    String containing AUC score [perfect score: 1.0]
    """
    score = auc_score( 
        lightfm_model, interactions_matrix, 
        item_features=item_features, 
        user_features=product_features, 
        num_threads=8).mean() #num_threads = number of available core
    return score

### Tuning Process of the model
we have applied a very naive approach with the for loops for the tuning process. 

(N.B: For the working load we run the tuning snippet on local environment. the best parameter found: **(learning rate: 0.01, no_components: 1000, epochs:300))**

In [315]:
# #tuning process
# train_auc = []
# test_auc = []
# parameters = []
# loss = 'warp'
# learning_rate = [0.1, 0.01, 0.001, 0.0001, 0.5, 1, 5]
# no_components = [10, 50,100,150,200,300,500,1000]
# epochs = [50,100,200,300,500,1000]
# loss = ['logistic', 'bpr', 'warp', 'warp-kos']


# # param_grid = {'C': [0.1, 1, 10],
# #               'kernel': ['linear', 'rbf']}



# for i in learning_rate:
#     for j in no_components:
#         for k in loss:
#             model = LightFM(loss=k, random_state = 42,
#                         learning_rate = i, no_components = j)

#     for l in epochs: 
#         model = model.fit(train_interactions, 
#                   item_features = item_features, 
#                   user_features = user_features,  
#                   epochs=l, 
#                   num_threads= 8, verbose=True) 

#         train_auc.append(calculate_auc_score(model, train_interactions, item_features, user_features))
#         test_auc.append(calculate_auc_score(model, test_interactions, item_features, user_features))
#         parameters.append('learning rate: '+str(i)+', no_components: '+str(j)+ ', epochs:'+ str(l)+ ', loss:'+str(k) ) 

In [316]:
max_index = np.argmax(test_auc) # retunrs the maximum score index
parameters[max_index]

'learning rate: 0.01, no_components: 1000, epochs:300'

### Run and Fit the model with the tuned parameters

In [431]:
model = LightFM(loss='warp',
                random_state= 42,
                learning_rate=0.01,
                no_components=1000,
                )

model = model.fit(train_interactions,
                  item_features = item_features, 
                  user_features = user_features, 
                  epochs=300,
                  num_threads= 8, verbose=True)

Epoch: 100%|██████████████████████████████████| 300/300 [00:49<00:00,  6.06it/s]


In [432]:
print(calculate_auc_score(model, train_interactions, item_features, user_features)) 
print(calculate_auc_score(model, test_interactions, item_features, user_features)) 

0.9836795
0.9370837


In [444]:
#Encode the material for getting a product id  
final_rm_df['Product ID'] = final_rm_df['Product Info'].factorize()[0] 

In [445]:
final_rm_df


Unnamed: 0,Product Info,Product Code,Store Type,Qty,Customer ID,Gender,Age,City Code,Cluster_name,Product ID
0,Electronics_Computers,3_5,e-Shop,5,270384,F,35-40,8.0,Champions,0
1,Home and kitchen_Tools,6_12,e-Shop,3,267750,M,25-30,1.0,Champions,1
2,Books_DIY,5_6,e-Shop,4,275023,M,>40,6.0,Champions,2
3,Books_Comics,5_3,e-Shop,3,269345,F,>40,10.0,Champions,3
4,Books_Fiction,5_7,e-Shop,5,268799,M,30-35,9.0,Champions,4
...,...,...,...,...,...,...,...,...,...,...
20871,Books_Academic,5_12,e-Shop,1,274550,M,>40,7.0,Loyal Customer,5
20872,Clothing_Mens,1_4,e-Shop,1,270022,M,25-30,9.0,Hibernating,14
20873,Home and kitchen_Furnishing,6_2,MBR,4,271020,M,35-40,8.0,Loyal Customer,18
20874,Books_Children,5_11,TeleShop,3,270911,M,>40,2.0,Potential Loyalist,9


In [439]:
final_rm_df['Product Info'].nunique()

23

 # Let's see the outcome 

In [451]:
def prediction(customer_ids):
    for customer in customer_ids:
    
        print('Customer ID: (', customer, ''') \n 
                    Previous Purchase''' )
        print('=====================================================')
        
        final_rm_df2 = final_rm_df.copy()
        
        #print the previous purchase
        print(final_rm_df2[final_rm_df2['Customer ID'] == customer][['Product Info', 'Qty']].sort_values(by = 'Qty', ascending = False).to_string(index = False))

        #find the item_id of already purchased item. We don't want to recommend the purchased product.
        discard_item_id = final_rm_df2[final_rm_df2['Customer ID'] == customer]['Product ID'].values.tolist() 

        #Consider only needed column for the predicted_prod dataframe
        predicted_prod = final_rm_df2[['Product ID', 'Product Info']]  

        #remove the already purchased product from the dataframe
        predicted_prod = predicted_prod[~predicted_prod['Product Info'].isin(discard_item_id)]  

        #drop the duplicates item and reset the index as we don't need customer ID here
        predicted_prod = predicted_prod.drop_duplicates().reset_index().drop('index', axis=1)  


        #take the material_ids as a list
        item_ids = predicted_prod['Product ID'].unique().tolist() 
        
        #print(len(item_ids))

        #reset the index because we need to call the index through the customer ID and for the model we need the index number start from 0 for defining customer ID and product ID.
        cust_features = customer_features.copy()  
        cust_features.reset_index(inplace = True) 

        #take the customer_id as well. it's the index number, not the customer ID. (Both customerID and productID should start from 0 and should be unique as well)
        customer_id = int(cust_features[cust_features['Customer ID'] == 267912].reset_index()['index']) 

        #predict the score
        scores = model.predict(customer_id, item_ids, item_features=item_features, 
                               user_features = user_features) 



        #merge the prediction scores in the predicted_prod dataframe
        predicted_prod['scores'] = scores 

        #sort values descendingly and take only top 8
        predicted_prod = predicted_prod.sort_values(by = 'scores',ascending = False)[:8] 
        print('\n')
        print('               Recommended Products(Top 8)' ) 
        print('======================================================')
        print(predicted_prod[['Product ID', 'Product Info']].to_string(index = False)) 
        print('\n')

   

In [449]:
prediction([267815,266783])  

Customer ID: ( 267815 ) 
 
                    Previous Purchase
            Product Info  Qty
Home and kitchen_Kitchen    4
               Bags_Mens    4
               Books_DIY    1


               Recommended Products(Top 8)
 Product ID                Product Info
          0       Electronics_Computers
         21           Books_Non-Fiction
          5              Books_Academic
          8               Clothing_Kids
          4               Books_Fiction
         13              Footwear_Women
          3                Books_Comics
         12 Electronics_Audio and video


Customer ID: ( 266783 ) 
 
                    Previous Purchase
     Product Info  Qty
    Footwear_Mens    4
    Clothing_Mens    3
Books_Non-Fiction    2
    Clothing_Mens    1


               Recommended Products(Top 8)
 Product ID             Product Info
          9           Books_Children
         17      Electronics_Mobiles
          7            Footwear_Mens
          6               Bags_Wome

In [340]:
cust_df[~cust_df['Customer ID'].isin(tran_df['Customer ID'])]


Unnamed: 0,Customer ID,DOB,Gender,City Code,Age
25,267916,1970-02-17,F,8.0,>40
52,266969,1970-03-28,M,2.0,>40
102,273529,1970-06-19,F,1.0,>40
111,274892,1970-05-07,F,8.0,>40
155,272408,1970-09-13,M,6.0,>40
...,...,...,...,...,...
5476,270232,1992-04-22,M,10.0,<25
5493,268693,1992-05-23,M,6.0,<25
5533,266947,1992-07-17,M,5.0,<25
5540,270973,1992-07-24,F,5.0,<25


In [335]:
cust_df[cust_df['Customer ID'] == 267916]

Unnamed: 0,Customer ID,DOB,Gender,City Code,Age
25,267916,1970-02-17,F,8.0,>40


In [452]:
prediction([267916,266969])  

Customer ID: ( 267916 ) 
 
                    Previous Purchase
Empty DataFrame
Columns: [Product Info, Qty]
Index: []


               Recommended Products(Top 8)
 Product ID                    Product Info
         14                   Clothing_Mens
          6                      Bags_Women
         20 Electronics_Personal Appliances
         15        Home and kitchen_Kitchen
         18     Home and kitchen_Furnishing
         21               Books_Non-Fiction
          0           Electronics_Computers
          8                   Clothing_Kids


Customer ID: ( 266969 ) 
 
                    Previous Purchase
Empty DataFrame
Columns: [Product Info, Qty]
Index: []


               Recommended Products(Top 8)
 Product ID                    Product Info
         14                   Clothing_Mens
          6                      Bags_Women
         20 Electronics_Personal Appliances
         15        Home and kitchen_Kitchen
         18     Home and kitchen_Furnishing
        