## This experiment was performed using LightFM which a very popular recommender module and it has support to take in different data modalities such as text, image, graphical, etc

Hope you like this notebook, please feel free to vote for this notebook

### Importing Required Libraries

In [1]:
# Importing Libraries
import sys, os
import re
import pickle
import random
import pandas as pd
import numpy as np
import scipy.sparse as sparse
%matplotlib inline
import matplotlib.pyplot as plt

# lightfm 
from lightfm import LightFM # model
from lightfm.evaluation import auc_score
from lightfm.cross_validation import random_train_test_split

import time

In [2]:
os.environ["openblas_set_num_threads"] = "1"
data_path = r'../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv'

In [3]:
# Data Extraction
def create_data(datapath):
    df=pd.read_csv(datapath)
    return df

In [4]:
# Loal all sales data (for 3 years startign from 2018 to 2020)
transactions_data=create_data(data_path)
print(transactions_data.shape)

# # Unique Attributes
print(str(len(transactions_data['t_dat'].drop_duplicates())) + "-total No of unique transactions dates in data sheet")
print(str(len(transactions_data['customer_id'].drop_duplicates())) + "-total No of unique customers ids in data sheet")
print(str(len(transactions_data['article_id'].drop_duplicates())) + "-total No of unique article ids courses names in data sheet")
print(str(len(transactions_data['sales_channel_id'].drop_duplicates())) + "-total No of unique sales channels in data sheet")

(31788324, 5)
734-total No of unique transactions dates in data sheet
1362281-total No of unique customers ids in data sheet
104547-total No of unique article ids courses names in data sheet
2-total No of unique sales channels in data sheet


In [5]:
transactions_data.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [6]:
transactions_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31788324 entries, 0 to 31788323
Data columns (total 5 columns):
 #   Column            Dtype  
---  ------            -----  
 0   t_dat             object 
 1   customer_id       object 
 2   article_id        int64  
 3   price             float64
 4   sales_channel_id  int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 1.2+ GB


### Aggregating Customers and Articles irrespective of transaction dates

In [7]:
transactions_data = transactions_data.groupby(['customer_id','article_id']).agg({'price':'sum','t_dat':'count'}).reset_index()
transactions_data = transactions_data[['customer_id','article_id','price']]

### Generating user and article index mapping dictionaries

In [8]:
def get_customers_list():
    # Creating a list of users
    return np.sort(transactions_data['customer_id'].unique())

def get_articles_list():
    # Creating a list of courses 
    item_list = transactions_data['article_id'].unique()
    return item_list

def id_mappings(customers_list, articles_list):
    """
    
    Create id mappings to convert user_id, item_id, and feature_id
    
    """
    customer_to_index_mapping = {}
    index_to_customer_mapping = {}
    for customer_index, customer_id in enumerate(customers_list):
        customer_to_index_mapping[customer_id] = customer_index
        index_to_customer_mapping[customer_index] = customer_id
        
    article_to_index_mapping = {}
    index_to_article_mapping = {}
    for article_index, article_id in enumerate(articles_list):
        article_to_index_mapping[article_id] = article_index
        index_to_article_mapping[article_index] = article_id
        
    return customer_to_index_mapping, index_to_customer_mapping, \
           article_to_index_mapping, index_to_article_mapping

In [9]:
customers = get_customers_list()
articles = get_articles_list()

In [10]:
customers

array(['00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657',
       '0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa',
       '000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318',
       ...,
       'ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1778d0116cffd259264',
       'ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38b2236865d949d4df6a',
       'ffffd9ac14e89946416d80e791d064701994755c3ab686a1eaf3458c36f52241'],
      dtype=object)

In [11]:
articles

array([176209023, 568601006, 568601043, ..., 618853001, 533932004,
       568499001])

In [12]:
# Generate mapping, LightFM library can't read other than (integer) index
customer_to_index_mapping, index_to_customer_mapping, \
article_to_index_mapping, index_to_article_mapping = id_mappings(customers, articles)

### Generate Customer Article Interaction Matrix

In [13]:
def get_customer_article_interaction(customer_article_amt_df):
    #start indexing
    customer_article_amt_df["customer_id"] = customer_article_amt_df["customer_id"]
    customer_article_amt_df["article_id"] = customer_article_amt_df["article_id"]
    customer_article_amt_df["price"] = customer_article_amt_df["price"]

    # Preprocessing dataframe created
    customer_article_amt_df = customer_article_amt_df.rename(columns = {"price":"total_amount_spent"})

    # Replace Amount Column with category codes 
    customer_article_amt_df['total_amount_spent'] = customer_article_amt_df['total_amount_spent'].astype('category')
    customer_article_amt_df['total_amount_spent'] = customer_article_amt_df['total_amount_spent'].cat.codes

    return customer_article_amt_df

def get_interaction_matrix(df, df_column_as_row, df_column_as_col, 
                        df_column_as_value, row_indexing_map, col_indexing_map):
    
    row = df[df_column_as_row].apply(lambda x: row_indexing_map[x]).values
    col = df[df_column_as_col].apply(lambda x: col_indexing_map[x]).values
    value = df[df_column_as_value].values
    
    return sparse.coo_matrix((value, (row, col)), shape = (len(row_indexing_map), len(col_indexing_map)))


In [14]:
# Create customer and article interaction dataframe
customer_to_article = get_customer_article_interaction(customer_article_amt_df = transactions_data[['customer_id','article_id','price']])
print(customer_to_article.shape)                                                  

(27306439, 3)


In [15]:
customer_to_article.head()

Unnamed: 0,customer_id,article_id,total_amount_spent
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,176209023,8137
1,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,568601006,27464
2,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,568601043,12672
3,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,607642008,2052
4,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,625548001,10525


In [16]:
# Generate customer_article_interaction_matrix for train data
customer_to_article_interaction = get_interaction_matrix(customer_to_article, "customer_id", "article_id", "total_amount_spent", \
                                                    customer_to_index_mapping, article_to_index_mapping)

In [17]:
customer_to_article_interaction

<1362281x104547 sparse matrix of type '<class 'numpy.int32'>'
	with 27306439 stored elements in COOrdinate format>

### Light FM Model Training

In [18]:
#### FULL MODEL TRAINING ####
# Retraining the final model with full dataset
"""
Training model without any article or customer features
"""
final_model_without_feature = LightFM(loss = "warp")

# Fitting to combined dataset with pure collaborative filtering result
start = time.time() 
final_model_without_feature.fit(customer_to_article_interaction,
                                user_features=None, 
                                item_features=None, 
                                sample_weight=None, 
                                epochs=1, 
                                num_threads=4, 
                                verbose=False)
end = time.time()
print("time taken = {0:.{1}f} seconds".format(end - start, 2))

time taken = 23.34 seconds


### Recommendation sampling and Comparison with Known Positives

In [19]:
class recommendation_sampling():
    def __init__(self, model, items = None, user_to_product_interaction_matrix = None, 
                item_features = None, user2index_map = None):
        
        self.user_to_product_interaction_matrix = user_to_product_interaction_matrix
        self.item_features = item_features if item_features is not None else None
        self.model = model
        self.items = items
        self.user2index_map = user2index_map
    
    def recommendation_for_user(self, user, prediction_type = 'normal'):
        # Getting the userindex
        userindex = self.user2index_map.get(user, None)
        if userindex == None:
            print("User %s not provided during Training the model" %(user))
            return None

        users = [userindex]
        
        # Products already bought
        known_positives = self.items[self.user_to_product_interaction_matrix.tocsr()[userindex].indices]
        
        # Scores from model prediction
        scores = self.model.predict(user_ids = users, item_ids = np.arange(self.user_to_product_interaction_matrix.shape[1])) if prediction_type == 'normal' else \
            self.model.predict(user_ids = users, item_ids = np.arange(self.user_to_product_interaction_matrix.shape[1]), item_features = self.item_features)
    
        print(scores)
        
#         # Top items
#         top_items = self.items[np.argsort(-scores)]
        
#         # Printing out the result
#         print("User %s" % user)
#         print("     Known positives:")
#         for x in known_positives[:3]:
#             print("                  %s" % x)
            
#         print("     Recommended:")
#         for x in top_items[:3]:
#             print("                  %s" % x)

    def get_recommendation(self, user, prediction_type = 'normal'):
        # Getting the userindex
        userindex = self.user2index_map.get(user, None)
        if userindex == None:
            return None
        users = [userindex]
        
        # Products already bought
        known_positives = self.items[self.user_to_product_interaction_matrix.tocsr()[userindex].indices]
        
        # Scores from model prediction
        scores = self.model.predict(user_ids = users, item_ids = np.arange(self.user_to_product_interaction_matrix.shape[1])) if prediction_type == 'normal' else \
            self.model.predict(user_ids = users, item_ids = np.arange(self.user_to_product_interaction_matrix.shape[1]), item_features = self.item_features)
        
        # Top items
        top_items = self.items[np.argsort(-scores)]

        # Returning results
        recommended_list, recommender_count = [],1
        for item in top_items[:3]:
            recommended_list.append({'Priority': recommender_count,'Course/Event': item})
            recommender_count+=1
        return recommended_list

In [20]:
# Giving recommendations
recom_without_feature = recommendation_sampling(model = final_model_without_feature,
                                               items = articles,
                                               user_to_product_interaction_matrix = customer_to_article_interaction,
                                               user2index_map = customer_to_index_mapping)

In [21]:
recom_without_feature.recommendation_for_user('00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657')

TypeError: Invalid type passed to user_ids parameter. This must be either int or np.int32 array. Type received: <class 'list'>