## Collaborative Filtering for Implicit Feedback Datasets

Article for the math behind the code: http://yifanhu.net/PUB/cf.pdf

The number of interactions are calculated each time the user interacts with the item.
Based on the interactions sparse martics are generated for the user interaction 

#### User Item preference matrix:

We make a matrix which tells us which brand has been seen by the user. If the user has interacted with the product of a brand :

$$
p_{ui} =    \begin{cases}
    1, &amp; r_{ui} &gt; 0.\\
    0, &amp; r_{ui} = 0.
  \end{cases}
  $$
$r_{ui}$: user $u$ clicked(or other interaction) number of times on item $i$

$p_{ui}$: user $u$ consumed item $i$ $(r_{ui} &gt; 0)$, then we have an indication that $u$ likes $i$ $(p_{ui} = 1)$. On the other hand, if $u$ never consumed $i$, we believe no preference $(p_{ui} = 0)$.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import scipy.sparse as sparse
import implicit
import numpy as np
from operator import itemgetter
import time

In [2]:
class RecSys():
    def __init__(self):
        self.user_id = None
        self.user_dict = None
        self.item_dict = None
        self.interaction_sparse_matrix = None
        self.predicted_preference = None
        self.recommended_items = None
        self.users = None
        self.items = None
        self.agg_df = None
        self.interaction_index_to_replace = None
        self.predicted_df = None
        self.compare_df = None

        print('Preparing recommender engine...')

    def build_recsys(self, alpha = 40, factors = 100, regularization = 0.01, iterations = 10):
        self.alpha = alpha
        self.factors = factors
        self.regularization = regularization
        self.iterations = iterations

        # output parameters
        print('This Implicit Collaborative Filtering model is built using the following parameter values: ')
        print('alpha = ' + str(alpha))
        print('factors = ' + str(factors))
        print('regularization = ' + str(regularization))
        print('no of iterations = ' + str(iterations))

        # create a sparse coordinate matrix
        interaction_sparse_matrix = self.get_item_user_sparse_matrix()

        # create the training data
        train_matrix = self.create_training_set(interaction_sparse_matrix)

        # build item-user confidence matrix weighted by alpha
        item_user_confidence = alpha * train_matrix

        # create the ALS model using the Cython package 'implicit' which allows parallelisation
        als_model = implicit.als.AlternatingLeastSquares(factors=factors,
                                                         regularization=regularization,
                                                         iterations=iterations)

        # train the ALS model on the training data - confidence matrix
        # this step will initialise the user-factors and item-factors vectors
        als_model.fit(item_users=item_user_confidence)

        # build our predicted preference matrix
        user_factors = als_model.user_factors
        item_factors = als_model.item_factors
        predicted_preference = user_factors.dot(item_factors.T)

        # return the predicted preference matrix
        self.predicted_preference = predicted_preference
        print('The recommender engine is ready!')
        print('================================')

    def get_item_user_sparse_matrix(self, data='C:/Users/harini/Desktop/NEUCourses/ADM/Project/e_commerce_synthetic_data.csv'):
        '''
        :param data: path to csv file containing the data
        :return: item-user sparse matrix
        '''

        self.data = data

        # load the csv file
        df = pd.read_csv(filepath_or_buffer=data)

        # getting specific columns
        df = df[['user_id','brand', 'event_type']]

        # aggregate data so each row is user-item pair interaction
        agg_df = df.groupby(['user_id', 'brand']).count().reset_index()
        agg_df.rename(columns = {'event_type' :'user_item_interaction_count'}, inplace = True)
        agg_df = agg_df.sort_values('user_item_interaction_count', ascending=False)

        self.agg_df = agg_df

        ## map each item and user to a unique numeric value
        items = agg_df.brand.astype('category')
        users = agg_df.user_id.astype('category')

        self.items = items
        self.users = users

        ### create mapping dictionary
        item_dict = dict(enumerate(items.cat.categories))
        user_dict = dict(enumerate(users.cat.categories))

        self.item_dict = item_dict
        self.user_dict = user_dict

        ## map no of interaction to integer
        interactions = agg_df.user_item_interaction_count.astype('int')

        # build the item-user matrix
        ## since we are using the implicit library, the rows of the matrix will be items and columns will be users
        sparse_rows = items.cat.codes
        sparse_columns = users.cat.codes

        # create a sparse coordinate matrix
        interaction_sparse_matrix = sparse.csr_matrix((interactions, (sparse_rows, sparse_columns)))

        return interaction_sparse_matrix

    def create_training_set(self, interaction_sparse_matrix, seed_no=9257, percent_masked = 0.2):
        '''
        :param interaction_sparse_matrix: item-user CSR sparse matrix
        :param seed_no: this is for reproducibility
        :param percent_masked: proportion of user-item interactions to be masked
        :return: training set matrix on which the model is trained
        '''

        self.interaction_sparse_matrix = interaction_sparse_matrix
        self.seed_no = seed_no
        self.percent_masked = percent_masked

        # set seed for reproducibility
        np.random.seed(seed_no)

        # copy the original matrix to training matrix
        train_matrix = interaction_sparse_matrix.copy()

        # identify indices of the purchase sparse matrix where the element is not zero
        existing_interaction_index = np.transpose(np.nonzero(train_matrix))

        interaction_index_to_replace = train_test_split(existing_interaction_index,
                                                        test_size=percent_masked)[1]  # mask % of user-item interactions
        interaction_index_to_replace = np.transpose(interaction_index_to_replace) # so we can subset our matrix

        self.interaction_index_to_replace = interaction_index_to_replace

        # replace with 0's
        train_matrix[interaction_index_to_replace[0], interaction_index_to_replace[1]] = 0

        return train_matrix

### Model buildig and Prediction

In [3]:
r = RecSys() # initiate the recsys class

Preparing recommender engine...


In [4]:
r.build_recsys() # build the model

This Implicit Collaborative Filtering model is built using the following parameter values: 
alpha = 40
factors = 100
regularization = 0.01
no of iterations = 10


100%|████████████████████████████████████████████████████████████████████████████████| 10.0/10 [00:27<00:00,  2.71s/it]


The recommender engine is ready!


In [5]:
r.recommend_items(550388516)

['airnails',
 'polarus',
 'yoko',
 'trind',
 'godefroy',
 'inm',
 'milv',
 'runail',
 'de.lux',
 'masura']

In [50]:
agg_df = r.agg_df

In [53]:
agg_df = agg_df[agg_df['user_id'].isin(Recommendation_df['user_id'])]

In [57]:
agg_df_to_compare = agg_df.loc[:, ['user_id', 'brand']]
agg_df_to_compare.sort_values('user_id', inplace=True)

In [59]:
rows = []
_ = Recommendation_df.apply(lambda row: [rows.append([row['user_id'], nn]) 
                         for nn in row.recommended_brands], axis=1)

In [61]:
Recommendation_df_new = pd.DataFrame(rows, columns=['user_id','brand'])

In [62]:
Recommendation_df_new.head()

Unnamed: 0,user_id,brand
0,407458156,freedecor
1,407458156,smart
2,407458156,pole
3,407458156,severina
4,407458156,milv


In [64]:
compare_df = pd.merge(agg_df_to_compare, Recommendation_df_new, on=['user_id', 'brand'],
                              how='left', indicator='predicted')

In [6]:
df = pd.read_csv('C:/Users/harini/Desktop/NEUCourses/ADM/Project/RFM_Uplift_data.csv')

In [10]:
df.columns

Index(['user_id', 'product_id', 'campaign', 'promotion', 'conversion',
       'campaign_group', 'target_class', 'Monetary_cluster', 'uplift_score',
       'Recency', 'Frequency', 'Monetary', 'cluster', 'Status'],
      dtype='object')

In [15]:
len(df[df['cluster']== 0]['user_id'].unique())

45833

In [16]:
len(df[df['cluster']== 1]['user_id'].unique())

48167

In [17]:
len(df[df['cluster']== 2]['user_id'].unique())

4458

In [18]:
Recommendation_df = pd.DataFrame(df[df['cluster']== 2]['user_id'].unique())
Recommendation_df.columns = ['user_id']

In [19]:
Recommendation_df.head()

Unnamed: 0,user_id
0,407458156
1,548228624
2,433063720
3,547853931
4,516199419


In [27]:
users = [576802932,412120092,494077766,348405118,560109803]
recommendations = []

In [29]:
for i in users:
    recommendations.append(r.recommend_items(i))

In [20]:
Recommendation_df['recommended_brands'] = Recommendation_df['user_id'].apply(lambda x: r.recommend_items(x))

In [21]:
Recommendation_df.head()

Unnamed: 0,user_id,recommended_brands
0,407458156,"[freedecor, smart, pole, severina, milv, irisk..."
1,548228624,"[bpw.style, freedecor, runail, grattol, de.lux..."
2,433063720,"[cnd, irisk, oniq, freedecor, uno, staleks, yo..."
3,547853931,"[nagaraku, art-visage, italwax, solomeya, arte..."
4,516199419,"[freedecor, airnails, smart, yoko, masura, lia..."


In [23]:
Recommendation_df_0 = pd.DataFrame(df[df['cluster']== 0]['user_id'].unique())
Recommendation_df_0.columns = ['user_id']

In [24]:
Recommendation_df_0['recommended_brands'] = Recommendation_df_0['user_id'].apply(lambda x: r.recommend_items(x))

In [73]:
Recommendation_df_0.head()

Unnamed: 0,user_id,recommended_brands
0,579752970,"[yoko, nagaraku, kapous, lovely, irisk, domix,..."
1,513026898,"[bluesky, ingarden, freedecor, bpw.style, iris..."
2,569278192,"[roubloff, metzger, smart, airnails, de.lux, c..."
3,451312379,"[irisk, uno, runail, ingarden, bpw.style, blue..."
4,555042132,"[bpw.style, irisk, runail, uno, jessnail, grat..."


In [31]:
Recommendation_final = pd.concat([Recommendation_df_0,Recommendation_df]).reset_index(drop = True)

In [41]:
Recommendation_final.to_csv('C:/Users/harini/Desktop/NEUCourses/ADM/Project/Recommendation_data.csv', index = False, encoding = 'utf-8')

In [35]:
Recommendation_final[['Rec 1','Rec 2', 'Rec 3', 'Rec 4', 'Rec 5', 'Rec 6', 'Rec 7', 'Rec 8', 'Rec 9', 'Rec 10']] = pd.DataFrame(Recommendation_final['recommended_brands'].values.tolist(), index= Recommendation_final.index)

## Accuracy mertics

In [72]:
recall_accuracy = round(sum(compare_df.predicted == 'both') / agg_df_to_compare.shape[0] * 100, 1)
precision = round(sum(compare_df.predicted == 'both') / Recommendation_df_new.shape[0] * 100, 1)
print('Recall :',recall_accuracy, '\nPrecision :',precision)

Recall : 41.1 
Precision : 78.5


In [37]:
Recommendation_final = Recommendation_final.drop('recommended_brands', axis =1)