# Collaborative Filtering (CF)

## Importing the libraries

In [1]:
import numpy as np
import pandas as pd

## Importing the dataset

In [2]:
column_names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('u.data', sep='\t', names=column_names)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,50,5,881250949
1,0,172,5,881250949
2,0,133,1,881250949
3,196,242,3,881250949
4,186,302,3,891717742


In [3]:
movie_titles = pd.read_csv("Movie_Id_Titles")
movie_titles.head()

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [4]:
df = pd.merge(df,movie_titles,on='item_id')
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,0,50,5,881250949,Star Wars (1977)
1,290,50,5,880473582,Star Wars (1977)
2,79,50,4,891271545,Star Wars (1977)
3,2,50,5,888552084,Star Wars (1977)
4,8,50,5,879362124,Star Wars (1977)


In [5]:
n_users = df.user_id.nunique()
n_items = df.item_id.nunique()

print('Num. of Users: '+ str(n_users))
print('Num of Movies: '+ str(n_items))

Num. of Users: 944
Num of Movies: 1682


In [6]:
df.describe()

Unnamed: 0,user_id,item_id,rating,timestamp
count,100003.0,100003.0,100003.0,100003.0
mean,462.470876,425.520914,3.529864,883528800.0
std,266.622454,330.797791,1.125704,5343791.0
min,0.0,1.0,1.0,874724700.0
25%,254.0,175.0,3.0,879448700.0
50%,447.0,322.0,4.0,882826900.0
75%,682.0,631.0,4.0,888260000.0
max,943.0,1682.0,5.0,893286600.0


## Splitting the dataset

In [7]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(df, test_size=0.25)

In [8]:
train_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title
45425,533,186,3,879438850,"Blues Brothers, The (1980)"
18075,246,174,3,884921086,Raiders of the Lost Ark (1981)
97845,407,565,3,876348702,Village of the Damned (1995)
48457,806,407,3,882386125,Spy Hard (1996)
33065,804,68,3,879445975,"Crow, The (1994)"


## Formatting the dataset

In [9]:
#Create two user-item matrices, one for training and another for testing
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]  

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

In [10]:
train_data_matrix

array([[5., 3., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [11]:
train_data_matrix.shape

(944, 1682)

In [12]:
test_data_matrix.shape

(944, 1682)

# Collaborative Filtering Memory-Based User-Based

## Calculating similarity

Using Cosine similarity:

<img class="aligncenter size-thumbnail img-responsive" src="https://latex.codecogs.com/gif.latex?s_u^{cos}(u_k,u_a)=\frac{u_k&space;\cdot&space;u_a&space;}{&space;\left&space;\|&space;u_k&space;\right&space;\|&space;\left&space;\|&space;u_a&space;\right&space;\|&space;}&space;=\frac{\sum&space;x_{k,m}x_{a,m}}{\sqrt{\sum&space;x_{k,m}^2\sum&space;x_{a,m}^2}}"/>

In [13]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')

## Predicting

User-based:
<img class="aligncenter size-thumbnail img-responsive" src="https://latex.codecogs.com/gif.latex?\hat{x}_{k,m}&space;=&space;\bar{x}_{k}&space;&plus;&space;\frac{\sum\limits_{u_a}&space;sim_u(u_k,&space;u_a)&space;(x_{a,m}&space;-&space;\bar{x_{u_a}})}{\sum\limits_{u_a}|sim_u(u_k,&space;u_a)|}"/>

In [14]:
def predict_user(ratings, similarity):
    mean_user_rating = ratings.mean(axis=1)
    #You use np.newaxis so that mean_user_rating has same format as ratings
    ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
    pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    return pred
user_prediction = predict_user(train_data_matrix, user_similarity)

In [15]:
user_prediction

array([[ 1.60646749,  0.60331475,  0.48053452, ...,  0.30775069,
         0.30527272,  0.30527272],
       [ 1.33103566,  0.29614012,  0.11688105, ..., -0.07675572,
        -0.0786909 , -0.0786909 ],
       [ 1.34092188,  0.27041619,  0.09834367, ..., -0.10137732,
        -0.10290949, -0.10290949],
       ...,
       [ 1.36575643,  0.31961978,  0.16842259, ..., -0.02406274,
        -0.02623138, -0.02623138],
       [ 1.41144811,  0.39478775,  0.2660305 , ...,  0.09394756,
         0.09146921,  0.09146921],
       [ 1.21484584,  0.19169693,  0.03735874, ..., -0.15683088,
        -0.15911378, -0.15911378]])

In [16]:
user_prediction.shape

(944, 1682)

## Evaluating

Using Root Mean Squared Error (RMSE):

<img src="https://latex.codecogs.com/gif.latex?RMSE&space;=\sqrt{\frac{1}{N}&space;\sum&space;(x_i&space;-\hat{x_i})^2}" title="RMSE =\sqrt{\frac{1}{N} \sum (x_i -\hat{x_i})^2}" />

In [17]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [18]:
print('Collaborative Filtering Memory-Based User-Based RMSE: ' + str(rmse(user_prediction, test_data_matrix)))

Collaborative Filtering Memory-Based User-Based RMSE: 3.123690053384963


# Collaborative Filtering Memory-Based Item-Based

## Calculating similarity

Using Cosine similarity:

<img class="aligncenter size-thumbnail img-responsive" src="https://latex.codecogs.com/gif.latex?s_u^{cos}(i_m,i_b)=\frac{i_m&space;\cdot&space;i_b&space;}{&space;\left&space;\|&space;i_m&space;\right&space;\|&space;\left&space;\|&space;i_b&space;\right&space;\|&space;}&space;=\frac{\sum&space;x_{a,m}x_{a,b}}{\sqrt{\sum&space;x_{a,m}^2\sum&space;x_{a,b}^2}}
"/>

In [19]:
from sklearn.metrics.pairwise import pairwise_distances
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

## Predicting

Item-based:
<img class="aligncenter size-thumbnail img-responsive" src="https://latex.codecogs.com/gif.latex?\hat{x}_{k,m}&space;=&space;\frac{\sum\limits_{i_b}&space;sim_i(i_m,&space;i_b)&space;(x_{k,b})&space;}{\sum\limits_{i_b}|sim_i(i_m,&space;i_b)|}"/>

In [20]:
def predict_item(ratings, similarity):
    pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])    
    return pred
item_prediction = predict_item(train_data_matrix, item_similarity)

In [21]:
item_prediction

array([[0.37877391, 0.38811035, 0.41763886, ..., 0.45860338, 0.45627603,
        0.45627603],
       [0.07900639, 0.08869014, 0.08543892, ..., 0.08858275, 0.08982748,
        0.08982748],
       [0.06422581, 0.0677163 , 0.06549154, ..., 0.06348655, 0.06662701,
        0.06662701],
       ...,
       [0.11766989, 0.12316576, 0.12933446, ..., 0.13541834, 0.13563355,
        0.13563355],
       [0.19797636, 0.19768283, 0.21427784, ..., 0.24694849, 0.24509221,
        0.24509221],
       [0.00345588, 0.00430433, 0.00507049, ..., 0.0059939 , 0.00594884,
        0.00594884]])

In [22]:
item_prediction.shape

(944, 1682)

## Evaluating

Using Root Mean Squared Error (RMSE):

<img src="https://latex.codecogs.com/gif.latex?RMSE&space;=\sqrt{\frac{1}{N}&space;\sum&space;(x_i&space;-\hat{x_i})^2}" title="RMSE =\sqrt{\frac{1}{N} \sum (x_i -\hat{x_i})^2}" />

In [23]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [24]:
print('Collaborative Filtering Memory-Based Item-Based RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

Collaborative Filtering Memory-Based Item-Based RMSE: 3.453003278118396


# Collaborative Filtering Model-Based

## Building the machine learning algorithm

Using Singular Value Decomposition:

<img src="https://latex.codecogs.com/gif.latex?X=USV^T" title="X=USV^T" />

In [25]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

#get SVD components from train matrix. Choose k.
u, s, vt = svds(train_data_matrix, k=20)
s_diag_matrix=np.diag(s)

## Predicting

Using the dot product of *`U`*, *`S`* and *`V^T`*.

In [26]:
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)

In [27]:
X_pred

array([[ 2.91036945e+00,  2.02866289e+00,  4.34762071e-01, ...,
        -8.46597412e-03,  0.00000000e+00,  0.00000000e+00],
       [ 1.18053368e+00, -1.25584387e-01,  7.99441569e-02, ...,
         5.40246631e-03,  0.00000000e+00,  0.00000000e+00],
       [-2.04303958e-01, -9.61831895e-02,  1.24389207e-01, ...,
         1.52019169e-02,  0.00000000e+00,  0.00000000e+00],
       ...,
       [ 9.53173296e-01,  1.08356512e-01, -6.70182987e-02, ...,
         6.77615383e-03,  0.00000000e+00,  0.00000000e+00],
       [ 1.10280427e+00,  1.41866132e+00,  9.54873197e-01, ...,
        -8.44290685e-03,  0.00000000e+00,  0.00000000e+00],
       [ 2.43000822e-01,  2.63778143e-02,  1.35301056e-02, ...,
        -5.82967681e-04,  0.00000000e+00,  0.00000000e+00]])

In [28]:
X_pred.shape

(944, 1682)

## Evaluating

Using Root Mean Squared Error (RMSE):

<img src="https://latex.codecogs.com/gif.latex?RMSE&space;=\sqrt{\frac{1}{N}&space;\sum&space;(x_i&space;-\hat{x_i})^2}" title="RMSE =\sqrt{\frac{1}{N} \sum (x_i -\hat{x_i})^2}" />

In [29]:
print('User-based CF MSE: ' + str(rmse(X_pred, test_data_matrix)))

User-based CF MSE: 2.715586302138211
