# Cosine Similarity

\begin{align}
Cosine(x,y)= \frac{\sum_{i = 1}^{n}x_i y_i
}{\sqrt{\sum_{i=1}^{n}x_i^2} \sqrt{\sum_{i=1}^{n}y_i^2}}
\end{align}

> ## Cosine Similarity Illustration 1

In [1]:
import numpy as np
import pandas as pd

In [2]:
cosine = (1*1+1*1+1*1+1*0)/(np.sqrt(4)*np.sqrt(3))
print(cosine)

0.8660254037844387


In [3]:
cosine = (1*0+1*1+1*1+1*0)/(np.sqrt(4)*np.sqrt(2))
print(cosine)

0.7071067811865475


In [4]:
cosine = (1*1+1*0+1*0+1*0)/(np.sqrt(4)*np.sqrt(1))
print(cosine)

0.5


> ## Cosine Similarity Illustration 2

In [6]:
cosine = (4*3+5*5+5*5+4*0)/(np.sqrt(4**2+5**2+5**2+4**2)*np.sqrt(3**2+5**2+5**2+0**2))
print(cosine)

0.891371527293353


In [7]:
cosine = (4*0+5*5+5*5+4*0)/(np.sqrt(4**2+5**2+5**2+4**2)*np.sqrt(0**2+5**2+5**2+0**2))
print(cosine)

0.7808688094430302


In [8]:
cosine = (4*5+5*0+5*0+4*0)/(np.sqrt(4**2+5**2+5**2+4**2)*np.sqrt(4**2+0**2+0**2+0**2))
print(cosine)

0.5521576303742327


# Content Based Filtering

> ## Content Based Filtering One User

In [9]:
import pandas as pd
import numpy as np

In [10]:
movies = ["Terminator 2","Interstellar","Ant Man 2","3 Idiots"]
scores = [7,9,8,9]
action = [1,0,1,0]
scifi = [1,1,1,0]
adventure = [0,1,1,0]
comedy = [0,0,1,1]
drama = [0,1,0,1]

df_movies = pd.DataFrame({
    'movie':movies,
    'scores':scores,
    'Action':action,
    'Sci-Fi':scifi,
    'Adventure':adventure,
    'Comedy':comedy,
    'Drama':drama
})

df_movies

Unnamed: 0,movie,scores,Action,Sci-Fi,Adventure,Comedy,Drama
0,Terminator 2,7,1,1,0,0,0
1,Interstellar,9,0,1,1,0,1
2,Ant Man 2,8,1,1,1,1,0
3,3 Idiots,9,0,0,0,1,1


In [11]:
df_movies2 = df_movies.copy()
df_movies2.drop('movie', axis = 1, inplace = True)

for i in ['Action','Sci-Fi','Adventure','Comedy','Drama']:
  df_movies2[i] = df_movies2['scores']*df_movies2[i]

df_movies2.drop('scores', axis = 1, inplace = True)
movie_scoring = df_movies2.sum()/df_movies2.sum().sum()
movie_scoring

Unnamed: 0,0
Action,0.164835
Sci-Fi,0.263736
Adventure,0.186813
Comedy,0.186813
Drama,0.197802


In [12]:
movies = ["Titanic",'Martian','GOTG Vol 2']
action = [1,0,1]
scifi = [1,1,1]
adventure = [0,1,1]
comedy = [0,0,1]
drama = [0,1,0]

df_movies_recommendation = pd.DataFrame({
    'movie':movies,
    'Action':action,
    'Sci-Fi':scifi,
    'Adventure':adventure,
    'Comedy':comedy,
    'Drama':drama
})

df_movies_recommendation

Unnamed: 0,movie,Action,Sci-Fi,Adventure,Comedy,Drama
0,Titanic,1,1,0,0,0
1,Martian,0,1,1,0,1
2,GOTG Vol 2,1,1,1,1,0


In [13]:
movie_scoring

Unnamed: 0,0
Action,0.164835
Sci-Fi,0.263736
Adventure,0.186813
Comedy,0.186813
Drama,0.197802


In [15]:
for i in ['Action','Sci-Fi','Adventure','Comedy','Drama']:
  df_movies_recommendation[i] = df_movies_recommendation[i]*movie_scoring[i]

df_movies_recommendation['movie rating prediction'] = df_movies_recommendation.sum(axis = 1)
df_movies_recommendation

TypeError: can only concatenate str (not "float") to str

Recommendation Order for the user:
- GOTG Vol 2
- Martian
- Titanic

> ## Content Based Filtering Multiple User

In [None]:
import pandas as pd
import numpy as np

In [None]:
movies = ["Terminator 2","Interstellar","Ant Man 2","3 Idiots"]
action = [1,0,1,0]
scifi = [1,1,1,0]
adventure = [0,1,1,0]
comedy = [0,0,1,1]
drama = [0,1,0,1]

df_item_features = pd.DataFrame({
    'movie':movies,
    'Action':action,
    'Sci-Fi':scifi,
    'Adventure':adventure,
    'Comedy':comedy,
    'Drama':drama
})

df_item_features

Unnamed: 0,movie,Action,Sci-Fi,Adventure,Comedy,Drama
0,Terminator 2,1,1,0,0,0
1,Interstellar,0,1,1,0,1
2,Ant Man 2,1,1,1,1,0
3,3 Idiots,0,0,0,1,1


In [None]:
user = ['user 1','user 2','user 3','user 4']
terminator_2 = [7,8,9,0]
interstellar = [9,0,0,7]
ant_man_2 = [8,6,0,0]
three_idiots = [9,5,10,9]

df_user_items = pd.DataFrame({
    'user':user,
    'Terminator 2':terminator_2,
    'Interstellar':interstellar,
    'Ant Man 2':ant_man_2,
    '3 Idiots':three_idiots
})

df_user_items

Unnamed: 0,user,Terminator 2,Interstellar,Ant Man 2,3 Idiots
0,user 1,7,9,8,9
1,user 2,8,0,6,5
2,user 3,9,0,0,10
3,user 4,0,7,0,9


In [None]:
arr_user_items = np.array(df_user_items.drop('user', axis = 1))
arr_item_features = np.array(df_item_features.drop('movie', axis = 1))

n_user = arr_user_items.shape[0]
n_item = arr_user_items.shape[1]
n_feature = arr_item_features.shape[1]

arr_user_items_score = np.empty((n_user,n_item))
arr_user_feature = np.empty((n_user,n_feature))

for i in range(0,n_user):
  # print(arr_user_items[i,:])
  user_feature = np.matmul(arr_user_items[i,:],arr_item_features)
  # print(user_feature)
  user_feature = user_feature/user_feature.sum()
  arr_user_feature[i,:] = user_feature

In [None]:
df_user_feature = pd.DataFrame(arr_user_feature)
df_user_feature.columns = df_item_features.columns[1:]
df_user_feature.index = user
df_user_feature

Unnamed: 0,Action,Sci-Fi,Adventure,Comedy,Drama
user 1,0.164835,0.263736,0.186813,0.186813,0.197802
user 2,0.28,0.28,0.12,0.22,0.1
user 3,0.236842,0.236842,0.0,0.263158,0.263158
user 4,0.0,0.179487,0.179487,0.230769,0.410256


In [None]:
for i in range(0, n_user):
  user_item_score = np.matmul(arr_item_features,arr_user_feature[i,:])
  arr_user_items_score[i,:] = user_item_score

arr_user_items_score_unwatched = np.where(arr_user_items == 0,arr_user_items_score,0)

df_user_items_score_unwatched = pd.DataFrame(arr_user_items_score_unwatched)
df_user_items_score_unwatched.columns = movies
df_user_items_score_unwatched.index = user
df_user_items_score_unwatched

Unnamed: 0,Terminator 2,Interstellar,Ant Man 2,3 Idiots
user 1,0.0,0.0,0.0,0.0
user 2,0.0,0.5,0.0,0.0
user 3,0.0,0.5,0.736842,0.0
user 4,0.179487,0.0,0.589744,0.0


Recommendation Order for unwatched movies
- User 3 : Ant Man 2, Interstellar
- User 4 : Ant Man 2, Terminator 2


In [None]:
movies = ["Titanic","Martian","GOTG Vol 2"]
action = [1,0,1]
scifi = [1,1,1]
adventure = [0,1,1]
comedy = [0,0,1]
drama = [0,1,0]

df_item_features_new = pd.DataFrame({
    'movie':movies,
    'Action':action,
    'Sci-Fi':scifi,
    'Adventure':adventure,
    'Comedy':comedy,
    'Drama':drama
})

df_item_features_new

Unnamed: 0,movie,Action,Sci-Fi,Adventure,Comedy,Drama
0,Titanic,1,1,0,0,0
1,Martian,0,1,1,0,1
2,GOTG Vol 2,1,1,1,1,0


In [None]:
arr_item_features_new = np.array(df_item_features_new.drop('movie', axis = 1))

n_item_new = df_item_features_new.shape[0]

arr_user_items_score_new = np.empty((n_user,n_item_new))

for i in range(0, n_user):
  user_item_score = np.matmul(arr_item_features_new,arr_user_feature[i,:])
  arr_user_items_score_new[i,:] = user_item_score

df_user_items_score_new = pd.DataFrame(arr_user_items_score_new)
df_user_items_score_new.index = user
df_user_items_score_new.columns = df_item_features_new['movie']
df_user_items_score_new

movie,Titanic,Martian,GOTG Vol 2
user 1,0.428571,0.648352,0.802198
user 2,0.56,0.5,0.9
user 3,0.473684,0.5,0.736842
user 4,0.179487,0.769231,0.589744


Recommendation Order
- User 1 : GOTG Vol 2, Martian, Titanic
- User 2 : GOTG Vol 2, Titanic, Martian
- User 3 : GOTG Vol 2, Martian, Titanic
- User 4 : Martian, GOTG Vol 2, Titanic

# Collaborative Filtering : Model Based

> ## Library

In [None]:
!pip install scikit-surprise

Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/97/37/5d334adaf5ddd65da99fc65f6507e0e4599d092ba048f4302fe8775619e8/scikit-surprise-1.1.1.tar.gz (11.8MB)
[K     |████████████████████████████████| 11.8MB 3.5MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp36-cp36m-linux_x86_64.whl size=1618250 sha256=146da6b286bacfde843e2c004ab24c9db238be5deae0e583f3c146fabc244d78
  Stored in directory: /root/.cache/pip/wheels/78/9c/3d/41b419c9d2aff5b6e2b4c0fc8d25c538202834058f9ed110d0
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [None]:
import pandas as pd

import seaborn as sns

from surprise import Reader
from surprise import Dataset

from surprise import SVD
from surprise import BaselineOnly

from surprise import accuracy
from surprise.model_selection import cross_validate, train_test_split
from surprise.model_selection import GridSearchCV

> ## Data

In [None]:
column_names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('u.data', sep='\t', names=column_names)

In [None]:
df

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,50,5,881250949
1,0,172,5,881250949
2,0,133,1,881250949
3,196,242,3,881250949
4,186,302,3,891717742
...,...,...,...,...
99998,880,476,3,880175444
99999,716,204,5,879795543
100000,276,1090,1,874795795
100001,13,225,2,882399156


In [None]:
user_item_rating_matrix = df.pivot_table(values = 'rating', index = 'user_id', columns = 'item_id')

In [None]:
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df[['user_id','item_id','rating']], reader)

In [None]:
data.df.head(10)

Unnamed: 0,user_id,item_id,rating
0,0,50,5
1,0,172,5
2,0,133,1
3,196,242,3
4,186,302,3
5,22,377,1
6,244,51,2
7,166,346,1
8,298,474,4
9,115,265,2


> ## Validation

Data Splitting

In [None]:
trainset, testset = train_test_split(data, test_size=0.25,random_state=101)

Algo 1 : SVD

In [None]:
algo = SVD()

algo.fit(trainset)
predictions = algo.test(testset)

In [None]:
accuracy.rmse(predictions)

RMSE: 0.9398


0.9397981717733036

Algo 2 : ALS

In [None]:
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
algo = BaselineOnly(bsl_options=bsl_options)

algo.fit(trainset)
predictions = algo.test(testset)

Estimating biases using als...


In [None]:
accuracy.rmse(predictions)

RMSE: 0.9409


0.9408558035131163

Based on train-test validation, SVD is better than ALS

> ## Cross Validation

Algo 1 : SVD

In [None]:
algo = SVD()

cv_svd = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9393  0.9382  0.9425  0.9332  0.9293  0.9365  0.0047  
MAE (testset)     0.7376  0.7407  0.7399  0.7361  0.7352  0.7379  0.0021  
Fit time          4.95    4.89    4.89    4.84    4.88    4.89    0.04    
Test time         0.14    0.23    0.14    0.24    0.16    0.18    0.04    


In [None]:
print('rmse cv mean',cv_svd['test_rmse'].mean())

rmse cv mean 0.9365023535802692


Algo 2 : ALS

In [None]:
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }

algo = BaselineOnly(bsl_options=bsl_options)

cv_als = cross_validate(algo, data, measures=['RMSE','MAE'], cv=5, verbose=False)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


In [None]:
print('rmse cv mean',cv_als['test_rmse'].mean())

rmse cv mean 0.9415166339469463


Based on cross validation, SVD is better than ALS

> ## Hyperparameter Tuning

We optimize SVD further using hyperparameter tuning

In [None]:
param_grid = {'n_epochs': [5, 10, 20], 'lr_all': [0.002, 0.005],
              'reg_all': [0.02, 0.4, 0.6]}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

best score and best hyperparameter

In [None]:
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

0.946206662876543
{'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.02}


In [None]:
print(gs.best_score['mae'])
print(gs.best_params['mae'])

0.7470844480823832
{'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.02}


before

In [None]:
algo = SVD()

cv_svd = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9345  0.9396  0.9323  0.9350  0.9444  0.9372  0.0043  
MAE (testset)     0.7371  0.7390  0.7345  0.7380  0.7459  0.7389  0.0038  
Fit time          4.82    4.82    4.82    4.82    4.81    4.82    0.00    
Test time         0.14    0.14    0.24    0.14    0.14    0.16    0.04    


In [None]:
print('rmse cv mean',cv_svd['test_rmse'].mean())

rmse cv mean 0.9371529626957331


after

In [None]:
algo = SVD(n_epochs = 20, lr_all = 0.005, reg_all = 0.02)
cv_svd = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9402  0.9373  0.9302  0.9334  0.9276  0.9337  0.0046  
MAE (testset)     0.7407  0.7373  0.7362  0.7356  0.7309  0.7361  0.0032  
Fit time          4.86    4.89    4.95    4.92    4.91    4.91    0.03    
Test time         0.26    0.14    0.26    0.14    0.14    0.19    0.06    


In [None]:
print('rmse cv mean',cv_svd['test_rmse'].mean())

rmse cv mean 0.9337282090558603


> ## Prediction Result

We will reccommend item 565, 647, 665, and 677 to users 0, 111 and 212.

In [None]:
df_test = pd.DataFrame(columns = ['user_id','item_id'])

for i in [0,111,212]:
    for j in [647, 665, 565, 677]:
        df_test = df_test.append({'user_id':i,'item_id':j},ignore_index = True)

df_test

Unnamed: 0,user_id,item_id
0,0,647
1,0,665
2,0,565
3,0,677
4,111,647
5,111,665
6,111,565
7,111,677
8,212,647
9,212,665


Build the final model using SVD with following hyperparameter, n_epochs = {5, 10}, lr_all = {0.002, 0.005}, reg_all = {0.4, 0.6}

In [None]:
algo = SVD(n_epochs = 20, lr_all = 0.005, reg_all = 0.02)
algo.fit(trainset)

y = []

for _, row in df_test.iterrows():
    est = algo.predict(row.user_id, row.item_id)
    y.append(est[3])

df_test['rating'] = y

df_test.sort_values(by = ['user_id','rating'], ascending = [True,False], inplace = True)

USER ID - 0

In [None]:
df_test[df_test['user_id'] == 0]

Unnamed: 0,user_id,item_id,rating
0,0,647,3.94163
3,0,677,3.310805
2,0,565,3.08572
1,0,665,3.040065


recommendation order for user 0 : 647, 677, 665, 565

USER ID - 111

In [None]:
df_test[df_test['user_id'] == 111]

Unnamed: 0,user_id,item_id,rating
4,111,647,3.903866
7,111,677,3.249486
5,111,665,3.002651
6,111,565,2.941813


recommendation order for user 0 : 647, 677, 565, 665

USER ID - 212

In [None]:
df_test[df_test['user_id'] == 212]

Unnamed: 0,user_id,item_id,rating
8,212,647,3.844641
11,212,677,3.419403
9,212,665,2.886898
10,212,565,2.863581


recommendation order for user 0 : 647, 677, 565, 665