# User-Based Recommender System - Cosine Similarities
## Iteration 1

### Read data

In [5]:
import pandas as pd
movies = pd.read_csv("https://github.com/tiagofassoni/useful-datasets/raw/main/ml-latest-small/movies.csv")
ratings = pd.read_csv("https://github.com/tiagofassoni/useful-datasets/raw/main/ml-latest-small/ratings.csv")
links = pd.read_csv("https://github.com/tiagofassoni/useful-datasets/raw/main/ml-latest-small/links.csv")
tags = pd.read_csv("https://github.com/tiagofassoni/useful-datasets/raw/main/ml-latest-small/tags.csv")

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

data = ratings.merge(movies, on="movieId", how="left")
data.head(4)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller


In [7]:
movie_user = data.pivot_table(index='userId',columns='title',values='rating')
movie_user.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [8]:
movie_user.fillna(0, inplace=True)
movie_user.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 3. Compute pairwise cosine similarities

In [9]:
user_similarities = pd.DataFrame(cosine_similarity(movie_user),
                                 columns=movie_user.index, 
                                 index=movie_user.index)
user_similarities.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.027283,0.05972,0.194395,0.12908,0.128152,0.158744,0.136968,0.064263,0.016875,...,0.080554,0.164455,0.221486,0.070669,0.153625,0.164191,0.269389,0.291097,0.093572,0.145321
2,0.027283,1.0,0.0,0.003726,0.016614,0.025333,0.027585,0.027257,0.0,0.067445,...,0.202671,0.016866,0.011997,0.0,0.0,0.028429,0.012948,0.046211,0.027565,0.102427
3,0.05972,0.0,1.0,0.002251,0.00502,0.003936,0.0,0.004941,0.0,0.0,...,0.005048,0.004892,0.024992,0.0,0.010694,0.012993,0.019247,0.021128,0.0,0.032119
4,0.194395,0.003726,0.002251,1.0,0.128659,0.088491,0.11512,0.062969,0.011361,0.031163,...,0.085938,0.128273,0.307973,0.052985,0.084584,0.200395,0.131746,0.149858,0.032198,0.107683
5,0.12908,0.016614,0.00502,0.128659,1.0,0.300349,0.108342,0.429075,0.0,0.030611,...,0.068048,0.418747,0.110148,0.258773,0.148758,0.106435,0.152866,0.135535,0.261232,0.060792


In [11]:
user_id = 80

### 1. Compute the weights

Accumulate to 1

In [12]:
weights = user_similarities.query("userId")[user_id] / sum(user_similarities.query("userId")[user_id])
weights

userId
1      0.001142
2      0.002472
3      0.000032
4      0.000428
5      0.000569
         ...   
606    0.001954
607    0.001194
608    0.004232
609    0.000503
610    0.004697
Name: 80, Length: 610, dtype: float64

In [13]:
weights = user_similarities.query("userId!=@user_id")[user_id] 
weights

userId
1      0.065308
2      0.141365
3      0.001812
4      0.024481
5      0.032532
         ...   
606    0.111732
607    0.068284
608    0.242065
609    0.028787
610    0.268658
Name: 80, Length: 609, dtype: float64

In [14]:
sum(user_similarities.query("userId!=@user_id")[user_id])

56.19228408085648

In [15]:
# compute the weights for one user
weights = user_similarities.query("userId!=@user_id")[user_id] / sum(user_similarities.query("userId!=@user_id")[user_id])

userId
1    0.001162
2    0.002516
Name: 80, dtype: float64

In [16]:
# select movies that the inputed user has not seen
unseen_movies = movie_user.loc[movie_user.index!=user_id, movie_user.loc[user_id,:]==0]
unseen_movies

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.5,3.5,0.0,0.0,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# multiply the unseen movies and the weights
weighted_averages = unseen_movies.T.dot(weights)
weighted_averages

# alternative method:
# weighted_averages = unseen_movies.multiply(weights, axis=0).sum()

title
'71 (2014)                                   0.019124
'Hellboy': The Seeds of Creation (2004)      0.015911
'Round Midnight (1986)                       0.016997
'Salem's Lot (2004)                          0.004787
'Til There Was You (1997)                    0.005524
                                               ...   
eXistenZ (1999)                              0.228025
xXx (2002)                                   0.228063
xXx: State of the Union (2005)               0.041450
¡Three Amigos! (1986)                        0.197073
À nous la liberté (Freedom for Us) (1931)    0.000742
Length: 9552, dtype: float64

### 4. Rating Predictions:

In [18]:
weighted_averages.sort_values(ascending=False).head(5)

title
Forrest Gump (1994)                                          2.795532
Pulp Fiction (1994)                                          2.688947
Star Wars: Episode IV - A New Hope (1977)                    2.355558
Lord of the Rings: The Fellowship of the Ring, The (2001)    2.274043
Lord of the Rings: The Return of the King, The (2003)        2.223876
dtype: float64

## 1st Recommender Function

In [19]:
def rec_iter1(user_id, n):
    '''
    similarity matrix: has to be previously computed
    user_id: the user for which to provide recommendations
    n: the number of recommendations to provide
    '''
    
    # compute the weights for the inputed user
    weights = user_similarities.query("userId!=@user_id")[user_id] / sum(user_similarities.query("userId!=@user_id")[user_id])
    
    # select movies that the inputed user has not seen
    unseen_movies = movie_user.loc[movie_user.index!=user_id, movie_user.loc[user_id,:]==0]
    
    # multiply the unseen movies and the weights
    weighted_averages = unseen_movies.T.dot(weights)
    
    # get the top n movies
    recommendations = weighted_averages.sort_values(ascending=False).head(n).index.tolist()
    
    return recommendations



### Give recommendations:

In [20]:
rec_iter1(user_id = 1, 
          n = 5)

['Shawshank Redemption, The (1994)',
 'Terminator 2: Judgment Day (1991)',
 'Godfather, The (1972)',
 'Sixth Sense, The (1999)',
 'Lord of the Rings: The Fellowship of the Ring, The (2001)']

In [21]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

sample = movie_user.copy()
# Train
train = sample.copy()
train.iloc[609,8] = 0 # user609 500 days of summer to 0
train.iloc[18,9717] = 0 # user 18 Three Amigos to 0
# Test # create a matrix full of 0 of the same shape than the sample
test = (pd.DataFrame(np.zeros(sample.shape), columns=sample.columns, index=sample.index).apply(pd.to_numeric, downcast='integer'))
#The test set only contains the values we excluded on the train set
test.iloc[609,8] = sample.iloc[609,8]
test.iloc[18,9717] = sample.iloc[18,9717]
# train the model
train_similarity = pd.DataFrame(cosine_similarity(train), columns=sample.index, index=sample.index)
# Predictions
pred_609 = sum(train['(500) Days of Summer (2009)'] * train_similarity.iloc[609]) / (sum(train_similarity.iloc[609])-1)
pred_18 = sum(train['¡Three Amigos! (1986)'] * train_similarity.iloc[18]) / (sum(train_similarity.iloc[18])-1)

print(
    f"""
    User 609 rating was {sample.iloc[609,8]} and its prediction is {pred_609}, 
    User 18 rating was {sample.iloc[18,9717]} and its prediction is {pred_18} 
    """
)



    User 609 rating was 3.5 and its prediction is 0.4417016837658108, 
    User 18 rating was 2.0 and its prediction is 0.26065212730782655 
    


...trying to improve

In [22]:
data

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,Split (2017),Drama|Horror|Thriller
100832,610,168248,5.0,1493850091,John Wick: Chapter Two (2017),Action|Crime|Thriller
100833,610,168250,5.0,1494273047,Get Out (2017),Horror
100834,610,168252,5.0,1493846352,Logan (2017),Action|Sci-Fi


In [23]:
ratings_per_movie = data.groupby('title')['rating'].count().sort_values(ascending=False)
ratings_per_movie

title
Forrest Gump (1994)                          329
Shawshank Redemption, The (1994)             317
Pulp Fiction (1994)                          307
Silence of the Lambs, The (1991)             279
Matrix, The (1999)                           278
                                            ... 
King Solomon's Mines (1950)                    1
King Solomon's Mines (1937)                    1
King Ralph (1991)                              1
King Kong Lives (1986)                         1
À nous la liberté (Freedom for Us) (1931)      1
Name: rating, Length: 9719, dtype: int64

# Reducing amount of data

In [284]:
# counts of ratings per movie as a df
ratings_per_mv_df = pd.DataFrame(ratings_per_movie)
# remove if < 20 ratings
filtered_ratings_per_mv_df = ratings_per_mv_df[ratings_per_mv_df.rating >= 20]
# build a list of titles to keep
popular_movie = filtered_ratings_per_mv_df.index.tolist()
#popular_movie
filtered_ratings_per_mv_df

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
Forrest Gump (1994),329
"Shawshank Redemption, The (1994)",317
Pulp Fiction (1994),307
"Silence of the Lambs, The (1991)",279
"Matrix, The (1999)",278
...,...
Intolerable Cruelty (2003),20
"Secret Life of Walter Mitty, The (2013)",20
Megamind (2010),20
Short Circuit (1986),20


In [318]:
ratings_per_user = data.groupby('userId')['rating'].count()
# counts ratings per user as a df
ratings_per_user_df = pd.DataFrame(ratings_per_user)
# remove if < 20, the others dont know whats going on
filtered_ratings_per_user_df = ratings_per_user_df[ratings_per_user_df.rating >= 300]
# build a list of user_ids to keep
prolific_users = filtered_ratings_per_user_df.index.tolist()
prolific_users

[6,
 18,
 19,
 21,
 28,
 42,
 45,
 50,
 51,
 57,
 62,
 64,
 66,
 68,
 89,
 91,
 103,
 105,
 111,
 125,
 132,
 140,
 156,
 160,
 177,
 182,
 199,
 200,
 202,
 217,
 219,
 221,
 226,
 232,
 249,
 274,
 275,
 288,
 292,
 294,
 298,
 305,
 307,
 313,
 318,
 339,
 357,
 368,
 380,
 381,
 387,
 391,
 414,
 425,
 428,
 438,
 448,
 453,
 462,
 469,
 474,
 477,
 480,
 483,
 489,
 509,
 514,
 517,
 525,
 534,
 555,
 560,
 561,
 567,
 580,
 590,
 596,
 597,
 599,
 600,
 603,
 606,
 608,
 610]

In [319]:
#filtered_ratings = data[data.title.isin(popular_movie)]
filtered_ratings = data[data.userId.isin(prolific_users)]

In [320]:
len(filtered_ratings)

56299

In [321]:
len(data)

100836

In [322]:
filtered_ratings

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
560,6,2,4.0,845553522,Jumanji (1995),Adventure|Children|Fantasy
561,6,3,5.0,845554296,Grumpier Old Men (1995),Comedy|Romance
562,6,4,3.0,845554349,Waiting to Exhale (1995),Comedy|Drama|Romance
563,6,5,5.0,845553938,Father of the Bride Part II (1995),Comedy
564,6,6,4.0,845553757,Heat (1995),Action|Crime|Thriller
...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,Split (2017),Drama|Horror|Thriller
100832,610,168248,5.0,1493850091,John Wick: Chapter Two (2017),Action|Crime|Thriller
100833,610,168250,5.0,1494273047,Get Out (2017),Horror
100834,610,168252,5.0,1493846352,Logan (2017),Action|Sci-Fi


In [323]:
filtered_data = filtered_ratings.pivot_table(index='userId',columns='title',values='rating').fillna(0)
filtered_data

title,'71 (2014),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),00 Schneider - Jagd auf Nihil Baxter (1994),10 (1979),10 Cent Pistol (2015),...,Zootopia (2016),Zulu (1964),Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,0.0,0.0,0.0,0.0,4.0,0.0,0.0,4.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
21,0.0,1.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
600,0.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
603,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0
606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.5,3.5,0.0,0.0


In [312]:
#filtered_data.iloc[500,0]
filtered_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 248 entries, 1 to 610
Columns: 9552 entries, '71 (2014) to À nous la liberté (Freedom for Us) (1931)
dtypes: float64(9552)
memory usage: 18.1 MB


# 3rd Iteration

In [331]:
filtered_data.iloc[83,9047]

2.0

In [332]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

sample1 = filtered_data.copy()
# Train
train = sample1.copy()
train.iloc[83,9048] = 0 
train.iloc[82,9047] = 0 
# Test # create a matrix full of 0 of the same shape than the sample
test = (pd.DataFrame(np.zeros(sample1.shape), columns=sample1.columns, index=sample1.index).apply(pd.to_numeric, downcast='integer'))
#The test set only contains the values we excluded on the train set
test.iloc[83,9048] = sample1.iloc[83,9048]
test.iloc[82,9047] = sample1.iloc[82,9047]
# train the model
train_similarity = pd.DataFrame(cosine_similarity(train), columns=sample1.index, index=sample1.index)
# Predictions
pred_83 = sum(train["xXx: State of the Union (2005)"] * train_similarity.iloc[83]) / (sum(train_similarity.iloc[83]))
pred_82 = sum(train["xXx (2002)"] * train_similarity.iloc[82]) / (sum(train_similarity.iloc[82]))

print(
    f"""
    User 83 rating was {sample1.iloc[83,9048]} and its prediction is {pred_83}, 
    User 82 rating was {sample1.iloc[82,9047]} and its prediction is {pred_82} 
    """
)



    User 83 rating was 1.5 and its prediction is 0.07656563523224985, 
    User 82 rating was 3.5 and its prediction is 0.5692840058775555 
    


well, thanks, even worse...

# 4th iteration - neighbor based

In [25]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

sample = movie_user.copy()
# Train
train = sample.copy()
train.iloc[609,8] = 0 # user609 500 days of summer to 0
train.iloc[18,9717] = 0 # user 18 Three Amigos to 0
# Test # create a matrix full of 0 of the same shape than the sample
test = (pd.DataFrame(np.zeros(sample.shape), columns=sample.columns, index=sample.index).apply(pd.to_numeric, downcast='integer'))
#The test set only contains the values we excluded on the train set
test.iloc[609,8] = sample.iloc[609,8]
test.iloc[18,9717] = sample.iloc[18,9717]
# train the model
train_similarity = pd.DataFrame(cosine_similarity(train), columns=sample.index, index=sample.index)
# select only the closer neigbours 
weighted_ratings = (
pd.DataFrame({'ratings': train['(500) Days of Summer (2009)'],'similarities': train_similarity.iloc[609]})
    .sort_values('similarities', ascending=False) # order values with higher similarities
    .head(5) # select 609 + the 4 closer neigbours
    .query('ratings != 0') # filter the similarity 1, which is the own student
    .assign(weighted_ratings = lambda x: x.ratings * x.similarities) # weight the food and similarities
)
print(weighted_ratings)
# calculate the ponderated weight
pred_609 = sum(weighted_ratings.weighted_ratings) / sum(weighted_ratings.similarities)
print(
    f"""
    User 609 rating was {sample.iloc[609,8]} and its predictions is {pred_609}. 
    """
)

        ratings  similarities  weighted_ratings
userId                                         
249         4.0      0.477976          1.911904
298         2.0      0.447001          0.894002

    User 609 rating was 3.5 and its predictions is 3.033487265130094. 
    


In [27]:
# make recommendations
recs_609 = rec_iter1(609,5)
recs_609

['Silence of the Lambs, The (1991)',
 'True Lies (1994)',
 'Seven (a.k.a. Se7en) (1995)',
 "Schindler's List (1993)",
 'Aladdin (1992)']

...then predicting all ratings positions