In [2]:
import numpy as np
import pandas as pd

In [4]:
df_movies=pd.read_csv("movies.csv")

In [6]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [48]:
df_movies.shape

(9742, 3)

In [8]:
df_movies.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [10]:
df_ratings=pd.read_csv("ratings.csv")

In [12]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [50]:
df_ratings.shape

(100836, 4)

In [14]:
df_ratings.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [16]:
df_ratings.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
userId,100836.0,326.1276,182.6185,1.0,177.0,325.0,477.0,610.0
movieId,100836.0,19435.3,35530.99,1.0,1199.0,2991.0,8122.0,193609.0
rating,100836.0,3.501557,1.042529,0.5,3.0,3.5,4.0,5.0
timestamp,100836.0,1205946000.0,216261000.0,828124615.0,1019124000.0,1186087000.0,1435994000.0,1537799000.0


In [28]:
df_ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [24]:
# !pip install surprise

In [22]:
from surprise import Reader, Dataset

In [32]:
movie_reader=Reader(line_format="user item rating", rating_scale=(0.5,5))

In [34]:
data=Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']],reader=movie_reader)

In [36]:
type(data)

surprise.dataset.DatasetAutoFolds

In [38]:
data.df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [40]:
# Using KNN (K-nearest neighbour) recommendation model we will find user based collaborative filtering using cosine similary option
from surprise import KNNBasic

In [42]:
# user based (or item based) knn recommendation model using cosine similary option
knn_model=KNNBasic(k=15, min_k=5, sim_options={'name':'cosine','user_based':True},verbose=True)

In [44]:
# import cross validate function
from surprise.model_selection import cross_validate

In [46]:
# 5 fild cross validation
results=cross_validate(algo=knn_model,data=data, measures=['rmse'],cv=5,return_train_measures=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [52]:
results

{'test_rmse': array([0.96664858, 0.97691142, 0.98262458, 0.97752306, 0.97080069]),
 'train_rmse': array([0.86646083, 0.86232237, 0.86220235, 0.86322597, 0.86478994]),
 'fit_time': (0.5187580585479736,
  0.5410768985748291,
  0.5859315395355225,
  0.5365426540374756,
  0.5707623958587646),
 'test_time': (1.7169194221496582,
  1.728809118270874,
  1.6866321563720703,
  1.6885216236114502,
  1.7077915668487549)}

In [54]:
results['test_rmse'].mean()

0.9749016687130467

In [56]:
movies_dict = {}
for i in range(0, df_movies.shape[0]):
    movies_dict[df_movies.iloc[i,0]] = df_movies.iloc[i,1]

In [58]:
movies_dict

{1: 'Toy Story (1995)',
 2: 'Jumanji (1995)',
 3: 'Grumpier Old Men (1995)',
 4: 'Waiting to Exhale (1995)',
 5: 'Father of the Bride Part II (1995)',
 6: 'Heat (1995)',
 7: 'Sabrina (1995)',
 8: 'Tom and Huck (1995)',
 9: 'Sudden Death (1995)',
 10: 'GoldenEye (1995)',
 11: 'American President, The (1995)',
 12: 'Dracula: Dead and Loving It (1995)',
 13: 'Balto (1995)',
 14: 'Nixon (1995)',
 15: 'Cutthroat Island (1995)',
 16: 'Casino (1995)',
 17: 'Sense and Sensibility (1995)',
 18: 'Four Rooms (1995)',
 19: 'Ace Ventura: When Nature Calls (1995)',
 20: 'Money Train (1995)',
 21: 'Get Shorty (1995)',
 22: 'Copycat (1995)',
 23: 'Assassins (1995)',
 24: 'Powder (1995)',
 25: 'Leaving Las Vegas (1995)',
 26: 'Othello (1995)',
 27: 'Now and Then (1995)',
 28: 'Persuasion (1995)',
 29: 'City of Lost Children, The (Cité des enfants perdus, La) (1995)',
 30: 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
 31: 'Dangerous Minds (1995)',
 32: 'Twelve Monkeys (a.k.a. 12 Monkeys) (199

In [64]:
movies_dict[2]

'Jumanji (1995)'

In [66]:
knn_model.predict(1,6)

Prediction(uid=1, iid=6, r_ui=None, est=4.201396959597076, details={'actual_k': 15, 'was_impossible': False})

In [68]:
knn_model.predict(1,6)[3]

4.201396959597076

In [72]:
userid, movieid = 1,6
val = knn_model.predict(uid=userid, iid=movieid)[3]
print(f'Prediction for user {userid} and movie {movies_dict[movieid]} is {np.round(val,0)}')

Prediction for user 1 and movie Heat (1995) is 4.0


In [74]:
userid, movieid = 2,3
val = knn_model.predict(uid=userid, iid=movieid)[3]
print(f'Prediction for user {userid} and movie {movies_dict[movieid]} is {np.round(val,0)}')

Prediction for user 2 and movie Grumpier Old Men (1995) is 3.0


In [76]:
# filter all movie Ids user 1 have watched
user_1_movies = df_ratings[df_ratings.userId == 1]['movieId'].values

In [78]:
user_1_movies[0:10]

array([  1,   3,   6,  47,  50,  70, 101, 110, 151, 157], dtype=int64)

In [80]:
len(user_1_movies)

232

In [84]:
user_1_predictions = {}
for i in range(df_movies.shape[0]):
    movId = df_movies.iloc[i,0]
    if  movId not in user_1_movies:
        user_1_predictions[movies_dict[movId]] = np.round(knn_model.predict(uid=1, iid=movId)[3],1)

In [98]:
len(user_1_predictions)

9505

In [96]:
user_1_predictions

{'Jumanji (1995)': 3.9,
 'Waiting to Exhale (1995)': 2.6,
 'Father of the Bride Part II (1995)': 3.4,
 'Sabrina (1995)': 3.4,
 'Tom and Huck (1995)': 3.2,
 'Sudden Death (1995)': 3.3,
 'GoldenEye (1995)': 3.7,
 'American President, The (1995)': 4.0,
 'Dracula: Dead and Loving It (1995)': 2.3,
 'Balto (1995)': 3.1,
 'Nixon (1995)': 3.9,
 'Cutthroat Island (1995)': 2.9,
 'Casino (1995)': 4.2,
 'Sense and Sensibility (1995)': 3.8,
 'Four Rooms (1995)': 3.7,
 'Ace Ventura: When Nature Calls (1995)': 3.3,
 'Money Train (1995)': 2.6,
 'Get Shorty (1995)': 3.6,
 'Copycat (1995)': 3.3,
 'Assassins (1995)': 3.2,
 'Powder (1995)': 3.1,
 'Leaving Las Vegas (1995)': 4.1,
 'Othello (1995)': 3.6,
 'Now and Then (1995)': 3.5,
 'Persuasion (1995)': 4.2,
 'City of Lost Children, The (Cité des enfants perdus, La) (1995)': 4.1,
 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)': 3.5,
 'Dangerous Minds (1995)': 3.1,
 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)': 4.5,
 'Babe (1995)': 3.1,
 'Dead Man Wa

In [102]:
# get 10 top recommendations to user 1
list(dict(sorted(user_1_predictions.items(), key=lambda item: item[1],reverse=True)))[0:10]

['Life Is Beautiful (La Vita è bella) (1997)',
 'Three Billboards Outside Ebbing, Missouri (2017)',
 'Paths of Glory (1957)',
 'Wings of the Dove, The (1997)',
 "Swept Away (Travolti da un insolito destino nell'azzurro mare d'Agosto) (1975)",
 'Christmas Story, A (1983)',
 'Memories of Murder (Salinui chueok) (2003)',
 'Shawshank Redemption, The (1994)',
 'Wallace & Gromit: The Wrong Trousers (1993)',
 'Lawrence of Arabia (1962)']

In [104]:
user_1_predictions['Life Is Beautiful (La Vita è bella) (1997)']

4.8

In [92]:
df_movies[df_movies['title']=='Life Is Beautiful (La Vita è bella) (1997)']

Unnamed: 0,movieId,title,genres
1730,2324,Life Is Beautiful (La Vita è bella) (1997),Comedy|Drama|Romance|War
