In [1]:
# Install the Surprise recommender library (imported below as `surprise`).
# `%pip` installs into the environment of the running kernel, which `!pip` does not guarantee;
# `scikit-surprise` is the distribution name that provides the `surprise` package.
%pip install -q scikit-surprise



In [2]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.manifold as manifold

from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.neighbors import KDTree
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [3]:
# Download the three input CSV files from Google Drive by file id.
# Each trailing comment names the file that the id resolves to; on Colab they land in /content.
!gdown --id 11jHTacIQ2e3ndLseet-YQp3clAGWTpCu #ratings.csv
!gdown --id 1mwNCe4qZoPvE_ktbHrvMNQ4hFovDsL3w #ratings_small.csv
!gdown --id 1-gTwkOpJ1RNMbYUnEAjR7b6m-19zGKHy #KaggleMovies2.csv

Downloading...
From: https://drive.google.com/uc?id=11jHTacIQ2e3ndLseet-YQp3clAGWTpCu
To: /content/ratings.csv
710MB [00:04, 159MB/s]
Downloading...
From: https://drive.google.com/uc?id=1mwNCe4qZoPvE_ktbHrvMNQ4hFovDsL3w
To: /content/ratings_small.csv
2.44MB [00:00, 76.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=1-gTwkOpJ1RNMbYUnEAjR7b6m-19zGKHy
To: /content/KaggleMovies2.csv
3.93MB [00:00, 125MB/s]


In [4]:
# Read the downloaded CSVs in order: movie metadata, the full ratings file, the small ratings file.
dfMovies5, dfMovies6, dfMovies7 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/KaggleMovies2.csv",
        "/content/ratings.csv",
        "/content/ratings_small.csv",
    )
)

In [5]:
# Working copy of the movie metadata so the frame read from disk stays untouched.
df_movies = dfMovies5.copy()
# pd.concat already builds a brand-new frame, so chaining .copy() only held a
# second duplicate of ~26M rating rows in memory for no benefit.
df_ratings = pd.concat([dfMovies6, dfMovies7])

In [6]:
# Give the metadata key the same name the ratings frames use, so the two can be joined on it.
df_movies = df_movies.rename({'id': 'movieId'}, axis='columns')

In [7]:
# Shallow copy of the full ratings frame: a new DataFrame object that does not duplicate the data.
ratings_copy = dfMovies6.copy(deep=False)

In [8]:
# Remove the timestamp column, which nothing below reads.
# drop(..., errors='ignore') replaces `del` so this cell is idempotent:
# running it a second time no longer raises KeyError.
ratings_copy = ratings_copy.drop(columns=['timestamp'], errors='ignore')

In [9]:
ratings_copy

Unnamed: 0,userId,movieId,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0
...,...,...,...
26024284,270896,58559,5.0
26024285,270896,60069,5.0
26024286,270896,63082,4.5
26024287,270896,64957,4.5


Group by User ID and get the count of how many movies each user rated.

In [10]:
# Number of ratings per user: a Series indexed by userId.
top_users = ratings_copy.groupby('userId')['movieId'].count()

Keep only users who rated more than 50 movies (users with 50 or fewer ratings are dropped).

In [11]:
# Retain users whose rating count is strictly greater than 50.
top_users = top_users.loc[top_users > 50]

In [12]:
# The Series index holds the userIds that survived the filter.
top_users_list = top_users.index

In [13]:
# Keep only the ratings made by the retained users.
rated_by_top_user = ratings_copy['userId'].isin(top_users_list)
top_ratings = ratings_copy.loc[rated_by_top_user]

In [14]:
top_ratings

Unnamed: 0,userId,movieId,rating
59,4,223,4.0
60,4,415,4.0
61,4,648,4.0
62,4,1097,5.0
63,4,1197,4.0
...,...,...,...
26024284,270896,58559,5.0
26024285,270896,60069,5.0
26024286,270896,63082,4.5
26024287,270896,64957,4.5


Group by movie ID and get the count of how many users rated each movie

In [15]:
# Number of ratings per movie, counted over all of ratings_copy (not just the filtered users).
top_movies = ratings_copy.groupby('movieId')['userId'].count()

Keep only movies that were rated more than 100 times (movies with 100 or fewer ratings are dropped).

In [16]:
# Retain movies whose rating count is strictly greater than 100.
top_movies = top_movies.loc[top_movies > 100]

In [17]:
# The Series index holds the movieIds that survived the filter.
top_movies_list = top_movies.index

In [18]:
# Further restrict the ratings to the retained movies.
is_top_movie = top_ratings['movieId'].isin(top_movies_list)
top_ratings = top_ratings.loc[is_top_movie]

In [19]:
top_ratings

Unnamed: 0,userId,movieId,rating
59,4,223,4.0
60,4,415,4.0
61,4,648,4.0
62,4,1097,5.0
63,4,1197,4.0
...,...,...,...
26024284,270896,58559,5.0
26024285,270896,60069,5.0
26024286,270896,63082,4.5
26024287,270896,64957,4.5


Add Movie Title column to dataframe

In [20]:
# Attach a title to every rating with a left join on movieId; ratings whose
# movieId has no metadata row keep a NaN title.
# The lookup is de-duplicated first so that a movieId listed more than once in
# the metadata cannot multiply rating rows (duplicated (user, movie) rows would
# otherwise end up in the data handed to SVD). validate= makes that assumption explicit.
# NOTE(review): ratings.csv `movieId` and the metadata `id` may belong to different
# id spaces (e.g. MovieLens ids vs. TMDB ids) - the joined titles look unrelated to
# the ids. Confirm, and map through a links table before merging if so.
movie_titles = df_movies[["movieId", "title"]].drop_duplicates(subset="movieId")
top_ratings = pd.merge(
    top_ratings,
    movie_titles,
    on="movieId",
    how="left",
    validate="many_to_one",
)

In [21]:
top_ratings

Unnamed: 0,userId,movieId,rating,title
0,4,223,4.0,rebecca
1,4,415,4.0,batman & robin
2,4,648,4.0,beauty and the beast
3,4,1097,5.0,
4,4,1197,4.0,
...,...,...,...,...
22381164,270896,58559,5.0,confession of a child of the century
22381165,270896,60069,5.0,
22381166,270896,63082,4.5,
22381167,270896,64957,4.5,


Remove movies with no movie title

In [22]:
# Discard ratings whose movieId found no title in the metadata.
top_ratings = top_ratings.dropna(subset=["title"])

In [23]:
top_ratings

Unnamed: 0,userId,movieId,rating,title
0,4,223,4.0,rebecca
1,4,415,4.0,batman & robin
2,4,648,4.0,beauty and the beast
7,4,1422,4.0,the departed
9,4,1597,3.0,meet the parents
...,...,...,...,...
22381149,270896,48780,5.0,boat
22381151,270896,49530,4.0,in time
22381158,270896,54001,4.0,the traveler
22381160,270896,54503,4.0,the mystery of chess boxing


Create pivot table for user ratings

In [46]:
# Title-by-user rating matrix: one row per title, one column per userId.
# pivot_table averages ratings that fall in the same (title, userId) cell;
# cells with no rating are filled with 0.
user_rating_matrix = pd.pivot_table(
    top_ratings,
    values='rating',
    index='title',
    columns='userId',
).fillna(0)

In [47]:
user_rating_matrix

userId,8,11,12,15,16,20,24,30,34,37,41,43,46,47,49,53,55,56,60,62,63,65,68,74,76,79,82,98,111,115,120,125,132,150,151,153,158,160,166,173,...,270725,270727,270729,270731,270734,270737,270740,270746,270750,270751,270759,270763,270765,270769,270775,270776,270784,270787,270788,270793,270795,270807,270828,270833,270835,270836,270844,270850,270854,270857,270859,270869,270871,270872,270879,270885,270887,270893,270894,270896
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
!women art revolution,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$5 a day,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'gator bait,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0
'twas the night before christmas,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...and the pursuit of happiness,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
¡a volar joven!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
à nos amours,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ödipussi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
şaban oğlu şaban,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### KNN

Create sparse matrix

In [48]:
# Compressed sparse row version of the mostly-zero rating matrix.
sparse_user_rating_matrix = csr_matrix(user_rating_matrix.to_numpy())

In [49]:
sparse_user_rating_matrix

<3008x63577 sparse matrix of type '<class 'numpy.float64'>'
	with 8204732 stored elements in Compressed Sparse Row format>

Fit matrix to KNN model

In [50]:
# Brute-force nearest-neighbour index using cosine distance between title rows,
# running on all available cores.
knn = NearestNeighbors(
    n_neighbors=20,
    algorithm='brute',
    metric='cosine',
    n_jobs=-1,
)

In [51]:
# Fit on the sparse matrix: every row (one title) becomes a point that can be returned as a neighbour.
knn.fit(sparse_user_rating_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=20, p=2,
                 radius=1.0)

In [52]:
def make_recommendation(value, n_recommendations=10):
    """Print the titles whose rating vectors are closest to the given movie.

    Looks `value` up in the index of the module-level `user_rating_matrix`,
    queries the module-level fitted `knn` model with that row, and prints a
    ranked list ("1 <title>", "2 <title>", ...).

    Parameters
    ----------
    value : str
        Movie title to look up. An exact match is tried first; if that fails,
        the title is retried with surrounding whitespace removed and
        lower-cased.
    n_recommendations : int, default 10
        Maximum number of neighbours to print.

    Returns
    -------
    None
        Results are printed. "Movie not found" is printed when the title is
        not in the matrix.
    """
    titles = user_rating_matrix.index

    if value not in titles:
        # Fall back to a normalised spelling so "Iron Man " can still match "iron man".
        normalised = str(value).strip().lower()
        if normalised not in titles:
            print("Movie not found")
            return
        value = normalised

    # Row position of the query; assumes titles are unique in the index
    # (true for a pivot_table result) so get_loc returns a single integer.
    query_pos = titles.get_loc(value)
    query_vector = user_rating_matrix.iloc[[query_pos]].values

    # Ask for one extra neighbour because the query itself is normally among
    # the results, but never for more points than the matrix holds.
    n_query = min(n_recommendations + 1, len(titles))
    _, indices = knn.kneighbors(query_vector, n_neighbors=n_query)

    # Remove the query by position instead of assuming it is ranked first:
    # with tied distances it can appear anywhere in the result (or not at all).
    neighbour_positions = [pos for pos in indices.flatten() if pos != query_pos]

    for rank, pos in enumerate(neighbour_positions[:n_recommendations], start=1):
        print(rank, titles[pos])

In [56]:
# Ask for a title interactively and print its nearest neighbours.
# The prompt ends with ": " so the typed title is not glued onto the prompt text.
value = input("Enter a movie: ")
make_recommendation(value)

Enter a movieiron man
1 roger & me
2 the next best thing
3 will penny
4 parenthood
5 escape from the planet of the apes
6 secret window
7 school of rock
8 the last mistress
9 when saturday comes
10 who killed bambi?


#### Compile recommended movies into CSV for Streamlit (this takes a while)

In [None]:
# Disabled draft: the whole function below is wrapped in a string literal, so
# nothing in this cell is defined or executed.
# Intent, as written: for every row of df_movies, collect 11 neighbour titles
# from `knn` (or the title followed by ten zeros when the title is absent from
# user_rating_matrix) and assemble them into a DataFrame with columns Movie, Rec1..Rec10.
# NOTE(review) before re-enabling:
#  - the inner `for i in range(0, 11)` reuses the outer loop variable name `i`;
#  - `data_arr = np.array(temp_data_arr)` is rebuilt on every iteration instead of once at the end;
#  - the "Movie" column is filled from the first neighbour returned, not from `value` itself.
"""
def make_table():

    #knn.fit(sparse_user_rating_matrix)
    data_arr = []
    temp_data_arr = []
    stop = False
    for i in range(df_movies.shape[0]):
      arr = []
      value = df_movies.iloc[i]['title']
      if len(user_rating_matrix[value:value]) == 0:
        arr = np.append(arr, [value,0,0,0,0,0,0,0,0,0,0])
        temp_data_arr.append(arr)
        data_arr = np.array(temp_data_arr)
        stop = True
      if not stop:
        distances, indices = knn.kneighbors(user_rating_matrix[value:value].values.reshape(1, -1), n_neighbors=11)
        # indices = kdt.query(user_rating_matrix[value:value].values.reshape(1, -1), k=11, return_distance=False)
        
        
        for i in range(0, 11):
            arr = np.append(arr, user_rating_matrix.index[indices.flatten()[i]])
        #print(arr)
        temp_data_arr.append(arr)
        data_arr = np.array(temp_data_arr)
      stop = False
    print(data_arr)
    rec_df = pd.DataFrame(data=data_arr, columns=["Movie", "Rec1", "Rec2",  "Rec3",  "Rec4",  "Rec5",  "Rec6",  "Rec7",  "Rec8",  "Rec9",  "Rec10"])
    return rec_df 
"""

In [None]:
# rec_df = make_table()

In [None]:
# rec_df

In [None]:
# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

In [None]:
# rec_df.to_csv('/content/drive/My Drive/Big Data Project/data/MovieRecsKNN.csv', index=False)

#### Cross Validation with SVD

We attempted cross-validation with SVD on the full dataset, but because of its size Google Colab always ran out of RAM, even with only 2 folds.

In [24]:
# Surprise's Reader defaults to rating_scale=(1, 5). The ratings here use
# half-star steps, so declare the scale explicitly; otherwise ratings below 1
# fall outside the range Surprise uses when clipping its predictions.
# NOTE(review): 0.5-5.0 is the usual MovieLens scale - confirm against
# ratings_copy['rating'].min() / .max().
reader = Reader(rating_scale=(0.5, 5.0))

To perform some sort of cross validation, a massive amount of data had to be filtered out. Here we filter to only have ratings by the top 1000 users who made the most reviews.

In [25]:
# Rating counts per user within the filtered ratings, reduced to the 1000 largest
# (ties are resolved in order of appearance, nlargest's default).
top_users = (
    top_ratings
    .groupby('userId')['movieId']
    .count()
    .nlargest(1000)
)

In [26]:
# userIds of the 1000 most active users.
top_users_list = top_users.index

In [27]:
top_users_list

Int64Index([  8659, 179792, 107720,  45811, 229879, 243443, 270123, 172224,
             70648, 165352,
            ...
            269750,  10982,  87058, 100834, 111649, 112804, 144250, 151782,
            165164,  10132],
           dtype='int64', name='userId', length=1000)

In [28]:
# Shrink the ratings to those 1000 users.
# Note: this rebinds `top_ratings`, so cells that read it afterwards see the reduced frame.
from_top_1000 = top_ratings['userId'].isin(top_users_list)
top_ratings = top_ratings.loc[from_top_1000]

In [33]:
top_ratings

Unnamed: 0,userId,movieId,rating,title
14565,229,2,3.0,ariel
14567,229,5,1.0,four rooms
14569,229,12,1.0,finding nemo
14570,229,16,3.0,dancer in the dark
14571,229,19,1.0,metropolis
...,...,...,...,...
22380450,270887,155288,5.0,bela kiss: prologue
22380489,270887,166643,5.0,men without women
22380493,270887,167738,4.0,guy
22380496,270887,168712,5.0,panic in the skies


Perform Cross Validation

In [34]:
# Build the Surprise dataset from the (user, item, rating) columns, in that order.
data = Dataset.load_from_df(
    df=top_ratings.loc[:, ['userId', 'movieId', 'rating']],
    reader=reader,
)

In [35]:
# Seed both sources of randomness so the reported scores can be reproduced:
#  - per the Surprise docs, fold shuffling falls back to numpy's global RNG
#    when no random_state is given, hence np.random.seed;
#  - SVD initialises its factors randomly and, with n_jobs=-1, is fitted in
#    worker processes, so it carries its own random_state.
np.random.seed(42)
svd = SVD(random_state=42)
# 5-fold cross-validation; the returned dict of per-fold RMSE/MAE and timings is the cell output.
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=False, n_jobs=-1)

{'fit_time': (67.41249871253967,
  78.0984890460968,
  77.71211504936218,
  68.86977195739746,
  40.96268439292908),
 'test_mae': array([0.57253178, 0.57209174, 0.5740254 , 0.57400415, 0.57507083]),
 'test_rmse': array([0.75104981, 0.75055775, 0.75148704, 0.75227101, 0.75235047]),
 'test_time': (2.064523220062256,
  3.1859946250915527,
  2.046184539794922,
  2.1975691318511963,
  1.2308039665222168)}

We get a root-mean-square error (RMSE) of approximately 0.75 and a mean absolute error (MAE) of approximately 0.57 across the five folds.