### Data Preprocessing

In [5]:
import pandas as pd
import numpy as np

In [6]:
movies_df = pd.read_csv("DataSet\movies.csv")
movies_df.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
rating_df = pd.read_csv("DataSet/ratings.csv")
rating_df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [8]:
df = pd.merge(rating_df, movies_df, on = "movieId")
df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [9]:
ratingCount = df.groupby(by=["title"])['rating'].count().reset_index().rename(columns = {'rating': 'totalRatingCount'})
ratingCount.head(5)

Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [10]:
movies_withTotalCount = pd.merge(df,ratingCount, on = "title", how = "left") 
movies_withTotalCount.head(5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,totalRatingCount
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215


In [11]:
ratingCountThreshold = 40
movies_withTotalCount = movies_withTotalCount.query("totalRatingCount >= @ratingCountThreshold")
movies_withTotalCount.head(5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,totalRatingCount
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215


In [12]:
pivot_movie_matrix = movies_withTotalCount.pivot_table(index = "movieId", columns = "userId", values = "rating").fillna(0)
pivot_movie_matrix

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122882,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
122886,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5
122904,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
134130,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5


In [13]:
query_index = np.random.choice(pivot_movie_matrix.shape[0])
movie_to_query = pivot_movie_matrix.iloc[query_index,:].values.reshape(1, -1)
movie_to_query.shape

(1, 608)

In [14]:
movie_id_indices = pd.Series(pivot_movie_matrix.index)
movie_id_indices

0           1
1           2
2           3
3           5
4           6
        ...  
634    122882
635    122886
636    122904
637    134130
638    134853
Name: movieId, Length: 639, dtype: int64

In [15]:
def getMovieNameFromIndexForKNN(index):
    movieId = movie_id_indices[index]
    movieName = movies_withTotalCount[movies_withTotalCount["movieId"] == movieId]["title"].iloc[0]
    return movieName

In [16]:
from scipy.sparse import csr_matrix
movie_feature_matrix = csr_matrix(pivot_movie_matrix.values)
movie_feature_matrix.shape

(639, 608)

### Using KNN

In [17]:
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(movie_feature_matrix)
distances, indices = model_knn.kneighbors(movie_to_query, n_neighbors = 5)

for i in range(0, len(distances.flatten())):
    if i == 0:
        print(f'Recommendations for {getMovieNameFromIndexForKNN(query_index)} are following\n')
    else:
        print(f'{getMovieNameFromIndexForKNN(indices.flatten()[i])}  with distance { distances.flatten()[i]} ')

Recommendations for Me, Myself & Irene (2000) are following

Road Trip (2000)  with distance 0.41052354658751977 
Scary Movie (2000)  with distance 0.4983537712299012 
Zoolander (2001)  with distance 0.5188291794996924 
Kingpin (1996)  with distance 0.545651680132613 


In [18]:
pivot_movie_matrix = pivot_movie_matrix.T
pivot_movie_matrix

movieId,1,2,3,5,6,7,10,11,16,17,...,109487,111759,112852,115617,116797,122882,122886,122904,134130,134853
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,2.5,0.0,2.5,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Using Correlation

In [19]:
def getMovieNameFromIndexForCorr(movieId):
    movieName = movies_withTotalCount[movies_withTotalCount["movieId"] == movieId]["title"].iloc[0]
    return movieName

In [20]:
similar_movie = pivot_movie_matrix.corrwith(pivot_movie_matrix.iloc[:,query_index])
similar_movie = pd.DataFrame(similar_movie, columns=['Pearson Corr']).reset_index()
corr_summary = similar_movie.join(ratingCount['totalRatingCount'])
corr_summary = corr_summary[corr_summary['totalRatingCount']>=40].sort_values('Pearson Corr', ascending=False).head(10)
corr_summary

Unnamed: 0,movieId,Pearson Corr,totalRatingCount
332,2355,0.328425,45
428,3948,0.319179,45
24,104,0.305642,44
435,4018,0.276528,59
298,2005,0.27207,183
563,46976,0.271068,107
553,40815,0.259248,45
405,3489,0.255785,204
285,1917,0.253594,58
8,16,0.241416,42


In [21]:

for i in range(0, 10):
    if i == 0:
        print(f'Recommendations for {getMovieNameFromIndexForCorr(corr_summary["movieId"].iloc[i])} are following\n')
    else:
        print(f'{getMovieNameFromIndexForCorr(corr_summary["movieId"].iloc[i])}  with Correlation {corr_summary["Pearson Corr"].iloc[i]} ')

Recommendations for Bug's Life, A (1998) are following

Meet the Parents (2000)  with Correlation 0.319179480340085 
Happy Gilmore (1996)  with Correlation 0.30564168809384745 
What Women Want (2000)  with Correlation 0.27652805211633336 
Goonies, The (1985)  with Correlation 0.27207038780387344 
Stranger than Fiction (2006)  with Correlation 0.27106842653634095 
Harry Potter and the Goblet of Fire (2005)  with Correlation 0.2592479613348246 
Hook (1991)  with Correlation 0.2557845145897293 
Armageddon (1998)  with Correlation 0.25359419940025424 
Casino (1995)  with Correlation 0.24141611426375129 
