In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import seaborn as sns

In [185]:
movies = pd.read_csv('D:/RISE - WPU/Internship/archive (1)/ratings_small.csv')
movies

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [186]:
#Transforming data into matrix where each row represents the user and column represents movie

users = movies.userId.unique().shape[0]
movie = movies.movieId.unique().shape[0]
movie = movies['movieId'].max()
num = np.zeros((users,movie))
for line in movies.itertuples():  # The loop iterates thrugh each row of dataframe and extracts ratings from user-movie matrix
    num[line[1]-1,line[2]-1] = line[3] 
print("Original Rating Matrix: ",num)

Original Rating Matrix:  [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [5. 0. 0. ... 0. 0. 0.]]


# The dataset consists of ratings from 1 to 5, where 1 represents the lowest rating & 5 represents the highest. Different ratings could have different meanings to users. For instance, a rating of 3 might be good for one user while average for another user.
# To solve this ambiguity, big giants such as Netflix or YouTube have moved to binary ratings. Therefore, we will work on binary ratings instead of continuous ratings.

In [187]:
# This code converts the dataset to a binary dataset. Here only those item are considered whose ratings are greater or equal to 3 being liked by the user and others being disliked by the user. 
#As we are only considerate about the liking of users, making ratings less than 3 as 0 would not impact the recommendation process.

for i in range(len(num)):
    for j in range(len(num[0])):
        if num[i][j]>=3:
            num[i][j]=1
        else:
            num[i][j]=0

In [188]:
# We are converting dense rating matrix to a sparse matrix using csr_matrix function.

sample = csr_matrix(num)
print(sample)

  (0, 1028)	1.0
  (0, 1060)	1.0
  (0, 1171)	1.0
  (0, 1338)	1.0
  (0, 1952)	1.0
  (0, 2104)	1.0
  (0, 2149)	1.0
  (0, 3670)	1.0
  (1, 9)	1.0
  (1, 16)	1.0
  (1, 38)	1.0
  (1, 46)	1.0
  (1, 49)	1.0
  (1, 51)	1.0
  (1, 61)	1.0
  (1, 109)	1.0
  (1, 143)	1.0
  (1, 149)	1.0
  (1, 152)	1.0
  (1, 160)	1.0
  (1, 164)	1.0
  (1, 167)	1.0
  (1, 184)	1.0
  (1, 185)	1.0
  (1, 207)	1.0
  :	:
  (670, 4033)	1.0
  (670, 4305)	1.0
  (670, 4307)	1.0
  (670, 4879)	1.0
  (670, 4885)	1.0
  (670, 4895)	1.0
  (670, 4962)	1.0
  (670, 4972)	1.0
  (670, 4992)	1.0
  (670, 4994)	1.0
  (670, 5298)	1.0
  (670, 5348)	1.0
  (670, 5376)	1.0
  (670, 5444)	1.0
  (670, 5463)	1.0
  (670, 5668)	1.0
  (670, 5815)	1.0
  (670, 5901)	1.0
  (670, 5951)	1.0
  (670, 5988)	1.0
  (670, 5990)	1.0
  (670, 5994)	1.0
  (670, 6268)	1.0
  (670, 6364)	1.0
  (670, 6564)	1.0


In [189]:
# Computing similarity between movies of sample using cosine similarity.

knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=3, n_jobs=-1)
knn.fit(sample)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=3, p=2, radius=1.0)

In [209]:
# Here we are finding the movies liked by the person whose userId = 2.

movies_sort_des = movies.sort_values(['userId', 'timestamp'])
filter1 = movies_sort_des[movies_sort_des['userId'] == 2].movieId
filter1 = filter1.tolist()
filter1 = filter1[:20]
print("Movies liked by user: ",filter1)

Movies liked by user:  [150, 296, 590, 592, 153, 165, 349, 588, 292, 339, 10, 161, 185, 208, 253, 457, 593, 110, 300, 410]


In [210]:
# Based on the likes of the userId 2 the following code depicts that what movies should be recommended to that user.

distances1=[]
indices1=[]
for i in filter1:
    distances , indices = knn.kneighbors(sample[i],n_neighbors=3)
    indices = indices.flatten()
    indices= indices[1:]
    indices1.extend(indices)
print("Movies to be recommended: ",indices1)

Movies to be recommended:  [368, 399, 566, 427, 215, 129, 160, 287, 360, 99, 561, 294, 597, 561, 567, 143, 670, 123, 123, 115, 137, 437, 17, 255, 455, 151, 173, 592, 589, 84, 281, 123, 341, 33, 561, 653, 28, 563, 453, 26]
