In [1]:
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

from pprint import pprint

from joblib import Parallel, delayed
import joblib

In [2]:
refined_dataset = pd.read_csv('IMDbRefined2old.csv')

knn_model = joblib.load('IMDbModel.h5')

In [23]:
refined_dataset.head(1)

KeyError: 0

In [4]:
refined_dataset = refined_dataset.drop('Unnamed: 0', axis=1)

In [5]:
newTitleID = "tt8946378"    
newUserID = "641c87d06a97e629837fc079"
newPrimaryTitle = "Knives Out"
newRating = "8.0"
row = [newTitleID, newUserID, newPrimaryTitle, newRating]

In [6]:
refined_dataset.loc[len(refined_dataset)] = row

In [7]:
movieUser_df = refined_dataset.pivot_table(
    index='userID',
     columns='titleID',
      ## Replacing all movies users haven't rated with a rating of 0
      values='rating').fillna(0)

In [8]:
movie_list = movieUser_df.columns
movieUser_scipy_df = csr_matrix(movieUser_df.values)

In [9]:
knn_model.fit(movieUser_scipy_df)

NearestNeighbors(algorithm='brute', metric='cosine')

In [10]:
simUsers = []
userDistances = []
highestMovies = []
recommendedMovies = []

In [11]:
def similar_users(user, n = 5):
    knn_input = np.asarray([movieUser_df.values[user-1]])
    distances, indices = knn_model.kneighbors(knn_input, n_neighbors=n+1)
        
    for i in range(1,len(distances[0])):
        simUsers.append(indices[0][i]+1)
        userDistances.append(distances[0][i])
    return indices.flatten()[1:] + 1, distances.flatten()[1:]

In [12]:
def recommend_movies(n):
    n = min(len(mean_ratings_list),n)
    recommendedMovies.append(list(movie_list[np.argsort(mean_ratings_list)[::-1][:n]]))
    return recommendedMovies

In [13]:
target_user_id = "641c87d06a97e629837fc079"
target_user = np.where(movieUser_df.index == target_user_id)[0][0]
no_of_highest = 5
no_of_similar_users = 5
no_of_movies = 5

In [14]:
movieUser_df

titleID,tt0034583,tt0050083,tt0054215,tt0060196,tt0062622,tt0066921,tt0068646,tt0071562,tt0071853,tt0073195,...,tt5463162,tt6320628,tt6644200,tt6723592,tt6751668,tt6966692,tt7131622,tt7286456,tt8579674,tt8946378
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
641c87d06a97e629837fc079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0
ur0000011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ur0000039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0
ur0000066,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ur0000157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ur99955002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ur99964320,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0
ur99965244,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0
ur99966337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
target_user

0

In [16]:
highestMovies.append(list(refined_dataset[refined_dataset['userID'] == target_user_id].sort_values('rating', ascending=False)['titleID'])[:no_of_highest])

In [17]:
similar_user_list, distance_list = similar_users(target_user, no_of_similar_users)
weighted_list = distance_list/np.sum(distance_list)
similar_user_ratings = movieUser_df.values[similar_user_list]
weighted_list = weighted_list[:,np.newaxis] + np.zeros(len(movie_list))
ratings_matrix = weighted_list*similar_user_ratings
mean_ratings_list = ratings_matrix.sum(axis =0)

In [18]:
recommend_movies(no_of_movies)

[['tt6751668', 'tt1856101', 'tt2802144', 'tt7131622', 'tt8579674']]

In [19]:
dict1 = {'Users_Top_Movies': highestMovies, 'Similar_Users': simUsers, 'Sim_User_distances': userDistances, 'Recommendatons': recommendedMovies}

In [20]:
dict1

{'Users_Top_Movies': [['tt8946378']],
 'Similar_Users': [20530, 20758, 15859, 18353, 4107],
 'Sim_User_distances': [0.003765914599902831,
  0.0049608789949019405,
  0.005120760520097245,
  0.005155903583077492,
  0.005155908224726002],
 'Recommendatons': [['tt6751668',
   'tt1856101',
   'tt2802144',
   'tt7131622',
   'tt8579674']]}

In [21]:
list(movie_list[np.argsort(mean_ratings_list)[::-1][:5]])

['tt6751668', 'tt1856101', 'tt2802144', 'tt7131622', 'tt8579674']

In [22]:
refined_dataset.to_csv('testest.csv')

PermissionError: [Errno 13] Permission denied: 'testest.csv'

In [None]:
testest

In [None]:
testest = pd.read_csv('testest.csv')