In [10]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# Get the Cleaned Data

In [2]:
final_dataset = pd.read_csv('final_dataset.csv')

In [3]:
final_dataset = final_dataset.rename(columns={final_dataset.columns[0]: 'userID'})

In [4]:
final_dataset

Unnamed: 0,userID,movieId,1,4,6,7,10,11,15,16,...,600,601,602,603,604,605,606,607,608,610
0,0,1,4.0,0.0,0.0,4.5,0.0,0.0,2.5,0.0,...,2.5,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,5.0
1,1,2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0
2,2,3,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
3,3,5,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,...,2.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
4,4,6,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2116,2116,174055,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2117,2117,176371,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2118,2118,177765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2119,2119,179819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
csr_data = csr_matrix(final_dataset.values)
final_dataset.reset_index(inplace=True)

In [6]:
movies = pd.read_csv("Data/movies.csv")

# KNN

In [7]:
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
knn.fit(csr_data)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=20)

In [8]:
def get_movie_recommendation(movie_name, model):
    n_movies_to_reccomend = 10
    movie_list = movies[movies['title'].str.contains(movie_name)]  
    if len(movie_list):        
        movie_idx = movie_list.iloc[0]['movieId']
        movie_idx = final_dataset[final_dataset['movieId'] == movie_idx].index[0]
        distances, indices = model.kneighbors(csr_data[movie_idx],n_neighbors=n_movies_to_reccomend+1)    
        rec_movie_indices = sorted(list(zip(indices.squeeze().tolist(),distances.squeeze().tolist())),key=lambda x: x[1])[:0:-1]
        recommend_frame = []
        for val in rec_movie_indices:
            movie_idx = final_dataset.iloc[val[0]]['movieId']
            idx = movies[movies['movieId'] == movie_idx].index
            recommend_frame.append({'Title':movies.iloc[idx]['title'].values[0],'Distance':val[1]})
        df = pd.DataFrame(recommend_frame,index=range(1,n_movies_to_reccomend+1))
        return df
    else:
        return "No movies found. Please check your input"

# Test the Model

In [9]:
model = knn
get_movie_recommendation('Iron Man', model)

Unnamed: 0,Title,Distance
1,Wanted (2008),1.638573e-07
2,"Chronicles of Narnia: Prince Caspian, The (2008)",1.619484e-07
3,Harold & Kumar Escape from Guantanamo Bay (2008),1.603542e-07
4,Forgetting Sarah Marshall (2008),1.592076e-07
5,Hancock (2008),1.551593e-07
6,"Incredible Hulk, The (2008)",1.442738e-07
7,Indiana Jones and the Kingdom of the Crystal S...,1.414663e-07
8,WALL·E (2008),1.397953e-07
9,Taken (2008),1.339826e-07
10,Kung Fu Panda (2008),1.164248e-07


# Save and Load Model

In [11]:
filename = 'model_knn.sav'
joblib.dump(model, filename)

['model_knn.sav']

In [None]:
loaded_model = joblib.load(filename)
result = get_movie_recommendation('Iron Man', loaded_model)
print(result)