In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import seaborn as sns

In [27]:
movies = pd.read_csv("./data/movies.csv")
ratings = pd.read_csv("./data/ratings.csv")
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
data_clean = ratings.pivot(index='movieId',columns='userId',values='rating')
data_clean.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,,,4.0,,4.5,,,,...,4.0,,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,,,,,,4.0,,4.0,,,...,,4.0,,5.0,3.5,,,2.0,,
3,4.0,,,,,5.0,,,,,...,,,,,,,,2.0,,
4,,,,,,3.0,,,,,...,,,,,,,,,,
5,,,,,,5.0,,,,,...,,,,3.0,,,,,,


In [10]:
data_clean.fillna(0,inplace=True)
data_clean.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# to make the data more stable and reliable we take 
#  a min of 10 users must vote for a movie

no_user_voted = ratings.groupby('movieId')['rating'].agg('count')
no_movies_voted = ratings.groupby('userId')['rating'].agg('count')

In [18]:
data_clean = data_clean.loc[no_user_voted[no_user_voted > 10].index,:]

In [19]:
data_preprocessed = data_clean.loc[:,no_movies_voted[no_movies_voted > 50].index]

In [21]:
csr_data = csr_matrix(data_preprocessed.values)
data_preprocessed.reset_index(inplace=True)

In [22]:
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
knn.fit(csr_data)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=20, p=2,
                 radius=1.0)

In [38]:
def recommend_movie(movie_name):
    no_movies_to_recommend = 10
    movie = movies[movies['title'].str.contains(movie_name)]  
    if len(movie):
        movie_ids = movie.iloc[0]['movieId']
        movie_ids = data_preprocessed[data_preprocessed['movieId'] == movie_ids].index[0]
        dist,indices = knn.kneighbors(csr_data[movie_ids],n_neighbors = no_movies_to_recommend+1)
        rec_movie_indices = sorted(list(zip(indices.squeeze().tolist(),dist.squeeze().tolist())),key=lambda x: x[1])[:0:-1]
        recommend_frame=[]
        for val in rec_movie_indices:
            movie_ids = data_preprocessed.iloc[val[0]]['movieId']
            ids = movies[movies['movieId'] == movie_ids].index
            recommend_frame.append({'Title':movies.iloc[ids]['title'].values[0],'Distance':val[1]})
        df = pd.DataFrame(recommend_frame,index=range(1,no_movies_to_recommend+1))
        return df
    else:
        return "No Movies Found. Please check your I/P"

In [39]:
# call recommend movie with a movie name to recommend similar movies

Unnamed: 0,Title,Distance
1,X-Men: First Class (2011),1.111104e-07
2,Guardians of the Galaxy (2014),1.106718e-07
3,District 9 (2009),1.085074e-07
4,Sherlock Holmes (2009),1.077268e-07
5,Kung Fu Panda (2008),1.066761e-07
6,Watchmen (2009),1.061401e-07
7,Star Trek (2009),1.047793e-07
8,Iron Man 2 (2010),9.618056e-08
9,Avatar (2009),9.313901e-08
10,"Avengers, The (2012)",8.755834e-08


In [42]:
recommend_movie('Iron Man 2')

Unnamed: 0,Title,Distance
1,Captain America: The First Avenger (2011),2.986136e-08
2,Man of Steel (2013),2.94199e-08
3,X-Men: Days of Future Past (2014),2.935479e-08
4,Star Wars: Episode VII - The Force Awakens (2015),2.935363e-08
5,Ant-Man (2015),2.891066e-08
6,"Amazing Spider-Man, The (2012)",2.88358e-08
7,Captain America: The Winter Soldier (2014),2.751788e-08
8,Guardians of the Galaxy (2014),2.65233e-08
9,Iron Man 3 (2013),2.4604e-08
10,X-Men: First Class (2011),2.287212e-08
