In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

sns.set_style('white')

In [2]:
# Folder holding the CSV files. Defaults to the original author's location;
# set the MOVIE_DATA_DIR environment variable to run the notebook elsewhere.
DATA_DIR = Path(os.environ.get(
    'MOVIE_DATA_DIR',
    'G:/Software/Machine learning/Datasets/Movie Recommendation/New folder',
))

# Load only the id and title columns; movieId is read as int32 rather than
# pandas' default int64.
movies_df = pd.read_csv(
    DATA_DIR / 'movies.csv',
    usecols=['movieId', 'title'],
    dtype={'movieId': 'int32', 'title': 'str'},
)

In [3]:
# Count missing values per column of the movies table.
movies_df.isna().sum()

movieId    0
title      0
dtype: int64

In [4]:
# Peek at the first five movies.
movies_df.head(n=5)

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [5]:
# Folder holding the CSV files. Defaults to the original author's location;
# set the MOVIE_DATA_DIR environment variable to run the notebook elsewhere.
DATA_DIR = Path(os.environ.get(
    'MOVIE_DATA_DIR',
    'G:/Software/Machine learning/Datasets/Movie Recommendation/New folder',
))

# Load only the user, movie and rating columns, with 32-bit dtypes instead of
# pandas' 64-bit defaults.
rating_df = pd.read_csv(
    DATA_DIR / 'ratings.csv',
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'},
)

In [6]:
# Peek at the first five ratings.
rating_df.head(n=5)

Unnamed: 0,userId,movieId,rating
0,1,307,3.5
1,1,481,3.5
2,1,1091,1.5
3,1,1257,4.5
4,1,1449,4.5


In [7]:
# Count missing values per column of the ratings table.
rating_df.isna().sum()

userId     0
movieId    0
rating     0
dtype: int64

In [8]:
# Attach the movie title to every rating. The (default) inner join keeps only
# ratings whose movieId is present in movies_df.
dataset = rating_df.merge(movies_df, how='inner', on='movieId')
dataset.head(n=5)

Unnamed: 0,userId,movieId,rating,title
0,1,307,3.5,Three Colors: Blue (Trois couleurs: Bleu) (1993)
1,6,307,4.0,Three Colors: Blue (Trois couleurs: Bleu) (1993)
2,56,307,4.0,Three Colors: Blue (Trois couleurs: Bleu) (1993)
3,71,307,5.0,Three Colors: Blue (Trois couleurs: Bleu) (1993)
4,84,307,3.0,Three Colors: Blue (Trois couleurs: Bleu) (1993)


In [9]:
# Drop any rating rows that have no title.
combine_movie_rating = dataset.dropna(subset=['title'], axis=0)

# One row per title with the number of ratings that title received.
movie_ratingCount = (
    combine_movie_rating
    .groupby('title')['rating']
    .count()
    .rename('totalRatingCount')
    .reset_index()
)
movie_ratingCount.head(n=5)

Unnamed: 0,title,totalRatingCount
0,"""Great Performances"" Cats (1998)",8
1,#Horror (2015),1
2,#realityhigh (2017),1
3,$ (Dollars) (1971),1
4,$5 a Day (2008),2


In [10]:
# Put each title's rating count on every individual rating row.
rating_with_totalRatingCount = pd.merge(
    combine_movie_rating,
    movie_ratingCount,
    on='title',
    how='left',
)
rating_with_totalRatingCount.head(n=5)

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,307,3.5,Three Colors: Blue (Trois couleurs: Bleu) (1993),332
1,6,307,4.0,Three Colors: Blue (Trois couleurs: Bleu) (1993),332
2,56,307,4.0,Three Colors: Blue (Trois couleurs: Bleu) (1993),332
3,71,307,5.0,Three Colors: Blue (Trois couleurs: Bleu) (1993),332
4,84,307,3.0,Three Colors: Blue (Trois couleurs: Bleu) (1993),332


In [11]:
# Show floats with three decimals from here on (notebook-wide display option).
pd.options.display.float_format = lambda value: '%.3f' % value

# Summary statistics of the number of ratings per title.
print(movie_ratingCount.loc[:, 'totalRatingCount'].describe())

count   22147.000
mean       47.346
std       166.740
min         1.000
25%         1.000
50%         4.000
75%        20.000
max      3606.000
Name: totalRatingCount, dtype: float64


In [12]:
# Minimum number of ratings a title needs to be kept.
popularity_threshold = 50

# Keep only the ratings of titles that reach the threshold.
rating_popular_movie = rating_with_totalRatingCount.loc[
    rating_with_totalRatingCount['totalRatingCount'].ge(popularity_threshold)
]
rating_popular_movie.head(n=5)

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,307,3.5,Three Colors: Blue (Trois couleurs: Bleu) (1993),332
1,6,307,4.0,Three Colors: Blue (Trois couleurs: Bleu) (1993),332
2,56,307,4.0,Three Colors: Blue (Trois couleurs: Bleu) (1993),332
3,71,307,5.0,Three Colors: Blue (Trois couleurs: Bleu) (1993),332
4,84,307,3.0,Three Colors: Blue (Trois couleurs: Bleu) (1993),332


In [13]:
# (rows, columns) of the ratings that remain after the popularity filter.
rating_popular_movie.shape

(915426, 5)

In [14]:
# Title x user rating matrix: one row per title, one column per userId.
# pivot_table averages duplicate (title, userId) pairs by default; missing
# ratings are filled with 0.
movie_features_df = (
    rating_popular_movie
    .pivot_table(values='rating', index='title', columns='userId')
    .fillna(0)
)
movie_features_df.head(n=5)

userId,1,2,3,4,5,6,7,8,9,10,...,10523,10524,10525,10526,10527,10528,10529,10530,10531,10532
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(500) Days of Summer (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
*batteries not included (1987),0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Cloverfield Lane (2016),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.5,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0


In [15]:
# Store the title x user matrix in compressed sparse row form.
movie_features_df_matrix = csr_matrix(
    movie_features_df.values
)

# Brute-force nearest-neighbour search using cosine distance between titles.
model_knn = NearestNeighbors(
    algorithm='brute',
    metric='cosine',
)
model_knn.fit(movie_features_df_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [16]:
# Fixed seed so that a fresh "Restart & Run All" picks the same title every
# time; change RANDOM_SEED to sample a different one.
RANDOM_SEED = 42
rng = np.random.RandomState(RANDOM_SEED)

# Row position (not movieId) of the title to get recommendations for.
query_index = rng.choice(movie_features_df.shape[0])
print(query_index)

# kneighbors expects a 2-D array, so the single row is reshaped to (1, n_users).
# Six neighbours are requested; the query title itself is expected to be among
# them, which leaves five recommendations.
query_vector = movie_features_df.iloc[query_index, :].values.reshape(1, -1)
distances, indices = model_knn.kneighbors(query_vector, n_neighbors=6)

1568


In [17]:
# Flatten the (1, k) result arrays once instead of on every loop iteration.
neighbor_positions = indices.flatten()
neighbor_distances = distances.flatten()

print('Recommendations for {0}:\n'.format(movie_features_df.index[query_index]))

# The query title normally comes back as its own nearest neighbour, but ties
# (e.g. titles with identical rating vectors) can change the order. Exclude it
# by row position instead of assuming it is at rank 0, and keep k - 1 results
# so the number of recommendations is unchanged.
recommendations = [
    (position, distance)
    for position, distance in zip(neighbor_positions, neighbor_distances)
    if position != query_index
][:len(neighbor_positions) - 1]

for rank, (position, distance) in enumerate(recommendations, start=1):
    print('{0}: {1}, with distance of {2}:'.format(rank, movie_features_df.index[position], distance))

Recommendations for Jack Reacher (2012):

1: Knight and Day (2010), with distance of 0.5485837459564209:
2: Oblivion (2013), with distance of 0.55196213722229:
3: Mission: Impossible - Ghost Protocol (2011), with distance of 0.5547822713851929:
4: Red (2010), with distance of 0.5669859647750854:
5: Total Recall (2012), with distance of 0.5734819173812866:
