In [1]:
import pandas as pd
import numpy as np

In [4]:
movies_df=pd.read_csv('ml-25m/movies.csv', usecols=['movieId','title'], dtype={'movirsId':'int32','title':'str'})
movies_df.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [7]:
ratings_df=pd.read_csv('ml-25m/ratings.csv',
    usecols=['userId', 'movieId', 'rating','timestamp'],dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [9]:
#checking for null 
movies_df.isnull().sum()

movieId    0
title      0
dtype: int64

In [10]:
ratings_df.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [12]:
#merging
movies_merged_df=movies_df.merge(ratings_df, on='movieId')
movies_merged_df.head()

Unnamed: 0,movieId,title,userId,rating,timestamp
0,1,Toy Story (1995),2,3.5,1141415820
1,1,Toy Story (1995),3,4.0,1439472215
2,1,Toy Story (1995),4,3.0,1573944252
3,1,Toy Story (1995),5,4.0,858625949
4,1,Toy Story (1995),8,4.0,890492517


In [13]:
movies_merged_df=movies_merged_df.dropna(axis = 0, subset = ['title'])
movies_merged_df.head()

Unnamed: 0,movieId,title,userId,rating,timestamp
0,1,Toy Story (1995),2,3.5,1141415820
1,1,Toy Story (1995),3,4.0,1439472215
2,1,Toy Story (1995),4,3.0,1573944252
3,1,Toy Story (1995),5,4.0,858625949
4,1,Toy Story (1995),8,4.0,890492517


In [15]:
movies_average_rating=movies_merged_df.groupby('title')['rating'].mean().sort_values(ascending=False).reset_index().rename(columns={'rating':'Average Rating'})
movies_average_rating.head()


Unnamed: 0,title,Average Rating
0,Full of Grace (2015),5.0
1,Geordie (1955),5.0
2,Garfield In Paradise (1986),5.0
3,Lost Woods (2012),5.0
4,Garfield in the Rough (1984),5.0


In [17]:
movies_rating_count=movies_merged_df.groupby('title')['rating'].count().sort_values(ascending=True).reset_index().rename(columns={'rating':'Rating Count'}) #ascending=False
movies_rating_count_avg=movies_rating_count.merge(movies_average_rating, on='title')
movies_rating_count_avg.head()

Unnamed: 0,title,Rating Count,Average Rating
0,"""BLOW THE NIGHT!"" Let's Spend the Night Togeth...",1,3.0
1,On Trial (1954),1,3.5
2,Checkmate,1,3.0
3,On Top (1982),1,3.5
4,On Reflection: B.S. Johnson on Dr. Samuel John...,1,4.5


In [18]:
rating_with_RatingCount = movies_merged_df.merge(movies_rating_count, left_on = 'title', right_on = 'title', how = 'left')
rating_with_RatingCount.head()

Unnamed: 0,movieId,title,userId,rating,timestamp,Rating Count
0,1,Toy Story (1995),2,3.5,1141415820,57309
1,1,Toy Story (1995),3,4.0,1439472215,57309
2,1,Toy Story (1995),4,3.0,1573944252,57309
3,1,Toy Story (1995),5,4.0,858625949,57309
4,1,Toy Story (1995),8,4.0,890492517,57309


In [19]:
pd.set_option('display.float_format',lambda x: '%.3f' % x)
print(rating_with_RatingCount['Rating Count'].describe())

count   25000095.000
mean       14925.355
std        16439.336
min            1.000
25%         2986.000
50%         9152.000
75%        20757.000
max        81491.000
Name: Rating Count, dtype: float64


In [21]:
popularity_threshold = 50
popular_movies = rating_with_RatingCount[rating_with_RatingCount['Rating Count']>=popularity_threshold]
popular_movies.head()

Unnamed: 0,movieId,title,userId,rating,timestamp,Rating Count
0,1,Toy Story (1995),2,3.5,1141415820,57309
1,1,Toy Story (1995),3,4.0,1439472215,57309
2,1,Toy Story (1995),4,3.0,1573944252,57309
3,1,Toy Story (1995),5,4.0,858625949,57309
4,1,Toy Story (1995),8,4.0,890492517,57309


In [25]:
import os
movie_features_df=popular_movies.pivot_table(index='title',columns='userId',values='rating').fillna(0)
movie_features_df.head()

#movie_features_df.to_excel('output.xlsx')

userId,1,2,3,4,5,6,7,8,9,10,...,162532,162533,162534,162535,162536,162537,162538,162539,162540,162541
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Great Performances"" Cats (1998)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$9.99 (2008),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
from scipy.sparse import csr_matrix
movie_features_df_matrix =csr_matrix(movie_features_df.values)

In [31]:
from sklearn.neighbors import NearestNeighbors
model_knn = NearestNeighbors(metric = 'cosine', algorithm ='brute')
model_knn.fit(movie_features_df_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [32]:
movie_features_df.shape

(13176, 162540)

In [33]:
query_index = np.random.choice(movie_features_df.shape[0])
print(query_index)
distances, indices = model_knn.kneighbors(movie_features_df.iloc[query_index,:].values.reshape(1, -1), n_neighbors = 6)


5458


In [34]:
movie_features_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,162532,162533,162534,162535,162536,162537,162538,162539,162540,162541
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Great Performances"" Cats (1998)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$9.99 (2008),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
for i in range(0, len(distances.flatten())):
    if i ==0:
        print('Recommendation for {0}:\n'.format(movie_features_df.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, movie_features_df.index[indices.flatten()[i]], distances.flatten()[i]))
        

Recommendation for House of the Dead, The (2003):

1: Jason X (2002), with distance of 0.7672885060310364:
2: Freddy vs. Jason (2003), with distance of 0.778326690196991:
3: Jason Goes to Hell: The Final Friday (1993), with distance of 0.7820399403572083:
4: Darkness Falls (2003), with distance of 0.7997773885726929:
5: Alone in the Dark (2005), with distance of 0.8002175688743591:
