In [1]:
# Import Python libraries
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

In [2]:
# Load the horror-movie metadata and keep only the two columns the analysis uses:
# the movie identifier (join key) and the title (display label).
metadata = pd.read_csv("Data/Horror.csv", low_memory=False)[['movieId', 'title']]
metadata.head()

Unnamed: 0,movieId,title
0,12,Dracula: Dead and Loving It
1,22,Copycat
2,70,From Dusk Till Dawn
3,92,Mary Reilly
4,93,Vampire in Brooklyn


In [3]:
# Number of non-null movieId values in the metadata, i.e. how many movies the file lists by ID
metadata['movieId'].count()

5555

In [4]:
# Read the movie ratings file and keep only the columns used downstream.
ratings = pd.read_csv("Data/ratings.csv")[['userId', 'movieId', 'rating']]

# Make sure both ID columns are numeric so they can be merged on.
# NOTE: pd.to_numeric is called with its default error handling, so a value that
# cannot be parsed raises instead of being turned into NaN.
ratings['movieId'] = pd.to_numeric(ratings['movieId'])
ratings['userId'] = pd.to_numeric(ratings['userId'])
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,307,3.5
1,1,481,3.5
2,1,1091,1.5
3,1,1257,4.5
4,1,1449,4.5


In [5]:
# Non-null count for every ratings column. The data will need to be reduced so the code runs faster.
ratings.count()

userId     27753444
movieId    27753444
rating     27753444
dtype: int64

In [6]:
# Number of distinct movieId values that appear in the ratings
ratings.movieId.nunique()

53889

In [7]:
# Count how many ratings each movie received: one row per movieId with its
# total in the `movie_rating_count` column.
movie_rating_count = (
    ratings.groupby('movieId')['rating']
    .count()
    .rename('movie_rating_count')
    .reset_index()
)
movie_rating_count.head()

Unnamed: 0,movieId,movie_rating_count
0,1,68469
1,2,27143
2,3,15585
3,4,2989
4,5,15474


In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) of the number of ratings per movie ID
movie_rating_count['movie_rating_count'].describe()

count    53889.000000
mean       515.011301
std       2934.758939
min          1.000000
25%          2.000000
50%          7.000000
75%         48.000000
max      97999.000000
Name: movie_rating_count, dtype: float64

In [9]:
# Count how many ratings each user submitted: one row per userId with its
# total in the `user_rating_count` column.
# (The stray bare `user_rating_count` expression that used to sit here was a
# no-op, because only the last expression of a cell is displayed.)
user_rating_count = (
    ratings.groupby('userId')['rating']
    .count()
    .rename('user_rating_count')
    .reset_index()
)
user_rating_count.head()

Unnamed: 0,userId,user_rating_count
0,1,16
1,2,15
2,3,11
3,4,736
4,5,72


In [10]:
# Summary statistics (count, mean, std, quartiles, min/max) of the number of ratings per user
user_rating_count['user_rating_count'].describe()

count    283228.000000
mean         97.989761
std         212.760722
min           1.000000
25%          15.000000
50%          30.000000
75%          95.000000
max       23715.000000
Name: user_rating_count, dtype: float64

In [11]:
# Attach the per-movie and per-user rating totals to every rating row.
# The three base columns are re-selected first so the cell is idempotent:
# without that, re-running it would merge onto the already-augmented frame
# and pandas would emit suffixed duplicates (`movie_rating_count_x`, `_y`, ...).
ratings = (
    ratings[['userId', 'movieId', 'rating']]
    .merge(movie_rating_count, on='movieId', how='left')
    .merge(user_rating_count, on='userId', how='left')
)
ratings.head()

Unnamed: 0,userId,movieId,rating,movie_rating_count,user_rating_count
0,1,307,3.5,7958,16
1,1,481,3.5,6037,16
2,1,1091,1.5,6138,16
3,1,1257,4.5,5902,16
4,1,1449,4.5,6867,16


In [12]:
# Left-join the ratings onto the horror metadata: every movie listed in the
# metadata is kept and gets one row per rating it received.
# NOTE(review): userId/rating display as floats after this join, presumably
# because movies without any rating contribute NaN rows; confirm.
matrix_input = metadata.merge(ratings, how='left', on='movieId')
matrix_input.head()

Unnamed: 0,movieId,title,userId,rating,movie_rating_count,user_rating_count
0,12,Dracula: Dead and Loving It,8.0,3.0,4524.0,31.0
1,12,Dracula: Dead and Loving It,19.0,3.0,4524.0,262.0
2,12,Dracula: Dead and Loving It,134.0,3.0,4524.0,1208.0
3,12,Dracula: Dead and Loving It,158.0,4.0,4524.0,60.0
4,12,Dracula: Dead and Loving It,214.0,2.0,4524.0,1616.0


In [13]:
# Number of distinct titles in the merged frame
# NOTE(review): this is lower than the movieId count shown earlier, so some titles presumably map to several IDs; confirm
matrix_input.title.nunique()

5222

In [14]:
# Rank the horror movies by how many ratings they received, to find the cut-off
# for a top-500 sample: one row per movieId, most-rated first.
unique = (
    matrix_input
    .drop_duplicates(subset=['movieId'])
    .sort_values(by=['movie_rating_count'], ascending=False)
)
unique.head(5)

Unnamed: 0,movieId,title,userId,rating,movie_rating_count,user_rating_count
127942,593,The Silence of the Lambs,4.0,4.5,87899.0,736.0
1041071,2762,The Sixth Sense,4.0,5.0,52270.0,736.0
314975,1214,Alien,4.0,2.0,39282.0,736.0
280403,1200,Aliens,4.0,3.0,34572.0,736.0
398219,1258,The Shining,10.0,5.0,32129.0,121.0


In [15]:
# Keep only the rating rows of movies with strictly more than 478 ratings.
# NOTE(review): the comment in the ranking cell says ">= 478"; confirm which
# comparison is intended.
matrix_input_filtered = matrix_input[matrix_input['movie_rating_count'].gt(478)]
matrix_input_filtered.head()

Unnamed: 0,movieId,title,userId,rating,movie_rating_count,user_rating_count
0,12,Dracula: Dead and Loving It,8.0,3.0,4524.0,31.0
1,12,Dracula: Dead and Loving It,19.0,3.0,4524.0,262.0
2,12,Dracula: Dead and Loving It,134.0,3.0,4524.0,1208.0
3,12,Dracula: Dead and Loving It,158.0,4.0,4524.0,60.0
4,12,Dracula: Dead and Loving It,214.0,2.0,4524.0,1616.0


In [16]:
# Number of distinct titles that survive the rating-count filter
matrix_input_filtered.title.nunique()

500

In [17]:
# (rows, columns) of the filtered frame, i.e. how many rating rows feed the pivot
matrix_input_filtered.shape

(1879620, 6)

In [18]:
# Build the title-by-user rating matrix: one row per title, one column per user.
# pivot_table aggregates with its default (mean) when a (title, userId) pair
# occurs more than once; pairs with no rating are filled with 0.
ratings_pivot = (
    matrix_input_filtered
    .pivot_table(values='rating', index='title', columns='userId')
    .fillna(0)
)
ratings_pivot.head()

userId,1.0,2.0,3.0,4.0,6.0,7.0,8.0,10.0,12.0,13.0,...,283215.0,283219.0,283220.0,283221.0,283222.0,283223.0,283224.0,283226.0,283227.0,283228.0
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13 Ghosts,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28 Days Later,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28 Weeks Later,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3 Extremes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Store the title-by-user ratings as a SciPy CSR sparse matrix; this is the input the neighbour model is fitted on
ratings_matrix = csr_matrix(ratings_pivot.values)

In [20]:
# Nearest-neighbour model over the movie rows: brute-force search with cosine
# distance, fitted on the sparse title-by-user rating matrix.
model_knn = NearestNeighbors(
    algorithm='brute',
    metric='cosine',
)
model_knn.fit(ratings_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

In [21]:
from fuzzywuzzy import fuzz

def print_movie_recommendations(query_movie, ratings_pivot, model_knn, k, min_ratio=75):
    """Print the ``k`` movies whose rating vectors are closest to ``query_movie``.

    The query is fuzzy-matched, case-insensitively, against the titles in
    ``ratings_pivot.index``. The best-scoring title becomes the seed movie and
    its nearest neighbours, as reported by ``model_knn``, are printed.

    Parameters
    ----------
    query_movie : str
        Free-text movie title to look up.
    ratings_pivot : pandas.DataFrame
        Frame whose index holds the movie titles and whose rows are the vectors
        handed to ``model_knn.kneighbors``.
    model_knn : object
        Neighbour model exposing ``kneighbors(X, n_neighbors=...)`` that returns
        ``(distances, indices)``. Assumed to have been fitted on the rows of
        ``ratings_pivot`` in the same order, so that the returned indices are
        row positions in ``ratings_pivot``.
    k : int
        Number of recommendations to print.
    min_ratio : int, optional
        Minimum ``fuzz.ratio`` score a title needs to count as a possible
        match. Defaults to 75.

    Returns
    -------
    None
        Everything is reported through ``print``.
    """
    needle = query_movie.lower()

    # enumerate() yields each title's row position directly, instead of
    # rebuilding the index list and searching it for every candidate.
    ratio_tuples = []
    for position, title in enumerate(ratings_pivot.index):
        ratio = fuzz.ratio(title.lower(), needle)
        if ratio >= min_ratio:
            ratio_tuples.append((title, ratio, position))

    print('Possible matches: {0}\n'.format([(x[0], x[1]) for x in ratio_tuples]))

    # Explicit emptiness check rather than a bare `except:` around max(), which
    # would also have swallowed unrelated errors and KeyboardInterrupt.
    if not ratio_tuples:
        print('Your movie didn\'t match any movie in the data set.')
        return None

    # Row position of the best fuzzy match (first one wins on ties).
    query_index = max(ratio_tuples, key=lambda x: x[1])[2]

    # Ask for one extra neighbour because the seed movie is normally returned too.
    distances, indices = model_knn.kneighbors(
        ratings_pivot.iloc[query_index, :].values.reshape(1, -1), n_neighbors=k + 1
    )

    print('Other users who like the movie {0} also like:\n'.format(ratings_pivot.index[query_index]))

    # Drop the seed movie by identity rather than by assuming it is the first
    # neighbour: with tied distances (e.g. identical or all-zero rating vectors)
    # it can appear at any position, and skipping position 0 would then
    # recommend the movie to itself while hiding a real neighbour.
    neighbours = [
        (idx, dist)
        for idx, dist in zip(indices.flatten(), distances.flatten())
        if idx != query_index
    ][:k]

    for rank, (idx, dist) in enumerate(neighbours, start=1):
        print('{0}: {1}, with distance of {2}'.format(rank, ratings_pivot.index[idx], dist))

    return None