In [53]:
# Import Python libraries
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler
import warnings
from scipy.sparse import csr_matrix

In [28]:
# Load the horror-movie metadata and keep only the ID and title columns
metadata = (
    pd.read_csv("Data/Horror.csv", low_memory=False)
    .loc[:, ['movieId', 'title']]
)

metadata.head()

Unnamed: 0,movieId,title
0,12,Dracula: Dead and Loving It
1,22,Copycat
2,70,From Dusk Till Dawn
3,92,Mary Reilly
4,93,Vampire in Brooklyn


In [29]:
# Number of non-null movie IDs in the metadata file
metadata.movieId.count()

5555

In [30]:
# Load the movie ratings file and keep only the columns used below
ratings = pd.read_csv("Data/ratings.csv")
ratings = ratings.loc[:, ['userId', 'movieId', 'rating']]
# Convert both ID columns to numeric so they can be merged on.
# No `errors=` argument is passed, so pd.to_numeric is left at its default
# behaviour for unparseable values (they are not coerced to NaN here).
ratings['movieId'] = pd.to_numeric(ratings['movieId'])
ratings['userId'] = pd.to_numeric(ratings['userId'])
# Number of distinct users in the ratings data
ratings['userId'].nunique()

283228

In [31]:
# Non-null count per column of the ratings frame.
# Author's note: the dataset will need to be reduced in size so the code runs faster.
ratings.count()

userId     27753444
movieId    27753444
rating     27753444
dtype: int64

In [32]:
# Number of distinct movies that appear in the ratings data
ratings['movieId'].nunique()

53889

In [33]:
# Number of ratings each movie received, one row per movieId
movie_rating_count = (
    ratings.groupby('movieId')['rating']
    .count()
    .rename('movie_rating_count')
    .reset_index()
)
movie_rating_count.head()

Unnamed: 0,movieId,movie_rating_count
0,1,68469
1,2,27143
2,3,15585
3,4,2989
4,5,15474


In [34]:
# Summary statistics of the per-movie rating counts
movie_rating_count.loc[:, 'movie_rating_count'].describe()

count    53889.000000
mean       515.011301
std       2934.758939
min          1.000000
25%          2.000000
50%          7.000000
75%         48.000000
max      97999.000000
Name: movie_rating_count, dtype: float64

In [35]:
# Number of ratings each user submitted, one row per userId
user_rating_count = (
    ratings.groupby('userId')['rating']
    .count()
    .rename('user_rating_count')
    .reset_index()
)
user_rating_count.head()

Unnamed: 0,userId,user_rating_count
0,1,16
1,2,15
2,3,11
3,4,736
4,5,72


In [36]:
# Summary statistics of the per-user rating counts
user_rating_count.loc[:, 'user_rating_count'].describe()

count    283228.000000
mean         97.989761
std         212.760722
min           1.000000
25%          15.000000
50%          30.000000
75%          95.000000
max       23715.000000
Name: user_rating_count, dtype: float64

In [37]:
# Attach the per-movie and per-user rating totals to every rating row.
# Any count columns left over from a previous run of this cell are dropped
# first, so re-running the cell does not merge onto the already-merged frame
# and produce suffixed duplicates (movie_rating_count_x / movie_rating_count_y).
ratings = (
    ratings
    .drop(columns=['movie_rating_count', 'user_rating_count'], errors='ignore')
    .merge(movie_rating_count, on='movieId', how='left')
    .merge(user_rating_count, on='userId', how='left')
)
ratings.head()

Unnamed: 0,userId,movieId,rating,movie_rating_count,user_rating_count
0,1,307,3.5,7958,16
1,1,481,3.5,6037,16
2,1,1091,1.5,6138,16
3,1,1257,4.5,5902,16
4,1,1449,4.5,6867,16


In [38]:
# Left-join the ratings onto the horror metadata: every metadata row is kept,
# with one output row per matching rating
matrix_input = metadata.merge(ratings, how='left', on='movieId')
matrix_input.head()

Unnamed: 0,movieId,title,userId,rating,movie_rating_count,user_rating_count
0,12,Dracula: Dead and Loving It,8.0,3.0,4524.0,31.0
1,12,Dracula: Dead and Loving It,19.0,3.0,4524.0,262.0
2,12,Dracula: Dead and Loving It,134.0,3.0,4524.0,1208.0
3,12,Dracula: Dead and Loving It,158.0,4.0,4524.0,60.0
4,12,Dracula: Dead and Loving It,214.0,2.0,4524.0,1616.0


In [39]:
# Number of distinct titles after the merge
matrix_input['title'].nunique()

5222

In [40]:
# One row per movieId, most-rated first. Author's note: the top-500 horror
# sample corresponds to a movie_rating_count cutoff of about 478.
unique = (
    matrix_input
    .drop_duplicates(subset=['movieId'])
    .sort_values(by=['movie_rating_count'], ascending=False)
)
unique.head(5)

Unnamed: 0,movieId,title,userId,rating,movie_rating_count,user_rating_count
127942,593,The Silence of the Lambs,4.0,4.5,87899.0,736.0
1041071,2762,The Sixth Sense,4.0,5.0,52270.0,736.0
314975,1214,Alien,4.0,2.0,39282.0,736.0
280403,1200,Aliens,4.0,3.0,34572.0,736.0
398219,1258,The Shining,10.0,5.0,32129.0,121.0


In [41]:
# Keep only the rating rows of movies that have more than 478 ratings
matrix_input_filtered = matrix_input[matrix_input['movie_rating_count'].gt(478)]
matrix_input_filtered.head()

Unnamed: 0,movieId,title,userId,rating,movie_rating_count,user_rating_count
0,12,Dracula: Dead and Loving It,8.0,3.0,4524.0,31.0
1,12,Dracula: Dead and Loving It,19.0,3.0,4524.0,262.0
2,12,Dracula: Dead and Loving It,134.0,3.0,4524.0,1208.0
3,12,Dracula: Dead and Loving It,158.0,4.0,4524.0,60.0
4,12,Dracula: Dead and Loving It,214.0,2.0,4524.0,1616.0


In [42]:
# Number of distinct titles that survive the rating-count filter
matrix_input_filtered['title'].nunique()

500

In [43]:
# User x title table of ratings; user/title pairs with no rating are filled with 0
ratings_pivot = pd.pivot_table(
    matrix_input_filtered,
    values='rating',
    index='userId',
    columns='title',
).fillna(0)
ratings_pivot.head()

title,13 Ghosts,1408,28 Days Later,28 Weeks Later,3 Extremes,30 Days of Night,A Nightmare on Elm Street,A Nightmare on Elm Street 2: Freddy's Revenge,A Nightmare on Elm Street 3: Dream Warriors,A Nightmare on Elm Street 4: The Dream Master,...,White Noise,Willard,Wishmaster,Wolf,Wolf Creek,World War Z,Wrong Turn,You're Next,Zombieland,[REC]
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
# (number of userId rows, number of title columns) in the pivoted table
ratings_pivot.shape

(199467, 500)

In [49]:
# Transpose the pivot values so each row is a title and each column a user
ratings_pivot_T = np.transpose(ratings_pivot.values)
ratings_pivot_T.shape

(500, 199467)

In [59]:
# Reduce each row of the transposed rating matrix to 100 components.
# random_state is fixed at 17 so the decomposition is repeatable between runs.
SVD = TruncatedSVD(n_components = 100, random_state = 17)
# One 100-dimensional row per row of ratings_pivot_T
matrix = SVD.fit_transform(ratings_pivot_T)
matrix.shape

(500, 100)

In [60]:
# Pairwise correlation between the rows of `matrix`, labelled on both axes
# with the title columns of ratings_pivot.
# RuntimeWarnings are silenced only while np.corrcoef runs; the previous
# warning filters are restored on leaving the `with` block, so the category
# is not muted for the rest of the session.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning)
    corr = pd.DataFrame(
        np.corrcoef(matrix),
        index=ratings_pivot.columns,
        columns=ratings_pivot.columns,
    )
corr.head()

title,13 Ghosts,1408,28 Days Later,28 Weeks Later,3 Extremes,30 Days of Night,A Nightmare on Elm Street,A Nightmare on Elm Street 2: Freddy's Revenge,A Nightmare on Elm Street 3: Dream Warriors,A Nightmare on Elm Street 4: The Dream Master,...,White Noise,Willard,Wishmaster,Wolf,Wolf Creek,World War Z,Wrong Turn,You're Next,Zombieland,[REC]
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13 Ghosts,1.0,0.569163,0.493045,0.569934,0.441312,0.627592,0.309768,0.4746,0.48312,0.474641,...,0.609873,0.68409,0.577543,0.176437,0.520033,0.059825,0.559995,0.163913,0.178489,0.293854
1408,0.569163,1.0,0.373022,0.71266,0.507447,0.859846,0.25689,0.21917,0.246232,0.199873,...,0.715348,0.44636,0.251769,0.092887,0.673737,0.300031,0.600933,0.556872,0.35718,0.707056
28 Days Later,0.493045,0.373022,1.0,0.560323,0.337218,0.444927,0.236117,0.17883,0.206816,0.166702,...,0.324854,0.352267,0.184,0.077394,0.333462,0.243403,0.329191,0.232207,0.358,0.39768
28 Weeks Later,0.569934,0.71266,0.560323,1.0,0.38997,0.782007,0.175095,0.160886,0.181918,0.146785,...,0.36509,0.247126,0.165081,0.062701,0.413056,0.348635,0.283537,0.258979,0.336258,0.540869
3 Extremes,0.441312,0.507447,0.337218,0.38997,1.0,0.581868,0.278694,0.300648,0.347128,0.306472,...,0.530049,0.694207,0.366513,0.117671,0.802358,0.124347,0.590391,0.579712,0.202869,0.741579


In [67]:
# Correlations of every title with 'Underworld', strongest first
corr_movies = corr.loc['Underworld'].sort_values(ascending=False)
# Top four entries as a two-column (title, Correlation) frame
corr_movies_df = corr_movies.head(4).reset_index(name='Correlation')
corr_movies_df

Unnamed: 0,title,Correlation
0,Underworld,1.0
1,Underworld: Evolution,0.940756
2,Underworld: Rise of the Lycans,0.8596
3,Underworld: Awakening,0.782128


In [62]:
from fuzzywuzzy import fuzz

def print_movie_recommendations(query_movie, ratings_pivot, N, min_ratio=75):
    """Print the N titles most correlated with the best fuzzy match for a query.

    Parameters
    ----------
    query_movie : str
        Title typed by the user; compared case-insensitively.
    ratings_pivot : pandas.DataFrame
        Title-by-title similarity frame whose index labels are the candidate
        titles (the notebook passes its correlation frame). The matched row is
        read from this argument, not from a module-level global.
    N : int
        Number of recommendations to print.
    min_ratio : int, optional
        Minimum ``fuzz.ratio`` score for a title to count as a possible match.
        Defaults to 75.

    Returns
    -------
    None
        All results are printed. When no title reaches ``min_ratio`` a
        "didn't match" message is printed instead of recommendations.
    """
    query = query_movie.lower()

    # Score each title once; enumerate supplies the position directly instead
    # of rebuilding a list and searching it on every iteration.
    ratio_tuples = []
    for position, title in enumerate(ratings_pivot.index):
        ratio = fuzz.ratio(str(title).lower(), query)
        if ratio >= min_ratio:
            ratio_tuples.append((title, ratio, position))

    print('Possible matches: {0}\n'.format([(x[0], x[1]) for x in ratio_tuples]))

    if not ratio_tuples:
        print('Your movie didn\'t match any movie in the data set.')
        return None

    # Pick the highest-scoring title, not merely the first one over the
    # threshold; max() keeps the earliest candidate when scores tie.
    movie_name = max(ratio_tuples, key=lambda match: match[1])[0]

    # Remove the matched movie by label, so a tie or a NaN self-correlation
    # cannot cause a different title to be discarded in its place.
    corr_movies = (
        ratings_pivot.loc[movie_name]
        .drop(labels=movie_name, errors='ignore')
        .sort_values(ascending=False)
    )
    corr_movies_df = corr_movies.head(N).reset_index(name='Correlation')
    # Number the recommendations from 1, as the printed table did before.
    corr_movies_df.index = corr_movies_df.index + 1

    print('Other users who like the movie ' + str(movie_name) + ' also like:\n')
    print(corr_movies_df)
    
    

In [75]:
# Keep offering recommendations until the user answers "no"
while True:
    a = input("Would you like help finding a movie? yes/no: ")
    if a == "no":
        break
    if a != "yes":
        # Anything other than the two literal answers: ask again
        print("Enter either yes/no")
        continue
    user_input = input('Enter the title of a Horror movie that you enjoy:')
    print_movie_recommendations(user_input, corr, 10)

Would you like help finding a movie? yes/no: yes
Enter the title of a Horror movie that you enjoy:alien
Possible matches: [('Alien', 100), ('Aliens', 91), ('Alien³', 91)]

Other users who like the movie Alien also like:

                       title  Correlation
1                     Aliens     0.768615
2                       Jaws     0.563384
3                The Shining     0.522133
4                     Alien³     0.483402
5                     Psycho     0.474045
6   The Silence of the Lambs     0.458329
7            The Sixth Sense     0.458096
8                  The Thing     0.457162
9               The Exorcist     0.441705
10       Alien: Resurrection     0.426549
Would you like help finding a movie? yes/no: no
