In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from math import sqrt, ceil
import math

%matplotlib inline
%config Completer.use_jedi = False

In [2]:
ratings = pd.read_csv("./ratings.csv", usecols=['userId', 'movieId', 'rating'])

In [3]:
movies = pd.read_csv("./movies.csv")

In [4]:
# Convert Genre1|Genre2|Genre3 to "Genre1 Gengre2 Genre3"
movies['genres'] = movies['genres'].str.split('|').str.join(" ")

In [5]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action Animation Comedy Fantasy
9738,193583,No Game No Life: Zero (2017),Animation Comedy Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action Animation


In [6]:
# One thing that I noticed is that couple of movies actually have duplicates
movies[movies.duplicated('title')]

Unnamed: 0,movieId,title,genres
5601,26958,Emma (1996),Romance
6932,64997,War of the Worlds (2005),Action Sci-Fi
9106,144606,Confessions of a Dangerous Mind (2002),Comedy Crime Drama Romance Thriller
9135,147002,Eros (2004),Drama Romance
9468,168358,Saturn 3 (1980),Sci-Fi Thriller


In [7]:
# Let's clean up those and update our ratings matrix
for index, row in movies[movies.duplicated('title')].iterrows():
    movie_id_to_use = row['movieId'] # This will be the ID that all others will use
    
    # Find all duplicate IDs
    for duplicate_movie_id in list(movies[(movies['title'] == row['title']) & (movies['movieId'] != movie_id_to_use)].movieId):
        print("Replacing movie id {} with {}".format(duplicate_movie_id, movie_id_to_use))
        
        ratings[ratings['movieId'] == duplicate_movie_id] = movie_id_to_use

movies.drop_duplicates('title', keep='first', inplace=True)

Replacing movie id 838 with 26958
Replacing movie id 34048 with 64997
Replacing movie id 6003 with 144606
Replacing movie id 32600 with 147002
Replacing movie id 2851 with 168358


In [8]:
movies[movies.duplicated('title')]

Unnamed: 0,movieId,title,genres


In [9]:
# Much better ...

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['genres'])
tfidf_matrix.shape

(9737, 177)

In [20]:
tf.vocabulary_

{'adventure': 17,
 'animation': 33,
 'children': 46,
 'comedy': 59,
 'fantasy': 108,
 'adventure animation': 18,
 'animation children': 34,
 'children comedy': 47,
 'comedy fantasy': 63,
 'adventure children': 19,
 'children fantasy': 51,
 'romance': 160,
 'comedy romance': 68,
 'drama': 96,
 'comedy drama': 62,
 'drama romance': 103,
 'action': 0,
 'crime': 73,
 'thriller': 168,
 'action crime': 5,
 'crime thriller': 84,
 'action adventure': 1,
 'adventure thriller': 30,
 'horror': 128,
 'comedy horror': 64,
 'adventure romance': 28,
 'crime drama': 75,
 'action comedy': 4,
 'comedy crime': 60,
 'drama thriller': 105,
 'mystery': 147,
 'drama horror': 99,
 'horror mystery': 131,
 'mystery thriller': 151,
 'sci': 166,
 'fi': 119,
 'drama sci': 104,
 'sci fi': 167,
 'children drama': 50,
 'adventure drama': 23,
 'drama fantasy': 97,
 'fantasy mystery': 113,
 'mystery sci': 150,
 'fi thriller': 121,
 'war': 172,
 'drama war': 106,
 'adventure fantasy': 24,
 'musical': 139,
 'drama musica

In [52]:
# Get the cosine similarities
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim[:4, :4]

array([[1.        , 0.31377615, 0.06109245, 0.05269267],
       [0.31377615, 1.        , 0.        , 0.        ],
       [0.06109245, 0.        , 1.        , 0.35183255],
       [0.05269267, 0.        , 0.35183255, 1.        ]])

In [53]:
# Build a 1-dimensional array with movie titles
titles = movies['title']
indices = pd.Series(movies.index, index=movies['title'])

# Function that get movie recommendations based on the cosine similarity score of movie genres
def genre_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [54]:
movies_indexed = movies.set_index('title')

In [55]:
movies_indexed.loc[genre_recommendations('Good Will Hunting (1997)').head(5)]

Unnamed: 0_level_0,movieId,genres
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Leaving Las Vegas (1995),25,Drama Romance
Persuasion (1995),28,Drama Romance
How to Make an American Quilt (1995),46,Drama Romance
When Night Is Falling (1995),49,Drama Romance
Bed of Roses (1996),74,Drama Romance


In [56]:
movies_indexed.loc[genre_recommendations('Terminator, The (1984)').head(5)]

Unnamed: 0_level_0,movieId,genres
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Screamers (1995),76,Action Sci-Fi Thriller
Johnny Mnemonic (1995),172,Action Sci-Fi Thriller
Virtuosity (1995),338,Action Sci-Fi Thriller
Timecop (1994),379,Action Sci-Fi Thriller
Blade Runner (1982),541,Action Sci-Fi Thriller


In [57]:
# Ok so let's try to evaluate this. How do we evaluate?
# Well, we have list of users ratings actually so we can just compare what is being recommended to 
# what user actually liked. We'll find a match based on that

In [58]:
ratings.head(5)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [59]:
# Since we actually can't get the rating, we can only make assumptions about whether user LIKED or DISLIKED
# certain movie. For sake of ease, I'll treat ratings 3, 4 and 5 as LIKE, and 1 and 2 as DISLIKE

In [60]:
ratings['like'] = ratings['rating'].apply(lambda x: 1 if x >= 3 else 0)
ratings

Unnamed: 0,userId,movieId,rating,like
0,1,1,4.0,1
1,1,3,4.0,1
2,1,6,4.0,1
3,1,47,5.0,1
4,1,50,5.0,1
...,...,...,...,...
100831,610,166534,4.0,1
100832,610,168248,5.0,1
100833,610,168250,5.0,1
100834,610,168252,5.0,1


In [61]:
# Ok, now that we have our LIKES or DISLIKES, let's take a fraction of our ratings, and try to evaluate

In [62]:
small_data = ratings.sample(frac=0.2)
small_data.shape

(20167, 4)

In [63]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(small_data, test_size=0.4)

In [64]:
print("Train data shape: {}, test data shape: {}".format(train_data.shape, test_data.shape))

Train data shape: (12100, 4), test data shape: (8067, 4)


In [65]:
def get_movie_id(title):
    '''Returns movie ID given the movie title'''
    return movies[movies['title'] == title]['movieId'].values[0]

def get_movie_title(id):
    '''Returns movie title given the movie ID'''
    return movies[movies['movieId'] == movie_id]['title'].values[0]

In [205]:
timeit get_movie_title(24)

547 µs ± 8.29 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [206]:
train_data.shape

(12100, 4)

In [204]:
timeit genre_recommendations("Leaving Las Vegas (1995)")

4.37 ms ± 143 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [226]:
predicted_values = []
actual_values = []

In [211]:
train_data

Unnamed: 0,userId,movieId,rating,like
2565,19,1593,2.0,0
19463,125,100083,5.0,1
39710,274,2871,3.5,1
35012,234,2002,3.0,1
62176,412,1269,5.0,1
...,...,...,...,...
53108,351,6016,4.0,1
63673,414,4333,4.0,1
55998,370,223,4.5,1
5739,41,4816,5.0,1


In [76]:
predicted_values = []
actual_values = []

for index, row in train_data[train_data['like'] == 1][:20000].iterrows():
    user_id = row['userId']
    movie_id = row['movieId']
    like = row['like']
    
    try:
        movie_name = get_movie_title(movie_id)
    except:
        #print("Cant get movie name based on movie ID: {}".format(movie_id))
        pass
    
    #print("Predicting for user {} based on the movie he liked {}".format(user_id, movie_name))

    # Predict movies based on this movie
    predicted = genre_recommendations(movie_name)
    
    # Now iterate through first 20 predicted movies and see which one of these match what user has liked
    # and which match what user has disliked
    
#     print("\tPredicted movies: {}".format(predicted.head(20).str.join(", ")))
    
    for predicted_movie in predicted.head(50):
        predicted_movie_id = get_movie_id(predicted_movie)
        
        # Check if user has rated that movie 
        rating = small_data[(small_data['userId'] == user_id) & (small_data['movieId'] == predicted_movie_id)]['like']
        
        if len(rating.values) == 0:
            # User didn't rate this movie ... skip it
            #print("\tUser didn't rate movie {}".format(predicted_movie))
            continue
        
        #print("\t Predicted movie {} ({}) and actual likeness is {}".format(predicted_movie, predicted_movie_id, rating.values[0]))
        
        predicted_values.append(rating.values[0])
        
        actual_values.append(1)

In [77]:
len(actual_values)

7986

In [78]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

In [79]:
accuracy_score(actual_values, predicted_values)

0.8933132982719759

In [80]:
from sklearn.metrics import f1_score

In [81]:
f1_score(actual_values, predicted_values)

0.9436507936507936