In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import display
import ipywidgets as widgets
import pandas as pd
import numpy as np
import re

#### The data can be downloaded from https://files.grouplens.org/dataset/movielens/ml-25m.zip

In [2]:
# Load the movie metadata; the path is relative to the notebook's working directory.
movies = pd.read_csv("data/movies.csv")
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


#### The function "clean_title" below will clean movie titles using regex. It will search through each title and remove any characters that are not a space, digit, or letter (lowercase or uppercase).

In [3]:
def clean_title(title):
    """Return ``title`` with every character removed that is not an
    ASCII letter, a digit or a space.

    Removed characters are dropped, not replaced, so the text on either
    side of them is joined directly.
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    cleaned = disallowed.sub("", title)
    return cleaned

#### A new column called 'Clean-title' will be created in the data frame to store the cleaned up titles.

In [4]:
# Run clean_title on every title and store the result next to the original,
# so the original title stays available for display.
movies["Clean-title"] = movies["title"].apply(clean_title)
movies.head()

Unnamed: 0,movieId,title,genres,Clean-title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


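#### Creating a TF-IDF (Term Frequency - Inverse Document Frequency) matrix from the cleaned titles. The inverse document frequency uses a logarithm to down-weight words that appear in many titles; the word "the", for example, is very common and therefore carries little weight.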

#### Setting the parameter "ngram_range" to (1, 2) makes the vectorizer look not just at individual words in the title but also at consecutive pairs of words, which makes the search more accurate.

In [5]:
# ngram_range=(1, 2): index single words as well as consecutive word pairs.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Learn the vocabulary from the cleaned titles and build the TF-IDF matrix
# (one row per movie); search() compares queries against this matrix.
tfidf = vectorizer.fit_transform(movies["Clean-title"])

#### Creating the search function. We will compute the similarity between the term entered and all of the movies in our list using cosine similarity.

In [6]:
def search(title, top_n=5):
    """Return the movies whose titles best match ``title``, best match first.

    The query is cleaned with ``clean_title``, projected into the fitted
    TF-IDF space and compared against every indexed title using cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text movie title to look up.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies`` ordered by decreasing similarity,
        so ``results.iloc[0]`` is the closest match.
    """
    # Normalise the query the same way the indexed titles were normalised.
    title = clean_title(title)

    # Vectorise the query with the already-fitted vocabulary.
    query_vec = vectorizer.transform([title])

    # Similarity between the query and every indexed title.
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more matches than there are titles.
    k = min(top_n, similarity.size)
    if k <= 0:
        return movies.iloc[0:0]

    # argpartition only guarantees that the k largest values occupy the last
    # k slots; their order inside that slice is undefined.
    candidates = np.argpartition(similarity, -k)[-k:]

    # Sort just those k candidates explicitly, highest similarity first.
    order = np.argsort(-similarity[candidates], kind="stable")
    indices = candidates[order]

    results = movies.iloc[indices]
    return results

In [7]:
# Sanity check of search() with a single-word query.
search("Rocky")

Unnamed: 0,movieId,title,genres,Clean-title
1865,1954,Rocky (1976),Drama,Rocky 1976
2321,2412,Rocky V (1990),Action|Drama,Rocky V 1990
2318,2409,Rocky II (1979),Action|Drama,Rocky II 1979
2319,2410,Rocky III (1982),Action|Drama,Rocky III 1982
26168,125327,Rocky Mountain (1950),Action|Adventure|Western,Rocky Mountain 1950


In [8]:
# Sanity check of search() with a multi-word query.
search("The Girl on the Train")

Unnamed: 0,movieId,title,genres,Clean-title
41858,162602,The Girl on the Train (2016),Thriller,The Girl on the Train 2016
22356,114561,"Girl on the Train, The (2013)",Thriller,Girl on the Train The 2013
35825,148771,The Girl on the Stone (2007),Drama,The Girl on the Stone 2007
55015,190919,The Girl on the Broomstick (1972),Children|Comedy,The Girl on the Broomstick 1972
15121,79972,"Girl on the Train, The (La fille du RER) (2009)",Drama,Girl on the Train The La fille du RER 2009


#### Reading in the movie ratings data set, which we will use to find movies that are similar to a movie we liked.

In [9]:
# Load the user ratings; the path is relative to the notebook's working directory.
ratings = pd.read_csv("data/ratings.csv")
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [10]:
# Check the column types before filtering on movieId and rating below.
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

#### Find all the users who also liked the movie we typed in, then find the other movies they liked, because those are probably going to be good recommendations. In short: what else did the people who liked the same movie as us like?

In [11]:
# Let's say this is the movieId of the movie that we typed in.

movie_id = 1

# Find everyone who rated our movie and liked it.
# "Liked" here means a rating strictly greater than 4.5.
# NOTE(review): the ratings displayed above include half-star values (3.5, 4.5),
# so "> 4.5" presumably keeps only 5.0 ratings; confirm whether 4.5 should count as "liked".

similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4.5)]["userId"].unique()
similar_users

array([    36,     75,     86, ..., 162518, 162519, 162530], dtype=int64)

In [12]:
# similar_users holds everyone who rated our movie above 4.5.
# Here we collect every movie those users rated greater than 4.5
# (one row per rating, so a movie appears once per user who liked it).

similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4.5)]["movieId"]
similar_user_recs

5101           1
5105          34
5111         110
5114         150
5127         260
            ... 
24998388    3706
24998389    3735
24998391    3763
24998392    4187
24998393    4321
Name: movieId, Length: 678648, dtype: int64

#### Keep only the movies that more than 10% of the users who are similar to us liked

In [13]:
# similar_user_recs.value_counts() counts how many times each movie appears,
# i.e. how many similar users liked it. Dividing by the number of similar
# users turns that count into a share between 0 and 1.
# NOTE: this reassigns similar_user_recs, so re-running this cell on its own
# would apply value_counts() to the shares instead of the movie ids.

similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
similar_user_recs

1         1.000000
318       0.367762
260       0.358285
296       0.299126
356       0.292240
            ...   
4435      0.000074
188477    0.000074
179439    0.000074
179427    0.000074
97957     0.000074
Name: movieId, Length: 14179, dtype: float64

#### Only take the ones with greater than 10%

In [14]:
# Keep only movies liked by more than 10% of the similar users.
similar_user_recs = similar_user_recs[similar_user_recs > .1]
similar_user_recs

1        1.000000
318      0.367762
260      0.358285
296      0.299126
356      0.292240
           ...   
1089     0.105064
590      0.104620
780      0.102991
78499    0.101436
750      0.100992
Name: movieId, Length: 64, dtype: float64

#### Finding how much all users like these movies

In [15]:
# Ratings above 4, from any user, for the candidate movies kept above.
# NOTE(review): this cell uses "> 4" while the similar-user cells use "> 4.5";
# confirm the two thresholds are meant to differ.
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

In [16]:
# Inspect the filtered ratings.
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
72,2,110,5.0,1141416589
76,2,260,5.0,1141417172
79,2,318,5.0,1141417181
82,2,356,4.5,1141416637
...,...,...,...,...
25000018,162541,2858,5.0,1240950804
25000020,162541,2959,5.0,1240953488
25000057,162541,4993,5.0,1240952610
25000065,162541,5952,5.0,1240952617


#### Looking at all of the users who liked at least one of the movies that were recommended to us: what percentage of those users liked each of these movies?

In [17]:
# Share of users who liked each candidate movie. The denominator is the number
# of distinct users present in all_users, not every user in ratings.
all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [18]:
# Inspect the per-movie shares among all users.
all_users_recs

318      0.353117
296      0.293738
2571     0.251804
356      0.242757
593      0.233102
           ...   
3114     0.055416
34       0.053892
1073     0.050318
1148     0.048979
78499    0.036249
Name: movieId, Length: 64, dtype: float64

#### Creating a recommendation score. Comparing the percentages.

In [19]:
# Both Series are indexed by movieId, so concatenating along axis=1 lines the
# two shares up per movie; then give the columns readable names.
rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

In [20]:
# Inspect the two shares side by side.
rec_percentages

Unnamed: 0,similar,all
1,1.000000,0.128700
318,0.367762,0.353117
260,0.358285,0.229282
296,0.299126,0.293738
356,0.292240,0.242757
...,...,...
1089,0.105064,0.103985
590,0.104620,0.072321
780,0.102991,0.055785
78499,0.101436,0.036249


#### We want movies where the "similar" share is much larger than the "all" share, so we score each movie by the ratio of the two numbers

In [21]:
# score > 1 means the movie is liked more often by similar users than by users overall.
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [22]:
# Highest score first, so the strongest recommendations come at the top.
rec_percentages = rec_percentages.sort_values("score", ascending = False)

In [23]:
# Inspect the scored and sorted candidates.
rec_percentages

Unnamed: 0,similar,all,score
1,1.000000,0.128700,7.770003
3114,0.236043,0.055416,4.259489
78499,0.101436,0.036249,2.798306
588,0.193988,0.069663,2.784672
595,0.170295,0.061887,2.751715
...,...,...,...
2858,0.154376,0.172971,0.892494
7153,0.152303,0.178069,0.855303
4226,0.107286,0.136155,0.787968
58559,0.117577,0.151133,0.777973


In [24]:
# rec_percentages is indexed by movieId; join the top 10 rows to the movie
# metadata to show titles and genres.
rec_percentages.head(10).merge(movies, left_index = True, right_on = "movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,Clean-title
0,1.0,0.1287,7.770003,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.236043,0.055416,4.259489,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
14813,0.101436,0.036249,2.798306,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
580,0.193988,0.069663,2.784672,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
587,0.170295,0.061887,2.751715,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
33,0.143418,0.053892,2.661202,34,Babe (1995),Children|Drama,Babe 1995
359,0.211832,0.088495,2.393727,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994
1047,0.119502,0.050318,2.374924,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971
1120,0.110247,0.048979,2.250903,1148,Wallace & Gromit: The Wrong Trousers (1993),Animation|Children|Comedy|Crime,Wallace Gromit The Wrong Trousers 1993
898,0.132682,0.062843,2.111309,919,"Wizard of Oz, The (1939)",Adventure|Children|Fantasy|Musical,Wizard of Oz The 1939


#### Building a recommendation function

In [25]:
def find_similar_movies(movie_id, min_rating=4, min_share=0.1, top_n=10):
    """Recommend movies liked by the users who also liked ``movie_id``.

    A rating counts as a "like" when it is strictly greater than
    ``min_rating``. Each candidate is scored by how much more often it is
    liked by users who liked ``movie_id`` than by users in general.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.
    min_rating : float, default 4
        A rating must be strictly greater than this to count as a like.
    min_share : float, default 0.1
        A candidate is kept only if more than this fraction of the similar
        users liked it.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, ordered by decreasing
        score. The queried movie is itself among the candidates (every
        similar user liked it), so it normally appears in the result.
    """
    # Compute the "liked" mask once; it is reused for every filter below.
    liked = ratings["rating"] > min_rating

    # Users similar to us: everyone who liked the given movie.
    similar_users = ratings.loc[(ratings["movieId"] == movie_id) & liked, "userId"].unique()

    # Every movie those users liked (one entry per like).
    similar_user_recs = ratings.loc[ratings["userId"].isin(similar_users) & liked, "movieId"]

    # Share of similar users who liked each movie; keep only the common ones.
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # How common the same likes are overall. The denominator is the number of
    # distinct users who liked at least one candidate movie.
    all_users = ratings[ratings["movieId"].isin(similar_user_recs.index) & liked]
    all_users_recs = all_users["movieId"].value_counts() / all_users["userId"].nunique()

    # Both Series are indexed by movieId, so axis=1 aligns them per movie.
    rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # Ratio > 1 means similar users like the movie more often than users overall.
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    top = rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")
    return top[["score", "title", "genres"]]

In [26]:
# Look up the title, take the first row returned by search() and
# show the recommendations for that movie.
results = search("Spiderman")
movie_id = results.iloc[0]["movieId"]
display(find_similar_movies(movie_id))

Unnamed: 0,score,title,genres
5241,30.584247,Spider-Man (2002),Action|Adventure|Sci-Fi|Thriller
7923,16.642465,Spider-Man 2 (2004),Action|Adventure|Sci-Fi|IMAX
6221,10.63735,X2: X-Men United (2003),Action|Adventure|Sci-Fi|Thriller
10804,10.382547,X-Men: The Last Stand (2006),Action|Sci-Fi|Thriller
3692,9.289533,X-Men (2000),Action|Adventure|Sci-Fi
5270,8.506496,Star Wars: Episode II - Attack of the Clones (...,Action|Adventure|Sci-Fi|IMAX
3890,6.855664,Unbreakable (2000),Drama|Sci-Fi
9952,6.815176,Star Wars: Episode III - Revenge of the Sith (...,Action|Adventure|Sci-Fi
7931,6.784866,"I, Robot (2004)",Action|Adventure|Sci-Fi|Thriller
7734,6.721119,Shrek 2 (2004),Adventure|Animation|Children|Comedy|Musical|Ro...


In [27]:
# Same lookup-then-recommend steps for a partial title.
results = search("Beautiful mind")
movie_id = results.iloc[0]["movieId"]
display(find_similar_movies(movie_id))

Unnamed: 0,score,title,genres
4889,13.259363,"Beautiful Mind, A (2001)",Drama|Romance
10957,5.170292,"Pursuit of Happyness, The (2006)",Drama
11017,5.027395,"Illusionist, The (2006)",Drama|Fantasy|Mystery|Romance
5877,4.646959,Catch Me If You Can (2002),Crime|Drama
3918,4.516031,Cast Away (2000),Drama
9423,4.49584,Million Dollar Baby (2004),Drama
7129,4.486273,The Butterfly Effect (2004),Drama|Sci-Fi|Thriller
7018,4.453526,"Last Samurai, The (2003)",Action|Adventure|Drama|War
5883,4.121988,"Pianist, The (2002)",Drama|War
9430,4.102533,Hotel Rwanda (2004),Drama|War


In [28]:
# Same lookup-then-recommend steps for another title.
results = search("The conjuring")
movie_id = results.iloc[0]["movieId"]
display(find_similar_movies(movie_id))

Unnamed: 0,score,title,genres
40615,701.990826,The Conjuring 2 (2016),Horror
47732,444.116645,Annabelle: Creation (2017),Horror
43017,431.994354,Ouija: Origin of Evil (2016),Horror|Thriller
25054,376.429863,Insidious: Chapter 3 (2015),Fantasy|Horror|Thriller
22399,311.995923,Annabelle (2014),Horror
20261,303.5636,Insidious: Chapter 2 (2013),Horror|Thriller
43248,209.247265,The Autopsy of Jane Doe (2016),Horror
39235,182.913102,Hush (2016),Thriller
19191,178.938838,Mama (2013),Horror
18603,169.686524,Sinister (2012),Horror|Thriller
