In [1]:
import pandas as pd
movies = pd.read_csv("movies.csv")

In [2]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


# Data Cleaning

In [3]:
#Function to clean the data title from extra characters
import re

def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]", "", title)

In [4]:
movies["clean_title"] = movies["title"].apply(clean_title)

In [5]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


# Creating a term frequencey * inverse document frequency matrix

In [6]:
# Convert clean_title into sets of numbers so we can find the ones are the most similar to the search term that we enter
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1,2))

tfidf = vectorizer.fit_transform(movies["clean_title"])

# Search Function 

In [7]:
# Compute the similarity between what we enter and all of the movies
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title):
#    title = "Toy Story 1995"
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten() #compare query_vec vs tfidf
    indices =  np.argpartition(similarity, -5) [-5:] #to find the 5 most similar term to what we searched
    results = movies.iloc[indices] [::-1] #to return the most similar term to the top
    return results

In [8]:
import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Text(
    value = "Toy Story",
    description = "Movie Title:",
    disabled = False
)

movie_list = widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            display(search(title))
            
movie_input.observe(on_type, names= 'value')
display(movie_input,movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [9]:
ratings = pd.read_csv("ratings.csv")
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [10]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 762.9 MB


# Finding users who liked the same movie

In [11]:
movie_id = 1

In [12]:
#similar taste in movies
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique() 

In [13]:
similar_users

array([    36,     75,     86, ..., 162527, 162530, 162533], dtype=int64)

In [14]:
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)] ["movieId"]

In [15]:
similar_user_recs

5101            1
5105           34
5111          110
5114          150
5127          260
            ...  
24998854    60069
24998861    67997
24998876    78499
24998884    81591
24998888    88129
Name: movieId, Length: 1358326, dtype: int64

In [16]:
similar_user_recs.value_counts() / len(similar_users)
similar_user_recs = similar_user_recs[similar_user_recs >.1]

In [17]:
similar_user_recs

5101            1
5105           34
5111          110
5114          150
5127          260
            ...  
24998854    60069
24998861    67997
24998876    78499
24998884    81591
24998888    88129
Name: movieId, Length: 1358326, dtype: int64

# Finding how much all users like movies

In [18]:
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

In [19]:
all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique()) 
#the precentage of all users who liked these movie

# Creating a recommendation score

In [20]:
rec_precentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
rec_precentages.columns = ["similar", "all"]

In [21]:
rec_precentages

Unnamed: 0,similar,all
5101,1,0.001966
5105,34,0.008128
5111,110,0.001169
5114,150,0.002364
5127,260,0.001700
...,...,...
24998854,60069,
24998861,67997,
24998876,78499,
24998884,81591,


In [22]:
rec_precentages["scores"] = rec_precentages["similar"] / rec_precentages["all"]

In [23]:
rec_precentages = rec_precentages.sort_values("scores", ascending=False)

In [24]:
rec_precentages

Unnamed: 0,similar,all,scores
208615,201588,0.000027,7.589385e+09
127234,183869,0.000027,6.922300e+09
173513,179491,0.000027,6.757477e+09
173501,177593,0.000027,6.686021e+09
142921,176933,0.000027,6.661174e+09
...,...,...,...
24998854,60069,,
24998861,67997,,
24998876,78499,,
24998884,81591,,


In [25]:
rec_precentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,scores,movieId,title,genres,clean_title
62293,201588,2.7e-05,7589385000.0,208615,Botinada: A Origem do Punk no Brasil (2006),Documentary,Botinada A Origem do Punk no Brasil 2006
26805,183869,2.7e-05,6922300000.0,127234,Reality (2012),Comedy|Drama,Reality 2012
46954,179491,2.7e-05,6757477000.0,173513,Mudhalvan (1999),Thriller,Mudhalvan 1999
46948,177593,2.7e-05,6686021000.0,173501,I am Jane Doe (2017),Crime|Documentary,I am Jane Doe 2017
33339,176933,2.7e-05,6661174000.0,142921,Essex Boys: Law of Survival (2015),Action|Adventure|Crime|Drama,Essex Boys Law of Survival 2015
62277,168250,2.7e-05,6334276000.0,208567,"Filhos de João, O Admirável Mundo Novo Baiano ...",Documentary,Filhos de Joo O Admirvel Mundo Novo Baiano 2009
33865,164909,2.7e-05,6208494000.0,144170,Teenage Bank Heist (2012),Thriller,Teenage Bank Heist 2012
46915,159817,2.7e-05,6016790000.0,173433,Metalocalypse: The Doomstar Requiem (2013),Animation|Comedy|Drama,Metalocalypse The Doomstar Requiem 2013
17709,148626,2.7e-05,5595472000.0,92441,"Human Resources Manager, The (2010)",Drama,Human Resources Manager The 2010
33221,143355,2.7e-05,5397029000.0,142654,Alien Opponent (2011),Action|Comedy|Sci-Fi,Alien Opponent 2011


# Build a Recommendation Function

In [26]:
def find_similar_movies(movie_id):
    #find users who are similar to us
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
    #adjusting so that we have recommendation where over 10 precent of the users recommended that movie
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > .10]
    #finding how common the recommendation were among all the users
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    #creating the recommendation score
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    #sorting our score
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    #returning the top 10 recommendations and merging it with our movie data set selecting the three column that we need
    return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

# Creating Interactive recommendation widget

In [27]:
import ipywidgets as widgets
from IPython.display import display

movie_name_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))

movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [28]:
#Send Help