In [1]:
import re
import pandas as pd
import numpy as np
import ipywidgets as widgets
from IPython.display import display
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
#load movies dataset
movies = pd.read_csv("movies.csv")

In [3]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [4]:
def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]", "", title)

In [5]:
movies["clean_title"] = movies["title"].apply(clean_title)

In [6]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [7]:
vectorizer = TfidfVectorizer(ngram_range = (1,2))
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [8]:
def search(title):
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = movies.iloc[indices][::-1]
    return results

In [23]:
ratings = pd.read_csv("ratings.csv")

In [24]:
# display ratings dataset
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1.147880e+09
1,1,306,3.5,1.147869e+09
2,1,307,5.0,1.147869e+09
3,1,665,5.0,1.147879e+09
4,1,899,3.5,1.147869e+09
...,...,...,...,...
6629901,42908,4270,1.0,1.015126e+09
6629902,42908,4321,3.0,1.015126e+09
6629903,42908,4370,1.0,1.015125e+09
6629904,42908,4517,3.0,1.048391e+09


In [27]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp    float64
dtype: object

In [29]:
movie_id = 5

In [31]:
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] >= 4)]["userId"].unique()
similar_users

array([   18,    75,   105,   130,   137,   166,   167,   169,   173,
         236,   291,   309,   371,   395,   407,   455,   484,   506,
         559,   580,   592,   601,   653,   718,   818,   907,   920,
         969,   984,   986,  1036,  1104,  1120,  1283,  1297,  1368,
        1395,  1498,  1513,  1575,  1635,  1657,  1676,  1732,  1748,
        1829,  1840,  1855,  2134,  2172,  2176,  2261,  2271,  2284,
        2286,  2353,  2374,  2407,  2431,  2451,  2591,  2614,  2616,
        2650,  2770,  2806,  2836,  2847,  3024,  3128,  3143,  3145,
        3226,  3267,  3352,  3403,  3434,  3491,  3553,  3622,  3653,
        3748,  3769,  3834,  3891,  3974,  4026,  4032,  4038,  4145,
        4185,  4196,  4226,  4254,  4286,  4325,  4355,  4392,  4458,
        4470,  4494,  4496,  4519,  4559,  4606,  4608,  4663,  4729,
        4779,  4794,  4921,  4930,  4986,  5101,  5239,  5298,  5340,
        5350,  5402,  5448,  5535,  5805,  5842,  5883,  5909,  5923,
        6020,  6039,

In [33]:
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] >= 4)]["movieId"]
similar_user_recs

3018         5
3019         7
3020        10
3021        11
3023        19
          ... 
6588816    494
6588817    610
6588818    661
6588819    736
6588823    780
Name: movieId, Length: 98831, dtype: int64

In [35]:
similar_user_recs = similar_user_recs.value_counts()
similar_user_recs

movieId
5        838
1        403
356      403
780      369
318      334
        ... 
70206      1
70227      1
71490      1
72041      1
91886      1
Name: count, Length: 8666, dtype: int64

In [37]:
# convert into %
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

# take the movies that appears more (> 10%)
similar_user_recs = similar_user_recs[similar_user_recs > .1]
similar_user_recs

count
1     3.698091
2     1.381862
3     0.885442
4     0.576372
5     0.410501
6     0.321002
7     0.270883
8     0.198091
10    0.187351
9     0.184964
12    0.146778
11    0.132458
Name: count, dtype: float64

In [39]:
# how much popular these movies are (all people liked)
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
all_users

Unnamed: 0,userId,movieId,rating,timestamp
1459,9,2,5.0,8.593831e+08
1460,9,10,5.0,8.593834e+08
3864,23,3,5.0,9.431357e+08
3867,23,11,5.0,9.429695e+08
5101,36,1,5.0,8.571314e+08
...,...,...,...,...
6627919,42896,1,5.0,1.227295e+09
6628833,42898,1,4.5,1.364687e+09
6629257,42905,7,5.0,9.403186e+08
6629258,42905,11,5.0,9.403175e+08


In [41]:
# convert into % 
all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
all_users_recs


movieId
1     0.591111
6     0.228535
10    0.104176
11    0.103593
2     0.077462
7     0.047713
3     0.034298
5     0.028581
12    0.007583
9     0.007233
4     0.006300
8     0.003733
Name: count, dtype: float64

In [43]:
# how much similar users liked them and all users liked them
rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis = 1)
rec_percentages.columns = ["similar", "all"]
rec_percentages

Unnamed: 0,similar,all
1,3.698091,0.591111
2,1.381862,0.077462
3,0.885442,0.034298
4,0.576372,0.0063
5,0.410501,0.028581
6,0.321002,0.228535
7,0.270883,0.047713
8,0.198091,0.003733
10,0.187351,0.104176
9,0.184964,0.007233


In [45]:
# ratio similar and all users liked movies 
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

# sort all the values
rec_percentages = rec_percentages.sort_values("score", ascending = False)

rec_percentages

Unnamed: 0,similar,all,score
4,0.576372,0.0063,91.493768
8,0.198091,0.003733,53.063544
3,0.885442,0.034298,25.816343
9,0.184964,0.007233,25.572792
12,0.146778,0.007583,19.356637
2,1.381862,0.077462,17.839333
5,0.410501,0.028581,14.362515
1,3.698091,0.591111,6.256174
7,0.270883,0.047713,5.677285
10,0.187351,0.104176,1.7984


In [47]:
# take top 10 recommendations and add them to movies data to get the titles
rec_percentages.head(10).merge(movies, left_index = True, right_on = "movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
3,0.576372,0.0063,91.493768,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
7,0.198091,0.003733,53.063544,8,Tom and Huck (1995),Adventure|Children,Tom and Huck 1995
2,0.885442,0.034298,25.816343,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
8,0.184964,0.007233,25.572792,9,Sudden Death (1995),Action,Sudden Death 1995
11,0.146778,0.007583,19.356637,12,Dracula: Dead and Loving It (1995),Comedy|Horror,Dracula Dead and Loving It 1995
1,1.381862,0.077462,17.839333,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
4,0.410501,0.028581,14.362515,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
0,3.698091,0.591111,6.256174,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
6,0.270883,0.047713,5.677285,7,Sabrina (1995),Comedy|Romance,Sabrina 1995
9,0.187351,0.104176,1.7984,10,GoldenEye (1995),Action|Adventure|Thriller,GoldenEye 1995


In [49]:
# recommendation function
def find_similar_movies(movie_id):
    # anyone who watched and rated that movie avobe 4
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] >= 4)]["userId"].unique()
    # get ids of other movies they liked and rated above 4
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] >= 4)]["movieId"]
    
    # convert into %
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    # take the movies that appears more (> 10%)
    similar_user_recs = similar_user_recs[similar_user_recs > .1]
    
    # how much popular these movies are (all people liked)
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
    # convert into % 
    all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    
    # how much similar users liked them and all users liked them
    rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis = 1)
    rec_percentages.columns = ["similar", "all"]
    
    # ratio similar and all users liked movies 
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    
    # sort all the values
    rec_percentages = rec_percentages.sort_values("score", ascending = False)
    
    # take top 10 recommendations and add them to movies data to get the titles
    return rec_percentages.head(10).merge(movies, left_index = True, right_on = "movieId")[["score", "title", "genres"]]

In [51]:
# input field and label
movie_name_input = widgets.Text(
    value = "Toy Story",
    description = "Movie Title:",
    disabled = False
)

# create output widget
recommandation_list = widgets.Output()

# function for output
def on_type(data):
    with recommandation_list:
        recommandation_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))

# make search box active on type
movie_name_input.observe(on_type, names = "value")

# display the widgets
display(movie_name_input, recommandation_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [53]:
results = search("Batman")
movie_id = results.iloc[0]["movieId"]
display(find_similar_movies(movie_id))

Unnamed: 0,score,title,genres
8614,1310.5625,Batman (1966),Action|Adventure|Comedy
4306,453.383784,Sons of Katie Elder (1965),Western
11409,206.083538,"36th Chamber of Shaolin, The (Shao Lin san shi...",Action|Adventure
4335,167.91992,"Big Boss, The (Fists of Fury) (Tang shan da xi...",Action|Thriller
5741,153.170197,Scanners (1981),Horror|Sci-Fi|Thriller
7439,137.853178,Jason and the Argonauts (1963),Action|Adventure|Fantasy
10415,137.389025,Zathura (2005),Action|Adventure|Children|Fantasy
4835,129.538224,Flash Gordon (1980),Action|Adventure|Sci-Fi
1506,122.235824,Batman & Robin (1997),Action|Adventure|Fantasy|Thriller
10469,119.311522,"Matador, The (2005)",Comedy|Drama|Thriller
