In [1]:
import pandas as pd

all_movies = pd.read_csv("movies.csv")

In [2]:
print (all_movies)

       movieId                               title  \
0            1                    Toy Story (1995)   
1            2                      Jumanji (1995)   
2            3             Grumpier Old Men (1995)   
3            4            Waiting to Exhale (1995)   
4            5  Father of the Bride Part II (1995)   
...        ...                                 ...   
62418   209157                           We (2018)   
62419   209159           Window of the Soul (2001)   
62420   209163                    Bad Poems (2018)   
62421   209169                 A Girl Thing (2001)   
62422   209171      Women of Devil's Island (1962)   

                                            genres  
0      Adventure|Animation|Children|Comedy|Fantasy  
1                       Adventure|Children|Fantasy  
2                                   Comedy|Romance  
3                             Comedy|Drama|Romance  
4                                           Comedy  
...                              

In [8]:
import re

def clean_title(title):
    title = re.sub("[^a-zA-Z0-9 ]", "", title)
    return title

In [9]:
all_movies["clean_title"] = all_movies["title"].apply(clean_title)

In [10]:
all_movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2))

tfidf_vectorizer = vectorizer.fit_transform(all_movies["clean_title"])

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title):
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf_vectorizer).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results =all_movies.iloc[indices].iloc[::-1]
    
    return results

In [14]:
import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Text(
    value='jumanji',
    description='Movie Title:',
    disabled=False
)
movie_list = widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            display(search(title))

movie_input.observe(on_type, names='value')


display(movie_input, movie_list)

Text(value='jumanji', description='Movie Title:')

Output()

In [15]:
movie_ratings = pd.read_csv("ratings.csv")

In [16]:
movie_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [18]:
movie_id = 1 

In [23]:
user_similarity = movie_ratings[(movie_ratings["userId"].isin(similar_users)) & (movie_ratings["rating"] > 4)]["movieId"]

In [30]:
user_similarity_1 = movie_ratings[(movie_ratings["movieId"] == movie_id) & (movie_ratings["rating"] > 4)]["userId"].unique()

In [25]:
user_similarity

5101            1
5105           34
5111          110
5114          150
5127          260
            ...  
24998854    60069
24998861    67997
24998876    78499
24998884    81591
24998888    88129
Name: movieId, Length: 1358326, dtype: int64

In [31]:
user_similarity_1

array([    36,     75,     86, ..., 162527, 162530, 162533], dtype=int64)

In [32]:
user_similarity_1

array([    36,     75,     86, ..., 162527, 162530, 162533], dtype=int64)

In [33]:
user_similarity_1 = similar_user_recs.value_counts() / len(similar_users)
user_similarity_1 = similar_user_recs[similar_user_recs > .10]

In [34]:
user_similarity_1

5101            1
5105           34
5111          110
5114          150
5127          260
            ...  
24998854    60069
24998861    67997
24998876    78499
24998884    81591
24998888    88129
Name: movieId, Length: 1358326, dtype: int64

In [35]:
all_users = movie_ratings[(movie_ratings["movieId"].isin(similar_user_recs.index)) & (movie_ratings["rating"] > 4)]

In [36]:
all_users

Unnamed: 0,userId,movieId,rating,timestamp
620,3,52950,4.5,1566089429
820,3,106782,4.5,1439473659
882,3,136449,5.0,1484753762
1137,4,174055,5.0,1573938041
2411,12,49272,4.5,1167574519
...,...,...,...,...
24999762,162538,93988,4.5,1438785545
24999875,162540,43936,5.0,1248857225
24999881,162540,49278,4.5,1248856357
24999890,162540,56587,5.0,1248859016


In [37]:
all_user_1 = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [38]:
all_user_1


72998     0.190448
49272     0.161523
106782    0.154563
56367     0.134058
87232     0.076472
            ...   
154634    0.000027
154648    0.000027
161194    0.000027
173857    0.000027
130800    0.000027
Name: movieId, Length: 1532, dtype: float64

In [40]:
percentages = pd.concat([similar_user_recs, all_user_1], axis=1)
percentages.columns = ["similar", "all"]

In [41]:
percentages

Unnamed: 0,similar,all
5101,1,0.001966
5105,34,0.008128
5111,110,0.001169
5114,150,0.002364
5127,260,0.001700
...,...,...
24998854,60069,
24998861,67997,
24998876,78499,
24998884,81591,


In [42]:
percentages["score"] = percentages["similar"] / percentages["all"]

In [43]:
percentages = percentages.sort_values("score", ascending=False)

In [44]:
percentages

Unnamed: 0,similar,all,score
208615,201588,0.000027,7.589385e+09
127234,183869,0.000027,6.922300e+09
173513,179491,0.000027,6.757477e+09
173501,177593,0.000027,6.686021e+09
142921,176933,0.000027,6.661174e+09
...,...,...,...
24998854,60069,,
24998861,67997,,
24998876,78499,,
24998884,81591,,


In [45]:
percentages.head(10).merge(all_movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
62293,201588,2.7e-05,7589385000.0,208615,Botinada: A Origem do Punk no Brasil (2006),Documentary,Botinada A Origem do Punk no Brasil 2006
26805,183869,2.7e-05,6922300000.0,127234,Reality (2012),Comedy|Drama,Reality 2012
46954,179491,2.7e-05,6757477000.0,173513,Mudhalvan (1999),Thriller,Mudhalvan 1999
46948,177593,2.7e-05,6686021000.0,173501,I am Jane Doe (2017),Crime|Documentary,I am Jane Doe 2017
33339,176933,2.7e-05,6661174000.0,142921,Essex Boys: Law of Survival (2015),Action|Adventure|Crime|Drama,Essex Boys Law of Survival 2015
62277,168250,2.7e-05,6334276000.0,208567,"Filhos de João, O Admirável Mundo Novo Baiano ...",Documentary,Filhos de Joo O Admirvel Mundo Novo Baiano 2009
33865,164909,2.7e-05,6208494000.0,144170,Teenage Bank Heist (2012),Thriller,Teenage Bank Heist 2012
46915,159817,2.7e-05,6016790000.0,173433,Metalocalypse: The Doomstar Requiem (2013),Animation|Comedy|Drama,Metalocalypse The Doomstar Requiem 2013
17709,148626,2.7e-05,5595472000.0,92441,"Human Resources Manager, The (2010)",Drama,Human Resources Manager The 2010
33221,143355,2.7e-05,5397029000.0,142654,Alien Opponent (2011),Action|Comedy|Sci-Fi,Alien Opponent 2011


In [46]:
def find_similar_movies(movie_id):
    user_similarity = movie_ratings[(movie_ratings["movieId"] == movie_id) & (movie_ratings["rating"] > 4)]["userId"].unique()
    user_similarity_1movie_ = movie_ratings[(movie_ratings["userId"].isin(similar_users)) & (movie_ratings["rating"] > 4)]["movieId"]
    user_similarity_1 = user_similarity_1.value_counts() / len(user_similarity)

    user_similarity_1 = user_similarity_1[user_similarity_1 > .10]
    user_similarity = ratings[(movie_ratings["movieId"].isin(user_similarity_1.index)) & (movie_ratings["rating"] > 4)]
    user_similarity_1 = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    percentages = pd.concat([user_similarity_1, all_user_1], axis=1)
    percentages.columns = ["similar", "all"]
    
    percentages["score"] = percentages["similar"] / percentages["all"]
    percentages = percentages.sort_values("score", ascending=False)
    return percentages.head(10).merge(all_movies, left_index=True, right_on="movieId")[["s|core", "title", "genres"]]

In [49]:
movie_input_name = widgets.Text(
    name='jumanji',
    descrip='Movie Title:',
    disable=False
)
list_of_recommendation = widgets.Output()

def on_type(data):
    with list_of_recommendation:
        list_of_recommendation.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))

movie_input_name.observe(on_type, names='value')

display(movie_input_name, list_of_recommendation)

Text(value='')

Output()