# Authored by Melvern Amadio Hidayat

In [16]:
import pandas as pd
import numpy as np

In [17]:
df = pd.read_csv('movies.csv')
df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [18]:
df.describe()

Unnamed: 0,movieId
count,62423.0
mean,122220.387646
std,63264.744844
min,1.0
25%,82146.5
50%,138022.0
75%,173222.0
max,209171.0


In [12]:
import re
def clean_title(title):
    title = re.sub("[^a-zA-Z0-9 ]", "", title)
    return title

In [13]:
df["clean_title"] = df["title"].apply(clean_title)
df.head()

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


## Text Preprocessing using Count Vectorizer

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2))

tfidf = vectorizer.fit_transform(df["clean_title"])

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

def search(title):
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = df.iloc[indices].iloc[::-1]

    return results

In [23]:
import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Text(value='',description='Movie Title:',disabled=False)
movie_list = widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            display(search(title))

movie_input.observe(on_type, names='value')


display(movie_input, movie_list)

Text(value='', description='Movie Title:')

Output()

In [28]:
rate_df = pd.read_csv("ratings.csv")
rate_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [43]:
similar_users = rate_df[(rate_df ["movieId"] == movie_id) & (rate_df["rating"] > 4)]["userId"].unique()

In [44]:
similar_user_recs = rate_df[(rate_df["userId"].isin(similar_users)) & (rate_df["rating"] > 4)]["movieId"]
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
similar_user_recs = similar_user_recs[similar_user_recs > .10]

In [45]:
all_users = ratings[(rate_df["movieId"].isin(similar_user_recs.index)) & (rate_df["rating"] > 4)]

  all_users = ratings[(rate_df["movieId"].isin(similar_user_recs.index)) & (rate_df["rating"] > 4)]


In [46]:
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [47]:
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
89745.0,1.000000,0.039933
58559.0,0.666667,0.131448
79132.0,0.614035,0.129784
59315.0,0.508772,0.046589
2571.0,0.473684,0.226290
...,...,...
3300.0,0.105263,0.014975
1721.0,0.105263,0.064892
89864.0,0.105263,0.008319
103335.0,0.105263,0.008319


In [48]:
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
rec_percentages = rec_percentages.sort_values("score", ascending=False)
rec_percentages.head(10).merge(df, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres
17067,1.0,0.039933,25.041667,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX
43646,0.122807,0.004992,24.602339,166528,Rogue One: A Star Wars Story (2016),Action|Adventure|Fantasy|Sci-Fi
25068,0.140351,0.006656,21.087719,122914,Avengers: Infinity War - Part II (2019),Action|Adventure|Sci-Fi
19733,0.140351,0.006656,21.087719,102445,Star Trek Into Darkness (2013),Action|Adventure|Sci-Fi|IMAX
5351,0.122807,0.006656,18.451754,5459,Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (...,Action|Comedy|Sci-Fi
18896,0.122807,0.006656,18.451754,98491,Paperman (2012),Animation|Comedy|Romance
11728,0.122807,0.006656,18.451754,54259,Stardust (2007),Adventure|Comedy|Fantasy|Romance
17501,0.122807,0.006656,18.451754,91630,Mission: Impossible - Ghost Protocol (2011),Action|Adventure|Thriller|IMAX
25072,0.175439,0.009983,17.573099,122922,Doctor Strange (2016),Action|Adventure|Sci-Fi
18747,0.140351,0.008319,16.870175,97913,Wreck-It Ralph (2012),Animation|Comedy


In [50]:
def find_similar_movies(movie_id):
    similar_users = rate_df[(rate_df["movieId"] == movie_id) & (rate_df["rating"] > 4)]["userId"].unique()
    similar_user_recs = rate_df[(rate_df["userId"].isin(similar_users)) & (rate_df["rating"] > 4)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

    similar_user_recs = similar_user_recs[similar_user_recs > .10]
    all_users = rate_df[(rate_df["movieId"].isin(similar_user_recs.index)) & (rate_df["rating"] > 4)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(10).merge(df, left_index=True, right_on="movieId")[["score", "title", "genres"]]


In [51]:
import ipywidgets as widgets
from IPython.display import display

movie_name_input = widgets.Text(
    value='',
    description='Movie Title:',
    disabled=False
)
recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))

movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='', description='Movie Title:')

Output()