## Interactive Movie Recommender System

### Reading data with pandas

In [None]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plot

In [None]:
# Load the movie catalogue; the displayed frame below shows the columns movieId, title and genres.
movies = pd.read_csv("movies.csv")

In [None]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


### Cleaning movie titles with regex

In [None]:
import re
# Defining a function to clean movie title
def clean_title(title):
  """Return `title` with every character other than ASCII letters, digits and spaces removed.

  Parentheses, commas and other punctuation are dropped without a replacement,
  e.g. "Toy Story (1995)" becomes "Toy Story 1995".
  """
  disallowed = re.compile("[^a-zA-Z0-9 ]")
  return disallowed.sub("", title)

In [None]:
# Creating a new column in the movies dataframe after cleaning.
# clean_title already takes a single title, so it is handed to apply directly
# instead of being wrapped in a lambda that only forwards its argument.
movies['clean_title'] = movies['title'].apply(clean_title)
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


### Creating a TFIDF matrix
Term frequency - Inverse Document Frequency

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# ngram_range=(1, 2): the vocabulary contains single words and pairs of adjacent words,
# so a query like "Toy Story" can also match on the two-word phrase.
vectorizer = TfidfVectorizer(ngram_range = (1,2))
# Learn the vocabulary from the cleaned titles and encode them; search_title() compares queries against `tfidf`.
tfidf = vectorizer.fit_transform(movies['clean_title']) # The titles are converted to vectors

### Using cosine similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def search_title(title, top_n=5):
  """Return the movies whose cleaned titles are most similar to `title`, best match first.

  The query is cleaned with clean_title(), encoded with the module-level
  `vectorizer`, and compared against the module-level `tfidf` matrix using
  cosine similarity.

  Parameters
  ----------
  title : str
      Free-text movie title typed by the user.
  top_n : int, default 5
      Maximum number of rows to return.

  Returns
  -------
  pandas.DataFrame
      Up to `top_n` rows of `movies`, ordered by descending similarity, so
      `.iloc[0]` is the closest match. If the query shares no terms with any
      title, every similarity is 0 and the rows returned are arbitrary.
  """
  title = clean_title(title)
  query_vec = vectorizer.transform([title])
  similarity = cosine_similarity(query_vec, tfidf).flatten()

  # argpartition raises when asked for more elements than exist, so clamp the request.
  top_n = min(top_n, len(similarity))
  if top_n <= 0:
    return movies.iloc[[]]

  # argpartition only guarantees that the top_n largest similarities end up in the
  # last top_n slots; it says nothing about their order within those slots.
  candidates = np.argpartition(similarity, -top_n)[-top_n:]
  # Rank just those few candidates explicitly, highest similarity first.
  ranked = candidates[np.argsort(-similarity[candidates], kind="stable")]
  return movies.iloc[ranked]

### Jupyter Notebook Widget

In [None]:
import ipywidgets as widgets
from IPython.display import display

In [None]:
# Text box the user types a movie title into.
movie_input = widgets.Text(
    value = 'Toy Story', # The default value
    description = 'Movie Title',
    disabled = False
)

# Output area that on_type() clears and refills with the search results.
movie_list = widgets.Output()

def on_type(data):
  """Refresh the search results whenever the text box value changes.

  `data` is the change notification registered through `movie_input.observe`;
  only its 'new' entry (the current text) is read. The output area is always
  cleared, and results are shown only once more than 5 characters are typed.
  """
  with movie_list:
    movie_list.clear_output()
    typed = data['new']
    # Too short to search meaningfully: leave the output area empty.
    if len(typed) <= 5:
      return
    display(search_title(typed))

# Call on_type for changes to the text box's 'value' trait only, then render the box with its results area.
movie_input.observe(on_type,names = 'value')
display(movie_input,movie_list)

Text(value='Toy Story', description='Movie Title')

Output()

### Reading movie ratings data

In [None]:
# Load the user ratings; the displayed frame below shows the columns userId, movieId, rating and timestamp.
ratings = pd.read_csv('ratings.csv')

In [None]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
17615300,114192,353,3.5,1076965437
17615301,114192,361,2.0,1043260151
17615302,114192,368,3.0,1076965129
17615303,114192,377,5.0,1043160575


In [None]:
ratings.dtypes

Unnamed: 0,0
userId,int64
movieId,int64
rating,float64
timestamp,int64


### Finding users who liked the same movie

In [None]:
# movieId used for the step-by-step walkthrough; the final merged table in this section lists 2273 as Rush Hour (1998).
given_movie_id = 2273

In [None]:
# Users who rated the chosen movie 5 or higher: they watched the same movie as us and liked it.
similar_users = ratings.loc[
    (ratings["movieId"] == given_movie_id) & (ratings["rating"] >= 5),
    "userId",
].unique()

In [None]:
similar_users

array([    30,    235,    240,    437,    597,    653,    935,   1041,
         1099,   1167,   1581,   1610,   1770,   2194,   2385,   2418,
         2444,   2464,   3279,   3394,   3403,   3969,   4047,   4124,
         4374,   4555,   5045,   5298,   5362,   5507,   5712,   5844,
         6184,   6393,   6674,   6745,   7361,   7412,   7423,   7470,
         7588,   7917,   8028,   8154,   8392,   8470,   8525,   8565,
         8641,   8770,   9900,  10041,  10178,  10331,  10361,  10813,
        11005,  11042,  11094,  11109,  11123,  11181,  11216,  11236,
        11279,  11556,  11578,  12002,  12220,  12646,  13020,  13098,
        13196,  13745,  13835,  13902,  13908,  14057,  14074,  14597,
        14765,  14942,  15092,  15116,  15125,  15196,  15269,  15476,
        15602,  15858,  16107,  16171,  16451,  16565,  16677,  16977,
        17344,  18010,  18065,  18126,  18236,  18347,  18367,  18391,
        18552,  18841,  19075,  19174,  19284,  20052,  20177,  20286,
      

In [None]:
# movieId of every rating above 4 given by a similar user (one entry per such rating).
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & ratings["rating"].gt(4),
    "movieId",
]

In [None]:
similar_user_recs

Unnamed: 0,movieId
4530,410
4533,1136
4534,1197
4536,1220
4538,1580
...,...
17612682,176423
17612685,179135
17612687,179819
17612695,188301


In [None]:
# Turn the raw movieId entries into, per movie, the fraction of similar users who rated it above 4.
# (The 10% cut-off is applied in a later cell, not here.)
# NOTE: this rebinds similar_user_recs, so re-running this cell without first re-running the
# cell that builds the raw movieId Series would count the fractions instead of the movieIds.
similar_user_recs = similar_user_recs.value_counts()/len(similar_users)

In [None]:
similar_user_recs

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
2273,1.000000
2571,0.552198
260,0.423077
356,0.409341
318,0.401099
...,...
6797,0.001374
6813,0.001374
6907,0.001374
6912,0.001374


In [None]:
similar_user_recs = similar_user_recs[similar_user_recs > 0.1] # Keep only the movies rated above 4 by more than 10% of the similar users (strictly greater than 0.1)

In [None]:
similar_user_recs

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
2273,1.000000
2571,0.552198
260,0.423077
356,0.409341
318,0.401099
...,...
110102,0.100275
2987,0.100275
3052,0.100275
53322,0.100275


In [None]:
# Ratings above 4 given to the shortlisted movies by ANY user, not only the similar ones.
all_users = ratings.loc[
    ratings['movieId'].isin(similar_user_recs.index) & ratings['rating'].gt(4)
]
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
72,2,110,5.0,1141416589
76,2,260,5.0,1141417172
79,2,318,5.0,1141417181
82,2,356,4.5,1141416637
...,...,...,...,...
17615271,114192,1,5.0,1076965175
17615272,114192,6,4.5,1076965171
17615276,114192,32,5.0,1043161382
17615294,114192,260,5.0,1043268625


In [None]:
# Per movie: fraction of the users in `all_users` (anyone with at least one rating above 4
# on a shortlisted movie) who rated that movie above 4.
all_users_rec = all_users["movieId"].value_counts().div(all_users["userId"].nunique())

In [None]:
all_users_rec

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
318,0.337671
296,0.281755
2571,0.241100
356,0.231734
593,0.223096
...,...
3624,0.006222
2002,0.005625
1918,0.005569
6548,0.004291


### Creating a recommendation Score

In [None]:
# Put both shares side by side, indexed by movieId: "similar" is the share among
# similar users, "all" is the share among all users who liked a shortlisted movie.
comparision_percentages = pd.concat(
    [similar_user_recs, all_users_rec],
    axis=1,
    keys=["similar", "all"],
)
comparision_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
2273,1.000000,0.011110
2571,0.552198,0.241100
260,0.423077,0.219206
356,0.409341,0.231734
318,0.401099,0.337671
...,...,...
110102,0.100275,0.016437
2987,0.100275,0.025663
3052,0.100275,0.027585
53322,0.100275,0.009683


In [None]:
# We want movies that users similar to us like far more often than users in general do.

In [None]:
# Creating a recommendation score:
# score = share among similar users / share among all users. A value well above 1 means
# similar users like the movie disproportionately often; around 1 means it is simply popular with everyone.
comparision_percentages["score"] = comparision_percentages["similar"]/comparision_percentages["all"]

In [None]:
comparision_percentages

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2273,1.000000,0.011110,90.005038
2571,0.552198,0.241100,2.290323
260,0.423077,0.219206,1.930043
356,0.409341,0.231734,1.766422
318,0.401099,0.337671,1.187839
...,...,...,...
110102,0.100275,0.016437,6.100482
2987,0.100275,0.025663,3.907324
3052,0.100275,0.027585,3.635120
53322,0.100275,0.009683,10.355539


In [None]:
# Rank the movies so the highest recommendation score comes first.
comparision_percentages = comparision_percentages.sort_values("score", ascending = False)

In [None]:
comparision_percentages

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2273,1.000000,0.011110,90.005038
54648,0.134615,0.001735,77.581886
4701,0.348901,0.006595,52.900710
3624,0.182692,0.006222,29.361146
6548,0.101648,0.004291,23.687602
...,...,...,...
527,0.243132,0.212788,1.142603
2858,0.181319,0.165398,1.096257
296,0.281593,0.281755,0.999427
1193,0.107143,0.117532,0.911603


A higher score implies a better recommendation.

In [None]:
# Merging the datasets for the final movie recommendations:
# take the 10 best-scoring movieIds (the index of the left frame) and join them to `movies` on its
# movieId column to pick up title and genres.
comparision_percentages.head(10).merge(movies,left_index = True, right_on = "movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
2182,1.0,0.01111,90.005038,2273,Rush Hour (1998),Action|Comedy|Crime|Thriller,Rush Hour 1998
11774,0.134615,0.001735,77.581886,54648,Rush Hour 3 (2007),Action|Comedy|Crime|Thriller,Rush Hour 3 2007
4596,0.348901,0.006595,52.90071,4701,Rush Hour 2 (2001),Action|Comedy,Rush Hour 2 2001
3525,0.182692,0.006222,29.361146,3624,Shanghai Noon (2000),Action|Adventure|Comedy|Western,Shanghai Noon 2000
6425,0.101648,0.004291,23.687602,6548,Bad Boys II (2003),Action|Comedy|Crime|Thriller,Bad Boys II 2003
1829,0.116758,0.005569,20.964852,1918,Lethal Weapon 4 (1998),Action|Comedy|Crime|Thriller,Lethal Weapon 4 1998
1913,0.111264,0.005625,19.779482,2002,Lethal Weapon 3 (1992),Action|Comedy|Crime|Drama,Lethal Weapon 3 1992
143,0.184066,0.010224,18.002858,145,Bad Boys (1995),Action|Comedy|Crime|Drama|Thriller,Bad Boys 1995
4264,0.115385,0.007454,15.480312,4369,"Fast and the Furious, The (2001)",Action|Crime|Thriller,Fast and the Furious The 2001
2077,0.175824,0.011969,14.690295,2167,Blade (1998),Action|Horror|Thriller,Blade 1998


### Building a recommendation function

In [None]:
def watch_next(given_movie_id, top_n=10, min_similar_share=0.1, love_rating=5, like_rating=4):
  """Recommend movies that fans of `given_movie_id` like disproportionately often.

  Reads the module-level `ratings` and `movies` frames.

  Parameters
  ----------
  given_movie_id : int
      movieId of the movie the recommendations are based on.
  top_n : int, default 10
      Number of best-scoring movies to return.
  min_similar_share : float, default 0.1
      A movie is kept only if strictly more than this fraction of the similar
      users liked it.
  love_rating : float, default 5
      Users who rated `given_movie_id` at least this high count as "similar users".
  like_rating : float, default 4
      A rating strictly above this counts as "liking" a movie.

  Returns
  -------
  pandas.DataFrame
      Columns "score", "title" and "genres", ordered by descending score, where
      score = (share of similar users who liked the movie) /
              (share of all users who liked it, among users who liked at least
               one of the shortlisted movies).
      The given movie itself is not filtered out; every similar user liked it,
      so it normally appears in the result.
  """
  # Users who gave the chosen movie a rating of at least `love_rating`.
  similar_users = ratings.loc[
      (ratings["movieId"] == given_movie_id) & (ratings["rating"] >= love_rating),
      "userId",
  ].unique()

  # Filter the full ratings table once; both populations below reuse this subset
  # instead of re-scanning every rating.
  liked = ratings[ratings["rating"] > like_rating]

  # Share of similar users who liked each movie, keeping only the widely liked ones.
  similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
  similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
  similar_user_recs = similar_user_recs[similar_user_recs > min_similar_share]

  # Finding how much all users like those same movies.
  all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
  all_users_rec = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

  comparision_percentages = pd.concat([similar_user_recs, all_users_rec], axis=1)
  comparision_percentages.columns = ["similar", "all"]
  comparision_percentages["score"] = comparision_percentages["similar"] / comparision_percentages["all"]
  comparision_percentages = comparision_percentages.sort_values("score", ascending=False)

  # Attach title and genres; the left frame is indexed by movieId.
  best = comparision_percentages.head(top_n)
  return best.merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [None]:
watch_next(112)

Unnamed: 0,score,title,genres
110,94.901449,Rumble in the Bronx (Hont faan kui) (1995),Action|Adventure|Comedy|Crime
770,11.737222,Eraser (1996),Action|Drama|Thriller
489,11.14711,Executive Decision (1996),Action|Adventure|Thriller
93,10.894523,Broken Arrow (1996),Action|Adventure|Thriller
348,7.177209,"Crow, The (1994)",Action|Crime|Fantasy|Thriller
102,7.053489,Happy Gilmore (1996),Comedy
161,7.014421,Desperado (1995),Action|Romance|Western
721,6.002372,Twister (1996),Action|Adventure|Romance|Thriller
718,5.209481,"Rock, The (1996)",Action|Adventure|Thriller
764,4.55653,Independence Day (a.k.a. ID4) (1996),Action|Adventure|Sci-Fi|Thriller


### Creating an interactive recommendation widget

In [None]:
# Text box for the recommendation widget.
# NOTE: this rebinds the name `movie_input` already used by the title-search widget earlier in the notebook.
movie_input = widgets.Text(
    value = "Toy Story",
    description = "Movie Title:",
    disabled = False
)

# Output area that on_type() clears and refills with the recommendations.
recommendation_list = widgets.Output()

def on_type(data):
  """Refresh the recommendations whenever the text box value changes.

  `data` is the change notification registered through `movie_input.observe`;
  only its "new" entry (the current text) is read. The output area is always
  cleared. Once more than 5 characters are typed, the first row returned by
  search_title() is taken as the intended movie and its watch_next()
  recommendations are displayed.
  """
  with recommendation_list:
    recommendation_list.clear_output() # start from an empty output area on every change
    typed = data["new"]
    # Too short to search meaningfully: leave the output area empty.
    if len(typed) <= 5:
      return
    matches = search_title(typed) # movies whose titles resemble what the user typed
    first_match_id = matches.iloc[0]["movieId"] # movieId of the first row in the search results
    display(watch_next(first_match_id))

# Call on_type for changes to the text box's "value" trait only, then render the box with its recommendations area.
movie_input.observe(on_type, names = "value")
display(movie_input,recommendation_list)


Text(value='Toy Story', description='Movie Title:')

Output()