## This is a movie recommendation system where you type in a movie and get 10 other movies you might want to watch

In [2]:
# Loading the data set
import pandas as pd

# movies.csv supplies the movieId, title and genres columns used throughout the notebook
movies = pd.read_csv('movies.csv')

In [3]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


## Data cleaning

In [9]:
# importing regular expression package
import re

def clean_title(title):
  """Strip everything except ASCII letters, digits and spaces from a title.

  Extra characters such as parentheses, commas and apostrophes make
  searching harder, so they are removed:
  "Toy Story (1995)" becomes "Toy Story 1995".
  """
  disallowed = "[^a-zA-Z0-9 ]"
  return re.sub(disallowed, "", title)

In [10]:
# Create a new clean_title column holding every title after it has been
# passed through clean_title().
movies["clean_title"] = movies["title"].map(clean_title)

In [11]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


## Creating a **TFIDF MATRIX**
- Computers work with numbers rather than characters, so we need to convert our titles into sets of numbers. We combine a term frequency (TF) matrix with inverse document frequency (IDF), which gives more weight to distinctive terms; as a result we get a vector for each movie (**TF * IDF**).

In [12]:
# importing packages to help form the matrix
from sklearn.feature_extraction.text import TfidfVectorizer

# ngram_range=(1,2): use single words and pairs of consecutive words as terms
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# learning the vocabulary from the cleaned titles and forming the TF-IDF matrix
tfidf = vectorizer.fit_transform(movies['clean_title'])

## Creating a search function
- first I compute the similarity between the word I enter and all the movies in our list

In [14]:
#importing the packages we need
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
  """Return the movies whose cleaned titles best match ``title``.

  The query is cleaned with ``clean_title``, turned into a vector with the
  fitted ``vectorizer`` and compared with every row of ``tfidf`` using
  cosine similarity.

  Args:
    title: Free-text movie title typed by the user.
    top_n: Maximum number of matches to return (default 5).

  Returns:
    The rows of ``movies`` for the ``top_n`` most similar titles, ordered
    from most to least similar, so ``.iloc[0]`` is the best match.
  """
  title = clean_title(title)  # clean the query the same way the titles were cleaned
  query_vec = vectorizer.transform([title])  # query text -> TF-IDF vector
  similarity = cosine_similarity(query_vec, tfidf).flatten()  # one score per movie

  # Never ask for more rows than exist; argpartition raises otherwise.
  top_n = min(top_n, len(similarity))
  if top_n <= 0:
    return movies.iloc[[]]

  # argpartition only guarantees that the last top_n positions hold the
  # top_n largest scores, not that they are in order, so rank them explicitly.
  candidates = np.argpartition(similarity, -top_n)[-top_n:]
  indices = candidates[np.argsort(similarity[candidates])[::-1]]
  return movies.iloc[indices]

## Building an Interactive Search Box

In [20]:
#importing dependencies
import ipywidgets as widgets
from IPython.display import display

# Input widget: text box the user types a movie title into (pre-filled with "Toy Story")
movie_input = widgets.Text(
    value = "Toy Story",
    description ="Movie Title:",
    disabled = False
)

# Output widget: area that on_type clears and then fills with the search results
movie_list = widgets.Output()

def on_type(data):
  """Refresh the search results whenever the text in the input widget changes.

  ``data`` is the change notification handed over by ``observe``; its
  ``'new'`` entry is read as the text currently typed. Text of five
  characters or fewer is ignored, which leaves the output area empty.
  """
  with movie_list:
    movie_list.clear_output()
    typed = data['new']
    if len(typed) <= 5:
      return
    display(search(typed))

# Call on_type every time the value of the text box changes
movie_input.observe(on_type, names='value')

# Show the text box followed by the results area
display(movie_input, movie_list)



Text(value='Toy Story', description='Movie Title:')

Output()

## Finding movies similar to the ones I like, also finding users who liked the same movie, and building the recommender system

In [21]:
# reading the ratings dataset: each row holds a userId, a movieId, the rating given and a timestamp
ratings = pd.read_csv('ratings.csv')
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
8898526,57993,180095,4.5,1527272216
8898527,57993,187593,5.0,1527272211
8898528,57994,21,1.0,839950400
8898529,57994,39,1.0,839950400


In [44]:
movie_id = 1 # movieId 1 is Toy Story (1995)

In [47]:
# users who rated the chosen movie above 4, i.e. the users who liked it
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings['rating'] > 4)]["userId"].unique()

similar_users


array([   36,    75,    86, ..., 57966, 57970, 57979])

In [52]:
# Share of the similar users who liked each movie. Computed straight from
# `ratings` and `similar_users` so this cell also runs on a fresh kernel,
# before `similar_user_recs` has been assigned.
(
    ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
    .value_counts()
    / len(similar_users)
)

1         1.000000
318       0.442558
260       0.391317
296       0.367543
356       0.361341
            ...   
149727    0.000148
132140    0.000148
127120    0.000148
119454    0.000148
31026     0.000148
Name: movieId, Length: 14288, dtype: float64

In [50]:
# finding the other movies those similar users also liked (rated above 4);
# one entry per (user, movie) rating, so popular movies appear many times
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

similar_user_recs

5101            1
5105           34
5111          110
5114          150
5127          260
            ...  
8897191    135569
8897193    137857
8897194    142997
8897195    143355
8897197    166528
Name: movieId, Length: 484748, dtype: int64

In [53]:
# finding the movies that more than 10% of the similar users also liked, to filter the recommendations
# NOTE: this cell overwrites similar_user_recs with the result, so re-run the
# cell that builds similar_user_recs before running this one a second time.
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

# keep only the movies liked by more than 10% of the similar users
similar_user_recs = similar_user_recs[similar_user_recs > .1]

similar_user_recs

1        1.000000
318      0.442558
260      0.391317
296      0.367543
356      0.361341
           ...   
1259     0.102185
4896     0.101742
953      0.101299
551      0.101004
59315    0.100709
Name: movieId, Length: 111, dtype: float64

In [54]:
# every rating above 4, from any user, for the movies kept in similar_user_recs
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings['rating'] > 4)]


In [55]:
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
29,1,4973,4.5,1147869080
48,1,7361,5.0,1147880055
72,2,110,5.0,1141416589
76,2,260,5.0,1141417172
...,...,...,...,...
8898480,57993,5952,4.5,1527274126
8898482,57993,7153,5.0,1527274209
8898487,57993,48780,4.5,1527273878
8898493,57993,79132,5.0,1527272174


In [56]:
# calculating the percentage: number of above-4 ratings per movie divided by
# the number of distinct users present in all_users
all_users_recs = all_users['movieId'].value_counts() / len(all_users['userId'].unique())

In [57]:
all_users_recs

318      0.343573
296      0.283655
2571     0.242603
356      0.232808
593      0.224108
           ...   
953      0.044836
551      0.040959
50872    0.039828
78499    0.034856
2355     0.025340
Name: movieId, Length: 111, dtype: float64

## Creating a recommendation Score

In [58]:
# put the two percentages side by side, aligned on their shared movieId index
rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
# 'similar' = share among similar users, 'all' = share among all users
rec_percentages.columns = ['similar', 'all']

In [59]:
rec_percentages

Unnamed: 0,similar,all
1,1.000000,0.125624
32,0.155641,0.098670
34,0.139102,0.052646
47,0.232575,0.144304
50,0.278500,0.200697
...,...,...
59315,0.100709,0.054297
60069,0.163615,0.077003
68954,0.158151,0.064890
78499,0.145747,0.034856


In [60]:
# creating a score: share among similar users divided by share among all users,
# so a higher score means the movie is liked more by similar users than by users overall
rec_percentages['score'] = rec_percentages['similar'] / rec_percentages['all']

In [61]:
# sorting the values so the highest scores come first
rec_percentages = rec_percentages.sort_values('score', ascending=False)

In [62]:
rec_percentages

Unnamed: 0,similar,all,score
1,1.000000,0.125624,7.960278
3114,0.277171,0.053629,5.168260
2355,0.112670,0.025340,4.446334
78499,0.145747,0.034856,4.181370
4886,0.231394,0.071809,3.222360
...,...,...,...
2858,0.214117,0.166639,1.284916
2329,0.119758,0.094496,1.267331
79132,0.164796,0.131430,1.253870
4973,0.141022,0.112787,1.250340


In [63]:
# taking the top ten recommendations and merging them with our movie data
# (the index of rec_percentages is matched against the movieId column of movies)
rec_percentages.head(10).merge(movies,left_index=True, right_on='movieId')

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.125624,7.960278,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.277171,0.053629,5.16826,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.11267,0.02534,4.446334,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.145747,0.034856,4.18137,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
4780,0.231394,0.071809,3.22236,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
580,0.211459,0.067078,3.152411,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
6258,0.224011,0.072978,3.069583,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
587,0.18163,0.060549,2.999737,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
8246,0.200532,0.068099,2.944717,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
359,0.239811,0.085035,2.820133,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994


## Building a recommendation function

In [64]:
def find_similar_movies(movie_id, top_n=10):
  """Recommend other movies liked by the users who liked ``movie_id``.

  A rating above 4 counts as "liked". Users who liked ``movie_id`` are the
  similar users. Movies liked by more than 10% of them are the candidates.
  Each candidate is scored as

      share of similar users who liked it / share of all users who liked it

  so a high score means the movie is especially popular among similar users.

  Args:
    movie_id: ``movieId`` of the movie to base the recommendations on.
    top_n: Maximum number of recommendations to return (default 10).

  Returns:
    DataFrame with ``score``, ``title`` and ``genres`` columns, highest
    score first. The queried movie itself is never part of the result.
  """
  # Filter the liked ratings once and reuse them for every step below.
  liked = ratings[ratings["rating"] > 4]

  similar_users = liked[liked["movieId"] == movie_id]["userId"].unique()
  similar_user_recs = liked[liked["userId"].isin(similar_users)]["movieId"]

  # Share of similar users who liked each movie; keep those above 10%.
  similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
  similar_user_recs = similar_user_recs[similar_user_recs > .10]

  # Share of all users (who liked any candidate) that liked each candidate.
  all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
  all_users_recs = all_users['movieId'].value_counts() / len(all_users['userId'].unique())

  rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
  rec_percentages.columns = ['similar', 'all']

  rec_percentages['score'] = rec_percentages['similar'] / rec_percentages['all']

  # Every similar user liked the queried movie, so it would always rank
  # first; drop it so only *other* movies are recommended.
  rec_percentages = rec_percentages.drop(index=movie_id, errors="ignore")

  rec_percentages = rec_percentages.sort_values('score', ascending=False)
  return rec_percentages.head(top_n).merge(movies, left_index=True, right_on='movieId')[['score', 'title', 'genres']]


## Creating an interactive recommendation widget

In [65]:
# Input widget: text box for the movie title to base recommendations on (pre-filled with "Toy Story")
movie_name_input = widgets.Text(
    value = "Toy Story",
    description ="Movie Title:",
    disabled = False
)

# Output widget: area that on_type clears and then fills with the recommendations
recommendation_list = widgets.Output()

def on_type(data):
  """Refresh the recommendations whenever the text in the input widget changes.

  ``data`` is the change notification handed over by ``observe``; its
  ``'new'`` entry is read as the text currently typed. Text of five
  characters or fewer is ignored. Otherwise the first row returned by
  ``search`` is taken as the intended movie and its recommendations shown.

  NOTE(review): this reuses the name ``on_type`` from the search-box cell.
  That earlier callback stays registered because ``observe`` was given the
  function object itself, but a distinct name would be clearer.
  """
  with recommendation_list:
    recommendation_list.clear_output()
    typed = data['new']
    if len(typed) <= 5:
      return
    first_match = search(typed).iloc[0]
    display(find_similar_movies(first_match['movieId']))

# Call on_type every time the value of the text box changes
movie_name_input.observe(on_type, names='value')

# Show the text box followed by the recommendations area
display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()