In [1]:
import pandas as pd

In [2]:
# Load the movie catalogue and the user ratings from CSV files (paths are relative to the working directory)
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
# Show the catalogue as this cell's output
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [3]:
# Show the ratings table (columns: userId, movieId, rating, timestamp)
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1.147880e+09
1,1,306,3.5,1.147869e+09
2,1,307,5.0,1.147869e+09
3,1,665,5.0,1.147879e+09
4,1,899,3.5,1.147869e+09
...,...,...,...,...
3033674,20032,1198,4.0,9.976863e+08
3033675,20032,1206,4.0,9.962589e+08
3033676,20032,1265,3.0,9.962597e+08
3033677,20032,1580,4.0,9.962602e+08


In [4]:
# Summarise the movies frame: column dtypes, non-null counts and memory usage
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [5]:
import re

# method to clean up the movie titles - get rid of anything that's not a space, letter, or number
def clean_title(title):
  """Return ``title`` reduced to ASCII letters, digits and spaces.

  Accented letters are folded to their base letter first (``'é'`` -> ``'e'``) so that
  they are kept rather than deleted; punctuation and every other character is removed.
  Plain-ASCII titles produce exactly the same result as a bare regex substitution.

  Parameters
  ----------
  title : str
      Raw movie title, e.g. ``'Toy Story (1995)'``.

  Returns
  -------
  str
      Cleaned title, e.g. ``'Toy Story 1995'``.
  """
  import unicodedata  # local import so this cell does not depend on another import cell

  # NFKD splits an accented character into base letter + combining mark; the combining
  # mark is non-ASCII and is then stripped by the filter below, leaving the base letter.
  decomposed = unicodedata.normalize('NFKD', title)
  return re.sub('[^a-zA-Z0-9 ]', '', decomposed)

# Add a search-friendly copy of each title as a new column; the 'title' column itself is not modified
movies['clean_title'] = movies['title'].apply(clean_title)
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Turn each cleaned title into a TF-IDF weighted vector. ngram_range = (1,2) makes the vocabulary
# contain single words as well as two-word sequences (e.g. 'toy' and 'toy story').
vectorizer = TfidfVectorizer(ngram_range = (1,2))

# Learn the vocabulary from all cleaned titles and encode them in one step
tfidf = vectorizer.fit_transform(movies['clean_title'])
# Debugging aid (left disabled): list the vocabulary terms the vectorizer learned
# dir(vectorizer)
# temp = list(vectorizer.vocabulary_)
# temp

['toy',
 'story',
 '1995',
 'toy story',
 'story 1995',
 'jumanji',
 'jumanji 1995',
 'grumpier',
 'old',
 'men',
 'grumpier old',
 'old men',
 'men 1995',
 'waiting',
 'to',
 'exhale',
 'waiting to',
 'to exhale',
 'exhale 1995',
 'father',
 'of',
 'the',
 'bride',
 'part',
 'ii',
 'father of',
 'of the',
 'the bride',
 'bride part',
 'part ii',
 'ii 1995',
 'heat',
 'heat 1995',
 'sabrina',
 'sabrina 1995',
 'tom',
 'and',
 'huck',
 'tom and',
 'and huck',
 'huck 1995',
 'sudden',
 'death',
 'sudden death',
 'death 1995',
 'goldeneye',
 'goldeneye 1995',
 'american',
 'president',
 'american president',
 'president the',
 'the 1995',
 'dracula',
 'dead',
 'loving',
 'it',
 'dracula dead',
 'dead and',
 'and loving',
 'loving it',
 'it 1995',
 'balto',
 'balto 1995',
 'nixon',
 'nixon 1995',
 'cutthroat',
 'island',
 'cutthroat island',
 'island 1995',
 'casino',
 'casino 1995',
 'sense',
 'sensibility',
 'sense and',
 'and sensibility',
 'sensibility 1995',
 'four',
 'rooms',
 'four 

In [17]:

from sklearn.metrics.pairwise import cosine_similarity
# cosine_similarity compares HOW similar are two texts by comparing the angle between their vectors:
# same direction - more similar
# opposite direction - less similar

import numpy as np

def search(title):
  """Return the (up to) five movies whose cleaned titles best match ``title``.

  The query is cleaned with ``clean_title``, encoded with the fitted ``vectorizer`` and
  compared against every row of ``tfidf`` by cosine similarity.

  Parameters
  ----------
  title : str
      Free-text movie title typed by the user.

  Returns
  -------
  pandas.DataFrame
      Rows of ``movies`` ordered from the highest to the lowest similarity, so that
      ``.iloc[0]`` is the best match. Contains fewer than five rows only when
      ``movies`` itself has fewer than five rows.
  """
  title = clean_title(title)
  query_vec = vectorizer.transform([title])
  similarity = cosine_similarity(query_vec, tfidf).flatten()

  # Never ask for more results than there are movies (argpartition raises otherwise).
  top_n = min(5, len(similarity))
  if top_n == 0:
    return movies.iloc[[]]

  # argpartition only guarantees that the last top_n positions hold the top_n largest
  # scores - it does NOT order them - so rank the candidates explicitly, best first.
  candidates = np.argpartition(similarity, -top_n)[-top_n:]
  ranked = candidates[np.argsort(-similarity[candidates], kind = 'stable')]
  return movies.iloc[ranked]


In [34]:
# interactive search box - just a prototype, the final version is at the end
import ipywidgets as widgets
from IPython.display import display

# Text box pre-filled with an example query
movie_input = widgets.Text(
    value = 'Toy Story',
    description = 'Movie Title: ',
    disabled = False
)

# Output area that the on_type callback renders the search results into
movie_list = widgets.Output()

def on_type(data):
  """Refresh the search results shown in ``movie_list`` after a text change.

  ``data`` is the change notification passed in by ``observe``; its ``'new'`` entry is
  read as the current text. The output area is always cleared first; queries of five
  characters or fewer leave it empty.
  """
  with movie_list:
    movie_list.clear_output()
    query = data['new']
    if len(query) <= 5:
      return
    matches = search(query)
    display(matches)


# Call on_type whenever the text box's 'value' changes
movie_input.observe(on_type, names = 'value')

# Render the text box followed by the results area
display(movie_input, movie_list)



Text(value='Toy Story', description='Movie Title: ')

Output()

In [94]:
# Step 1: collect the users who gave the searched-for movie a rating of 5 or more
movie_id = 1  # hard-coded while prototyping
similar_users = ratings.loc[
    (ratings['movieId'] == movie_id) & (ratings['rating'] >= 5),
    'userId'
].unique()
similar_users

array([   36,    75,    86, ..., 20001, 20009, 20023])

In [95]:
# Step 2: for every movie those similar users rated 4 or higher, compute the share of
# similar users who did so (count of such ratings divided by the number of similar users),
# which is easier to read and compare than raw counts
similar_user_recs = (
    ratings.loc[
        ratings['userId'].isin(similar_users) & (ratings['rating'] >= 4),
        'movieId'
    ].value_counts()
    / len(similar_users)
)

# Shortlist: keep only movies liked by more than 25% of the similar users
# (the threshold is arbitrary - raising it gives stronger but fewer candidates)
similar_user_recs = similar_user_recs[similar_user_recs > 0.25]

In [96]:
# Step 3: measure how much everybody likes the shortlisted movies.
# Take every rating of 4 or higher given to a shortlisted movie, by any user ...
all_users = ratings.loc[
    ratings['movieId'].isin(similar_user_recs.index) & (ratings['rating'] >= 4)
]

# ... and express each movie's count as a share of the distinct users in that selection,
# so it can be compared with the share among similar users
all_user_recs = (
    all_users['movieId'].value_counts()
    / len(all_users['userId'].unique())
)

In [97]:
# Put both shares side by side, one row per movie id, labelling the columns on creation
rec_pct = pd.concat(
    [similar_user_recs, all_user_recs],
    axis = 1,
    keys = ['similar', 'all']
)
rec_pct

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.0,0.248045
318,0.509265,0.45975
260,0.501494,0.337111
356,0.482367,0.387709
296,0.447101,0.4083
593,0.433951,0.379675
1210,0.402271,0.255227
1196,0.401076,0.287204
588,0.399283,0.159617
480,0.395099,0.231019


In [98]:
# We are looking for a big gap between the 'similar' and 'all' columns of rec_pct.
# Example: you searched for Avengers. If 100% of the people who liked it also liked Toy Story,
# that alone does not make the two movies similar - it may just mean that everyone likes
# Toy Story, which makes it a poor recommendation.
# If instead 100% of the people who liked Avengers also liked Thor, while on average only 40%
# of people liked Thor, that indicates Thor appeals specifically to people with similar taste,
# so it is likely a similar movie.
# We capture this as 'score': the ratio of the two shares (higher is better).

rec_pct['score'] = rec_pct['similar'] / rec_pct['all']
rec_pct = rec_pct.sort_values('score', ascending = False)


In [99]:
# The index of rec_pct holds movie ids, so join it against the movies table to get the titles
varx = rec_pct.merge(movies, left_index = True, right_on = 'movieId')
varx

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.248045,4.031532,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.377167,0.104655,3.603889,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
33,0.288105,0.112477,2.561465,34,Babe (1995),Children|Drama,Babe 1995
587,0.328751,0.128758,2.553252,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
580,0.399283,0.159617,2.501506,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
359,0.375374,0.173078,2.168812,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994
764,0.273162,0.133333,2.048715,780,Independence Day (a.k.a. ID4) (1996),Action|Adventure|Sci-Fi|Thriller,Independence Day aka ID4 1996
375,0.264196,0.129875,2.034234,380,True Lies (1994),Action|Adventure|Comedy|Romance|Thriller,True Lies 1994
1523,0.259414,0.131258,1.976364,1580,Men in Black (a.k.a. MIB) (1997),Action|Comedy|Sci-Fi,Men in Black aka MIB 1997
4780,0.272564,0.144187,1.890349,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001


In [100]:
# now we're just putting everything we did above inside a function, so we can reuse it for any
# movie instead of the hardcoded movie_id = 1 (see the cell right after the ipywidgets prototype)
def recommender(movieId):
  """Recommend movies that fans of ``movieId`` like noticeably more than users in general.

  Steps:
    1. "Similar users" are those who rated ``movieId`` 5 or higher.
    2. Movies rated 4+ by more than 25% of the similar users form the shortlist.
    3. For each shortlisted movie, the share of similar users who liked it is divided by
       the share among all users who liked any shortlisted movie; this ratio is the score.

  Parameters
  ----------
  movieId : int
      Id of the movie to base the recommendations on (a value of ``ratings['movieId']``).

  Returns
  -------
  pandas.DataFrame
      Columns ``similar``, ``all`` and ``score`` joined with the matching rows of
      ``movies``, sorted by ``score`` in descending order.
  """
  # Filter on the *argument*. The notebook also has a global named ``movie_id``; using it
  # here would make every call return recommendations for whatever that global holds.
  similar_users = ratings[(ratings['movieId'] == movieId) & (ratings['rating'] >= 5)]['userId'].unique()
  similar_user_recs = ratings[(ratings['userId'].isin(similar_users)) & (ratings['rating'] >= 4)]['movieId']
  similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
  similar_user_recs = similar_user_recs[similar_user_recs > 0.25]

  all_users = ratings[(ratings['movieId'].isin(similar_user_recs.index)) & (ratings['rating'] >= 4)]
  all_user_recs = all_users['movieId'].value_counts() / len(all_users['userId'].unique())

  rec_pct = pd.concat([similar_user_recs, all_user_recs], axis = 1)
  rec_pct.columns = ['similar', 'all']
  rec_pct['score'] = rec_pct['similar'] / rec_pct['all']
  rec_pct = rec_pct.sort_values('score', ascending = False)

  # rec_pct is indexed by movie id; join on it to attach title, genres, etc.
  return rec_pct.merge(movies, left_index = True, right_on = 'movieId')





In [107]:
# final version of the input widgets and stuff
import ipywidgets as widgets
from IPython.display import display

# Text box pre-filled with an example title
movie_name_input = widgets.Text(
    value = 'Toy Story',
    description = 'Movie Title: ',
    # disabled = False
)

# Output area that the on_type callback renders the recommendations into
recommendation_list = widgets.Output()

def on_type(data):
  """Show recommendations for the movie that best matches the typed title.

  ``data`` is the change notification passed in by ``observe``; its ``'new'`` entry is
  read as the current text. The output area is always cleared first; for queries longer
  than five characters the first row returned by ``search`` supplies the movie id, and
  the first ten rows of ``recommender`` for that id are displayed.
  """
  with recommendation_list:
    recommendation_list.clear_output()
    query = data['new']
    if len(query) <= 5:
      return
    best_match = search(query).iloc[0]
    recommendations = recommender(best_match['movieId'])
    display(recommendations.head(10))

# Call on_type whenever the text box's 'value' changes
movie_name_input.observe(on_type, names = 'value')

# Render the text box followed by the recommendations area
display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title: ')

Output()