# 1. Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

sw_eng = stopwords.words('english') 

# 2. Import Dataset

In [2]:
df = pd.read_csv('content_by_multiple.csv')

df.head()

Unnamed: 0,title,genres,cast,keywords,director,metadata
0,Toy Story,animation comedy family,tom_hanks tim_allen don_rickles,jealousy toy boy,john_lasseter,animation comedy family tom_hanks tim_allen do...
1,Jumanji,adventure fantasy family,robin_williams jonathan_hyde kirsten_dunst,board_game disappearance based_on_children's_book,joe_johnston,adventure fantasy family robin_williams jonath...
2,Grumpier Old Men,romance comedy,walter_matthau jack_lemmon ann-margret,fishing best_friend duringcreditsstinger,howard_deutch,romance comedy walter_matthau jack_lemmon ann-...
3,Waiting to Exhale,comedy drama romance,whitney_houston angela_bassett loretta_devine,based_on_novel interracial_relationship single...,forest_whitaker,comedy drama romance whitney_houston angela_ba...
4,Father of the Bride Part II,comedy,steve_martin diane_keaton martin_short,baby midlife_crisis confidence,charles_shyer,comedy steve_martin diane_keaton martin_short ...


In [3]:
df.metadata = df.title + ' ' + df.genres + ' ' + df.metadata 

In [4]:
df.metadata

0        Toy Story animation comedy family animation co...
1        Jumanji adventure fantasy family adventure fan...
2        Grumpier Old Men romance comedy romance comedy...
3        Waiting to Exhale comedy drama romance comedy ...
4        Father of the Bride Part II comedy comedy stev...
                               ...                        
42272    Caged Heat 3000 science_fiction science_fictio...
42273    Subdue drama family drama family jeanne_d'alcy...
42274     Century of Birthing drama drama   robert_gardner
42275                                                  NaN
42276                                                  NaN
Name: metadata, Length: 42277, dtype: object

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42277 entries, 0 to 42276
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     42277 non-null  object
 1   genres    39995 non-null  object
 2   cast      40005 non-null  object
 3   keywords  29045 non-null  object
 4   director  41447 non-null  object
 5   metadata  39995 non-null  object
dtypes: object(6)
memory usage: 1.9+ MB


In [6]:
df.fillna(' ', inplace=True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42277 entries, 0 to 42276
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     42277 non-null  object
 1   genres    42277 non-null  object
 2   cast      42277 non-null  object
 3   keywords  42277 non-null  object
 4   director  42277 non-null  object
 5   metadata  42277 non-null  object
dtypes: object(6)
memory usage: 1.9+ MB


In [8]:
df.describe(include='O')

Unnamed: 0,title,genres,cast,keywords,director,metadata
count,42277,42277,42277.0,42277.0,42277.0,42277.0
unique,42277,1989,39226.0,23791.0,16747.0,39996.0
top,Toy Story,drama,,,,
freq,1,4565,2272.0,13232.0,830.0,2282.0


In [9]:
df.shape

(42277, 6)

## 3 Cleaning metadata

In [10]:
def clean_data(text):
    text = text.lower()
    text = text.replace('_',' ')
    clean_word = word_tokenize(text)
    clean_word = [word for word in clean_word if word not in punctuation]
    clean_word = [word for word in clean_word if word not in sw_eng]
    clean_word = ' '.join(clean_word)
    return clean_word

In [11]:
df.metadata = df.metadata.apply(clean_data)

In [12]:
df.head()

Unnamed: 0,title,genres,cast,keywords,director,metadata
0,Toy Story,animation comedy family,tom_hanks tim_allen don_rickles,jealousy toy boy,john_lasseter,toy story animation comedy family animation co...
1,Jumanji,adventure fantasy family,robin_williams jonathan_hyde kirsten_dunst,board_game disappearance based_on_children's_book,joe_johnston,jumanji adventure fantasy family adventure fan...
2,Grumpier Old Men,romance comedy,walter_matthau jack_lemmon ann-margret,fishing best_friend duringcreditsstinger,howard_deutch,grumpier old men romance comedy romance comedy...
3,Waiting to Exhale,comedy drama romance,whitney_houston angela_bassett loretta_devine,based_on_novel interracial_relationship single...,forest_whitaker,waiting exhale comedy drama romance comedy dra...
4,Father of the Bride Part II,comedy,steve_martin diane_keaton martin_short,baby midlife_crisis confidence,charles_shyer,father bride part ii comedy comedy steve marti...


# 4. Encode Metadata

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
tfidf = TfidfVectorizer()
tfidf_bank = tfidf.fit_transform(df.metadata)

In [15]:
idx = 0

In [16]:
content = df.loc[idx, 'metadata']
content

'toy story animation comedy family animation comedy family tom hanks tim allen rickles jealousy toy boy john lasseter'

In [17]:
code = tfidf.transform([content])
code

<1x62569 sparse matrix of type '<class 'numpy.float64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [18]:
code.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

# 5. Document Search

In [19]:
from sklearn.metrics.pairwise import cosine_distances

In [20]:
distance = cosine_distances(code, tfidf_bank)
distance

array([[0.        , 0.93810268, 0.97777938, ..., 1.        , 1.        ,
        1.        ]])

In [21]:
recom = distance.argsort()[0, 1:11]
recom

array([ 2963, 14771, 24390, 18267,  3839, 24391, 18318, 20885, 27890,
       10785], dtype=int64)

### Check Similarity

In [22]:
df.iloc[0][0] #Toy Story

'Toy Story'

In [23]:
df.iloc[14771][0]

'Toy Story 3'

In [24]:
df.iloc[2963][0]

'Toy Story 2'

# 5. Content Based Filtering

In [25]:
class RecommenderSystem:
    def __init__(self, data, meta_data):
        self.df = df
        self.meta_data = meta_data
        
    
    def fit(self):
        self.encoder =  TfidfVectorizer()
        self.bank = self.encoder.fit_transform(self.df[self.meta_data])
    
    
    def recommend (self, idx, topk = 10):
        content = df.loc[idx, self.meta_data]
        code = self.encoder.transform([content])
        distance = cosine_distances(code, self.bank)
        recom = distance.argsort()[0, 1:(topk+1)]
        return self.df.loc[recom]

In [26]:
recsys = RecommenderSystem(data=df, meta_data='metadata')
recsys.fit()

### Similar Movie with Toy Story

In [27]:
df[df['title'] == 'Toy Story']

Unnamed: 0,title,genres,cast,keywords,director,metadata
0,Toy Story,animation comedy family,tom_hanks tim_allen don_rickles,jealousy toy boy,john_lasseter,toy story animation comedy family animation co...


In [28]:
recsys.recommend(0) #Toy Story

Unnamed: 0,title,genres,cast,keywords,director,metadata
2963,Toy Story 2,animation comedy family,tom_hanks tim_allen joan_cusack,museum prosecution identity_crisis,john_lasseter,toy story 2 animation comedy family animation ...
14771,Toy Story 3,animation family comedy,tom_hanks tim_allen ned_beatty,hostage college toy,lee_unkrich,toy story 3 animation family comedy animation ...
24390,The Legend of Mor'du,animation family,tom_hanks tim_allen joan_cusack,toy short toy_story,steve_purcell,legend mor'du animation family animation famil...
18267,Tin Toy,animation,,pixar_animation,john_lasseter,tin toy animation animation pixar animation jo...
3839,The Transformers: The Movie,animation,judd_nelson peter_cullen frank_welker,toy transformation based_on_toy,nelson_shin,transformers movie animation animation judd ne...
24391,Toy Story That Time Forgot,animation family,justice_leak robert_pralgo jason_turner,,thomas_verrette,toy story time forgot animation family animati...
18318,Red's Dream,animation,,pixar_animation,john_lasseter,red 's dream animation animation pixar animati...
20885,Toy Story of Terror!,animation comedy family,maurice_chevalier merle_oberon ann_sothern,paris folies_bergère,roy_del_ruth,toy story terror animation comedy family anima...
27890,Barbie as the Island Princess,animation family,kelly_sheridan roger_monk garry_chalk,based_on_toy,william_lau,barbie island princess animation family animat...
10785,Monster House,animation comedy family,ryan_newman steve_buscemi mitchel_musso,monster secret toy,gil_kenan,monster house animation comedy family animatio...


### Similar Movie with Jumanji

In [29]:
df[df['title'] == 'Jumanji']

Unnamed: 0,title,genres,cast,keywords,director,metadata
1,Jumanji,adventure fantasy family,robin_williams jonathan_hyde kirsten_dunst,board_game disappearance based_on_children's_book,joe_johnston,jumanji adventure fantasy family adventure fan...


In [30]:
recsys.recommend(1) 

Unnamed: 0,title,genres,cast,keywords,director,metadata
13770,Where the Wild Things Are,family fantasy,max_records catherine_keener lauren_ambrose,creature based_on_children's_book children's_book,spike_jonze,wild things family fantasy family fantasy max ...
17504,G.I. Joe: The Revenge of Cobra,family fantasy action,,,,g.i joe revenge cobra family fantasy action fa...
28394,The Rain Fairy,family fantasy,,,,rain fairy family fantasy family fantasy
28401,The Amazing Mr Blunden,family fantasy,mille_dinesen robert_hansen peter_mygind,inventor board_game children,martin_miehe-renard,amazing mr blunden family fantasy family fanta...
11408,Spider-Man 3,fantasy action adventure,tobey_maguire kirsten_dunst james_franco,dual_identity amnesia sandstorm,sam_raimi,spider-man 3 fantasy action adventure fantasy ...
552,The Pagemaster,fantasy science_fiction family,macaulay_culkin christopher_lloyd patrick_stewart,library adventure part_animated,joe_johnston,pagemaster fantasy science fiction family fant...
7748,Spider-Man 2,action adventure fantasy,tobey_maguire kirsten_dunst james_franco,dual_identity love_of_one's_life pizza_boy,sam_raimi,spider-man 2 action adventure fantasy action a...
1806,Small Soldiers,comedy adventure fantasy,gregory_smith kirsten_dunst denis_leary,defense_industry toy_shop technical_toy,joe_dante,small soldiers comedy adventure fantasy comedy...
39899,Tainá: An Amazon Adventure,family fantasy adventure,,comedy,kahane_cooperman,tainá amazon adventure family fantasy adventur...
22246,Maleficent,fantasy adventure action,viggo_mortensen kirsten_dunst oscar_isaac,sailboat hotel american,hossein_amini,maleficent fantasy adventure action fantasy ad...


### Similar Movie with Home Alone

In [31]:
df[df['title'] == 'Home Alone']

Unnamed: 0,title,genres,cast,keywords,director,metadata
579,Home Alone,comedy family,macaulay_culkin joe_pesci daniel_stern,holiday burglar home_invasion,chris_columbus,home alone comedy family comedy family macaula...


In [32]:
recsys.recommend(579) #Home Alone

Unnamed: 0,title,genres,cast,keywords,director,metadata
2808,Home Alone 2: Lost in New York,comedy family adventure,macaulay_culkin joe_pesci catherine_o'hara,holiday new_york new_york_city,chris_columbus,home alone 2 lost new york comedy family adven...
9856,Home Movie,comedy documentary,,,chris_smith,home movie comedy documentary comedy documenta...
15955,Home Alone 4,crime comedy family,french_stewart erick_avari mike_weinberg,fight father house,rod_daniel,home alone 4 crime comedy family crime comedy ...
1623,Home Alone 3,comedy family,alex_d._linz olek_krupa rya_kihlstedt,parent_child_relationship burglar child_hero,raja_gosnell,home alone 3 comedy family comedy family alex ...
23933,Home Alone: The Holiday Heist,comedy crime family,oleg_menshikov leonid_bronevoy inna_ulyanova,,mikhail_kozakov,home alone holiday heist comedy crime family c...
552,The Pagemaster,fantasy science_fiction family,macaulay_culkin christopher_lloyd patrick_stewart,library adventure part_animated,joe_johnston,pagemaster fantasy science fiction family fant...
455,Getting Even with Dad,family comedy crime,macaulay_culkin ted_danson glenne_headly,,howard_deutch,getting even dad family comedy crime family co...
19021,Nativity!,comedy family,daniel_stern braeden_lemasters stacey_travis,holiday,brian_levant,nativity comedy family comedy family daniel st...
5751,My Girl,comedy drama family,anna_chlumsky macaulay_culkin dan_aykroyd,neighbor child_killed_by_animal writing_class,howard_zieff,girl comedy drama family comedy drama family a...
369,Ri¢hie Ri¢h,comedy family,macaulay_culkin john_larroquette edward_herrmann,family life_raft private_airplane,donald_petrie,ri¢hie ri¢h comedy family comedy family macaul...
