In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize

In [2]:
df = pd.read_csv('data/content_by_synopsis.csv')
df.head()

Unnamed: 0,title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...


In [3]:
df.overview.info

<bound method Series.info of 0        Led by Woody, Andy's toys live happily in his ...
1        When siblings Judy and Peter discover an encha...
2        A family wedding reignites the ancient feud be...
3        Cheated on, mistreated and stepped on, the wom...
4        Just when George Banks has recovered from his ...
                               ...                        
41357    It's the year 3000 AD. The world's most danger...
41358          Rising and falling between a man and woman.
41359    An artist struggles to finish his work while a...
41360    In a small town live two brothers, one a minis...
41361    50 years after decriminalisation of homosexual...
Name: overview, Length: 41362, dtype: object>

## Encode All Synopsis to bank / menyimpan data  teks menjadi kode
bank = kumpulan kata - kata penting dari setiap data

In [3]:
bow = CountVectorizer(stop_words='english', tokenizer=word_tokenize)
bank = bow.fit_transform(df.overview)

In [4]:
idx = 0

content = df.loc[idx, 'overview']
content

"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."

In [5]:
code = bow.transform([content])
code.toarray()

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Mencari kesamaan dari teks yg ingin dibandingkan

In [6]:
from sklearn.metrics.pairwise import cosine_distances

In [7]:
dist = cosine_distances(code, bank)
dist 

array([[0.        , 0.68698928, 0.70198022, ..., 0.88529213, 0.68931574,
        0.75277431]])

In [8]:
rec_film = dist.argsort()[0, 1:11]
rec_film

array([14706,  2945,  9984, 36827, 40606, 13404, 22084, 14078,  6172,
       27006], dtype=int64)

## Rekomendasi 

In [9]:
df.loc[rec_film]

Unnamed: 0,title,overview
14706,Toy Story 3,"Woody, Buzz, and the rest of Andy's toys haven..."
2945,Toy Story 2,"Andy heads off to Cowboy Camp, leaving his toy..."
9984,The 40 Year Old Virgin,Andy Stitzer has a pleasant life with a nice a...
36827,Wabash Avenue,Andy Clark discovers he was cheated out of a h...
40606,Stasis,After a night out of partying and left behind ...
13404,The Gang's All Here,"Playboy Andy Mason, on leave from the army, ro..."
22084,The Pied Piper,"Greed, corruption, ignorance, and disease. Mid..."
14078,A Matter of Dignity,"During one of her parents many parties, Chloe ..."
6172,The Courtship of Eddie's Father,The film that started the classic TV series. A...
27006,Superdome,"It's Superbowl. And there's a lot of drama, on..."


# Make a Class

In [14]:
df = pd.read_csv('data/content_by_multiple.csv')
df.head()

Unnamed: 0,title,genres,cast,keywords,director,metadata
0,Toy Story,animation comedy family,tom_hanks tim_allen don_rickles,jealousy toy boy,john_lasseter,animation comedy family tom_hanks tim_allen do...
1,Jumanji,adventure fantasy family,robin_williams jonathan_hyde kirsten_dunst,board_game disappearance based_on_children's_book,joe_johnston,adventure fantasy family robin_williams jonath...
2,Grumpier Old Men,romance comedy,walter_matthau jack_lemmon ann-margret,fishing best_friend duringcreditsstinger,howard_deutch,romance comedy walter_matthau jack_lemmon ann-...
3,Waiting to Exhale,comedy drama romance,whitney_houston angela_bassett loretta_devine,based_on_novel interracial_relationship single...,forest_whitaker,comedy drama romance whitney_houston angela_ba...
4,Father of the Bride Part II,comedy,steve_martin diane_keaton martin_short,baby midlife_crisis confidence,charles_shyer,comedy steve_martin diane_keaton martin_short ...


In [15]:
class RecommendSystem:
    def __init__(self, data, content_col):
        self.df = df = pd.read_csv(data)
        self.content_col = content_col
        self.encoder = None
        self.bank = None
    
    def fit(self):
        self.encoder = CountVectorizer(stop_words='english', tokenizer=word_tokenize)
        self.bank = self.encoder.fit_transform(self.df[self.content_col])
        
    def recommend(self, idx, topk=10):
        content = df.loc[idx, self.content_col]
        code = self.encoder.transform([content])
        dist = cosine_distances(code, self.bank)
        rec_film = dist.argsort()[0, 1:(topk + 1)]
        return self.df.loc[rec_film]

In [16]:
recsys = RecommendSystem('data/content_by_multiple.csv', content_col='metadata')
recsys.fit()

In [17]:
recsys.recommend(1)

Unnamed: 0,title,genres,cast,keywords,director,metadata
41600,The Kingdom of Fairies,adventure fantasy,,,,adventure fantasy
28394,The Rain Fairy,family fantasy,,,,family fantasy
39899,Tainá: An Amazon Adventure,family fantasy adventure,,comedy,kahane_cooperman,family fantasy adventure comedy kahane_cooperman
552,The Pagemaster,fantasy science_fiction family,macaulay_culkin christopher_lloyd patrick_stewart,library adventure part_animated,joe_johnston,fantasy science_fiction family macaulay_culkin...
40803,Princess Goldilocks,adventure family fantasy,charlie_durkin,woman_director,callie_t._wiser,adventure family fantasy charlie_durkin woman_...
14070,Playmobil: The Secret of Pirate Island,action adventure family,lee_tockar caitlin_williams,fantasy adventure cartoon,alexander_e._sokoloff,action adventure family lee_tockar caitlin_wil...
15781,Cirque du Soleil: Varekai,drama family fantasy,,,,drama family fantasy
21579,The Young and Prodigious T.S. Spivet,adventure drama family,,,,adventure drama family
12560,City of Ember,adventure family fantasy,saoirse_ronan harry_treadaway mary_kay_place,underground_world mayor adventure,gil_kenan,adventure family fantasy saoirse_ronan harry_t...
17504,G.I. Joe: The Revenge of Cobra,family fantasy action,,,,family fantasy action
