In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize

# Content = "Movie's Synopsis / Overview"

In [2]:
# Load the synopsis dataset and preview its first rows
df = pd.read_csv("data/content_by_synopsis.csv")
df.head()

Unnamed: 0,title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...


# Encode all synopses into the bank

In [3]:
# Bag-of-words encoder: NLTK word tokenization, English stop words removed
bow = CountVectorizer(
    tokenizer=word_tokenize,
    stop_words="english",
)
# Learn the vocabulary and encode every synopsis into the "bank" matrix
bank = bow.fit_transform(df["overview"])

# Step 1: Encode what the user watched

In [4]:
idx = 0
content = df.loc[idx, "overview"] # Access the row labelled 0 in the overview column; by contrast, df.loc[:, "overview"] accesses every row of that column
content

"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."

In [5]:
code = bow.transform([content]) # Wrap the string in a list (square brackets): transform expects an iterable of documents, same as the input used when fitting

# Step 2: Document Search

In [6]:
from sklearn.metrics.pairwise import cosine_distances

In [7]:
# Cosine distance from the query vector to every row of the bank
dist = cosine_distances(code, bank)
dist

array([[0.        , 0.68698928, 0.70198022, ..., 0.88529213, 0.68931574,
        0.75277431]])

In [8]:
# Sort positions by ascending distance and keep ranks 1..10.
# Rank 0 is skipped — presumably the query film itself (distance 0); this does not hold if another row ties with it.
rec_idx = dist.argsort()[0, 1:11]
rec_idx

array([14706,  2945,  9984, 36827, 40606, 13404, 22084, 14078,  6172,
       27006], dtype=int64)

# Step 3 : Recommend

In [9]:
# Show the recommended films. rec_idx holds positions from argsort; this label-based
# lookup assumes df keeps the default 0..N-1 index so labels and positions coincide — TODO confirm.
df.loc[rec_idx]

Unnamed: 0,title,overview
14706,Toy Story 3,"Woody, Buzz, and the rest of Andy's toys haven..."
2945,Toy Story 2,"Andy heads off to Cowboy Camp, leaving his toy..."
9984,The 40 Year Old Virgin,Andy Stitzer has a pleasant life with a nice a...
36827,Wabash Avenue,Andy Clark discovers he was cheated out of a h...
40606,Stasis,After a night out of partying and left behind ...
13404,The Gang's All Here,"Playboy Andy Mason, on leave from the army, ro..."
22084,The Pied Piper,"Greed, corruption, ignorance, and disease. Mid..."
14078,A Matter of Dignity,"During one of her parents many parties, Chloe ..."
6172,The Courtship of Eddie's Father,The film that started the classic TV series. A...
27006,Superdome,"It's Superbowl. And there's a lot of drama, on..."


# ML engineering : Wrap All That Code

In [11]:
from sklearn.metrics.pairwise import cosine_distances

class RecommenderSystem:
    """Content-based recommender using bag-of-words cosine distance.

    Each row's text in ``content_col`` is encoded as a token-count vector.
    Recommendations for a row are the other rows whose vectors have the
    smallest cosine distance to it.
    """

    def __init__(self, data, content_col):
        """Load the dataset; the encoder is not built until ``fit`` is called.

        Args:
            data: Path (or anything ``pd.read_csv`` accepts) of the CSV file.
            content_col: Name of the text column used to compare rows.
        """
        self.df = pd.read_csv(data)
        self.content_col = content_col
        self.encoder = None  # CountVectorizer, set by fit()
        self.bank = None     # encoded matrix of every row, set by fit()

    def fit(self):
        """Fit the bag-of-words encoder and encode every row into ``self.bank``."""
        self.encoder = CountVectorizer(stop_words="english", tokenizer=word_tokenize)
        self.bank = self.encoder.fit_transform(self.df[self.content_col])

    def recommend(self, idx, topk = 10):
        """Return the ``topk`` rows most similar to the row labelled ``idx``.

        Args:
            idx: Index label of the query row in ``self.df``.
            topk: Maximum number of recommendations to return.

        Returns:
            DataFrame of the recommended rows, nearest first. The query row
            itself is never included.

        Raises:
            NotFittedError: If ``fit`` has not been called yet. It subclasses
                ``AttributeError``, which is what the unfitted call used to raise.
            KeyError: If ``idx`` is not a label of ``self.df``.
        """
        if self.encoder is None or self.bank is None:
            # Local import keeps the block self-contained; sklearn is already a file dependency.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError("RecommenderSystem is not fitted yet; call fit() before recommend().")

        content = self.df.loc[idx, self.content_col]
        # transform expects an iterable of documents, hence the single-item list.
        code = self.encoder.transform([content])
        # Use the instance's own bank, not a notebook-global variable.
        dist = cosine_distances(code, self.bank)
        ranked = dist.argsort()[0]

        # Drop the query row explicitly instead of assuming it is rank 0: with
        # exact ties, or a query whose vector is all zeros, it may sort elsewhere.
        query_pos = self.df.index.get_loc(idx)
        rec_pos = ranked[ranked != query_pos][:topk]

        # argsort yields positions, so select by position rather than by label.
        return self.df.iloc[rec_pos]

In [13]:
# Build the recommender from the synopsis CSV, then fit the encoder on the "overview" column
recsys = RecommenderSystem(data="data/content_by_synopsis.csv", content_col="overview")
recsys.fit()

In [14]:
recsys.recommend(idx=1) # Smoke test: recommendations for the film at index 1 (Jumanji)

Unnamed: 0,title,overview
27006,Superdome,"It's Superbowl. And there's a lot of drama, on..."
40606,Stasis,After a night out of partying and left behind ...
37971,Snowed Under,"Alan Tanner's new play opens in a week, but Ta..."
18715,Wreck-It Ralph,"Wreck-It Ralph is the 9-foot-tall, 643-pound v..."
40431,Liar Game: Reborn,"To exact revenge, the Liar Game office is revi..."
38232,Enter the Battlefield: Life on the Magic - The...,Magic: The Gathering is the most popular colle...
36540,Beta Test,While testing the latest first person shooter ...
14859,Le Pont du Nord,"Marie, is just out from prison when she runs i..."
13105,Break Up,"Jimmy is married to the abusive Frank, but she..."
17918,Dante's Inferno: An Animated Epic,Dante journeys through the nine circles of Hel...
