In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from custom_cleaning import CustomCleaning
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from collections import Counter

In [2]:
# Text-preprocessing helper from the project-local `custom_cleaning` module.
# Its clean / remove_stopword / stem methods are applied, in that order, to
# every description processed in this notebook.
cleaning = CustomCleaning()

# BOOKS

In [3]:
# Load the book catalogue; the "description" column is the text we embed.
df = pd.read_csv("books.csv")

# Normalise each description in three full passes over the column
# (clean -> stop-word removal -> stemming), then split the resulting
# string on whitespace so every row holds a list of tokens.
df["desc_cleaned"] = (
    df["description"]
    .map(cleaning.clean)
    .map(cleaning.remove_stopword)
    .map(cleaning.stem)
)
df["desc_cleaned"] = df["desc_cleaned"].str.split()

In [4]:
# Train word embeddings on the tokenised book descriptions.
book_model = Word2Vec(
    sentences=df["desc_cleaned"],  # one token list per book
    # --- architecture ---
    sg=1,             # 1 selects the skip-gram training algorithm
    vector_size=300,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=1,      # keep every token, even ones seen only once
    # --- training ---
    epochs=100,
    workers=4,
)

In [5]:
def get_vector(synopsis):
    """Embed a tokenised text as the average of its word vectors.

    Args:
        synopsis: Iterable of tokens. Tokens that are not present in
            ``book_model.wv`` are skipped.

    Returns:
        The element-wise mean of the vectors of all known tokens, or
        ``None`` when not a single token is in the vocabulary.
    """
    keyed_vectors = book_model.wv

    found = []
    for token in synopsis:
        if token in keyed_vectors:
            found.append(keyed_vectors[token])

    # Nothing to average: signal "no embedding" to the caller.
    if not found:
        return None

    return np.mean(found, axis=0)

In [6]:
# Embed every cleaned description. get_vector returns None when none of a
# description's tokens are in the model vocabulary (e.g. an empty token list).
desc_vectors = [get_vector(desc) for desc in df["desc_cleaned"]]

# Drop the rows that have no vector *together with* their None entry so the
# new column stays aligned with its row. Filtering only the list would make it
# shorter than the frame and the assignment would raise a length-mismatch
# ValueError as soon as a single description could not be embedded.
has_vector = [vector is not None for vector in desc_vectors]
df = df.loc[has_vector].reset_index(drop=True)  # keep labels == row positions

df["desc_vector"] = [vector for vector in desc_vectors if vector is not None]

# USER BOOK

In [7]:
# Raw description text of the user's book; it is cleaned and embedded in the
# following cells and then compared against every book in `df`.
description = "Nyokap memandangi penjuru kamar gue. Dia diam sebentar, tersenyum, lalu bertanya, ‘Kamu takut ya? Makanya belom tidur?’‘Enggak, kenapa harus takut?’‘Ya, siapa tahu rumah baru ini ada hantunya, hiiiiii...,’ kata Nyokap, mencoba menakut-nakuti.‘Enggak takut, Ma,’ jawab gue.‘Kikkikikiki.’ Nyokap mencoba menirukan suara kuntilanak, yang malah terdengar seperti ABG kebanyakan ngisep lem sewaktu hendak photobox. ‘Kikikikikiki.’‘Aku enggak ta—’‘KIKIKIKIKIKIKIKI!’ Nyokap makin menjadi.‘Ma,’ kata gue, ‘kata orang, kalo kita malem-malem niruin ketawa kuntilanak, dia bisa dateng lho.’‘JANGAN NGOMONG GITU, DIKA!’ Nyokap sewot. ‘Kamu durhaka ya nakut-nakutin orang tua kayak gitu! Awas, ya, kamu, Dika!’‘Lah, tadi yang nakut-nakutin siapa, yang ketakutan siapa.’*****Manusia Setengah Salmon adalah kumpulan tulisan komedi Raditya Dika. Sembilan belas bab di dalam bercerita tentang pindah rumah, pindah hubungan keluarga, sampai pindah hati. Simak juga bab berisi tulisan galau, observasi ngawur, dan lelucon singkat khas Raditya Dika."

In [14]:
# Run the user's description through the same three steps used for the
# catalogue: clean -> stop-word removal -> stemming.
desc_cleaned = cleaning.stem(
    cleaning.remove_stopword(
        cleaning.clean(description)
    )
)

# Whitespace-tokenise so the text can be passed to get_vector.
desc_processed = desc_cleaned.split()

print(desc_processed)

['nyokap', 'pandang', 'penjuru', 'kamar', 'gue', 'diam', 'sebentar', 'senyum', 'takut', 'ya', 'bom', 'tidur', 'takut', 'ya', 'rumah', 'hantu', 'hiiiiii', 'nyokap', 'coba', 'takut', 'nakuti', 'takut', 'ma', 'gue', 'kikkikikiki', 'nyokap', 'coba', 'tiru', 'suara', 'kuntilanak', 'dengar', 'abg', 'banyak', 'ngisep', 'lem', 'photobox', 'kikikikikiki', 'ta', 'kikikikikikikiki', 'nyokap', 'ma', 'gue', 'orang', 'kalo', 'malem', 'malem', 'niruin', 'ketawa', 'kuntilanak', 'dateng', 'lho', 'ngomong', 'gitu', 'dika', 'nyokap', 'sewot', 'durhaka', 'ya', 'nakut', 'nakutin', 'orang', 'tua', 'kayak', 'gitu', 'awas', 'ya', 'dika', 'nakut', 'nakutin', 'takut', 'manusia', 'salmon', 'kumpul', 'tulis', 'komedi', 'raditya', 'dika', 'sembilan', 'belas', 'bab', 'cerita', 'pindah', 'rumah', 'pindah', 'hubung', 'keluarga', 'pindah', 'hati', 'simak', 'bab', 'isi', 'tulis', 'galau', 'observasi', 'ngawur', 'lelucon', 'singkat', 'khas', 'raditya', 'dika']


In [15]:
# Mean word vector of the user's description.
user_query_vector = get_vector(desc_processed)

# get_vector returns None when no token is in the model vocabulary; fail here
# with a clear message instead of an obscure error inside cosine_similarity.
if user_query_vector is None:
    raise ValueError(
        "None of the tokens in the user's description are in the Word2Vec "
        "vocabulary, so no query vector can be built."
    )

# RECOMMEND

In [17]:
# Ten most frequent tokens of the user's description.
word_counts = Counter(desc_processed)
most_common_words = word_counts.most_common(10)  # list of (token, count) pairs

# Build the table straight from the pairs with explicit column names: this
# also works when there are no tokens at all (an empty frame that still has a
# "token" column to index on) and avoids in-place mutation of the frame.
freq_df = pd.DataFrame(
    most_common_words, columns=["token", "frekuensi"]
).set_index("token")

freq_df

Unnamed: 0_level_0,frekuensi
token,Unnamed: 1_level_1
nyokap,5
takut,5
ya,4
dika,4
gue,3
pindah,3
rumah,2
coba,2
ma,2
kuntilanak,2


In [21]:
# Cosine similarity between the user's vector and every book vector;
# similarity_scores[i] belongs to the i-th row (by position) of df.
similarity_scores = cosine_similarity([user_query_vector], df["desc_vector"].tolist())[0]

# Row positions of the six highest scores, best match first.
similar_indices = similarity_scores.argsort()[-6:][::-1]

# argsort yields *positions*, so the rows must be selected with iloc.
# Label-based df.loc only gives the right rows while the index is an untouched
# 0..n-1 range and silently returns wrong books (or raises KeyError) otherwise.
similar_books = (
    df.iloc[similar_indices][["title", "description"]]
    .assign(similarity_score=similarity_scores[similar_indices])
)
similar_books

Unnamed: 0,title,description,similarity_score
80,Manusia Setengah Salmon (Paperback),Nyokap memandangi penjuru kamar gue. Dia diam ...,1.0
644,Watir (Paperback),Banyak anak zaman sekarang yang gampang galau....,0.874157
256,Poconggg Juga Pocong (Paperback),Predikat sebagai pocong jantan tinggal sedikit...,0.859139
1003,Rahasia Pesan Serigala (Kelompok 2&amp;1 #6),"HOMO HOMINI LUPUS.""Jadi... jadi... kaukah oran...",0.856259
422,Marriagable: Gue Mau Nikah Asal... (Paperback),Namaku Flory. Usia mendekati tiga puluh dua. S...,0.854566
408,"Trust No One, Suspect Everyone!: My Stupid Bos...",“This is the funniest book I have ever read. S...,0.851175
