In [15]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from custom_cleaning import CustomCleaning
from threads import Threads
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from collections import Counter

In [16]:
# Shared helper objects used by the rest of the notebook (both classes come
# from project-local modules imported above).
threadsnet = Threads()  # client whose `public_api` is used to look up a user and fetch their threads
cleaning = CustomCleaning()  # exposes clean / remove_stopword / stem, used for text preprocessing

# BOOKS

In [17]:
# Load the book catalogue and turn every description into a list of tokens.
df = pd.read_csv("books.csv")

# Run each description through the full cleaning pipeline in a single pass:
# clean -> remove stopwords -> stem (same order as the per-step version).
cleaned_descriptions = []
for raw_description in df["description"]:
    normalized = cleaning.clean(raw_description)
    without_stopwords = cleaning.remove_stopword(normalized)
    cleaned_descriptions.append(cleaning.stem(without_stopwords))
df["desc_cleaned"] = cleaned_descriptions

# Whitespace-tokenise the cleaned text; each cell becomes a list of words.
df["desc_cleaned"] = df["desc_cleaned"].str.split()

In [4]:
# Train word embeddings on the tokenised book descriptions.
word2vec_params = dict(
    vector_size=300,  # dimensionality of each word vector
    window=5,         # context window size
    workers=4,        # number of training threads
    epochs=100,       # passes over the corpus
    sg=1,             # 1 selects the skip-gram training algorithm
)
book_model = Word2Vec(sentences=df["desc_cleaned"], **word2vec_params)

In [5]:
def get_vector(synopsis, model=None):
    """Return the mean word embedding of a tokenised text.

    Parameters
    ----------
    synopsis : iterable of str, or None / float NaN
        Tokens of the text. A missing value (``None`` or a float such as the
        NaN pandas uses for absent text) is treated as "no tokens".
    model : object with a ``wv`` mapping, optional
        Embedding model to look tokens up in. Defaults to the notebook-level
        ``book_model`` so existing ``get_vector(tokens)`` calls keep working.

    Returns
    -------
    numpy.ndarray or None
        Element-wise average of the vectors of all in-vocabulary tokens, or
        ``None`` when the input is missing or no token is in the vocabulary.
    """
    if model is None:
        # Resolved at call time so the function can be defined before training.
        model = book_model

    # A missing description/caption has nothing to embed; iterating it would
    # raise TypeError, so report "no vector" the same way as an empty text.
    if synopsis is None or isinstance(synopsis, float):
        return None

    keyed_vectors = model.wv
    # Out-of-vocabulary tokens are skipped rather than raising KeyError.
    vectors = [keyed_vectors[word] for word in synopsis if word in keyed_vectors]
    if not vectors:
        return None
    return np.mean(vectors, axis=0)  # sum of vectors / n

In [6]:
# Embed every book description. get_vector returns None when a description
# has no token in the Word2Vec vocabulary.
desc_vectors = [get_vector(desc) for desc in df["desc_cleaned"]]

# Assigning only the non-None vectors to the full frame would raise a
# length-mismatch error (and would misalign vectors with books) as soon as one
# description has no vector. Drop those books first so that every remaining
# row is paired with its own vector; the index is reset so positional and
# label-based lookups agree afterwards.
has_vector = np.array([vector is not None for vector in desc_vectors], dtype=bool)
df = df.loc[has_vector].reset_index(drop=True)

df["desc_vector"] = [vector for vector in desc_vectors if vector is not None]

# USER

In [7]:
# Threads account whose posts drive the recommendation.
username = "salimafillah"

In [8]:
# Fetch the user's threads through the public API.
user_id = threadsnet.public_api.get_user_id(username)
user_threads = threadsnet.public_api.get_user_threads(user_id)
threads = user_threads["data"]["mediaData"]["threads"]

# Collect the preprocessed caption of the first item of every thread.
user_caption = []
for thread in threads:
    caption = thread["thread_items"][0]["post"]["caption"]
    if caption is None:
        # Posts without a caption contribute no text.
        continue
    # Same pipeline as the book descriptions: clean -> remove stopwords -> stem.
    text = cleaning.stem(cleaning.remove_stopword(cleaning.clean(caption["text"])))
    user_caption.append(text)

# Merge all captions into one document, then tokenise on whitespace.
user_sent = " ".join(user_caption)
user_sent_processed = user_sent.split()

In [9]:
# Mean embedding of the user's caption tokens. get_vector returns None when
# none of the tokens are in the Word2Vec vocabulary.
user_query_vector = get_vector(user_sent_processed)

# RECOMMEND

In [13]:
# Ten most frequent tokens in the user's preprocessed captions.
word_counts = Counter(user_sent_processed)
most_common_words = word_counts.most_common(10)

# One record per (token, count) pair; "frekuensi" is the count column.
data_dicts = []
for token, count in most_common_words:
    data_dicts.append({'token': token, 'frekuensi': count})
freq_df = pd.DataFrame(data_dicts)

freq_df

Unnamed: 0,token,frekuensi
0,allah,9
1,nikmat,4
2,jumat,4
3,jamin,3
4,doa,3
5,threads,2
6,iman,2
7,kau,2
8,tangan,2
9,dekat,2


In [14]:
# Rank every book by the cosine similarity between its description vector and
# the user's query vector, and show the ten best matches.

# get_vector yields None when no user token is in the vocabulary; fail with a
# clear message instead of an obscure error from inside cosine_similarity.
if user_query_vector is None:
    raise ValueError(
        "user_query_vector is None: none of the user's tokens are in the "
        "Word2Vec vocabulary, so no recommendation can be computed."
    )

similarity_scores = cosine_similarity(
    [user_query_vector], df["desc_vector"].tolist()
)[0]
# argsort is ascending: take the last ten positions and reverse them so the
# most similar book comes first.
similar_indices = similarity_scores.argsort()[-10:][::-1]

# argsort returns row *positions*, so select with iloc; loc would treat them as
# index labels and pick the wrong rows whenever df's index is not 0..n-1.
similar_books = df.iloc[similar_indices][["title"]].copy()
similar_books["similarity_score"] = similarity_scores[similar_indices]
similar_books

Unnamed: 0,title,similarity_score
375,Jalan Bandungan (Paperback),0.921188
661,Tak Ada Santo dari Sirkus (Paperback),0.919571
211,Berjuta Rasanya (Paperback),0.913615
988,Kereta Tidur (Paperback),0.913129
3,Laskar Pelangi (Paperback),0.910859
8,Supernova: Akar (Paperback),0.906417
66,Muslihat Musang Emas (Paperback),0.905741
382,Kekasih Marionette dan 12 Kisah Lainnya (Paper...,0.905566
925,"Haduh, aku di-follow (Paperback)",0.905312
713,Jingga (Paperback),0.903698
