<a href="https://colab.research.google.com/github/Natasyaamb/Sistem-Pencarian-Rekomendasi-Buku-TF-IDF-dan-Cosine/blob/main/Sistem_Pencarian_Rekomendasi_Buku_Word2Vector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gensim
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
import string
import numpy as np
import pandas as pd
import re
import ipywidgets as widgets
from IPython.display import display

In [None]:
import nltk

# NLTK resources required by the preprocessing cells:
#   - 'punkt' / 'punkt_tab': tokenizer models used by word_tokenize. NLTK 3.9+
#     loads the tokenizer from 'punkt_tab', older releases from 'punkt', so
#     both are fetched to keep the notebook runnable on either.
#   - 'stopwords': the English stopword list.
#   - 'wordnet': the lexical database used by WordNetLemmatizer.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Load the book dataset from an Excel workbook into the `buku` DataFrame.
buku = pd.read_excel('dataset_summary_book.xlsx')  # Make sure the file name and path are correct

In [None]:
# Show the column names of the loaded dataset.
print(buku.columns)

Index(['wikipedia ID', 'freebase ID', 'title', 'book author',
       'publication date', 'genres', 'summary'],
      dtype='object')


In [None]:
# Preview the first rows of the raw dataset.
print(buku.head())

   wikipedia ID freebase ID                                      title  \
0           620     /m/0hhy                                Animal Farm   
1           843     /m/0k36                         A Clockwork Orange   
2           986     /m/0ldx                                 The Plague   
3          1756     /m/0sww  An Enquiry Concerning Human Understanding   
4          2080     /m/0wkt                       A Fire Upon the Deep   

       book author     publication date  \
0    George Orwell  1945-08-17 00:00:00   
1  Anthony Burgess                 1962   
2     Albert Camus                 1947   
3       David Hume                  NaN   
4     Vernor Vinge                  NaN   

                                              genres  \
0  Roman, Satire, Children's literature, Speculat...   
1  Science Fiction, Novella, Speculative fiction,...   
2  Existentialism, Fiction, Absurdist fiction, Novel   
3                                                NaN   
4  Hard science 

In [None]:
# Clean raw text
def clean_text(text):
    """Normalize raw text before tokenization.

    Lowercases the text, removes digits, turns punctuation and underscores
    into spaces, and collapses every run of whitespace into a single space.

    Args:
        text: Raw text. Any non-string value (for example a NaN coming from
            an empty spreadsheet cell) is treated as missing.

    Returns:
        The cleaned string, or "" when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()  # Case-fold so "War" and "war" are the same token
    text = re.sub(r'\d+', '', text)  # Drop digits
    # Replace punctuation with a space instead of deleting it: deleting would
    # glue neighbouring words together ("well-known" -> "wellknown",
    # "war—peace" -> "warpeace"). The underscore is listed explicitly because
    # \w matches it and it would otherwise survive.
    text = re.sub(r'[^\w\s]|_', ' ', text)
    # Collapse the whitespace runs left behind and trim both ends.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Shared resources for stopword removal and lemmatization.
# The stopword list is stored as a set, and a single lemmatizer instance is
# reused by preprocess_text for every token.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Turn raw text into a list of lemmatized, stopword-free tokens.

    The text is first normalized with ``clean_text``, then tokenized with
    ``word_tokenize``. Tokens found in ``stop_words`` are discarded and the
    remaining ones are lemmatized.

    Returns:
        A list of token strings (empty when nothing survives the filtering).
    """
    tokens = word_tokenize(clean_text(text))
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return kept

# Tokenize every summary and store the token list in a new 'cleaned_summary' column.
buku['cleaned_summary'] = buku['summary'].apply(preprocess_text)

# Print the first rows to confirm preprocessing ran without errors
print(buku.head())

   wikipedia ID freebase ID                                      title  \
0           620     /m/0hhy                                Animal Farm   
1           843     /m/0k36                         A Clockwork Orange   
2           986     /m/0ldx                                 The Plague   
3          1756     /m/0sww  An Enquiry Concerning Human Understanding   
4          2080     /m/0wkt                       A Fire Upon the Deep   

       book author     publication date  \
0    George Orwell  1945-08-17 00:00:00   
1  Anthony Burgess                 1962   
2     Albert Camus                 1947   
3       David Hume                  NaN   
4     Vernor Vinge                  NaN   

                                              genres  \
0  Roman, Satire, Children's literature, Speculat...   
1  Science Fiction, Novella, Speculative fiction,...   
2  Existentialism, Fiction, Absurdist fiction, Novel   
3                                                NaN   
4  Hard science 

In [None]:
# Train a Word2Vec model on the 'cleaned_summary' column: each book's token
# list is passed to gensim as one training sentence.
# min_count=1 is set so that even words occurring only once get a vector.
sentences = buku['cleaned_summary'].tolist()
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Save the model for later use (optional)
model.save("word2vec_buku.model")

In [None]:
def _mean_word_vector(tokens):
    """Average the Word2Vec vectors of the in-vocabulary tokens, or None if there are none."""
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if not vectors:
        # np.mean([]) would yield NaN, which cosine_similarity rejects.
        return None
    return np.mean(vectors, axis=0)


def get_book_recommendations(summary, top_n=5):
    """Recommend the books whose summaries are most similar to ``summary``.

    The input is preprocessed with ``preprocess_text`` and represented as the
    mean of its Word2Vec word vectors. Every book in ``buku`` is represented
    the same way from its 'cleaned_summary' tokens, and books are ranked by
    cosine similarity to the input.

    Args:
        summary: Free-text synopsis to match against the dataset.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of ``(title, similarity)`` tuples sorted by descending
        similarity, at most ``top_n`` long. The list is empty when the input
        contains no word known to the model, or when no book has a usable
        summary vector.
    """
    query_vector = _mean_word_vector(preprocess_text(summary))
    if query_vector is None:
        # Nothing to compare against: an empty or fully out-of-vocabulary
        # query has no meaningful direction in the embedding space.
        return []

    titles = []
    book_vectors = []
    for title, tokens in zip(buku['title'], buku['cleaned_summary']):
        book_vector = _mean_word_vector(tokens)
        if book_vector is None:  # Empty or out-of-vocabulary summary
            continue
        titles.append(title)
        book_vectors.append(book_vector)

    if not titles:
        return []

    # One pairwise call against the whole matrix instead of one call per row.
    scores = cosine_similarity([query_vector], book_vectors)[0]

    # Sort by similarity and keep the top_n best matches.
    ranked = sorted(zip(titles, scores), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

In [None]:
# Text area where the user types the synopsis to match
summary_input = widgets.Textarea(
    value='',
    placeholder='Masukkan sinopsis buku di sini...',
    description='Summary:',
    disabled=False,
    layout=widgets.Layout(width='50%', height='100px')
)

# Button that triggers the recommendation lookup
button = widgets.Button(description="Get Recommendations")

# Dedicated output area: text printed inside a widget callback is only
# guaranteed to be shown where expected when it is captured by an Output widget.
output_area = widgets.Output()

def on_button_click(b):
    """Show recommendations for the current text-area content.

    Args:
        b: The button instance passed by ipywidgets (unused).
    """
    summary = summary_input.value
    # Clear the previous results so repeated clicks do not pile up output.
    output_area.clear_output()
    with output_area:
        recommendations = get_book_recommendations(summary)
        if not recommendations:
            print("No recommendations found.")
            return
        for title, similarity in recommendations:
            print(f"Title: {title}, Similarity: {similarity}")

# Register the click handler on the button
button.on_click(on_button_click)

# Render the input, the button and the results area
display(summary_input, button, output_area)


Textarea(value='', description='Summary:', layout=Layout(height='100px', width='50%'), placeholder='Masukkan s…

Button(description='Get Recommendations', style=ButtonStyle())

Title: On War, Similarity: 0.9998793005943298
Title: Death of a Hero, Similarity: 0.9998303651809692
Title: All Quiet on the Western Front, Similarity: 0.9998170137405396
Title: The Myth of Sisyphus, Similarity: 0.9998139142990112
Title: The Eye of the World, Similarity: 0.9998115301132202
