In [None]:
# Mount Google Drive into the Colab runtime; the dataset is read from
# under '/content/drive' in a later cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Read the dataset from Google Drive
file_path = '/content/drive/MyDrive/combined_artikel_final.csv'
data = pd.read_csv(file_path)

# Show a preview of the loaded rows
# NOTE(review): the rendered preview shows consecutive identical rows
# (e.g. 0/1 and 2/3); confirm against the CSV whether duplicates should be
# dropped before training.
data

Unnamed: 0,tanggal,penulis,judul,isi_berita,lokasi,link_berita
0,"Senin, 06 Jan 2025 19:30 WIB",Sonia Basoni - \n ...,"Tak Terima Masakannya Dikritik, Chef Bintang M...",Jakarta - Tak terima makanan buatannya dikriti...,Jakarta,https://food.detik.com/info-kuliner/d-7719786/...
1,"Senin, 06 Jan 2025 19:30 WIB",Sonia Basoni - \n ...,"Tak Terima Masakannya Dikritik, Chef Bintang M...",Jakarta - Tak terima makanan buatannya dikriti...,Jakarta,https://food.detik.com/info-kuliner/d-7719786/...
2,"Senin, 06 Jan 2025 19:00 WIB",Yenny Mustika Sari - \n ...,Wanita Cantik Ini Jual Durian Musang King yang...,Jakarta - Penjual durian selalu menarik perhat...,Jakarta,https://food.detik.com/info-kuliner/d-7719898/...
3,"Senin, 06 Jan 2025 19:00 WIB",Yenny Mustika Sari - \n ...,Wanita Cantik Ini Jual Durian Musang King yang...,Jakarta - Penjual durian selalu menarik perhat...,Jakarta,https://food.detik.com/info-kuliner/d-7719898/...
4,"Senin, 06 Jan 2025 18:30 WIB",Atiqa Rana - \n ...,Zonk! Pria Ini Merasa Tertipu Kualitas Menu di...,Jakarta - Usai makan malam fine dining di rest...,Jakarta,https://food.detik.com/info-kuliner/d-7719864/...
...,...,...,...,...,...,...
206,"Minggu, 05 Jan 2025 21:00 WIB",Instagram Shin Tae-yong - \n ...,Momen Shin Tae-yong Makan Malam Bareng Pemain ...,Isi berita tidak ditemukan,Lokasi tidak ditemukan,https://food.detik.com/foto-kuliner/d-7718596/...
207,"Minggu, 05 Jan 2025 18:00 WIB",Atiqa Rana - \n ...,"Makan Pizza di Kafe, Pelanggan Kaget Kena Biay...",Jakarta - Tambahan biaya di resto kerap membua...,Jakarta,https://food.detik.com/info-kuliner/d-7717742/...
208,"Minggu, 05 Jan 2025 19:00 WIB",Sonia Basoni - \n ...,5 Kelakuan Aneh Mertua Saat Berseteru Masalah ...,Jakarta - Perseteruan antara mertua dan menant...,Jakarta,https://food.detik.com/info-kuliner/d-7717332/...
209,"Minggu, 05 Jan 2025 17:00 WIB",YouTube Trans 7 Official - \n ...,Potret Dapur di Rumah Baru Adly Fairuz dan Anb...,Isi berita tidak ditemukan,Lokasi tidak ditemukan,https://food.detik.com/foto-kuliner/d-7717343/...


In [None]:
import nltk
# Download the NLTK resources used by the preprocessing step: the
# 'punkt_tab' tokenizer data and the 'stopwords' corpus.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Step 1: Data Identification (focus on 'judul' column)
# Only non-missing titles are kept for the rest of the pipeline.
judul = data['judul'].dropna()

# Step 2: Preprocessing and Tokenization
# Fungsi preprocessing
def preprocess_text(text):
    """Normalize a piece of text and split it into cleaned word tokens.

    The text is lowercased, every character in ``string.punctuation`` is
    removed, and the result is tokenized with NLTK's ``word_tokenize``.
    Indonesian stopwords and tokens that are not purely alphabetic are then
    discarded.

    Args:
        text (str): Raw text to preprocess.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    # Load the stopword list once per call and keep it as a set so each
    # membership check is O(1), instead of re-reading the list for every token.
    stop_words = set(stopwords.words('indonesian'))

    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    return [word for word in tokens if word not in stop_words and word.isalpha()]

# Apply preprocessing to the titles and store the token lists on the frame
data['tokens'] = judul.apply(preprocess_text)

# Step 3: Create Training Base with Skip-Gram Model
def create_skipgram_training_pairs(tokens, window_size=2):
    """Build (target, context) pairs for skip-gram training.

    Each token is paired with every other token located at most
    ``window_size`` positions to its left or right.

    Args:
        tokens: Sequence of tokens for one text.
        window_size: Maximum distance between target and context positions.

    Returns:
        list[tuple]: Pairs ordered by target position, then context position.
    """
    return [
        (center, tokens[pos])
        for center_pos, center in enumerate(tokens)
        for pos in range(
            max(0, center_pos - window_size),
            min(len(tokens), center_pos + window_size + 1),
        )
        if pos != center_pos
    ]

# Build (target, context) pairs for every row. Rows without a token list
# (a missing 'judul' is dropped before tokenization, so its 'tokens' entry is
# not a list) get an empty pair list instead of raising inside the builder.
data['training_pairs'] = data['tokens'].apply(
    lambda x: create_skipgram_training_pairs(x) if isinstance(x, list) else []
)

# Flatten all training pairs
total_training_pairs = [pair for sublist in data['training_pairs'] for pair in sublist]

# Vocabulary and initialization for word vectors.
# Sorting makes the word -> index assignment identical on every run; iterating
# a plain set of strings does not guarantee a stable order across sessions.
vocab = sorted({word for pair in total_training_pairs for word in pair})
word_to_index = {word: i for i, word in enumerate(vocab)}
index_to_word = {i: word for word, i in word_to_index.items()}
vocab_size = len(vocab)
embedding_dim = 50

# Initialize word embeddings.
# A seeded generator makes training reproducible. Values are drawn from a
# zero-centred range: all-positive weights in [0, 1) give every pair of
# embeddings a high cosine similarity before any training has happened.
rng = np.random.default_rng(42)
W1 = rng.uniform(-0.5, 0.5, size=(vocab_size, embedding_dim))
W2 = rng.uniform(-0.5, 0.5, size=(embedding_dim, vocab_size))

# Define softmax function
def softmax(x):
    """Return the softmax of ``x``, normalized along axis 0.

    The maximum of ``x`` is subtracted before exponentiation so that large
    inputs do not overflow.

    Args:
        x: NumPy array of scores.

    Returns:
        Array of the same shape as ``x`` holding the normalized exponentials.
    """
    shifted = x - np.max(x)
    numerators = np.exp(shifted)
    denominator = numerators.sum(axis=0)
    return numerators / denominator

# Training function for Skip-Gram Model
def train_skipgram(pairs, epochs=100, learning_rate=0.01):
    """Train the module-level skip-gram weights ``W1`` and ``W2`` in place.

    For each (target, context) pair the target's row of ``W1`` is used as the
    hidden layer, projected through ``W2`` and passed through ``softmax``.
    The loss is the negative log of the probability assigned to the context
    word, and both weight matrices are updated by one gradient step per pair.
    The mean loss per pair is printed after every epoch.

    Args:
        pairs: Sized collection of (target, context) word pairs; both words
            must be keys of the module-level ``word_to_index``.
        epochs: Number of passes over ``pairs``.
        learning_rate: Step size applied to both weight updates.

    Returns:
        None. ``W1`` and ``W2`` are modified as a side effect.
    """
    global W1, W2

    # Nothing to learn from; this also avoids dividing by zero when the mean
    # loss is reported below.
    if len(pairs) == 0:
        print("No training pairs provided; skipping training.")
        return

    # Smallest positive normal float, used as a floor so that a probability
    # that underflows to 0 does not produce an infinite loss.
    min_prob = np.finfo(float).tiny

    for epoch in range(epochs):
        total_loss = 0
        for target, context in pairs:
            target_idx = word_to_index[target]
            context_idx = word_to_index[context]

            # Forward pass
            hidden_layer = W1[target_idx]
            output_layer = softmax(np.dot(hidden_layer, W2))

            # Compute loss (negative log likelihood)
            loss = -np.log(max(output_layer[context_idx], min_prob))
            total_loss += loss

            # Backward pass: the gradient w.r.t. the scores is the predicted
            # distribution minus the one-hot context vector. The softmax
            # output is reused in place because it is not needed afterwards.
            error = output_layer
            error[context_idx] -= 1

            # Both gradients are computed before either weight is changed.
            dW2 = np.outer(hidden_layer, error)
            dW1 = np.dot(W2, error)

            # Update weights
            W1[target_idx] -= learning_rate * dW1
            W2 -= learning_rate * dW2

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(pairs):.4f}")

# Train the Skip-Gram Model
# Runs 10 epochs here instead of the function's default of 100.
train_skipgram(total_training_pairs, epochs=10, learning_rate=0.01)

# Step 4: Word Similarity Evaluation
def most_similar(word, top_n=5):
    """Return the words whose ``W1`` embeddings are closest to ``word``.

    Closeness is the cosine similarity between the word's row of the
    module-level ``W1`` and every other row.

    Args:
        word: Query word; looked up in the module-level ``word_to_index``.
        top_n: Maximum number of neighbours to return.

    Returns:
        list[tuple]: ``(word, similarity)`` pairs in descending similarity,
        never including the query word itself. Empty if ``word`` is not in
        the vocabulary.
    """
    if word not in word_to_index:
        return []
    word_idx = word_to_index[word]
    word_vector = W1[word_idx]
    similarities = cosine_similarity([word_vector], W1)[0]
    ranked_indices = np.argsort(similarities)[::-1]
    # Drop the query by index rather than assuming it is ranked first: with
    # tied scores or an all-zero vector it may not be, and slicing off rank 0
    # would then discard a real neighbour and return the query itself.
    similar_indices = [i for i in ranked_indices if i != word_idx][:top_n]
    return [(index_to_word[i], similarities[i]) for i in similar_indices]

Epoch 1/10, Loss: 6.3231
Epoch 2/10, Loss: 5.9265
Epoch 3/10, Loss: 5.7411
Epoch 4/10, Loss: 5.5771
Epoch 5/10, Loss: 5.4167
Epoch 6/10, Loss: 5.2552
Epoch 7/10, Loss: 5.0910
Epoch 8/10, Loss: 4.9234
Epoch 9/10, Loss: 4.7524
Epoch 10/10, Loss: 4.5787


In [None]:
# Display the frame, which now also carries the 'tokens' and 'training_pairs' columns.
data

Unnamed: 0,tanggal,penulis,judul,isi_berita,lokasi,link_berita,tokens,training_pairs
0,"Senin, 06 Jan 2025 19:30 WIB",Sonia Basoni - \n ...,"Tak Terima Masakannya Dikritik, Chef Bintang M...",Jakarta - Tak terima makanan buatannya dikriti...,Jakarta,https://food.detik.com/info-kuliner/d-7719786/...,"[terima, masakannya, dikritik, chef, bintang, ...","[(terima, masakannya), (terima, dikritik), (ma..."
1,"Senin, 06 Jan 2025 19:30 WIB",Sonia Basoni - \n ...,"Tak Terima Masakannya Dikritik, Chef Bintang M...",Jakarta - Tak terima makanan buatannya dikriti...,Jakarta,https://food.detik.com/info-kuliner/d-7719786/...,"[terima, masakannya, dikritik, chef, bintang, ...","[(terima, masakannya), (terima, dikritik), (ma..."
2,"Senin, 06 Jan 2025 19:00 WIB",Yenny Mustika Sari - \n ...,Wanita Cantik Ini Jual Durian Musang King yang...,Jakarta - Penjual durian selalu menarik perhat...,Jakarta,https://food.detik.com/info-kuliner/d-7719898/...,"[wanita, cantik, jual, durian, musang, king, l...","[(wanita, cantik), (wanita, jual), (cantik, wa..."
3,"Senin, 06 Jan 2025 19:00 WIB",Yenny Mustika Sari - \n ...,Wanita Cantik Ini Jual Durian Musang King yang...,Jakarta - Penjual durian selalu menarik perhat...,Jakarta,https://food.detik.com/info-kuliner/d-7719898/...,"[wanita, cantik, jual, durian, musang, king, l...","[(wanita, cantik), (wanita, jual), (cantik, wa..."
4,"Senin, 06 Jan 2025 18:30 WIB",Atiqa Rana - \n ...,Zonk! Pria Ini Merasa Tertipu Kualitas Menu di...,Jakarta - Usai makan malam fine dining di rest...,Jakarta,https://food.detik.com/info-kuliner/d-7719864/...,"[zonk, pria, tertipu, kualitas, menu, resto, f...","[(zonk, pria), (zonk, tertipu), (pria, zonk), ..."
...,...,...,...,...,...,...,...,...
206,"Minggu, 05 Jan 2025 21:00 WIB",Instagram Shin Tae-yong - \n ...,Momen Shin Tae-yong Makan Malam Bareng Pemain ...,Isi berita tidak ditemukan,Lokasi tidak ditemukan,https://food.detik.com/foto-kuliner/d-7718596/...,"[momen, shin, taeyong, makan, malam, bareng, p...","[(momen, shin), (momen, taeyong), (shin, momen..."
207,"Minggu, 05 Jan 2025 18:00 WIB",Atiqa Rana - \n ...,"Makan Pizza di Kafe, Pelanggan Kaget Kena Biay...",Jakarta - Tambahan biaya di resto kerap membua...,Jakarta,https://food.detik.com/info-kuliner/d-7717742/...,"[makan, pizza, kafe, pelanggan, kaget, kena, b...","[(makan, pizza), (makan, kafe), (pizza, makan)..."
208,"Minggu, 05 Jan 2025 19:00 WIB",Sonia Basoni - \n ...,5 Kelakuan Aneh Mertua Saat Berseteru Masalah ...,Jakarta - Perseteruan antara mertua dan menant...,Jakarta,https://food.detik.com/info-kuliner/d-7717332/...,"[kelakuan, aneh, mertua, berseteru, makanan, m...","[(kelakuan, aneh), (kelakuan, mertua), (aneh, ..."
209,"Minggu, 05 Jan 2025 17:00 WIB",YouTube Trans 7 Official - \n ...,Potret Dapur di Rumah Baru Adly Fairuz dan Anb...,Isi berita tidak ditemukan,Lokasi tidak ditemukan,https://food.detik.com/foto-kuliner/d-7717343/...,"[potret, dapur, rumah, adly, fairuz, anbeen, r...","[(potret, dapur), (potret, rumah), (dapur, pot..."


In [None]:
# Inspect the token list produced for each title.
data['tokens']

Unnamed: 0,tokens
0,"[terima, masakannya, dikritik, chef, bintang, ..."
1,"[terima, masakannya, dikritik, chef, bintang, ..."
2,"[wanita, cantik, jual, durian, musang, king, l..."
3,"[wanita, cantik, jual, durian, musang, king, l..."
4,"[zonk, pria, tertipu, kualitas, menu, resto, f..."
...,...
206,"[momen, shin, taeyong, makan, malam, bareng, p..."
207,"[makan, pizza, kafe, pelanggan, kaget, kena, b..."
208,"[kelakuan, aneh, mertua, berseteru, makanan, m..."
209,"[potret, dapur, rumah, adly, fairuz, anbeen, r..."


In [None]:
# Inspect the (target, context) pairs generated for each title.
data['training_pairs']

Unnamed: 0,training_pairs
0,"[(terima, masakannya), (terima, dikritik), (ma..."
1,"[(terima, masakannya), (terima, dikritik), (ma..."
2,"[(wanita, cantik), (wanita, jual), (cantik, wa..."
3,"[(wanita, cantik), (wanita, jual), (cantik, wa..."
4,"[(zonk, pria), (zonk, tertipu), (pria, zonk), ..."
...,...
206,"[(momen, shin), (momen, taeyong), (shin, momen..."
207,"[(makan, pizza), (makan, kafe), (pizza, makan)..."
208,"[(kelakuan, aneh), (kelakuan, mertua), (aneh, ..."
209,"[(potret, dapur), (potret, rumah), (dapur, pot..."


In [None]:
# User Interaction
# Prompt repeatedly for a word and print its nearest neighbours until the
# user types 'exit'.
while True:
    user_input = input("Enter a word to find similar words (or type 'exit' to quit): ").strip()
    # Vocabulary entries are lowercased during preprocessing, so the lookup
    # must be case-insensitive too; otherwise "Kuliner" is reported as missing.
    query = user_input.lower()
    if query == 'exit':
        print("Exiting the program. Goodbye!")
        break
    elif query in word_to_index:
        similar_words = most_similar(query)
        print(f"Words similar to '{user_input}': {similar_words}")
    else:
        print(f"Word '{user_input}' not found in vocabulary. Please try another word.")

Enter a word to find similar words (or type 'exit' to quit): kuliner
Words similar to 'kuliner': [('penuhi', 0.8381712757714516), ('ular', 0.821149398453208), ('montok', 0.8140936273216197), ('smart', 0.8109900417506906), ('bekasi', 0.8103972831167352)]
Enter a word to find similar words (or type 'exit' to quit): makanan
Words similar to 'makanan': [('pujian', 0.7704246550357744), ('driver', 0.7691261949348678), ('penambah', 0.7662511667611868), ('arena', 0.7651327013212049), ('sukses', 0.7636132194904848)]
Enter a word to find similar words (or type 'exit' to quit): resep
Words similar to 'resep': [('ayam', 0.7895517027229951), ('tomat', 0.7856032224698138), ('sosial', 0.7821744693763543), ('artomoro', 0.7733600419879195), ('sopir', 0.7718452721866893)]
Enter a word to find similar words (or type 'exit' to quit): food
Words similar to 'food': [('jantung', 0.8396507580262682), ('awards', 0.8363467104622282), ('minuman', 0.812957771361825), ('kalengan', 0.808760063149016), ('manfaat', 0

In [None]:
# Fungsi untuk menemukan kata-kata yang mirip
def find_similar_words(word, word_embeddings, word_to_index, top_n=5):
    """Find the words whose embeddings are most similar to ``word``.

    Similarity is the cosine similarity between the word's row of
    ``word_embeddings`` and every row of that matrix.

    Args:
        word: Query word.
        word_embeddings: 2-D array with one embedding row per vocabulary index.
        word_to_index: Mapping from word to its row index in ``word_embeddings``.
        top_n: Maximum number of neighbours to return.

    Returns:
        list[tuple]: ``(word, similarity)`` pairs in descending similarity,
        never including the query word itself. Empty (after printing a
        message) if ``word`` is not in ``word_to_index``.
    """
    if word not in word_to_index:
        print(f"Kata '{word}' tidak ditemukan dalam kosakata.")
        return []

    word_idx = word_to_index[word]
    word_vector = word_embeddings[word_idx]

    # Compute cosine similarity against every embedding vector
    similarities = cosine_similarity([word_vector], word_embeddings)[0]

    # Invert the mapping that was passed in rather than reading a global
    # index -> word table, so the result always matches the caller's vocabulary.
    index_lookup = {idx: token for token, idx in word_to_index.items()}

    # Rank in descending order and drop the query by index instead of
    # assuming it sits at rank 0 (ties or a zero vector can break that).
    ranked_indices = np.argsort(similarities)[::-1]
    similar_indices = [i for i in ranked_indices if i != word_idx][:top_n]
    return [(index_lookup[i], similarities[i]) for i in similar_indices]

In [None]:
# Example usage: look up the nearest neighbours of 'kuliner' in the trained W1.
# NOTE(review): this cell is repeated below for several other words; a loop
# over a list of words would avoid the duplication.
target_word = 'kuliner'
similar_words = find_similar_words(target_word, W1, word_to_index)

# Print the results, one word per line with its similarity score
print(f"Kata-kata yang mirip dengan '{target_word}':")
for word, similarity in similar_words:
    print(f"{word}: {similarity:.4f}")

Kata-kata yang mirip dengan 'kuliner':
penuhi: 0.8382
ular: 0.8211
montok: 0.8141
smart: 0.8110
bekasi: 0.8104


In [None]:
# Example usage: look up the nearest neighbours of 'makanan' in the trained W1.
target_word = 'makanan'
similar_words = find_similar_words(target_word, W1, word_to_index)

# Print the results, one word per line with its similarity score
print(f"Kata-kata yang mirip dengan '{target_word}':")
for word, similarity in similar_words:
    print(f"{word}: {similarity:.4f}")

Kata-kata yang mirip dengan 'makanan':
pujian: 0.7704
driver: 0.7691
penambah: 0.7663
arena: 0.7651
sukses: 0.7636


In [None]:
# Example usage: look up the nearest neighbours of 'resep' in the trained W1.
target_word = 'resep'
similar_words = find_similar_words(target_word, W1, word_to_index)

# Print the results, one word per line with its similarity score
print(f"Kata-kata yang mirip dengan '{target_word}':")
for word, similarity in similar_words:
    print(f"{word}: {similarity:.4f}")

Kata-kata yang mirip dengan 'resep':
ayam: 0.7896
tomat: 0.7856
sosial: 0.7822
artomoro: 0.7734
sopir: 0.7718


In [None]:
# Example usage: look up the nearest neighbours of 'food' in the trained W1.
target_word = 'food'
similar_words = find_similar_words(target_word, W1, word_to_index)

# Print the results, one word per line with its similarity score
print(f"Kata-kata yang mirip dengan '{target_word}':")
for word, similarity in similar_words:
    print(f"{word}: {similarity:.4f}")

Kata-kata yang mirip dengan 'food':
jantung: 0.8397
awards: 0.8363
minuman: 0.8130
kalengan: 0.8088
manfaat: 0.8047


In [None]:
# Example usage: look up the nearest neighbours of 'minuman' in the trained W1.
target_word = 'minuman'
similar_words = find_similar_words(target_word, W1, word_to_index)

# Print the results, one word per line with its similarity score
print(f"Kata-kata yang mirip dengan '{target_word}':")
for word, similarity in similar_words:
    print(f"{word}: {similarity:.4f}")

Kata-kata yang mirip dengan 'minuman':
brin: 0.8388
sayuran: 0.8357
manfaat: 0.8318
kangen: 0.8250
salad: 0.8227


In [None]:
# Example usage: look up the nearest neighbours of 'sayuran' in the trained W1.
target_word = 'sayuran'
similar_words = find_similar_words(target_word, W1, word_to_index)

# Print the results, one word per line with its similarity score
print(f"Kata-kata yang mirip dengan '{target_word}':")
for word, similarity in similar_words:
    print(f"{word}: {similarity:.4f}")

Kata-kata yang mirip dengan 'sayuran':
luncurkan: 0.8916
michelin: 0.8883
gokil: 0.8676
salad: 0.8635
bintang: 0.8582


In [None]:
# Example usage: look up the nearest neighbours of 'buah' in the trained W1.
target_word = 'buah'
similar_words = find_similar_words(target_word, W1, word_to_index)

# Print the results, one word per line with its similarity score
print(f"Kata-kata yang mirip dengan '{target_word}':")
for word, similarity in similar_words:
    print(f"{word}: {similarity:.4f}")

Kata-kata yang mirip dengan 'buah':
mantap: 0.8586
lebaran: 0.8386
kranggan: 0.8349
salah: 0.8229
creamy: 0.8157
