In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, TimeDistributed, RepeatVector
import tensorflow as tf

# --- Load the corpus from the 'utt' column ---
# Rows containing any missing value are dropped and every remaining cell is
# cast to str before the utterances are extracted as a plain Python list.
df = pd.read_csv("teksAsli.csv")
df = df.dropna()
df = df.astype(str)
texts = df['utt'].tolist()

# --- Train/test split (20% held out, fixed random_state) ---
train_texts, test_texts = train_test_split(texts, test_size=0.2, random_state=42)

# --- Tokenisation: the vocabulary is fitted on the training split only ---
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
# One more than the number of known words, so id 0 is a valid class as well
# (presumably Keras Tokenizer ids start at 1 -- confirm against its docs).
vocab_size = 1 + len(tokenizer.word_index)

# --- Longest sentence, in words, across ALL texts (train and test) ---
max_len = max(len(text_to_word_sequence(sentence)) for sentence in texts)

def encode(texts):
    """Convert raw sentences to padded sequences of token ids.

    The sentences are mapped to integer ids by the module-level
    ``tokenizer`` and the result is passed to ``pad_sequences`` with
    ``maxlen=max_len`` and padding appended at the end (``padding='post'``).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=max_len,
        padding='post',
    )

# Encode both splits with the same tokenizer and padding settings.
train_seq, test_seq = encode(train_texts), encode(test_texts)

# --- Autoencoder target: the model learns to reproduce its own input ---
# Kept as integer ids (no to_categorical / one-hot); note this is the very
# same array object as train_seq, not a copy.
train_target = train_seq

# --- Build the LSTM autoencoder ---
# Seed the Python, NumPy and TensorFlow RNGs so that weight initialisation
# and batch shuffling are repeatable, in line with the fixed random_state
# already used for the train/test split. (Some GPU kernels may still be
# non-deterministic.)
tf.keras.utils.set_random_seed(42)

model = Sequential()
# Token ids -> 128-dimensional embeddings.
model.add(Embedding(input_dim=vocab_size, output_dim=128))
# Encoder: only the final output of the LSTM is kept (no return_sequences).
model.add(LSTM(128))
# Repeat the encoder output max_len times so the decoder gets one input per step.
model.add(RepeatVector(max_len))
# Decoder: emits an output at every timestep.
model.add(LSTM(128, return_sequences=True))
# Per-timestep softmax over the vocabulary.
model.add(TimeDistributed(Dense(vocab_size, activation='softmax')))
# Sparse loss because the targets are integer ids, not one-hot vectors.
# NOTE(review): no masking is configured (Embedding is created without
# mask_zero), so padded positions (id 0) count toward both the loss and the
# reported accuracy.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# --- Training ---
# The targets get a trailing axis of size 1 before being passed to fit().
model.fit(train_seq, np.expand_dims(train_target, -1), epochs=50, batch_size=32, verbose=1)

# --- Decoding helpers ---
# Inverted tokenizer vocabulary: token id -> word.
reverse_word_map = {index: word for word, index in tokenizer.word_index.items()}
def sequence_to_text(sequence):
    """Turn a sequence of token ids back into a space-separated string.

    Ids equal to 0 are skipped. Any other id that is missing from
    ``reverse_word_map`` contributes an empty string to the join.
    """
    words = (
        reverse_word_map.get(token_id, '')
        for token_id in sequence
        if token_id != 0
    )
    return ' '.join(words)

# --- Generate "paraphrases" for the held-out test sentences ---
originals = test_texts
# Run the model on the encoded test set, then keep the highest-scoring
# vocabulary id along the last axis of the predictions.
preds = model.predict(test_seq)
pred_seqs = preds.argmax(axis=-1)
# Decode every predicted id sequence back to text.
paraphrases = list(map(sequence_to_text, pred_seqs))

# --- Sentence-level BLEU of each model output against its source sentence ---
# NLTK smoothing method 4 is applied to every sentence; both sides are
# tokenised with text_to_word_sequence and the source is the single reference.
smoothie = SmoothingFunction().method4
scores = [
    sentence_bleu(
        [text_to_word_sequence(source)],
        text_to_word_sequence(output),
        smoothing_function=smoothie,
    )
    for source, output in zip(originals, paraphrases)
]

# --- Write the results to CSV ---
# One row per test sentence: source text, model output and its BLEU score.
output_df = pd.DataFrame(
    dict(original=originals, paraphrase=paraphrases, bleu_score=scores)
)
output_df.to_csv('hasil_parafrase.csv', index=False)
print("📁 File disimpan sebagai 'hasil_parafrase.csv'")


Epoch 1/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 33ms/step - accuracy: 0.1217 - loss: 3.7066
Epoch 2/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.2635 - loss: 3.6580
Epoch 3/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.2596 - loss: 3.5425
Epoch 4/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.2661 - loss: 3.2443
Epoch 5/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.2577 - loss: 2.9234
Epoch 6/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.2871 - loss: 2.9746
Epoch 7/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.2867 - loss: 2.8314
Epoch 8/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.2871 - loss: 2.7458
Epoch 9/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [