In [3]:
!pip install torch torchtext torchvision --quiet
!pip install sentencepiece --quiet
!pip install sacrebleu --quiet
!pip install tqdm --quiet
!pip install matplotlib --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
# Import library standar
import os
import zipfile
import requests
import random
import numpy as np
import json
import re
import unicodedata
from collections import Counter
import string

# Import library PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

# Import library lainnya
import sentencepiece as spm
import sacrebleu
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import math
import time
import matplotlib.pyplot as plt

# Konfigurasi dasar
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Tentukan device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Menggunakan device: {device}")


Menggunakan device: cpu


In [5]:
# --- 2.1 Unduh dan Ekstrak Dataset ---
DATA_URL = "https://www.manythings.org/anki/ind-eng.zip"
ZIP_PATH = "/content/ind-eng.zip"
DATA_PATH = "ind.txt"

# Fungsi untuk mengunduh file dengan User-Agent
def download_file(url, path):
    print("Mengunduh dataset...")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, stream=True, headers=headers)
        response.raise_for_status() # Cek jika ada error HTTP
        with open(path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print("Unduhan selesai.")
        return True
    except requests.exceptions.RequestException as e:
        print(f"Gagal mengunduh: {e}")
        return False

# Cek dan validasi file zip
file_is_valid = False
if os.path.exists(ZIP_PATH):
    try:
        with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
            if zip_ref.testzip() is None:
                 print(f"'{ZIP_PATH}' sudah ada dan merupakan file zip yang valid.")
                 file_is_valid = True
            else:
                 print(f"'{ZIP_PATH}' terdeteksi korup. Akan diunduh ulang.")
    except zipfile.BadZipFile:
        print(f"'{ZIP_PATH}' bukan file zip yang valid. Akan diunduh ulang.")

download_successful = file_is_valid
if not file_is_valid:
    if os.path.exists(ZIP_PATH):
        os.remove(ZIP_PATH)
    download_successful = download_file(DATA_URL, ZIP_PATH)

# Ekstrak file zip hanya jika unduhan berhasil dan file ada
if download_successful and os.path.exists(ZIP_PATH):
    try:
        with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
            zip_ref.extractall()
        print(f"Dataset berhasil diekstrak.")
    except zipfile.BadZipFile:
        print(f"Gagal mengekstrak. File '{ZIP_PATH}' sepertinya masih korup. Coba jalankan sel ini lagi.")
else:
    if not os.path.exists(ZIP_PATH):
        print("Eksekusi dihentikan karena file dataset gagal diunduh dan tidak ditemukan.")


# --- 2.2 Baca dan Bersihkan Data ---
def clean_text(s):
    # Normalisasi unicode, ubah ke huruf kecil, hapus spasi berlebih
    s = s.lower().strip()
    # Beri spasi antara kata dan tanda baca
    s = re.sub(r"([?.!,¿])", r" \1 ", s)
    s = re.sub(r'[" "]+', " ", s)
    # Ganti semua karakter non-alfanumerik/tanda baca dasar dengan spasi
    s = re.sub(r"[^a-zA-Z?.!,¿]+", " ", s)
    return s.strip()

pairs = []
# Pastikan file DATA_PATH ada sebelum membacanya
if os.path.exists(DATA_PATH):
    with open(DATA_PATH, 'r', encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
        for line in tqdm(lines, desc="Membersihkan dan memuat data"):
            parts = line.split('\t')
            if len(parts) >= 2:
                eng, ind = parts[0], parts[1]
                # Hanya ambil pasangan dengan panjang kalimat wajar
                if 1 < len(eng.split()) < 50 and 1 < len(ind.split()) < 50:
                     pairs.append([clean_text(eng), clean_text(ind)])

    print(f"\nTotal pasangan kalimat setelah dibersihkan: {len(pairs)}")
    print("Contoh data bersih:")
    for pair in pairs[:5]:
        print(pair)
else:
    print(f"File data '{DATA_PATH}' tidak ditemukan. Pastikan proses unduh dan ekstrak berhasil.")


# --- 2.3 Tokenisasi Subword (SentencePiece) ---
# Simpan data ke file sementara untuk melatih SentencePiece
if pairs:
    with open('eng.txt', 'w', encoding='utf-8') as f:
        for pair in pairs:
            f.write(pair[0] + '\n')
    with open('ind.txt', 'w', encoding='utf-8') as f:
        for pair in pairs:
            f.write(pair[1] + '\n')

    # Latih model SentencePiece
    VOCAB_SIZE = 8000
    # SentencePiece secara default memiliki unk, bos, eos. Kita akan gunakan unk_id() sebagai pad_id()
    spm.SentencePieceTrainer.train(f'--input=eng.txt --model_prefix=eng_spm --vocab_size={VOCAB_SIZE} --character_coverage=1.0 --model_type=bpe')
    spm.SentencePieceTrainer.train(f'--input=ind.txt --model_prefix=ind_spm --vocab_size={VOCAB_SIZE} --character_coverage=1.0 --model_type=bpe')

    # Muat tokenizer yang sudah dilatih
    sp_eng = spm.SentencePieceProcessor()
    sp_ind = spm.SentencePieceProcessor()
    sp_eng.load('eng_spm.model')
    sp_ind.load('ind_spm.model')

    # Verifikasi ukuran vocab dan ID token padding
    PAD_IDX_ENG = sp_eng.unk_id() # Gunakan ID <unk> sebagai padding
    PAD_IDX_IND = sp_ind.unk_id() # Gunakan ID <unk> sebagai padding
    print(f"Ukuran Vocab EN (get_piece_size): {sp_eng.get_piece_size()}")
    print(f"Ukuran Vocab ID (get_piece_size): {sp_ind.get_piece_size()}")
    print(f"Padding ID (EN & ID): {PAD_IDX_ENG} & {PAD_IDX_IND}")


    # Contoh tokenisasi
    print("\nContoh Tokenisasi (EN):", sp_eng.encode_as_pieces("this is a test."))
    print("Contoh Tokenisasi (ID):", sp_ind.encode_as_pieces("ini adalah sebuah tes."))
else:
    print("Tidak ada data untuk melatih tokenizer.")


# --- 2.4 Bagi Data & Buat Dataset PyTorch ---
if pairs:
    train_pairs, temp_pairs = train_test_split(pairs, test_size=0.2, random_state=SEED)
    val_pairs, test_pairs = train_test_split(temp_pairs, test_size=0.5, random_state=SEED)

    print(f"\nUkuran data: Train={len(train_pairs)}, Val={len(val_pairs)}, Test={len(test_pairs)}")

    class TranslationDataset(Dataset):
        def __init__(self, pairs, sp_src, sp_trg):
            self.pairs = pairs
            self.sp_src = sp_src
            self.sp_trg = sp_trg

        def __len__(self):
            return len(self.pairs)

        def __getitem__(self, idx):
            src_text, trg_text = self.pairs[idx]
            src_ids = [self.sp_src.bos_id()] + self.sp_src.encode_as_ids(src_text) + [self.sp_src.eos_id()]
            trg_ids = [self.sp_trg.bos_id()] + self.sp_trg.encode_as_ids(trg_text) + [self.sp_trg.eos_id()]
            return torch.tensor(src_ids), torch.tensor(trg_ids)

    def collate_fn(batch):
        src_batch, trg_batch = [], []
        for src_sample, trg_sample in batch:
            src_batch.append(src_sample)
            trg_batch.append(trg_sample)

        # PERBAIKAN: Gunakan PAD_IDX yang sudah kita definisikan
        src_padded = nn.utils.rnn.pad_sequence(src_batch, padding_value=PAD_IDX_ENG)
        trg_padded = nn.utils.rnn.pad_sequence(trg_batch, padding_value=PAD_IDX_IND)
        return src_padded, trg_padded

    BATCH_SIZE = 64
    train_dataset = TranslationDataset(train_pairs, sp_eng, sp_ind)
    val_dataset = TranslationDataset(val_pairs, sp_eng, sp_ind)
    test_dataset = TranslationDataset(test_pairs, sp_eng, sp_ind)

    train_iterator = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    valid_iterator = DataLoader(val_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    test_iterator = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)
else:
    print("Tidak ada data untuk membuat DataLoader.")


Mengunduh dataset...
Unduhan selesai.
Dataset berhasil diekstrak.


Membersihkan dan memuat data:   0%|          | 0/14881 [00:00<?, ?it/s]


Total pasangan kalimat setelah dibersihkan: 14769
Contoh data bersih:
['i see .', 'aku mengerti .']
['i see .', 'begitu rupanya .']
['i see .', 'aku melihat .']
['i see .', 'oh , begitu .']
['i see .', 'saya melihat .']
Ukuran Vocab EN (get_piece_size): 8000
Ukuran Vocab ID (get_piece_size): 8000
Padding ID (EN & ID): 0 & 0

Contoh Tokenisasi (EN): ['▁this', '▁is', '▁a', '▁test', '.']
Contoh Tokenisasi (ID): ['▁ini', '▁adalah', '▁sebuah', '▁tes', '.']

Ukuran data: Train=11815, Val=1477, Test=1477


In [6]:
class EncoderRNN(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embedded)
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)))
        return outputs, hidden

class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        attention = self.v(energy).squeeze(2)
        return F.softmax(attention, dim=1)

class DecoderRNN(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention, pad_idx):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=pad_idx)
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        a = self.attention(hidden, encoder_outputs).unsqueeze(1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        weighted = torch.bmm(a, encoder_outputs).permute(1, 0, 2)
        rnn_input = torch.cat((embedded, weighted), dim=2)
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))
        return prediction, hidden.squeeze(0)

class Seq2SeqRNN(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        encoder_outputs, hidden = self.encoder(src)
        input = trg[0,:]
        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1
        return outputs

In [7]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class TransformerModel(nn.Module):
    def __init__(self, input_vocab_size, output_vocab_size, d_model, nhead, num_encoder_layers,
                 num_decoder_layers, dim_feedforward, dropout, src_pad_idx, trg_pad_idx):
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        self.d_model = d_model

        self.src_embedding = nn.Embedding(input_vocab_size, d_model, padding_idx=src_pad_idx)
        self.trg_embedding = nn.Embedding(output_vocab_size, d_model, padding_idx=trg_pad_idx)
        self.pos_encoder = PositionalEncoding(d_model, dropout)

        self.transformer = nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers,
                                          dim_feedforward, dropout)
        self.fc_out = nn.Linear(d_model, output_vocab_size)

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def forward(self, src, trg):
        # PERBAIKAN: Nama variabel pad_idx di Transformer internal berbeda
        src_key_padding_mask = (src == self.src_embedding.padding_idx).transpose(0, 1)
        trg_key_padding_mask = (trg == self.trg_embedding.padding_idx).transpose(0, 1)
        trg_mask = self._generate_square_subsequent_mask(trg.size(0)).to(device)

        src = self.src_embedding(src) * math.sqrt(self.d_model)
        trg = self.trg_embedding(trg) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        trg = self.pos_encoder(trg)

        output = self.transformer(src, trg,
                                  src_mask=None,
                                  tgt_mask=trg_mask,
                                  memory_mask=None,
                                  src_key_padding_mask=src_key_padding_mask,
                                  tgt_key_padding_mask=trg_key_padding_mask,
                                  memory_key_padding_mask=src_key_padding_mask)

        return self.fc_out(output)


In [8]:
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(tqdm(iterator, desc="Training")):
        src, trg = batch
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()

        if isinstance(model, TransformerModel):
            output = model(src, trg[:-1,:])
            output_dim = output.shape[-1]
            output = output.view(-1, output_dim)
            trg = trg[1:,:].view(-1)
        else: # Untuk RNN
            output = model(src, trg)
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(tqdm(iterator, desc="Evaluating")):
            src, trg = batch
            src, trg = src.to(device), trg.to(device)

            if isinstance(model, TransformerModel):
                output = model(src, trg[:-1,:])
                output_dim = output.shape[-1]
                output = output.view(-1, output_dim)
                trg = trg[1:,:].view(-1)
            else: # Untuk RNN
                output = model(src, trg, 0) # teacher forcing off
                output_dim = output.shape[-1]
                output = output[1:].view(-1, output_dim)
                trg = trg[1:].view(-1)

            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(iterator)

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

def run_training_loop(model, train_iterator, valid_iterator, optimizer, criterion, n_epochs, clip, model_save_path, scheduler=None):
    best_valid_loss = float('inf')

    for epoch in range(n_epochs):
        start_time = time.time()

        train_loss = train(model, train_iterator, optimizer, criterion, clip)
        valid_loss = evaluate(model, valid_iterator, criterion)

        if scheduler:
            scheduler.step(valid_loss)

        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), model_save_path)

        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

In [11]:
N_EPOCHS = 10
CLIP = 1

# --- 6.1 Latih Baseline RNN+Attention ---
print("\n--- Melatih Baseline: RNN + Attention ---")
INPUT_DIM = sp_eng.get_piece_size()
OUTPUT_DIM = sp_ind.get_piece_size()
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
# PERBAIKAN: Kirim pad_idx ke Encoder dan Decoder
enc = EncoderRNN(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT, PAD_IDX_ENG)
dec = DecoderRNN(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn, PAD_IDX_IND)
model_rnn = Seq2SeqRNN(enc, dec, device).to(device)

optimizer_rnn = optim.Adam(model_rnn.parameters())
# PERBAIKAN: Pastikan criterion menggunakan pad_idx yang benar
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX_IND)

run_training_loop(model_rnn, train_iterator, valid_iterator, optimizer_rnn, criterion, N_EPOCHS, CLIP, 'baseline-rnn-model.pt')



--- Melatih Baseline: RNN + Attention ---


Training:   0%|          | 0/185 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch: 01 | Time: 16m 1s
	Train Loss: 4.649 | Train PPL: 104.468
	 Val. Loss: 3.810 |  Val. PPL:  45.168


Training:   0%|          | 0/185 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch: 02 | Time: 16m 11s
	Train Loss: 3.097 | Train PPL:  22.125
	 Val. Loss: 3.317 |  Val. PPL:  27.587


Training:   0%|          | 0/185 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch: 03 | Time: 15m 58s
	Train Loss: 2.281 | Train PPL:   9.787
	 Val. Loss: 3.247 |  Val. PPL:  25.715


Training:   0%|          | 0/185 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch: 04 | Time: 15m 51s
	Train Loss: 1.769 | Train PPL:   5.865
	 Val. Loss: 3.224 |  Val. PPL:  25.125


Training:   0%|          | 0/185 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch: 05 | Time: 16m 2s
	Train Loss: 1.461 | Train PPL:   4.308
	 Val. Loss: 3.308 |  Val. PPL:  27.319


Training:   0%|          | 0/185 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch: 06 | Time: 16m 0s
	Train Loss: 1.275 | Train PPL:   3.577
	 Val. Loss: 3.349 |  Val. PPL:  28.467


Training:   0%|          | 0/185 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch: 07 | Time: 16m 10s
	Train Loss: 1.120 | Train PPL:   3.066
	 Val. Loss: 3.456 |  Val. PPL:  31.684


Training:   0%|          | 0/185 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch: 08 | Time: 16m 33s
	Train Loss: 1.014 | Train PPL:   2.755
	 Val. Loss: 3.499 |  Val. PPL:  33.087


Training:   0%|          | 0/185 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch: 09 | Time: 16m 33s
	Train Loss: 0.916 | Train PPL:   2.499
	 Val. Loss: 3.590 |  Val. PPL:  36.237


Training:   0%|          | 0/185 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch: 10 | Time: 16m 22s
	Train Loss: 0.843 | Train PPL:   2.324
	 Val. Loss: 3.631 |  Val. PPL:  37.746


In [12]:
# --- 6.2 Latih Transformer ---
print("\n--- Melatih Model: Transformer ---")
INPUT_DIM_TRANS = sp_eng.get_piece_size()
OUTPUT_DIM_TRANS = sp_ind.get_piece_size()
D_MODEL = 512
NHEAD = 8
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3
DIM_FEEDFORWARD = 2048
DROPOUT = 0.1

model_transformer = TransformerModel(INPUT_DIM_TRANS, OUTPUT_DIM_TRANS, D_MODEL, NHEAD,
                                     NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS,
                                     DIM_FEEDFORWARD, DROPOUT, PAD_IDX_ENG, PAD_IDX_IND).to(device)

optimizer_transformer = optim.Adam(model_transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
# PERBAIKAN: Hapus argumen 'verbose' yang sudah tidak digunakan
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer_transformer, 'min', patience=2)

run_training_loop(model_transformer, train_iterator, valid_iterator, optimizer_transformer, criterion, N_EPOCHS, CLIP, 'transformer-model.pt', scheduler=scheduler)


--- Melatih Model: Transformer ---


Training:   0%|          | 0/185 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch: 01 | Time: 13m 6s
	Train Loss: 5.083 | Train PPL: 161.248
	 Val. Loss: 4.307 |  Val. PPL:  74.213


Training:   0%|          | 0/185 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch: 02 | Time: 12m 54s
	Train Loss: 4.095 | Train PPL:  60.066
	 Val. Loss: 3.877 |  Val. PPL:  48.281


Training:   0%|          | 0/185 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch: 03 | Time: 12m 49s
	Train Loss: 3.666 | Train PPL:  39.104
	 Val. Loss: 3.636 |  Val. PPL:  37.952


Training:   0%|          | 0/185 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch: 04 | Time: 12m 39s
	Train Loss: 3.339 | Train PPL:  28.204
	 Val. Loss: 3.405 |  Val. PPL:  30.102


Training:   0%|          | 0/185 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch: 05 | Time: 12m 41s
	Train Loss: 3.062 | Train PPL:  21.373
	 Val. Loss: 3.248 |  Val. PPL:  25.733


Training:   0%|          | 0/185 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch: 06 | Time: 12m 38s
	Train Loss: 2.828 | Train PPL:  16.913
	 Val. Loss: 3.135 |  Val. PPL:  22.979


Training:   0%|          | 0/185 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch: 07 | Time: 12m 41s
	Train Loss: 2.620 | Train PPL:  13.741
	 Val. Loss: 3.055 |  Val. PPL:  21.231


Training:   0%|          | 0/185 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch: 08 | Time: 12m 31s
	Train Loss: 2.433 | Train PPL:  11.389
	 Val. Loss: 2.914 |  Val. PPL:  18.432


Training:   0%|          | 0/185 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch: 09 | Time: 12m 34s
	Train Loss: 2.252 | Train PPL:   9.505
	 Val. Loss: 2.857 |  Val. PPL:  17.411


Training:   0%|          | 0/185 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch: 10 | Time: 12m 39s
	Train Loss: 2.093 | Train PPL:   8.110
	 Val. Loss: 2.793 |  Val. PPL:  16.328


In [13]:
def translate_sentence_rnn(sentence, model, sp_src, sp_trg, device, max_len=50):
    model.eval()
    tokens = [sp_src.bos_id()] + sp_src.encode_as_ids(sentence) + [sp_src.eos_id()]
    src_tensor = torch.LongTensor(tokens).unsqueeze(1).to(device)

    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(src_tensor)

    trg_indexes = [sp_trg.bos_id()]
    for i in range(max_len):
        trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)
        with torch.no_grad():
            output, hidden = model.decoder(trg_tensor, hidden, encoder_outputs)
        pred_token = output.argmax(1).item()
        trg_indexes.append(pred_token)
        if pred_token == sp_trg.eos_id():
            break

    trg_tokens = sp_trg.decode_ids(trg_indexes)
    return trg_tokens

def translate_sentence_transformer(sentence, model, sp_src, sp_trg, device, max_len=50):
    model.eval()
    src_tokens = [sp_src.bos_id()] + sp_src.encode_as_ids(sentence) + [sp_src.eos_id()]
    src_tensor = torch.LongTensor(src_tokens).unsqueeze(1).to(device)

    trg_tokens = [sp_trg.bos_id()]
    for i in range(max_len):
        trg_tensor = torch.LongTensor(trg_tokens).unsqueeze(1).to(device)
        with torch.no_grad():
            output = model(src_tensor, trg_tensor)
        pred_token = output.argmax(2)[-1, :].item()
        trg_tokens.append(pred_token)
        if pred_token == sp_trg.eos_id():
            break

    return sp_trg.decode_ids(trg_tokens)

def calculate_bleu(data, model, sp_src, sp_trg, device, model_type):
    trgs = []
    preds = []

    for pair in tqdm(data, desc="Calculating BLEU"):
        src = pair[0]
        trg = pair[1]

        if model_type == 'rnn':
            pred_trg = translate_sentence_rnn(src, model, sp_src, sp_trg, device)
        else:
            pred_trg = translate_sentence_transformer(src, model, sp_src, sp_trg, device)

        preds.append(pred_trg)
        trgs.append([trg])

    return sacrebleu.corpus_bleu(preds, trgs)

# Muat model terbaik
model_rnn.load_state_dict(torch.load('baseline-rnn-model.pt'))
model_transformer.load_state_dict(torch.load('transformer-model.pt'))

# Hitung BLEU score
bleu_rnn = calculate_bleu(test_pairs, model_rnn, sp_eng, sp_ind, device, 'rnn')
bleu_transformer = calculate_bleu(test_pairs, model_transformer, sp_eng, sp_ind, device, 'transformer')

print(f'\nBLEU Score (Baseline RNN): {bleu_rnn.score:.2f}')
print(f'BLEU Score (Transformer): {bleu_transformer.score:.2f}')

# Tampilkan beberapa contoh terjemahan
print("\n--- Contoh Hasil Terjemahan ---")
for i, pair in enumerate(test_pairs[:10]):
    src, trg = pair
    pred_rnn = translate_sentence_rnn(src, model_rnn, sp_eng, sp_ind, device)
    pred_transformer = translate_sentence_transformer(src, model_transformer, sp_eng, sp_ind, device)

    print(f"Contoh #{i+1}")
    print(f"SRC: {src}")
    print(f"TRG: {trg}")
    print(f"RNN: {pred_rnn}")
    print(f"Transformer: {pred_transformer}\n")

Calculating BLEU:   0%|          | 0/1477 [00:00<?, ?it/s]

Calculating BLEU:   0%|          | 0/1477 [00:00<?, ?it/s]


BLEU Score (Baseline RNN): 54.11
BLEU Score (Transformer): 36.89

--- Contoh Hasil Terjemahan ---
Contoh #1
SRC: i should go home before my parents start to worry .
TRG: aku harus pulang sebelum orang tuaku menjadi khawatir .
RNN: aku harus pulang ke rumah orang tuaku .
Transformer: orang harus pulang lebih awal aku harus pulang .

Contoh #2
SRC: thanks for helping tom .
TRG: terima kasih telah membantu tom .
RNN: terima kasih untuk tom .
Transformer: terima kasih telah tom .

Contoh #3
SRC: mt . everest is the highest mountain in the world .
TRG: gunung everest adalah gunung paling tinggi di dunia .
RNN: gunung everest adalah gunung tertinggi tinggi di dunia .
Transformer: gunung everest adalah gunung everest adalah gunung tertinggi di dunia .

Contoh #4
SRC: i can tell you what tom thinks .
TRG: aku bisa memberitahukanmu yang tom pikirkan .
RNN: aku bisa memberitahumu apa yang tom .
Transformer: aku bisa tahu apa yang kamu katakan .

Contoh #5
SRC: nobody called .
TRG: tidak ada yan