# Filtering dan Re-save Dataset

In [5]:
import torch
from torch.nn.utils.rnn import pad_sequence

dae_data = torch.load('/kaggle/input/dae-preprocessed/dae_preprocessed.pt')
input_seqs = dae_data['input_seqs']
target_seqs = dae_data['target_seqs']
word_map = dae_data['word_map']

max_target_len = 64  # kamu bisa sesuaikan (64 cukup untuk caption)
new_input, new_target = [], []
for inp, tgt in zip(input_seqs, target_seqs):
    tgt_len = (tgt != word_map['<pad>']).sum().item()
    if tgt_len <= max_target_len:
        new_input.append(inp)
        new_target.append(tgt)
input_seqs_pad = pad_sequence(new_input, batch_first=True, padding_value=word_map['<pad>'])
target_seqs_pad = pad_sequence(new_target, batch_first=True, padding_value=word_map['<pad>'])
print("Setelah filtering, jumlah sample:", input_seqs_pad.shape[0])

# Re-save
torch.save({
    'input_seqs': input_seqs_pad,
    'target_seqs': target_seqs_pad,
    'word_map': word_map
}, 'dae_preprocessed_filtered.pt')

Setelah filtering, jumlah sample: 6480


# Train

## DataLoader

In [6]:
import torch
from torch.utils.data import Dataset, DataLoader

class DAEDataset(Dataset):
    def __init__(self, data_path):
        data = torch.load(data_path)
        self.input_seqs = data['input_seqs']
        self.target_seqs = data['target_seqs']
    def __len__(self):
        return self.input_seqs.size(0)
    def __getitem__(self, idx):
        return self.input_seqs[idx], self.target_seqs[idx]

batch_size = 32
train_dataset = DAEDataset('/kaggle/working/dae_preprocessed_filtered.pt')
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

## Model DAE LSTM 

In [7]:
class DAELSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim=300, hidden_dim=512, pad_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.encoder = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.decoder = nn.LSTM(embed_dim, hidden_dim*2, batch_first=True)
        self.fc = nn.Linear(hidden_dim*2, vocab_size)
    def forward(self, src, tgt):
        emb_src = self.embedding(src)
        _, (h, c) = self.encoder(emb_src)  # h, c: [2, B, 512]
        # Gabungkan arah forward dan backward (biLSTM) untuk jadi 1 state [1, B, 1024]
        h_cat = torch.cat([h[0], h[1]], dim=-1).unsqueeze(0)  # [1, B, 1024]
        c_cat = torch.cat([c[0], c[1]], dim=-1).unsqueeze(0)  # [1, B, 1024]
        emb_tgt = self.embedding(tgt[:, :-1])  # input target tanpa <end>
        dec_out, _ = self.decoder(emb_tgt, (h_cat, c_cat))
        out = self.fc(dec_out)
        return out

## Main Loop

In [8]:
import torch.optim as optim
import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
vocab_size = len(word_map)
pad_idx = word_map['<pad>']

model = DAELSTM(vocab_size=vocab_size, embed_dim=300, hidden_dim=512, pad_idx=pad_idx).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

n_epochs = 10
best_loss = float('inf')

for epoch in range(n_epochs):
    model.train()
    epoch_loss = 0
    for inputs, targets in train_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs, targets)  # (B, T, V)
        # Shift targets for loss: target[:, 1:] (tanpa <start>)
        outputs = outputs.reshape(-1, vocab_size)
        gold = targets[:, 1:].reshape(-1)
        loss = criterion(outputs, gold)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    avg_loss = epoch_loss / len(train_loader)
    print(f"Epoch {epoch+1} | Loss: {avg_loss:.4f}")

    # Save best
    if avg_loss < best_loss:
        best_loss = avg_loss
        torch.save(model.state_dict(), "dae_lstm_best.pt")
        print("Best model saved.")

print("Training selesai. Best loss:", best_loss)

Epoch 1 | Loss: 5.5120
Best model saved.
Epoch 2 | Loss: 4.6987
Best model saved.
Epoch 3 | Loss: 4.2932
Best model saved.
Epoch 4 | Loss: 3.9147
Best model saved.
Epoch 5 | Loss: 3.5086
Best model saved.
Epoch 6 | Loss: 3.0540
Best model saved.
Epoch 7 | Loss: 2.5720
Best model saved.
Epoch 8 | Loss: 2.0944
Best model saved.
Epoch 9 | Loss: 1.6568
Best model saved.
Epoch 10 | Loss: 1.2930
Best model saved.
Training selesai. Best loss: 1.293038960748118


## Load

In [9]:
# Load for inference
model = DAELSTM(vocab_size=vocab_size, embed_dim=300, hidden_dim=512, pad_idx=pad_idx).to(device)
model.load_state_dict(torch.load("/kaggle/working/dae_lstm_best.pt", map_location=device))
model.eval()

DAELSTM(
  (embedding): Embedding(8842, 300, padding_idx=0)
  (encoder): LSTM(300, 512, batch_first=True, bidirectional=True)
  (decoder): LSTM(300, 1024, batch_first=True)
  (fc): Linear(in_features=1024, out_features=8842, bias=True)
)

In [10]:
import torch

def decode_caption(model, input_seq, word_map, max_len=64, device='cpu'):
    model.eval()
    inv_word_map = {v: k for k, v in word_map.items()}
    with torch.no_grad():
        # (1, seq_len)
        inp = torch.tensor(input_seq, dtype=torch.long).unsqueeze(0).to(device)
        emb_src = model.embedding(inp)
        _, (h, c) = model.encoder(emb_src)
        h_cat = torch.cat([h[0], h[1]], dim=-1).unsqueeze(0)
        c_cat = torch.cat([c[0], c[1]], dim=-1).unsqueeze(0)

        # Start dengan <start>
        cur_token = torch.tensor([[word_map['<start>']]], dtype=torch.long).to(device)
        decoded = []
        for _ in range(max_len):
            emb_tgt = model.embedding(cur_token)
            out, (h_cat, c_cat) = model.decoder(emb_tgt, (h_cat, c_cat))
            logits = model.fc(out[:, -1])  # Ambil token terakhir
            next_token = logits.argmax(-1)
            word = inv_word_map[next_token.item()]
            if word == '<end>':
                break
            decoded.append(word)
            cur_token = next_token.unsqueeze(0)
    return ' '.join(decoded)

In [11]:
# Load model (pastikan path dan param benar)
model = DAELSTM(vocab_size=len(word_map), embed_dim=300, hidden_dim=512, pad_idx=word_map['<pad>']).to(device)
model.load_state_dict(torch.load("/kaggle/working/dae_lstm_best.pt", map_location=device))
model.eval()

# Inference untuk 5 sample random
import random
dae_data = torch.load('/kaggle/working/dae_preprocessed_filtered.pt')
input_seqs = dae_data['input_seqs']
word_map = dae_data['word_map']

samples = random.sample(range(len(input_seqs)), 5)
for i in samples:
    inp = input_seqs[i].cpu().numpy()
    # Hapus padding 0
    inp = inp[inp != word_map['<pad>']]
    decoded_caption = decode_caption(model, inp, word_map, max_len=64, device=device)
    print(f"\nSample #{i}")
    print(f"Decoded DAE caption: {decoded_caption}")


Sample #2648
Decoded DAE caption: i love the idea and the lighting looks just a little too close to the right <unk>

Sample #454
Decoded DAE caption: i just love the crops that you do like <unk> as they have been a fun image

Sample #1209
Decoded DAE caption: i love the colors and the <unk> texture is all around great job

Sample #5041
Decoded DAE caption: i just love the idea and effort for this image to work for me

Sample #3396
Decoded DAE caption: i just love the crops that you do like <unk> as they have been a <unk> <unk>


In [12]:
import json

results = []
for i in range(len(input_seqs)):
    inp = input_seqs[i].cpu().numpy()
    inp = inp[inp != word_map['<pad>']]
    caption = decode_caption(model, inp, word_map, max_len=64, device=device)
    results.append({'id': int(i), 'dae_caption': caption})

with open('dae_lstm_outputs.json', 'w', encoding='utf8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)