# Train Fine Tune

## Fungsi Training, Evaluasi, dan Save Checkpoint

In [1]:
import torch
import torch.nn as nn
import torchvision.models as models
import torch.optim as optim
from torchvision import transforms
from torch.utils.data import DataLoader
import os
import json
from tqdm import tqdm

# --- Encoder ---
class EncoderCNN(nn.Module):
    def __init__(self, encoded_image_size=14):
        super(EncoderCNN, self).__init__()
        self.enc_image_size = encoded_image_size
        self.encoder_dim = 2048

        resnet = models.resnet101(weights='DEFAULT')
        modules = list(resnet.children())[:-2]
        self.resnet = nn.Sequential(*modules)
        self.adaptive_pool = nn.AdaptiveAvgPool2d((encoded_image_size, encoded_image_size))
        self.fine_tune()

    def forward(self, images):
        out = self.resnet(images)
        out = self.adaptive_pool(out)
        out = out.permute(0, 2, 3, 1)  # (B, 14, 14, 2048)
        out = out.view(out.size(0), -1, out.size(-1))  # (B, num_pixels, 2048)
        return out

    def fine_tune(self, fine_tune=True):
        for p in self.resnet.parameters():
            p.requires_grad = False
        for c in list(self.resnet.children())[5:]:
            for p in c.parameters():
                p.requires_grad = fine_tune

# --- Decoder ---
class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, encoder_dim=2048, dropout=0.5,
                 pretrained_embeddings=None, freeze_embeddings=False):
        super(DecoderRNN, self).__init__()
        self.encoder_dim = encoder_dim
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(vocab_size, embed_size)
        if pretrained_embeddings is not None:
            self.embedding.weight = nn.Parameter(pretrained_embeddings)
            self.embedding.weight.requires_grad = not freeze_embeddings
        else:
            self.embedding.weight.data.uniform_(-0.1, 0.1)

        self.dropout = nn.Dropout(p=dropout)
        self.init_h = nn.Linear(encoder_dim, hidden_size)
        self.init_c = nn.Linear(encoder_dim, hidden_size)
        self.lstm = nn.LSTMCell(embed_size + encoder_dim, hidden_size)
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.init_weights()

    def init_weights(self):
        self.fc.bias.data.fill_(0)
        self.fc.weight.data.uniform_(-0.1, 0.1)

    def init_hidden_state(self, encoder_out):
        mean_encoder_out = encoder_out.mean(dim=1)
        h = self.init_h(mean_encoder_out)
        c = self.init_c(mean_encoder_out)
        return h, c

    def forward(self, encoder_out, captions, caplens):
        batch_size = encoder_out.size(0)
        encoder_out = encoder_out.view(batch_size, -1, self.encoder_dim)
        caplens, sort_ind = caplens.squeeze(1).sort(dim=0, descending=True)
        encoder_out = encoder_out[sort_ind]
        captions = captions[sort_ind]
        embeddings = self.embedding(captions)

        h, c = self.init_hidden_state(encoder_out)
        decode_lengths = (caplens - 1).tolist()
        predictions = torch.zeros(batch_size, max(decode_lengths), self.vocab_size).to(encoder_out.device)

        for t in range(max(decode_lengths)):
            batch_size_t = sum([l > t for l in decode_lengths])
            awe = encoder_out[:batch_size_t].mean(dim=1)
            input_lstm = torch.cat([embeddings[:batch_size_t, t, :], awe], dim=1)
            h, c = self.lstm(input_lstm, (h[:batch_size_t], c[:batch_size_t]))
            preds = self.fc(self.dropout(h))
            predictions[:batch_size_t, t, :] = preds

        return predictions, captions, decode_lengths, sort_ind

In [2]:
import os
import json
import torch
from torch.utils.data import Dataset
import h5py

class CaptionDataset(Dataset):
    def __init__(self, data_folder, data_name, split, transform=None):
        self.split = split
        self.transform = transform

        self.h = h5py.File(os.path.join(data_folder, f"{split}_images_{data_name}.hdf5"), 'r')
        self.imgs = self.h['images']
        self.cpi = self.h.attrs['captions_per_image']

        with open(os.path.join(data_folder, f"{split}_captions_{data_name}.json"), 'r') as j:
            self.captions = json.load(j)
        with open(os.path.join(data_folder, f"{split}_caplength_{data_name}.json"), 'r') as j:
            self.caplens = json.load(j)

        self.dataset_size = len(self.captions)

    def __getitem__(self, i):
        img = torch.FloatTensor(self.imgs[i // self.cpi] / 255.)
        if self.transform is not None:
            img = self.transform(img)

        caption = torch.LongTensor(self.captions[i])
        caplen = torch.LongTensor([self.caplens[i]])

        if self.split == 'train':
            return img, caption, caplen
        else:
            all_captions = torch.LongTensor(
                self.captions[((i // self.cpi) * self.cpi):(((i // self.cpi) * self.cpi) + self.cpi)]
            )
            return img, caption, caplen, all_captions

    def __len__(self):
        return self.dataset_size

## Loop

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import os, json
from torchvision import transforms
from tqdm import tqdm

# --- Konfigurasi ---
aspects = ["color_light", "composition", "dof_and_focus"]
data_folder = "/kaggle/input/food-iac-fine-tune-dataset/preprocessed_dataset"
word_map_path = os.path.join(data_folder, "wordmap_all.json")
glove_path = "/kaggle/input/glove6b300dtxt/glove.6B.300d.txt"
pretrained_checkpoint = "/kaggle/input/pretrained-coco-food-cnn-lstm/pytorch/default/1/pretrain_coco_food_epoch4.pth"
output_dir = "/kaggle/working/fine-tuned-models"
os.makedirs(output_dir, exist_ok=True)

# --- Hyperparameters ---
with open(word_map_path, "r") as j:
    word_map = json.load(j)

vocab_size = len(word_map)
embed_size = 300
hidden_size = 512
batch_size = 64
num_epochs = 20
patience = 3
learning_rate = 1e-4
grad_clip = 5.0
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Transform ---
transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# --- GloVe Embedding Loader ---
def load_glove_embeddings(glove_path, word_map, embedding_dim=300):
    import numpy as np
    embeddings_index = {}
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.strip().split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = vector
    embedding_matrix = np.random.uniform(-0.1, 0.1, (len(word_map), embedding_dim)).astype(np.float32)
    for word, idx in word_map.items():
        if word in embeddings_index:
            embedding_matrix[idx] = embeddings_index[word]
    print("✅ GloVe loaded. Coverage:", sum([1 for w in word_map if w in embeddings_index]))
    return torch.tensor(embedding_matrix)

embedding_matrix = load_glove_embeddings(glove_path, word_map, embed_size)

# --- Load from pretrained checkpoint ---
def load_pretrained_model(embed_matrix, ckpt_path):
    encoder = EncoderCNN().to(device)
    decoder = DecoderRNN(
        embed_size=embed_size,
        hidden_size=hidden_size,
        vocab_size=vocab_size,
        pretrained_embeddings=embed_matrix
    ).to(device)

    checkpoint = torch.load(ckpt_path, map_location=device)

    # Load encoder state dict normally (semua layer seharusnya match)
    encoder.load_state_dict(checkpoint["encoder"])

    # Load decoder state dict *selectively*
    decoder_state = checkpoint["decoder"]
    model_state = decoder.state_dict()

    # Filter matching keys
    filtered_state = {k: v for k, v in decoder_state.items() if k in model_state and v.size() == model_state[k].size()}
    model_state.update(filtered_state)
    decoder.load_state_dict(model_state)

    print(f"✅ Partially loaded decoder from {ckpt_path} ({len(filtered_state)} layers matched)")
    return encoder, decoder

# --- Training Utilities ---
def train_epoch(loader, encoder, decoder, criterion, enc_opt, dec_opt, device):
    encoder.train(); decoder.train(); total_loss = 0
    for imgs, caps, caplens in tqdm(loader, desc="🔥 Training", leave=False):
        imgs, caps, caplens = imgs.to(device), caps.to(device), caplens.to(device)
        enc_out = encoder(imgs)
        scores, caps_sorted, decode_lengths, _ = decoder(enc_out, caps, caplens)
        targets = caps_sorted[:, 1:]
        scores_packed = nn.utils.rnn.pack_padded_sequence(scores, decode_lengths, batch_first=True).data
        targets_packed = nn.utils.rnn.pack_padded_sequence(targets, decode_lengths, batch_first=True).data
        loss = criterion(scores_packed, targets_packed)
        dec_opt.zero_grad(); enc_opt.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(decoder.parameters(), grad_clip)
        torch.nn.utils.clip_grad_norm_(encoder.parameters(), grad_clip)
        dec_opt.step(); enc_opt.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(loader, encoder, decoder, criterion, device):
    encoder.eval(); decoder.eval(); total_loss = 0
    with torch.no_grad():
        for imgs, caps, caplens, _ in tqdm(loader, desc="🔍 Evaluating", leave=False):
            imgs, caps, caplens = imgs.to(device), caps.to(device), caplens.to(device)
            enc_out = encoder(imgs)
            scores, caps_sorted, decode_lengths, _ = decoder(enc_out, caps, caplens)
            targets = caps_sorted[:, 1:]
            scores_packed = nn.utils.rnn.pack_padded_sequence(scores, decode_lengths, batch_first=True).data
            targets_packed = nn.utils.rnn.pack_padded_sequence(targets, decode_lengths, batch_first=True).data
            loss = criterion(scores_packed, targets_packed)
            total_loss += loss.item()
    return total_loss / len(loader)

# --- Save checkpoint ---
def save_checkpoint(aspect, epoch, encoder, decoder, enc_opt, dec_opt, val_loss, previous_ckpt_path=None):
    ckpt_path = os.path.join(output_dir, f"{aspect}_best.pth")
    
    # Hapus checkpoint lama jika ada
    if previous_ckpt_path and os.path.exists(previous_ckpt_path):
        os.remove(previous_ckpt_path)
        print(f"🧹 Removed previous checkpoint: {previous_ckpt_path}")

    # Simpan checkpoint baru
    state = {
        "epoch": epoch,
        "val_loss": val_loss,
        "encoder": encoder.state_dict(),
        "decoder": decoder.state_dict(),
        "encoder_optimizer": enc_opt.state_dict(),
        "decoder_optimizer": dec_opt.state_dict()
    }
    torch.save(state, ckpt_path)
    print(f"💾 Saved: {ckpt_path}")
    return ckpt_path

✅ GloVe loaded. Coverage: 8577


In [4]:
for aspect in aspects:
    print(f"\n📌 Fine-tuning for aspect: {aspect.upper()}")
    train_loader = DataLoader(
        CaptionDataset(data_folder, aspect, split='train', transform=transform),
        batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True
    )
    val_loader = DataLoader(
        CaptionDataset(data_folder, aspect, split='val', transform=transform),
        batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True
    )

    encoder, decoder = load_pretrained_model(embedding_matrix, pretrained_checkpoint)
    criterion = nn.CrossEntropyLoss().to(device)
    enc_opt = optim.Adam(filter(lambda p: p.requires_grad, encoder.parameters()), lr=learning_rate)
    dec_opt = optim.Adam(decoder.parameters(), lr=learning_rate)

    best_val = float('inf')
    no_improve = 0
    best_ckpt_path = None
    
    for epoch in range(1, num_epochs + 1):
        print(f"\n🔁 Epoch {epoch}/{num_epochs}")
        train_loss = train_epoch(train_loader, encoder, decoder, criterion, enc_opt, dec_opt, device)
        val_loss = evaluate(val_loader, encoder, decoder, criterion, device)
        print(f"✅ Train: {train_loss:.4f} | 🔍 Val: {val_loss:.4f}")
    
        if val_loss < best_val:
            best_val = val_loss
            best_ckpt_path = save_checkpoint(aspect, epoch, encoder, decoder, enc_opt, dec_opt, val_loss, best_ckpt_path)
            no_improve = 0
        else:
            no_improve += 1
            print(f"⚠️ No improvement ({no_improve}/{patience})")
            if no_improve >= patience:
                print("🛑 Early stopping triggered.")
                break



📌 Fine-tuning for aspect: COLOR_LIGHT


Downloading: "https://download.pytorch.org/models/resnet101-cd907fc2.pth" to /root/.cache/torch/hub/checkpoints/resnet101-cd907fc2.pth
100%|██████████| 171M/171M [00:00<00:00, 210MB/s]


✅ Partially loaded decoder from /kaggle/input/pretrained-coco-food-cnn-lstm/pytorch/default/1/pretrain_coco_food_epoch4.pth (8 layers matched)

🔁 Epoch 1/20


                                                              

✅ Train: 6.8168 | 🔍 Val: 5.6112
💾 Saved: /kaggle/working/fine-tuned-models/color_light_best.pth

🔁 Epoch 2/20


                                                              

✅ Train: 5.6089 | 🔍 Val: 5.3453
🧹 Removed previous checkpoint: /kaggle/working/fine-tuned-models/color_light_best.pth
💾 Saved: /kaggle/working/fine-tuned-models/color_light_best.pth

🔁 Epoch 3/20


                                                              

✅ Train: 5.2996 | 🔍 Val: 5.1807
🧹 Removed previous checkpoint: /kaggle/working/fine-tuned-models/color_light_best.pth
💾 Saved: /kaggle/working/fine-tuned-models/color_light_best.pth

🔁 Epoch 4/20


                                                              

✅ Train: 5.0537 | 🔍 Val: 5.0743
🧹 Removed previous checkpoint: /kaggle/working/fine-tuned-models/color_light_best.pth
💾 Saved: /kaggle/working/fine-tuned-models/color_light_best.pth

🔁 Epoch 5/20


                                                              

✅ Train: 4.8385 | 🔍 Val: 5.0010
🧹 Removed previous checkpoint: /kaggle/working/fine-tuned-models/color_light_best.pth
💾 Saved: /kaggle/working/fine-tuned-models/color_light_best.pth

🔁 Epoch 6/20


                                                              

✅ Train: 4.6439 | 🔍 Val: 4.9546
🧹 Removed previous checkpoint: /kaggle/working/fine-tuned-models/color_light_best.pth
💾 Saved: /kaggle/working/fine-tuned-models/color_light_best.pth

🔁 Epoch 7/20


                                                              

✅ Train: 4.4705 | 🔍 Val: 4.9285
🧹 Removed previous checkpoint: /kaggle/working/fine-tuned-models/color_light_best.pth
💾 Saved: /kaggle/working/fine-tuned-models/color_light_best.pth

🔁 Epoch 8/20


                                                              

✅ Train: 4.3094 | 🔍 Val: 4.9126
🧹 Removed previous checkpoint: /kaggle/working/fine-tuned-models/color_light_best.pth
💾 Saved: /kaggle/working/fine-tuned-models/color_light_best.pth

🔁 Epoch 9/20


                                                              

✅ Train: 4.1617 | 🔍 Val: 4.9106
🧹 Removed previous checkpoint: /kaggle/working/fine-tuned-models/color_light_best.pth
💾 Saved: /kaggle/working/fine-tuned-models/color_light_best.pth

🔁 Epoch 10/20


                                                              

✅ Train: 4.0185 | 🔍 Val: 4.8960
🧹 Removed previous checkpoint: /kaggle/working/fine-tuned-models/color_light_best.pth
💾 Saved: /kaggle/working/fine-tuned-models/color_light_best.pth

🔁 Epoch 11/20


                                                              

✅ Train: 3.8880 | 🔍 Val: 4.9094
⚠️ No improvement (1/3)

🔁 Epoch 12/20


                                                              

✅ Train: 3.7641 | 🔍 Val: 4.9449
⚠️ No improvement (2/3)

🔁 Epoch 13/20


                                                              

✅ Train: 3.6455 | 🔍 Val: 4.9348
⚠️ No improvement (3/3)
🛑 Early stopping triggered.

📌 Fine-tuning for aspect: COMPOSITION
✅ Partially loaded decoder from /kaggle/input/pretrained-coco-food-cnn-lstm/pytorch/default/1/pretrain_coco_food_epoch4.pth (8 layers matched)

🔁 Epoch 1/20


                                                              

✅ Train: 6.8546 | 🔍 Val: 5.6831
💾 Saved: /kaggle/working/fine-tuned-models/composition_best.pth

🔁 Epoch 2/20


                                                              

✅ Train: 5.6914 | 🔍 Val: 5.4317
🧹 Removed previous checkpoint: /kaggle/working/fine-tuned-models/composition_best.pth
💾 Saved: /kaggle/working/fine-tuned-models/composition_best.pth

🔁 Epoch 3/20


                                                              

✅ Train: 5.3943 | 🔍 Val: 5.2669
🧹 Removed previous checkpoint: /kaggle/working/fine-tuned-models/composition_best.pth
💾 Saved: /kaggle/working/fine-tuned-models/composition_best.pth

🔁 Epoch 4/20


                                                              

✅ Train: 5.1426 | 🔍 Val: 5.1543
🧹 Removed previous checkpoint: /kaggle/working/fine-tuned-models/composition_best.pth
💾 Saved: /kaggle/working/fine-tuned-models/composition_best.pth

🔁 Epoch 5/20


                                                              

✅ Train: 4.9381 | 🔍 Val: 5.0746
🧹 Removed previous checkpoint: /kaggle/working/fine-tuned-models/composition_best.pth
💾 Saved: /kaggle/working/fine-tuned-models/composition_best.pth

🔁 Epoch 6/20


                                                              

✅ Train: 4.7487 | 🔍 Val: 5.0236
🧹 Removed previous checkpoint: /kaggle/working/fine-tuned-models/composition_best.pth
💾 Saved: /kaggle/working/fine-tuned-models/composition_best.pth

🔁 Epoch 7/20


                                                              

✅ Train: 4.5776 | 🔍 Val: 4.9948
🧹 Removed previous checkpoint: /kaggle/working/fine-tuned-models/composition_best.pth
💾 Saved: /kaggle/working/fine-tuned-models/composition_best.pth

🔁 Epoch 8/20


                                                              

✅ Train: 4.4153 | 🔍 Val: 4.9767
🧹 Removed previous checkpoint: /kaggle/working/fine-tuned-models/composition_best.pth
💾 Saved: /kaggle/working/fine-tuned-models/composition_best.pth

🔁 Epoch 9/20


                                                              

✅ Train: 4.2732 | 🔍 Val: 4.9786
⚠️ No improvement (1/3)

🔁 Epoch 10/20


                                                              

✅ Train: 4.1428 | 🔍 Val: 4.9610
🧹 Removed previous checkpoint: /kaggle/working/fine-tuned-models/composition_best.pth
💾 Saved: /kaggle/working/fine-tuned-models/composition_best.pth

🔁 Epoch 11/20


                                                              

✅ Train: 4.0109 | 🔍 Val: 4.9666
⚠️ No improvement (1/3)

🔁 Epoch 12/20


                                                              

✅ Train: 3.8974 | 🔍 Val: 4.9970
⚠️ No improvement (2/3)

🔁 Epoch 13/20


                                                              

✅ Train: 3.7866 | 🔍 Val: 5.0021
⚠️ No improvement (3/3)
🛑 Early stopping triggered.

📌 Fine-tuning for aspect: DOF_AND_FOCUS
✅ Partially loaded decoder from /kaggle/input/pretrained-coco-food-cnn-lstm/pytorch/default/1/pretrain_coco_food_epoch4.pth (8 layers matched)

🔁 Epoch 1/20


                                                              

✅ Train: 6.9238 | 🔍 Val: 5.7556
💾 Saved: /kaggle/working/fine-tuned-models/dof_and_focus_best.pth

🔁 Epoch 2/20


                                                              

✅ Train: 5.7347 | 🔍 Val: 5.5056
🧹 Removed previous checkpoint: /kaggle/working/fine-tuned-models/dof_and_focus_best.pth
💾 Saved: /kaggle/working/fine-tuned-models/dof_and_focus_best.pth

🔁 Epoch 3/20


                                                              

✅ Train: 5.4134 | 🔍 Val: 5.3362
🧹 Removed previous checkpoint: /kaggle/working/fine-tuned-models/dof_and_focus_best.pth
💾 Saved: /kaggle/working/fine-tuned-models/dof_and_focus_best.pth

🔁 Epoch 4/20


                                                              

✅ Train: 5.1458 | 🔍 Val: 5.2221
🧹 Removed previous checkpoint: /kaggle/working/fine-tuned-models/dof_and_focus_best.pth
💾 Saved: /kaggle/working/fine-tuned-models/dof_and_focus_best.pth

🔁 Epoch 5/20


                                                              

✅ Train: 4.9141 | 🔍 Val: 5.1502
🧹 Removed previous checkpoint: /kaggle/working/fine-tuned-models/dof_and_focus_best.pth
💾 Saved: /kaggle/working/fine-tuned-models/dof_and_focus_best.pth

🔁 Epoch 6/20


                                                              

✅ Train: 4.7087 | 🔍 Val: 5.1076
🧹 Removed previous checkpoint: /kaggle/working/fine-tuned-models/dof_and_focus_best.pth
💾 Saved: /kaggle/working/fine-tuned-models/dof_and_focus_best.pth

🔁 Epoch 7/20


                                                              

✅ Train: 4.5236 | 🔍 Val: 5.0821
🧹 Removed previous checkpoint: /kaggle/working/fine-tuned-models/dof_and_focus_best.pth
💾 Saved: /kaggle/working/fine-tuned-models/dof_and_focus_best.pth

🔁 Epoch 8/20


                                                              

✅ Train: 4.3562 | 🔍 Val: 5.0674
🧹 Removed previous checkpoint: /kaggle/working/fine-tuned-models/dof_and_focus_best.pth
💾 Saved: /kaggle/working/fine-tuned-models/dof_and_focus_best.pth

🔁 Epoch 9/20


                                                              

✅ Train: 4.1991 | 🔍 Val: 5.0716
⚠️ No improvement (1/3)

🔁 Epoch 10/20


                                                              

✅ Train: 4.0545 | 🔍 Val: 5.0669
🧹 Removed previous checkpoint: /kaggle/working/fine-tuned-models/dof_and_focus_best.pth
💾 Saved: /kaggle/working/fine-tuned-models/dof_and_focus_best.pth

🔁 Epoch 11/20


                                                              

✅ Train: 3.9206 | 🔍 Val: 5.0856
⚠️ No improvement (1/3)

🔁 Epoch 12/20


                                                              

✅ Train: 3.7902 | 🔍 Val: 5.0992
⚠️ No improvement (2/3)

🔁 Epoch 13/20


                                                              

✅ Train: 3.6664 | 🔍 Val: 5.1000
⚠️ No improvement (3/3)
🛑 Early stopping triggered.




# Eval

In [5]:
# from nltk.translate.bleu_score import corpus_bleu
# import torch.nn.functional as F

# fine_tune_ckpt = "/kaggle/input/single-aspects/pytorch/default/2/fine-tuned-models"
# aspects = ["color_light", "composition", "dof_and_focus"]

# def caption_image_beam_search(encoder, decoder, image, word_map, beam_size=5, max_len=25):
#     k = beam_size
#     vocab_size = len(word_map)
#     rev_word_map = {v: k for k, v in word_map.items()}

#     encoder_out = encoder(image.unsqueeze(0))  # (1, num_pixels, encoder_dim)
#     encoder_dim = encoder_out.size(-1)
#     encoder_out = encoder_out.expand(k, -1, encoder_dim)

#     k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(device)
#     seqs = k_prev_words
#     top_k_scores = torch.zeros(k, 1).to(device)

#     complete_seqs = []
#     complete_seqs_scores = []

#     h, c = decoder.init_hidden_state(encoder_out)

#     step = 1
#     while True:
#         embeddings = decoder.embedding(k_prev_words).squeeze(1)
#         awe = encoder_out.mean(dim=1)  # mean attention
#         input_lstm = torch.cat([embeddings, awe], dim=1)
#         h, c = decoder.lstm(input_lstm, (h, c))
#         scores = decoder.fc(h)
#         scores = F.log_softmax(scores, dim=1)

#         scores = top_k_scores.expand_as(scores) + scores
#         if step == 1:
#             top_k_scores, top_k_words = scores[0].topk(k, 0, True, True)
#         else:
#             top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True)

#         prev_word_inds = top_k_words // vocab_size
#         next_word_inds = top_k_words % vocab_size

#         seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1)

#         incomplete_inds = [ind for ind, next_word in enumerate(next_word_inds) if next_word != word_map['<end>']]
#         complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds))

#         if len(complete_inds) > 0:
#             complete_seqs.extend(seqs[complete_inds].tolist())
#             complete_seqs_scores.extend(top_k_scores[complete_inds])

#         k -= len(complete_inds)
#         if k == 0 or step > max_len:
#             break

#         seqs = seqs[incomplete_inds]
#         h = h[prev_word_inds[incomplete_inds]]
#         c = c[prev_word_inds[incomplete_inds]]
#         encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
#         top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
#         k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

#         step += 1

#     if len(complete_seqs) == 0:
#     # fallback to the current best in seqs
#         best_seq = seqs[0].tolist()
#     else:
#         i = complete_seqs_scores.index(max(complete_seqs_scores))
#         best_seq = complete_seqs[i]

#     decoded = [rev_word_map[idx] for idx in best_seq if idx not in {word_map['<start>'], word_map['<pad>'], word_map['<end>']}]
#     return decoded

# # --- Evaluasi Semua Aspek ---
# bleu_scores = {}

# # Load word_map & reverse-nya
# with open(word_map_path, "r") as f:
#     word_map = json.load(f)
# rev_word_map = {v: k for k, v in word_map.items()}

# from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

# results = []
# chencherry = SmoothingFunction()

# for aspect in aspects:
#     print(f"\n📊 Evaluating aspect: {aspect.upper()}")
#     val_loader = DataLoader(
#         CaptionDataset(data_folder, aspect, split='val', transform=transform),
#         batch_size=1, shuffle=False
#     )

#     encoder = EncoderCNN().to(device)
#     decoder = DecoderRNN(embed_size, hidden_size, vocab_size).to(device)

#     ckpt = torch.load(os.path.join(fine_tune_ckpt, f"{aspect}_best.pth"), map_location=device)
#     encoder.load_state_dict(ckpt["encoder"])
#     decoder.load_state_dict(ckpt["decoder"])
#     encoder.eval(); decoder.eval()

#     references, hypotheses = [], []
#     total_unk = 0
#     total_words = 0

#     for img, _, _, all_caps in tqdm(val_loader, desc=f"📝 Beam Decode ({aspect})"):
#         img = img.squeeze(0).to(device)
#         gen_caption = caption_image_beam_search(encoder, decoder, img, word_map, beam_size=5, max_len=25)
#         hypotheses.append(gen_caption)
#         total_words += len(gen_caption)
#         total_unk += gen_caption.count('<unk>')

#         caps = all_caps[0].tolist()
#         refs = [[rev_word_map[i] for i in cap if i not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}] for cap in caps]
#         references.append(refs)

#     results.append({
#         "Aspect": aspect,
#         "BLEU-1": corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0), smoothing_function=chencherry.method1),
#         "BLEU-2": corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0, 0), smoothing_function=chencherry.method1),
#         "BLEU-3": corpus_bleu(references, hypotheses, weights=(0.33, 0.33, 0.33, 0), smoothing_function=chencherry.method1),
#         "BLEU-4": corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=chencherry.method1),
#         "UNK Ratio": total_unk / total_words if total_words > 0 else 0
#     })

# import pandas as pd
# df_results = pd.DataFrame(results)
# print("\n📈 BLEU-n & UNK Summary")
# print(df_results)

# Infer Test

In [6]:
# import random
# from torchvision.utils import make_grid
# import matplotlib.pyplot as plt

# def show_random_predictions(aspects, k=10):
#     for aspect in aspects:
#         print(f"\n🎨 Aspect: {aspect.upper()}")
#         dataset = CaptionDataset(data_folder, aspect, split='test', transform=transform)
#         indices = random.sample(range(len(dataset)), k)
#         subset = [dataset[i] for i in indices]

#         encoder = EncoderCNN().to(device)
#         decoder = DecoderRNN(embed_size, hidden_size, vocab_size).to(device)
#         ckpt = torch.load(os.path.join(fine_tune_ckpt, f"{aspect}_best.pth"), map_location=device)
#         encoder.load_state_dict(ckpt["encoder"])
#         decoder.load_state_dict(ckpt["decoder"])
#         encoder.eval(); decoder.eval()

#         for i, (img, _, _, all_caps) in enumerate(subset):
#             img_tensor = img.to(device)
#             gen_caption = caption_image_beam_search(encoder, decoder, img_tensor, word_map, beam_size=5, max_len=25)
#             caps = all_caps.squeeze(0).tolist()  # shape: (5, max_len) -> list of list
#             # references = [[rev_word_map[idx] for idx in cap if idx not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}] for cap in caps]
#             references = [
#                 [rev_word_map[idx] for idx in cap if idx not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}]
#                 for cap in caps
#             ]

#             print(f"\n📸 Sample {i+1}")
#             print("🔹 Predicted :", " ".join(gen_caption))
#             print("🔸 References:")
#             for r in references[:3]:
#                 print("  -", " ".join(r))

In [7]:
# show_random_predictions(["color_light", "composition", "dof_and_focus"])

In [8]:
# def predict_image_caption(image_path, aspect, ckpt_dir, word_map_path, embed_matrix, beam_size=5, max_len=25):
#     """
#     Generate a caption for a given image and aesthetic aspect.
#     """
#     with open(word_map_path, "r") as f:
#         word_map = json.load(f)

#     vocab_size = len(word_map)

#     encoder = EncoderCNN().to(device)
#     decoder = DecoderRNN(
#         embed_size=embed_matrix.shape[1],
#         hidden_size=512,
#         vocab_size=vocab_size,
#         pretrained_embeddings=embed_matrix,
#         freeze_embeddings=True
#     ).to(device)

#     ckpt = torch.load(os.path.join(ckpt_dir, f"{aspect}_best.pth"), map_location=device)
#     encoder.load_state_dict(ckpt["encoder"])
#     decoder.load_state_dict(ckpt["decoder"])
#     encoder.eval()
#     decoder.eval()

#     transform = transforms.Compose([
#         transforms.Resize((256, 256)),
#         transforms.ToTensor(),
#         transforms.Normalize(mean=[0.485, 0.456, 0.406],
#                              std=[0.229, 0.224, 0.225])
#     ])
#     image = Image.open(image_path).convert("RGB")
#     image_tensor = transform(image).to(device)

#     caption_ids = caption_image_beam_search(encoder, decoder, image_tensor, word_map, beam_size, max_len)
#     caption_text = " ".join(caption_ids)
#     print(f"📷 {aspect.upper()} - Caption:")
#     print(f"📝 {caption_text}")
#     return caption_text

In [9]:
# predict_image_caption(
#     image_path="/kaggle/input/dpchallenge-images-food-gallery/images/1000368.jpg",  # atau path gambar lokal
#     aspect="color_light",
#     ckpt_dir="/kaggle/input/single-aspects/pytorch/default/2/fine-tuned-models",
#     word_map_path="/kaggle/input/food-iac-fine-tune-dataset/preprocessed_dataset/wordmap_all.json",
#     embed_matrix=embedding_matrix,  # variabel yang kamu sudah punya sebelumnya
#     beam_size=5,
#     max_len=25
# )

In [10]:
# def load_trained_model(aspect, ckpt_dir, word_map_path, embed_matrix):
#     with open(word_map_path, 'r') as j:
#         word_map = json.load(j)
#     vocab_size = len(word_map)

#     encoder = EncoderCNN().to(device)
#     decoder = DecoderRNN(
#         embed_size=embed_size,
#         hidden_size=hidden_size,
#         vocab_size=vocab_size,
#         pretrained_embeddings=embed_matrix,
#         freeze_embeddings=True
#     ).to(device)

#     ckpt = torch.load(os.path.join(ckpt_dir, f"{aspect}_best.pth"), map_location=device)
#     encoder.load_state_dict(ckpt["encoder"])
#     decoder.load_state_dict(ckpt["decoder"])

#     return encoder, decoder, word_map

In [11]:
# def caption_image_beam_search(encoder, decoder, image, word_map, beam_size=5, max_len=25):
#     k = beam_size
#     vocab_size = len(word_map)
#     rev_word_map = {v: k for k, v in word_map.items()}

#     encoder_out = encoder(image.unsqueeze(0))
#     encoder_dim = encoder_out.size(-1)
#     encoder_out = encoder_out.expand(k, -1, encoder_dim)

#     k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(device)
#     seqs = k_prev_words
#     top_k_scores = torch.zeros(k, 1).to(device)

#     complete_seqs = []
#     complete_seqs_scores = []

#     h, c = decoder.init_hidden_state(encoder_out)
#     step = 1
#     while True:
#         embeddings = decoder.embedding(k_prev_words).squeeze(1)
#         awe = encoder_out.mean(dim=1)
#         input_lstm = torch.cat([embeddings, awe], dim=1)
#         h, c = decoder.lstm(input_lstm, (h, c))
#         scores = decoder.fc(h)
#         scores = F.log_softmax(scores, dim=1)

#         scores = top_k_scores.expand_as(scores) + scores
#         if step == 1:
#             top_k_scores, top_k_words = scores[0].topk(k, 0, True, True)
#         else:
#             top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True)

#         prev_word_inds = top_k_words // vocab_size
#         next_word_inds = top_k_words % vocab_size
#         seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1)

#         incomplete_inds = [ind for ind, next_word in enumerate(next_word_inds) if next_word != word_map['<end>']]
#         complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds))

#         if len(complete_inds) > 0:
#             complete_seqs.extend(seqs[complete_inds].tolist())
#             complete_seqs_scores.extend(top_k_scores[complete_inds])

#         k -= len(complete_inds)
#         if k == 0 or step > max_len:
#             break

#         seqs = seqs[incomplete_inds]
#         h = h[prev_word_inds[incomplete_inds]]
#         c = c[prev_word_inds[incomplete_inds]]
#         encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
#         top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
#         k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

#         step += 1

#     if len(complete_seqs_scores) == 0:
#         return ["<unk>"]

#     best_seq = complete_seqs[complete_seqs_scores.index(max(complete_seqs_scores))]
#     return [rev_word_map[idx] for idx in best_seq if idx not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}]

In [12]:
# from PIL import Image
# import matplotlib.pyplot as plt

# def predict_image_caption(image_path, aspect, ckpt_dir, word_map_path, embed_matrix, beam_size=5, max_len=25):
#     transform = transforms.Compose([
#         transforms.Resize((256, 256)),
#         transforms.ToTensor(),
#         transforms.Normalize(mean=[0.485, 0.456, 0.406],
#                              std=[0.229, 0.224, 0.225])
#     ])

#     image = Image.open(image_path).convert("RGB")
#     image_tensor = transform(image).to(device)

#     encoder, decoder, word_map = load_trained_model(aspect, ckpt_dir, word_map_path, embed_matrix)
#     caption = caption_image_beam_search(encoder, decoder, image_tensor, word_map, beam_size, max_len)

#     print(f"📷 {os.path.basename(image_path)} — [{aspect.upper()}]")
#     print(f"📝 Caption: {' '.join(caption)}")

#     plt.imshow(image)
#     plt.axis('off')
#     plt.title(f"{aspect.upper()} — {' '.join(caption)}")
#     plt.show()

In [13]:
# import random

# def infer_visualize_random_images(aspect, dataset, ckpt_dir, word_map_path, embed_matrix,
#                                    test_split="test", beam_size=5, max_len=25, n_samples=10):
#     transform = transforms.Compose([
#         transforms.Resize((256, 256)),
#         transforms.ToTensor(),
#         transforms.Normalize(mean=[0.485, 0.456, 0.406],
#                              std=[0.229, 0.224, 0.225])
#     ])

#     test_images = [img['filename'] for img in dataset if img['split'] == test_split]
#     selected_images = random.sample(test_images, n_samples)

#     encoder, decoder, word_map = load_trained_model(aspect, ckpt_dir, word_map_path, embed_matrix)

#     for idx, filename in enumerate(selected_images):
#         image_path = os.path.join("/kaggle/input/dpchallenge-images-food-gallery/images", filename)
#         image = Image.open(image_path).convert("RGB")
#         image_tensor = transform(image).to(device)

#         caption = caption_image_beam_search(encoder, decoder, image_tensor, word_map, beam_size, max_len)
#         plt.figure(figsize=(6, 4))
#         plt.imshow(image)
#         plt.axis("off")
#         plt.title(f"[{aspect.upper()}] — {' '.join(caption)}", fontsize=10)
#         plt.show()

In [14]:
# import json

# with open("/kaggle/input/food-iac-fine-tune-dataset/final/all.json", "r") as f:
#     all_data = json.load(f)

# # Ambil list gambar
# image_list = all_data["images"]

# infer_visualize_random_images(
#     aspect="color_light",
#     dataset=image_list,
#     ckpt_dir="/kaggle/input/single-aspects/pytorch/default/2/fine-tuned-models",
#     word_map_path="/kaggle/input/food-iac-fine-tune-dataset/preprocessed_dataset/wordmap_all.json",
#     embed_matrix=embedding_matrix
# )

In [15]:
# import json

# with open("/kaggle/input/food-iac-fine-tune-dataset/final/all.json", "r") as f:
#     all_data = json.load(f)

# # Ambil list gambar
# image_list = all_data["images"]

# infer_visualize_random_images(
#     aspect="composition",
#     dataset=image_list,
#     ckpt_dir="/kaggle/input/single-aspects/pytorch/default/2/fine-tuned-models",
#     word_map_path="/kaggle/input/food-iac-fine-tune-dataset/preprocessed_dataset/wordmap_all.json",
#     embed_matrix=embedding_matrix
# )

In [16]:
# import json

# with open("/kaggle/input/food-iac-fine-tune-dataset/final/all.json", "r") as f:
#     all_data = json.load(f)

# # Ambil list gambar
# image_list = all_data["images"]

# infer_visualize_random_images(
#     aspect="dof_and_focus",
#     dataset=image_list,
#     ckpt_dir="/kaggle/input/single-aspects/pytorch/default/2/fine-tuned-models",
#     word_map_path="/kaggle/input/food-iac-fine-tune-dataset/preprocessed_dataset/wordmap_all.json",
#     embed_matrix=embedding_matrix
# )