In [5]:
# Install pycocoevalcap and dependencies
!git clone https://github.com/salaniz/pycocoevalcap.git
!pip install git+https://github.com/salaniz/pycocoevalcap
!pip install nltk
!python -m nltk.downloader punkt

fatal: destination path 'pycocoevalcap' already exists and is not an empty directory.
Collecting git+https://github.com/salaniz/pycocoevalcap
  Cloning https://github.com/salaniz/pycocoevalcap to /tmp/pip-req-build-umeymzsa
  Running command git clone --filter=blob:none --quiet https://github.com/salaniz/pycocoevalcap /tmp/pip-req-build-umeymzsa
  Resolved https://github.com/salaniz/pycocoevalcap to commit a24f74c408c918f1f4ec34e9514bc8a76ce41ffd
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pycocoevalcap
  Building wheel for pycocoevalcap (setup.py) ... [?25l[?25hdone
  Created wheel for pycocoevalcap: filename=pycocoevalcap-1.2-py3-none-any.whl size=104312245 sha256=14df605407bf39014bf38718471d60509887806ae0ab71eb033a62a16584f2fa
  Stored in directory: /tmp/pip-ephem-wheel-cache-ly7zcn7c/wheels/d2/1f/44/6485e566f8ae3d42b56e7c05fd50a3bbb70a50b0e6e7c55212
Successfully built pycocoevalcap
Installing collected packages: pycocoevalcap
Succe

# CNN-LSTM

In [6]:
import torch
import torch.nn as nn
import torchvision.models as models
import torch.optim as optim
from torchvision import transforms
from torch.utils.data import DataLoader
import os
import json
from tqdm import tqdm

# --- Encoder ---
class EncoderCNN(nn.Module):
    """Image encoder built from a torchvision ResNet-101 with its last two children removed.

    The remaining network is followed by an adaptive average pool, and the pooled feature
    map is flattened into one feature vector per spatial position.

    Args:
        encoded_image_size: side length of the pooled feature map (default 14).
    """

    def __init__(self, encoded_image_size=14):
        super().__init__()
        self.enc_image_size = encoded_image_size
        self.encoder_dim = 2048

        backbone = models.resnet101(weights='DEFAULT')
        # Keep everything except the last two child modules of the pretrained network.
        kept_layers = list(backbone.children())[:-2]
        self.resnet = nn.Sequential(*kept_layers)

        pooled_grid = (encoded_image_size, encoded_image_size)
        self.adaptive_pool = nn.AdaptiveAvgPool2d(pooled_grid)
        self.fine_tune()

    def forward(self, images):
        """Encode a batch of images.

        Returns:
            Tensor of shape (batch, num_pixels, channels), where num_pixels is the number
            of positions in the pooled feature map.
        """
        features = self.adaptive_pool(self.resnet(images))
        # Channels-last, so each spatial position carries its own feature vector.
        features = features.permute(0, 2, 3, 1)
        n_images, n_channels = features.size(0), features.size(-1)
        return features.view(n_images, -1, n_channels)

    def fine_tune(self, fine_tune=True):
        """Freeze the whole backbone, then set `requires_grad` of every child from index 5 onward.

        Args:
            fine_tune: value assigned to `requires_grad` for the parameters of those later children.
        """
        for param in self.resnet.parameters():
            param.requires_grad = False

        later_children = list(self.resnet.children())[5:]
        for child in later_children:
            for param in child.parameters():
                param.requires_grad = fine_tune

# --- Decoder ---
class DecoderRNN(nn.Module):
    """LSTM caption decoder conditioned on the mean of the encoder features.

    At every time step the LSTM cell receives the embedding of the previous ground-truth
    token concatenated with the mean (over pixels) of the encoder output.

    Args:
        embed_size: dimensionality of the word embeddings.
        hidden_size: size of the LSTM hidden and cell state.
        vocab_size: number of entries in the vocabulary (output dimension).
        encoder_dim: feature dimension of the encoder output (default 2048).
        dropout: dropout probability applied to the hidden state before the output layer.
        pretrained_embeddings: optional tensor used as the embedding weight.
        freeze_embeddings: when pretrained embeddings are given, keep them fixed during training.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, encoder_dim=2048, dropout=0.5,
                 pretrained_embeddings=None, freeze_embeddings=False):
        super(DecoderRNN, self).__init__()
        self.encoder_dim = encoder_dim
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(vocab_size, embed_size)
        if pretrained_embeddings is not None:
            self.embedding.weight = nn.Parameter(pretrained_embeddings)
            self.embedding.weight.requires_grad = not freeze_embeddings
        else:
            self.embedding.weight.data.uniform_(-0.1, 0.1)

        self.dropout = nn.Dropout(p=dropout)
        self.init_h = nn.Linear(encoder_dim, hidden_size)
        self.init_c = nn.Linear(encoder_dim, hidden_size)
        self.lstm = nn.LSTMCell(embed_size + encoder_dim, hidden_size)
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.init_weights()

    def init_weights(self):
        """Zero the output bias and draw the output weights uniformly from [-0.1, 0.1]."""
        self.fc.bias.data.fill_(0)
        self.fc.weight.data.uniform_(-0.1, 0.1)

    def init_hidden_state(self, encoder_out):
        """Build the initial (h, c) from the mean of the encoder output over dim 1."""
        mean_encoder_out = encoder_out.mean(dim=1)
        h = self.init_h(mean_encoder_out)
        c = self.init_c(mean_encoder_out)
        return h, c

    def forward(self, encoder_out, captions, caplens):
        """Run teacher-forced decoding over a batch.

        Args:
            encoder_out: encoder features, reshaped here to (batch, num_pixels, encoder_dim).
            captions: token ids, (batch, max_caption_length).
            caplens: caption lengths, either (batch, 1) or (batch,).

        Returns:
            predictions: (batch, max(decode_lengths), vocab_size) scores; positions past a
                caption's decode length stay zero.
            captions: the captions reordered by decreasing length.
            decode_lengths: list of (caption length - 1), in the sorted order.
            sort_ind: the permutation that was applied to the batch.
        """
        batch_size = encoder_out.size(0)
        encoder_out = encoder_out.view(batch_size, -1, self.encoder_dim)

        # Sort by decreasing length so that, at step t, the still-active captions are a prefix.
        caplens, sort_ind = caplens.reshape(-1).sort(dim=0, descending=True)
        encoder_out = encoder_out[sort_ind]
        captions = captions[sort_ind]
        embeddings = self.embedding(captions)

        h, c = self.init_hidden_state(encoder_out)
        # The pixel-mean does not depend on the time step: compute it once and slice per step.
        mean_features = encoder_out.mean(dim=1)

        decode_lengths = (caplens - 1).tolist()
        max_steps = max(decode_lengths)
        predictions = torch.zeros(batch_size, max_steps, self.vocab_size, device=encoder_out.device)

        for t in range(max_steps):
            batch_size_t = sum(length > t for length in decode_lengths)
            input_lstm = torch.cat([embeddings[:batch_size_t, t, :], mean_features[:batch_size_t]], dim=1)
            h, c = self.lstm(input_lstm, (h[:batch_size_t], c[:batch_size_t]))
            predictions[:batch_size_t, t, :] = self.fc(self.dropout(h))

        return predictions, captions, decode_lengths, sort_ind

In [7]:
import os
import json
import torch
from torch.utils.data import Dataset
import h5py

class CaptionDataset(Dataset):
    """Dataset with one item per caption, backed by an HDF5 image file and two JSON files.

    Files are looked up in `data_folder` as `{split}_images_{data_name}.hdf5`,
    `{split}_captions_{data_name}.json` and `{split}_caplength_{data_name}.json`.

    Args:
        data_folder: directory that holds the three files.
        data_name: suffix shared by the three file names.
        split: split name; 'train' items omit the per-image caption list.
        transform: optional callable applied to the image tensor.
    """

    def __init__(self, data_folder, data_name, split, transform=None):
        self.split = split
        self.transform = transform

        images_path = os.path.join(data_folder, f"{split}_images_{data_name}.hdf5")
        captions_path = os.path.join(data_folder, f"{split}_captions_{data_name}.json")
        caplens_path = os.path.join(data_folder, f"{split}_caplength_{data_name}.json")

        # The HDF5 handle stays open for the lifetime of the dataset; images are read on demand.
        self.h = h5py.File(images_path, 'r')
        self.imgs = self.h['images']
        self.cpi = self.h.attrs['captions_per_image']

        with open(captions_path, 'r') as fh:
            self.captions = json.load(fh)
        with open(caplens_path, 'r') as fh:
            self.caplens = json.load(fh)

        self.dataset_size = len(self.captions)

    def __getitem__(self, i):
        """Return (img, caption, caplen) for 'train', plus all captions of the image otherwise."""
        # Caption i belongs to image i // cpi because captions are stored image by image.
        image_index = i // self.cpi
        img = torch.FloatTensor(self.imgs[image_index] / 255.)
        if self.transform is not None:
            img = self.transform(img)

        caption = torch.LongTensor(self.captions[i])
        caplen = torch.LongTensor([self.caplens[i]])

        if self.split == 'train':
            return img, caption, caplen

        first_caption = image_index * self.cpi
        all_captions = torch.LongTensor(self.captions[first_caption:first_caption + self.cpi])
        return img, caption, caplen, all_captions

    def __len__(self):
        """Number of captions (not images) in the split."""
        return self.dataset_size

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import os, json
from torchvision import transforms
from tqdm import tqdm

# --- Configuration ---
# Folder with the preprocessed dataset, and the word map stored inside it.
data_folder = "/kaggle/input/food-iac-fine-tune-dataset/preprocessed_dataset"
word_map_path = os.path.join(data_folder, "wordmap_all.json")

# --- Hyperparameters ---
# The word map is loaded first because the vocabulary size is derived from it.
with open(word_map_path, "r") as j:
    word_map = json.load(j)

vocab_size = len(word_map)
embed_size = 300
hidden_size = 512
batch_size = 64
num_epochs = 20
patience = 3
learning_rate = 1e-4
grad_clip = 5.0
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Transform ---
# Resize to 256x256, then normalize each channel with the given mean/std.
# NOTE(review): these look like the usual ImageNet statistics - confirm they match the backbone weights.
transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

In [8]:
def load_trained_model(aspect, ckpt_dir, word_map_path, embed_matrix):
    """Rebuild the encoder/decoder pair of one aspect and restore its best checkpoint.

    Reads `device`, `embed_size` and `hidden_size` from the notebook globals. The models
    are returned as constructed; this function does not switch them to eval mode.

    Args:
        aspect: aspect name; the checkpoint file is `{aspect}_best.pth` inside `ckpt_dir`.
        ckpt_dir: directory that holds the checkpoint.
        word_map_path: path of the JSON word map (its size sets the vocabulary size).
        embed_matrix: embedding tensor passed to the decoder as frozen pretrained embeddings.

    Returns:
        (encoder, decoder, word_map)
    """
    with open(word_map_path, 'r') as fh:
        word_map = json.load(fh)

    checkpoint_file = os.path.join(ckpt_dir, f"{aspect}_best.pth")

    encoder = EncoderCNN().to(device)
    decoder = DecoderRNN(
        embed_size=embed_size,
        hidden_size=hidden_size,
        vocab_size=len(word_map),
        pretrained_embeddings=embed_matrix,
        freeze_embeddings=True,
    ).to(device)

    state = torch.load(checkpoint_file, map_location=device)
    for module, key in ((encoder, "encoder"), (decoder, "decoder")):
        module.load_state_dict(state[key])

    return encoder, decoder, word_map

In [None]:
def caption_image_beam_search(encoder, decoder, image, word_map, beam_size=5, max_len=25, unk_penalty_weight=0.5):
    """Generate a caption for one image with beam search.

    The whole search runs under `torch.no_grad()`, and all working tensors are created on
    the device of the encoder output, so the function does not depend on notebook globals.

    Args:
        encoder: callable mapping a (1, ...) image batch to (1, num_pixels, encoder_dim) features.
        decoder: object exposing `embedding`, `lstm`, `fc` and `init_hidden_state`.
        image: a single image tensor (a batch dimension is added here).
        word_map: dict token -> index; must contain '<start>', '<end>', '<pad>' and '<unk>'.
        beam_size: number of hypotheses kept per step.
        max_len: the search stops once the step counter exceeds this value.
        unk_penalty_weight: amount subtracted from a finished hypothesis' score per '<unk>' token.

    Returns:
        List of words of the best hypothesis, without '<start>', '<pad>' and '<end>'.
    """
    k = beam_size
    start_idx, end_idx = word_map['<start>'], word_map['<end>']
    unk_idx = word_map['<unk>']
    rev_word_map = {index: token for token, index in word_map.items()}

    # Inference only: without no_grad every call would build an autograd graph through the encoder.
    with torch.no_grad():
        encoder_out = encoder(image.unsqueeze(0))  # (1, num_pixels, encoder_dim)
        run_device = encoder_out.device
        encoder_dim = encoder_out.size(-1)
        encoder_out = encoder_out.expand(k, -1, encoder_dim)

        k_prev_words = torch.LongTensor([[start_idx]] * k).to(run_device)
        seqs = k_prev_words
        top_k_scores = torch.zeros(k, 1).to(run_device)

        complete_seqs = []
        complete_seqs_scores = []

        h, c = decoder.init_hidden_state(encoder_out)

        step = 1
        while True:
            embeddings = decoder.embedding(k_prev_words).squeeze(1)
            awe = encoder_out.mean(dim=1)  # mean over pixels stands in for attention
            input_lstm = torch.cat([embeddings, awe], dim=1)
            h, c = decoder.lstm(input_lstm, (h, c))
            scores = torch.log_softmax(decoder.fc(h), dim=1)
            # Width of the score matrix; used to decode flat top-k indices below.
            vocab_size = scores.size(1)

            # Accumulate log-probabilities along each hypothesis.
            scores = top_k_scores.expand_as(scores) + scores
            if step == 1:
                # All k rows are identical at the first step, so only row 0 is considered.
                top_k_scores, top_k_words = scores[0].topk(k, 0, True, True)
            else:
                top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True)

            prev_word_inds = top_k_words // vocab_size
            next_word_inds = top_k_words % vocab_size

            seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1)

            next_words = next_word_inds.tolist()
            incomplete_inds = [ind for ind, word in enumerate(next_words) if word != end_idx]
            complete_inds = [ind for ind, word in enumerate(next_words) if word == end_idx]

            if complete_inds:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds].tolist())

            # Finished hypotheses leave the beam.
            k -= len(complete_inds)
            if k == 0 or step > max_len:
                break

            seqs = seqs[incomplete_inds]
            h = h[prev_word_inds[incomplete_inds]]
            c = c[prev_word_inds[incomplete_inds]]
            encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            step += 1

    if not complete_seqs:
        # Nothing reached '<end>': fall back to the first hypothesis still in the beam.
        best_seq = seqs[0].tolist()
    else:
        # Penalize scores by UNK count
        penalized_scores = [
            score - unk_penalty_weight * seq.count(unk_idx)
            for seq, score in zip(complete_seqs, complete_seqs_scores)
        ]
        best_seq = complete_seqs[penalized_scores.index(max(penalized_scores))]

    skipped = {start_idx, word_map['<pad>'], end_idx}
    return [rev_word_map[idx] for idx in best_seq if idx not in skipped]

In [9]:
# Dataset location and hyperparameters, set again with the same values as the configuration cell.
data_folder = "/kaggle/input/food-iac-fine-tune-dataset/preprocessed_dataset"

# NOTE(review): relies on `word_map` already being defined by an earlier cell.
vocab_size = len(word_map)
embed_size = 300
hidden_size = 512
batch_size = 64
num_epochs = 20
patience = 3
learning_rate = 1e-4
grad_clip = 5.0
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.spice.spice import Spice

from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from tqdm import tqdm
import torch.nn.functional as F
import pandas as pd

aspects = ["color_light", "composition", "dof_and_focus"]
fine_tune_ckpt = "/kaggle/input/single-aspects/pytorch/default/3/fine-tuned-models"

# Load the word map and re-derive what depends on it, so the decoder size always matches
# the vocabulary that is used for decoding in this cell.
with open(word_map_path, "r") as f:
    word_map = json.load(f)
rev_word_map = {v: k for k, v in word_map.items()}
vocab_size = len(word_map)
special_tokens = {word_map['<start>'], word_map['<end>'], word_map['<pad>']}

# Smoothing function for BLEU
chencherry = SmoothingFunction()

results = []

for aspect in aspects:
    print(f"\n📊 Evaluating aspect: {aspect.upper()}")
    val_loader = DataLoader(
        CaptionDataset(data_folder, aspect, split='val', transform=transform),
        batch_size=1, shuffle=False
    )

    encoder = EncoderCNN().to(device)
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size).to(device)

    ckpt = torch.load(os.path.join(fine_tune_ckpt, f"{aspect}_best.pth"), map_location=device)
    encoder.load_state_dict(ckpt["encoder"])
    decoder.load_state_dict(ckpt["decoder"])
    encoder.eval(); decoder.eval()

    references_bleu, hypotheses_bleu = [], []
    gts, res = {}, {}
    total_unk, total_words = 0, 0

    # Pure inference: disable autograd so no graph is kept for each decoded image.
    with torch.no_grad():
        for i, (img, _, _, all_caps) in enumerate(tqdm(val_loader, desc=f"📝 Beam Decode ({aspect})")):
            img = img.squeeze(0).to(device)
            gen_caption = caption_image_beam_search(encoder, decoder, img, word_map, beam_size=5, max_len=25)
            total_words += len(gen_caption)
            total_unk += gen_caption.count('<unk>')

            # Prepare for BLEU
            hypotheses_bleu.append(gen_caption)
            caps = all_caps[0].tolist()
            refs = [[rev_word_map[tok] for tok in cap if tok not in special_tokens] for cap in caps]
            references_bleu.append(refs)

            # Prepare for COCO-style metrics
            res[i] = [{'caption': ' '.join(gen_caption)}]
            gts[i] = [{'caption': ' '.join(ref)} for ref in refs]

    # BLEU scores. The BLEU-3 weights are exact thirds so that they sum to 1.
    bleu1 = corpus_bleu(references_bleu, hypotheses_bleu, weights=(1, 0, 0, 0), smoothing_function=chencherry.method1)
    bleu2 = corpus_bleu(references_bleu, hypotheses_bleu, weights=(0.5, 0.5, 0, 0), smoothing_function=chencherry.method1)
    bleu3 = corpus_bleu(references_bleu, hypotheses_bleu, weights=(1 / 3, 1 / 3, 1 / 3, 0), smoothing_function=chencherry.method1)
    bleu4 = corpus_bleu(references_bleu, hypotheses_bleu, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=chencherry.method1)

    # Tokenizer
    tokenizer = PTBTokenizer()
    gts_tok = tokenizer.tokenize(gts)
    res_tok = tokenizer.tokenize(res)

    # ROUGE-L
    rouge_scorer = Rouge()
    rouge_score, _ = rouge_scorer.compute_score(gts_tok, res_tok)

    # METEOR
    meteor_scorer = Meteor()
    meteor_score, _ = meteor_scorer.compute_score(gts_tok, res_tok)

    # CIDEr
    cider_scorer = Cider()
    cider_score, _ = cider_scorer.compute_score(gts_tok, res_tok)

    # SPICE is best-effort: on failure the error is reported and -1 is stored as a sentinel.
    try:
        spice_scorer = Spice()
        spice_score, _ = spice_scorer.compute_score(gts_tok, res_tok)
    except Exception as e:
        print(f"⚠️ Skipping SPICE for {aspect} due to error: {e}")
        spice_score = -1  # could also be None

    results.append({
        "Aspect": aspect,
        "BLEU-1": bleu1,
        "BLEU-2": bleu2,
        "BLEU-3": bleu3,
        "BLEU-4": bleu4,
        "ROUGE-L": rouge_score,
        "METEOR": meteor_score,
        "CIDEr": cider_score,
        "SPICE": spice_score,
        "UNK Ratio": total_unk / total_words if total_words > 0 else 0
    })

# Final results
df_results = pd.DataFrame(results)
print("\n📈 Full Evaluation Results:")
display(df_results)

In [None]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.spice.spice import Spice

from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from tqdm import tqdm
import torch.nn.functional as F
import pandas as pd

aspects = ["general_impression", "subject", "use_of_camera"]
fine_tune_ckpt = "/kaggle/input/single-aspects-part-2/pytorch/default/2/fine-tuned-models"

# Load the word map and re-derive what depends on it, so the decoder size always matches
# the vocabulary that is used for decoding in this cell.
with open(word_map_path, "r") as f:
    word_map = json.load(f)
rev_word_map = {v: k for k, v in word_map.items()}
vocab_size = len(word_map)
special_tokens = {word_map['<start>'], word_map['<end>'], word_map['<pad>']}

# Smoothing function for BLEU
chencherry = SmoothingFunction()

results = []

for aspect in aspects:
    print(f"\n📊 Evaluating aspect: {aspect.upper()}")
    val_loader = DataLoader(
        CaptionDataset(data_folder, aspect, split='val', transform=transform),
        batch_size=1, shuffle=False
    )

    encoder = EncoderCNN().to(device)
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size).to(device)

    ckpt = torch.load(os.path.join(fine_tune_ckpt, f"{aspect}_best.pth"), map_location=device)
    encoder.load_state_dict(ckpt["encoder"])
    decoder.load_state_dict(ckpt["decoder"])
    encoder.eval(); decoder.eval()

    references_bleu, hypotheses_bleu = [], []
    gts, res = {}, {}
    total_unk, total_words = 0, 0

    # Pure inference: disable autograd so no graph is kept for each decoded image.
    with torch.no_grad():
        for i, (img, _, _, all_caps) in enumerate(tqdm(val_loader, desc=f"📝 Beam Decode ({aspect})")):
            img = img.squeeze(0).to(device)
            gen_caption = caption_image_beam_search(encoder, decoder, img, word_map, beam_size=5, max_len=25)
            total_words += len(gen_caption)
            total_unk += gen_caption.count('<unk>')

            # Prepare for BLEU
            hypotheses_bleu.append(gen_caption)
            caps = all_caps[0].tolist()
            refs = [[rev_word_map[tok] for tok in cap if tok not in special_tokens] for cap in caps]
            references_bleu.append(refs)

            # Prepare for COCO-style metrics
            res[i] = [{'caption': ' '.join(gen_caption)}]
            gts[i] = [{'caption': ' '.join(ref)} for ref in refs]

    # BLEU scores. The BLEU-3 weights are exact thirds so that they sum to 1.
    bleu1 = corpus_bleu(references_bleu, hypotheses_bleu, weights=(1, 0, 0, 0), smoothing_function=chencherry.method1)
    bleu2 = corpus_bleu(references_bleu, hypotheses_bleu, weights=(0.5, 0.5, 0, 0), smoothing_function=chencherry.method1)
    bleu3 = corpus_bleu(references_bleu, hypotheses_bleu, weights=(1 / 3, 1 / 3, 1 / 3, 0), smoothing_function=chencherry.method1)
    bleu4 = corpus_bleu(references_bleu, hypotheses_bleu, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=chencherry.method1)

    # Tokenizer
    tokenizer = PTBTokenizer()
    gts_tok = tokenizer.tokenize(gts)
    res_tok = tokenizer.tokenize(res)

    # ROUGE-L
    rouge_scorer = Rouge()
    rouge_score, _ = rouge_scorer.compute_score(gts_tok, res_tok)

    # METEOR
    meteor_scorer = Meteor()
    meteor_score, _ = meteor_scorer.compute_score(gts_tok, res_tok)

    # CIDEr
    cider_scorer = Cider()
    cider_score, _ = cider_scorer.compute_score(gts_tok, res_tok)

    # SPICE is best-effort: on failure the error is reported and -1 is stored as a sentinel.
    try:
        spice_scorer = Spice()
        spice_score, _ = spice_scorer.compute_score(gts_tok, res_tok)
    except Exception as e:
        print(f"⚠️ Skipping SPICE for {aspect} due to error: {e}")
        spice_score = -1  # could also be None

    results.append({
        "Aspect": aspect,
        "BLEU-1": bleu1,
        "BLEU-2": bleu2,
        "BLEU-3": bleu3,
        "BLEU-4": bleu4,
        "ROUGE-L": rouge_score,
        "METEOR": meteor_score,
        "CIDEr": cider_score,
        "SPICE": spice_score,
        "UNK Ratio": total_unk / total_words if total_words > 0 else 0
    })

# Final results
df_results = pd.DataFrame(results)
print("\n📈 Full Evaluation Results:")
display(df_results)

In [None]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.spice.spice import Spice
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from tqdm import tqdm
import torch.nn.functional as F
import pandas as pd

# Every aspect together with the directory that holds its checkpoint
aspect_ckpt_map = {
    "general_impression": "/kaggle/input/single-aspects-part-2/pytorch/default/2/fine-tuned-models",
    "subject": "/kaggle/input/single-aspects-part-2/pytorch/default/2/fine-tuned-models",
    "use_of_camera": "/kaggle/input/single-aspects-part-2/pytorch/default/2/fine-tuned-models",
    "color_light": "/kaggle/input/single-aspects/pytorch/default/3/fine-tuned-models",
    "composition": "/kaggle/input/single-aspects/pytorch/default/3/fine-tuned-models",
    "dof_and_focus": "/kaggle/input/single-aspects/pytorch/default/3/fine-tuned-models"
}

# Load the word map and re-derive what depends on it, so the decoder size always matches
# the vocabulary that is used for decoding in this cell.
with open(word_map_path, "r") as f:
    word_map = json.load(f)
rev_word_map = {v: k for k, v in word_map.items()}
vocab_size = len(word_map)
special_tokens = {word_map['<start>'], word_map['<end>'], word_map['<pad>']}
chencherry = SmoothingFunction()
results = []

for aspect, ckpt_dir in aspect_ckpt_map.items():
    print(f"\n📊 Evaluating aspect: {aspect.upper()}")
    val_loader = DataLoader(
        CaptionDataset(data_folder, aspect, split='val', transform=transform),
        batch_size=1, shuffle=False
    )

    encoder = EncoderCNN().to(device)
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size).to(device)

    ckpt_path = os.path.join(ckpt_dir, f"{aspect}_best.pth")
    ckpt = torch.load(ckpt_path, map_location=device)
    encoder.load_state_dict(ckpt["encoder"])
    decoder.load_state_dict(ckpt["decoder"])
    encoder.eval(); decoder.eval()

    references_bleu, hypotheses_bleu = [], []
    gts, res = {}, {}
    total_unk, total_words = 0, 0

    # Pure inference: disable autograd so no graph is kept for each decoded image.
    with torch.no_grad():
        for i, (img, _, _, all_caps) in enumerate(tqdm(val_loader, desc=f"📝 Beam Decode ({aspect})")):
            img = img.squeeze(0).to(device)
            gen_caption = caption_image_beam_search(encoder, decoder, img, word_map, beam_size=5, max_len=25)
            total_words += len(gen_caption)
            total_unk += gen_caption.count('<unk>')

            # Tokenized hypothesis and references for BLEU
            hypotheses_bleu.append(gen_caption)
            caps = all_caps[0].tolist()
            refs = [[rev_word_map[tok] for tok in cap if tok not in special_tokens] for cap in caps]
            references_bleu.append(refs)

            # Plain-text captions for the COCO-style scorers
            res[i] = [{'caption': ' '.join(gen_caption)}]
            gts[i] = [{'caption': ' '.join(ref)} for ref in refs]

    # BLEU scores. The BLEU-3 weights are exact thirds so that they sum to 1.
    bleu1 = corpus_bleu(references_bleu, hypotheses_bleu, weights=(1, 0, 0, 0), smoothing_function=chencherry.method1)
    bleu2 = corpus_bleu(references_bleu, hypotheses_bleu, weights=(0.5, 0.5, 0, 0), smoothing_function=chencherry.method1)
    bleu3 = corpus_bleu(references_bleu, hypotheses_bleu, weights=(1 / 3, 1 / 3, 1 / 3, 0), smoothing_function=chencherry.method1)
    bleu4 = corpus_bleu(references_bleu, hypotheses_bleu, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=chencherry.method1)

    tokenizer = PTBTokenizer()
    gts_tok = tokenizer.tokenize(gts)
    res_tok = tokenizer.tokenize(res)

    rouge_scorer = Rouge()
    rouge_score, _ = rouge_scorer.compute_score(gts_tok, res_tok)

    meteor_scorer = Meteor()
    meteor_score, _ = meteor_scorer.compute_score(gts_tok, res_tok)

    cider_scorer = Cider()
    cider_score, _ = cider_scorer.compute_score(gts_tok, res_tok)

    # SPICE is best-effort: on failure the error is reported and -1 is stored as a sentinel.
    try:
        spice_scorer = Spice()
        spice_score, _ = spice_scorer.compute_score(gts_tok, res_tok)
    except Exception as e:
        print(f"⚠️ Skipping SPICE for {aspect} due to error: {e}")
        spice_score = -1

    results.append({
        "Aspect": aspect,
        "BLEU-1": bleu1,
        "BLEU-2": bleu2,
        "BLEU-3": bleu3,
        "BLEU-4": bleu4,
        "ROUGE-L": rouge_score,
        "METEOR": meteor_score,
        "CIDEr": cider_score,
        "SPICE": spice_score,
        "UNK Ratio": total_unk / total_words if total_words > 0 else 0
    })

# Final results
df_results = pd.DataFrame(results)
print("\n📈 Full Evaluation Results:")
display(df_results)

In [None]:
import matplotlib.pyplot as plt

# Grouped bar chart: one group per aspect, one bar per BLEU order.
bleu_cols = ["BLEU-1", "BLEU-2", "BLEU-3", "BLEU-4"]
df_bleu = df_results.set_index("Aspect")[bleu_cols]

ax = df_bleu.plot(kind="bar", figsize=(10, 6), title="BLEU-n Scores per Aspect")
ax.set_ylabel("Score")
# Fixed y-range; bars above 0.25 would be clipped.
ax.set_ylim(0, 0.25)
plt.xticks(rotation=30)
ax.grid(axis='y')
plt.tight_layout()
plt.show()

In [None]:
# Grouped bar chart of the non-BLEU metrics, one group per aspect.
metric_cols = ["ROUGE-L", "METEOR", "CIDEr"]
df_metrics = df_results.set_index("Aspect")[metric_cols]

bar_colors = ["#66c2a5", "#fc8d62", "#8da0cb"]
ax = df_metrics.plot(kind="bar", figsize=(10, 6), title="ROUGE, METEOR, CIDEr per Aspect", color=bar_colors)
ax.set_ylabel("Score")
# Fixed y-range; bars above 0.2 would be clipped.
ax.set_ylim(0, 0.2)
plt.xticks(rotation=30)
ax.grid(axis='y')
plt.tight_layout()
plt.show()

In [15]:
import json, os, torch
from torchsummary import summary

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Checkpoint of the "general_impression" model and the word map it is inspected with.
ckpt_path = "/kaggle/input/single-aspects-part-2/pytorch/default/2/fine-tuned-models/general_impression_best.pth"
word_map_path = "/kaggle/input/food-iac-fine-tune-dataset/preprocessed_dataset/wordmap_all.json"

# Load word_map; its size is used as the decoder's vocabulary size.
with open(word_map_path, "r") as j:
    word_map = json.load(j)
vocab_size = len(word_map)

# Load checkpoint onto the selected device.
ckpt = torch.load(ckpt_path, map_location=device)

In [16]:
# Init models with the same sizes (embed 300, hidden 512) as the hyperparameters set in the configuration cell.
encoder = EncoderCNN().to(device)
decoder = DecoderRNN(
    embed_size=300,
    hidden_size=512,
    vocab_size=vocab_size,
    pretrained_embeddings=None,
    freeze_embeddings=False
).to(device)

# Load weights from checkpoint; the checkpoint stores the two state dicts under "encoder" and "decoder".
encoder.load_state_dict(ckpt["encoder"])
decoder.load_state_dict(ckpt["decoder"])

<All keys matched successfully>

In [17]:
# Print a layer-by-layer summary of the encoder for a 3x256x256 input.
summary(encoder, input_size=(3, 256, 256))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 128, 128]           9,408
       BatchNorm2d-2         [-1, 64, 128, 128]             128
              ReLU-3         [-1, 64, 128, 128]               0
         MaxPool2d-4           [-1, 64, 64, 64]               0
            Conv2d-5           [-1, 64, 64, 64]           4,096
       BatchNorm2d-6           [-1, 64, 64, 64]             128
              ReLU-7           [-1, 64, 64, 64]               0
            Conv2d-8           [-1, 64, 64, 64]          36,864
       BatchNorm2d-9           [-1, 64, 64, 64]             128
             ReLU-10           [-1, 64, 64, 64]               0
           Conv2d-11          [-1, 256, 64, 64]          16,384
      BatchNorm2d-12          [-1, 256, 64, 64]             512
           Conv2d-13          [-1, 256, 64, 64]          16,384
      BatchNorm2d-14          [-1, 256,

In [18]:
# Dummy input
# NOTE(review): this rebinds the global `batch_size` (64 in the configuration cell) to 2;
# any cell run afterwards that reads `batch_size` will see 2.
batch_size = 2
num_pixels = 14 * 14
encoder_dim = 2048
caption_len = 20

# Random encoder features, token ids and caption lengths (lengths drawn from [5, caption_len)).
dummy_encoder_out = torch.randn(batch_size, num_pixels, encoder_dim).to(device)
dummy_captions = torch.randint(0, vocab_size, (batch_size, caption_len)).to(device)
dummy_caplens = torch.randint(5, caption_len, (batch_size, 1)).to(device)

# Check forward pass (eval mode and no_grad: shape check only, nothing is trained here)
decoder.eval()
with torch.no_grad():
    output = decoder(dummy_encoder_out, dummy_captions, dummy_caplens)
    # output[0] is the predictions tensor returned by DecoderRNN.forward.
    print("Decoder forward pass successful. Output shape:", output[0].shape)

# Print architecture
print("\nDecoderRNN architecture:")
print(decoder)

Decoder forward pass successful. Output shape: torch.Size([2, 16, 8842])

DecoderRNN architecture:
DecoderRNN(
  (embedding): Embedding(8842, 300)
  (dropout): Dropout(p=0.5, inplace=False)
  (init_h): Linear(in_features=2048, out_features=512, bias=True)
  (init_c): Linear(in_features=2048, out_features=512, bias=True)
  (lstm): LSTMCell(2348, 512)
  (fc): Linear(in_features=512, out_features=8842, bias=True)
)


# DAE

In [None]:
import json
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.spice.spice import Spice
import pandas as pd

# Load ground truth (all.json); the image entries are stored under the "images" key.
with open("/kaggle/input/food-iac-fine-tune-dataset/final/all.json", "r") as f:
    gt_data = json.load(f)["images"]

# Load DAE output
with open("/kaggle/input/dae-model/pytorch/default/1/dae_lstm_outputs.json", "r") as f:
    dae_outputs = json.load(f)

# --- Map each output id (a position in gt_data) to its image file name
id_to_img = {i: item["filename"] for i, item in enumerate(gt_data)}

# One lookup table instead of scanning gt_data for every output. The first entry wins
# for a repeated file name, which matches a first-match scan.
entry_by_filename = {}
for entry in gt_data:
    entry_by_filename.setdefault(entry["filename"], entry)

# --- Build gts and res keyed by image file name
gts, res = {}, {}
for item in dae_outputs:
    img_name = id_to_img.get(item["id"])
    if img_name:
        gt_captions = entry_by_filename[img_name]["sentences"]
        gts[img_name] = [{"caption": s["raw"]} for s in gt_captions]
        res[img_name] = [{"caption": item["dae_caption"]}]


# --- Tokenization
tokenizer = PTBTokenizer()
gts_tok = tokenizer.tokenize(gts)
res_tok = tokenizer.tokenize(res)

# Keep only the keys present on both sides
valid_keys = list(set(gts_tok.keys()) & set(res_tok.keys()))
gts_tok = {k: gts_tok[k] for k in valid_keys}
res_tok = {k: res_tok[k] for k in valid_keys}

# --- Evaluation
bleu = Bleu(4)
rouge = Rouge()
meteor = Meteor()
cider = Cider()
# SPICE is best-effort: report the failure instead of hiding it, and store None.
try:
    spice = Spice()
    spice_score, _ = spice.compute_score(gts_tok, res_tok)
except Exception as e:
    print(f"⚠️ Skipping SPICE due to error: {e}")
    spice_score = None

bleu_scores, _ = bleu.compute_score(gts_tok, res_tok)
rouge_score, _ = rouge.compute_score(gts_tok, res_tok)
meteor_score, _ = meteor.compute_score(gts_tok, res_tok)
cider_score, _ = cider.compute_score(gts_tok, res_tok)

# --- Results
results = {
    "BLEU-1": bleu_scores[0],
    "BLEU-2": bleu_scores[1],
    "BLEU-3": bleu_scores[2],
    "BLEU-4": bleu_scores[3],
    "ROUGE-L": rouge_score,
    "METEOR": meteor_score,
    "CIDEr": cider_score,
    "SPICE": spice_score,
}

# One-row table of the metrics (the previous statement bound the DataFrame class itself).
# NOTE(review): the original line looks cut off - confirm this is the intended table layout.
df = pd.DataFrame([results])

In [None]:
# Numeric scores are shown with four decimals; anything else (e.g. a missing SPICE) is printed as is.
print("\n📈 Evaluation Results (DAE):")
for metric, value in results.items():
    if isinstance(value, (int, float)):
        print(f"{metric}: {value:.4f}")
    else:
        print(f"{metric}: {value}")

In [None]:
# Imported here so the cell does not depend on an earlier plotting cell having been run.
import matplotlib.pyplot as plt

# Keep only the metrics that have a value (not None)
filtered_results = {k: v for k, v in results.items() if v is not None}
metric_names = list(filtered_results.keys())
metric_scores = list(filtered_results.values())

# Plot
plt.figure(figsize=(10, 6))
plt.bar(metric_names, metric_scores)
plt.ylabel("Score")
plt.title("Evaluasi Caption - DAE LSTM (tanpa retouch)")
# `default=0` keeps the cell from failing when every metric is missing.
plt.ylim(0, max(metric_scores, default=0) + 0.05)

# Write each value just above its bar
for i, score in enumerate(metric_scores):
    plt.text(i, score + 0.005, f"{score:.3f}", ha='center')

plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.show()

In [1]:
import torch
import json

# Load the preprocessed DAE data.
# NOTE(review): this rebinds the global `word_map` to the DAE word map - confirm that cells
# run afterwards do not expect the CNN-LSTM word map.
dae_data = torch.load('/kaggle/input/dae-model/pytorch/default/1/dae_preprocessed_filtered.pt')
input_seqs = dae_data['input_seqs']
word_map = dae_data['word_map']

# Index -> token lookup
rev_word_map = {index: token for token, index in word_map.items()}


def _words_until_end(token_ids):
    """Yield the words of one sequence, stopping at '<end>' and skipping '<start>'/'<pad>'."""
    for token_id in token_ids:
        word = rev_word_map.get(token_id, '<unk>')
        if word == '<end>':
            return
        if word not in ['<start>', '<pad>']:
            yield word


# Decode every input sequence into a plain-text caption
dae_inputs = [
    {"id": idx, "caption_input": ' '.join(_words_until_end(seq.tolist()))}
    for idx, seq in enumerate(input_seqs)
]

# Save to a JSON file
with open("dae_inputs.json", "w") as f:
    json.dump(dae_inputs, f, indent=2)

print(f"Berhasil menyimpan {len(dae_inputs)} caption input ke dae_inputs.json")

Berhasil menyimpan 6480 caption input ke dae_inputs.json


In [5]:
import requests
import base64
import traceback

# Fungsi encode gambar ke base64
def encode_image_base64(img_path):
    """Read a file and return its content as a base64 string.

    Any failure (for example a missing file) is reported with a warning on stdout and
    results in `None` instead of an exception.
    """
    try:
        with open(img_path, "rb") as fh:
            raw_bytes = fh.read()
        encoded = base64.b64encode(raw_bytes)
        return encoded.decode('utf-8')
    except Exception as e:
        print(f"[WARNING] Gagal encode gambar {img_path}: {e}")
        return None

In [6]:
import os

# Read the key from the environment so the secret never has to be written into the notebook;
# the placeholder is kept as the fallback when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "API KEY")

def retouch_gpt_api(caption_dae, input_dae, img_path=None, openai_api_key=OPENAI_API_KEY):
    """
    Mengirim caption DAE, input DAE, dan image (opsional) ke GPT-4o Mini via API.
    Return hasil retouch atau None jika rate limit.
    """
    url = "https://api.openai.com/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {openai_api_key}",
        "Content-Type": "application/json"
    }
    # Prompt
    prompt = (
        f"Here are combined aspect captions from the CNN-LSTM model (may still contain '<unk>'):\n"
        f"\"{input_dae}\"\n\n"
        f"And this is the DAE-generated caption (may still contain <unk>):\n"
        f"\"{caption_dae}\"\n\n"
        f"Your task:\n"
        f"1. Rewrite the DAE caption to be more natural and fluent, but do NOT change its structure or core words.\n"
        f"2. IMPORTANT: Ensure the final caption **explicitly covers all the key information/aspects** mentioned in the combined CNN-LSTM caption above (general impression, subject, use of camera, color/light, composition, dof/focus)—even if briefly, and not necessarily in order.\n"
        f"3. Do not add new information that is not present in the input/DAE captions. Do not guess objects or details from the image.\n"
        f"4. Fix <unk> if possible, and merge all aspects into one cohesive sentence.\n"
        f"Return only the final caption as one simple sentence containing all aspects. No extra words, no extra creativity.\n"
    )


    # Build messages
    messages = [
        {"role": "system", "content": "You are an expert assistant for rewriting image captions."},
        {"role": "user", "content": prompt}
    ]
    # Jika gambar bisa di-base64, tambahkan sebagai attachment (OpenAI API VISION style)
    img_base64 = encode_image_base64(img_path) if img_path is not None else None
    if img_base64:
        # Use OpenAI "vision" input format
        messages.append({
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}}
            ]
        })
    # Build payload
    payload = {
        "model": "gpt-4o-mini",  # atau 'gpt-4o-mini' jika sudah available, biasanya 'gpt-4o'
        "messages": messages,
        "max_tokens": 96,
        "temperature": 0.7,
    }
    # Kirim ke API
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=40)
        if response.status_code == 429:
            print("[Rate Limit] OpenAI API rate limit, skip sample.")
            return None
        response.raise_for_status()
        result = response.json()
        if "choices" in result:
            # Jika ada image vision, ambil response terakhir
            reply = result['choices'][-1]['message']['content'].strip()
            return reply
        else:
            print("[ERROR] Response tidak mengandung 'choices'")
            print(result)
            return None
    except Exception as e:
        print(f"[ERROR] API call gagal: {e}\n{traceback.format_exc()}")
        return None

In [7]:
import json
from tqdm import tqdm

# Input and output paths
dae_output_path = "/kaggle/input/dae-model/pytorch/default/1/dae_lstm_outputs.json"
dae_input_path = "/kaggle/working/dae_inputs.json"  # holds the DAE input caption per ID
save_path = "/kaggle/working/retouch_gpt.json"

# Number of newly collected results between two checkpoint writes. Rewriting the
# whole (growing) JSON file after every single item is quadratic I/O.
CHECKPOINT_EVERY = 50


def _save_retouch_results(records, path=save_path):
    """Write the retouch records collected so far to `path` as indented JSON."""
    with open(path, "w") as f:
        json.dump(records, f, indent=2)


# Load the DAE captions
with open(dae_output_path, "r") as f:
    dae_outputs = json.load(f)

# Load caption_input (the combined aspect captions)
with open(dae_input_path, "r") as f:
    dae_inputs = json.load(f)  # format: list of {"id": ..., "caption_input": ...}

# Map each ID to its input caption
id_to_input = {x["id"]: x["caption_input"] for x in dae_inputs}

# Build the retouch results
results = []
loop_completed = False
try:
    for item in tqdm(dae_outputs):
        id_ = item["id"]
        caption_dae = item["dae_caption"]
        caption_input = id_to_input.get(id_)

        if not caption_input:
            continue  # skip when there is no (non-empty) input caption

        # Call the API for the retouch
        caption_retouch = retouch_gpt_api(caption_dae, caption_input)

        # Skip failed calls
        if caption_retouch is None:
            continue

        results.append({
            "id": id_,
            "caption_dae": caption_dae,
            "caption_input": caption_input,
            "caption_retouch": caption_retouch
        })

        # Periodic checkpoint
        if len(results) % CHECKPOINT_EVERY == 0:
            _save_retouch_results(results)
    loop_completed = True
finally:
    # Always flush what was collected, also when the run is interrupted.
    # After a normal run the file is written even if it is empty, so the
    # downstream cell can open it. After a failure with nothing collected,
    # any existing file is left untouched.
    if loop_completed or results:
        _save_retouch_results(results)

100%|██████████| 6480/6480 [2:25:18<00:00,  1.35s/it]  


In [10]:
import json
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.meteor.meteor import Meteor

# Load ground truth (all.json)
with open("/kaggle/input/food-iac-fine-tune-dataset/final/all.json", "r") as f:
    gt_data = json.load(f)["images"]

# Load the retouched captions
with open("/kaggle/working/retouch_gpt.json", "r") as f:
    retouch_data = json.load(f)

# Map each ID (position in gt_data) to its image filename
id_to_filename = {i: item["filename"] for i, item in enumerate(gt_data)}

# Index the ground-truth entries by filename once, instead of scanning gt_data
# linearly for every retouched caption. setdefault keeps the FIRST entry for a
# filename, which is what a first-match linear search would return.
filename_to_entry = {}
for entry in gt_data:
    filename_to_entry.setdefault(entry["filename"], entry)

# Prepare gts (ground truth) and res (retouched captions)
gts, res = {}, {}
for item in retouch_data:
    id_ = item["id"]
    img_name = id_to_filename.get(id_)
    if img_name is None:
        continue
    # Ground-truth captions from all.json
    gt_captions = filename_to_entry[img_name]["sentences"]
    gts[img_name] = [{"caption": s["raw"]} for s in gt_captions]
    # Retouched caption
    res[img_name] = [{"caption": item["caption_retouch"]}]

# Tokenize
tokenizer = PTBTokenizer()
gts_tok = tokenizer.tokenize(gts)
res_tok = tokenizer.tokenize(res)

# Keep only the keys present on both sides (the scorers need matching keys)
valid_keys = list(set(gts_tok.keys()) & set(res_tok.keys()))
gts_tok = {k: gts_tok[k] for k in valid_keys}
res_tok = {k: res_tok[k] for k in valid_keys}

# Evaluate
bleu = Bleu(4)
rouge = Rouge()
meteor = Meteor()
cider = Cider()

bleu_scores, _ = bleu.compute_score(gts_tok, res_tok)
rouge_score, _ = rouge.compute_score(gts_tok, res_tok)
meteor_score, _ = meteor.compute_score(gts_tok, res_tok)
cider_score, _ = cider.compute_score(gts_tok, res_tok)

# Print the evaluation results
print("=== Evaluation of Caption Retouch ===")
print(f"BLEU-1   : {bleu_scores[0]:.4f}")
print(f"BLEU-2   : {bleu_scores[1]:.4f}")
print(f"BLEU-3   : {bleu_scores[2]:.4f}")
print(f"BLEU-4   : {bleu_scores[3]:.4f}")
print(f"ROUGE-L  : {rouge_score:.4f}")
print(f"METEOR   : {meteor_score:.4f}")
print(f"CIDEr    : {cider_score:.4f}")


PTBTokenizer tokenized 1080434 tokens at 1419965.27 tokens per second.
PTBTokenizer tokenized 280362 tokens at 670416.67 tokens per second.


{'testlen': 246090, 'reflen': 197117, 'guess': [246090, 239610, 233130, 226650], 'correct': [106386, 22253, 3155, 377]}
ratio: 1.2484463541957251
=== Evaluation of Caption Retouch ===
BLEU-1   : 0.4323
BLEU-2   : 0.2004
BLEU-3   : 0.0816
BLEU-4   : 0.0308
ROUGE-L  : 0.2431
METEOR   : 0.1314
CIDEr    : 0.0074
