In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
print(os.listdir("/kaggle/input"))


In [None]:
print(os.listdir("/kaggle/input/dataforvtt/data"))


In [None]:
import os

data_path = "/kaggle/input/data-val/data_val"
print(os.listdir(data_path))


In [4]:
import os
import json
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torchvision.models as models
import tiktoken

from pycocoevalcap.spice.spice import Spice
from IPython.display import FileLink

# ========================
# CONFIGURATION
# ========================
DATA_ROOT      = r"/kaggle/input/dataforvtt/data"         # train data root
# DATA_ROOT      = r""         # train data root
CLIPS_DIR      = os.path.join(DATA_ROOT, "clips")
CAPTIONS_FILE  = os.path.join(DATA_ROOT, "captions.json")

VAL_ROOT       = "/kaggle/input/data-val/data_val"     # validation data root
VAL_CLIPS_DIR  = os.path.join(VAL_ROOT, "clips")
VAL_CAPTIONS   = os.path.join(VAL_ROOT, "captions.json")

EPOCHS      = 5
BATCH_SIZE  = 8
LR          = 1e-4
EMBED_DIM   = 512
MAX_LEN     = 32
ALPHA       = 0.1  # contrastive loss weight

# ========================
# TOKENIZER
# ========================
tokenizer = tiktoken.get_encoding("gpt2")
VOCAB_SIZE = tokenizer.n_vocab
BOS_ID, EOS_ID, PAD_ID = 50256, 50256, 0

# ========================
# DATASET
# ========================
class MSRVTTClipsDataset(Dataset):
    def __init__(self, clips_dir, captions_file, max_len=MAX_LEN):
        self.clips_dir = clips_dir
        with open(captions_file, "r") as f:
            self.captions = json.load(f)
        self.video_ids = sorted(list(self.captions.keys()))
        self.max_len = max_len

    def _tokenize(self, text):
        ids = [BOS_ID] + tokenizer.encode(text) + [EOS_ID]
        arr = np.full(self.max_len, PAD_ID, dtype=np.int64)
        arr[:min(len(ids), self.max_len)] = ids[:self.max_len]
        return torch.tensor(arr, dtype=torch.long)

    def __getitem__(self, idx):
        vid = self.video_ids[idx]
        clip = np.load(os.path.join(self.clips_dir, vid, "clip.npy")).astype(np.float32)
        if clip.max() > 1.5:
            clip /= 255.0
        cap_text = self.captions[vid][0]
        cap_ids = self._tokenize(cap_text)
        return torch.from_numpy(clip), cap_ids

    def __len__(self):
        return len(self.video_ids)

# ========================
# C3D Backbone
# ========================
class C3DBackbone(nn.Module):
    def __init__(self, out_dim=4096):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv3d(3,64,3,padding=1), nn.ReLU(), nn.MaxPool3d(2,2),
            nn.Conv3d(64,128,3,padding=1), nn.ReLU(), nn.AdaptiveAvgPool3d((1,1,1))
        )
        self.fc = nn.Linear(128, out_dim)

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)

# ========================
# Encoder
# ========================
class EncoderResNetAllFramesC3D(nn.Module):
    def __init__(self, embed_dim=EMBED_DIM, c3d_out=4096):
        super().__init__()
        resnet = models.resnet152(weights=models.ResNet152_Weights.IMAGENET1K_V2)
        self.resnet = nn.Sequential(*list(resnet.children())[:-1])
        self.res_proj = nn.Linear(2048, embed_dim)
        self.c3d = C3DBackbone(out_dim=c3d_out)
        self.c3d_proj = nn.Linear(c3d_out, embed_dim)
        self.fuse = nn.Linear(embed_dim*2, embed_dim)
        self.bn = nn.BatchNorm1d(embed_dim)
        self.register_buffer("mean", torch.tensor([0.485, 0.456, 0.406]).view(1,3,1,1))
        self.register_buffer("std", torch.tensor([0.229, 0.224, 0.225]).view(1,3,1,1))
        for p in self.resnet.parameters():
            p.requires_grad = False
        for p in self.c3d.parameters():
            p.requires_grad = False

    def forward(self, clip_16):
        B = clip_16.size(0)
        frames = clip_16.permute(0,2,1,3,4).reshape(B*16,3,112,112)
        frames = F.interpolate(frames, size=(224,224), mode="bilinear")
        frames = (frames - self.mean) / self.std
        with torch.no_grad():
            r = self.resnet(frames)
        r = self.res_proj(r.view(B*16, -1)).view(B, 16, -1).mean(1)
        with torch.no_grad():
            c = self.c3d(clip_16)
        c = self.c3d_proj(c)
        f = self.fuse(torch.cat([r, c], 1))
        return self.bn(f)

# ========================
# Decoder
# ========================
class GPTStyleDecoder(nn.Module):
    def __init__(self, embed_dim=EMBED_DIM, vocab_size=VOCAB_SIZE,
                 n_heads=8, n_layers=4, ff_dim=2048, max_len=MAX_LEN):
        super().__init__()
        self.tok_emb = nn.Embedding(vocab_size, embed_dim)
        self.pos_emb = nn.Embedding(max_len, embed_dim)
        layer = nn.TransformerDecoderLayer(embed_dim, n_heads, ff_dim, activation="gelu")
        self.decoder = nn.TransformerDecoder(layer, n_layers)
        self.lm_head = nn.Linear(embed_dim, vocab_size)
        self.max_len = max_len

    def forward(self, vid_emb, token_ids):
        B, T = token_ids.shape
        pos = torch.arange(T, device=token_ids.device).unsqueeze(0).expand(B, T)
        x = self.tok_emb(token_ids) + self.pos_emb(pos)
        mem = vid_emb.unsqueeze(1).expand(B, T, -1)
        x, mem = x.transpose(0, 1), mem.transpose(0, 1)
        mask = torch.triu(torch.ones(T, T, device=token_ids.device) * float("-inf"), 1)
        y = self.decoder(x, mem, tgt_mask=mask)
        y = y.transpose(0, 1)
        pooled = y.mean(dim=1)
        logits = self.lm_head(y)
        return logits, pooled

# ========================
# Contrastive Loss
# ========================
class ContrastiveLoss(nn.Module):
    def __init__(self, temperature=0.07):
        super().__init__()
        self.temperature = temperature
        self.cos = nn.CosineSimilarity(dim=-1)

    def forward(self, video_embed, caption_embed):
        video_norm = F.normalize(video_embed, dim=-1)
        caption_norm = F.normalize(caption_embed, dim=-1)
        logits = torch.matmul(video_norm, caption_norm.T) / self.temperature
        labels = torch.arange(video_embed.size(0), device=video_embed.device)
        return nn.CrossEntropyLoss()(logits, labels)

# ========================
# Greedy Decode
# ========================
def greedy_decode(enc, dec, clip_path, max_len=MAX_LEN, device=None):
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    clip = np.load(clip_path).astype(np.float32)
    if clip.max() > 1.5:
        clip /= 255.0
    clip = torch.from_numpy(clip).unsqueeze(0).to(device).float()
    enc.eval()
    dec.eval()
    with torch.no_grad():
        emb = enc(clip)
        seq = [BOS_ID]
        for _ in range(max_len - 1):
            inp = torch.tensor(seq, device=device).unsqueeze(0)
            logits, _ = dec(emb, inp)
            nxt = int(logits[0, -1].argmax())
            seq.append(nxt)
            if nxt == EOS_ID:
                break
    return tokenizer.decode([i for i in seq if i not in (BOS_ID, EOS_ID, PAD_ID)])

# ========================
# Evaluate with SPICE
# ========================
def evaluate(enc, dec, dataloader, crit, contrastive_crit, alpha, device):
    enc.eval()
    dec.eval()
    spice = Spice()
    total_loss = 0
    total_tokens = 0
    correct = 0
    all_gen = []
    all_refs = []
    with torch.no_grad():
        for clip, cap in dataloader:
            clip, cap = clip.to(device).float(), cap.to(device)
            vid_emb = enc(clip)
            logits, cap_emb = dec(vid_emb, cap[:, :-1])
            target = cap[:, 1:]
            ce_loss = crit(logits.reshape(-1, logits.size(-1)), target.reshape(-1))
            contrast_loss = contrastive_crit(vid_emb, cap_emb)
            loss = ce_loss + alpha * contrast_loss
            total_loss += loss.item() * clip.size(0)
            preds = logits.argmax(-1)
            mask = target != PAD_ID
            correct += (preds == target).masked_select(mask).sum().item()
            total_tokens += mask.sum().item()
            # prepare for SPICE
            for b in range(clip.size(0)):
                img_id = len(all_gen)
                pred_ids = preds[b].tolist()
                pred_caption = tokenizer.decode([i for i in pred_ids if i not in (BOS_ID, EOS_ID, PAD_ID)])
                true_ids = target[b].tolist()
                true_caption = tokenizer.decode([i for i in true_ids if i not in (BOS_ID, EOS_ID, PAD_ID)])
                all_gen.append((img_id, pred_caption))
                all_refs.append((img_id, [true_caption]))

    avg_loss = total_loss / len(dataloader.dataset)
    acc = correct / total_tokens if total_tokens > 0 else 0
    # Convert lists into dictionaries
    gen_dict = {img_id: [caption] for img_id, caption in all_gen}
    ref_dict = {img_id: refs for img_id, refs in all_refs}
    spice_score, _ = spice.compute_score(ref_dict, gen_dict)
    return avg_loss, acc, spice_score

# ========================
# Train Model
# ========================
def train_model():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)

    train_ds = MSRVTTClipsDataset(CLIPS_DIR, CAPTIONS_FILE, max_len=MAX_LEN)
    val_ds = MSRVTTClipsDataset(VAL_CLIPS_DIR, VAL_CAPTIONS, max_len=MAX_LEN)

    train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE)

    enc = EncoderResNetAllFramesC3D(EMBED_DIM).to(device)
    dec = GPTStyleDecoder(embed_dim=EMBED_DIM, max_len=MAX_LEN).to(device)

    crit = nn.CrossEntropyLoss(ignore_index=PAD_ID)
    contrastive_crit = ContrastiveLoss()
    params = list(dec.parameters()) + list(enc.res_proj.parameters()) + \
             list(enc.c3d_proj.parameters()) + list(enc.fuse.parameters()) + list(enc.bn.parameters())
    opt = optim.Adam(params, lr=LR)

    for ep in range(1, EPOCHS + 1):
        enc.train()
        dec.train()
        for step, (clip, cap) in enumerate(train_dl, 1):
            clip, cap = clip.to(device).float(), cap.to(device)
            vid_emb = enc(clip)
            logits, cap_emb = dec(vid_emb, cap[:, :-1])
            target = cap[:, 1:]
            ce_loss = crit(logits.reshape(-1, logits.size(-1)), target.reshape(-1))
            contrast_loss = contrastive_crit(vid_emb, cap_emb)
            loss = ce_loss + ALPHA * contrast_loss
            opt.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(params, 1.0)
            opt.step()
            if step % 10 == 0:
                print(f"Epoch {ep}/{EPOCHS}, Step {step}, Train Loss {loss.item():.4f}")

        val_loss, val_acc, spice_score = evaluate(enc, dec, val_dl, crit, contrastive_crit, ALPHA, device)
        print(f"Epoch {ep} DONE. Val Loss {val_loss:.4f}, Val Acc {val_acc*100:.2f}%, SPICE {spice_score:.4f}")

    torch.save(enc.state_dict(), "/kaggle/working/encoder.pth")
    torch.save(dec.state_dict(), "/kaggle/working/decoder.pth")
    print("Models saved successfully.")
    return enc, dec

# ========================
# Main
# ========================
if __name__ == "__main__":
    enc, dec = train_model()


Using device: cuda


Downloading: "https://download.pytorch.org/models/resnet152-f82ba261.pth" to /root/.cache/torch/hub/checkpoints/resnet152-f82ba261.pth
100%|██████████| 230M/230M [00:01<00:00, 203MB/s]  


Epoch 1/5, Step 10, Train Loss 9.8233
Epoch 1/5, Step 20, Train Loss 8.6549
Epoch 1/5, Step 30, Train Loss 8.4684
Epoch 1/5, Step 40, Train Loss 7.3511
Epoch 1/5, Step 50, Train Loss 6.8163
Epoch 1/5, Step 60, Train Loss 6.4702
Epoch 1/5, Step 70, Train Loss 6.3055
Epoch 1/5, Step 80, Train Loss 5.7503
Epoch 1/5, Step 90, Train Loss 6.7666
Epoch 1/5, Step 100, Train Loss 6.5504
Epoch 1/5, Step 110, Train Loss 5.7121
Epoch 1/5, Step 120, Train Loss 6.3460
Epoch 1/5, Step 130, Train Loss 5.6124
Epoch 1/5, Step 140, Train Loss 5.3321
Epoch 1/5, Step 150, Train Loss 6.6381
Epoch 1/5, Step 160, Train Loss 6.6829
Epoch 1/5, Step 170, Train Loss 5.9128
Epoch 1/5, Step 180, Train Loss 5.0446
Epoch 1/5, Step 190, Train Loss 4.5552
Epoch 1/5, Step 200, Train Loss 5.6204
Epoch 1/5, Step 210, Train Loss 5.9888
Epoch 1/5, Step 220, Train Loss 5.7405
Epoch 1/5, Step 230, Train Loss 5.3708
Epoch 1/5, Step 240, Train Loss 5.7389
Epoch 1/5, Step 250, Train Loss 6.6341
Epoch 1/5, Step 260, Train Loss 5.

Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.6 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.3 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.7 sec].
Loading classif

SPICE evaluation took: 3.543 min
Epoch 1 DONE. Val Loss 4.8524, Val Acc 30.23%, SPICE 0.0631
Epoch 2/5, Step 10, Train Loss 4.5143
Epoch 2/5, Step 20, Train Loss 5.0056
Epoch 2/5, Step 30, Train Loss 5.9333
Epoch 2/5, Step 40, Train Loss 4.5362
Epoch 2/5, Step 50, Train Loss 3.9123
Epoch 2/5, Step 60, Train Loss 4.3847
Epoch 2/5, Step 70, Train Loss 4.8134
Epoch 2/5, Step 80, Train Loss 5.2461
Epoch 2/5, Step 90, Train Loss 4.9668
Epoch 2/5, Step 100, Train Loss 3.8438
Epoch 2/5, Step 110, Train Loss 3.8587
Epoch 2/5, Step 120, Train Loss 4.4336
Epoch 2/5, Step 130, Train Loss 4.6415
Epoch 2/5, Step 140, Train Loss 4.0564
Epoch 2/5, Step 150, Train Loss 4.0636
Epoch 2/5, Step 160, Train Loss 4.8708
Epoch 2/5, Step 170, Train Loss 4.5541
Epoch 2/5, Step 180, Train Loss 4.9501
Epoch 2/5, Step 190, Train Loss 4.6782
Epoch 2/5, Step 200, Train Loss 4.4651
Epoch 2/5, Step 210, Train Loss 4.4435
Epoch 2/5, Step 220, Train Loss 5.1831
Epoch 2/5, Step 230, Train Loss 4.2720
Epoch 2/5, Step 240

Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.6 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.3 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.7

SPICE evaluation took: 3.950 min
Epoch 2 DONE. Val Loss 4.5730, Val Acc 32.44%, SPICE 0.0795
Epoch 3/5, Step 10, Train Loss 3.8931
Epoch 3/5, Step 20, Train Loss 2.9807
Epoch 3/5, Step 30, Train Loss 4.5647
Epoch 3/5, Step 40, Train Loss 3.3538
Epoch 3/5, Step 50, Train Loss 4.2478
Epoch 3/5, Step 60, Train Loss 4.6744
Epoch 3/5, Step 70, Train Loss 4.4727
Epoch 3/5, Step 80, Train Loss 4.4343
Epoch 3/5, Step 90, Train Loss 4.3897
Epoch 3/5, Step 100, Train Loss 4.8336
Epoch 3/5, Step 110, Train Loss 4.5930
Epoch 3/5, Step 120, Train Loss 4.7028
Epoch 3/5, Step 130, Train Loss 4.1604
Epoch 3/5, Step 140, Train Loss 3.9854
Epoch 3/5, Step 150, Train Loss 3.8404
Epoch 3/5, Step 160, Train Loss 4.1965
Epoch 3/5, Step 170, Train Loss 3.4119
Epoch 3/5, Step 180, Train Loss 3.3257
Epoch 3/5, Step 190, Train Loss 3.4249
Epoch 3/5, Step 200, Train Loss 3.7009
Epoch 3/5, Step 210, Train Loss 4.1687
Epoch 3/5, Step 220, Train Loss 4.4901
Epoch 3/5, Step 230, Train Loss 3.5755
Epoch 3/5, Step 240

Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.6 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.3 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.7

SPICE evaluation took: 4.434 min
Epoch 3 DONE. Val Loss 4.4311, Val Acc 33.63%, SPICE 0.0711
Epoch 4/5, Step 10, Train Loss 3.6915
Epoch 4/5, Step 20, Train Loss 4.1440
Epoch 4/5, Step 30, Train Loss 3.0877
Epoch 4/5, Step 40, Train Loss 2.9022
Epoch 4/5, Step 50, Train Loss 4.3101
Epoch 4/5, Step 60, Train Loss 3.1811
Epoch 4/5, Step 70, Train Loss 4.3193
Epoch 4/5, Step 80, Train Loss 3.8317
Epoch 4/5, Step 90, Train Loss 4.3005
Epoch 4/5, Step 100, Train Loss 3.7869
Epoch 4/5, Step 110, Train Loss 2.5447
Epoch 4/5, Step 120, Train Loss 3.0982
Epoch 4/5, Step 130, Train Loss 3.4620
Epoch 4/5, Step 140, Train Loss 3.9467
Epoch 4/5, Step 150, Train Loss 3.9877
Epoch 4/5, Step 160, Train Loss 3.9207
Epoch 4/5, Step 170, Train Loss 3.3010
Epoch 4/5, Step 180, Train Loss 3.8431
Epoch 4/5, Step 190, Train Loss 3.1116
Epoch 4/5, Step 200, Train Loss 3.8232
Epoch 4/5, Step 210, Train Loss 3.7342
Epoch 4/5, Step 220, Train Loss 3.8479
Epoch 4/5, Step 230, Train Loss 3.0691
Epoch 4/5, Step 240

Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.6 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.3 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.6

SPICE evaluation took: 3.701 min
Epoch 4 DONE. Val Loss 4.3872, Val Acc 33.45%, SPICE 0.0721
Epoch 5/5, Step 10, Train Loss 2.5408
Epoch 5/5, Step 20, Train Loss 3.0672
Epoch 5/5, Step 30, Train Loss 3.4228
Epoch 5/5, Step 40, Train Loss 2.5702
Epoch 5/5, Step 50, Train Loss 3.0369
Epoch 5/5, Step 60, Train Loss 3.0873
Epoch 5/5, Step 70, Train Loss 3.4339
Epoch 5/5, Step 80, Train Loss 3.2424
Epoch 5/5, Step 90, Train Loss 2.6391
Epoch 5/5, Step 100, Train Loss 3.4499
Epoch 5/5, Step 110, Train Loss 3.7120
Epoch 5/5, Step 120, Train Loss 2.6033
Epoch 5/5, Step 130, Train Loss 3.7258
Epoch 5/5, Step 140, Train Loss 3.1650
Epoch 5/5, Step 150, Train Loss 3.1193
Epoch 5/5, Step 160, Train Loss 3.0431
Epoch 5/5, Step 170, Train Loss 2.9790
Epoch 5/5, Step 180, Train Loss 3.3844
Epoch 5/5, Step 190, Train Loss 3.2624
Epoch 5/5, Step 200, Train Loss 3.7177
Epoch 5/5, Step 210, Train Loss 2.1653
Epoch 5/5, Step 220, Train Loss 2.9289
Epoch 5/5, Step 230, Train Loss 3.6300
Epoch 5/5, Step 240

Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.6 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.3 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.7

SPICE evaluation took: 3.989 min
Epoch 5 DONE. Val Loss 4.3587, Val Acc 34.41%, SPICE 0.0776
Models saved successfully.


In [3]:
!pip install nltk



In [None]:
from IPython.display import FileLink

# Create clickable links for download
display(FileLink("/kaggle/working/encoder.pth"))
display(FileLink("/kaggle/working/decoder.pth"))


In [None]:
import os
print(os.getcwd())


In [5]:
import os
import torch
import numpy as np
from random import sample

def predict_captions(enc, dec, clips_dir, num_samples=5, max_len=MAX_LEN, device=None):
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Get list of video clip files
    video_files = [f for f in os.listdir(clips_dir) if os.path.exists(os.path.join(clips_dir, f, "clip.npy"))]
    
    # Sample random videos
    sample_vids = sample(video_files, min(num_samples, len(video_files)))
    
    # Set models to evaluation mode
    enc.eval()
    dec.eval()
    
    predictions = []
    
    with torch.no_grad():
        for vid in sample_vids:
            # Load and preprocess clip
            clip_path = os.path.join(clips_dir, vid, "clip.npy")
            clip = np.load(clip_path).astype(np.float32)
            if clip.max() > 1.5:
                clip /= 255.0
            clip = torch.from_numpy(clip).unsqueeze(0).to(device).float()
            
            # Generate caption
            caption = greedy_decode(enc, dec, clip_path, max_len=max_len, device=device)
            
            predictions.append({
                'video_id': vid,
                'predicted_caption': caption
            })
    
    return predictions

# Example usage
def main():
    # Assuming enc and dec are already loaded from train_model()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Load saved models if not already loaded
    enc = EncoderResNetAllFramesC3D(embed_dim=EMBED_DIM).to(device)
    dec = GPTStyleDecoder(embed_dim=EMBED_DIM, max_len=MAX_LEN).to(device)
    enc.load_state_dict(torch.load("/kaggle/working/encoder.pth", map_location=device))
    dec.load_state_dict(torch.load("/kaggle/working/decoder.pth", map_location=device))
    
    # Predict captions for 5 random training videos
    predictions = predict_captions(enc, dec, CLIPS_DIR, num_samples=5, max_len=MAX_LEN, device=device)
    
    # Print results
    print("\nPredicted Captions:")
    for pred in predictions:
        print(f"Video ID: {pred['video_id']}")
        print(f"Caption: {pred['predicted_caption']}\n")

if __name__ == "__main__":
    main()


Predicted Captions:
Video ID: video2789
Caption: a man is giving a speech

Video ID: video3483
Caption: a band is performing a song on stage

Video ID: video4184
Caption: a woman is riding a bike on a road

Video ID: video3654
Caption: a man is talking about a man

Video ID: video2795
Caption: a clip from a movie is playing

