In [1]:
import os
import json
import random
import pickle
from collections import Counter

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# -------------------------------
# Settings and Hyperparameters
# -------------------------------

# Directory scanned for per-video ".npy" feature files.
FEAT_DIR = os.path.join(os.getcwd(), "features", "feat_train")
# Fraction of the shuffled caption-video pairs used for training; the rest is validation.
train_split = 0.85
max_length = 10               # length every caption is padded/truncated to (including <bos> and <eos>)
latent_dim = 256              # hidden size handed to the encoder LSTM (per direction)
time_steps_encoder = 20       # expected number of frames (features) per video; not enforced by the code shown here
num_encoder_tokens = 2560     # dimension of each feature vector (encoder LSTM input size)
num_decoder_tokens = 1500     # maximum vocabulary size (including the four special tokens)
batch_size = 100              # batch size for both the training and validation DataLoader
num_epochs = 30               # number of passes over the training set
learning_rate = 0.0001        # learning rate passed to the Adam optimizer

# Special tokens. They occupy vocabulary indices 0-3 in the order PAD, BOS, EOS, UNK;
# index 0 (PAD) is also the index ignored by the loss and the embedding layer.
PAD_TOKEN = "<pad>"
BOS_TOKEN = "<bos>"
EOS_TOKEN = "<eos>"
UNK_TOKEN = "<unk>"

# -------------------------------
# 1. Load and preprocess captions from JSON
# -------------------------------
# Read the caption annotations. The file is decoded explicitly as UTF-8 so the
# result does not depend on the platform's default text encoding.
# TRAIN_VIDEO_LIST and valid_video_ids must already be defined at this point.
with open(TRAIN_VIDEO_LIST, "r", encoding="utf-8") as f:
    data = json.load(f)

# Build a list of (caption, video_id) pairs.
# Each caption is wrapped in <bos>/<eos>. Only captions whose wrapped length is
# between min_caption_tokens and max_length tokens are kept; using max_length
# here (rather than a literal) keeps the filter in sync with the padding length.
min_caption_tokens = 6
train_list = []
for item in data:
    video_id = item["id"]
    if video_id.split(".")[0] not in valid_video_ids:
        continue  # skip video not in training set
    for caption in item["caption"]:
        caption_full = BOS_TOKEN + " " + caption + " " + EOS_TOKEN
        if min_caption_tokens <= len(caption_full.split()) <= max_length:
            train_list.append((caption_full, video_id))

print("Total valid caption–video pairs:", len(train_list))

# Shuffle and split into training and validation sets.
# NOTE(review): the shuffle is unseeded, so the split differs between runs, and
# it is done per caption, so captions of the same video can end up in both
# sets. Validation loss may therefore be optimistic; consider splitting by
# video id if that matters.
random.shuffle(train_list)
split_index = int(len(train_list) * train_split)
training_list = train_list[:split_index]
validation_list = train_list[split_index:]
print("Training samples:", len(training_list), "Validation samples:", len(validation_list))

# -------------------------------
# 2. Build Vocabulary from training captions
# -------------------------------
# Count words from training captions
# Word frequencies over the training captions only (validation captions must
# not influence the vocabulary).
counter = Counter()
for caption, _ in training_list:
    counter.update(caption.split())

# Special tokens are added manually below, so drop them from the frequency
# table. The list order defines their indices: PAD=0, BOS=1, EOS=2, UNK=3.
special_tokens = [PAD_TOKEN, BOS_TOKEN, EOS_TOKEN, UNK_TOKEN]
for token in special_tokens:
    counter.pop(token, None)

# Keep the most frequent words that still fit next to the special tokens.
# The budget is derived from the list above instead of a literal so the two
# cannot drift apart.
num_allowed = num_decoder_tokens - len(special_tokens)
most_common = counter.most_common(num_allowed)
vocab_words = [w for w, _ in most_common]

# Build vocabulary: special tokens first, then words in descending frequency.
vocab = {token: index for index, token in enumerate(special_tokens + vocab_words)}

vocab_size = len(vocab)
print("Vocabulary size:", vocab_size)

def caption_to_seq(caption, vocab, max_length):
    """
    Encode a whitespace-tokenised caption as a fixed-length list of word ids.

    Words missing from ``vocab`` map to the id of UNK_TOKEN. Captions shorter
    than ``max_length`` are right-padded with the id of PAD_TOKEN; longer ones
    are cut off after ``max_length`` tokens.
    """
    encoded = [vocab.get(token, vocab[UNK_TOKEN]) for token in caption.split()]
    shortfall = max_length - len(encoded)
    if shortfall > 0:
        # Too short: fill the remaining positions with padding.
        return encoded + [vocab[PAD_TOKEN]] * shortfall
    # Long enough already: keep only the leading max_length ids.
    return encoded[:max_length]

# Persist the vocabulary next to the training data so inference can reuse it.
vocab_file = os.path.join(train_path, "vocab.pkl")
with open(vocab_file, "wb") as f:
    pickle.dump(vocab, f)

# -------------------------------
# 3. Load Pre-Extracted Features
# -------------------------------
# x_data maps a video ID (file name minus the ".npy" suffix) to the array
# stored in that file.
feature_suffix = ".npy"
x_data = {
    name[:-len(feature_suffix)]: np.load(os.path.join(FEAT_DIR, name))
    for name in os.listdir(FEAT_DIR)
    if name.endswith(feature_suffix)
}
print("Loaded features for", len(x_data), "videos.")

# -------------------------------
# 4. Create PyTorch Dataset and DataLoader
# -------------------------------
class VideoCaptionDataset(Dataset):
    """Dataset of (video features, decoder input ids, decoder target ids).

    Args:
        caption_list: Sequence of ``(caption, video_id)`` pairs.
        x_data: Mapping from video ID (without file extension) to its features.
        vocab: Mapping from word to integer id.
        max_length: Length every caption is padded/truncated to.
    """

    def __init__(self, caption_list, x_data, vocab, max_length):
        self.caption_list = caption_list
        self.x_data = x_data
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        """Return the number of caption-video pairs."""
        return len(self.caption_list)

    def __getitem__(self, idx):
        """Return the training triple for the pair at position ``idx``.

        Raises:
            ValueError: If no features are stored for the pair's video.
        """
        caption, raw_id = self.caption_list[idx]
        # Everything from the first "." onwards is treated as a file extension.
        key = raw_id.partition(".")[0]
        if key not in self.x_data:
            raise ValueError(f"Feature for video ID {key} not found.")

        frames = torch.tensor(self.x_data[key], dtype=torch.float)
        token_ids = torch.tensor(
            caption_to_seq(caption, self.vocab, self.max_length),
            dtype=torch.long,
        )
        # Teacher forcing: the decoder sees tokens 0..n-2 and must predict
        # tokens 1..n-1, so both slices have length max_length - 1.
        return frames, token_ids[:-1], token_ids[1:]

train_dataset = VideoCaptionDataset(training_list, x_data, vocab, max_length)
val_dataset = VideoCaptionDataset(validation_list, x_data, vocab, max_length)

# Training drops the final partial batch so every optimisation step sees a
# full batch.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
# Validation keeps every sample: dropping the last batch would silently ignore
# data and, for a validation set smaller than batch_size, leave the loader
# with no batches at all.
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=False)

# -------------------------------
# 5. Define the Encoder–Decoder (LSTM–LSTM) Model in PyTorch
# -------------------------------
class BiLSTMEncoder(nn.Module):
    """Bidirectional LSTM encoder that condenses a feature sequence into one state.

    Args:
        input_size: Size of each input feature vector.
        hidden_size: Hidden size of each LSTM direction.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, num_layers=1):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)

    def forward(self, x):
        """Encode ``x`` of shape (batch, time, input_size).

        Returns:
            Tuple ``(h, c)``, each of shape (1, batch, 2 * hidden_size), holding
            the concatenated forward/backward final states of the top layer.
        """
        _, (h, c) = self.lstm(x)
        # nn.LSTM stacks final states as
        # [layer0_fwd, layer0_bwd, ..., top_fwd, top_bwd], so the top layer is
        # always the last two entries. Indexing with 0/1 would pick the first
        # layer instead whenever num_layers > 1.
        h_cat = torch.cat((h[-2], h[-1]), dim=-1)
        c_cat = torch.cat((c[-2], c[-1]), dim=-1)
        # Re-add the layer dimension expected by a single-layer decoder LSTM.
        return h_cat.unsqueeze(0), c_cat.unsqueeze(0)


class Decoder(nn.Module):
    """LSTM decoder that maps token ids to next-token logits.

    Args:
        vocab_size: Number of entries in the vocabulary (size of the output layer).
        embed_size: Dimension of the token embeddings.
        hidden_size: Hidden size of the LSTM; must match the width of the
            initial state passed to ``forward``.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability between LSTM layers. It only takes effect
            when ``num_layers > 1``.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1, dropout=0.5):
        super().__init__()
        # Index 0 is the padding token; its embedding stays zero.
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        # nn.LSTM applies dropout only between layers; passing a non-zero value
        # for a single layer changes nothing and emits a UserWarning.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, dropout=lstm_dropout)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        """Run the decoder over a batch of token id sequences.

        Args:
            x: Token ids of shape (batch, seq_len).
            hidden: Initial hidden state of shape (num_layers, batch, hidden_size).
            cell: Initial cell state of shape (num_layers, batch, hidden_size).

        Returns:
            Tuple ``(logits, h, c)`` where ``logits`` has shape
            (batch, seq_len, vocab_size) and ``h``/``c`` are the final states.
        """
        embedded = self.embedding(x)  # (batch, seq_len, embed_size)
        outputs, (h, c) = self.lstm(embedded, (hidden, cell))
        logits = self.fc(outputs)     # (batch, seq_len, vocab_size)
        return logits, h, c

class Seq2Seq(nn.Module):
    """Glue module: encode the video features, then decode the caption.

    Args:
        encoder: Module returning an ``(hidden, cell)`` pair for its input.
        decoder: Module called as ``decoder(tokens, hidden, cell)`` and
            returning a 3-tuple whose first element is the logits.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, encoder_input, decoder_input):
        """Return the decoder logits for teacher-forced ``decoder_input``."""
        init_hidden, init_cell = self.encoder(encoder_input)
        # The decoder's final states are not needed for training.
        logits, _final_hidden, _final_cell = self.decoder(decoder_input, init_hidden, init_cell)
        return logits

# Hyperparameters for the model
embed_size = 256
encoder = BiLSTMEncoder(num_encoder_tokens, latent_dim)
# The encoder concatenates its forward and backward final states, so the state
# it hands to the decoder is 2 * latent_dim wide. The decoder LSTM must use
# exactly that hidden size, otherwise the initial state is rejected.
decoder = Decoder(vocab_size, embed_size, 2 * latent_dim)
model = Seq2Seq(encoder, decoder)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(model)

# -------------------------------
# 6. Loss, Optimizer, and Training Loop
# -------------------------------
criterion = nn.CrossEntropyLoss(ignore_index=0)  # ignore PAD token (index 0)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

def train_epoch(model, dataloader, optimizer, criterion, device):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: Module called as ``model(encoder_input, decoder_input)`` and
            returning logits of shape (batch, seq_len, num_classes).
        dataloader: Iterable of ``(encoder_input, decoder_input, decoder_target)``
            tensor batches.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Loss taking ``(N, num_classes)`` logits and ``(N,)`` targets.
        device: Device the batches are moved to.

    Returns:
        Mean of the per-batch losses.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.train()
    epoch_loss = 0.0
    num_batches = 0
    for encoder_input, decoder_input, decoder_target in dataloader:
        encoder_input = encoder_input.to(device)   # (B, time_steps, num_encoder_tokens)
        decoder_input = decoder_input.to(device)   # (B, max_length-1)
        decoder_target = decoder_target.to(device) # (B, max_length-1)

        optimizer.zero_grad()
        outputs = model(encoder_input, decoder_input)  # (B, seq_len, num_classes)
        # Flatten batch and time. The class count is read from the logits
        # themselves so the function does not depend on a module-level global.
        flat_logits = outputs.reshape(-1, outputs.size(-1))
        flat_targets = decoder_target.reshape(-1)
        loss = criterion(flat_logits, flat_targets)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        num_batches += 1
    if num_batches == 0:
        raise ValueError(
            "dataloader yielded no batches; is the dataset smaller than "
            "batch_size while drop_last=True?"
        )
    return epoch_loss / num_batches

def evaluate(model, dataloader, criterion, device):
    """Compute the mean per-batch loss of ``model`` on ``dataloader`` without training.

    Args:
        model: Module called as ``model(encoder_input, decoder_input)`` and
            returning logits of shape (batch, seq_len, num_classes).
        dataloader: Iterable of ``(encoder_input, decoder_input, decoder_target)``
            tensor batches.
        criterion: Loss taking ``(N, num_classes)`` logits and ``(N,)`` targets.
        device: Device the batches are moved to.

    Returns:
        Mean of the per-batch losses.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for encoder_input, decoder_input, decoder_target in dataloader:
            encoder_input = encoder_input.to(device)
            decoder_input = decoder_input.to(device)
            decoder_target = decoder_target.to(device)
            outputs = model(encoder_input, decoder_input)
            # Flatten batch and time. The class count is read from the logits
            # themselves so the function does not depend on a module-level global.
            flat_logits = outputs.reshape(-1, outputs.size(-1))
            flat_targets = decoder_target.reshape(-1)
            total_loss += criterion(flat_logits, flat_targets).item()
            num_batches += 1
    if num_batches == 0:
        raise ValueError(
            "dataloader yielded no batches; is the dataset smaller than "
            "batch_size while drop_last=True?"
        )
    return total_loss / num_batches

# Train for num_epochs, checkpointing the full model whenever the validation
# loss reaches a new minimum.
best_val_loss = float("inf")
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    val_loss = evaluate(model, val_loader, criterion, device)
    print(f"Epoch {epoch+1}/{num_epochs}: Train Loss {train_loss:.4f}, Val Loss {val_loss:.4f}")
    improved = val_loss < best_val_loss
    if improved:
        best_val_loss = val_loss
        best_checkpoint = os.path.join(train_path, "best_seq2seq_model.pth")
        torch.save(model.state_dict(), best_checkpoint)

# -------------------------------
# 7. Save final model and vocabulary
# -------------------------------
# The weights written here are those of the last epoch, stored separately for
# the encoder and the decoder, together with a copy of the vocabulary.
# (An earlier variant wrote to a "modelLSTM_LSTM" folder under train_path.)
save_model_path = os.path.join(os.getcwd(), "model_final_2")
os.makedirs(save_model_path, exist_ok=True)

final_weights = (
    (encoder, "encoder_model_LSTM_LSTM.pth"),
    (decoder, "decoder_model_LSTM_LSTM.pth"),
)
for module, weights_name in final_weights:
    torch.save(module.state_dict(), os.path.join(save_model_path, weights_name))

vocab_copy = os.path.join(save_model_path, "vocab.pkl")
with open(vocab_copy, "wb") as f:
    pickle.dump(vocab, f)

print("Training complete and models saved.")


FileNotFoundError: [Errno 2] No such file or directory: 'training_data\\training_label.json'