# Setup

In [1]:
!pip install torch
!pip install numpy
!pip install pandas
!pip install nltk

Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)
  Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-nvjitlink-cu12==12.4.127 (from torch)
  Downloading nvidia_nvjitlink_cu12-12.4.127-py3-n

In [1]:
import re
import torch
import argparse
import numpy as np
import torch.nn.functional as F
import itertools
import collections
import pandas as pd
from torch.utils.data import Dataset, DataLoader, Subset
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer
from torch.optim.lr_scheduler import CosineAnnealingLR
import nltk
from nltk.stem import WordNetLemmatizer

In [2]:
nltk.download('wordnet')
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


In [3]:
PAD_TOKEN = '[PAD]'
UNK_TOKEN = '[UNK]'

auto_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

# Utils.py

In [17]:
def cuda(args, tensor):
    """
    Places tensor on CUDA device (by default, uses cuda:0).
    
    Returns:
        Tensor on CUDA device.
    """
    if args.use_gpu and torch:
        return tensor.cuda()
    else:
        return tensor

def unpack(tensor):
    """
    Unpacks a tensor into a Python list.

    Args:
        tensor: PyTorch tensor.

    Returns:
        Python list with tensor contents.
    """
    if tensor.requires_grad:
        tensor = tensor.detach()
    return tensor.cpu().numpy().tolist()

def load_embeddings(path):
    """
    Loads GloVe-style embeddings into memory.
    Args:
        path: Embedding path, e.g. "glove/glove.6B.300d.txt".

    Returns:
        Dictionary mapping words (strings) to vectors (list of floats).
    """
    embedding_map = {}
    with open(path, 'r', encoding="utf-8") as f:
        for line in f:
            try:
                pieces = line.rstrip().split()
                word = pieces[0].lower()  # Normalize to lowercase
                embedding_map[word] = [float(weight) for weight in pieces[1:]]
                
                # Also store lemma if different
                lemma = lemmatizer.lemmatize(word)
                if lemma != word and lemma not in embedding_map:
                    embedding_map[lemma] = [float(weight) for weight in pieces[1:]]
            except:
                pass
    return embedding_map

def embed_batch(embedding_map, embedding_layer, batch_token_ids, idx2word, embed_dim):
    """
    Iteratively converts a batch of token id sequences into their embeddings.

    Args:
        embedding_map (dict): Mapping from to embedding vectors.
        batch_token_ids (List[List[int]]): Batch where each element is a list of token ids.
        idx2word (dict): Mapping from token ID (int) to the corresponding word (str)
        embed_dim (int): The dimensionality of the embeddings.
    
    Returns:
        Numpy array of shape (batch_size, seq_len, embed_dim) containing the embeddings.
    """
    batch_embeddings = []
    
    for token_ids in batch_token_ids:
        sequence_embeddings = []
        for token_id in token_ids:
            # Retrieve the corresponding word for the token id.
            token = idx2word.get(token_id.item(), None)
            # print("Token", token_id.item(), token)
            if token is None and token not in embedding_map:
                token_embedding = np.zeros(embed_dim)
            else:
                try:
                    token_tensor = torch.tensor([token_id.item()], device=device)
                    token_embedding = embedding_layer(token_tensor).squeeze(0).cpu().detach().numpy()
                except Exception as e:
                    print(f"Token ID {token_id} caused error: {e}")
                    token_embedding = np.zeros(embed_dim)

            sequence_embeddings.append(token_embedding)
        batch_embeddings.append(sequence_embeddings)
    return np.array(batch_embeddings)

def co_attention(context_embedding, question_embedding, conv=True):
    """
    Co-attention mechanism that computes attention between context and question encodings.
    If `convolution=True`, applies local smoothing to the affinity matrix.

    Args:
        context_embedding (Tensor): (B, context_len, d)
        question_embedding (Tensor): (B, question_len, d)
        convolution (bool): whether to apply convolution-based smoothing.

    Returns:
        CP (Tensor): passage attention context
        E_Out (Tensor): final encoder output
    """
    # Step 1: Affinity matrix A ∈ (B, context_len, question_len)
    A = torch.bmm(context_embedding, question_embedding.transpose(1, 2))
    # print("context_embedding = ", context_embedding[0])
    # print("question_embedding = ", question_embedding[0])
    
    # print("context_embedding:", context_embedding)
    # print("question_embedding:", question_embedding)
    # print("Affinity range:", A.min().item(), A.max().item())

    # Apply learned smoothing
    if conv:
        # A = conv_co_attention(A)
        A = conv_co_attention(A)

    # Step 2: Passage-to-question attention (row-wise)
    A_P = F.softmax(A, dim=2)

    # Step 3: Question-to-passage attention (column-wise)
    A_Q = F.softmax(A.transpose(1, 2), dim=2)

    # Step 4: Passage attention context: CP = H^P × A^Q
    # print("Context Embedding Shape", context_embedding.shape)
    # print("Question Embedding Shape", question_embedding.shape)
    # print("A_Q Shape", A_Q.shape)
    CP = torch.bmm(A_Q, context_embedding)
    # print("CP Shape", CP.shape)  # (B, Lq, d)

    # Step 5: Encoder output: concat(H^P, [H^Q; CP] × A^P)
    # QC = torch.cat([question_embedding, CP], dim=1)
    QC_1 = torch.bmm(A_P, question_embedding)  # (B, Lq, d)
    # print("QC_1 Shape", QC_1.shape)

    QC_2 = torch.bmm(A_P, CP)  # (B, Lq, d)
    # print("QC_2 Shape", QC_2.shape)

    # QC = torch.cat([QC_1, QC_2], dim=1) # (B, Lq, 2d)
    QC = torch.cat([QC_1, QC_2], dim=-1)  # (B, Lq, 2d)
    # print("QC Shape", QC.shape)

    # Final encoder output
    # E_Out = torch.cat([context_embedding, QC], dim=2)
    E_Out = torch.cat([context_embedding, QC], dim=-1)  # (B, Lq, 3d)
    E_Out = nn.LayerNorm(E_Out.shape[-1]).to(E_Out.device)(E_Out)
    E_Out = torch.tanh(E_Out)  # Apply non-linearity
    # print("E_Out Shape", E_Out.shape)

    return CP, E_Out

def create_gaussian_kernel(kernel_width, device, sigma=1.0):
    """Creates a 1D Gaussian kernel."""
    x = torch.arange(-kernel_width//2 + 1, kernel_width//2 + 1, dtype=torch.float, device=device)
    kernel = torch.exp(-x**2 / (2*sigma**2))
    kernel /= kernel.sum()  # Normalize to sum to 1
    return kernel.view(1, 1, -1)

def conv_co_attention(A, kernel_width=11):
    """
    Enhanced convolution to shift attention to neighboring words.
    Applies 1D convolution along context dimension per question word.
    """
    B, Lp, Lq = A.shape
    # Permute A for per-question-word processing: (B, Lq, Lp) -> (B*Lq, 1, Lp)
    A_reshaped = A.permute(0, 2, 1).reshape(-1, 1, Lp)
    
    # Create Gaussian kernel with odd kernel width (e.g., 11)
    kernel = create_gaussian_kernel(kernel_width, A.device, sigma=1.0)
    
    # Use symmetric padding that keeps the sequence length unchanged.
    padded_length = (kernel_width - 1) // 2
    smoothed_A = F.conv1d(A_reshaped, kernel, padding=padded_length)
    
    # Reshape back: current shape is (B*Lq, 1, Lp) --> (B, Lq, Lp) then permute to (B, Lp, Lq)
    smoothed_A = smoothed_A.view(B, Lq, Lp).permute(0, 2, 1)
    A_adjusted = A + smoothed_A  # Enhance original scores with neighbor context
    return F.softmax(A_adjusted, dim=-1)

def tokenize_with_bert(text):
    # Tokenize the text and request offset mappings.
    encoding = auto_tokenizer(
        text,
        return_offsets_mapping=True,
        add_special_tokens=False  # Disable adding special tokens to mimic simple whitespace tokenization.
    )
    
    # Retrieve the tokens.
    tokens = auto_tokenizer.convert_ids_to_tokens(encoding['input_ids'])
    
    # Retrieve the spans from the offset mapping.
    spans = encoding['offset_mapping']
    return tokens, spans
    
def create_embedding_matrix(vocab, embedding_map, embedding_dim=300, scale=0.6):
    """Initialize embedding matrix with:
    - GloVe vectors for known words
    - Random vectors for UNK tokens
    - Zero vector for padding
    """
    # Initialize with random normal distribution (match GloVe scale)
    embedding_matrix = np.random.normal(
        scale=scale, 
        size=(len(vocab), embedding_dim)
    )
    
    # Handle special tokens
    embedding_matrix[vocab.encoding[PAD_TOKEN]] = np.zeros(embedding_dim)
    unk_idx = vocab.encoding[UNK_TOKEN]
    embedding_matrix[unk_idx] = np.random.normal(scale=scale, size=embedding_dim)
    
    for word, idx in vocab.encoding.items():
        if word in [PAD_TOKEN, UNK_TOKEN]:
            continue
            
        # Try direct match
        if word in embedding_map:
            embedding_matrix[idx] = embedding_map[word]
            continue
            
        # Try lemma
        lemma = lemmatizer.lemmatize(word)
        if lemma in embedding_map:
            embedding_matrix[idx] = embedding_map[lemma]
            continue
            
        # Try lowercase lemma
        lower_lemma = lemmatizer.lemmatize(word.lower())
        if lower_lemma in embedding_map:
            embedding_matrix[idx] = embedding_map[lower_lemma]

    return torch.tensor(embedding_matrix, dtype=torch.float32)

# Data.py

In [6]:
class Vocabulary:
    """
    Creates mappings for words → indices and indices → words.
    """
    def __init__(self, samples, vocab_size):
        self.samples = samples
        self.vocab_size = vocab_size
        self.words = self._initialize(samples, vocab_size)
        self.encoding = {word: idx for idx, word in enumerate(self.words)}
        self.decoding = {idx: word for idx, word in enumerate(self.words)}

    def _initialize(self, samples, vocab_size):
        """Build vocabulary with lemma support"""
        embedding_map = load_embeddings("/kaggle/input/glove/other/default/1/glove.6B.300d.txt")
        vocab_counts = collections.defaultdict(int)
        
        for _, row in samples.iterrows():
            # Get base tokens
            tokens = re.findall(r"\w+(?:[-']\w+)*", row['context'].lower()) + \
                     re.findall(r"\w+(?:[-']\w+)*", row['question'].lower())
            
            # Count both original and lemma forms
            for token in tokens:
                vocab_counts[token] += 1
                lemma = lemmatizer.lemmatize(token)
                if lemma != token:
                    vocab_counts[lemma] += 0.5  # Partial count for lemmas
        
        # Sort by combined frequency
        sorted_words = sorted(vocab_counts.items(), 
                            key=lambda x: (-x[1], x[0]))[:vocab_size-2]
        
        return [PAD_TOKEN, UNK_TOKEN] + [w[0] for w in sorted_words]
        
    def __len__(self):
        return len(self.words)

class Tokenizer:
    """
    Converts lists of words to indices and vice versa.
    """
    def __init__(self, vocabulary):
        self.vocabulary = vocabulary
        self.pad_token_id = vocabulary.encoding[PAD_TOKEN]
        self.unk_token_id = vocabulary.encoding[UNK_TOKEN]

    def convert_tokens_to_ids(self, tokens):
        return [self.vocabulary.encoding.get(token.lower(), self.unk_token_id) for token in tokens]

    def convert_ids_to_tokens(self, token_ids):
        return [self.vocabulary.decoding.get(token_id, UNK_TOKEN) for token_id in token_ids]

class QADataset(Dataset):
    """
    Data generator for a QA task; the JSON file should contain character-level answer indices.
    """
    def __init__(self, path):
        # Load JSON-lines file; each line is a JSON object.
        self.samples = pd.read_json(path, lines=True)
        self.tokenizer = None
        # Default pad token id; updated after tokenizer registration.
        self.pad_token_id = 0

    def _collate_batch(self, batch):
        batch = [sample for sample in batch if sample is not None]
        if len(batch) == 0:
            return None  # All samples failed
    
        max_context_len = max(sample['context'].size(0) for sample in batch)
        max_question_len = max(sample['question'].size(0) for sample in batch)
    
        contexts = torch.stack([
            torch.cat([
                sample['context'],
                torch.full((max_context_len - sample['context'].size(0),), self.pad_token_id, dtype=torch.long)
            ]) for sample in batch
        ])
    
        questions = torch.stack([
            torch.cat([
                sample['question'],
                torch.full((max_question_len - sample['question'].size(0),), self.pad_token_id, dtype=torch.long)
            ]) for sample in batch
        ])
    
        answer_starts = torch.stack([sample['answer_start'] for sample in batch])
        answer_ends = torch.stack([sample['answer_end'] for sample in batch])
    
        return {
            'context': contexts,
            'question': questions,
            'answer_start': answer_starts,
            'answer_end': answer_ends
        }

    def register_tokenizer(self, tokenizer):
        """
        Registers a Tokenizer instance and updates pad token id.
        """
        self.tokenizer = tokenizer
        self.pad_token_id = tokenizer.pad_token_id

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples.iloc[idx]
        context_str = sample['context']
        question_str = sample['question']
        answers = sample['answers']
    
        context_tokens, context_spans = tokenize_with_bert(context_str)
        question_tokens, _ = tokenize_with_bert(question_str)
        context_indices = self.tokenizer.convert_tokens_to_ids(context_tokens)
        question_indices = self.tokenizer.convert_tokens_to_ids(question_tokens)
    
        answer_text = answers['text'][0].strip()
        answer_tokens, _ = tokenize_with_bert(answer_text)
    
        context_tokens_lower = [t.lower() for t in context_tokens]
        answer_tokens_lower = [t.lower() for t in answer_tokens]
    
        token_start, token_end = -1, -1
        for i in range(len(context_tokens_lower) - len(answer_tokens_lower) + 1):
            if context_tokens_lower[i:i+len(answer_tokens_lower)] == answer_tokens_lower:
                token_start = i
                token_end = i + len(answer_tokens_lower) - 1
                break
    
        if token_start == -1 or token_end == -1:
            # skip
            return None
    
        return {
            'context': torch.tensor(context_indices, dtype=torch.long),
            'question': torch.tensor(question_indices, dtype=torch.long),
            'answer_start': torch.tensor(token_start, dtype=torch.long),
            'answer_end': torch.tensor(token_end, dtype=torch.long),
        }


# Model.py

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class BiLSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout_prob=0.3):
        super(BiLSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Create a bidirectional LSTM layer; note batch_first=True keeps tensors as (batch, seq, feature)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        
        # Dropout layer
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x, return_sequence=True):
        """
        Args:
            x: Input tensor of shape (batch_size, seq_len, input_size).
            return_sequence: If True, return the entire sequence; if False, return the last hidden state.

        """
        # Initialize hidden state and cell state
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(x.device)

        # Run the input sequence through the LSTM layer
        output, _ = self.lstm(x, (h0, c0))
        
        # Apply dropout to the output of the LSTM layer
        output = self.dropout(output)

        # Return the entire sequence
        return output # Shape: (batch_size, seq_len, hidden_size * 2)

class QAModel(nn.Module):
    def __init__(self, vocab_size, vocab_decoder, embedding_dim, hidden_size, num_layers, output_size=300, dropout_prob=0.3):
        super(QAModel, self).__init__()

        self.vocab_size = vocab_size
        self.vocab_decoder = vocab_decoder

        # Embedding map using pre-trained GloVe embeddings
        self.embedding_map = load_embeddings("/kaggle/input/glove/other/default/1/glove.6B.300d.txt")

        # Initialize Embedding layer
        self.embedding_layer = nn.Embedding(
            num_embeddings = len(vocab),
            embedding_dim = embedding_dim,
            padding_idx = vocab.encoding[PAD_TOKEN]
        )

        # Load pretrained weights
        embedding_matrix = create_embedding_matrix(vocab, self.embedding_map)
        self.embedding_layer.weight.data.copy_(embedding_matrix)

        # Freeze only GloVe vectors
        for i, word in enumerate(vocab.words):
            if word in self.embedding_map and word != UNK_TOKEN:
                self.embedding_layer.weight.requires_grad_(False)

        # Context Modeling
        self.start_decoder = BiLSTMModel(embedding_dim * 3, hidden_size, num_layers, output_size=output_size)
        self.end_decoder = BiLSTMModel(embedding_dim * 6, hidden_size, num_layers, output_size)

        # Prediction Layers - two linear layers for start and end index predictions.
        self.start_linear = nn.Linear(3 * embedding_dim + 2 * hidden_size, 1)
        self.end_linear = nn.Linear(3 * embedding_dim + 2 * hidden_size, 1)
    
    def forward(self, context_ids, question_ids):
        """
        Forward pass that includes contextual encoding.

        Args:
            context_ids (Tensor): shape (batch, context_len)
            question_ids (Tensor): shape (batch, question_len)

        Returns:
            start_logits: Tensor of shape (batch, context_len)
            end_logits: Tensor of question word IDs.
            affinity: (batch, context_len, question_len)
        """
        ### Word Embedding
        context_emb_np = embed_batch(embedding_map=self.embedding_map, embedding_layer=self.embedding_layer, batch_token_ids=context_ids, idx2word=self.vocab_decoder, embed_dim=300)
        question_emb_np = embed_batch(embedding_map=self.embedding_map, embedding_layer=self.embedding_layer, batch_token_ids=question_ids, idx2word=self.vocab_decoder, embed_dim=300)

        # Convert numpy arrays to torch tensors (and ensure they are float type).
        context_emb = torch.from_numpy(context_emb_np).float().contiguous().to(device)
        question_emb = torch.from_numpy(question_emb_np).float().contiguous().to(device)

        # --- Encoder: Contextual Embedding via CoAttention ---
        # passage_attention_context, encoder_out = co_attention(context_emb, question_emb, True)
        passage_attention_context, encoder_out = co_attention(
            context_emb, question_emb, conv=True
        )
        # encoder_out shape: (B, L, 3*embedding_dim)

        # --- Decoder for the Start Index Prediction ---
        start_decoded = self.start_decoder(encoder_out)
        # start_decoded shape: (B, L, 2*hidden_size)

        # Concatenate encoder output with the decoded representation
        start_input = torch.cat([encoder_out, start_decoded], dim=-1)  # (B, L, 3*embedding_dim + 2*hidden_size)
        start_logits = self.start_linear(start_input).squeeze(-1)
        start_probs = F.softmax(start_logits, dim=-1).unsqueeze(-1)

        # Compute weighted summary using start_probs
        start_summary = torch.sum(encoder_out * start_probs, dim=1, keepdim=True)
        start_summary_expanded = start_summary.repeat(1, encoder_out.size(1), 1)  # (B, L, 3*embedding_dim)

        # --- Prepare Features for End Index Prediction ---
        combined_for_end = torch.cat([encoder_out, start_summary_expanded], dim=-1)  # (B, L, 6*embedding_dim)
        
        # --- Decoder for the End Index Prediction ---
        end_decoded = self.end_decoder(combined_for_end)
        
        # Concatenate encoder output with the end decoded representation
        # (Note: ensure arguments to torch.cat are provided as a list)
        end_input = torch.cat([encoder_out, end_decoded], dim=-1)  # (B, L, 3*embedding_dim + 2*hidden_size)
        end_logits = self.end_linear(end_input).squeeze(-1)

        return start_logits, end_logits

# Main.py

In [8]:
# Load the datasets
train_dataset = QADataset(path="/kaggle/input/squad-v2/train.json")

# Instantiate the vocabulary and tokenizer
vocab = Vocabulary(train_dataset.samples, vocab_size=200000)
tokenizer = Tokenizer(vocab)
train_dataset.tokenizer = tokenizer

In [15]:
import copy

# Make a shallow copy of the dataset object
filtered_train_dataset = copy.deepcopy(train_dataset)

# Replace only the samples with the filtered ones
filtered_train_dataset.samples = train_dataset.samples[
    train_dataset.samples['context'].apply(lambda x: len(x) <= 400)
].reset_index(drop=True)

print("Number of samples:", len(filtered_train_dataset.samples))

Number of samples: 17822


In [48]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(torch.cuda.is_available())        # Should return True
print(torch.cuda.device_count())        # Should be > 0
print(torch.cuda.get_device_name(0))    # Should show your NVIDIA GPU
print(f"Using device: {device}")

# Hyperparameters
vocab_size = 200000
embedding_dim = 300         # Embedding size for word embeddings
hidden_size = 128           # Hidden size for LSTM
num_layers = 1              # Number of LSTM layers
output_size = 400           # Not used in our prediction layers for QA; prediction layers output logits per token
batch_size = 32
learning_rate = 0.001

# Create DataLoader
train_loader = DataLoader(filtered_train_dataset, batch_size=batch_size, shuffle=True, collate_fn=train_dataset._collate_batch)

# Initialize the model
model = QAModel(vocab_size=vocab_size,
                vocab_decoder=vocab.decoding,
                embedding_dim=embedding_dim,
                hidden_size=hidden_size,
                num_layers=num_layers,
                output_size=output_size).to(device)

# Loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = CosineAnnealingLR(optimizer, T_max=10)

# Checkpoint directory (modify as needed)
checkpoint_dir = "/kaggle/working/model/"
import os
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

# Number of training epochs
num_epochs = 15
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    valid_sample_count = 0

    print(f"Training Epoch {epoch + 1}/{num_epochs}...")

    # Iterate over the DataLoader
    for batch in train_loader:
        if batch is None:
            continue  # skip batch if all samples failed
                    
        batch_size = batch['context'].size(0)
        valid_sample_count += batch_size  # <-- Count valid samples
            
        # Unpack the batch
        contexts = batch['context'].to(device)        # shape: (batch_size, context_len)
        questions = batch['question'].to(device)       # shape: (batch_size, question_len)
        start_positions = batch['answer_start'].to(device)  # shape: (batch_size,)
        end_positions = batch['answer_end'].to(device)      # shape: (batch_size,)

        # Forward pass: model returns start_logits, end_logits
        start_logits, end_logits = model(contexts, questions)

        def enforce_position_constraints(end_logits, start_positions):
            """
            Mask positions in end_logits that are before the corresponding start positions.
            
            Args:
                end_logits (Tensor): shape (batch_size, seq_len)
                start_positions (Tensor): shape (batch_size,)
                
            Returns:
                Tensor of the same shape as end_logits with positions before the start masked to -inf.
            """
            # Ensure end_probs is 2D (batch_size, seq_len)
            if end_logits.dim() == 3:
                end_logits = end_logits.squeeze(-1)
                
            batch_size, seq_len = end_logits.shape
            
            # Create a tensor of position indices for each sequence
            positions = torch.arange(seq_len, device=end_logits.device).unsqueeze(0).expand(batch_size, seq_len)
            
            # Create a mask where positions are before the corresponding start position
            mask = positions < start_positions.unsqueeze(1)
            
            # Mask out positions by setting them to -inf
            end_logits = end_logits.masked_fill(mask, -float('inf'))
            
            return end_logits
            
        end_logits = enforce_position_constraints(end_logits, start_positions)
        
        # Compute loss for start and end positions
        start_loss = F.cross_entropy(start_logits, start_positions, reduction='none')
        end_loss = F.cross_entropy(end_logits, end_positions, reduction='none')
        ce_loss = 0.7 * start_loss.mean() + 0.3 * end_loss.mean()

        # Span length regularization
        pred_lengths = end_logits.argmax(-1) - start_logits.argmax(-1)
        length_loss = F.relu(1 - pred_lengths.float()).mean()  # Penalize invalid spans

        loss = ce_loss + 0.1 * length_loss

        # -------------------------------------------------------------------------
        # Additional Text Loss based on predicted and ground truth answer spans.
        # -------------------------------------------------------------------------
        
        # Retrieve the context token embeddings using the model's embedding layer.
        # Assumes model.embedding is the embedding layer and returns (batch_size, seq_len, embedding_dim)
        context_embeddings = model.embedding_layer(contexts)
        
        # Determine predicted answer spans from model outputs.
        pred_start_indices = start_logits.argmax(dim=1)  # shape: (batch_size,)
        pred_end_indices   = end_logits.argmax(dim=1)      # shape: (batch_size,)
    
        predicted_embeddings = []
        gt_embeddings = []

        for i in range(contexts.size(0)):
            # ---------------------------
            # Predicted answer embedding
            # ---------------------------
            pred_s = pred_start_indices[i].item()
            pred_e = pred_end_indices[i].item()
            
            # Ensure a valid span; if not, fallback to a single token.
            if pred_e < pred_s:
                pred_e = pred_s
                
            pred_span_embeds = context_embeddings[i, pred_s:pred_e+1, :]  # (span_length, embedding_dim)
            
            if pred_span_embeds.size(0) > 0:
                pred_emb = pred_span_embeds.mean(dim=0)
            else:
                pred_emb = context_embeddings[i, pred_s, :]
                
            predicted_embeddings.append(pred_emb)
            
            # -------------------------------------
            # Ground truth answer embedding
            # -------------------------------------
            # Use provided start_positions and end_positions (which are word indices)
            gt_s = start_positions[i].item()
            gt_e = end_positions[i].item()
            
            if gt_e < gt_s:
                gt_e = gt_s
                
            gt_span_embeds = context_embeddings[i, gt_s:gt_e+1, :]  # (span_length, embedding_dim)
            
            if gt_span_embeds.size(0) > 0:
                gt_emb = gt_span_embeds.mean(dim=0)
            else:
                gt_emb = context_embeddings[i, gt_s, :]
                
            gt_embeddings.append(gt_emb)
        
        predicted_embeddings = torch.stack(predicted_embeddings)  # (batch_size, embedding_dim)
        gt_embeddings = torch.stack(gt_embeddings)                # (batch_size, embedding_dim)
        
        # Compute cosine similarity between predicted and ground truth embeddings.
        cos_sim = F.cosine_similarity(predicted_embeddings, gt_embeddings, dim=1)
        
        # Define text loss as 1 minus the cosine similarity (perfect match gives 0 loss).
        text_loss = (1 - cos_sim).mean()
        
        # Weight the text loss and add it to the overall loss.
        alpha = 0.3  # Adjust weighting factor as needed.
        loss = loss + alpha * text_loss

        # Backward pass and optimization
        loss.backward()
        max_grad_norm = 1.0

        # torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm, norm_type=2)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        running_loss += loss.item()
    
    avg_loss = running_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")
    print(f"✅ Valid training samples this epoch: {valid_sample_count}")

    # Save a checkpoint every 5 epochs (or customize as needed)
    if (epoch + 1) % 5 == 0:
        checkpoint_path = os.path.join(checkpoint_dir, f"qa_model_epoch_{epoch+1}.pt")
        checkpoint = {
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'vocab': vocab
        }
        torch.save(checkpoint, checkpoint_path)
        print(f"Checkpoint saved at {checkpoint_path}")


True
1
Tesla P100-PCIE-16GB
Using device: cuda
Training Epoch 1/15...
Epoch [1/15], Loss: 3.6734
✅ Valid training samples this epoch: 17722
Training Epoch 2/15...
Epoch [2/15], Loss: 3.2208
✅ Valid training samples this epoch: 17722
Training Epoch 3/15...
Epoch [3/15], Loss: 2.9229
✅ Valid training samples this epoch: 17722
Training Epoch 4/15...
Epoch [4/15], Loss: 2.6022
✅ Valid training samples this epoch: 17722
Training Epoch 5/15...
Epoch [5/15], Loss: 2.3117
✅ Valid training samples this epoch: 17722
Checkpoint saved at /kaggle/working/model/qa_model_epoch_5.pt
Training Epoch 6/15...
Epoch [6/15], Loss: 2.0362
✅ Valid training samples this epoch: 17722
Training Epoch 7/15...
Epoch [7/15], Loss: 1.7459
✅ Valid training samples this epoch: 17722
Training Epoch 8/15...
Epoch [8/15], Loss: 1.4959
✅ Valid training samples this epoch: 17722
Training Epoch 9/15...
Epoch [9/15], Loss: 1.2855
✅ Valid training samples this epoch: 17722
Training Epoch 10/15...
Epoch [10/15], Loss: 1.1087
✅ 

In [49]:
import shutil

shutil.make_archive('models', 'zip', '/kaggle/working/model/')

'/kaggle/working/models.zip'

In [51]:
from IPython.display import FileLink
FileLink(r'models.zip')

## Model Testing

In [52]:
# --- Utility functions ---
def compute_em(predicted, actual):
    return int(predicted.strip().lower() == actual.strip().lower())

def compute_f1(predicted, actual):
    pred_tokens = predicted.strip().lower().split()
    actual_tokens = actual.strip().lower().split()
    
    common = set(pred_tokens) & set(actual_tokens)
    if not common:
        return 0.0
    
    precision = len(common) / len(pred_tokens)
    recall = len(common) / len(actual_tokens)
    f1 = 2 * precision * recall / (precision + recall)
    return f1

In [53]:
# --- Setup ---
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Checkpoint directory (modify as needed)
checkpoint_dir = "/kaggle/working/out/"
import os
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

# Load validation dataset
val_dataset = QADataset(path="/kaggle/input/validation/validation.json")
checkpoint_dir = "/kaggle/working/model/"
checkpoints = sorted([f for f in os.listdir(checkpoint_dir) if f.endswith(".pt")])

# Prepare validation vocab from any one checkpoint (they all use same vocab)
sample_checkpoint = torch.load(os.path.join(checkpoint_dir, checkpoints[-1]), map_location=device)
val_vocab = sample_checkpoint['vocab']
val_tokenizer = Tokenizer(val_vocab)
val_dataset.tokenizer = val_tokenizer

# Filter short contexts only
val_dataset.samples = val_dataset.samples[val_dataset.samples['context'].apply(lambda x: len(x) <= 400)].reset_index(drop=True)

# Shared hyperparameters
vocab_size = 200000
embedding_dim = 300
hidden_size = 128
num_layers = 1
output_size = 400

# Output summary
summary = []

# === Loop Over Checkpoints ===
for ckpt_file in checkpoints:
    print(f"\nEvaluating {ckpt_file}...")

    # Load model
    checkpoint = torch.load(os.path.join(checkpoint_dir, ckpt_file), map_location=device)
    model = QAModel(
        vocab_size=vocab_size,
        vocab_decoder=val_vocab.decoding,
        embedding_dim=embedding_dim,
        hidden_size=hidden_size,
        num_layers=num_layers,
        output_size=output_size
    ).to(device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    em_scores = []
    f1_scores = []
    output_data = []

    valid_sample_count = 0
    for i in range(len(val_dataset)):
        sample = val_dataset[i]
        
        if sample is None:
            continue
        
        row = val_dataset.samples.iloc[i]
        valid_sample_count = valid_sample_count + 1

        context_text = row['context']
        question_text = row['question']
        answer_text = row['answers']['text'][0].strip()
        answer_start = row['answers']['answer_start'][0]
        answer_end = answer_start + len(answer_text)

        context_tensor = sample['context'].unsqueeze(0).to(device)
        question_tensor = sample['question'].unsqueeze(0).to(device)

        with torch.no_grad():
            start_logits, end_logits = model(context_tensor, question_tensor)
            start_idx = torch.argmax(start_logits, dim=1).item()
            end_idx = torch.argmax(end_logits, dim=1).item()
            if end_idx < start_idx:
                end_idx = start_idx

        context_tokens = val_tokenizer.convert_ids_to_tokens(sample['context'].tolist())
        predicted_answer = " ".join(context_tokens[start_idx:end_idx + 1]).strip()

        em = compute_em(predicted_answer, answer_text)
        f1 = compute_f1(predicted_answer, answer_text)

        em_scores.append(em)
        f1_scores.append(f1)

        output_data.append({
            "context": context_text,
            "question": question_text,
            "answer": answer_text,
            "predicted_answer": predicted_answer,
            "answer_start_index": answer_start,
            "answer_end_index": answer_end,
            "predicted_start_index": start_idx,
            "predicted_end_index": end_idx
        })

    # Save per-checkpoint results
    epoch_num = ckpt_file.split("_")[-1].replace(".pt", "")
    df_output = pd.DataFrame(output_data)
    df_output.to_csv(f"/kaggle/working/out/validation_output_epoch_{epoch_num}.csv", index=False)

    # Log summary
    mean_em = np.mean(em_scores) * 100
    mean_f1 = np.mean(f1_scores) * 100
    summary.append({
        "checkpoint": ckpt_file,
        "EM": mean_em,
        "F1": mean_f1
    })

    print(f"→ EM: {mean_em:.2f}%, F1: {mean_f1:.2f}%")
    print(f"✅ Valid testing samples this epoch: {valid_sample_count}")


Using device: cuda


  sample_checkpoint = torch.load(os.path.join(checkpoint_dir, checkpoints[-1]), map_location=device)



Evaluating qa_model_epoch_10.pt...


  checkpoint = torch.load(os.path.join(checkpoint_dir, ckpt_file), map_location=device)


→ EM: 6.12%, F1: 14.67%
✅ Valid testing samples this epoch: 735

Evaluating qa_model_epoch_15.pt...


  checkpoint = torch.load(os.path.join(checkpoint_dir, ckpt_file), map_location=device)


→ EM: 6.39%, F1: 14.83%
✅ Valid testing samples this epoch: 735

Evaluating qa_model_epoch_5.pt...


  checkpoint = torch.load(os.path.join(checkpoint_dir, ckpt_file), map_location=device)


→ EM: 7.76%, F1: 15.41%
✅ Valid testing samples this epoch: 735


In [44]:
# Save summary CSV
summary_df = pd.DataFrame(summary)
summary_df.to_csv("/kaggle/working/out/eval_summary.csv", index=False)

print("\n✅ Evaluation complete! Results saved to:")
print("- /kaggle/working/out/eval_summary.csv (summary)")
print("- /kaggle/working/out/validation_output_epoch_*.csv (detailed per checkpoint)")


✅ Evaluation complete! Results saved to:
- /kaggle/working/out/eval_summary.csv (summary)
- /kaggle/working/out/validation_output_epoch_*.csv (detailed per checkpoint)


In [45]:
import shutil

shutil.make_archive('out', 'zip', '/kaggle/working/out/')

'/kaggle/working/out.zip'

In [46]:
from IPython.display import FileLink
FileLink(r'out.zip')

In [47]:
summary_df.head()

Unnamed: 0,checkpoint,EM,F1
0,qa_model_epoch_10.pt,7.346939,15.14196
1,qa_model_epoch_15.pt,7.755102,16.12414
2,qa_model_epoch_5.pt,6.802721,15.616295
