# 1. Setup and Configuration
- Loads necessary libraries
- Defines model hyperparameters and training configurations
- Specifies the path to the processed data file from the preprocessing notebook

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import math
import random
import pickle
import os
from collections import Counter
from tqdm.notebook import tqdm
import multiprocessing
import time

In [2]:
PROCESSED_DATA_PICKLE_PATH = 'processed_cache_data.pkl'
SEQ_LENGTH = 20
MODEL_MAX_SEQ_LENGTH = 50
BATCH_SIZE = 8
NUM_EPOCHS = 5
K_PREFETCH_EVAL = 1
GRAD_CLIP = 1.0
TRAIN_SPLIT_RATIO = 0.8
NUM_WORKERS_DATALOADER = 8
NUM_INIT_WORKERS_DATASET = 8

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


# 2. Define Transformer Model Components

In [4]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.pe[:x.size(1), :]
        return self.dropout(x)

In [18]:
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, d_model: int, num_heads: int, dropout: float = 0.1):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        self.dropout_attn = nn.Dropout(dropout)

    def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, attention_mask: torch.Tensor = None) -> tuple[torch.Tensor, torch.Tensor]:
        batch_size = query.size(0)
        seq_len_q = query.size(1)
        seq_len_k = key.size(1)
        Q = self.W_q(query).view(batch_size, seq_len_q, self.num_heads, self.head_dim).transpose(1, 2)
        K = self.W_k(key).view(batch_size, seq_len_k, self.num_heads, self.head_dim).transpose(1, 2)
        V = self.W_v(value).view(batch_size, seq_len_k, self.num_heads, self.head_dim).transpose(1, 2)
        attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if attention_mask is not None:
            attention_scores = attention_scores.masked_fill(attention_mask == True, float('-inf'))
        attention_weights = F.softmax(attention_scores, dim=-1)
        attention_weights = self.dropout_attn(attention_weights)
        context_vector = torch.matmul(attention_weights, V).transpose(1, 2).contiguous().view(batch_size, seq_len_q, self.d_model)
        output = self.W_o(context_vector)
        return output, attention_weights

In [6]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model: int, d_ff: int, dropout: float = 0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = nn.ReLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.linear2(self.dropout(self.activation(self.linear1(x))))

In [7]:
class DecoderBlockScratch(nn.Module):
    def __init__(self, d_model: int, num_heads: int, d_ff: int, dropout: float = 0.1):
        super().__init__()
        self.self_attention = MultiHeadSelfAttention(d_model, num_heads, dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        norm_x = self.norm1(x)
        attn_output, _ = self.self_attention(norm_x, norm_x, norm_x, attention_mask)
        x = x + self.dropout1(attn_output)
        norm_x = self.norm2(x)
        ff_output = self.feed_forward(norm_x)
        x = x + self.dropout2(ff_output)
        return x

In [8]:
class DecoderOnlyTransformerScratch(nn.Module):
    def __init__(self, vocab_size: int, d_model: int, num_heads: int, num_layers: int, d_ff: int, max_seq_length: int, dropout: float = 0.1):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout, max_seq_length)
        self.decoder_blocks = nn.ModuleList([DecoderBlockScratch(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.final_norm = nn.LayerNorm(d_model)
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.dropout_emb = nn.Dropout(dropout)
        self._init_weights()

    def _init_weights(self):
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc_out.bias.data.zero_()
        self.fc_out.weight.data.uniform_(-initrange, initrange)

    def _generate_causal_mask(self, size: int, device: torch.device) -> torch.Tensor:
        return torch.triu(torch.ones(size, size, device=device, dtype=torch.bool), diagonal=1)

    def forward(self, src: torch.Tensor, src_key_padding_mask: torch.Tensor = None) -> torch.Tensor:
        batch_size, seq_len = src.shape
        device = src.device
        emb_out = self.embedding(src) * math.sqrt(self.d_model)
        x = self.dropout_emb(self.pos_encoder(emb_out))
        causal_mask = self._generate_causal_mask(seq_len, device).unsqueeze(0).unsqueeze(0)
        if src_key_padding_mask is not None:
            expanded_padding_mask = src_key_padding_mask.unsqueeze(1).unsqueeze(2)
            causal_mask = (causal_mask | expanded_padding_mask).bool()
        for block in self.decoder_blocks:
            x = block(x, causal_mask)
        return self.fc_out(self.final_norm(x))

# 3. Define `CacheDataset`

In [9]:
def _create_single_sequence_pair_for_mp(args_tuple):
    # Unpack arguments
    indexed_obj_ids_ref, i, sequence_length_val = args_tuple
    
    # Create the slices. Note: indexed_obj_ids_ref is the full list.
    input_seq_list = indexed_obj_ids_ref[i : i + sequence_length_val]
    target_seq_list = indexed_obj_ids_ref[i + 1 : i + sequence_length_val + 1]
    
    return torch.tensor(input_seq_list, dtype=torch.long), \
           torch.tensor(target_seq_list, dtype=torch.long)

In [10]:
class CacheDataset(Dataset):
    def __init__(self, filtered_obj_id_sequence: list, list_of_popular_objects: list, sequence_length: int, num_init_workers: int = 0):
        super().__init__()
        self.sequence_length = sequence_length
        self.popular_objects_vocab = sorted(list(set(list_of_popular_objects)))
        self.obj_to_idx = {obj: i for i, obj in enumerate(self.popular_objects_vocab)}
        self.idx_to_obj = {i: obj for obj, i in self.obj_to_idx.items()}
        self.vocab_size = len(self.popular_objects_vocab)
        
        self.indexed_obj_ids = [self.obj_to_idx[obj] for obj in filtered_obj_id_sequence if obj in self.obj_to_idx]
        
        self.input_sequences = []
        self.target_sequences = []
        
        if len(self.indexed_obj_ids) >= self.sequence_length + 1:
            num_total_sequences = len(self.indexed_obj_ids) - self.sequence_length

            actual_init_workers = 0
            if num_init_workers > 0:
                 actual_init_workers = min(num_init_workers, os.cpu_count() if os.cpu_count() else 1)
            
            min_sequences_for_parallel = 1000 
            min_sequences_per_worker = 50

            if actual_init_workers > 0 and \
               num_total_sequences >= min_sequences_for_parallel and \
               (num_total_sequences / actual_init_workers) >= min_sequences_per_worker:
                
                print(f"Using {actual_init_workers} workers for CacheDataset sequence creation ({num_total_sequences} sequences).")
                tasks_args = [(self.indexed_obj_ids, i, self.sequence_length) for i in range(num_total_sequences)]

                # Ensure the pool is only created if multiprocessing is to be used.
                # Also, set a proper start method if needed, especially for some OS.
                # For Jupyter, 'spawn' or 'forkserver' might be more stable than 'fork' (default on Unix).
                # However, let's try with default first.
                # ctx = multiprocessing.get_context('spawn') # Example for more control
                # with ctx.Pool(processes=actual_init_workers) as pool:
                with multiprocessing.Pool(processes=actual_init_workers) as pool:
                    results = []
                    for pair in tqdm(pool.imap_unordered(_create_single_sequence_pair_for_mp, tasks_args), 
                                     total=num_total_sequences, 
                                     desc="Creating Dataset Sequences (Parallel)", 
                                     unit="sequence", 
                                     leave=False):
                        results.append(pair)
                
                if results:
                    self.input_sequences, self.target_sequences = zip(*results)
                    self.input_sequences = list(self.input_sequences)
                    self.target_sequences = list(self.target_sequences)
            else:
                if actual_init_workers > 0:
                    print(f"Dataset size ({num_total_sequences} sequences) or worker load too small for parallel init with {actual_init_workers} workers. Using sequential.")
                for i in tqdm(range(num_total_sequences), desc="Creating Dataset Sequences (Sequential)", unit="sequence", leave=False):
                    self.input_sequences.append(torch.tensor(self.indexed_obj_ids[i : i + self.sequence_length], dtype=torch.long))
                    self.target_sequences.append(torch.tensor(self.indexed_obj_ids[i + 1 : i + self.sequence_length + 1], dtype=torch.long))

    def __len__(self):
        return len(self.input_sequences)
    
    def __getitem__(self, idx):
        return self.input_sequences[idx], self.target_sequences[idx]
    
    def get_vocab_info(self):
        return {'obj_to_idx': self.obj_to_idx, 'idx_to_obj': self.idx_to_obj, 'vocab_size': self.vocab_size}

# 4. Define Training and Evaluation Loops

In [11]:
def train_epoch(model: nn.Module, dataloader: DataLoader, criterion: nn.Module, 
                optimizer: optim.Optimizer, device: torch.device, grad_clip_value: float = None, epoch_num: int = 0, config_name: str = ""):
    model.train()
    total_loss = 0.0
    # Wrap dataloader with tqdm for batch progress
    batch_iterator = tqdm(dataloader, desc=f"Epoch {epoch_num} Training", leave=False, unit="batch")
    for batch_idx, (input_seqs, target_seqs) in enumerate(batch_iterator):
        input_seqs, target_seqs = input_seqs.to(device), target_seqs.to(device)
        optimizer.zero_grad()
        outputs = model(input_seqs, src_key_padding_mask=None)
        loss = criterion(outputs.view(-1, outputs.size(-1)), target_seqs.view(-1))
        loss.backward()
        if grad_clip_value:
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip_value)
        optimizer.step()
        total_loss += loss.item()
        # Update tqdm description with current loss
        batch_iterator.set_postfix_str(f"Loss: {loss.item():.4f}")
    return total_loss / len(dataloader) if len(dataloader) > 0 else 0.0

def evaluate_model(model: nn.Module, dataloader: DataLoader, criterion: nn.Module, 
                   device: torch.device, k_prefetch: int = 1):
    model.eval()
    total_loss = 0.0
    total_misses = 0
    total_predictions = 0
    # Wrap dataloader with tqdm for batch progress
    batch_iterator = tqdm(dataloader, desc="Evaluating", leave=False, unit="batch")
    with torch.no_grad():
        for input_seqs, target_seqs in batch_iterator:
            input_seqs, target_seqs = input_seqs.to(device), target_seqs.to(device)
            outputs = model(input_seqs, src_key_padding_mask=None)
            loss = criterion(outputs.view(-1, outputs.size(-1)), target_seqs.view(-1))
            total_loss += loss.item()
            actual_k_prefetch = min(k_prefetch, outputs.size(-1))
            _, top_k_indices = torch.topk(outputs, k=actual_k_prefetch, dim=2)
            target_seqs_expanded = target_seqs.unsqueeze(-1).expand_as(top_k_indices)
            hits_at_each_step = torch.any(top_k_indices == target_seqs_expanded, dim=2)
            misses_at_each_step = ~hits_at_each_step
            total_misses += misses_at_each_step.sum().item()
            total_predictions += target_seqs.numel()
            # Update tqdm description with current loss
            batch_iterator.set_postfix_str(f"Loss: {loss.item():.4f}")

    avg_loss = total_loss / len(dataloader) if len(dataloader) > 0 else 0.0
    miss_ratio = total_misses / total_predictions if total_predictions > 0 else 0.0
    return avg_loss, miss_ratio

# 5. Load Processed Data and Prepare Datasets

In [12]:
print(f"Loading processed data from {PROCESSED_DATA_PICKLE_PATH}...")
if not os.path.exists(PROCESSED_DATA_PICKLE_PATH):
    print(f"Error: Processed data file not found at {PROCESSED_DATA_PICKLE_PATH}.")
    print("Please run the preprocessing notebook first to generate this file.")
    raise FileNotFoundError(f"Missing {PROCESSED_DATA_PICKLE_PATH}")

with open(PROCESSED_DATA_PICKLE_PATH, 'rb') as f:
    processed_data = pickle.load(f)

filtered_sequence = processed_data['filtered_sequence_popular_obj_ids']
list_of_popular_objects_for_vocab = processed_data['list_of_popular_obj_ids']

print(f"Loaded filtered sequence of length: {len(filtered_sequence)}")
print(f"Number of unique popular objects for vocabulary: {len(list_of_popular_objects_for_vocab)}")

if not filtered_sequence or not list_of_popular_objects_for_vocab:
    print("Error: Loaded data is empty. Cannot proceed with training.")
    raise ValueError("Empty data loaded from pickle file.")

# Split the filtered sequence for training and validation
split_idx = int(TRAIN_SPLIT_RATIO * len(filtered_sequence))
train_filtered_ids = filtered_sequence[:split_idx]
val_filtered_ids = filtered_sequence[split_idx:]

print(f"Training sequence length: {len(train_filtered_ids)}")
print(f"Validation sequence length: {len(val_filtered_ids)}")

Loading processed data from processed_cache_data.pkl...
Loaded filtered sequence of length: 1000
Number of unique popular objects for vocabulary: 8
Training sequence length: 800
Validation sequence length: 200


In [13]:
# Create Datasets and DataLoaders
# The vocabulary is defined by list_of_popular_objects_for_vocab for both datasets.
print("Creating Training Dataset...")
train_dataset = CacheDataset(train_filtered_ids, list_of_popular_objects_for_vocab, SEQ_LENGTH, num_init_workers=NUM_INIT_WORKERS_DATASET)
print("Creating Validation Dataset...")
val_dataset = CacheDataset(val_filtered_ids, list_of_popular_objects_for_vocab, SEQ_LENGTH, num_init_workers=NUM_INIT_WORKERS_DATASET)

# Ensure datasets are not empty
if len(train_dataset) == 0:
    raise ValueError("Training dataset is empty after processing. Insufficient data for SEQ_LENGTH.")
if len(val_dataset) == 0:
    print("Warning: Validation dataset is empty. Evaluation will be skipped.")

Creating Training Dataset...
Dataset size (780 sequences) or worker load too small for parallel init with 8 workers. Using sequential.


Creating Dataset Sequences (Sequential):   0%|          | 0/780 [00:00<?, ?sequence/s]

Creating Validation Dataset...
Dataset size (180 sequences) or worker load too small for parallel init with 8 workers. Using sequential.


Creating Dataset Sequences (Sequential):   0%|          | 0/180 [00:00<?, ?sequence/s]

In [14]:
pin_memory_flag = True if device.type == 'cuda' else False
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, num_workers=NUM_WORKERS_DATALOADER, pin_memory=pin_memory_flag)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS_DATALOADER, pin_memory=pin_memory_flag) if len(val_dataset) > 0 else None

In [15]:
MODEL_VOCAB_SIZE = train_dataset.vocab_size
print(f"Effective Vocabulary size for all models: {MODEL_VOCAB_SIZE}")
print(f"Number of training sequences: {len(train_dataset)}")
print(f"Number of validation sequences: {len(val_dataset)}")

Effective Vocabulary size for all models: 8
Number of training sequences: 780
Number of validation sequences: 180


# 6. Define Hyperparameter Configurations

In [16]:
hyperparameter_configs = [
    {
        "name": "Config1_Baseline",
        "D_MODEL": 128,
        "NUM_HEADS": 4,
        "NUM_LAYERS": 3,
        "D_FF": 256,
        "DROPOUT": 0.1,
        "LEARNING_RATE": 0.001,
    },
    {
        "name": "Config2_LargerModel_LowerLR",
        "D_MODEL": 256, # Larger model
        "NUM_HEADS": 8,  # Must be divisor of D_MODEL
        "NUM_LAYERS": 4, # Deeper model
        "D_FF": 512,   # Larger FFN
        "DROPOUT": 0.15, # Slightly more dropout for larger model
        "LEARNING_RATE": 0.0005, # Lower LR for potentially more stable training
    },
    {
        "name": "Config3_SmallerModel_HigherLR_MoreDropout",
        "D_MODEL": 64,  # Smaller model
        "NUM_HEADS": 2,  # Must be divisor of D_MODEL
        "NUM_LAYERS": 2, # Shallower model
        "D_FF": 128,   # Smaller FFN
        "DROPOUT": 0.2,  # Higher dropout
        "LEARNING_RATE": 0.002, # Higher LR
    },
    # Add a fourth configuration
    {
        "name": "Config4_MediumModel_VariedHeads",
        "D_MODEL": 128,
        "NUM_HEADS": 8,  # More heads for same D_MODEL
        "NUM_LAYERS": 3,
        "D_FF": 256,
        "DROPOUT": 0.1,
        "LEARNING_RATE": 0.001,
    }
]

# 7. Training and Evaluation Loop for Each Configuration

In [20]:
all_results = [] 

# Outer loop for configurations with tqdm - assign the instance to a variable
configurations_pbar = tqdm(hyperparameter_configs, desc="Configurations")
for config in configurations_pbar: # Iterate over the tqdm instance
    print(f"\n\n{'='*20} Starting Training for: {config['name']} {'='*20}")
    print(f"Hyperparameters: {config}")
    start_time_config = time.time()

    current_model = DecoderOnlyTransformerScratch(
        vocab_size=MODEL_VOCAB_SIZE,
        d_model=config['D_MODEL'],
        num_heads=config['NUM_HEADS'],
        num_layers=config['NUM_LAYERS'],
        d_ff=config['D_FF'],
        max_seq_length=MODEL_MAX_SEQ_LENGTH,
        dropout=config['DROPOUT']
    ).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(current_model.parameters(), lr=config['LEARNING_RATE'])

    print(f"Model for {config['name']} initialized with {sum(p.numel() for p in current_model.parameters())} parameters.")

    config_results = {
        "config_name": config['name'],
        "hyperparameters": config,
        "train_losses": [],
        "val_losses": [],
        "val_miss_ratios": [],
        "best_val_miss_ratio": float('inf'),
        "best_epoch": -1
    }

    if len(train_dataloader) == 0:
        print(f"Training dataloader is empty for {config['name']}. Skipping training for this config.")
        all_results.append(config_results) 
        continue

    for epoch in tqdm(range(1, NUM_EPOCHS + 1), desc=f"Config '{config['name']}' Epochs", leave=False):
        epoch_start_time = time.time()
        
        avg_train_loss = train_epoch(current_model, train_dataloader, criterion, optimizer, device, GRAD_CLIP, epoch_num=epoch, config_name=config['name'])
        config_results["train_losses"].append(avg_train_loss)
        print(f"Config: {config['name']}, Epoch {epoch} Training: Avg Loss: {avg_train_loss:.4f}")
        
        current_val_loss = float('nan')
        current_val_miss_ratio = float('nan')

        if val_dataloader and len(val_dataset) > 0:
            avg_val_loss, val_miss_ratio = evaluate_model(current_model, val_dataloader, criterion, device, k_prefetch=K_PREFETCH_EVAL)
            config_results["val_losses"].append(avg_val_loss)
            config_results["val_miss_ratios"].append(val_miss_ratio)
            current_val_loss = avg_val_loss
            current_val_miss_ratio = val_miss_ratio
            print(f"Config: {config['name']}, Epoch {epoch} Validation: Avg Loss: {avg_val_loss:.4f}, Miss Ratio (Top-{K_PREFETCH_EVAL}): {val_miss_ratio:.4f}")
            
            if not math.isnan(val_miss_ratio) and val_miss_ratio < config_results["best_val_miss_ratio"]:
                config_results["best_val_miss_ratio"] = val_miss_ratio
                config_results["best_epoch"] = epoch
        else:
            config_results["val_losses"].append(float('nan')) 
            config_results["val_miss_ratios"].append(float('nan'))
            if len(val_dataset) == 0:
                 print(f"Config: {config['name']}, Epoch {epoch}: Validation dataset is empty. Skipping validation.")
            else:
                 print(f"Config: {config['name']}, Epoch {epoch}: Validation dataloader not available. Skipping validation.")
        
        epoch_duration = time.time() - epoch_start_time
        # Update outer tqdm (config progress bar) with validation miss ratio if available
        if not math.isnan(current_val_miss_ratio):
             # Call set_description_str on the instance of the outer progress bar
             configurations_pbar.set_description_str(f"Configurations (Best Miss for {config['name']}: {config_results['best_val_miss_ratio']:.4f})")
        print(f"Config: {config['name']}, Epoch {epoch} duration: {epoch_duration:.2f} seconds")

    config_duration = time.time() - start_time_config
    print(f"\nTraining for {config['name']} completed in {config_duration:.2f} seconds.")
    print(f"Best Validation Miss Ratio for {config['name']}: {config_results['best_val_miss_ratio']:.4f} at Epoch {config_results['best_epoch']}")
    all_results.append(config_results)
    configurations_pbar.set_description_str("Configurations") # Reset description for next config

print(f"\n\n{'='*20} All Configurations Processed {'='*20}")

Configurations:   0%|          | 0/4 [00:00<?, ?it/s]



Hyperparameters: {'name': 'Config1_Baseline', 'D_MODEL': 128, 'NUM_HEADS': 4, 'NUM_LAYERS': 3, 'D_FF': 256, 'DROPOUT': 0.1, 'LEARNING_RATE': 0.001}
Model for Config1_Baseline initialized with 399752 parameters.


Config 'Config1_Baseline' Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1 Training:   0%|          | 0/97 [00:00<?, ?batch/s]

Config: Config1_Baseline, Epoch 1 Training: Avg Loss: 0.2957


Evaluating:   0%|          | 0/23 [00:00<?, ?batch/s]

Config: Config1_Baseline, Epoch 1 Validation: Avg Loss: 0.1984, Miss Ratio (Top-1): 0.0317
Config: Config1_Baseline, Epoch 1 duration: 5.44 seconds


Epoch 2 Training:   0%|          | 0/97 [00:00<?, ?batch/s]

Config: Config1_Baseline, Epoch 2 Training: Avg Loss: 0.2558


Evaluating:   0%|          | 0/23 [00:00<?, ?batch/s]

Config: Config1_Baseline, Epoch 2 Validation: Avg Loss: 0.2381, Miss Ratio (Top-1): 0.0692
Config: Config1_Baseline, Epoch 2 duration: 5.06 seconds


Epoch 3 Training:   0%|          | 0/97 [00:00<?, ?batch/s]

Config: Config1_Baseline, Epoch 3 Training: Avg Loss: 0.2492


Evaluating:   0%|          | 0/23 [00:00<?, ?batch/s]

Config: Config1_Baseline, Epoch 3 Validation: Avg Loss: 0.3340, Miss Ratio (Top-1): 0.0761
Config: Config1_Baseline, Epoch 3 duration: 5.03 seconds


Epoch 4 Training:   0%|          | 0/97 [00:00<?, ?batch/s]

Config: Config1_Baseline, Epoch 4 Training: Avg Loss: 0.2478


Evaluating:   0%|          | 0/23 [00:00<?, ?batch/s]

Config: Config1_Baseline, Epoch 4 Validation: Avg Loss: 0.3996, Miss Ratio (Top-1): 0.0761
Config: Config1_Baseline, Epoch 4 duration: 5.01 seconds


Epoch 5 Training:   0%|          | 0/97 [00:00<?, ?batch/s]

Config: Config1_Baseline, Epoch 5 Training: Avg Loss: 0.2451


Evaluating:   0%|          | 0/23 [00:00<?, ?batch/s]

Config: Config1_Baseline, Epoch 5 Validation: Avg Loss: 0.3303, Miss Ratio (Top-1): 0.0761
Config: Config1_Baseline, Epoch 5 duration: 4.83 seconds

Training for Config1_Baseline completed in 25.43 seconds.
Best Validation Miss Ratio for Config1_Baseline: 0.0317 at Epoch 1


Hyperparameters: {'name': 'Config2_LargerModel_LowerLR', 'D_MODEL': 256, 'NUM_HEADS': 8, 'NUM_LAYERS': 4, 'D_FF': 512, 'DROPOUT': 0.15, 'LEARNING_RATE': 0.0005}
Model for Config2_LargerModel_LowerLR initialized with 2113032 parameters.


Config 'Config2_LargerModel_LowerLR' Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1 Training:   0%|          | 0/97 [00:00<?, ?batch/s]

Config: Config2_LargerModel_LowerLR, Epoch 1 Training: Avg Loss: 0.3064


Evaluating:   0%|          | 0/23 [00:00<?, ?batch/s]

Config: Config2_LargerModel_LowerLR, Epoch 1 Validation: Avg Loss: 0.2132, Miss Ratio (Top-1): 0.0317
Config: Config2_LargerModel_LowerLR, Epoch 1 duration: 5.65 seconds


Epoch 2 Training:   0%|          | 0/97 [00:00<?, ?batch/s]

Config: Config2_LargerModel_LowerLR, Epoch 2 Training: Avg Loss: 0.2559


Evaluating:   0%|          | 0/23 [00:00<?, ?batch/s]

Config: Config2_LargerModel_LowerLR, Epoch 2 Validation: Avg Loss: 0.2939, Miss Ratio (Top-1): 0.0764
Config: Config2_LargerModel_LowerLR, Epoch 2 duration: 5.63 seconds


Epoch 3 Training:   0%|          | 0/97 [00:00<?, ?batch/s]

Config: Config2_LargerModel_LowerLR, Epoch 3 Training: Avg Loss: 0.2528


Evaluating:   0%|          | 0/23 [00:00<?, ?batch/s]

Config: Config2_LargerModel_LowerLR, Epoch 3 Validation: Avg Loss: 0.3438, Miss Ratio (Top-1): 0.0764
Config: Config2_LargerModel_LowerLR, Epoch 3 duration: 5.64 seconds


Epoch 4 Training:   0%|          | 0/97 [00:00<?, ?batch/s]

Config: Config2_LargerModel_LowerLR, Epoch 4 Training: Avg Loss: 0.2488


Evaluating:   0%|          | 0/23 [00:00<?, ?batch/s]

Config: Config2_LargerModel_LowerLR, Epoch 4 Validation: Avg Loss: 0.3395, Miss Ratio (Top-1): 0.0761
Config: Config2_LargerModel_LowerLR, Epoch 4 duration: 5.66 seconds


Epoch 5 Training:   0%|          | 0/97 [00:00<?, ?batch/s]

Config: Config2_LargerModel_LowerLR, Epoch 5 Training: Avg Loss: 0.2463


Evaluating:   0%|          | 0/23 [00:00<?, ?batch/s]

Config: Config2_LargerModel_LowerLR, Epoch 5 Validation: Avg Loss: 0.4580, Miss Ratio (Top-1): 0.0761
Config: Config2_LargerModel_LowerLR, Epoch 5 duration: 5.65 seconds

Training for Config2_LargerModel_LowerLR completed in 28.33 seconds.
Best Validation Miss Ratio for Config2_LargerModel_LowerLR: 0.0317 at Epoch 1


Hyperparameters: {'name': 'Config3_SmallerModel_HigherLR_MoreDropout', 'D_MODEL': 64, 'NUM_HEADS': 2, 'NUM_LAYERS': 2, 'D_FF': 128, 'DROPOUT': 0.2, 'LEARNING_RATE': 0.002}
Model for Config3_SmallerModel_HigherLR_MoreDropout initialized with 68104 parameters.


Config 'Config3_SmallerModel_HigherLR_MoreDropout' Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1 Training:   0%|          | 0/97 [00:00<?, ?batch/s]

Config: Config3_SmallerModel_HigherLR_MoreDropout, Epoch 1 Training: Avg Loss: 0.3122


Evaluating:   0%|          | 0/23 [00:00<?, ?batch/s]

Config: Config3_SmallerModel_HigherLR_MoreDropout, Epoch 1 Validation: Avg Loss: 0.2017, Miss Ratio (Top-1): 0.0311
Config: Config3_SmallerModel_HigherLR_MoreDropout, Epoch 1 duration: 4.16 seconds


Epoch 2 Training:   0%|          | 0/97 [00:00<?, ?batch/s]

Config: Config3_SmallerModel_HigherLR_MoreDropout, Epoch 2 Training: Avg Loss: 0.2704


Evaluating:   0%|          | 0/23 [00:00<?, ?batch/s]

Config: Config3_SmallerModel_HigherLR_MoreDropout, Epoch 2 Validation: Avg Loss: 0.1928, Miss Ratio (Top-1): 0.0311
Config: Config3_SmallerModel_HigherLR_MoreDropout, Epoch 2 duration: 4.22 seconds


Epoch 3 Training:   0%|          | 0/97 [00:00<?, ?batch/s]

Config: Config3_SmallerModel_HigherLR_MoreDropout, Epoch 3 Training: Avg Loss: 0.2652


Evaluating:   0%|          | 0/23 [00:00<?, ?batch/s]

Config: Config3_SmallerModel_HigherLR_MoreDropout, Epoch 3 Validation: Avg Loss: 0.2053, Miss Ratio (Top-1): 0.0311
Config: Config3_SmallerModel_HigherLR_MoreDropout, Epoch 3 duration: 4.16 seconds


Epoch 4 Training:   0%|          | 0/97 [00:00<?, ?batch/s]

Config: Config3_SmallerModel_HigherLR_MoreDropout, Epoch 4 Training: Avg Loss: 0.2585


Evaluating:   0%|          | 0/23 [00:00<?, ?batch/s]

Config: Config3_SmallerModel_HigherLR_MoreDropout, Epoch 4 Validation: Avg Loss: 0.3369, Miss Ratio (Top-1): 0.0764
Config: Config3_SmallerModel_HigherLR_MoreDropout, Epoch 4 duration: 4.22 seconds


Epoch 5 Training:   0%|          | 0/97 [00:00<?, ?batch/s]

Config: Config3_SmallerModel_HigherLR_MoreDropout, Epoch 5 Training: Avg Loss: 0.2581


Evaluating:   0%|          | 0/23 [00:00<?, ?batch/s]

Config: Config3_SmallerModel_HigherLR_MoreDropout, Epoch 5 Validation: Avg Loss: 0.3415, Miss Ratio (Top-1): 0.0764
Config: Config3_SmallerModel_HigherLR_MoreDropout, Epoch 5 duration: 4.21 seconds

Training for Config3_SmallerModel_HigherLR_MoreDropout completed in 21.01 seconds.
Best Validation Miss Ratio for Config3_SmallerModel_HigherLR_MoreDropout: 0.0311 at Epoch 1


Hyperparameters: {'name': 'Config4_MediumModel_VariedHeads', 'D_MODEL': 128, 'NUM_HEADS': 8, 'NUM_LAYERS': 3, 'D_FF': 256, 'DROPOUT': 0.1, 'LEARNING_RATE': 0.001}
Model for Config4_MediumModel_VariedHeads initialized with 399752 parameters.


Config 'Config4_MediumModel_VariedHeads' Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1 Training:   0%|          | 0/97 [00:00<?, ?batch/s]

Config: Config4_MediumModel_VariedHeads, Epoch 1 Training: Avg Loss: 0.2909


Evaluating:   0%|          | 0/23 [00:00<?, ?batch/s]

Config: Config4_MediumModel_VariedHeads, Epoch 1 Validation: Avg Loss: 0.1936, Miss Ratio (Top-1): 0.0311
Config: Config4_MediumModel_VariedHeads, Epoch 1 duration: 4.96 seconds


Epoch 2 Training:   0%|          | 0/97 [00:00<?, ?batch/s]

Config: Config4_MediumModel_VariedHeads, Epoch 2 Training: Avg Loss: 0.2586


Evaluating:   0%|          | 0/23 [00:00<?, ?batch/s]

Config: Config4_MediumModel_VariedHeads, Epoch 2 Validation: Avg Loss: 0.2278, Miss Ratio (Top-1): 0.0400
Config: Config4_MediumModel_VariedHeads, Epoch 2 duration: 5.06 seconds


Epoch 3 Training:   0%|          | 0/97 [00:00<?, ?batch/s]

Config: Config4_MediumModel_VariedHeads, Epoch 3 Training: Avg Loss: 0.2522


Evaluating:   0%|          | 0/23 [00:00<?, ?batch/s]

Config: Config4_MediumModel_VariedHeads, Epoch 3 Validation: Avg Loss: 0.2827, Miss Ratio (Top-1): 0.0761
Config: Config4_MediumModel_VariedHeads, Epoch 3 duration: 4.95 seconds


Epoch 4 Training:   0%|          | 0/97 [00:00<?, ?batch/s]

Config: Config4_MediumModel_VariedHeads, Epoch 4 Training: Avg Loss: 0.2487


Evaluating:   0%|          | 0/23 [00:00<?, ?batch/s]

Config: Config4_MediumModel_VariedHeads, Epoch 4 Validation: Avg Loss: 0.3521, Miss Ratio (Top-1): 0.0761
Config: Config4_MediumModel_VariedHeads, Epoch 4 duration: 5.09 seconds


Epoch 5 Training:   0%|          | 0/97 [00:00<?, ?batch/s]

Config: Config4_MediumModel_VariedHeads, Epoch 5 Training: Avg Loss: 0.2460


Evaluating:   0%|          | 0/23 [00:00<?, ?batch/s]

Config: Config4_MediumModel_VariedHeads, Epoch 5 Validation: Avg Loss: 0.4430, Miss Ratio (Top-1): 0.0761
Config: Config4_MediumModel_VariedHeads, Epoch 5 duration: 4.71 seconds

Training for Config4_MediumModel_VariedHeads completed in 24.82 seconds.
Best Validation Miss Ratio for Config4_MediumModel_VariedHeads: 0.0311 at Epoch 1




# 8. Summarize Results

In [21]:
print("\n\n--- Summary of All Hyperparameter Configurations ---")
for result in all_results:
    print(f"\nConfiguration: {result['config_name']}")
    print(f"  Hyperparameters: {result['hyperparameters']}")
    if result['train_losses']: # Check if training was run
        print(f"  Final Training Loss (Epoch {len(result['train_losses'])}): {result['train_losses'][-1]:.4f}")
        if result['val_miss_ratios'] and not math.isnan(result['val_miss_ratios'][-1]):
             print(f"  Final Validation Loss (Epoch {len(result['val_losses'])}): {result['val_losses'][-1]:.4f}")
             print(f"  Final Validation Miss Ratio (Epoch {len(result['val_miss_ratios'])}): {result['val_miss_ratios'][-1]:.4f}")
        print(f"  Best Validation Miss Ratio: {result['best_val_miss_ratio']:.4f} (Epoch {result['best_epoch']})")
    else:
        print("  Training was not run for this configuration (e.g., empty dataloader).")



--- Summary of All Hyperparameter Configurations ---

Configuration: Config1_Baseline
  Hyperparameters: {'name': 'Config1_Baseline', 'D_MODEL': 128, 'NUM_HEADS': 4, 'NUM_LAYERS': 3, 'D_FF': 256, 'DROPOUT': 0.1, 'LEARNING_RATE': 0.001}
  Final Training Loss (Epoch 5): 0.2451
  Final Validation Loss (Epoch 5): 0.3303
  Final Validation Miss Ratio (Epoch 5): 0.0761
  Best Validation Miss Ratio: 0.0317 (Epoch 1)

Configuration: Config2_LargerModel_LowerLR
  Hyperparameters: {'name': 'Config2_LargerModel_LowerLR', 'D_MODEL': 256, 'NUM_HEADS': 8, 'NUM_LAYERS': 4, 'D_FF': 512, 'DROPOUT': 0.15, 'LEARNING_RATE': 0.0005}
  Final Training Loss (Epoch 5): 0.2463
  Final Validation Loss (Epoch 5): 0.4580
  Final Validation Miss Ratio (Epoch 5): 0.0761
  Best Validation Miss Ratio: 0.0317 (Epoch 1)

Configuration: Config3_SmallerModel_HigherLR_MoreDropout
  Hyperparameters: {'name': 'Config3_SmallerModel_HigherLR_MoreDropout', 'D_MODEL': 64, 'NUM_HEADS': 2, 'NUM_LAYERS': 2, 'D_FF': 128, 'DROPOUT'

In [22]:
# Find the overall best configuration based on best_val_miss_ratio
best_overall_config_result = None
if all_results:
    # Filter out results where training might not have run or validation was skipped leading to inf miss ratio
    valid_results = [r for r in all_results if r['best_val_miss_ratio'] != float('inf') and r['best_val_miss_ratio'] is not None and not math.isnan(r['best_val_miss_ratio'])]
    if valid_results:
        best_overall_config_result = min(valid_results, key=lambda x: x['best_val_miss_ratio'])

if best_overall_config_result:
    print(f"\n--- Overall Best Configuration ---")
    print(f"Name: {best_overall_config_result['config_name']}")
    print(f"Best Validation Miss Ratio: {best_overall_config_result['best_val_miss_ratio']:.4f} at Epoch {best_overall_config_result['best_epoch']}")
    print(f"Hyperparameters: {best_overall_config_result['hyperparameters']}")
else:
    print("\nCould not determine an overall best configuration (e.g., no valid validation results).")


--- Overall Best Configuration ---
Name: Config3_SmallerModel_HigherLR_MoreDropout
Best Validation Miss Ratio: 0.0311 at Epoch 1
Hyperparameters: {'name': 'Config3_SmallerModel_HigherLR_MoreDropout', 'D_MODEL': 64, 'NUM_HEADS': 2, 'NUM_LAYERS': 2, 'D_FF': 128, 'DROPOUT': 0.2, 'LEARNING_RATE': 0.002}
