In [None]:
# Import könyvtárak
import os
import json
from pathlib import Path
import random
import shutil

import numpy as np
import math
import pandas as pd
import torch
import torch.nn as nn
from torch import amp
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import sys
import re

In [None]:
def set_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    with the same value, then puts cuDNN into deterministic mode with
    autotuning switched off.

    Args:
        seed: integer seed applied to every generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    # Deterministic kernels on, benchmark-driven kernel selection off.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


set_seed(42)

In [None]:
class LegalTextDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Args:
        texts: indexable collection of texts (each item is cast to ``str``).
        labels: indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: callable invoked with the text and the keyword arguments
            ``add_special_tokens``, ``max_length``, ``padding``, ``truncation``
            and ``return_tensors``; its result must provide ``input_ids`` and
            ``attention_mask`` tensors.
        max_length: length every sequence is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Number of samples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``label`` for one sample."""
        raw_text = str(self.texts[idx])
        target = self.labels[idx]

        tokenizer_kwargs = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer(raw_text, **tokenizer_kwargs)

        # The tokenizer output is flattened to 1-D so batches can be stacked.
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

In [None]:
def normalize_label(raw):
    """Extract the leading 1-5 grade from a raw label.

    The value is converted to ``str`` and stripped; if the first character is
    a digit from 1 to 5, that digit is returned as an ``int``. Anything else
    yields ``0``.
    """
    leading_grade = re.match(r'^([1-5])', str(raw).strip())
    if leading_grade is None:
        return 0
    return int(leading_grade.group(1))


def build_ordinal_mapping(labels):
    """Encode raw labels as contiguous class ids that preserve numeric order.

    Each label is first reduced with ``normalize_label``; the distinct results
    are sorted and numbered from 0.

    Returns:
        A tuple ``(encoded, label2id, id2label)`` where ``encoded`` is the list
        of class ids for ``labels``, ``label2id`` maps each normalized value to
        its id, and ``id2label`` maps each id to the normalized value as ``str``.
    """
    grades = list(map(normalize_label, labels))

    label2id = {}
    id2label = {}
    for class_id, grade in enumerate(sorted(set(grades))):
        label2id[grade] = class_id
        id2label[class_id] = str(grade)

    encoded = [label2id[grade] for grade in grades]
    return encoded, label2id, id2label

## Progressive Model Architectures

4 különböző modell architektúra definiálása:

In [None]:
class Step1_BaselineModel(nn.Module):
    """Step 1 baseline: encoder followed by a single linear classification layer.

    Args:
        transformer_model: encoder exposing ``config.hidden_size``; when called
            it must return an object with a ``last_hidden_state`` tensor.
        num_classes: number of output logits.
    """

    def __init__(self, transformer_model, num_classes=5):
        super().__init__()
        self.transformer = transformer_model
        self.num_classes = num_classes
        self.classifier = nn.Linear(transformer_model.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return an object carrying ``logits`` and, if ``labels`` is given, ``loss``."""
        encoded = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=False,
            return_dict=True,
        )
        # Position 0 of the final hidden states is used as the sequence summary.
        first_token = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(first_token)

        result = type('Output', (), {'logits': logits})()
        if labels is None:
            return result
        # Unweighted cross-entropy with default settings.
        result.loss = nn.functional.cross_entropy(logits, labels)
        return result

In [None]:
class Step2_ExtendedModel(nn.Module):
    """Step 2: encoder + two-layer adapter (Linear, BatchNorm, GELU, Dropout) + classifier.

    Args:
        transformer_model: encoder exposing ``config.hidden_size``; when called
            it must return an object with a ``last_hidden_state`` tensor.
        num_classes: number of output logits.
        hidden_dim: width of the first adapter layer; the second is half of it.
        dropout: dropout probability used after each adapter layer.
    """

    def __init__(self, transformer_model, num_classes=5, hidden_dim=256, dropout=0.3):
        super().__init__()
        self.transformer = transformer_model
        self.num_classes = num_classes

        encoder_width = transformer_model.config.hidden_size
        bottleneck = hidden_dim // 2

        adapter_layers = [
            nn.Linear(encoder_width, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, bottleneck),
            nn.BatchNorm1d(bottleneck),
            nn.GELU(),
            nn.Dropout(dropout),
        ]
        self.adapter = nn.Sequential(*adapter_layers)
        self.classifier = nn.Linear(bottleneck, num_classes)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return an object carrying ``logits`` and, if ``labels`` is given, ``loss``."""
        encoded = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=False,
            return_dict=True,
        )
        # Position 0 of the final hidden states is used as the sequence summary.
        first_token = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(self.adapter(first_token))

        result = type('Output', (), {'logits': logits})()
        if labels is None:
            return result
        # Unweighted cross-entropy with default settings.
        result.loss = nn.functional.cross_entropy(logits, labels)
        return result

In [None]:
class Step3_AdvancedModel(nn.Module):
    """Step 3: attention pooling with a learned query, three adapter stages and a gate.

    Args:
        transformer_model: encoder exposing ``config.hidden_size``; when called
            it must return an object with a ``last_hidden_state`` tensor.
        num_classes: number of output logits.
        hidden_dim: width of the first two adapter stages; the third halves it.
        dropout: dropout probability for the attention layer and adapter stages.
        num_heads: number of heads of the pooling attention.
    """

    @staticmethod
    def _adapter_stage(in_features, out_features, dropout):
        """Build one Linear -> LayerNorm -> GELU -> Dropout stage."""
        return nn.Sequential(
            nn.Linear(in_features, out_features),
            nn.LayerNorm(out_features),
            nn.GELU(),
            nn.Dropout(dropout),
        )

    def __init__(self, transformer_model, num_classes=5, hidden_dim=256, dropout=0.4, num_heads=4):
        super().__init__()
        self.transformer = transformer_model
        self.num_classes = num_classes

        encoder_width = transformer_model.config.hidden_size
        bottleneck = hidden_dim // 2

        # Pooling: one learned query attends over all token states.
        self.attention_pool = nn.MultiheadAttention(
            embed_dim=encoder_width,
            num_heads=num_heads,
            dropout=dropout,
            batch_first=True,
        )
        self.query = nn.Parameter(torch.randn(1, 1, encoder_width))

        # Adapter stages (created in the same order as they are applied).
        self.adapter_1 = self._adapter_stage(encoder_width, hidden_dim, dropout)
        self.adapter_2 = self._adapter_stage(hidden_dim, hidden_dim, dropout)
        self.adapter_3 = self._adapter_stage(hidden_dim, bottleneck, dropout)

        self.gate = nn.Linear(hidden_dim, hidden_dim)
        self.classifier = nn.Linear(bottleneck, num_classes)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return an object carrying ``logits`` and, if ``labels`` is given, ``loss``."""
        token_states = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=False,
            return_dict=True,
        ).last_hidden_state

        # Broadcast the learned query over the batch; positions where the
        # attention mask is 0 are excluded as attention keys.
        learned_query = self.query.expand(token_states.size(0), -1, -1)
        ignored_keys = ~attention_mask.bool()
        attended, _ = self.attention_pool(
            learned_query, token_states, token_states, key_padding_mask=ignored_keys
        )
        summary = attended.squeeze(1)

        # The gate blends the second stage's output with the first stage's output.
        stage_one = self.adapter_1(summary)
        stage_two = self.adapter_2(stage_one)
        mix = torch.sigmoid(self.gate(stage_two))
        blended = mix * stage_two + (1 - mix) * stage_one

        logits = self.classifier(self.adapter_3(blended))

        result = type('Output', (), {'logits': logits})()
        if labels is None:
            return result
        # Unweighted cross-entropy with default settings.
        result.loss = nn.functional.cross_entropy(logits, labels)
        return result

In [None]:
class BalancedFinalModel(nn.Module):
    """Final model: masked mean pooling + two-layer LayerNorm adapter + classifier.

    Args:
        transformer_model: encoder exposing ``config.hidden_size``; when called
            it must return an object with a ``last_hidden_state`` tensor.
        num_classes: number of output logits.
        hidden_dim: width of the first adapter layer; the second is half of it.
        dropout: dropout probability used after each adapter layer.
    """

    def __init__(self, transformer_model, num_classes=5, hidden_dim=256, dropout=0.3):
        super().__init__()
        self.transformer = transformer_model
        self.num_classes = num_classes

        encoder_width = transformer_model.config.hidden_size
        bottleneck = hidden_dim // 2

        adapter_layers = [
            nn.Linear(encoder_width, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, bottleneck),
            nn.LayerNorm(bottleneck),
            nn.GELU(),
            nn.Dropout(dropout),
        ]
        self.adapter = nn.Sequential(*adapter_layers)
        self.classifier = nn.Linear(bottleneck, num_classes)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return an object carrying ``logits`` and, if ``labels`` is given, ``loss``."""
        token_states = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=False,
            return_dict=True,
        ).last_hidden_state

        # Average the token states where the mask is 1; the count is clamped to
        # at least 1 so a fully masked row does not divide by zero.
        weights = attention_mask.unsqueeze(-1)
        token_total = (token_states * weights).sum(1)
        token_count = weights.sum(1).clamp(min=1)
        sentence_vector = token_total / token_count

        logits = self.classifier(self.adapter(sentence_vector))

        result = type('Output', (), {'logits': logits})()
        if labels is None:
            return result
        # Unweighted cross-entropy with default settings.
        result.loss = nn.functional.cross_entropy(logits, labels)
        return result

## Training Functions

In [None]:
def train_epoch(model, dataloader, optimizer, scheduler, device, criterion=None, grad_acc_steps=1):
    """Train ``model`` for one pass over ``dataloader``.

    Uses CUDA mixed precision when ``device`` is a CUDA device and accumulates
    gradients over ``grad_acc_steps`` batches before each optimizer step.

    Args:
        model: module called with ``input_ids``/``attention_mask`` (and
            ``labels`` when ``criterion`` is None); its output must expose
            ``logits`` and, when labels are passed, ``loss``.
        dataloader: sized iterable of batches with the keys ``input_ids``,
            ``attention_mask`` and ``label``.
        optimizer: optimizer stepped through the gradient scaler.
        scheduler: learning-rate scheduler, stepped once per optimizer step.
        device: ``torch.device`` the batches are moved to.
        criterion: optional loss ``criterion(logits, labels)``; when None the
            model's own ``loss`` is used.
        grad_acc_steps: number of batches per optimizer step (values below 1
            are treated as 1).

    Returns:
        ``(avg_loss, accuracy)`` over the epoch.
    """
    model.train()
    grad_acc_steps = max(1, int(grad_acc_steps))
    total_loss = 0.0
    predictions = []
    true_labels = []

    disable_tqdm = not sys.stdout.isatty()
    progress_bar = tqdm(dataloader, desc="Training", disable=disable_tqdm)
    scaler = amp.GradScaler('cuda', enabled=device.type == 'cuda')
    num_batches = len(dataloader)
    step_count = 0

    # Start from clean gradients so nothing left over from an earlier call is
    # mixed into the first accumulation group.
    optimizer.zero_grad(set_to_none=True)

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)
        labels = batch['label'].to(device, non_blocking=True)

        with amp.autocast('cuda', enabled=scaler.is_enabled()):
            # One forward pass per batch: the loss and the predictions must come
            # from the same logits (dropout makes two passes disagree).
            if criterion is None:
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                batch_loss = outputs.loss
            else:
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                batch_loss = criterion(outputs.logits, labels)
            logits = outputs.logits
            loss = batch_loss / grad_acc_steps

        scaler.scale(loss).backward()
        step_count += 1

        # Step on every full accumulation group, and also on the last batch so
        # a trailing partial group is applied instead of leaking into the next
        # epoch (its gradient stays divided by grad_acc_steps).
        if step_count % grad_acc_steps == 0 or step_count == num_batches:
            # Gradients are still multiplied by the loss scale here; unscale
            # them first so the clipping threshold applies to the true norm.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)
            scheduler.step()

        total_loss += batch_loss.item()
        preds = torch.argmax(logits.detach(), dim=1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())
        progress_bar.set_postfix({'loss': (total_loss / step_count)})

    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(true_labels, predictions)

    return avg_loss, accuracy

In [None]:
def evaluate(model, dataloader, device, criterion=None):
    """Evaluate ``model`` on a validation/test loader without gradient tracking.

    Args:
        model: module called with ``input_ids``/``attention_mask`` (and
            ``labels`` when ``criterion`` is None); its output must expose
            ``logits`` and, when labels are passed, ``loss``.
        dataloader: sized iterable of batches with the keys ``input_ids``,
            ``attention_mask`` and ``label``.
        device: ``torch.device`` the batches are moved to; CUDA autocast is
            enabled when it is a CUDA device.
        criterion: optional loss ``criterion(logits, labels)``; when None the
            model's own ``loss`` is used.

    Returns:
        ``(avg_loss, accuracy, predictions, true_labels)`` where the last two
        are flat lists of per-sample class ids.
    """
    model.eval()
    total_loss = 0.0
    predictions = []
    true_labels = []

    with torch.no_grad():
        disable_tqdm = not sys.stdout.isatty()
        progress_bar = tqdm(dataloader, desc="Evaluating", disable=disable_tqdm)
        mixed = device.type == 'cuda'

        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device, non_blocking=True)
            attention_mask = batch['attention_mask'].to(device, non_blocking=True)
            labels = batch['label'].to(device, non_blocking=True)

            with amp.autocast('cuda', enabled=mixed):
                # A single forward pass provides both the loss and the logits;
                # running the model twice only doubled the evaluation cost.
                if criterion is None:
                    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                    loss = outputs.loss
                else:
                    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                    loss = criterion(outputs.logits, labels)
                logits = outputs.logits

            total_loss += loss.item()
            preds = torch.argmax(logits.detach(), dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(true_labels, predictions)

    return avg_loss, accuracy, predictions, true_labels

In [None]:
def plot_training_history(history, save_path):
    """Plot per-epoch loss and accuracy curves, save the figure and show it.

    Args:
        history: mapping with the per-epoch lists ``train_loss`` and
            ``train_acc``; ``val_loss`` / ``val_acc`` are optional and are
            skipped when missing or empty.
        save_path: file path the figure is written to.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

    epochs = range(1, len(history['train_loss']) + 1)

    def _plot_validation(ax, key, label):
        # The key may exist with an empty list (training without a validation
        # set); plotting it against `epochs` would raise a shape-mismatch error.
        values = history[key] if key in history else []
        if len(values) > 0:
            ax.plot(range(1, len(values) + 1), values, 'r-', label=label)

    ax1.plot(epochs, history['train_loss'], 'b-', label='Train Loss')
    _plot_validation(ax1, 'val_loss', 'Val Loss')
    ax1.set_title('Training and Validation Loss')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss')
    ax1.legend()
    ax1.grid(True)

    ax2.plot(epochs, history['train_acc'], 'b-', label='Train Accuracy')
    _plot_validation(ax2, 'val_acc', 'Val Accuracy')
    ax2.set_title('Training and Validation Accuracy')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Accuracy')
    ax2.legend()
    ax2.grid(True)

    fig.tight_layout()
    fig.savefig(save_path)
    plt.show()
    # Release the figure; this function is called once per trained model.
    plt.close(fig)

In [None]:
def plot_model_comparison(all_results, save_path):
    """Draw a 2x2 summary figure comparing every trained model.

    Panels: train vs. validation accuracy, train-minus-validation accuracy gap
    (colour-coded against 5% / 10% thresholds), number of epochs trained, and
    macro vs. weighted validation F1.

    Args:
        all_results: mapping of model name to a dict with the keys
            ``train_acc``, ``val_acc``, ``epochs_trained``, ``val_macro_f1``
            and ``val_weighted_f1``.
        save_path: file path the figure is written to.
    """
    names = list(all_results.keys())

    def metric(key):
        """Collect one metric for every model, in ``names`` order."""
        return [all_results[model][key] for model in names]

    def label_models(ax, ticks):
        """Put the (slightly rotated) model names on the x axis."""
        ax.set_xticks(ticks)
        ax.set_xticklabels(names, rotation=15, ha='right')

    def gap_colour(gap):
        """Green below 5%, orange below 10%, red otherwise."""
        if gap < 0.05:
            return '#2ecc71'
        if gap < 0.10:
            return '#f39c12'
        return '#e74c3c'

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    acc_ax, gap_ax = axes[0]
    epoch_ax, f1_ax = axes[1]

    # --- Train vs. validation accuracy (grouped bars) ---
    val_accs = metric('val_acc')
    train_accs = metric('train_acc')

    positions = np.arange(len(names))
    bar_width = 0.35
    acc_ax.bar(positions - bar_width/2, train_accs, bar_width, label='Train Acc', color='#3498db')
    acc_ax.bar(positions + bar_width/2, val_accs, bar_width, label='Val Acc', color='#e74c3c')
    acc_ax.set_ylabel('Accuracy')
    acc_ax.set_title('Train vs Val Accuracy Comparison')
    label_models(acc_ax, positions)
    acc_ax.legend()
    acc_ax.grid(True, alpha=0.3)

    # --- Overfitting gap ---
    gaps = [all_results[model]['train_acc'] - all_results[model]['val_acc'] for model in names]
    gap_ax.bar(names, gaps, color=[gap_colour(g) for g in gaps])
    gap_ax.axhline(y=0.05, color='orange', linestyle='--', label='5% threshold')
    gap_ax.axhline(y=0.10, color='red', linestyle='--', label='10% threshold')
    gap_ax.set_ylabel('Train - Val Accuracy Gap')
    gap_ax.set_title('Overfitting Analysis')
    label_models(gap_ax, range(len(names)))
    gap_ax.legend()
    gap_ax.grid(True, alpha=0.3)

    # --- Epochs each model was trained for ---
    epoch_ax.bar(names, metric('epochs_trained'), color='#9b59b6')
    epoch_ax.set_ylabel('Epochs')
    epoch_ax.set_title('Training Epochs')
    label_models(epoch_ax, range(len(names)))
    epoch_ax.grid(True, alpha=0.3)

    # --- Validation F1 (grouped bars) ---
    macro_f1s = metric('val_macro_f1')
    weighted_f1s = metric('val_weighted_f1')

    f1_ax.bar(positions - bar_width/2, macro_f1s, bar_width, label='Macro F1', color='#1abc9c')
    f1_ax.bar(positions + bar_width/2, weighted_f1s, bar_width, label='Weighted F1', color='#34495e')
    f1_ax.set_ylabel('F1 Score')
    f1_ax.set_title('F1 Score Comparison')
    label_models(f1_ax, positions)
    f1_ax.legend()
    f1_ax.grid(True, alpha=0.3)

    fig.tight_layout()
    fig.savefig(save_path, dpi=150, bbox_inches='tight')
    plt.show()
    plt.close(fig)
    print(f"Model comparison plot saved to {save_path}")

## Main Training Pipeline

Betöltjük az adatokat és tanítjuk mind a 4 modellt:

In [None]:
# Configuration: every output location hangs off OUTPUT_DIR (default '../output').
base_output = os.getenv('OUTPUT_DIR', '../output')
processed_dir, models_dir, reports_dir = (
    os.path.join(base_output, subdir) for subdir in ('processed', 'models', 'reports')
)

# Only the directories this notebook writes to are created; 'processed' is read-only input.
for _output_dir in (models_dir, reports_dir):
    os.makedirs(_output_dir, exist_ok=True)

# Hyperparameters
model_name = os.getenv('TRANSFORMER_MODEL', 'SZTAKI-HLT/hubert-base-cc')  # base encoder checkpoint
batch_size = 8                 # samples per DataLoader batch
epochs = 15                    # upper bound; early stopping may end training sooner
learning_rate = 1.5e-5
weight_decay = 0.01
max_length = 320               # tokens per sample after padding/truncation
label_smoothing = 0.02
early_stopping_patience = 3    # epochs without improvement before stopping
grad_acc_steps = 2             # batches accumulated per optimizer step

print(f"Base Model: {model_name}")
print(f"Batch Size: {batch_size} | Epochs: {epochs} | LR: {learning_rate}")

In [None]:
# Load data: train.csv is required, val.csv is optional.
train_path, val_path = (
    os.path.join(processed_dir, csv_name) for csv_name in ("train.csv", "val.csv")
)

train_df = pd.read_csv(train_path)
if os.path.exists(val_path):
    val_df = pd.read_csv(val_path)
else:
    # Downstream cells treat None as "no validation set".
    val_df = None

print(f"Train samples: {len(train_df)}")
if val_df is not None:
    print(f"Val samples: {len(val_df)}")

display(train_df.head())

In [None]:
# Prepare labels: raw label strings -> contiguous class ids.
y_train_str = train_df['label'].astype(str).tolist()
y_train, label2id, id2label = build_ordinal_mapping(y_train_str)

# Persist both directions of the mapping next to the models.
# Note: JSON object keys are strings, so the integer keys are written as text.
label_map_path = os.path.join(models_dir, 'label_mapping.json')
label_mapping_json = json.dumps(
    {'label2id': label2id, 'id2label': id2label}, ensure_ascii=False, indent=2
)
with open(label_map_path, 'w', encoding='utf-8') as f:
    f.write(label_mapping_json)

num_labels = len(label2id)
print(f"Number of labels: {num_labels}")
print(f"Label mapping: {label2id}")

In [None]:
# Device: use the GPU when one is visible to PyTorch, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

if device.type == 'cuda':
    # Report the first GPU's name and its total memory in GiB.
    gpu_name = torch.cuda.get_device_name(0)
    total_mem = torch.cuda.get_device_properties(0).total_memory / (1024**3)
    print(f"GPU: {gpu_name} | Total VRAM: {total_mem:.2f} GB")

In [None]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Prepare datasets
X_train = train_df['text'].astype(str).tolist()
train_dataset = LegalTextDataset(X_train, y_train, tokenizer, max_length)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    # BatchNorm1d (used by Step2_ExtendedModel) raises in train mode when a
    # batch holds a single sample, so a trailing one-sample batch is dropped.
    drop_last=batch_size > 1 and len(train_dataset) % batch_size == 1,
    pin_memory=device.type=='cuda',
    num_workers=0
)

val_loader = None
if val_df is not None:
    y_val_str = val_df['label'].astype(str).tolist()
    y_val_numeric = [normalize_label(label) for label in y_val_str]

    # The mapping is built from the training labels only; fail with a clear
    # message instead of a bare KeyError when validation has an unseen label.
    unseen_labels = sorted(set(y_val_numeric) - set(label2id))
    if unseen_labels:
        raise ValueError(
            f"Validation set contains labels not present in the training set: {unseen_labels}"
        )

    y_val = [label2id[n] for n in y_val_numeric]
    X_val = val_df['text'].astype(str).tolist()
    val_dataset = LegalTextDataset(X_val, y_val, tokenizer, max_length)
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        pin_memory=device.type=='cuda',
        num_workers=0
    )

In [None]:
# Compute class weights: 'balanced' weights from the training labels,
# softened with a square root before being handed to the loss.
class_weights_raw = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=np.array(y_train),
)
class_weights = np.sqrt(class_weights_raw)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32, device=device)
print(f"Class weights (sqrt-scaled): {class_weights}")

# Shared loss for every model: class-weighted cross-entropy with label smoothing.
criterion = nn.CrossEntropyLoss(
    weight=class_weights_tensor,
    label_smoothing=label_smoothing,
)

## Train All 4 Models

Most tanítjuk mind a 4 modell architektúrát és összehasonlítjuk őket:

In [None]:
# Model configs: (name, class, description) for each architecture, in training order.
_model_specs = (
    ('Step1_Baseline', Step1_BaselineModel,
     'Minimal baseline: Transformer + single linear classifier'),
    ('Step2_Extended', Step2_ExtendedModel,
     '2-layer adapter + BatchNorm + Dropout(0.3)'),
    ('Step3_Advanced', Step3_AdvancedModel,
     'Attention pooling + 3-layer adapter + gating mechanism'),
    ('Final_Balanced', BalancedFinalModel,
     'PRODUCTION RECOMMENDED: Mean pooling + balanced architecture'),
)

model_configs = [
    {'name': spec_name, 'class': spec_class, 'description': spec_description}
    for spec_name, spec_class, spec_description in _model_specs
]

# Filled by the training loop: model name -> summary metrics.
all_results = dict()

In [None]:
# Train each model (simplified training loop for notebook)
for config in model_configs:
    print(f"\n{'='*80}")
    print(f"Training {config['name']}")
    print(f"Architecture: {config['description']}")
    print(f"{'='*80}\n")

    # Load fresh base transformer so no model starts from another's fine-tuned weights
    base_transformer = AutoModel.from_pretrained(model_name)
    model = config['class'](base_transformer, num_classes=num_labels)
    model.to(device)

    # Optimizer and scheduler (the schedule counts optimizer steps, not batches)
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    effective_steps_per_epoch = math.ceil(len(train_loader) / max(1, grad_acc_steps))
    total_steps = effective_steps_per_epoch * epochs
    warmup_steps = int(0.15 * total_steps)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

    # Training loop
    history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': [], 'val_macro_f1': [], 'val_weighted_f1': []}
    best_metric_val = -float('inf')
    best_epoch_idx = None  # index into `history` of the best validation epoch
    no_improve_epochs = 0

    for epoch in range(epochs):
        print(f"\nEpoch {epoch + 1}/{epochs}")

        train_loss, train_acc = train_epoch(model, train_loader, optimizer, scheduler, device, criterion, grad_acc_steps)
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)

        print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}")

        if val_loader is not None:
            val_loss, val_acc, val_preds, val_trues = evaluate(model, val_loader, device, criterion)
            history['val_loss'].append(val_loss)
            history['val_acc'].append(val_acc)
            val_macro_f1 = f1_score(val_trues, val_preds, average='macro')
            val_weighted_f1 = f1_score(val_trues, val_preds, average='weighted')
            history['val_macro_f1'].append(val_macro_f1)
            history['val_weighted_f1'].append(val_weighted_f1)
            print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}, Val Weighted-F1: {val_weighted_f1:.4f}")

            # Early stopping on validation weighted F1 (improvements below 1e-4 do not count)
            current = val_weighted_f1
            if current > best_metric_val + 1e-4:
                best_metric_val = current
                best_epoch_idx = epoch
                no_improve_epochs = 0
                print(f"✓ New best (val_weighted_f1 = {current:.4f})")
            else:
                no_improve_epochs += 1
                if no_improve_epochs >= early_stopping_patience:
                    print(f"Early stopping triggered")
                    break

    # Plot training history; empty series (no validation set) are left out so
    # the plot never receives x/y data of different lengths.
    history_plot_path = os.path.join(reports_dir, f'04-{config["name"].lower().replace(" ", "_")}_history.png')
    plot_training_history({key: values for key, values in history.items() if values}, history_plot_path)

    # Store results. Every metric is taken from the same epoch (the one with
    # the best validation weighted F1) so the summary does not mix best-epoch
    # and last-epoch numbers. Without validation data the last epoch is used
    # and validation metrics are 0.0; storing -inf would produce invalid JSON.
    has_validation = best_epoch_idx is not None
    report_idx = best_epoch_idx if has_validation else len(history['train_acc']) - 1
    all_results[config['name']] = {
        'name': config['name'],
        'train_acc': history['train_acc'][report_idx],
        'val_acc': history['val_acc'][report_idx] if has_validation else 0.0,
        'val_macro_f1': history['val_macro_f1'][report_idx] if has_validation else 0.0,
        'val_weighted_f1': best_metric_val if has_validation else 0.0,
        'best_epoch': report_idx + 1,
        'epochs_trained': len(history['train_acc'])
    }

    # Clean up GPU memory. The optimizer and scheduler also reference the
    # parameters (and the Adam state), so they must go as well, otherwise the
    # old model stays allocated while the next one is loaded.
    del base_transformer, model, optimizer, scheduler
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

## Results Comparison

In [None]:
# Summary: one text block per model, followed by a JSON dump of all results.
print(f"\n{'='*80}")
print(f"FINAL RESULTS SUMMARY")
print(f"{'='*80}\n")

for name, result in all_results.items():
    print(
        f"{name}:",
        f"  Train Acc: {result['train_acc']:.4f} | Val Acc: {result['val_acc']:.4f}",
        f"  Val Weighted F1: {result['val_weighted_f1']:.4f} | Val Macro F1: {result['val_macro_f1']:.4f}",
        f"  Epochs Trained: {result['epochs_trained']}",
        f"  Gap (Train-Val): {result['train_acc'] - result['val_acc']:.4f}",
        "",  # blank line between models
        sep="\n",
    )

# Save summary JSON
summary_path = os.path.join(reports_dir, '04-expansion_summary.json')
Path(summary_path).write_text(
    json.dumps(all_results, ensure_ascii=False, indent=2),
    encoding='utf-8',
)
print(f"Summary saved to {summary_path}")

In [None]:
# Generate comparison plot: one figure covering every entry of `all_results`.
comparison_plot_path = os.path.join(reports_dir, '04-model_expansion_comparison.png')
plot_model_comparison(all_results=all_results, save_path=comparison_plot_path)

In [None]:
# Identify best model: highest validation weighted F1 (the first one wins a tie).
best_name = max(all_results, key=lambda model_key: all_results[model_key]['val_weighted_f1'])
best_result = all_results[best_name]

print(f"\n{'='*80}")
print(f"BEST MODEL: {best_name}")
print(f"Val Weighted F1: {best_result['val_weighted_f1']:.4f}")
print(f"{'='*80}")