# BERT Model Optimization with MLflow and Optuna

This notebook implements hyperparameter optimization for the BERT text classification model using:
- **Optuna**: For hyperparameter optimization
- **MLflow**: For experiment tracking and model registry
- **Autologging**: Automatic logging of training metrics and artifacts

In [1]:
# Import required libraries
from transformers import BertModel, BertTokenizer, get_linear_schedule_with_warmup
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import time
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

# MLflow and Optuna imports
import mlflow
import mlflow.pytorch
import optuna
from optuna.integration.mlflow import MLflowCallback
import os

In [2]:
# Configuration and global parameters
RANDOM_SEED = 42
DATASET_PATH = "historias_clinicas_procesadas.xlsx"
NCLASS = 6
PRE_TRAINED_MODEL_NAME = "bert-base-multilingual-cased"
N_TRIALS = 20  # Number of Optuna trials
SAMPLE_SIZE = 500  # Sample size for faster experimentation

# Set random seeds
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
optuna.logging.set_verbosity(optuna.logging.WARNING)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# MLflow configuration
mlflow.set_experiment("BERT_Hyperparameter_Optimization")
mlflow.pytorch.autolog(log_models=True, log_datasets=True)

Using device: cuda:0


In [3]:
# Load and preprocess dataset
def load_and_preprocess_data():
    """Load and preprocess the dataset"""
    df = pd.read_excel(DATASET_PATH)
    
    # Sample data for faster experimentation
    df = df.sample(n=SAMPLE_SIZE, random_state=RANDOM_SEED).reset_index(drop=True)
    
    # Clean text data
    df["concatenada"] = df["concatenada"].apply(lambda x: str(x).replace("[","").replace("]",""))
    df["concatenada"] = df["concatenada"].apply(lambda x: str(x).replace("'",""))
    df["concatenada"] = df["concatenada"].apply(lambda x: str(x).replace(","," "))
    
    return df

# Load data
df = load_and_preprocess_data()
print(f"Dataset loaded with {len(df)} samples")
print(f"Classes distribution:")
print(df['grupo_codificado'].value_counts().sort_index())

Dataset loaded with 500 samples
Classes distribution:
grupo_codificado
0    106
1     27
2    151
3    109
4     42
5     65
Name: count, dtype: int64


In [4]:
# Custom Dataset Class
class CustomDataset(Dataset):
    def __init__(self, text, labels, tokenizer, max_len):
        self.text = text
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        text = str(self.text[item])
        label = self.labels[item]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        return {
            "text": text,
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(label, dtype=torch.long)
        }

In [7]:
# BERT Text Classifier with configurable parameters
class BERTTextClassifier(nn.Module):
    def __init__(self, n_classes, dropout_rate=0.3, hidden_layers=None):
        super(BERTTextClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.dropout = nn.Dropout(dropout_rate)
        
        # Optional additional hidden layers
        if hidden_layers:
            layers = []
            prev_size = self.bert.config.hidden_size
            for hidden_size in hidden_layers:
                layers.extend([
                    nn.Linear(prev_size, hidden_size),
                    nn.ReLU(),
                    nn.Dropout(dropout_rate)
                ])
                prev_size = hidden_size
            layers.append(nn.Linear(prev_size, n_classes))
            self.classifier = nn.Sequential(*layers)
        else:
            self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled_output = outputs.pooler_output
        output = self.dropout(pooled_output)
        return self.classifier(output)

In [8]:
# Data loader function
def create_data_loader(df, tokenizer, max_len, batch_size):
    """Create DataLoader for the dataset"""
    dataset = CustomDataset(
        text=df["concatenada"].to_numpy(),
        labels=df["grupo_codificado"].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(
        dataset, 
        batch_size=batch_size, 
        num_workers=0,  # Set to 0 for Windows compatibility
        pin_memory=torch.cuda.is_available()
    )

In [9]:
# Training and evaluation functions
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler):
    """Train model for one epoch"""
    model.train()
    losses = []
    correct_predictions = 0
    total_samples = 0
    
    for batch in data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, labels)
        
        correct_predictions += torch.sum(preds == labels)
        total_samples += labels.size(0)
        losses.append(loss.item())
        
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        if scheduler:
            scheduler.step()
        optimizer.zero_grad()
    
    return correct_predictions.double() / total_samples, np.mean(losses)

def eval_model(model, data_loader, loss_fn, device):
    """Evaluate model"""
    model.eval()
    losses = []
    correct_predictions = 0
    total_samples = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, labels)
            
            correct_predictions += torch.sum(preds == labels)
            total_samples += labels.size(0)
            losses.append(loss.item())
            
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    accuracy = correct_predictions.double() / total_samples
    avg_loss = np.mean(losses)
    
    # Calculate additional metrics
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='weighted', zero_division=0
    )
    
    return accuracy, avg_loss, precision, recall, f1

In [None]:
# Objective function for Optuna optimization
def objective(trial):
    """Optuna objective function for hyperparameter optimization"""
    
    # Start MLflow run for this trial
    with mlflow.start_run(nested=True):
        # Suggest hyperparameters
        params = {
            'max_len': trial.suggest_categorical('max_len', [128, 200, 256, 512]),
            'batch_size': trial.suggest_categorical('batch_size', [8, 16, 32]),
            'learning_rate': trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True),
            'epochs': trial.suggest_int('epochs', 4, 8),
            'dropout_rate': trial.suggest_float('dropout_rate', 0.3, 0.5),
            'weight_decay': trial.suggest_float('weight_decay', 0.1, 0.2),
            'warmup_steps_ratio': trial.suggest_float('warmup_steps_ratio', 0.1, 0.2),
            'use_scheduler': trial.suggest_categorical('use_scheduler', [True, False]),
            'hidden_layers': trial.suggest_categorical('hidden_layers', [None, [512], [512, 256]])
        }
        
        # Log hyperparameters
        mlflow.log_params(params)
        
        try:
            # Initialize tokenizer
            tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
            
            # Split data
            df_train, df_val = train_test_split(
                df, test_size=0.2, random_state=RANDOM_SEED, stratify=df['grupo_codificado']
            )
            
            # Create data loaders
            train_data_loader = create_data_loader(
                df_train, tokenizer, params['max_len'], params['batch_size']
            )
            val_data_loader = create_data_loader(
                df_val, tokenizer, params['max_len'], params['batch_size']
            )
            
            # Initialize model
            model = BERTTextClassifier(
                n_classes=NCLASS, 
                dropout_rate=params['dropout_rate'],
                hidden_layers=params['hidden_layers']
            )
            model = model.to(device)
            
            # Initialize optimizer and scheduler
            optimizer = optim.AdamW(
                model.parameters(), 
                lr=params['learning_rate'],
                weight_decay=params['weight_decay']
            )
            
            scheduler = None
            if params['use_scheduler']:
                total_steps = len(train_data_loader) * params['epochs']
                warmup_steps = int(total_steps * params['warmup_steps_ratio'])
                scheduler = get_linear_schedule_with_warmup(
                    optimizer,
                    num_warmup_steps=warmup_steps,
                    num_training_steps=total_steps
                )
            
            loss_fn = nn.CrossEntropyLoss().to(device)
            
            # Training loop
            best_val_accuracy = 0
            
            for epoch in range(params['epochs']):
                # Training
                train_acc, train_loss = train_epoch(
                    model, train_data_loader, loss_fn, optimizer, device, scheduler
                )
                
                # Validation
                val_acc, val_loss, val_precision, val_recall, val_f1 = eval_model(
                    model, val_data_loader, loss_fn, device
                )
                
                # Log metrics for this epoch
                mlflow.log_metrics({
                    'train_accuracy': train_acc.item(),
                    'train_loss': train_loss,
                    'val_accuracy': val_acc.item(),
                    'val_loss': val_loss,
                    'val_precision': val_precision,
                    'val_recall': val_recall,
                    'val_f1': val_f1
                }, step=epoch)
                
                # Update best validation accuracy
                if val_acc > best_val_accuracy:
                    best_val_accuracy = val_acc
                
                # Pruning: report intermediate value and check if trial should be pruned
                trial.report(val_acc.item(), epoch)
                if trial.should_prune():
                    raise optuna.exceptions.TrialPruned()
            
            # Log final metrics
            mlflow.log_metric('best_val_accuracy', best_val_accuracy.item())
            
            # Save model if it's among the best
            if best_val_accuracy > 0.7:  # Save only good models
                mlflow.pytorch.log_model(
                    model, 
                    "model",
                    registered_model_name=f"BERT_Classifier_Trial_{trial.number}"
                )
            
            return best_val_accuracy.item()
            
        except Exception as e:
            mlflow.log_param('error', str(e))
            print(f"Trial {trial.number} failed with error: {e}")
            return 0.0

In [11]:
# Create Optuna study with MLflow integration
def run_optimization():
    """Run hyperparameter optimization with Optuna and MLflow"""
    
    # Create MLflow callback for Optuna
    mlflc = MLflowCallback(
        tracking_uri=mlflow.get_tracking_uri(),
        metric_name="best_val_accuracy",
        create_experiment=False
    )
    
    # Create Optuna study
    study = optuna.create_study(
        direction="maximize",
        study_name="BERT_Hyperparameter_Optimization",
        pruner=optuna.pruners.MedianPruner(
            n_startup_trials=5,
            n_warmup_steps=2,
            interval_steps=1
        )
    )
    
    # Start MLflow parent run
    with mlflow.start_run(run_name="BERT_Hyperparameter_Optimization"):
        mlflow.log_param("n_trials", N_TRIALS)
        mlflow.log_param("sample_size", SAMPLE_SIZE)
        mlflow.log_param("n_classes", NCLASS)
        mlflow.log_param("pre_trained_model", PRE_TRAINED_MODEL_NAME)
        
        # Run optimization
        study.optimize(
            objective, 
            n_trials=N_TRIALS,
            callbacks=[mlflc],
            show_progress_bar=True
        )
        
        # Log best parameters and results
        best_params = study.best_params
        best_value = study.best_value
        
        mlflow.log_params({f"best_{k}": v for k, v in best_params.items()})
        mlflow.log_metric("best_accuracy", best_value)
        
        print(f"\nOptimization completed!")
        print(f"Best accuracy: {best_value:.4f}")
        print(f"Best parameters: {best_params}")
        
        return study, best_params, best_value

In [12]:
# Run the optimization
print("Starting hyperparameter optimization...")
print(f"Number of trials: {N_TRIALS}")
print(f"Using device: {device}")
print(f"Dataset size: {len(df)} samples")
print("-" * 50)

study, best_params, best_value = run_optimization()

Starting hyperparameter optimization...
Number of trials: 20
Using device: cuda:0
Dataset size: 500 samples
--------------------------------------------------


  0%|          | 0/20 [00:00<?, ?it/s]

Exception: Run with UUID 362ccf8b8b444339a335d9ac6bdfa85d is already active. To start a new run, first end the current run with mlflow.end_run(). To start a nested run, call start_run with nested=True

In [None]:
# Analyze optimization results
import matplotlib.pyplot as plt
import seaborn as sns

# Plot optimization history
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Plot optimization history
optuna.visualization.matplotlib.plot_optimization_history(study, ax=ax1)
ax1.set_title('Optimization History')

# Plot parameter importances
optuna.visualization.matplotlib.plot_param_importances(study, ax=ax2)
ax2.set_title('Parameter Importances')

plt.tight_layout()
plt.show()

# Display trial results
trials_df = study.trials_dataframe()
print("\nTop 10 trials:")
print(trials_df.nlargest(10, 'value')[['number', 'value', 'state']].to_string(index=False))

In [None]:
# Train final model with best parameters
def train_final_model(best_params):
    """Train final model with best hyperparameters"""
    
    with mlflow.start_run(run_name="Final_Best_Model"):
        # Log best parameters
        mlflow.log_params(best_params)
        
        # Initialize tokenizer
        tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
        
        # Split data (using larger validation set for final evaluation)
        df_train, df_test = train_test_split(
            df, test_size=0.3, random_state=RANDOM_SEED, stratify=df['grupo_codificado']
        )
        
        # Create data loaders
        train_data_loader = create_data_loader(
            df_train, tokenizer, best_params['max_len'], best_params['batch_size']
        )
        test_data_loader = create_data_loader(
            df_test, tokenizer, best_params['max_len'], best_params['batch_size']
        )
        
        # Initialize model
        model = BERTTextClassifier(
            n_classes=NCLASS, 
            dropout_rate=best_params['dropout_rate'],
            hidden_layers=best_params['hidden_layers']
        )
        model = model.to(device)
        
        # Initialize optimizer and scheduler
        optimizer = optim.AdamW(
            model.parameters(), 
            lr=best_params['learning_rate'],
            weight_decay=best_params['weight_decay']
        )
        
        scheduler = None
        if best_params['use_scheduler']:
            total_steps = len(train_data_loader) * best_params['epochs']
            warmup_steps = int(total_steps * best_params['warmup_steps_ratio'])
            scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=warmup_steps,
                num_training_steps=total_steps
            )
        
        loss_fn = nn.CrossEntropyLoss().to(device)
        
        # Training loop with progress tracking
        print("Training final model with best parameters...")
        
        for epoch in range(best_params['epochs']):
            print(f"Epoch {epoch + 1}/{best_params['epochs']}")
            
            # Training
            train_acc, train_loss = train_epoch(
                model, train_data_loader, loss_fn, optimizer, device, scheduler
            )
            
            # Testing
            test_acc, test_loss, test_precision, test_recall, test_f1 = eval_model(
                model, test_data_loader, loss_fn, device
            )
            
            # Log metrics
            mlflow.log_metrics({
                'train_accuracy': train_acc.item(),
                'train_loss': train_loss,
                'test_accuracy': test_acc.item(),
                'test_loss': test_loss,
                'test_precision': test_precision,
                'test_recall': test_recall,
                'test_f1': test_f1
            }, step=epoch)
            
            print(f"  Train - Acc: {train_acc:.4f}, Loss: {train_loss:.4f}")
            print(f"  Test  - Acc: {test_acc:.4f}, Loss: {test_loss:.4f}, F1: {test_f1:.4f}")
        
        # Save final model
        mlflow.pytorch.log_model(
            model, 
            "final_model",
            registered_model_name="BERT_Text_Classifier_Final"
        )
        
        print(f"\nFinal model training completed!")
        print(f"Final test accuracy: {test_acc:.4f}")
        print(f"Final test F1-score: {test_f1:.4f}")
        
        return model, test_acc.item(), test_f1

# Train final model
final_model, final_accuracy, final_f1 = train_final_model(best_params)

In [None]:
# Summary and model registry information
print("=" * 60)
print("OPTIMIZATION SUMMARY")
print("=" * 60)
print(f"Total trials run: {len(study.trials)}")
print(f"Best validation accuracy: {best_value:.4f}")
print(f"Final test accuracy: {final_accuracy:.4f}")
print(f"Final test F1-score: {final_f1:.4f}")
print("\nBest hyperparameters:")
for param, value in best_params.items():
    print(f"  {param}: {value}")

print("\n" + "=" * 60)
print("MLflow Tracking:")
print(f"Experiment: {mlflow.get_experiment_by_name('BERT_Hyperparameter_Optimization').name}")
print(f"Tracking URI: {mlflow.get_tracking_uri()}")
print("\nView your experiments in MLflow UI by running:")
print("mlflow ui")
print("=" * 60)