# Transformer Training for Relay Optimization

This notebook trains a transformer network with Optuna to learn from relay optimization data and generalize optimal values.

## Objectives:
1. Load existing GA optimization data
2. Train a transformer to predict optimal TDS and pickup values
3. Use Optuna to optimize model hyperparameters
4. Generalize optimization values for new scenarios

**🚀 SINGLE CELL EXECUTION - Run All Button Compatible**


In [1]:
# =============================================================================
# TRANSFORMER TRAINING FOR RELAY OPTIMIZATION - COMPLETE EXECUTION
# =============================================================================

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
import optuna
import math
import pickle
import json
import os
from pathlib import Path
from collections import defaultdict
import warnings
import time
warnings.filterwarnings('ignore')

print("üöÄ TRANSFORMER TRAINING - COMPLETE EXECUTION")
print("=" * 60)

# Device configuration: use CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# =============================================================================
# CONFIGURATION AND PATHS
# =============================================================================

# The project root defaults to the original author's checkout. Set the
# AUTODOC_MG_ROOT environment variable to run the notebook from another
# location without editing this cell.
PROJECT_ROOT = Path(
    os.environ.get(
        "AUTODOC_MG_ROOT",
        "/Users/gustavo/Documents/Projects/TESIS_UNAL/AutoDOC-MG",
    )
)
MODEL_DIR = PROJECT_ROOT / "models" / "transformer"
# Create the output directory up front so later save steps cannot fail on it.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Input data: raw relay-pair records and the GA optimization results.
RAW_DATA_PATH = PROJECT_ROOT / "data" / "raw" / "automation_results.json"
OPTIMIZATION_RESULTS_PATH = PROJECT_ROOT / "data" / "processed" / "ga_optimization_all_scenarios_comprehensive_20251008_224215.json"

# Model files
MODEL_PATH = MODEL_DIR / "best_relay_optimization_transformer.pth"
SCALER_INPUT_PATH = MODEL_DIR / "scaler_input.pkl"
SCALER_TARGET_PATH = MODEL_DIR / "scaler_target.pkl"
BEST_PARAMS_PATH = MODEL_DIR / "best_params.json"
TRAINING_SUMMARY_PATH = MODEL_DIR / "training_summary.json"

print(f"üìÅ Model directory: {MODEL_DIR}")
print(f"üìÇ Data paths configured")

# =============================================================================
# TRANSFORMER MODEL ARCHITECTURE
# =============================================================================

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position codes to a sequence-first tensor.

    The table is built once for ``max_len`` positions and stored as the
    buffer ``pe`` with shape ``(max_len, 1, d_model)``. Even feature columns
    hold sines and odd columns hold cosines.

    Args:
        d_model: Width of the feature dimension. Both even and odd values
            are supported.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer cosine column than sine
        # column, so trim the frequencies to the number of odd columns.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model) so the table broadcasts
        # over the middle dimension of the input.
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the codes for its first ``x.size(0)`` positions."""
        return x + self.pe[:x.size(0), :]

class RelayOptimizationTransformer(nn.Module):
    """Transformer-encoder regressor mapping relay features to settings.

    Pipeline: linear projection to ``d_model`` (scaled by ``sqrt(d_model)``),
    positional encoding, a stack of encoder layers, mean pooling over the
    sequence dimension, then a linear projection to ``output_dim``.

    Args:
        input_dim: Number of input features per sequence element.
        output_dim: Number of regression outputs.
        d_model: Internal embedding width.
        nhead: Number of attention heads per encoder layer.
        num_encoder_layers: Number of stacked encoder layers.
        dim_feedforward: Hidden width of each layer's feed-forward part.
        dropout: Dropout probability passed to the encoder layers.
    """

    def __init__(self, input_dim, output_dim, d_model, nhead, num_encoder_layers, dim_feedforward, dropout=0.1):
        super().__init__()
        self.input_dim, self.output_dim, self.d_model = input_dim, output_dim, d_model

        self.input_proj = nn.Linear(input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=False,
            ),
            num_layers=num_encoder_layers,
        )
        self.output_proj = nn.Linear(d_model, output_dim)

        self._init_weights()

    def _init_weights(self):
        """Re-draw both projection weight matrices uniformly in [-0.1, 0.1]."""
        bound = 0.1
        for projection in (self.input_proj, self.output_proj):
            projection.weight.data.uniform_(-bound, bound)

    def forward(self, src):
        """Run the network on ``src``.

        A 2-D input ``(batch_size, features)`` is treated as a batch of
        length-1 sequences; a 3-D input is taken as
        ``(batch_size, sequence_length, features)``.

        Returns:
            Tensor of shape ``(batch_size, output_dim)``.
        """
        if src.dim() == 2:
            # Insert a sequence dimension of length one.
            src = src.unsqueeze(1)

        embedded = self.input_proj(src) * math.sqrt(self.d_model)
        # The encoder was built with batch_first=False, so switch to
        # (sequence_length, batch_size, features) before encoding.
        encoded = self.transformer_encoder(self.pos_encoder(embedded.permute(1, 0, 2)))
        # Back to batch-first, then average over the sequence dimension.
        pooled = encoded.permute(1, 0, 2).mean(dim=1)
        return self.output_proj(pooled)

# Progress marker printed once both model classes above have been defined.
print("‚úÖ Transformer model architecture defined")

# =============================================================================
# DATA LOADING AND PREPROCESSING
# =============================================================================

def load_and_process_data(raw_data_path=None, optimization_results_path=None):
    """Build supervised samples by joining raw relay pairs with GA results.

    Each relay pair whose main and backup relays both appear in the GA
    results of its scenario becomes one sample. Scenarios that are absent
    from the raw data, and pairs with a relay that was not optimized, are
    skipped.

    Args:
        raw_data_path: JSON file holding a list of relay-pair records.
            Defaults to the module-level ``RAW_DATA_PATH``.
        optimization_results_path: JSON file whose ``optimization_results``
            entry maps scenario ids to GA results. Defaults to the
            module-level ``OPTIMIZATION_RESULTS_PATH``.

    Returns:
        A list of dicts with keys ``input`` (6 features), ``target``
        (4 values: main TDS, main pickup, backup TDS, backup pickup),
        ``scenario_id``, ``main_relay`` and ``backup_relay``.

    Raises:
        ValueError: If no sample could be built.
    """
    if raw_data_path is None:
        raw_data_path = RAW_DATA_PATH
    if optimization_results_path is None:
        optimization_results_path = OPTIMIZATION_RESULTS_PATH

    print("üîÑ Loading and processing data...")

    # Raw relay-pair records
    with open(raw_data_path, 'r', encoding='utf-8') as f:
        raw_data = json.load(f)

    # GA optimization results
    with open(optimization_results_path, 'r', encoding='utf-8') as f:
        ga_results = json.load(f)

    ga_by_scenario = ga_results['optimization_results']

    print(f"üìä Raw data loaded: {len(raw_data)} relay pairs")
    print(f"üìä GA results loaded: {len(ga_by_scenario)} scenarios")

    # Group the raw records by scenario
    raw_by_scenario = defaultdict(list)
    for entry in raw_data:
        raw_by_scenario[entry['scenario_id']].append(entry)

    training_data = []

    for scenario_id, scenario_ga in ga_by_scenario.items():
        # .get() avoids inserting empty lists into the defaultdict.
        scenario_raw = raw_by_scenario.get(scenario_id)
        if not scenario_raw:
            continue

        optimized_relays = scenario_ga['relay_values']
        # The number of pairs in the scenario is itself an input feature.
        scenario_size = len(scenario_raw)

        for relay_pair in scenario_raw:
            main_relay = relay_pair['main_relay']
            backup_relay = relay_pair['backup_relay']
            main_relay_id = main_relay['relay']
            backup_relay_id = backup_relay['relay']

            # Keep the pair only when both relays were optimized by the GA.
            if main_relay_id not in optimized_relays or backup_relay_id not in optimized_relays:
                continue

            main_optimum = optimized_relays[main_relay_id]
            backup_optimum = optimized_relays[backup_relay_id]

            training_data.append({
                'input': [
                    float(relay_pair['fault']),
                    main_relay['Ishc'],
                    main_relay['Time_out'],
                    backup_relay['Ishc'],
                    backup_relay['Time_out'],
                    scenario_size,
                ],
                'target': [
                    main_optimum['TDS'],
                    main_optimum['pickup'],
                    backup_optimum['TDS'],
                    backup_optimum['pickup'],
                ],
                'scenario_id': scenario_id,
                'main_relay': main_relay_id,
                'backup_relay': backup_relay_id,
            })

    print(f"üìä Training dataset created: {len(training_data)} samples")
    print(f"üìä Scenarios included: {len(set(d['scenario_id'] for d in training_data))}")

    if not training_data:
        raise ValueError("No training data created. Check data files.")

    return training_data

# Load the data: build the list of training samples from the JSON files.
training_data = load_and_process_data()

# =============================================================================
# DATA PREPARATION FOR TRAINING
# =============================================================================

def prepare_training_data(training_data):
    """Split the samples 80/20 and standardize inputs and targets.

    Both scalers are fitted on the training portion only and then applied
    to the validation portion.

    NOTE(review): the split is made per sample, not per scenario, so pairs
    from one scenario can land on both sides - confirm this is intended.

    Args:
        training_data: Sequence of dicts with ``input`` and ``target`` lists.

    Returns:
        ``(X_train, X_val, y_train, y_val, scaler_input, scaler_target)``
        where the four arrays are already scaled.
    """
    print("üîÑ Preparing training data...")

    # Stack per-sample feature and target lists into 2-D arrays.
    features = np.array([sample['input'] for sample in training_data])
    targets = np.array([sample['target'] for sample in training_data])

    # Fixed seed keeps the train/validation partition reproducible.
    feat_train, feat_val, targ_train, targ_val = train_test_split(
        features, targets, test_size=0.2, random_state=42, shuffle=True
    )

    # Standardize the inputs.
    scaler_input = StandardScaler()
    feat_train_std = scaler_input.fit_transform(feat_train)
    feat_val_std = scaler_input.transform(feat_val)

    # Standardize the targets.
    scaler_target = StandardScaler()
    targ_train_std = scaler_target.fit_transform(targ_train)
    targ_val_std = scaler_target.transform(targ_val)

    print(f"üìä Training samples: {len(feat_train_std)}")
    print(f"üìä Validation samples: {len(feat_val_std)}")
    print(f"üìä Input features: {feat_train_std.shape[1]}")
    print(f"üìä Output features: {targ_train_std.shape[1]}")

    return (
        feat_train_std,
        feat_val_std,
        targ_train_std,
        targ_val_std,
        scaler_input,
        scaler_target,
    )

# Prepare the data: scaled train/validation arrays plus the fitted scalers.
X_train, X_val, y_train, y_val, scaler_input, scaler_target = prepare_training_data(training_data)

# =============================================================================
# TRAINING FUNCTIONS
# =============================================================================

def train_epoch(model, dataloader, optimizer, criterion, device):
    """Train ``model`` for one pass over ``dataloader``.

    The returned loss is weighted by batch size, so a shorter final batch
    does not count as much as a full one. This assumes ``criterion``
    returns a per-batch mean, which is the default for ``nn.MSELoss``.

    Args:
        model: Network to optimize; it is switched to training mode.
        dataloader: Iterable yielding ``(inputs, targets)`` tensor pairs.
        optimizer: Optimizer bound to ``model``'s parameters.
        criterion: Loss callable ``criterion(outputs, targets)``.
        device: Device the batches are moved to.

    Returns:
        The per-sample average training loss for the epoch, as a float.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.train()
    loss_sum = 0.0
    sample_count = 0

    for batch_X, batch_y in dataloader:
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()
        loss = criterion(model(batch_X), batch_y)
        loss.backward()
        optimizer.step()

        batch_len = batch_X.size(0)
        loss_sum += loss.item() * batch_len
        sample_count += batch_len

    if sample_count == 0:
        raise ValueError("dataloader yielded no samples")

    return loss_sum / sample_count

def validate_epoch(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without updating its weights.

    The returned loss is weighted by batch size, so a shorter final batch
    does not count as much as a full one. This assumes ``criterion``
    returns a per-batch mean, which is the default for ``nn.MSELoss``.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        dataloader: Iterable yielding ``(inputs, targets)`` tensor pairs.
        criterion: Loss callable ``criterion(outputs, targets)``.
        device: Device the batches are moved to.

    Returns:
        The per-sample average validation loss, as a float.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    sample_count = 0

    with torch.no_grad():
        for batch_X, batch_y in dataloader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)

            loss = criterion(model(batch_X), batch_y)

            batch_len = batch_X.size(0)
            loss_sum += loss.item() * batch_len
            sample_count += batch_len

    if sample_count == 0:
        raise ValueError("dataloader yielded no samples")

    return loss_sum / sample_count

def objective(trial):
    """Optuna objective: train one candidate and return its best val loss.

    Samples architecture and optimizer hyperparameters from ``trial``,
    trains for at most 20 epochs on the module-level ``X_train``/``y_train``
    with early stopping (patience 5) on ``X_val``/``y_val``, and reports
    every epoch's validation loss to Optuna so the pruner sees the full
    curve.

    Args:
        trial: The ``optuna.trial.Trial`` providing the suggestions.

    Returns:
        The lowest validation loss observed during the trial.

    Raises:
        optuna.exceptions.TrialPruned: When the study's pruner asks to stop.
    """
    # Model hyperparameters
    d_model = trial.suggest_categorical('d_model', [32, 64, 128])
    nhead = trial.suggest_categorical('nhead', [4, 8, 16])
    num_encoder_layers = trial.suggest_int('num_encoder_layers', 2, 6)
    dim_feedforward = trial.suggest_categorical('dim_feedforward', [256, 512, 1024])
    dropout = trial.suggest_float('dropout', 0.1, 0.3)

    # Training hyperparameters
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64])
    weight_decay = trial.suggest_float('weight_decay', 1e-6, 1e-4, log=True)

    # Take layer widths from the prepared arrays rather than repeating them
    # as literals, so a change to the feature set cannot drift out of sync.
    model = RelayOptimizationTransformer(
        input_dim=X_train.shape[1],
        output_dim=y_train.shape[1],
        d_model=d_model,
        nhead=nhead,
        num_encoder_layers=num_encoder_layers,
        dim_feedforward=dim_feedforward,
        dropout=dropout
    ).to(device)

    # Optimizer and loss function
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    criterion = nn.MSELoss()

    # Data loaders
    train_dataset = TensorDataset(
        torch.tensor(X_train, dtype=torch.float32),
        torch.tensor(y_train, dtype=torch.float32)
    )
    val_dataset = TensorDataset(
        torch.tensor(X_val, dtype=torch.float32),
        torch.tensor(y_val, dtype=torch.float32)
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    best_val_loss = float('inf')
    patience = 5
    patience_counter = 0

    for epoch in range(20):  # At most 20 epochs per trial
        train_epoch(model, train_loader, optimizer, criterion, device)
        val_loss = validate_epoch(model, val_loader, criterion, device)

        # Report before any early exit so the last epoch is not lost.
        trial.report(val_loss, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

        # Early stopping on stalled validation loss
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                break

    return best_val_loss

print("‚úÖ Training functions defined")

# =============================================================================
# OPTUNA OPTIMIZATION
# =============================================================================

print("üîÑ Starting Optuna optimization...")

# Create the Optuna study: minimize the objective's return value and prune
# with a MedianPruner configured with n_startup_trials=5, n_warmup_steps=10.
study = optuna.create_study(
    direction='minimize',
    pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=10)
)

# Run the hyperparameter search.
study.optimize(objective, n_trials=20, timeout=1800)  # 20 trials, 30 min timeout

# Summarize the best trial found by the study.
print(f"‚úÖ Optuna optimization completed!")
print(f"üìä Best trial: {study.best_trial.number}")
print(f"üìä Best validation loss: {study.best_value:.6f}")
print(f"üìä Best parameters: {study.best_params}")

# =============================================================================
# FINAL MODEL TRAINING
# =============================================================================

print("üîÑ Training final model with best parameters...")

# Hyperparameters of the winning Optuna trial
best_params = study.best_params

# Rebuild the network using the architecture settings of the best trial.
final_model = RelayOptimizationTransformer(
    input_dim=6,
    output_dim=4,
    **{
        name: best_params[name]
        for name in ('d_model', 'nhead', 'num_encoder_layers', 'dim_feedforward', 'dropout')
    }
).to(device)

# Optimizer and loss function
optimizer = optim.Adam(
    final_model.parameters(),
    weight_decay=best_params['weight_decay'],
    lr=best_params['learning_rate'],
)
criterion = nn.MSELoss()

# Wrap the scaled arrays as float32 tensor datasets and batch them.
train_dataset = TensorDataset(
    *(torch.tensor(array, dtype=torch.float32) for array in (X_train, y_train))
)
val_dataset = TensorDataset(
    *(torch.tensor(array, dtype=torch.float32) for array in (X_val, y_val))
)

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=best_params['batch_size'])
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=best_params['batch_size'])

# Loss history and early-stopping state for the final run
train_losses, val_losses = [], []
best_val_loss = float('inf')
patience = 10
patience_counter = 0

print("üîÑ Training final model...")

for epoch in range(50):  # At most 50 epochs
    train_loss = train_epoch(final_model, train_loader, optimizer, criterion, device)
    val_loss = validate_epoch(final_model, val_loader, criterion, device)

    train_losses.append(train_loss)
    val_losses.append(val_loss)

    # Log every fifth epoch.
    if not epoch % 5:
        print(f"   Epoch {epoch:3d}: Train Loss = {train_loss:.6f}, Val Loss = {val_loss:.6f}")

    if val_loss < best_val_loss:
        # New best: reset the patience counter and checkpoint the weights.
        best_val_loss, patience_counter = val_loss, 0
        torch.save(final_model.state_dict(), MODEL_PATH)
        continue

    # No improvement this epoch; stop once patience is exhausted.
    patience_counter += 1
    if patience_counter >= patience:
        print(f"   Early stopping at epoch {epoch}")
        break

print(f"‚úÖ Final model training completed!")
print(f"üìä Best validation loss: {best_val_loss:.6f}")
print(f"üìä Total epochs: {len(train_losses)}")

# =============================================================================
# SAVE MODEL AND ARTIFACTS
# =============================================================================

print("üîÑ Saving model and artifacts...")

# datetime is needed for the timestamp: time.strftime has no %f directive,
# so the microsecond field was never filled in.
from datetime import datetime

# Save the fitted scalers so inference can reuse the training statistics.
with open(SCALER_INPUT_PATH, 'wb') as f:
    pickle.dump(scaler_input, f)

with open(SCALER_TARGET_PATH, 'wb') as f:
    pickle.dump(scaler_target, f)

# Save the best hyperparameters.
with open(BEST_PARAMS_PATH, 'w') as f:
    json.dump(best_params, f, indent=4)

# Save a summary of the final training run.
training_summary = {
    'best_val_loss': best_val_loss,
    'best_params': best_params,
    'train_losses': train_losses,
    'val_losses': val_losses,
    'num_final_epochs': len(train_losses),
    'training_date': datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%f'),
    'mode': 'SINGLE_CELL_EXECUTION'
}

with open(TRAINING_SUMMARY_PATH, 'w') as f:
    json.dump(training_summary, f, indent=4)

print(f"‚úÖ Model and artifacts saved!")
print(f"üìÅ Model file: {MODEL_PATH}")
print(f"üìÅ Scaler input: {SCALER_INPUT_PATH}")
print(f"üìÅ Scaler target: {SCALER_TARGET_PATH}")
print(f"üìÅ Best params: {BEST_PARAMS_PATH}")
print(f"üìÅ Training summary: {TRAINING_SUMMARY_PATH}")

# =============================================================================
# MODEL EVALUATION
# =============================================================================

print("üîÑ Evaluating final model...")

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Restore the checkpoint with the lowest validation loss.
final_model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
final_model.eval()

# Predict on the validation inputs; gradients are not needed here.
with torch.no_grad():
    val_predictions = final_model(
        torch.tensor(X_val, dtype=torch.float32).to(device)
    ).cpu().numpy()

# Undo the target scaling so the metrics are in the original target units.
val_predictions_denorm = scaler_target.inverse_transform(val_predictions)
val_targets_denorm = scaler_target.inverse_transform(y_val)

# Regression metrics over all four outputs
mse = mean_squared_error(val_targets_denorm, val_predictions_denorm)
mae = mean_absolute_error(val_targets_denorm, val_predictions_denorm)
r2 = r2_score(val_targets_denorm, val_predictions_denorm)

print("üìä Final Model Performance:")
print(f"   ‚Ä¢ MSE: {mse:.6f}")
print(f"   ‚Ä¢ MAE: {mae:.6f}")
print(f"   ‚Ä¢ R¬≤: {r2:.6f}")
print(f"   ‚Ä¢ RMSE: {np.sqrt(mse):.6f}")

# =============================================================================
# TRANSFORMER PREDICTOR CLASS
# =============================================================================

class RelayOptimizationPredictor:
    """Load the trained transformer with its scalers and predict settings.

    Predictions are de-normalized with the target scaler and clamped to
    ``TDS_LIMITS`` / ``PICKUP_LIMITS`` before being returned as built-in
    floats, so the result can be passed directly to ``json.dump``.
    """

    # Feature and target widths the network is built with.
    INPUT_DIM = 6
    OUTPUT_DIM = 4
    # (lower, upper) clamp applied to each de-normalized prediction.
    TDS_LIMITS = (0.05, 0.8)
    PICKUP_LIMITS = (0.05, 2.0)

    def __init__(self, model_path, scaler_input_path, scaler_target_path, best_params_path):
        """Restore hyperparameters, scalers and model weights from disk.

        Args:
            model_path: File holding the model ``state_dict``.
            scaler_input_path: Pickle file with the fitted input scaler.
            scaler_target_path: Pickle file with the fitted target scaler.
            best_params_path: JSON file with the architecture hyperparameters.
        """
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Load hyperparameters
        with open(best_params_path, 'r') as f:
            self.best_params = json.load(f)

        # Load scalers.
        # NOTE(security): pickle.load can execute arbitrary code contained in
        # the file; only point this at scaler files you produced yourself.
        with open(scaler_input_path, 'rb') as f:
            self.scaler_input = pickle.load(f)
        with open(scaler_target_path, 'rb') as f:
            self.scaler_target = pickle.load(f)

        # Build the model
        self.model = RelayOptimizationTransformer(
            input_dim=self.INPUT_DIM,
            output_dim=self.OUTPUT_DIM,
            d_model=self.best_params['d_model'],
            nhead=self.best_params['nhead'],
            num_encoder_layers=self.best_params['num_encoder_layers'],
            dim_feedforward=self.best_params['dim_feedforward'],
            dropout=self.best_params['dropout']
        ).to(self.device)

        # Load weights
        self.model.load_state_dict(torch.load(model_path, map_location=self.device))
        self.model.eval()

    @staticmethod
    def _clamp(value, limits):
        """Clamp ``value`` into ``limits`` and return a built-in float."""
        lower, upper = limits
        return float(max(lower, min(upper, value)))

    def predict_optimization(self, relay_data):
        """Predict TDS and pickup values for every relay pair.

        All pairs are scaled and pushed through the model in one batch.

        Args:
            relay_data: List of dicts, each with ``fault`` and with
                ``main_relay`` / ``backup_relay`` sub-dicts providing
                ``relay``, ``Ishc`` and ``Time_out``. The length of the
                list is itself used as an input feature.

        Returns:
            A list, parallel to ``relay_data``, of dicts with
            ``main_relay`` and ``backup_relay`` entries, each holding the
            relay id plus clamped ``TDS`` and ``pickup`` floats.
        """
        if not relay_data:
            return []

        pair_count = len(relay_data)
        feature_rows = [
            [
                float(relay_pair['fault']),
                relay_pair['main_relay']['Ishc'],
                relay_pair['main_relay']['Time_out'],
                relay_pair['backup_relay']['Ishc'],
                relay_pair['backup_relay']['Time_out'],
                pair_count,
            ]
            for relay_pair in relay_data
        ]

        # Normalize, predict in a single forward pass, then de-normalize.
        inputs_normalized = self.scaler_input.transform(feature_rows)
        with torch.no_grad():
            input_tensor = torch.tensor(inputs_normalized, dtype=torch.float32).to(self.device)
            outputs = self.model(input_tensor).cpu().numpy().reshape(-1, self.OUTPUT_DIM)
        outputs_denorm = self.scaler_target.inverse_transform(outputs)

        return [
            {
                'main_relay': {
                    'relay': relay_pair['main_relay']['relay'],
                    'TDS': self._clamp(values[0], self.TDS_LIMITS),
                    'pickup': self._clamp(values[1], self.PICKUP_LIMITS)
                },
                'backup_relay': {
                    'relay': relay_pair['backup_relay']['relay'],
                    'TDS': self._clamp(values[2], self.TDS_LIMITS),
                    'pickup': self._clamp(values[3], self.PICKUP_LIMITS)
                }
            }
            for relay_pair, values in zip(relay_data, outputs_denorm)
        ]

# Source of the standalone predictor module written next to the model files.
# This is a plain string on purpose: it has no placeholders, so an f-string
# would only force every literal brace in the embedded code to be doubled.
predictor_code = """
import torch
import torch.nn as nn
import numpy as np
import pickle
import json
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return x

class RelayOptimizationTransformer(nn.Module):
    def __init__(self, input_dim, output_dim, d_model, nhead, num_encoder_layers, dim_feedforward, dropout=0.1):
        super(RelayOptimizationTransformer, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.d_model = d_model

        self.input_proj = nn.Linear(input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=False
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
        self.output_proj = nn.Linear(d_model, output_dim)
        
        self._init_weights()

    def _init_weights(self):
        initrange = 0.1
        self.input_proj.weight.data.uniform_(-initrange, initrange)
        self.output_proj.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        # Asegurar que src tenga 3 dimensiones (batch_size, sequence_length, features)
        if src.dim() == 2:
            # Si src tiene 2 dimensiones, agregar una dimensi√≥n de secuencia
            src = src.unsqueeze(1)  # (batch_size, 1, features)
        
        src = self.input_proj(src) * math.sqrt(self.d_model)
        # Transformer espera (sequence_length, batch_size, features)
        src = src.permute(1, 0, 2)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        # Volver a (batch_size, sequence_length, features)
        output = output.permute(1, 0, 2)
        # Promedio sobre la secuencia para obtener (batch_size, features)
        output = output.mean(dim=1)
        output = self.output_proj(output)
        return output

class RelayOptimizationPredictor:
    def __init__(self, model_path, scaler_input_path, scaler_target_path, best_params_path):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        # Cargar par√°metros
        with open(best_params_path, 'r') as f:
            self.best_params = json.load(f)
        
        # Cargar scalers
        with open(scaler_input_path, 'rb') as f:
            self.scaler_input = pickle.load(f)
        with open(scaler_target_path, 'rb') as f:
            self.scaler_target = pickle.load(f)
        
        # Crear modelo
        self.model = RelayOptimizationTransformer(
            input_dim=6,
            output_dim=4,
            d_model=self.best_params['d_model'],
            nhead=self.best_params['nhead'],
            num_encoder_layers=self.best_params['num_encoder_layers'],
            dim_feedforward=self.best_params['dim_feedforward'],
            dropout=self.best_params['dropout']
        ).to(self.device)
        
        # Cargar pesos
        self.model.load_state_dict(torch.load(model_path, map_location=self.device))
        self.model.eval()
    
    def predict_optimization(self, relay_data):
        predictions = []
        
        with torch.no_grad():
            for relay_pair in relay_data:
                # Preparar caracter√≠sticas de entrada
                input_features = [
                    float(relay_pair['fault']),
                    relay_pair['main_relay']['Ishc'],
                    relay_pair['main_relay']['Time_out'],
                    relay_pair['backup_relay']['Ishc'],
                    relay_pair['backup_relay']['Time_out'],
                    len(relay_data)
                ]
                
                # Normalizar entrada
                input_normalized = self.scaler_input.transform([input_features])
                
                # Convertir a tensor
                input_tensor = torch.tensor(input_normalized, dtype=torch.float32).to(self.device)
                
                # Hacer predicci√≥n
                prediction = self.model(input_tensor)
                prediction_np = prediction.cpu().numpy().reshape(-1, 4)[0]
                
                # Desnormalizar predicci√≥n
                prediction_denorm = self.scaler_target.inverse_transform([prediction_np])[0]
                
                # Crear resultado
                result = {
                    'main_relay': {
                        'relay': relay_pair['main_relay']['relay'],
                        'TDS': max(0.05, min(0.8, prediction_denorm[0])),
                        'pickup': max(0.05, min(2.0, prediction_denorm[1]))
                    },
                    'backup_relay': {
                        'relay': relay_pair['backup_relay']['relay'],
                        'TDS': max(0.05, min(0.8, prediction_denorm[2])),
                        'pickup': max(0.05, min(2.0, prediction_denorm[3]))
                    }
                }
                
                predictions.append(result)
        
        return predictions
"""

# Write the predictor module. The embedded source contains non-ASCII
# characters, so the encoding is fixed explicitly rather than left to the
# platform default.
predictor_path = MODEL_DIR / "transformer_predictor.py"
with open(predictor_path, 'w', encoding='utf-8') as f:
    f.write(predictor_code)

print(f"‚úÖ Predictor class saved: {predictor_path}")

# =============================================================================
# EXECUTION COMPLETED
# =============================================================================

# Closing banner
print()
print("=" * 60)
print("üéâ TRANSFORMER TRAINING COMPLETED SUCCESSFULLY!")
print("=" * 60)

# Headline numbers of the run
print("\nüìä TRAINING SUMMARY:")
print(f"   ‚Ä¢ Total training samples: {len(X_train)}")
print(f"   ‚Ä¢ Total validation samples: {len(X_val)}")
print(f"   ‚Ä¢ Best validation loss: {best_val_loss:.6f}")
print(f"   ‚Ä¢ Final R¬≤ score: {r2:.6f}")
print(f"   ‚Ä¢ Training epochs: {len(train_losses)}")

# Locations of every artifact written by this notebook
print("\nüìÅ FILES CREATED:")
print(f"   ‚Ä¢ Model: {MODEL_PATH}")
print(f"   ‚Ä¢ Input scaler: {SCALER_INPUT_PATH}")
print(f"   ‚Ä¢ Target scaler: {SCALER_TARGET_PATH}")
print(f"   ‚Ä¢ Best parameters: {BEST_PARAMS_PATH}")
print(f"   ‚Ä¢ Training summary: {TRAINING_SUMMARY_PATH}")
print(f"   ‚Ä¢ Predictor class: {predictor_path}")

# Suggested follow-up actions
print("\nüöÄ NEXT STEPS:")
print("   1. Run the validation notebook to test the model")
print("   2. Use the predictor class for new relay optimizations")
print("   3. Deploy the model for production use")

print("\n‚úÖ All operations completed successfully!")


[I 2025-10-09 17:13:05,617] A new study created in memory with name: no-name-7e66e918-c70e-47e9-acef-c40302dc5635


üöÄ TRANSFORMER TRAINING - COMPLETE EXECUTION
Using device: cpu
üìÅ Model directory: /Users/gustavo/Documents/Projects/TESIS_UNAL/AutoDOC-MG/models/transformer
üìÇ Data paths configured
‚úÖ Transformer model architecture defined
üîÑ Loading and processing data...
üìä Raw data loaded: 6800 relay pairs
üìä GA results loaded: 68 scenarios
üìä Training dataset created: 6732 samples
üìä Scenarios included: 68
üîÑ Preparing training data...
üìä Training samples: 5385
üìä Validation samples: 1347
üìä Input features: 6
üìä Output features: 4
‚úÖ Training functions defined
üîÑ Starting Optuna optimization...


[I 2025-10-09 17:13:14,221] Trial 0 finished with value: 0.6408095047917477 and parameters: {'d_model': 64, 'nhead': 4, 'num_encoder_layers': 3, 'dim_feedforward': 512, 'dropout': 0.11441735456773089, 'learning_rate': 0.0004799477427860132, 'batch_size': 32, 'weight_decay': 8.684192176131874e-05}. Best is trial 0 with value: 0.6408095047917477.
[I 2025-10-09 17:13:35,624] Trial 1 finished with value: 0.766578584208208 and parameters: {'d_model': 128, 'nhead': 8, 'num_encoder_layers': 5, 'dim_feedforward': 512, 'dropout': 0.1835533279885399, 'learning_rate': 0.0008709538316185319, 'batch_size': 16, 'weight_decay': 2.8242272583931502e-05}. Best is trial 0 with value: 0.6408095047917477.
[I 2025-10-09 17:13:53,619] Trial 2 finished with value: 0.6342564634112424 and parameters: {'d_model': 32, 'nhead': 8, 'num_encoder_layers': 4, 'dim_feedforward': 512, 'dropout': 0.16293126217841666, 'learning_rate': 0.0003184307682466833, 'batch_size': 32, 'weight_decay': 2.1404183996371762e-05}. Best i

‚úÖ Optuna optimization completed!
üìä Best trial: 14
üìä Best validation loss: 0.632615
üìä Best parameters: {'d_model': 32, 'nhead': 16, 'num_encoder_layers': 2, 'dim_feedforward': 256, 'dropout': 0.20647313147231663, 'learning_rate': 0.000581065610494624, 'batch_size': 16, 'weight_decay': 3.185175750821139e-06}
üîÑ Training final model with best parameters...
üîÑ Training final model...
   Epoch   0: Train Loss = 0.774303, Val Loss = 0.677476
   Epoch   5: Train Loss = 0.646171, Val Loss = 0.641147
   Epoch  10: Train Loss = 0.635420, Val Loss = 0.648110
   Epoch  15: Train Loss = 0.632565, Val Loss = 0.639873
   Epoch  20: Train Loss = 0.629637, Val Loss = 0.637422
   Epoch  25: Train Loss = 0.628015, Val Loss = 0.647966
   Epoch  30: Train Loss = 0.626402, Val Loss = 0.645309
   Epoch  35: Train Loss = 0.624633, Val Loss = 0.634499
   Early stopping at epoch 37
‚úÖ Final model training completed!
üìä Best validation loss: 0.633212
üìä Total epochs: 38
üîÑ Saving model and 