# Train Model with Balanced Dataset

This notebook trains models using the balanced dataset created in `create_dataset.ipynb` (Step 1 below rebuilds it if needed). The balanced dataset gives every target category (0, 0.25, 0.5, 0.75, 1) equal representation: each category is downsampled to the size of the rarest one, so each makes up 20% of the samples.

**Training Approach:**
- Use existing model architectures from the codebase
- Train with the balanced dataset to prevent distribution bias
- Monitor performance across all target ranges
- Save trained models for evaluation

In [5]:
# Setup and imports
import sys
import os

# Add the project root to the path so we can import from src
# The root is resolved two directory levels above the current working
# directory, so this cell depends on where the notebook kernel was started.
project_root = os.path.abspath(os.path.join(os.getcwd(), '../..'))
sys.path.append(project_root)
print(f"Project root: {project_root}")

# Standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import json

# TensorFlow
import tensorflow as tf
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.losses import MeanAbsoluteError

# Project imports
from src.data_handler import DataHandler
from src.model_evaluator import ModelEvaluator

# Import model architectures
from src.models.BestModel import model as best_model
from src.models.CorrelationModel import model as correlation_model
from src.models.Deep import model as deep_model

# Set random seeds for reproducibility
# (seeds NumPy's global RNG and TensorFlow's global seed with the same value)
np.random.seed(42)
tf.random.set_seed(42)

# Configure matplotlib
# Global plot style and default figure size for figures that do not pass
# their own figsize.
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12, 8)

print("✅ All imports successful!")
print(f"TensorFlow version: {tf.__version__}")

Project root: /Users/marcofurrer/Documents/github/dspro2
✅ All imports successful!
TensorFlow version: 2.15.0


In [7]:
# ===== STEP 1: CREATE BALANCED DATASET =====

def load_and_analyze_dataset(data_path="../../data/train.parquet"):
    """Read the training parquet file and print a summary of its target column.

    Args:
        data_path: Location of the parquet file to load.

    Returns:
        The loaded DataFrame, unmodified.
    """
    print("Loading original dataset...")
    frame = pd.read_parquet(data_path)

    print(f"Dataset shape: {frame.shape}")
    leading_columns = list(frame.columns)[:10]
    print(f"Columns: {leading_columns}...")  # only the first ten names are shown

    # Absolute and relative frequency of every target value, in ascending
    # target order so both series line up.
    target_series = frame['target']
    counts_by_target = target_series.value_counts().sort_index()
    shares_by_target = target_series.value_counts(normalize=True).sort_index() * 100

    print("\nCurrent target distribution:")
    for (target, count), percentage in zip(counts_by_target.items(), shares_by_target):
        print(f"  {target}: {count:,} samples ({percentage:.1f}%)")

    return frame

def create_balanced_dataset(df, target_column='target', save_path="../../data/train_balanced.parquet"):
    """Create a balanced dataset with equal representation of all target categories.

    Every category of ``target_column`` is downsampled (without replacement,
    fixed seed 42) to the size of the rarest category, the pieces are
    concatenated and the rows are shuffled with the same fixed seed.

    Args:
        df: Source DataFrame containing ``target_column``.
        target_column: Name of the column whose categories are balanced.
        save_path: Parquet file the balanced frame is written to. Pass
            ``None`` to skip writing and only return the frame.

    Returns:
        The balanced, shuffled DataFrame with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``target_column`` holds no non-null values, so there
            is nothing to balance.
    """
    print("\nCreating balanced dataset...")

    # Get current distribution (value_counts ignores missing targets)
    target_counts = df[target_column].value_counts()
    if target_counts.empty:
        raise ValueError(
            f"Column '{target_column}' contains no non-null values to balance"
        )
    min_count = target_counts.min()

    print(f"Minimum category count: {min_count:,}")
    print(f"Will downsample all categories to {min_count:,} samples each")

    # Create balanced dataset by downsampling. Categories are visited in
    # order of first appearance so the seeded result stays reproducible.
    balanced_dfs = []
    for target_value in df[target_column].unique():
        target_subset = df[df[target_column] == target_value]
        if len(target_subset) > min_count:
            target_subset = target_subset.sample(n=min_count, random_state=42)
        balanced_dfs.append(target_subset)

    # Combine and shuffle
    balanced_df = pd.concat(balanced_dfs, ignore_index=True)
    balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Verify balance: percentages are derived from the counts instead of
    # scanning the target column a second time.
    balanced_counts = balanced_df[target_column].value_counts().sort_index()
    balanced_percentages = balanced_counts / balanced_counts.sum() * 100

    print(f"\nBalanced dataset shape: {balanced_df.shape}")
    print("\nNew target distribution:")
    for target, count in balanced_counts.items():
        percentage = balanced_percentages[target]
        print(f"  {target}: {count:,} samples ({percentage:.1f}%)")

    # Save balanced dataset (optional)
    if save_path is not None:
        balanced_df.to_parquet(save_path)
        print(f"\nBalanced dataset saved to: {save_path}")

    return balanced_df

def visualize_distribution_comparison(original_df, balanced_df, target_column='target'):
    """Draw a 2x2 grid comparing target distributions before and after balancing.

    The top row shows the original frame, the bottom row the balanced frame;
    the left column plots sample counts and the right column percentages.

    Args:
        original_df: DataFrame before balancing.
        balanced_df: DataFrame after balancing.
        target_column: Column whose value distribution is plotted.
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # One grid row per dataset: (frame, title prefix, count colour, percentage colour)
    dataset_rows = [
        (original_df, 'Original Dataset', 'skyblue', 'lightcoral'),
        (balanced_df, 'Balanced Dataset', 'lightgreen', 'gold'),
    ]

    for row_idx, (frame, label, count_color, pct_color) in enumerate(dataset_rows):
        targets = frame[target_column]

        # Left panel: absolute counts per target value
        counts = targets.value_counts().sort_index()
        count_ax = axes[row_idx, 0]
        count_ax.bar(counts.index, counts.values, alpha=0.7, color=count_color)
        count_ax.set_title(f'{label} - Sample Counts')
        count_ax.set_xlabel('Target Value')
        count_ax.set_ylabel('Count')

        # Right panel: share of each target value in percent
        shares = targets.value_counts(normalize=True).sort_index() * 100
        share_ax = axes[row_idx, 1]
        share_ax.bar(shares.index, shares.values, alpha=0.7, color=pct_color)
        share_ax.set_title(f'{label} - Percentage Distribution')
        share_ax.set_xlabel('Target Value')
        share_ax.set_ylabel('Percentage (%)')

    plt.tight_layout()
    plt.show()

# Load and analyze original dataset
# (uses the function's default parquet path)
original_df = load_and_analyze_dataset()

# Create balanced dataset
# (also writes the balanced frame to the function's default save path)
balanced_df = create_balanced_dataset(original_df)

# Visualize comparison
visualize_distribution_comparison(original_df, balanced_df)

Loading original dataset...


KeyboardInterrupt: 

In [None]:
# Execute dataset creation
# NOTE: these are the same three calls as in the previous cell; running both
# cells reloads the parquet file and rewrites the balanced dataset.
original_df = load_and_analyze_dataset()
balanced_df = create_balanced_dataset(original_df)
visualize_distribution_comparison(original_df, balanced_df)

Loading original dataset...
Dataset shape: (2746270, 2415)
Columns: ['era', 'data_type', 'feature_shaded_hallucinatory_dactylology', 'feature_itinerant_hexahedral_photoengraver', 'feature_prudent_pileate_oven', 'feature_subalpine_apothegmatical_ajax', 'feature_pistachio_atypical_malison', 'feature_symmetrical_spongy_tricentenary', 'feature_ungrounded_transpontine_winder', 'feature_aseptic_eely_hemiplegia']...

Current target distribution:
  0.0: 135,668 samples (4.9%)
  0.25: 550,995 samples (20.1%)
  0.5: 1,373,108 samples (50.0%)
  0.75: 550,373 samples (20.0%)
  1.0: 136,126 samples (5.0%)
Dataset shape: (2746270, 2415)
Columns: ['era', 'data_type', 'feature_shaded_hallucinatory_dactylology', 'feature_itinerant_hexahedral_photoengraver', 'feature_prudent_pileate_oven', 'feature_subalpine_apothegmatical_ajax', 'feature_pistachio_atypical_malison', 'feature_symmetrical_spongy_tricentenary', 'feature_ungrounded_transpontine_winder', 'feature_aseptic_eely_hemiplegia']...

Current target

## Model Training Setup

Now we'll set up training functions and train models with the balanced dataset.

In [None]:
# ===== STEP 2: TRAINING SETUP =====

def create_callbacks(model_name, patience=15):
    """Build the Keras callbacks used for one training run.

    Args:
        model_name: Label embedded in the checkpoint file name.
        patience: Number of epochs without ``val_loss`` improvement before
            early stopping ends the run.

    Returns:
        A list with an EarlyStopping, a ReduceLROnPlateau and a
        ModelCheckpoint callback, in that order. All three monitor
        ``val_loss``.
    """
    # A timestamp in the file name keeps repeated runs from overwriting each other.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    checkpoint_file = f'../../exports/{model_name}_balanced_{timestamp}.keras'

    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=patience,
        restore_best_weights=True,
        verbose=1,
    )
    # Halve the learning rate after 5 stagnant epochs, down to 1e-5.
    lr_reducer = ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=5,
        min_lr=0.00001,
        verbose=1,
    )
    best_checkpoint = ModelCheckpoint(
        filepath=checkpoint_file,
        monitor='val_loss',
        save_best_only=True,
        verbose=1,
    )

    return [early_stopping, lr_reducer, best_checkpoint]

def prepare_data(df, feature_columns=None, validation_split=0.2):
    """Prepare training and validation arrays from a DataFrame.

    The split is positional: the first ``1 - validation_split`` share of the
    rows becomes the training set and the remainder the validation set, so
    the caller is expected to pass an already shuffled frame.

    NOTE: a later cell in this notebook defines another ``prepare_data`` with
    a different signature and return value; whichever cell ran last wins.

    Args:
        df: DataFrame holding the feature columns and a ``target`` column.
        feature_columns: Columns to use as model inputs. When ``None``, every
            column except the known non-feature columns (``target``, ``era``,
            ``data_type``, ``id``) is used.
        validation_split: Fraction of rows placed in the validation set.

    Returns:
        ``(X_train, X_val, y_train, y_val, feature_columns)`` where the arrays
        are float32 NumPy arrays and ``feature_columns`` is the list of
        columns actually used.
    """
    if feature_columns is None:
        # Same exclusion set as get_feature_columns: metadata columns such as
        # a string `id` must never reach the float32 conversion below.
        # NOTE(review): any additional label-like columns would still be
        # treated as features - confirm the input frame has none.
        non_feature_cols = {'target', 'era', 'data_type', 'id'}
        feature_columns = [col for col in df.columns if col not in non_feature_cols]

    X = df[feature_columns].values.astype(np.float32)
    y = df['target'].values.astype(np.float32)

    # Split data positionally
    n_train = int(len(X) * (1 - validation_split))

    X_train, X_val = X[:n_train], X[n_train:]
    y_train, y_val = y[:n_train], y[n_train:]

    print(f"Training samples: {len(X_train):,}")
    print(f"Validation samples: {len(X_val):,}")
    print(f"Number of features: {X.shape[1]}")

    return X_train, X_val, y_train, y_val, feature_columns

def get_optimizer(optimizer_name, learning_rate=0.001):
    """Get optimizer by name.

    Only the requested optimizer is instantiated; the lookup table stores the
    classes, not pre-built instances.

    Args:
        optimizer_name: One of ``'Adam'``, ``'SGD'`` or ``'RMSprop'``. Any
            other name falls back to Adam.
        learning_rate: Learning rate passed to the optimizer constructor.

    Returns:
        A new optimizer instance configured with ``learning_rate``.
    """
    optimizer_classes = {
        'Adam': Adam,
        'SGD': SGD,
        'RMSprop': RMSprop,
    }
    # Unknown names keep the historical behaviour of silently using Adam.
    optimizer_cls = optimizer_classes.get(optimizer_name, Adam)
    return optimizer_cls(learning_rate=learning_rate)

def train_model(model_fn, model_name, optimizer_name, X_train, X_val, y_train, y_val, 
               epochs=100, batch_size=512, learning_rate=0.001):
    """Build, compile and fit one model/optimizer combination.

    Args:
        model_fn: Callable that receives the input shape tuple and returns an
            uncompiled Keras model.
        model_name: Label used in log output and in the checkpoint file name.
        optimizer_name: Name handed to ``get_optimizer``.
        X_train, y_train: Training features and targets.
        X_val, y_val: Validation features and targets.
        epochs: Maximum number of epochs.
        batch_size: Mini-batch size.
        learning_rate: Initial learning rate of the optimizer.

    Returns:
        ``(model, history, training_time)`` where ``training_time`` is the
        wall-clock duration of ``fit`` in seconds.
    """
    banner = '=' * 50
    print(f"\n{banner}")
    print(f"Training {model_name} with {optimizer_name}")
    print(f"{banner}")

    # The model only needs the number of feature columns as its input shape.
    n_features = X_train.shape[1]
    model = model_fn((n_features,))

    # MAE is the optimisation target; MSE is tracked as an extra metric.
    model.compile(
        optimizer=get_optimizer(optimizer_name, learning_rate),
        loss='mae',
        metrics=['mse'],
    )

    print("Model architecture:")
    model.summary()

    fit_callbacks = create_callbacks(f"{model_name}_{optimizer_name}")

    started_at = datetime.now()
    history = model.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=fit_callbacks,
        verbose=1,
    )
    finished_at = datetime.now()

    training_time = (finished_at - started_at).total_seconds()
    print(f"Training completed in {training_time:.1f} seconds")

    return model, history, training_time

In [None]:
# ===== STEP 3: EVALUATION FUNCTIONS =====

def evaluate_model(model, X_val, y_val, model_name):
    """Evaluate model performance on a validation set.

    Targets and predictions are both flattened to 1-D before any metric is
    computed, so a column-shaped ``y_val`` of shape (n, 1) cannot broadcast
    against the predictions into an (n, n) matrix.

    Args:
        model: Object exposing ``predict(X)``.
        X_val: Validation features passed to ``model.predict``.
        y_val: Validation targets, one value per row of ``X_val``.
        model_name: Label stored in the result and used in the printout.

    Returns:
        Dict with keys ``model``, ``mae``, ``mse``, ``rmse``, ``correlation``
        and ``target_performance`` (MAE per distinct target value).
    """
    y_true = np.ravel(y_val)
    # Flatten the predictions once instead of on every use.
    y_pred = np.ravel(model.predict(X_val))

    # Calculate metrics
    errors = y_true - y_pred
    abs_errors = np.abs(errors)
    mae = np.mean(abs_errors)
    mse = np.mean(errors ** 2)
    rmse = np.sqrt(mse)

    # Calculate correlation (Pearson)
    correlation = np.corrcoef(y_true, y_pred)[0, 1]

    # Calculate per-target performance; every value returned by np.unique
    # occurs at least once, so each mask selects at least one row.
    target_performance = {
        target_val: np.mean(abs_errors[y_true == target_val])
        for target_val in np.unique(y_true)
    }

    results = {
        'model': model_name,
        'mae': mae,
        'mse': mse,
        'rmse': rmse,
        'correlation': correlation,
        'target_performance': target_performance
    }

    print(f"\nEvaluation Results for {model_name}:")
    print(f"  MAE: {mae:.4f}")
    print(f"  MSE: {mse:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  Correlation: {correlation:.4f}")
    print(f"  Per-target MAE:")
    for target, target_mae in target_performance.items():
        print(f"    {target}: {target_mae:.4f}")

    return results

def plot_training_history(history, model_name):
    """Plot the loss and MSE curves recorded during one training run.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value lists (as returned by ``model.fit``).
        model_name: Label used in the panel titles.
    """
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    loss_ax, mse_ax = axes[0], axes[1]
    curves = history.history

    # Left panel: training vs validation loss
    loss_ax.plot(curves['loss'], label='Training Loss', alpha=0.8)
    loss_ax.plot(curves['val_loss'], label='Validation Loss', alpha=0.8)
    loss_ax.set_title(f'{model_name} - Training Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('MAE Loss')
    loss_ax.legend()
    loss_ax.grid(True, alpha=0.3)

    # Right panel: only drawn when the MSE metric was tracked
    if 'mse' in curves:
        mse_ax.plot(curves['mse'], label='Training MSE', alpha=0.8)
        mse_ax.plot(curves['val_mse'], label='Validation MSE', alpha=0.8)
        mse_ax.set_title(f'{model_name} - Training MSE')
        mse_ax.set_xlabel('Epoch')
        mse_ax.set_ylabel('MSE')
        mse_ax.legend()
        mse_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

def plot_predictions_vs_actual(y_true, y_pred, model_name):
    """Plot predictions against actual values next to a residual plot.

    Args:
        y_true: Array of actual target values.
        y_pred: Array of model predictions; it is flattened before the
            residuals are computed.
        model_name: Label used in the panel titles.
    """
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    scatter_ax, residual_ax = axes[0], axes[1]

    # Predictions vs actual values, with the identity line as reference
    scatter_ax.scatter(y_true, y_pred, alpha=0.5)
    lowest, highest = y_true.min(), y_true.max()
    scatter_ax.plot([lowest, highest], [lowest, highest], 'r--', lw=2)
    scatter_ax.set_xlabel('Actual Values')
    scatter_ax.set_ylabel('Predicted Values')
    scatter_ax.set_title(f'{model_name} - Predictions vs Actual')
    scatter_ax.grid(True, alpha=0.3)

    # Residuals (actual minus predicted) against the predictions
    residuals = y_true - y_pred.flatten()
    residual_ax.scatter(y_pred, residuals, alpha=0.5)
    residual_ax.axhline(y=0, color='r', linestyle='--')
    residual_ax.set_xlabel('Predicted Values')
    residual_ax.set_ylabel('Residuals')
    residual_ax.set_title(f'{model_name} - Residuals Plot')
    residual_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## Data Preparation

Prepare the balanced dataset for training.

In [None]:
# Prepare data for training
# Uses the five-value return of the prepare_data variant that accepts the
# DataFrame alone; feature columns are inferred from the frame.
print("Preparing training data from balanced dataset...")
X_train, X_val, y_train, y_val, feature_columns = prepare_data(balanced_df)

print(f"\nData preparation complete!")
print(f"Feature columns: {len(feature_columns)}")
print(f"Feature names (first 10): {feature_columns[:10]}")

## Model Training

Train different model architectures with three optimizers (Adam, SGD, RMSprop) using the balanced dataset.

In [None]:
# ===== STEP 4: TRAIN MODELS =====

# Define models to train
# Maps a display name to the model-builder callable imported from src.models.
models_to_train = {
    'BestModel': best_model,
    'CorrelationModel': correlation_model,
    'DeepModel': deep_model
}

# Define optimizers to test
# Names must match the keys understood by get_optimizer.
optimizers_to_test = ['Adam', 'SGD', 'RMSprop']

# Training configuration
# Unpacked as keyword arguments into train_model, so the keys must match its
# parameter names.
training_config = {
    'epochs': 100,
    'batch_size': 512,
    'learning_rate': 0.001
}

print(f"Training {len(models_to_train)} models with {len(optimizers_to_test)} optimizers each")
print(f"Total combinations: {len(models_to_train) * len(optimizers_to_test)}")
print(f"Training configuration: {training_config}")

In [None]:
# Execute training
# Trains every model/optimizer combination, evaluates it on the validation
# split and collects the metrics in `training_results`. The fitted models and
# their histories are kept in `trained_models`, keyed by "<model>_<optimizer>".
import traceback  # cell-local: used to report the full stack trace of a failed run

training_results = []
trained_models = {}

for model_name, model_fn in models_to_train.items():
    for optimizer_name in optimizers_to_test:
        # Label shared by the evaluation output, the plots and the dict key.
        run_name = f"{model_name}_{optimizer_name}"
        try:
            # Train model
            model, history, training_time = train_model(
                model_fn=model_fn,
                model_name=model_name,
                optimizer_name=optimizer_name,
                X_train=X_train,
                X_val=X_val,
                y_train=y_train,
                y_val=y_val,
                **training_config
            )

            # Evaluate model
            eval_results = evaluate_model(
                model=model,
                X_val=X_val,
                y_val=y_val,
                model_name=run_name
            )

            # Add training time and configuration to results
            eval_results['training_time'] = training_time
            eval_results['optimizer'] = optimizer_name
            eval_results['epochs_trained'] = len(history.history['loss'])
            eval_results['final_train_loss'] = history.history['loss'][-1]
            eval_results['final_val_loss'] = history.history['val_loss'][-1]

            training_results.append(eval_results)
            trained_models[run_name] = {
                'model': model,
                'history': history
            }

            # Plot training history
            plot_training_history(history, run_name)

            # Plot predictions
            y_pred = model.predict(X_val)
            plot_predictions_vs_actual(y_val, y_pred, run_name)

            print(f"✅ {model_name} with {optimizer_name} completed successfully\n")

        except Exception as e:
            # Boundary handler: one failing combination must not abort the
            # whole sweep, but the stack trace is kept so the cause of the
            # failure can be diagnosed.
            print(f"❌ Error training {model_name} with {optimizer_name}: {str(e)}")
            traceback.print_exc()

print(f"\n🎉 Training completed! {len(training_results)} models trained successfully.")

## Results Analysis

Analyze and compare the performance of all trained models.

In [None]:
# ===== STEP 5: ANALYZE RESULTS =====

# Convert results to DataFrame for analysis
# Each entry of training_results becomes one row.
if training_results:
    results_df = pd.DataFrame(training_results)
    
    # Extract model name and optimizer for better analysis
    # The 'model' label has the form "<model>_<optimizer>"; splitting on the
    # first underscore only overwrites the existing 'optimizer' column with
    # the part after it.
    results_df[['model_name', 'optimizer']] = results_df['model'].str.split('_', n=1, expand=True)
    
    print("Training Results Summary:")
    print("=" * 50)
    # NOTE(review): `display` is not imported in the visible cells; this
    # presumably relies on the IPython/Jupyter environment providing it.
    display(results_df[['model_name', 'optimizer', 'mae', 'correlation', 'training_time']].round(4))
    
    # Best performers by metric
    print("\nBest Performers:")
    print("=" * 30)
    
    # Best by MAE (lower is better)
    best_mae = results_df.loc[results_df['mae'].idxmin()]
    print(f"Best MAE: {best_mae['model']} (MAE: {best_mae['mae']:.4f})")
    
    # Best by correlation (higher is better)
    best_corr = results_df.loc[results_df['correlation'].idxmax()]
    print(f"Best Correlation: {best_corr['model']} (Correlation: {best_corr['correlation']:.4f})")
    
    # Fastest training
    fastest = results_df.loc[results_df['training_time'].idxmin()]
    print(f"Fastest Training: {fastest['model']} (Time: {fastest['training_time']:.1f}s)")
else:
    print("No training results available for analysis.")

In [None]:
# Create comparison visualizations
# 2x2 figure: three model-by-optimizer heatmaps (MAE, correlation, training
# time) plus a scatter of MAE against correlation.
if training_results:
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    def _metric_grid(metric):
        """Return a model_name x optimizer table of one metric from results_df."""
        return results_df.pivot(index='model_name', columns='optimizer', values=metric)

    # MAE comparison
    mae_pivot = _metric_grid('mae')
    sns.heatmap(mae_pivot, annot=True, fmt='.4f', cmap='YlOrRd_r', ax=axes[0, 0])
    axes[0, 0].set_title('MAE by Model and Optimizer (Lower is Better)')

    # Correlation comparison
    corr_pivot = _metric_grid('correlation')
    sns.heatmap(corr_pivot, annot=True, fmt='.4f', cmap='YlGn', ax=axes[0, 1])
    axes[0, 1].set_title('Correlation by Model and Optimizer (Higher is Better)')

    # Training time comparison
    time_pivot = _metric_grid('training_time')
    sns.heatmap(time_pivot, annot=True, fmt='.1f', cmap='YlOrRd_r', ax=axes[1, 0])
    axes[1, 0].set_title('Training Time (seconds) by Model and Optimizer')

    # Overall performance (MAE vs Correlation): one scatter series per
    # optimizer, every point annotated with its model name.
    scatter_ax = axes[1, 1]
    for optimizer in optimizers_to_test:
        opt_data = results_df[results_df['optimizer'] == optimizer]
        scatter_ax.scatter(
            opt_data['mae'],
            opt_data['correlation'],
            label=optimizer,
            s=100,
            alpha=0.7,
        )

        for _, row in opt_data.iterrows():
            point = (row['mae'], row['correlation'])
            scatter_ax.annotate(
                row['model_name'],
                point,
                xytext=(5, 5),
                textcoords='offset points',
                fontsize=8,
                alpha=0.8,
            )

    scatter_ax.set_xlabel('MAE (Lower is Better)')
    scatter_ax.set_ylabel('Correlation (Higher is Better)')
    scatter_ax.set_title('MAE vs Correlation by Optimizer')
    scatter_ax.legend()
    scatter_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

In [None]:
# Per-target performance analysis
# Flattens the per-target MAE of every run into one long table, shows it as a
# run-by-target heatmap and reports how evenly each run performs across the
# target values.
if training_results:
    print("\nPer-Target Performance Analysis:")
    print("=" * 40)

    # Expected target categories (kept for reference; the table below is
    # built from the values actually present in the results).
    target_values = [0.0, 0.25, 0.5, 0.75, 1.0]

    # One row per (run, target value) pair.
    target_performance_data = [
        {
            'model': result['model'],
            'model_name': result['model'].split('_')[0],
            'optimizer': result['optimizer'],
            'target_value': target_val,
            'target_mae': target_mae
        }
        for result in training_results
        for target_val, target_mae in result['target_performance'].items()
    ]

    target_df = pd.DataFrame(target_performance_data)

    # Create per-target heatmap
    plt.figure(figsize=(12, 8))
    target_pivot = target_df.pivot_table(
        index='model',
        columns='target_value',
        values='target_mae',
        aggfunc='mean'
    )

    sns.heatmap(target_pivot, annot=True, fmt='.4f', cmap='YlOrRd_r')
    plt.title('Per-Target MAE Performance (Lower is Better)')
    plt.xlabel('Target Value')
    plt.ylabel('Model')
    plt.tight_layout()
    plt.show()

    # Check for balanced performance across targets.
    # The loop variable is deliberately not called `model`: that name holds
    # the last trained model after the training cell and must not be
    # overwritten with a label string.
    print("\nTarget Performance Balance Check:")
    for model_label in target_df['model'].unique():
        model_targets = target_df[target_df['model'] == model_label]
        target_std = model_targets['target_mae'].std()
        target_mean = model_targets['target_mae'].mean()
        print(f"{model_label}: Mean MAE = {target_mean:.4f}, Std = {target_std:.4f}")
        if target_std < 0.02:  # Low standard deviation indicates balanced performance
            print(f"  ✅ Well-balanced across all targets")
        else:
            print(f"  ⚠️ Some variation in target performance")

## Summary and Recommendations

Based on the training results with the balanced dataset:

In [None]:
# Final summary and recommendations
# Prints aggregate statistics over all successful runs and writes the results
# table to the exports folder.
if training_results:
    print("\n" + "="*60)
    print("TRAINING SUMMARY WITH BALANCED DATASET")
    print("="*60)

    # Overall statistics
    # The result dicts only carry the combined "<model>_<optimizer>" label
    # under 'model', so the architecture name is the part before the first
    # underscore.
    architecture_names = {r['model'].split('_', 1)[0] for r in training_results}
    optimizer_names = {r['optimizer'] for r in training_results}

    print(f"\n📊 Training Statistics:")
    print(f"   Models trained: {len(architecture_names)}")
    print(f"   Optimizers tested: {len(optimizer_names)}")
    print(f"   Total combinations: {len(training_results)}")
    print(f"   Average training time: {np.mean([r['training_time'] for r in training_results]):.1f}s")

    # Performance ranges
    mae_values = [r['mae'] for r in training_results]
    corr_values = [r['correlation'] for r in training_results]

    print(f"\n📈 Performance Ranges:")
    print(f"   MAE: {min(mae_values):.4f} - {max(mae_values):.4f}")
    print(f"   Correlation: {min(corr_values):.4f} - {max(corr_values):.4f}")

    # Top 3 performers by MAE.
    # The loop variable is not called `model` so the trained model object
    # left in that name by the training cell is not overwritten.
    top_mae = sorted(training_results, key=lambda x: x['mae'])[:3]
    print(f"\n🏆 Top 3 Models by MAE:")
    for i, entry in enumerate(top_mae, 1):
        print(f"   {i}. {entry['model']} - MAE: {entry['mae']:.4f}, Correlation: {entry['correlation']:.4f}")

    # Benefits of balanced training
    print(f"\n✅ Benefits of Balanced Dataset Training:")
    print(f"   • Equal representation of all target values (0, 0.25, 0.5, 0.75, 1.0)")
    print(f"   • Reduced bias towards middle values (0.5)")
    print(f"   • Better generalization across all prediction ranges")
    print(f"   • More reliable performance metrics")

    # Save results (make sure the target folder exists first)
    os.makedirs('../../exports', exist_ok=True)
    results_df.to_csv('../../exports/balanced_training_results.csv', index=False)
    print(f"\n💾 Results saved to: ../../exports/balanced_training_results.csv")

    print(f"\n🎯 Recommendation: Use the best performing model-optimizer combination")
    print(f"   for production, considering both MAE and correlation scores.")
else:
    print("No results to summarize.")

In [None]:
# ===== STEP 2: MODEL PREPARATION AND CONFIGURATION =====

def get_feature_columns(df):
    """Return the names of all columns that are used as model features.

    ``target`` and ``id`` are always left out. When the frame has an ``era``
    column, ``era`` and ``data_type`` are left out as well.

    Args:
        df: DataFrame whose columns are inspected.

    Returns:
        List of feature column names in their original order.
    """
    if 'era' in df.columns:
        excluded = {'target', 'era', 'data_type', 'id'}
    else:
        excluded = {'target', 'id'}

    feature_cols = [name for name in df.columns if name not in excluded]
    print(f"Feature columns: {len(feature_cols)} features")
    return feature_cols

def _stratified_holdout_indices(y, holdout_fraction, rng):
    """Split row positions of ``y`` so each class contributes ``holdout_fraction`` to the holdout."""
    kept_parts, holdout_parts = [], []
    for class_value in np.unique(y):
        class_positions = rng.permutation(np.flatnonzero(y == class_value))
        n_holdout = int(round(len(class_positions) * holdout_fraction))
        holdout_parts.append(class_positions[:n_holdout])
        kept_parts.append(class_positions[n_holdout:])
    # Shuffle again so the classes are interleaved instead of stored in blocks.
    kept = rng.permutation(np.concatenate(kept_parts))
    holdout = rng.permutation(np.concatenate(holdout_parts))
    return kept, holdout


def prepare_data(df, feature_cols, test_size=0.2, val_size=0.1):
    """Prepare stratified training, validation, and test data.

    The rows are split per target class with a fixed seed (42), so every
    split keeps the class proportions of ``df`` and repeated calls return the
    same partition. The split is implemented with NumPy only; the notebook
    does not import scikit-learn's ``train_test_split``.

    Args:
        df: DataFrame holding ``feature_cols`` and a ``target`` column.
        feature_cols: Names of the columns used as model inputs.
        test_size: Fraction of all rows placed in the test set.
        val_size: Fraction of all rows placed in the validation set.

    Returns:
        ``(X_train, y_train), (X_val, y_val), (X_test, y_test)`` as float32
        NumPy arrays.

    Raises:
        ValueError: If ``df`` is empty or the two fractions do not leave room
            for a training set.
    """
    if not (0 < test_size < 1 and 0 < val_size < 1 and test_size + val_size < 1):
        raise ValueError(
            "test_size and val_size must be fractions in (0, 1) that sum to less than 1"
        )
    if len(df) == 0:
        raise ValueError("Cannot split an empty dataset")

    X = df[feature_cols].values.astype(np.float32)
    y = df['target'].values.astype(np.float32)

    rng = np.random.default_rng(42)

    # First split: train+val vs test
    train_val_idx, test_idx = _stratified_holdout_indices(y, test_size, rng)

    # Second split: train vs val. val_size is a fraction of the full data,
    # so it is rescaled to the share of the rows left after the test split.
    val_size_adjusted = val_size / (1 - test_size)
    train_pos, val_pos = _stratified_holdout_indices(
        y[train_val_idx], val_size_adjusted, rng
    )
    train_idx = train_val_idx[train_pos]
    val_idx = train_val_idx[val_pos]

    X_train, y_train = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    print(f"Data splits:")
    print(f"  Training: {X_train.shape[0]:,} samples")
    print(f"  Validation: {X_val.shape[0]:,} samples")
    print(f"  Test: {X_test.shape[0]:,} samples")

    return (X_train, y_train), (X_val, y_val), (X_test, y_test)

# Model configurations
# Each entry pairs a model-builder callable (called with the input shape by
# train_single_model) with a human-readable description.
MODEL_CONFIGS = {
    'BestModel': {
        'model_fn': best_model,
        'description': 'Advanced model with distribution-aware layers and attention mechanisms'
    },
    'CorrelationModel': {
        'model_fn': correlation_model,
        'description': 'Model focused on capturing feature correlations and interactions'
    },
    'DeepModel': {
        'model_fn': deep_model,
        'description': 'Deep model with feature interactions and multi-head attention'
    }
}

# Optimizer configurations
# 'optimizer_fn' is a factory taking the learning rate, so a fresh optimizer
# instance can be created for every training run; 'lr' is the rate passed to it.
OPTIMIZER_CONFIGS = {
    'Adam': {
        'optimizer_fn': lambda lr: Adam(learning_rate=lr, beta_1=0.9, beta_2=0.999),
        'lr': 0.001
    },
    'SGD': {
        'optimizer_fn': lambda lr: SGD(learning_rate=lr, momentum=0.9),
        'lr': 0.01
    },
    'RMSprop': {
        'optimizer_fn': lambda lr: RMSprop(learning_rate=lr),
        'lr': 0.001
    }
}

# Loss function configurations
# NOTE(review): `correlation_loss` and `CorrelationAwareLoss` are not imported
# in the visible cells of this notebook; they presumably come from a project
# module - confirm and add the import before running this cell.
LOSS_CONFIGS = {
    'MAE': MeanAbsoluteError(),
    'CorrelationLoss': correlation_loss,
    'CorrelationAware': CorrelationAwareLoss(distribution_penalty=0.2)
}

print("Model configurations prepared:")
for name, config in MODEL_CONFIGS.items():
    print(f"  - {name}: {config['description']}")

print("\nOptimizer configurations prepared:")
for name, config in OPTIMIZER_CONFIGS.items():
    print(f"  - {name}: Learning rate {config['lr']}")

print("\nLoss function configurations prepared:")
for name in LOSS_CONFIGS.keys():
    print(f"  - {name}")

In [None]:
# ===== STEP 3: TRAINING FUNCTIONS =====

def create_callbacks(model_name, optimizer_name, loss_name, patience=15):
    """Build the Keras callbacks for one model/optimizer/loss combination.

    Args:
        model_name: Model label used in the checkpoint file name.
        optimizer_name: Optimizer label used in the checkpoint file name.
        loss_name: Loss label used in the checkpoint file name.
        patience: Epochs without ``val_loss`` improvement before early
            stopping ends the run.

    Returns:
        ``(callbacks, checkpoint_path)``: the callback list (EarlyStopping,
        ReduceLROnPlateau, ModelCheckpoint) and the path the best model is
        written to.
    """
    # The file name has no timestamp, so rerunning the same combination
    # overwrites the previous checkpoint.
    checkpoint_path = f"../../exports/{model_name}_{optimizer_name}_{loss_name}_best.keras"
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=patience,
        restore_best_weights=True,
        verbose=1,
    )
    # Halve the learning rate after 8 stagnant epochs, down to 1e-7.
    lr_reducer = ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=8,
        min_lr=1e-7,
        verbose=1,
    )
    best_checkpoint = ModelCheckpoint(
        checkpoint_path,
        monitor='val_loss',
        save_best_only=True,
        verbose=1,
    )

    return [early_stopping, lr_reducer, best_checkpoint], checkpoint_path

def calculate_correlation(y_true, y_pred):
    """Calculate the Pearson correlation coefficient of targets and predictions.

    Both inputs are flattened to 1-D first. The coefficient is undefined when
    there are fewer than two samples or when either input is constant; those
    cases return ``0.0`` instead of NaN.

    Args:
        y_true: Array-like of actual values.
        y_pred: Array-like of predictions with the same number of elements.

    Returns:
        The correlation coefficient, or ``0.0`` when it is undefined.
    """
    true_flat = np.asarray(y_true, dtype=float).ravel()
    pred_flat = np.asarray(y_pred, dtype=float).ravel()

    if true_flat.size < 2:
        return 0.0
    # A constant vector has zero variance, which would make corrcoef divide
    # by zero. Comparing max and min detects this exactly, without the
    # rounding noise a computed standard deviation can carry.
    if true_flat.max() == true_flat.min() or pred_flat.max() == pred_flat.min():
        return 0.0

    return np.corrcoef(true_flat, pred_flat)[0, 1]

def evaluate_model(model, X_test, y_test, model_name):
    """Evaluate a model on held-out data, overall and per target value.

    Args:
        model: Object exposing ``predict(X, verbose=0)``.
        X_test: Features passed to ``model.predict``.
        y_test: Targets aligned with ``X_test``.
        model_name: Label stored in the result and used in the printout.

    Returns:
        Dict with ``model_name``, ``mae``, ``mse``, ``correlation``,
        ``target_performance`` (count, MAE and correlation per distinct
        target value), the flattened ``predictions`` and the ``targets``.
    """
    print(f"\nEvaluating {model_name}...")

    raw_pred = model.predict(X_test, verbose=0)
    flat_pred = raw_pred.flatten()

    # Overall metrics
    errors = y_test - flat_pred
    mae = np.mean(np.abs(errors))
    mse = np.mean(errors ** 2)
    correlation = calculate_correlation(y_test, raw_pred)

    # Metrics restricted to the rows of each distinct target value
    target_performance = {}
    for target_val in np.unique(y_test):
        selector = y_test == target_val
        if not selector.sum() > 0:
            continue
        subset_mae = np.mean(np.abs(y_test[selector] - flat_pred[selector]))
        if selector.sum() > 1:
            subset_corr = calculate_correlation(y_test[selector], raw_pred[selector])
        else:
            subset_corr = 0.0
        target_performance[target_val] = {
            'count': selector.sum(),
            'mae': subset_mae,
            'correlation': subset_corr,
        }

    results = {
        'model_name': model_name,
        'mae': mae,
        'mse': mse,
        'correlation': correlation,
        'target_performance': target_performance,
        'predictions': raw_pred.flatten(),
        'targets': y_test,
    }

    print(f"  MAE: {mae:.4f}")
    print(f"  MSE: {mse:.4f}")
    print(f"  Correlation: {correlation:.4f}")

    return results

def train_single_model(model_config, optimizer_config, loss_config, train_data, val_data, 
                      model_name, optimizer_name, loss_name, epochs=100, batch_size=512):
    """Train one model/optimizer/loss combination and reload its best checkpoint.

    Args:
        model_config: Dict whose ``'model_fn'`` builds a model from the input shape.
        optimizer_config: Dict with an ``'optimizer_fn'`` factory and its ``'lr'``.
        loss_config: Loss object or callable passed to ``model.compile``.
        train_data: ``(X_train, y_train)`` tuple.
        val_data: ``(X_val, y_val)`` tuple.
        model_name, optimizer_name, loss_name: Labels used in log output and
            in the checkpoint file name.
        epochs: Maximum number of epochs.
        batch_size: Mini-batch size.

    Returns:
        ``(model, history, checkpoint_path)`` with the checkpointed weights
        loaded back into ``model``.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Training: {model_name} + {optimizer_name} + {loss_name}")
    print(f"{banner}")

    features, labels = train_data
    val_features, val_labels = val_data

    # Build the network for the number of feature columns in the data.
    input_shape = (features.shape[1],)
    build_model = model_config['model_fn']
    model = build_model(input_shape)

    # Fresh optimizer instance for this run.
    make_optimizer = optimizer_config['optimizer_fn']
    optimizer = make_optimizer(optimizer_config['lr'])

    model.compile(optimizer=optimizer, loss=loss_config, metrics=['mae'])

    print(f"Model: {model.name}")
    print(f"Input shape: {input_shape}")
    print(f"Total parameters: {model.count_params():,}")

    fit_callbacks, checkpoint_path = create_callbacks(model_name, optimizer_name, loss_name)

    started_at = datetime.now()
    history = model.fit(
        features,
        labels,
        validation_data=(val_features, val_labels),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=fit_callbacks,
        verbose=1,
    )
    training_time = datetime.now() - started_at

    print(f"Training completed in: {training_time}")
    print(f"Model saved to: {checkpoint_path}")

    # Continue with the weights stored in the checkpoint file rather than
    # whatever the final epoch left in memory.
    model.load_weights(checkpoint_path)

    return model, history, checkpoint_path

def plot_training_history(histories, title_prefix="Training History"):
    """Draw loss and MAE curves for several training runs side by side.

    Args:
        histories: Mapping of run name to an object whose ``.history`` dict
            holds ``'loss'`` and ``'val_loss'`` lists (``'mae'`` / ``'val_mae'``
            are plotted only when present).
        title_prefix: Text used as the figure's super-title.
    """
    _, (loss_ax, mae_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # Left panel: training (solid) and validation (dashed) loss per run
    for run_name, run in histories.items():
        curves = run.history
        loss_ax.plot(curves['loss'], label=f'{run_name} - Train', alpha=0.7)
        loss_ax.plot(curves['val_loss'], label=f'{run_name} - Val', alpha=0.7, linestyle='--')

    # Right panel: MAE curves, skipping runs that did not record them
    for run_name, run in histories.items():
        curves = run.history
        if 'mae' in curves:
            mae_ax.plot(curves['mae'], label=f'{run_name} - Train', alpha=0.7)
        if 'val_mae' in curves:
            mae_ax.plot(curves['val_mae'], label=f'{run_name} - Val', alpha=0.7, linestyle='--')

    # Shared decoration for both panels
    for ax, panel_title, y_label in ((loss_ax, 'Model Loss', 'Loss'), (mae_ax, 'Model MAE', 'MAE')):
        ax.set_title(panel_title)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(y_label)
        ax.legend()
        ax.grid(True)

    plt.suptitle(title_prefix)
    plt.tight_layout()
    plt.show()

print("Training functions defined and ready!")

In [None]:
# ===== STEP 4: PREPARE DATA FOR TRAINING =====

# Columns fed to the models
feature_cols = get_feature_columns(balanced_df)
print(f"Using {len(feature_cols)} features for training")

# Train / validation / test splits, each an (X, y) pair
(X_train, y_train), (X_val, y_val), (X_test, y_test) = prepare_data(balanced_df, feature_cols)

# Report the shape of every split
print(f"\nData shapes:")
for split_name, X_split, y_split in (
    ("train", X_train, y_train),
    ("val", X_val, y_val),
    ("test", X_test, y_test),
):
    print(f"  X_{split_name}: {X_split.shape}, y_{split_name}: {y_split.shape}")

# Percentage of samples per target value in each split
train_target_dist = pd.Series(y_train).value_counts(normalize=True).sort_index() * 100
val_target_dist = pd.Series(y_val).value_counts(normalize=True).sort_index() * 100
test_target_dist = pd.Series(y_test).value_counts(normalize=True).sort_index() * 100

for split_label, dist in (
    ("training", train_target_dist),
    ("validation", val_target_dist),
    ("test", test_target_dist),
):
    print(f"\nTarget distribution in {split_label} set:")
    for target, pct in dist.items():
        print(f"  {target}: {pct:.1f}%")

print("\nData preparation complete!")

In [None]:
# ===== STEP 5: COMPREHENSIVE MODEL TRAINING =====

# Grid to train: every name below is looked up in MODEL_CONFIGS,
# OPTIMIZER_CONFIGS and LOSS_CONFIGS respectively.
TRAINING_CONFIG = {
    'epochs': 100,
    'batch_size': 512,
    'models_to_train': ['BestModel', 'CorrelationModel'],  # Start with 2 models for faster execution
    'optimizers_to_use': ['Adam', 'SGD'],  # Use 2 optimizers
    'losses_to_use': ['MAE', 'CorrelationAware']  # Use 2 loss functions
}

print(f"Training Configuration:")
print(f"  Epochs: {TRAINING_CONFIG['epochs']}")
print(f"  Batch size: {TRAINING_CONFIG['batch_size']}")
print(f"  Models: {TRAINING_CONFIG['models_to_train']}")
print(f"  Optimizers: {TRAINING_CONFIG['optimizers_to_use']}")
print(f"  Loss functions: {TRAINING_CONFIG['losses_to_use']}")

# Storage for results: one record per configuration (success or failure),
# plus the fitted model and fit history of every successful run.
training_results = []
trained_models = {}
training_histories = {}

total_combinations = (
    len(TRAINING_CONFIG['models_to_train'])
    * len(TRAINING_CONFIG['optimizers_to_use'])
    * len(TRAINING_CONFIG['losses_to_use'])
)
print(f"\nStarting comprehensive training...")
print(f"Total combinations: {total_combinations}")

# Main training loop over the full model x optimizer x loss grid
for model_name in TRAINING_CONFIG['models_to_train']:
    for optimizer_name in TRAINING_CONFIG['optimizers_to_use']:
        for loss_name in TRAINING_CONFIG['losses_to_use']:

            # Unique identifier for this configuration
            config_id = f"{model_name}_{optimizer_name}_{loss_name}"

            try:
                # Look up the three configurations (a missing name raises KeyError)
                model_config = MODEL_CONFIGS[model_name]
                optimizer_config = OPTIMIZER_CONFIGS[optimizer_name]
                loss_config = LOSS_CONFIGS[loss_name]

                # Train model
                model, history, checkpoint_path = train_single_model(
                    model_config=model_config,
                    optimizer_config=optimizer_config,
                    loss_config=loss_config,
                    train_data=(X_train, y_train),
                    val_data=(X_val, y_val),
                    model_name=model_name,
                    optimizer_name=optimizer_name,
                    loss_name=loss_name,
                    epochs=TRAINING_CONFIG['epochs'],
                    batch_size=TRAINING_CONFIG['batch_size']
                )

                # Evaluate on the held-out test split
                evaluation_results = evaluate_model(model, X_test, y_test, config_id)

                # Store results
                result_record = {
                    'config_id': config_id,
                    'model_name': model_name,
                    'optimizer_name': optimizer_name,
                    'loss_name': loss_name,
                    'checkpoint_path': checkpoint_path,
                    'final_train_loss': history.history['loss'][-1],
                    'final_val_loss': history.history['val_loss'][-1],
                    'best_val_loss': min(history.history['val_loss']),
                    'epochs_trained': len(history.history['loss']),
                    'test_mae': evaluation_results['mae'],
                    'test_mse': evaluation_results['mse'],
                    'test_correlation': evaluation_results['correlation'],
                    'timestamp': datetime.now().isoformat()
                }

                training_results.append(result_record)
                trained_models[config_id] = model
                training_histories[config_id] = history

                print(f"✅ {config_id} completed successfully")

            except Exception as e:
                # Deliberate best-effort: one failing configuration must not abort
                # the whole grid. Keep the exception type in the message, because
                # str(e) alone is ambiguous (a KeyError prints only the quoted key).
                error_text = f"{type(e).__name__}: {e}"
                print(f"❌ {config_id} failed: {error_text}")
                training_results.append({
                    'config_id': config_id,
                    'model_name': model_name,
                    'optimizer_name': optimizer_name,
                    'loss_name': loss_name,
                    'error': error_text,
                    'timestamp': datetime.now().isoformat()
                })

# Failure records are the ones carrying an 'error' key
failed_count = sum('error' in record for record in training_results)
succeeded_count = len(training_results) - failed_count

print(f"\n🎉 Training completed!")
print(f"Successfully trained: {succeeded_count} models")
print(f"Failed: {failed_count} models")

In [None]:
# ===== STEP 6: ANALYZE TRAINING RESULTS =====

# Split the run log: records with an 'error' key are failures, the rest are successes
successful_runs = [record for record in training_results if 'error' not in record]
failed_runs = [record for record in training_results if 'error' in record]
results_df = pd.DataFrame(successful_runs)
errors_df = pd.DataFrame(failed_runs)

if len(results_df) == 0:
    print("❌ No successful training results to analyze")
else:
    print("\n📊 TRAINING RESULTS SUMMARY")
    print("=" * 50)

    # Leaderboard ordered by test correlation, best first
    display_cols = ['config_id', 'test_mae', 'test_correlation', 'best_val_loss', 'epochs_trained']
    results_display = results_df[display_cols].sort_values('test_correlation', ascending=False)

    print("\nTop performers by correlation:")
    print(results_display.head(10).to_string(index=False))

    # Winners by each metric: highest correlation, lowest MAE
    best_by_correlation = results_df.loc[results_df['test_correlation'].idxmax()]
    best_by_mae = results_df.loc[results_df['test_mae'].idxmin()]

    print(f"\n🏆 BEST MODELS:")
    print(f"Best Correlation: {best_by_correlation['config_id']} (r={best_by_correlation['test_correlation']:.4f})")
    print(f"Best MAE: {best_by_mae['config_id']} (MAE={best_by_mae['test_mae']:.4f})")

    # The same summary statistics are computed for each grouping dimension
    summary_aggs = {
        'test_correlation': ['mean', 'std', 'max'],
        'test_mae': ['mean', 'std', 'min'],
    }

    print(f"\n📈 MODEL ARCHITECTURE COMPARISON:")
    model_comparison = results_df.groupby('model_name').agg(summary_aggs).round(4)
    print(model_comparison)

    print(f"\n⚙️ OPTIMIZER COMPARISON:")
    optimizer_comparison = results_df.groupby('optimizer_name').agg(summary_aggs).round(4)
    print(optimizer_comparison)

    print(f"\n📉 LOSS FUNCTION COMPARISON:")
    loss_comparison = results_df.groupby('loss_name').agg(summary_aggs).round(4)
    print(loss_comparison)

# List every failed configuration with its recorded message
if len(errors_df) > 0:
    print(f"\n⚠️ TRAINING ERRORS:")
    for failed_id, message in zip(errors_df['config_id'], errors_df['error']):
        print(f"  {failed_id}: {message}")

In [None]:
# ===== STEP 7: COMPREHENSIVE VISUALIZATIONS =====

def plot_model_comparison(results_df):
    """Draw a 2x3 grid of plots comparing all successful training runs.

    Args:
        results_df: DataFrame with one row per run and the columns
            ``config_id``, ``model_name``, ``optimizer_name``, ``loss_name``,
            ``test_mae``, ``test_correlation``, ``epochs_trained`` and
            ``best_val_loss``. Prints a message and returns if it is empty.
    """
    if len(results_df) == 0:
        print("No results to plot")
        return

    fig, axes = plt.subplots(2, 3, figsize=(20, 12))
    fig.suptitle('Comprehensive Model Training Results', fontsize=16)

    # Panel 1: one point per run, coloured by row position, labelled with its id
    tradeoff_ax = axes[0, 0]
    tradeoff_ax.scatter(results_df['test_mae'], results_df['test_correlation'],
                        c=range(len(results_df)), cmap='viridis', alpha=0.7, s=100)
    tradeoff_ax.set_xlabel('Test MAE')
    tradeoff_ax.set_ylabel('Test Correlation')
    tradeoff_ax.set_title('Correlation vs MAE Performance')
    tradeoff_ax.grid(True, alpha=0.3)
    for _, run in results_df.iterrows():
        tradeoff_ax.annotate(run['config_id'], (run['test_mae'], run['test_correlation']),
                             xytext=(5, 5), textcoords='offset points', fontsize=8, alpha=0.7)

    # Panels 2-4: mean test correlation per group, with the group's std as error bar.
    # Each entry: (axis, grouping column, title, bar colours, rotate x tick labels?)
    bar_panels = (
        (axes[0, 1], 'model_name', 'Average Correlation by Model Architecture',
         ['skyblue', 'lightcoral', 'lightgreen'], True),
        (axes[0, 2], 'optimizer_name', 'Average Correlation by Optimizer',
         ['orange', 'purple', 'brown'], False),
        (axes[1, 0], 'loss_name', 'Average Correlation by Loss Function',
         ['gold', 'pink'], True),
    )
    for ax, group_col, panel_title, palette, rotate_ticks in bar_panels:
        grouped_corr = results_df.groupby(group_col)['test_correlation']
        group_means = grouped_corr.mean()
        group_stds = grouped_corr.std()
        ax.bar(group_means.index, group_means.values, yerr=group_stds.values,
               capsize=5, alpha=0.7, color=palette)
        ax.set_title(panel_title)
        ax.set_ylabel('Test Correlation')
        if rotate_ticks:
            ax.tick_params(axis='x', rotation=45)

    # Panel 5: epochs actually trained vs. correlation, coloured by test MAE
    efficiency_ax = axes[1, 1]
    mae_points = efficiency_ax.scatter(results_df['epochs_trained'], results_df['test_correlation'],
                                       c=results_df['test_mae'], cmap='viridis_r', alpha=0.7, s=100)
    efficiency_ax.set_xlabel('Epochs Trained')
    efficiency_ax.set_ylabel('Test Correlation')
    efficiency_ax.set_title('Training Efficiency (Color = MAE)')
    plt.colorbar(mae_points, ax=efficiency_ax, label='Test MAE')

    # Panel 6: best validation loss against test MAE
    generalization_ax = axes[1, 2]
    generalization_ax.scatter(results_df['best_val_loss'], results_df['test_mae'], alpha=0.7, s=100)
    generalization_ax.set_xlabel('Best Validation Loss')
    generalization_ax.set_ylabel('Test MAE')
    generalization_ax.set_title('Validation vs Test Performance')
    generalization_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

def plot_training_histories_summary(training_histories):
    """Plot every run's loss curves plus a bar chart of final validation losses.

    Args:
        training_histories: Mapping of configuration id to an object whose
            ``.history`` dict holds ``'loss'`` and ``'val_loss'`` lists.
            Prints a message and returns if the mapping is empty.
    """
    if len(training_histories) == 0:
        print("No training histories to plot")
        return

    _, (curves_ax, final_ax) = plt.subplots(1, 2, figsize=(15, 6))

    # Left panel: training (solid) and validation (dashed) loss, epochs numbered from 1
    for config_id, run in training_histories.items():
        train_curve = run.history['loss']
        val_curve = run.history['val_loss']
        epochs = range(1, len(train_curve) + 1)
        curves_ax.plot(epochs, train_curve, label=f'{config_id} (train)', alpha=0.7)
        curves_ax.plot(epochs, val_curve, label=f'{config_id} (val)',
                       alpha=0.7, linestyle='--')

    curves_ax.set_title('All Training Histories - Loss')
    curves_ax.set_xlabel('Epoch')
    curves_ax.set_ylabel('Loss')
    curves_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    curves_ax.grid(True, alpha=0.3)

    # Right panel: validation loss of the last recorded epoch for each run
    config_names = list(training_histories)
    final_losses = [run.history['val_loss'][-1] for run in training_histories.values()]

    final_ax.bar(range(len(final_losses)), final_losses, alpha=0.7)
    final_ax.set_title('Final Validation Loss Comparison')
    final_ax.set_xlabel('Model Configuration')
    final_ax.set_ylabel('Final Validation Loss')
    final_ax.set_xticks(range(len(config_names)))
    final_ax.set_xticklabels(config_names, rotation=45, ha='right')

    plt.tight_layout()
    plt.show()

def analyze_predictions(trained_models, X_test, y_test, results_df):
    """Plot and print a detailed error analysis for the best-correlating model.

    The model with the highest ``test_correlation`` in ``results_df`` is looked
    up in ``trained_models``, asked to predict ``X_test``, and analysed with
    four plots (prediction scatter, residuals, value distributions, per-target
    MAE) followed by a printed per-target table.

    Args:
        trained_models: Mapping of configuration id to a fitted model exposing
            ``predict(X, verbose=0)``.
        X_test: Test features passed to ``predict``.
        y_test: True target values; converted to a flat NumPy array.
        results_df: DataFrame with ``config_id`` and ``test_correlation`` columns.

    Returns:
        None. Prints a message and returns early when there is nothing to analyse.
    """
    if len(trained_models) == 0 or len(results_df) == 0:
        print("No models to analyze")
        return

    # Get best model by correlation
    best_config_id = results_df.loc[results_df['test_correlation'].idxmax(), 'config_id']
    top_model = trained_models[best_config_id]

    print(f"\n🔍 ANALYZING BEST MODEL: {best_config_id}")

    # Flat arrays keep masks and arithmetic positional, whatever container y_test is
    y_true = np.asarray(y_test).ravel()
    y_pred = np.asarray(top_model.predict(X_test, verbose=0)).ravel()

    # Per-target statistics, computed once and shared by the bar chart and the table.
    # Each entry: (target value, sample count, MAE, mean prediction)
    per_target = []
    for target_val in np.unique(y_true):
        mask = y_true == target_val
        n_samples = int(mask.sum())
        if n_samples == 0:
            # e.g. NaN targets never compare equal to themselves
            continue
        target_mae = float(np.mean(np.abs(y_true[mask] - y_pred[mask])))
        mean_pred = float(np.mean(y_pred[mask]))
        per_target.append((target_val, n_samples, target_mae, mean_pred))

    # Create detailed analysis plots
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle(f'Detailed Analysis: {best_config_id}', fontsize=16)

    # 1. Prediction vs True scatter plot with the identity line as reference
    axes[0, 0].scatter(y_true, y_pred, alpha=0.6, s=50)
    axes[0, 0].plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--', lw=2)
    axes[0, 0].set_xlabel('True Values')
    axes[0, 0].set_ylabel('Predicted Values')
    axes[0, 0].set_title('Predictions vs True Values')
    axes[0, 0].grid(True, alpha=0.3)

    # Overall Pearson correlation shown in the corner of the scatter plot
    correlation = np.corrcoef(y_true, y_pred)[0, 1]
    axes[0, 0].text(0.05, 0.95, f'r = {correlation:.4f}', transform=axes[0, 0].transAxes,
                    bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))

    # 2. Residuals plot
    residuals = y_true - y_pred
    axes[0, 1].scatter(y_pred, residuals, alpha=0.6, s=50)
    axes[0, 1].axhline(y=0, color='r', linestyle='--')
    axes[0, 1].set_xlabel('Predicted Values')
    axes[0, 1].set_ylabel('Residuals')
    axes[0, 1].set_title('Residuals Plot')
    axes[0, 1].grid(True, alpha=0.3)

    # 3. Distribution comparison
    axes[1, 0].hist(y_true, bins=20, alpha=0.7, label='True', density=True)
    axes[1, 0].hist(y_pred, bins=20, alpha=0.7, label='Predicted', density=True)
    axes[1, 0].set_xlabel('Value')
    axes[1, 0].set_ylabel('Density')
    axes[1, 0].set_title('Distribution Comparison')
    axes[1, 0].legend()
    axes[1, 0].grid(True, alpha=0.3)

    # 4. Per-target performance. Bars sit at integer positions and are labelled
    # with the target value: at the raw target values, closely spaced targets
    # would overlap under the default bar width of 0.8.
    positions = range(len(per_target))
    axes[1, 1].bar(positions, [entry[2] for entry in per_target], alpha=0.7)
    axes[1, 1].set_xticks(positions)
    axes[1, 1].set_xticklabels([str(entry[0]) for entry in per_target])
    axes[1, 1].set_xlabel('Target Value')
    axes[1, 1].set_ylabel('MAE')
    axes[1, 1].set_title('Per-Target Performance')
    axes[1, 1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Print detailed performance by target. No per-target correlation is reported:
    # within one target the true values are constant, so Pearson r is undefined
    # (NaN). The mean prediction shows the bias for that target instead.
    print(f"\n📋 DETAILED PERFORMANCE BY TARGET:")
    for target_val, n_samples, target_mae, mean_pred in per_target:
        print(f"  Target {target_val}: {n_samples:,} samples, "
              f"MAE={target_mae:.4f}, mean prediction={mean_pred:.4f}")

# Generate all visualizations if we have results
if len(results_df) == 0:
    print("\n❌ No results available for visualization")
else:
    print("\n📊 Generating comprehensive visualizations...")

    # Cross-run comparison grid
    plot_model_comparison(results_df)

    # Loss curves of every run
    plot_training_histories_summary(training_histories)

    # Deep dive into the best-correlating model
    analyze_predictions(trained_models, X_test, y_test, results_df)

    print("\n✅ All visualizations completed!")

In [None]:
# ===== STEP 8: SAVE RESULTS AND EXPORT MODELS =====

def save_training_results(results_df, training_histories, save_dir="../../exports"):
    """Write the results table, per-run histories and run metadata to ``save_dir``.

    Three files are produced: ``balanced_training_results.csv``,
    ``training_histories.json`` and ``training_config.json``. The metadata file
    is built from the notebook-level names ``TRAINING_CONFIG``,
    ``MODEL_CONFIGS``, ``balanced_df``, ``X_train``, ``X_val``, ``X_test`` and
    ``feature_cols``.

    Args:
        results_df: DataFrame of successful runs, written without its index.
        training_histories: Mapping of configuration id to an object whose
            ``.history`` dict holds ``'loss'`` and ``'val_loss'`` (``'mae'`` and
            ``'val_mae'`` default to empty lists when absent).
        save_dir: Output directory; created if missing.
    """
    os.makedirs(save_dir, exist_ok=True)

    # 1) Results table as CSV
    results_path = os.path.join(save_dir, "balanced_training_results.csv")
    results_df.to_csv(results_path, index=False)
    print(f"Training results saved to: {results_path}")

    # 2) Loss / MAE curves per configuration as JSON
    histories_path = os.path.join(save_dir, "training_histories.json")
    histories_data = {
        config_id: {
            'loss': run.history['loss'],
            'val_loss': run.history['val_loss'],
            'mae': run.history.get('mae', []),
            'val_mae': run.history.get('val_mae', []),
        }
        for config_id, run in training_histories.items()
    }
    with open(histories_path, 'w') as histories_file:
        json.dump(histories_data, histories_file, indent=2)
    print(f"Training histories saved to: {histories_path}")

    # 3) Training configuration and dataset sizes as JSON
    config_path = os.path.join(save_dir, "training_config.json")
    model_descriptions = {}
    for name, config in MODEL_CONFIGS.items():
        model_descriptions[name] = config['description']
    data_info = {
        'balanced_dataset_shape': balanced_df.shape,
        'train_samples': len(X_train),
        'val_samples': len(X_val),
        'test_samples': len(X_test),
        'num_features': len(feature_cols),
    }
    config_data = {
        'training_config': TRAINING_CONFIG,
        'model_configs': model_descriptions,
        'data_info': data_info,
        'timestamp': datetime.now().isoformat(),
    }
    with open(config_path, 'w') as config_file:
        json.dump(config_data, config_file, indent=2)
    print(f"Training configuration saved to: {config_path}")

def create_model_summary_report(results_df, save_dir="../../exports"):
    """Write a Markdown summary of all successful runs to ``save_dir``.

    The report (``model_training_report.md``) contains an overview, the best
    run by correlation and by MAE, a table of every run sorted by test
    correlation, and per-architecture aggregate statistics. Dataset sizes are
    read from the notebook-level names ``X_train``, ``X_val``, ``X_test`` and
    ``feature_cols``.

    Args:
        results_df: DataFrame with the columns ``config_id``, ``model_name``,
            ``optimizer_name``, ``loss_name``, ``test_correlation``,
            ``test_mae`` and ``epochs_trained``.
        save_dir: Output directory; created if missing.

    Returns:
        None. Prints a message and returns early when ``results_df`` is empty.
    """
    if len(results_df) == 0:
        print("No results to summarize")
        return

    # Create the directory here so this function also works when called on its
    # own or with a fresh save_dir (save_training_results does the same).
    os.makedirs(save_dir, exist_ok=True)
    report_path = os.path.join(save_dir, "model_training_report.md")

    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("# Balanced Dataset Model Training Report\n\n")
        f.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

        f.write("## Training Overview\n\n")
        f.write(f"- **Total models trained**: {len(results_df)}\n")
        f.write(f"- **Dataset**: Balanced train.parquet with equal target distribution\n")
        f.write(f"- **Training samples**: {len(X_train):,}\n")
        f.write(f"- **Validation samples**: {len(X_val):,}\n")
        f.write(f"- **Test samples**: {len(X_test):,}\n")
        f.write(f"- **Features**: {len(feature_cols)}\n\n")

        # Winners: highest test correlation and lowest test MAE
        f.write("## Best Models\n\n")
        best_corr = results_df.loc[results_df['test_correlation'].idxmax()]
        best_mae = results_df.loc[results_df['test_mae'].idxmin()]

        f.write(f"### Best by Correlation\n")
        f.write(f"- **Model**: {best_corr['config_id']}\n")
        f.write(f"- **Correlation**: {best_corr['test_correlation']:.4f}\n")
        f.write(f"- **MAE**: {best_corr['test_mae']:.4f}\n")
        f.write(f"- **Epochs**: {best_corr['epochs_trained']}\n\n")

        f.write(f"### Best by MAE\n")
        f.write(f"- **Model**: {best_mae['config_id']}\n")
        f.write(f"- **MAE**: {best_mae['test_mae']:.4f}\n")
        f.write(f"- **Correlation**: {best_mae['test_correlation']:.4f}\n")
        f.write(f"- **Epochs**: {best_mae['epochs_trained']}\n\n")

        # Full table, best correlation first
        f.write("## All Results\n\n")
        f.write("| Config ID | Model | Optimizer | Loss | Test Correlation | Test MAE | Epochs |\n")
        f.write("|-----------|-------|-----------|------|------------------|----------|--------|\n")

        for _, row in results_df.sort_values('test_correlation', ascending=False).iterrows():
            f.write(f"| {row['config_id']} | {row['model_name']} | {row['optimizer_name']} | "
                   f"{row['loss_name']} | {row['test_correlation']:.4f} | {row['test_mae']:.4f} | "
                   f"{row['epochs_trained']} |\n")

        # Aggregates per architecture; std is NaN for a group with a single run
        f.write("\n## Model Architecture Comparison\n\n")
        model_comparison = results_df.groupby('model_name').agg({
            'test_correlation': ['mean', 'std', 'max'],
            'test_mae': ['mean', 'std', 'min']
        }).round(4)

        f.write("| Model | Avg Correlation | Std Correlation | Max Correlation | Avg MAE | Std MAE | Min MAE |\n")
        f.write("|-------|-----------------|-----------------|-----------------|---------|---------|---------|\n")

        for model_name in model_comparison.index:
            row = model_comparison.loc[model_name]
            f.write(f"| {model_name} | {row[('test_correlation', 'mean')]} | "
                   f"{row[('test_correlation', 'std')]} | {row[('test_correlation', 'max')]} | "
                   f"{row[('test_mae', 'mean')]} | {row[('test_mae', 'std')]} | "
                   f"{row[('test_mae', 'min')]} |\n")

    print(f"Summary report saved to: {report_path}")

# Save all results if we have them
if len(results_df) == 0:
    print("\n❌ No successful training results to save")
else:
    print("\n💾 Saving training results and creating reports...")

    # CSV + JSON exports, then the Markdown report
    save_training_results(results_df, training_histories)
    create_model_summary_report(results_df)

    print("\n✅ All results saved successfully!")

    # Headline numbers across all successful runs
    separator = '=' * 50
    print(f"\n🎯 FINAL TRAINING SUMMARY:")
    print(f"{separator}")
    print(f"Total successful models: {len(results_df)}")
    print(f"Best correlation: {results_df['test_correlation'].max():.4f}")
    print(f"Best MAE: {results_df['test_mae'].min():.4f}")
    print(f"Average correlation: {results_df['test_correlation'].mean():.4f}")
    print(f"Average MAE: {results_df['test_mae'].mean():.4f}")
    print(f"\nAll models and results saved to: ../../exports/")
    print(f"Balanced dataset saved to: ../../data/train_balanced.parquet")

print("\n🏁 Comprehensive training notebook execution completed!")

In [None]:
# Load the balanced dataset
print("=== LOADING BALANCED DATASET ===")

# Parquet file expected from create_dataset.ipynb
balanced_data_path = "../../data/train_balanced.parquet"
if not os.path.exists(balanced_data_path):
    print(f"❌ Balanced dataset not found at {balanced_data_path}")
    print("Please run the create_dataset.ipynb notebook first.")
else:
    train_df = pd.read_parquet(balanced_data_path)
    print(f"✅ Balanced dataset loaded: {train_df.shape}")

    # Sample count and percentage per target value, ordered by target
    target_dist = train_df['target'].value_counts().sort_index()
    target_pct = (target_dist / len(train_df) * 100).round(1)

    print("\nTarget distribution verification:")
    for target, count in target_dist.items():
        pct = target_pct[target]
        print(f"  {target}: {count:,} samples ({pct}%)")

    # Bar chart of the percentages, with each bar annotated by its value
    plt.figure(figsize=(10, 6))
    target_pct.plot(kind='bar', color='skyblue', alpha=0.7)
    plt.title('Balanced Dataset - Target Distribution')
    plt.xlabel('Target Value')
    plt.ylabel('Percentage (%)')
    plt.xticks(rotation=0)
    for i, v in enumerate(target_pct.values):
        plt.text(i, v + 0.5, f'{v}%', ha='center', va='bottom')
    plt.tight_layout()
    plt.show()

In [None]:
# Setup data handler and prepare features
print("\n=== PREPARING DATA FOR TRAINING ===")

if 'train_df' not in locals():
    print("❌ Training data not loaded. Please run the previous cell.")
else:
    class BalancedDataHandler:
        """Select feature columns from a DataFrame and split it for training."""

        def __init__(self, df):
            self.df = df

        def get_feature_columns(self, feature_set='all'):
            """Return the feature column names for the requested feature set.

            ``'small'`` keeps the first 50 feature columns, ``'medium'`` the
            first 100, and any other value keeps all of them. ``'target'`` is
            always excluded; ``'era'`` and ``'data_type'`` are excluded as well
            when the frame has an ``'era'`` column.
            """
            all_columns = self.df.columns
            if 'era' in all_columns:
                non_features = ['target', 'era', 'data_type']
            else:
                non_features = ['target']
            candidates = [name for name in all_columns if name not in non_features]

            if feature_set == 'small':
                limit = 50
            elif feature_set == 'medium':
                limit = 100
            else:  # 'all'
                limit = None
            return candidates[:limit]

        def prepare_data(self, feature_set='all', validation_split=0.2):
            """Build float32 feature/target arrays and optionally split them.

            Returns ``(X_train, X_val, y_train, y_val, feature_cols)``. When
            ``validation_split`` is not greater than 0 the full data is returned
            as the training part and both validation entries are ``None``.
            """
            feature_cols = self.get_feature_columns(feature_set)

            X = self.df[feature_cols].values.astype(np.float32)
            y = self.df['target'].values.astype(np.float32)

            print(f"Features shape: {X.shape}")
            print(f"Target shape: {y.shape}")
            print(f"Feature set: {feature_set} ({len(feature_cols)} features)")

            if not validation_split > 0:
                return X, None, y, None, feature_cols

            # Plain random permutation split: rows are shuffled and cut at the
            # split point. It is not stratified, so the per-target balance of
            # the two parts is only approximate.
            n_train = int(len(X) * (1 - validation_split))
            shuffled = np.random.permutation(len(X))
            train_idx, val_idx = shuffled[:n_train], shuffled[n_train:]

            X_train, y_train = X[train_idx], y[train_idx]
            X_val, y_val = X[val_idx], y[val_idx]

            print(f"Training set: {X_train.shape[0]} samples")
            print(f"Validation set: {X_val.shape[0]} samples")

            # Report the target distribution of the training part
            print("\nTraining set target distribution:")
            train_target_dist = pd.Series(y_train).value_counts().sort_index()
            for target, count in train_target_dist.items():
                pct = count / len(y_train) * 100
                print(f"  {target}: {count} ({pct:.1f}%)")

            return X_train, X_val, y_train, y_val, feature_cols

    # Initialize data handler
    data_handler = BalancedDataHandler(train_df)

    # Prepare data; change feature_set to 'small' or 'medium' if needed
    feature_set = 'all'
    X_train, X_val, y_train, y_val, feature_cols = data_handler.prepare_data(
        feature_set=feature_set,
        validation_split=0.2,
    )

    print(f"\n✅ Data preparation complete!")

In [None]:
# Training configuration
print("\n=== TRAINING CONFIGURATION ===")

# Hyper-parameters read by the callbacks and the fit call in this section
TRAINING_CONFIG = {
    'epochs': 100,
    'batch_size': 512,
    'learning_rate': 0.001,
    'validation_split': 0.2,
    'early_stopping_patience': 15,
    'reduce_lr_patience': 5,
    'reduce_lr_factor': 0.5,
    'min_lr': 1e-6,
}

# Model builder functions, keyed by display name
MODELS_TO_TRAIN = {
    'BestModel': best_model,
    'CorrelationModel': correlation_model,
    'DeepModel': deep_model,
}

# Optimizers to test, all starting from the same learning rate.
# NOTE(review): these are single optimizer instances created once here. If the
# same instance is later compiled into several models, Keras may reject it or
# carry state across runs. Confirm how the training loop uses them.
_base_lr = TRAINING_CONFIG['learning_rate']
OPTIMIZERS = {
    'Adam': Adam(learning_rate=_base_lr),
    'SGD': SGD(learning_rate=_base_lr, momentum=0.9),
    'RMSprop': RMSprop(learning_rate=_base_lr),
}

print("Training configuration:")
for key, value in TRAINING_CONFIG.items():
    print(f"  {key}: {value}")

_n_models = len(MODELS_TO_TRAIN)
_n_optimizers = len(OPTIMIZERS)
print(f"\nModels to train: {list(MODELS_TO_TRAIN.keys())}")
print(f"Optimizers to test: {list(OPTIMIZERS.keys())}")
print(f"Total combinations: {_n_models} × {_n_optimizers} = {_n_models * _n_optimizers}")

In [None]:
# Training functions
def create_callbacks(model_name, optimizer_name):
    """Build the Keras callbacks for one model/optimizer run.

    Creates a timestamped log directory under ``../../logs`` and makes sure
    ``../../exports`` exists. All three callbacks monitor ``val_loss``; their
    patience, factor and minimum learning rate come from ``TRAINING_CONFIG``.

    Returns:
        ``(callbacks, log_dir)``: the callback list (early stopping, learning
        rate reduction, best-only checkpoint) and the log directory path.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_tag = f"{model_name}_{optimizer_name}"

    # Output locations
    log_dir = f"../../logs/{run_tag}_{timestamp}"
    for directory in (log_dir, "../../exports"):
        os.makedirs(directory, exist_ok=True)

    stop_early = EarlyStopping(
        monitor='val_loss',
        patience=TRAINING_CONFIG['early_stopping_patience'],
        restore_best_weights=True,
        verbose=1,
    )
    shrink_lr = ReduceLROnPlateau(
        monitor='val_loss',
        factor=TRAINING_CONFIG['reduce_lr_factor'],
        patience=TRAINING_CONFIG['reduce_lr_patience'],
        min_lr=TRAINING_CONFIG['min_lr'],
        verbose=1,
    )
    # The checkpoint file name has no timestamp, so a rerun of the same
    # model/optimizer pair writes to the same path.
    keep_best = ModelCheckpoint(
        filepath=f"../../exports/{run_tag}_balanced.keras",
        monitor='val_loss',
        save_best_only=True,
        verbose=1,
    )

    return [stop_early, shrink_lr, keep_best], log_dir

def train_single_model(model_fn, model_name, optimizer_name, optimizer):
    """Train a single model with given optimizer.

    Builds the model, compiles it with MAE loss, fits it on the module-level
    ``X_train``/``y_train`` with ``X_val``/``y_val`` as validation data, writes
    the per-epoch history to ``<log_dir>/training_history.csv`` and computes
    summary metrics. Hyperparameters come from ``TRAINING_CONFIG``; the input
    width comes from ``feature_cols``.

    Args:
        model_fn: Callable taking an input-shape tuple and returning a model
            that supports ``compile``/``fit``/``evaluate``/``predict``.
        model_name: Label of the architecture (used in logs and file names).
        optimizer_name: Label of the optimizer (used in logs and file names).
        optimizer: Optimizer to train with. An optimizer instance is treated
            as a template and cloned for this run; anything without
            ``get_config`` (e.g. a string identifier) is passed to ``compile``
            unchanged.

    Returns:
        ``(results, history, model)`` on success, where ``results`` is a dict
        with keys ``model``, ``optimizer``, ``train_loss``, ``val_loss``,
        ``train_mae``, ``val_mae``, ``train_correlation``, ``val_correlation``,
        ``epochs_trained`` and ``log_dir``. ``(None, None, None)`` if any
        exception is raised; the error is printed, not re-raised, so the
        remaining combinations can still run.
    """
    print(f"\n🚀 Training {model_name} with {optimizer_name}...")

    try:
        # Create model
        input_shape = (len(feature_cols),)
        model = model_fn(input_shape)

        # Use a fresh optimizer for every run. The caller passes the same
        # instance for each model it is paired with; an optimizer that has
        # already trained one model keeps that model's slot variables and any
        # learning-rate reduction applied by ReduceLROnPlateau, which breaks
        # or biases the next run. Cloning from the config resets all of that.
        if hasattr(optimizer, 'get_config'):
            run_optimizer = type(optimizer).from_config(optimizer.get_config())
        else:
            run_optimizer = optimizer

        # Compile model
        model.compile(
            optimizer=run_optimizer,
            loss='mae',
            metrics=['mae', 'mse']
        )

        print(f"Model architecture: {model.count_params():,} parameters")

        # Create callbacks
        callbacks, log_dir = create_callbacks(model_name, optimizer_name)

        # Train model
        history = model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=TRAINING_CONFIG['epochs'],
            batch_size=TRAINING_CONFIG['batch_size'],
            callbacks=callbacks,
            verbose=1
        )

        # Save training history
        history_df = pd.DataFrame(history.history)
        history_df.to_csv(f"{log_dir}/training_history.csv", index=False)

        # Evaluate model; evaluate() returns [loss, mae, mse] in compile order
        train_loss = model.evaluate(X_train, y_train, verbose=0)
        val_loss = model.evaluate(X_val, y_val, verbose=0)

        # Calculate predictions for analysis
        train_pred = model.predict(X_train, verbose=0)
        val_pred = model.predict(X_val, verbose=0)

        # Calculate correlations between targets and flattened predictions
        train_corr = np.corrcoef(y_train, train_pred.flatten())[0, 1]
        val_corr = np.corrcoef(y_val, val_pred.flatten())[0, 1]

        results = {
            'model': model_name,
            'optimizer': optimizer_name,
            'train_loss': train_loss[0],
            'val_loss': val_loss[0],
            'train_mae': train_loss[1],
            'val_mae': val_loss[1],
            'train_correlation': train_corr,
            'val_correlation': val_corr,
            'epochs_trained': len(history.history['loss']),
            'log_dir': log_dir
        }

        print(f"✅ {model_name} + {optimizer_name} completed!")
        print(f"   Final val_loss: {val_loss[0]:.4f}")
        print(f"   Val correlation: {val_corr:.4f}")
        print(f"   Epochs trained: {len(history.history['loss'])}")

        return results, history, model

    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller
        # continue with the remaining model/optimizer combinations.
        print(f"❌ Error training {model_name} with {optimizer_name}: {str(e)}")
        return None, None, None

print("✅ Training functions defined!")

In [None]:
# Execute training: run every model/optimizer pairing and collect the outputs
total_runs = len(MODELS_TO_TRAIN) * len(OPTIMIZERS)

print("\n=== STARTING TRAINING ===")
print(f"Training {total_runs} model-optimizer combinations...")

# all_results: list of per-run summary dicts (successful runs only)
# all_histories / all_models: keyed by "<model>_<optimizer>"
all_results, all_histories, all_models = [], {}, {}

for model_name, model_fn in MODELS_TO_TRAIN.items():
    for optimizer_name, optimizer in OPTIMIZERS.items():
        # Train the model; a failed run comes back as (None, None, None)
        results, history, model = train_single_model(
            model_fn, model_name, optimizer_name, optimizer
        )

        succeeded = results is not None
        if succeeded:
            combination_name = f"{model_name}_{optimizer_name}"
            all_results.append(results)
            all_histories[combination_name] = history
            all_models[combination_name] = model

        # The counter reflects successful runs, not attempts
        print(f"\nProgress: {len(all_results)}/{total_runs} completed")
        print("-" * 50)

print(f"\n🎉 Training completed! {len(all_results)} models trained successfully.")

In [None]:
# Analyze results: tabulate every successful run, report the best ones,
# draw a 2x2 comparison figure and save the table to ../../exports.
print("\n=== TRAINING RESULTS ANALYSIS ===")

if all_results:
    # Create results DataFrame (one row per successful model/optimizer run)
    results_df = pd.DataFrame(all_results)
    print("\nTraining Results Summary:")
    print(results_df[['model', 'optimizer', 'val_loss', 'val_correlation', 'epochs_trained']].to_string(index=False))

    # Find best results
    best_by_loss = results_df.loc[results_df['val_loss'].idxmin()]
    best_by_corr = results_df.loc[results_df['val_correlation'].idxmax()]

    print(f"\n🏆 BEST RESULTS:")
    print(f"Best by validation loss: {best_by_loss['model']} + {best_by_loss['optimizer']} (loss: {best_by_loss['val_loss']:.4f})")
    print(f"Best by correlation: {best_by_corr['model']} + {best_by_corr['optimizer']} (corr: {best_by_corr['val_correlation']:.4f})")

    # Create comparison plots
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # 1. Validation Loss by Model-Optimizer combination
    ax1 = axes[0, 0]
    results_pivot = results_df.pivot(index='model', columns='optimizer', values='val_loss')
    sns.heatmap(results_pivot, annot=True, fmt='.4f', cmap='YlOrRd_r', ax=ax1)
    ax1.set_title('Validation Loss by Model-Optimizer')

    # 2. Validation Correlation by Model-Optimizer combination
    ax2 = axes[0, 1]
    corr_pivot = results_df.pivot(index='model', columns='optimizer', values='val_correlation')
    sns.heatmap(corr_pivot, annot=True, fmt='.4f', cmap='YlGn', ax=ax2)
    ax2.set_title('Validation Correlation by Model-Optimizer')

    # 3. Training epochs by combination
    ax3 = axes[1, 0]
    epoch_pivot = results_df.pivot(index='model', columns='optimizer', values='epochs_trained')
    # Use '.0f' rather than 'd': when a combination failed, its cell is NaN,
    # the pivot becomes float, and the integer format code 'd' raises
    # ValueError. '.0f' prints whole numbers identically for ints and floats.
    sns.heatmap(epoch_pivot, annot=True, fmt='.0f', cmap='YlOrRd', ax=ax3)
    ax3.set_title('Epochs Trained by Model-Optimizer')

    # 4. Loss vs Correlation scatter plot (one colour per architecture)
    ax4 = axes[1, 1]
    for model in results_df['model'].unique():
        model_data = results_df[results_df['model'] == model]
        ax4.scatter(model_data['val_loss'], model_data['val_correlation'],
                   label=model, s=100, alpha=0.7)

        # Add optimizer labels next to each point
        for _, row in model_data.iterrows():
            ax4.annotate(row['optimizer'],
                        (row['val_loss'], row['val_correlation']),
                        xytext=(5, 5), textcoords='offset points',
                        fontsize=8, alpha=0.7)

    ax4.set_xlabel('Validation Loss')
    ax4.set_ylabel('Validation Correlation')
    ax4.set_title('Loss vs Correlation Trade-off')
    ax4.legend()
    ax4.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Save results under a timestamped name so earlier tables are kept
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_df.to_csv(f"../../exports/balanced_training_results_{timestamp}.csv", index=False)
    print(f"\n📊 Results saved to: exports/balanced_training_results_{timestamp}.csv")

else:
    print("❌ No successful training results to analyze.")

In [None]:
# Training-curve cell: learning curves of the best run plus bar-chart
# comparisons (validation loss, correlation, epochs) across all runs.
print("\n=== TRAINING CURVES ANALYSIS ===")

if not (all_results and all_histories):
    print("❌ No training histories available for plotting.")
else:
    def _decorate_comparison_axis(axis, title, y_label, tick_labels):
        """Apply the title, axis labels and rotated tick labels shared by the bar charts."""
        axis.set_title(title)
        axis.set_xlabel('Model-Optimizer Combination')
        axis.set_ylabel(y_label)
        axis.set_xticks(range(len(tick_labels)))
        axis.set_xticklabels(tick_labels, rotation=45, ha='right')

    # The best run is the one with the lowest validation loss
    best_model_info = results_df.loc[results_df['val_loss'].idxmin()]
    best_combination = "{}_{}".format(best_model_info['model'], best_model_info['optimizer'])

    print(f"Plotting training curves for best model: {best_combination}")

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    (ax1, ax2), (ax3, ax4) = axes

    # 1. Training & validation loss per epoch, best run only
    if best_combination in all_histories:
        history = all_histories[best_combination]
        for metric_key, curve_label in (('loss', 'Training Loss'),
                                        ('val_loss', 'Validation Loss')):
            ax1.plot(history.history[metric_key], label=curve_label, alpha=0.8)
        ax1.set_title(f'Learning Curves - {best_combination}')
        ax1.set_xlabel('Epoch')
        ax1.set_ylabel('Loss')
        ax1.legend()
        ax1.grid(True, alpha=0.3)

    # Gather the per-run values used by the three comparison charts
    model_names, val_losses, val_corrs, epochs_trained = [], [], [], []
    for run in all_results:
        model_names.append(f"{run['model']}_{run['optimizer']}")
        val_losses.append(run['val_loss'])
        val_corrs.append(run['val_correlation'])
        epochs_trained.append(run['epochs_trained'])
    positions = range(len(model_names))

    # 2. Final validation loss per run, lowest highlighted in gold
    bars = ax2.bar(positions, val_losses, alpha=0.7)
    _decorate_comparison_axis(ax2, 'Final Validation Loss Comparison', 'Validation Loss', model_names)
    best_idx = val_losses.index(min(val_losses))
    bars[best_idx].set_color('gold')

    # 3. Validation correlation per run, highest highlighted in dark green
    bars2 = ax3.bar(positions, val_corrs, alpha=0.7, color='lightgreen')
    _decorate_comparison_axis(ax3, 'Validation Correlation Comparison', 'Correlation', model_names)
    best_corr_idx = val_corrs.index(max(val_corrs))
    bars2[best_corr_idx].set_color('darkgreen')

    # 4. Number of epochs each run trained for
    bars3 = ax4.bar(positions, epochs_trained, alpha=0.7, color='lightcoral')
    _decorate_comparison_axis(ax4, 'Training Epochs Needed', 'Epochs', model_names)

    plt.tight_layout()
    plt.show()

    # Summary statistics (mean ± std across all successful runs)
    print(f"\n📈 TRAINING SUMMARY:")
    print(f"Average validation loss: {np.mean(val_losses):.4f} ± {np.std(val_losses):.4f}")
    print(f"Average correlation: {np.mean(val_corrs):.4f} ± {np.std(val_corrs):.4f}")
    print(f"Average epochs needed: {np.mean(epochs_trained):.1f} ± {np.std(epochs_trained):.1f}")

In [None]:
# Evaluate model performance across target ranges: for the run with the
# lowest validation loss, break the validation error down per target value
# and check whether predictions actually vary with the target.
print("\n=== TARGET-SPECIFIC PERFORMANCE ANALYSIS ===")

if all_results and all_models:
    # Get the best model
    best_model_info = results_df.loc[results_df['val_loss'].idxmin()]
    best_combination = f"{best_model_info['model']}_{best_model_info['optimizer']}"
    # Deliberately NOT named `best_model`: that name is the imported BestModel
    # factory referenced by MODELS_TO_TRAIN, and rebinding it here would make
    # a re-run of the configuration cell register a trained model as a factory.
    best_trained_model = all_models[best_combination]

    print(f"Analyzing target-specific performance for: {best_combination}")

    # Get predictions; flatten both sides to 1-D so boolean masks line up
    val_predictions = best_trained_model.predict(X_val, verbose=0).flatten()
    y_val_arr = np.asarray(y_val).ravel()

    # Analyze performance by target value
    target_analysis = []
    unique_targets = sorted(np.unique(y_val_arr))

    for target_val in unique_targets:
        # Select the validation rows whose true value equals this target
        target_mask = y_val_arr == target_val
        if not target_mask.any():
            continue  # e.g. NaN targets never compare equal to themselves

        target_pred = val_predictions[target_mask]
        # Within a group the true value is constant, so a correlation is
        # undefined (zero variance -> NaN). Report the signed bias instead.
        errors = target_pred - target_val

        target_analysis.append({
            'target_value': target_val,
            'n_samples': int(target_mask.sum()),
            'mae': np.mean(np.abs(errors)),
            'mse': np.mean(errors ** 2),
            'bias': np.mean(errors),
            'mean_prediction': np.mean(target_pred),
            'std_prediction': np.std(target_pred)
        })

    # Create DataFrame for analysis
    target_df = pd.DataFrame(target_analysis)
    print("\nPerformance by Target Value:")
    print(target_df.round(4).to_string(index=False))

    # Bars default to width 0.8 in data units, which overlaps when targets
    # are closer together than that; scale to the smallest gap instead.
    plotted_targets = target_df['target_value'].to_numpy(dtype=float)
    if len(plotted_targets) > 1:
        bar_width = 0.8 * float(np.min(np.diff(plotted_targets)))
    else:
        bar_width = 0.8

    # Visualize target-specific performance
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # 1. MAE by target value
    ax1 = axes[0, 0]
    ax1.bar(target_df['target_value'], target_df['mae'], width=bar_width,
            alpha=0.7, color='lightcoral')
    ax1.set_title('MAE by Target Value')
    ax1.set_xlabel('Target Value')
    ax1.set_ylabel('Mean Absolute Error')
    ax1.grid(True, alpha=0.3)

    # 2. Signed bias by target value (positive = over-prediction)
    ax2 = axes[0, 1]
    ax2.bar(target_df['target_value'], target_df['bias'], width=bar_width,
            alpha=0.7, color='lightgreen')
    ax2.axhline(0, color='black', linewidth=0.8)
    ax2.set_title('Prediction Bias by Target Value')
    ax2.set_xlabel('Target Value')
    ax2.set_ylabel('Mean Prediction - Target')
    ax2.grid(True, alpha=0.3)

    # 3. Prediction vs True scatter for each target
    ax3 = axes[1, 0]
    colors = plt.cm.viridis(np.linspace(0, 1, len(unique_targets)))
    for i, target_val in enumerate(unique_targets):
        target_mask = y_val_arr == target_val
        ax3.scatter(y_val_arr[target_mask], val_predictions[target_mask],
                   alpha=0.6, label=f'Target {target_val}', color=colors[i])

    # Add perfect prediction line
    min_val, max_val = y_val_arr.min(), y_val_arr.max()
    ax3.plot([min_val, max_val], [min_val, max_val], 'r--', alpha=0.8, label='Perfect Prediction')
    ax3.set_xlabel('True Values')
    ax3.set_ylabel('Predicted Values')
    ax3.set_title('Predictions vs True Values by Target')
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # 4. Prediction distribution by target
    ax4 = axes[1, 1]
    for i, target_val in enumerate(unique_targets):
        target_mask = y_val_arr == target_val
        target_preds = val_predictions[target_mask]
        ax4.hist(target_preds, alpha=0.6, label=f'Target {target_val}',
                color=colors[i], bins=20)
        # Add vertical line for true target value
        ax4.axvline(target_val, color=colors[i], linestyle='--', alpha=0.8)

    ax4.set_xlabel('Predicted Values')
    ax4.set_ylabel('Frequency')
    ax4.set_title('Distribution of Predictions by Target')
    ax4.legend()
    ax4.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Check if model is learning vs memorizing distribution
    overall_correlation = np.corrcoef(y_val_arr, val_predictions)[0, 1]
    prediction_std = np.std(val_predictions)
    target_std = np.std(y_val_arr)
    std_ratio = prediction_std / target_std

    print(f"\n🎯 LEARNING ASSESSMENT:")
    print(f"Overall correlation: {overall_correlation:.4f}")
    print(f"Prediction std: {prediction_std:.4f} (target std: {target_std:.4f})")
    print(f"Std ratio: {std_ratio:.4f}")

    # Heuristic verdict: predictions should correlate with the targets and
    # have a spread comparable to theirs. A NaN correlation (constant
    # predictions) fails the first test and falls through to the next one.
    if overall_correlation > 0.3 and 0.5 < std_ratio < 1.5:
        print("✅ Model appears to be learning meaningful patterns!")
    elif prediction_std < 0.01:
        print("⚠️ Model may be predicting constant values")
    elif abs(prediction_std - target_std) < 0.001 and overall_correlation < 0.1:
        print("⚠️ Model may be memorizing distribution without learning")
    else:
        print("🤔 Model learning is unclear - review individual target performance")

else:
    print("❌ No trained models available for evaluation.")

## Training Summary

### Key Findings:

1. **Best Model Performance**: The analysis above shows which model-optimizer combination performed best on the balanced dataset

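2. **Target-Specific Performance**: Unlike training on imbalanced data, training on the balanced dataset gives the model equal exposure to every target value; the per-target analysis above shows whether that exposure translates into comparable accuracy across the whole target range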

3. **Learning vs Memorization**: The target-specific analysis helps identify whether the model is actually learning relationships or just memorizing distributions

### Next Steps:

1. **Use the best performing model** for further experimentation
2. **Compare with models trained on imbalanced data** to see the improvement
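3. **Evaluate on a held-out test set** to confirm generalization (the validation set was already used for early stopping and for selecting the best model, so its scores are optimistic)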
4. **Fine-tune hyperparameters** for the best model-optimizer combination

### Benefits of Balanced Training:

- ✅ **Equal representation** of all target categories
- ✅ **Reduced bias** towards middle values (0.5)
- ✅ **Better learning** across all prediction ranges
- ✅ **More robust evaluation** of model performance

The trained models are saved in the `exports/` directory and can be loaded for further use or evaluation.