# Training Pipeline - novacron

**MLE-Star Stage 2: Learning Pipeline Setup**

This notebook implements the learning pipeline phase of the MLE-Star methodology:
- Data preprocessing and feature engineering
- Training and validation pipeline setup
- Model training with monitoring
- Training optimization and checkpointing

**Framework:** {{framework}}

**Date:** {{date}}


In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import yaml
import pickle
from datetime import datetime
import warnings
# NOTE(review): this silences every warning for the whole kernel, including
# deprecation notices from the ML frameworks; narrow the filter if those
# should stay visible.
warnings.filterwarnings('ignore')

# Set style for plots
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*'.
# Fall back to the legacy name so the notebook also starts on older installs.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(plot_style)
sns.set_palette('husl')

print("Libraries imported successfully!")
print(f"Training started at: {datetime.now()}")

## 1. Load Configuration and Setup


In [None]:
# Project configuration: parsed once here and consulted by every later cell
config_path = Path('../configs/config.yaml')
config = yaml.safe_load(config_path.read_text())

# Model design summary: optional; an empty dict stands in when the file is absent
design_summary_path = Path('../outputs/reports/model_design_summary.yaml')
if not design_summary_path.exists():
    print("Model design summary not found. Please run model design notebook first.")
    design_summary = {}
else:
    design_summary = yaml.safe_load(design_summary_path.read_text())
    print("Model design summary loaded successfully!")

print(f"Experiment: {config['experiment']['name']}")
print(f"Framework: {config['model']['framework']}")

# Seed NumPy's global RNG from the configured seed (NumPy only)
np.random.seed(config['data']['random_seed'])

## 2. Data Loading and Preprocessing Pipeline


In [None]:
# Framework-specific imports
# The framework name comes from the project configuration and decides which
# ML stack the remaining cells use.
framework = config['model']['framework']

if framework == 'pytorch':
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import DataLoader, TensorDataset, Dataset
    from torch.optim.lr_scheduler import StepLR
    # np.random.seed alone does not cover PyTorch: seed it too so weight
    # initialisation, dropout and DataLoader shuffling are repeatable.
    torch.manual_seed(config['data']['random_seed'])
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"PyTorch device: {device}")

elif framework == 'tensorflow':
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers, callbacks
    from tensorflow.keras.optimizers import Adam
    # Same reasoning as above: TensorFlow keeps its own random state.
    tf.random.set_seed(config['data']['random_seed'])
    # tf.test.is_gpu_available() is deprecated; listing the physical GPU
    # devices is the supported way to ask the same question.
    print(f"TensorFlow GPU available: {bool(tf.config.list_physical_devices('GPU'))}")

elif framework == 'scikit-learn':
    from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
    from sklearn.preprocessing import StandardScaler, LabelEncoder
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import classification_report, accuracy_score
    from sklearn.pipeline import Pipeline
    print("Scikit-learn components imported")

# Common ML utilities
# Needed by the splitting and scaling cells whichever framework is selected.
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Data loading function
def load_and_preprocess_data(n_samples=5000, n_features=10):
    """Generate the synthetic binary-classification dataset used for the demo.

    No file is read and nothing is taken from the configuration: features are
    drawn from NumPy's global standard-normal RNG, and the label is 1 when a
    noisy weighted sum of the first three features is positive, else 0.
    Seed ``np.random`` beforehand for repeatable output.

    Args:
        n_samples: Number of rows to generate (default 5000).
        n_features: Number of feature columns (default 10). Must be at least
            3 because the label is built from features 0, 1 and 2.

    Returns:
        tuple: ``(df, feature_names)`` where ``df`` is a DataFrame holding the
        ``feature_<i>`` columns followed by an integer ``target`` column, and
        ``feature_names`` is the list of feature column names.

    Raises:
        ValueError: If ``n_features`` is smaller than 3.
    """
    if n_features < 3:
        raise ValueError(
            f"n_features must be at least 3 to build the target, got {n_features}"
        )

    # Generate synthetic data (features first, then noise, so a given seed
    # always yields the same dataset)
    X = np.random.randn(n_samples, n_features)
    # Create more complex relationship: weighted sum of three features + noise
    y = ((X[:, 0] * 0.5 + X[:, 1] * 0.3 + X[:, 2] * 0.2 +
          np.random.randn(n_samples) * 0.1) > 0).astype(int)

    # Create DataFrame
    feature_names = [f'feature_{i}' for i in range(n_features)]
    df = pd.DataFrame(X, columns=feature_names)
    df['target'] = y

    print(f"Dataset loaded: {df.shape}")
    print(f"Target distribution: {df['target'].value_counts().to_dict()}")

    return df, feature_names

# Build the dataset, then separate the feature matrix from the label vector
df, feature_names = load_and_preprocess_data()

X, y = df[feature_names].values, df['target'].values

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

In [None]:
# Three-way split (train / validation / test) driven by the data config
data_cfg = config['data']
train_size, val_size, test_size = (
    data_cfg['train_split'],
    data_cfg['validation_split'],
    data_cfg['test_split'],
)
split_seed = data_cfg['random_seed']

# Step 1: carve the test set off the full dataset
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y,
    test_size=test_size,
    random_state=split_seed,
    stratify=y if data_cfg['stratify'] else None,
)

# Step 2: divide what is left into train and validation.
# The validation share is re-expressed relative to the train+validation pool.
val_ratio = val_size / (train_size + val_size)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=val_ratio,
    random_state=split_seed,
    stratify=y_temp if data_cfg['stratify'] else None,
)

print(f"Training set: {X_train.shape} ({len(X_train)/len(X)*100:.1f}%)")
print(f"Validation set: {X_val.shape} ({len(X_val)/len(X)*100:.1f}%)")
print(f"Test set: {X_test.shape} ({len(X_test)/len(X)*100:.1f}%)")

# Per-class counts in each split
print("\nClass distribution:")
print(f"Train: {np.bincount(y_train)}")
print(f"Validation: {np.bincount(y_val)}")
print(f"Test: {np.bincount(y_test)}")

In [None]:
# Standardise features: statistics are fitted on the training split only and
# then applied unchanged to the validation and test splits
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled, X_test_scaled = scaler.transform(X_val), scaler.transform(X_test)

print("Features scaled successfully!")
print(f"Train features - Mean: {X_train_scaled.mean():.4f}, Std: {X_train_scaled.std():.4f}")
print(f"Val features - Mean: {X_val_scaled.mean():.4f}, Std: {X_val_scaled.std():.4f}")

# Persist the fitted scaler; this also creates the models output directory
scaler_path = Path('../outputs/models/feature_scaler.pkl')
scaler_path.parent.mkdir(parents=True, exist_ok=True)
with scaler_path.open('wb') as f:
    pickle.dump(scaler, f)
print(f"Scaler saved to: {scaler_path}")

## 3. Model Architecture Definition


In [None]:
# Define model architecture based on framework
def create_model():
    """Build an untrained model for the framework selected in the configuration.

    Takes no arguments; reads the notebook-level names ``framework``,
    ``config``, ``feature_names`` and ``y`` (plus ``device`` for PyTorch).

    Returns:
        The model object for the active framework:

        * ``'pytorch'``: an ``nn.Module`` MLP already moved to ``device``; its
          final layer emits one raw score per class.
        * ``'tensorflow'``: a compiled ``keras.Sequential`` MLP with a single
          sigmoid unit for two classes, otherwise a softmax layer.
        * ``'scikit-learn'``: an unfitted ``Pipeline`` of ``StandardScaler``
          followed by ``RandomForestClassifier``.

    Raises:
        ValueError: If ``framework`` is not one of the three supported names.
    """

    input_size = len(feature_names)
    hidden_layers = config['model']['hidden_layers']
    # Number of classes is taken from the distinct labels in the full target
    output_size = len(np.unique(y))
    dropout_rate = config['model']['dropout_rate']

    if framework == 'pytorch':
        class MLPClassifier(nn.Module):
            """Fully connected classifier: (Linear -> ReLU -> Dropout)* -> Linear."""

            def __init__(self, input_size, hidden_layers, output_size, dropout_rate):
                super(MLPClassifier, self).__init__()

                modules = []
                prev_size = input_size

                # Hidden layers
                for hidden_size in hidden_layers:
                    modules.extend([
                        nn.Linear(prev_size, hidden_size),
                        nn.ReLU(),
                        nn.Dropout(dropout_rate)
                    ])
                    prev_size = hidden_size

                # Output layer (no activation: the network returns raw scores)
                modules.append(nn.Linear(prev_size, output_size))

                self.network = nn.Sequential(*modules)

            def forward(self, x):
                return self.network(x)

        model = MLPClassifier(input_size, hidden_layers, output_size, dropout_rate)
        model = model.to(device)

        print("PyTorch model created:")
        print(f"Input size: {input_size}")
        print(f"Hidden layers: {hidden_layers}")
        print(f"Output size: {output_size}")
        print(f"Total parameters: {sum(p.numel() for p in model.parameters())}")

    elif framework == 'tensorflow':
        # Declare the input explicitly so the hidden-layer loop can treat every
        # layer the same way; this also tolerates an empty hidden_layers list.
        model = keras.Sequential()
        model.add(keras.Input(shape=(input_size,)))

        # Hidden layers: dropout follows every hidden layer, matching the
        # PyTorch architecture above (previously the first layer had none).
        for hidden_size in hidden_layers:
            model.add(layers.Dense(hidden_size, activation='relu'))
            model.add(layers.Dropout(dropout_rate))

        # Output layer: one sigmoid unit for binary, softmax otherwise
        activation = 'sigmoid' if output_size == 2 else 'softmax'
        output_units = 1 if output_size == 2 else output_size
        model.add(layers.Dense(output_units, activation=activation))

        # Compile model with the loss that matches the output layer
        loss = 'binary_crossentropy' if output_size == 2 else 'sparse_categorical_crossentropy'
        model.compile(
            optimizer=Adam(learning_rate=config['training']['learning_rate']),
            loss=loss,
            metrics=['accuracy']
        )

        print("TensorFlow model created:")
        model.summary()

    elif framework == 'scikit-learn':
        # Create pipeline with preprocessing and model; the pipeline carries
        # its own scaler, so it is meant to be fed unscaled features.
        model = Pipeline([
            ('scaler', StandardScaler()),
            ('classifier', RandomForestClassifier(
                n_estimators=100,
                random_state=config['data']['random_seed'],
                n_jobs=-1
            ))
        ])

        print("Scikit-learn pipeline created:")
        print(f"Components: {[name for name, _ in model.steps]}")

    else:
        # Without this branch the function fell through to `return model`
        # and failed with an UnboundLocalError.
        raise ValueError(
            f"Unsupported framework {framework!r}; "
            "expected 'pytorch', 'tensorflow' or 'scikit-learn'"
        )

    return model

# Create model
# Built once at notebook level: the training and evaluation functions defined
# in later cells read this global `model` directly instead of taking it as an
# argument.
model = create_model()

## 4. Training Pipeline Implementation


In [None]:
# Hyperparameters for the training run, pulled from the project config
training_config = config['training']
batch_size, epochs, learning_rate = (
    training_config[key] for key in ('batch_size', 'epochs', 'learning_rate')
)

print(f"Training Configuration:")
print(f"Batch size: {batch_size}")
print(f"Epochs: {epochs}")
print(f"Learning rate: {learning_rate}")

# Per-epoch metric log filled in by the framework-specific trainers;
# every metric starts with its own empty list
training_history = {
    metric: []
    for metric in ('train_loss', 'train_acc', 'val_loss', 'val_acc', 'epoch', 'lr')
}

In [None]:
# Framework-specific training implementation
def train_model():
    """Dispatch to the trainer that matches the configured framework.

    Reads the notebook-level ``framework`` name.

    Returns:
        tuple: ``(model, training_history)`` as returned by the selected
        framework-specific trainer.

    Raises:
        ValueError: If ``framework`` is not a supported name. Previously the
            function returned ``None`` here, which surfaced as an opaque
            unpacking ``TypeError`` at the call site.
    """

    if framework == 'pytorch':
        return train_pytorch_model()
    elif framework == 'tensorflow':
        return train_tensorflow_model()
    elif framework == 'scikit-learn':
        return train_sklearn_model()

    raise ValueError(
        f"Unsupported framework {framework!r}; "
        "expected 'pytorch', 'tensorflow' or 'scikit-learn'"
    )

def train_pytorch_model():
    """Train the notebook-level PyTorch ``model`` and checkpoint its best epoch.

    Uses the globals ``X_train_scaled``/``y_train`` and ``X_val_scaled``/
    ``y_val`` for data, ``batch_size``, ``epochs`` and ``learning_rate`` for
    hyperparameters, and appends one entry per epoch to every list in
    ``training_history``. Optimisation is Adam with cross-entropy loss and a
    StepLR schedule (x0.1 every 30 epochs). The weights of the epoch with the
    highest validation accuracy are written to
    ``../outputs/models/best_model.pth``.

    Returns:
        tuple: ``(model, training_history)``; the model is left in training
        mode and holds the weights of the last epoch, not the best one.
    """

    # Prepare data loaders
    train_dataset = TensorDataset(
        torch.FloatTensor(X_train_scaled),
        torch.LongTensor(y_train)
    )
    val_dataset = TensorDataset(
        torch.FloatTensor(X_val_scaled),
        torch.LongTensor(y_val)
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Setup optimizer and loss
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()
    scheduler = StepLR(optimizer, step_size=30, gamma=0.1)

    # Do not rely on an earlier cell having created the output directory
    checkpoint_path = Path('../outputs/models/best_model.pth')
    checkpoint_path.parent.mkdir(parents=True, exist_ok=True)

    # Training loop
    model.train()
    # Start below any reachable accuracy so the first epoch always writes a
    # checkpoint, even if validation accuracy is exactly 0.
    best_val_acc = float('-inf')

    for epoch in range(epochs):
        # Training phase
        train_loss = 0.0
        train_correct = 0
        train_total = 0

        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)

            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

            # criterion returns the batch mean; weight it by batch size so a
            # smaller final batch does not skew the epoch average
            n_batch = batch_y.size(0)
            train_loss += loss.item() * n_batch
            _, predicted = outputs.max(1)
            train_total += n_batch
            train_correct += predicted.eq(batch_y).sum().item()

        # Validation phase (dropout off, no gradients)
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for batch_X, batch_y in val_loader:
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)
                outputs = model(batch_X)
                loss = criterion(outputs, batch_y)

                n_batch = batch_y.size(0)
                val_loss += loss.item() * n_batch
                _, predicted = outputs.max(1)
                val_total += n_batch
                val_correct += predicted.eq(batch_y).sum().item()

        # Calculate metrics (accuracy in percent, loss as per-sample mean)
        train_acc = 100. * train_correct / train_total
        val_acc = 100. * val_correct / val_total
        train_loss_avg = train_loss / train_total
        val_loss_avg = val_loss / val_total

        # Update history; the LR is read before scheduler.step() so it is the
        # rate that was actually used during this epoch
        training_history['epoch'].append(epoch + 1)
        training_history['train_loss'].append(train_loss_avg)
        training_history['train_acc'].append(train_acc)
        training_history['val_loss'].append(val_loss_avg)
        training_history['val_acc'].append(val_acc)
        training_history['lr'].append(scheduler.get_last_lr()[0])

        # Save best model
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), checkpoint_path)

        # Print progress
        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch+1}/{epochs}: '
                  f'Train Loss: {train_loss_avg:.4f}, Train Acc: {train_acc:.2f}%, '
                  f'Val Loss: {val_loss_avg:.4f}, Val Acc: {val_acc:.2f}%')

        scheduler.step()
        model.train()

    return model, training_history

def train_tensorflow_model():
    """Fit the notebook-level Keras ``model`` and log its per-epoch metrics.

    Trains on ``X_train_scaled``/``y_train`` and validates on ``X_val_scaled``/
    ``y_val`` using the global ``batch_size`` and ``epochs``. Three callbacks
    are attached: a best-only checkpoint to ``../outputs/models/best_model.h5``
    (by ``val_accuracy``), learning-rate reduction on a ``val_loss`` plateau,
    and early stopping on ``val_loss`` with best-weight restoration. One entry
    per completed epoch is appended to every list in ``training_history``.

    Returns:
        tuple: ``(model, training_history)``.
    """

    # Do not rely on an earlier cell having created the output directory
    checkpoint_file = '../outputs/models/best_model.h5'
    Path(checkpoint_file).parent.mkdir(parents=True, exist_ok=True)

    # Callbacks
    callback_list = [
        callbacks.ModelCheckpoint(
            checkpoint_file,
            monitor='val_accuracy',
            save_best_only=True,
            verbose=1
        ),
        callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=10,
            verbose=1
        ),
        callbacks.EarlyStopping(
            monitor='val_loss',
            patience=20,
            restore_best_weights=True
        )
    ]

    # Train model
    history = model.fit(
        X_train_scaled, y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(X_val_scaled, y_val),
        callbacks=callback_list,
        verbose=1
    )

    logged = history.history
    epochs_run = len(logged['loss'])

    # ReduceLROnPlateau adds the current learning rate to the epoch logs; the
    # key is 'learning_rate' or 'lr' depending on the Keras version. Fall back
    # to the configured constant only if neither key was recorded.
    lr_per_epoch = logged.get('learning_rate', logged.get('lr'))
    if lr_per_epoch is None:
        lr_per_epoch = [learning_rate] * epochs_run

    # Update training history (Keras reports accuracy as a fraction; the
    # notebook stores percentages)
    for epoch in range(epochs_run):
        training_history['epoch'].append(epoch + 1)
        training_history['train_loss'].append(logged['loss'][epoch])
        training_history['train_acc'].append(logged['accuracy'][epoch] * 100)
        training_history['val_loss'].append(logged['val_loss'][epoch])
        training_history['val_acc'].append(logged['val_accuracy'][epoch] * 100)
        training_history['lr'].append(float(lr_per_epoch[epoch]))

    return model, training_history

def train_sklearn_model():
    """Fit the notebook-level scikit-learn pipeline and record its accuracy.

    The pipeline is fitted on the unscaled ``X_train`` (it carries its own
    scaler step), scored on the train and validation splits, and pickled to
    ``../outputs/models/best_model.pkl``. ``training_history`` receives a
    single pseudo-epoch entry with zero placeholders for loss and learning
    rate.

    Returns:
        tuple: ``(model, training_history)``.
    """

    print("Training scikit-learn model...")
    model.fit(X_train, y_train)  # scaling happens inside the pipeline

    # Accuracy on both splits, expressed in percent
    train_acc = accuracy_score(y_train, model.predict(X_train)) * 100
    val_acc = accuracy_score(y_val, model.predict(X_val)) * 100

    # A single fit has no epochs: overwrite each history list with one entry
    training_history.update({
        'epoch': [1],
        'train_acc': [train_acc],
        'val_acc': [val_acc],
        'train_loss': [0],  # not applicable
        'val_loss': [0],    # not applicable
        'lr': [0],          # not applicable
    })

    print(f"Training completed!")
    print(f"Train Accuracy: {train_acc:.2f}%")
    print(f"Validation Accuracy: {val_acc:.2f}%")

    # Persist the fitted pipeline
    with open('../outputs/models/best_model.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

    return model, training_history

# Start training
# Every trainer returns the notebook-level `model` and `training_history`
# objects themselves, so `trained_model` and `history` are aliases of those
# globals rather than copies.
print(f"Starting {framework} model training...")
trained_model, history = train_model()
print("Training completed!")

## 5. Training Results Analysis


In [None]:
# Plot training history
# A 2x2 dashboard: loss, accuracy, learning rate, and the train/val accuracy gap.
if len(history['epoch']) > 1:  # Only plot if we have multiple epochs
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Training and validation loss
    axes[0, 0].plot(history['epoch'], history['train_loss'], label='Train Loss', marker='o')
    axes[0, 0].plot(history['epoch'], history['val_loss'], label='Val Loss', marker='s')
    axes[0, 0].set_title('Training and Validation Loss')
    axes[0, 0].set_xlabel('Epoch')
    axes[0, 0].set_ylabel('Loss')
    axes[0, 0].legend()
    axes[0, 0].grid(True)

    # Training and validation accuracy
    axes[0, 1].plot(history['epoch'], history['train_acc'], label='Train Acc', marker='o')
    axes[0, 1].plot(history['epoch'], history['val_acc'], label='Val Acc', marker='s')
    axes[0, 1].set_title('Training and Validation Accuracy')
    axes[0, 1].set_xlabel('Epoch')
    axes[0, 1].set_ylabel('Accuracy (%)')
    axes[0, 1].legend()
    axes[0, 1].grid(True)

    # Learning rate schedule (log axis so step decays stay readable)
    axes[1, 0].plot(history['epoch'], history['lr'], marker='o')
    axes[1, 0].set_title('Learning Rate Schedule')
    axes[1, 0].set_xlabel('Epoch')
    axes[1, 0].set_ylabel('Learning Rate')
    axes[1, 0].set_yscale('log')
    axes[1, 0].grid(True)

    # Overfitting analysis: positive values mean train accuracy exceeds validation
    overfitting = [train - val for train, val in zip(history['train_acc'], history['val_acc'])]
    axes[1, 1].plot(history['epoch'], overfitting, marker='o', color='red')
    axes[1, 1].set_title('Overfitting Analysis (Train - Val Accuracy)')
    axes[1, 1].set_xlabel('Epoch')
    axes[1, 1].set_ylabel('Accuracy Difference (%)')
    axes[1, 1].axhline(y=0, color='black', linestyle='--')
    axes[1, 1].grid(True)

    plt.tight_layout()
    # No earlier cell creates the figures directory, so make sure it exists;
    # savefig raises FileNotFoundError otherwise.
    history_figure_path = Path('../outputs/figures/training_history.png')
    history_figure_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(history_figure_path, dpi=300, bbox_inches='tight')
    plt.show()
else:
    print("Training history visualization skipped (single epoch or sklearn model)")

In [None]:
# Headline numbers for the finished training run
print("Training Summary:")
print(f"Framework: {framework}")
print(f"Total epochs: {len(history['epoch'])}")

if history['train_acc']:
    final_train_acc, final_val_acc = history['train_acc'][-1], history['val_acc'][-1]
    best_val_acc = max(history['val_acc'])

    print(f"Final train accuracy: {final_train_acc:.2f}%")
    print(f"Final validation accuracy: {final_val_acc:.2f}%")
    print(f"Best validation accuracy: {best_val_acc:.2f}%")
    print(f"Overfitting (train - val): {final_train_acc - final_val_acc:.2f}%")

    # First epoch (1-based) that reached the best validation accuracy
    best_epoch = 1 + history['val_acc'].index(best_val_acc)
    print(f"Best epoch: {best_epoch}")

    # Losses are reported only for multi-epoch runs that logged a non-zero loss
    if len(history['train_loss']) > 1 and any(loss > 0 for loss in history['train_loss']):
        final_train_loss, final_val_loss = history['train_loss'][-1], history['val_loss'][-1]
        print(f"Final train loss: {final_train_loss:.4f}")
        print(f"Final validation loss: {final_val_loss:.4f}")

## 6. Model Evaluation on Test Set


In [None]:
# Load best model and evaluate on test set
def evaluate_test_set():
    """Predict class labels for the held-out test split.

    Reads the notebook-level ``framework`` name. PyTorch and TensorFlow reload
    the best checkpoint written during training and predict on
    ``X_test_scaled``; scikit-learn uses the in-memory ``trained_model`` on
    the unscaled ``X_test`` because its pipeline scales internally.

    Returns:
        numpy.ndarray: One predicted integer class label per test sample.

    Raises:
        ValueError: If ``framework`` is not a supported name (previously an
            ``UnboundLocalError`` on ``test_predictions``).
    """

    if framework == 'pytorch':
        # Load best model; map_location keeps the load working if the
        # checkpoint was written on a different device than the current one
        model.load_state_dict(
            torch.load('../outputs/models/best_model.pth', map_location=device)
        )
        model.eval()

        with torch.no_grad():
            test_tensor = torch.FloatTensor(X_test_scaled).to(device)
            outputs = model(test_tensor)
            _, predicted = outputs.max(1)
            test_predictions = predicted.cpu().numpy()

    elif framework == 'tensorflow':
        # Load best model
        best_model = keras.models.load_model('../outputs/models/best_model.h5')
        probabilities = best_model.predict(X_test_scaled)
        if probabilities.ndim > 1 and probabilities.shape[-1] > 1:
            # Multi-class softmax output: pick the most probable class.
            # Thresholding here would yield one value per class per sample.
            test_predictions = probabilities.argmax(axis=-1)
        else:
            # Binary sigmoid output: threshold the single probability
            test_predictions = (probabilities > 0.5).astype(int).flatten()

    elif framework == 'scikit-learn':
        # Use trained model
        test_predictions = trained_model.predict(X_test)

    else:
        raise ValueError(
            f"Unsupported framework {framework!r}; "
            "expected 'pytorch', 'tensorflow' or 'scikit-learn'"
        )

    return test_predictions

# Evaluate on test set
print("Evaluating model on test set...")
test_pred = evaluate_test_set()

# Calculate test metrics
# Imported here because the framework cell only imports these metrics on the
# scikit-learn path.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

test_acc = accuracy_score(y_test, test_pred)  # fraction in [0, 1], not percent
print(f"\nTest Set Results:")
print(f"Test Accuracy: {test_acc:.4f} ({test_acc*100:.2f}%)")

print("\nDetailed Classification Report:")
print(classification_report(y_test, test_pred))

# Confusion Matrix
cm = confusion_matrix(y_test, test_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix - Test Set')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
# No earlier cell creates the figures directory, so make sure it exists;
# savefig raises FileNotFoundError otherwise.
confusion_figure_path = Path('../outputs/figures/test_confusion_matrix.png')
confusion_figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(confusion_figure_path, dpi=300, bbox_inches='tight')
plt.show()

## 7. Save Training Pipeline Results


In [None]:
# Create comprehensive training report
# Metric values are cast to builtin floats: NumPy scalars (which
# accuracy_score can return) would make yaml.dump emit python-object tags
# that yaml.safe_load cannot read back.
training_report = {
    'experiment_name': config['experiment']['name'],
    'framework': framework,
    'training_config': training_config,
    'model_architecture': {
        'input_size': len(feature_names),
        'hidden_layers': config['model']['hidden_layers'],
        'output_size': len(np.unique(y)),
        'dropout_rate': config['model']['dropout_rate']
    },
    'data_info': {
        'total_samples': len(X),
        'train_samples': len(X_train),
        'val_samples': len(X_val),
        'test_samples': len(X_test),
        'n_features': len(feature_names)
    },
    'training_results': {
        'epochs_trained': len(history['epoch']),
        'final_train_accuracy': float(history['train_acc'][-1]) if history['train_acc'] else 0,
        'final_val_accuracy': float(history['val_acc'][-1]) if history['val_acc'] else 0,
        'best_val_accuracy': float(max(history['val_acc'])) if history['val_acc'] else 0,
        # test_acc is a fraction; the report stores percentages throughout
        'test_accuracy': float(test_acc * 100),
        'overfitting_score': float(history['train_acc'][-1] - history['val_acc'][-1]) if history['train_acc'] and history['val_acc'] else 0
    },
    'artifacts_created': [
        'outputs/models/best_model.*',
        'outputs/models/feature_scaler.pkl',
        'outputs/figures/training_history.png',
        'outputs/figures/test_confusion_matrix.png',
        'outputs/reports/training_pipeline_report.yaml'
    ],
    'next_steps': [
        'Model evaluation with detailed metrics',
        'Hyperparameter tuning',
        'Model analysis and interpretation',
        'Systematic testing implementation'
    ],
    'timestamp': datetime.now().isoformat()
}

# Save training report
report_path = Path('../outputs/reports/training_pipeline_report.yaml')
report_path.parent.mkdir(parents=True, exist_ok=True)
with open(report_path, 'w') as f:
    yaml.dump(training_report, f, default_flow_style=False)

print(f"Training pipeline report saved to: {report_path}")

# Save training history (pickled next to the model artifacts)
history_path = Path('../outputs/models/training_history.pkl')
history_path.parent.mkdir(parents=True, exist_ok=True)
with open(history_path, 'wb') as f:
    pickle.dump(history, f)

print(f"Training history saved to: {history_path}")

# Print summary
print("\n" + "="*50)
print("TRAINING PIPELINE COMPLETED SUCCESSFULLY")
print("="*50)
print(f"Framework: {framework}")
print(f"Best Validation Accuracy: {training_report['training_results']['best_val_accuracy']:.2f}%")
print(f"Test Accuracy: {training_report['training_results']['test_accuracy']:.2f}%")
print(f"Model saved and ready for evaluation stage")
print("\nNext: Run notebook 03_model_evaluation.ipynb")