# Training of Final DNN
This Jupyter notebook trains and saves the final version of the DNN. It is written to be fully reproducible: the Python, NumPy and TensorFlow random seeds are all fixed.

In [None]:
import pandas as pd
import numpy as np
import json
import random
from itertools import product
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
import matplotlib.pyplot as plt

# Reproducibility: seed Python, NumPy and TensorFlow (in that order) with one value
RANDOM_SEED = 42
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(RANDOM_SEED)

# Load and prepare data
df = pd.read_csv('../datasets/MEGAFRAME_CLEANEDV2.csv')
X = df.drop(columns=['UNEMP', 'Reference area', 'REF_AREA', 'TIME_PERIOD'])
y = df['UNEMP']

categorical_features = ['Region']
# Every remaining column that is not listed as categorical is scaled as numeric.
numerical_features = X.columns.difference(categorical_features)

preprocessor = ColumnTransformer(
    [
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    # Always return a dense array so len() and the Keras calls below work
    # regardless of how sparse the one-hot columns are.
    sparse_threshold=0,
)

# Create proper train/validation/test split.
# The RAW frame is split first and the preprocessor is fitted on the training
# rows only, so scaling statistics and known categories never see the
# validation or test rows.
X_temp_raw, X_test_raw, y_temp, y_test = train_test_split(
    X, y, test_size=0.15, random_state=RANDOM_SEED, stratify=None
)
# 15% of the remaining 85% -> roughly 12.75% of all rows end up in validation.
X_train_raw, X_val_raw, y_train_full, y_val = train_test_split(
    X_temp_raw, y_temp, test_size=0.15, random_state=RANDOM_SEED
)

X_train_full = preprocessor.fit_transform(X_train_raw)
X_val = preprocessor.transform(X_val_raw)
X_test = preprocessor.transform(X_test_raw)

# Kept for any later cell that expects these names: train+validation rows and
# the full dataset, both encoded with the training-fitted preprocessor.
X_temp = preprocessor.transform(X_temp_raw)
X_processed = preprocessor.transform(X)

n_total = len(X_processed)
print("Data split:")
print(f"  Training: {len(X_train_full)} samples ({len(X_train_full)/n_total*100:.1f}%)")
print(f"  Validation: {len(X_val)} samples ({len(X_val)/n_total*100:.1f}%)")
print(f"  Test (unseen): {len(X_test)} samples ({len(X_test)/n_total*100:.1f}%)")
print(f"  Total: {n_total} samples\n")

def create_model(architecture, dropout_rate, l2_reg, learning_rate, input_shape):
    """Build and compile a fully connected regression network.

    Args:
        architecture: Non-empty sequence of layer widths. The first entry is
            the width of the input-facing Dense layer, the rest are the
            following hidden layers in order.
        dropout_rate: Dropout rate applied after the first layer. Later
            layers use half this rate, and only when they have more than
            16 units.
        l2_reg: L2 kernel-regularisation factor used on every hidden layer.
        learning_rate: Learning rate passed to the Adam optimizer.
        input_shape: Number of input features (an int, not a tuple).

    Returns:
        A compiled ``Sequential`` model with a single-unit output layer,
        ``'mse'`` loss and ``'mae'`` as an extra metric.

    Raises:
        ValueError: If ``architecture`` is empty.
    """
    if len(architecture) == 0:
        raise ValueError("architecture must contain at least one layer width")

    # Look the regulariser up through the tf namespace instead of the bare
    # module-level name `l2`, so this function keeps working even if that
    # name is rebound elsewhere in the notebook (e.g. used as a loop variable).
    l2_regularizer = tf.keras.regularizers.l2

    model = Sequential()

    # First layer
    model.add(Dense(architecture[0], activation='relu',
                    kernel_regularizer=l2_regularizer(l2_reg),
                    input_shape=(input_shape,)))
    model.add(BatchNormalization())
    model.add(Dropout(dropout_rate))

    # Hidden layers
    for units in architecture[1:]:
        model.add(Dense(units, activation='relu',
                        kernel_regularizer=l2_regularizer(l2_reg)))
        model.add(BatchNormalization())
        if units > 16:  # Only add dropout to larger layers
            model.add(Dropout(dropout_rate * 0.5))  # Reduce dropout in deeper layers

    # Output layer: one unit, no activation argument
    model.add(Dense(1))

    # Compile
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])

    return model

def evaluate_hyperparameters(params, X_train, y_train, X_val, y_val):
    """Train one configuration and score it on the validation split.

    Args:
        params: Dict with the keys ``'architecture'``, ``'dropout_rate'``,
            ``'l2_reg'``, ``'learning_rate'`` and ``'batch_size'``.
        X_train, y_train: Training features and targets.
        X_val, y_val: Validation features and targets, used both for the
            early-stopping / learning-rate callbacks and for the returned
            metrics.

    Returns:
        Dict with:
            ``'val_mse'``, ``'val_mae'``, ``'val_r2'``: validation metrics
                computed with scikit-learn on the model's predictions.
            ``'val_score'``: alias of ``'val_mse'``; the results report in
                this notebook reads this key.
            ``'overfitting_gap'``: validation MSE minus training MSE.
            ``'params'``: the ``params`` dict that was passed in.
    """
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

    # Create model
    model = create_model(
        architecture=params['architecture'],
        dropout_rate=params['dropout_rate'],
        l2_reg=params['l2_reg'],
        learning_rate=params['learning_rate'],
        input_shape=X_train.shape[1]
    )

    # Callbacks
    early_stop = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True, verbose=0)
    lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=8, min_lr=1e-7, verbose=0)

    # Train (the returned History object is not needed here)
    model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=200,
        batch_size=params['batch_size'],
        callbacks=[early_stop, lr_scheduler],
        verbose=0
    )

    # Predict on both splits so the train/validation gap can be measured
    val_predictions = model.predict(X_val, verbose=0)
    train_predictions = model.predict(X_train, verbose=0)

    val_mse = mean_squared_error(y_val, val_predictions)
    val_mae = mean_absolute_error(y_val, val_predictions)
    val_r2 = r2_score(y_val, val_predictions)

    train_mse = mean_squared_error(y_train, train_predictions)
    overfitting_gap = val_mse - train_mse

    return {
        'val_mse': val_mse,
        'val_mae': val_mae,
        'val_r2': val_r2,
        'val_score': val_mse,
        'overfitting_gap': overfitting_gap,
        'params': params
    }

def detect_overfitting(history):
    """Print an overfitting report for a training run and return the gap.

    Args:
        history: Object whose ``history`` attribute is a dict holding the
            per-epoch ``'loss'`` and ``'val_loss'`` sequences (the object
            returned by ``model.fit``).

    Returns:
        The overfitting gap: validation loss minus training loss, both taken
        at the epoch where validation loss is lowest.

    Raises:
        ValueError: If the history contains no epochs.
    """
    train_loss = history.history['loss']
    val_loss = history.history['val_loss']

    if len(val_loss) == 0:
        raise ValueError("history contains no epochs to analyze")

    # Find minimum validation loss epoch
    min_val_epoch = np.argmin(val_loss)
    min_val_loss = val_loss[min_val_epoch]
    train_loss_at_min_val = train_loss[min_val_epoch]

    # Calculate overfitting metrics
    overfitting_gap = min_val_loss - train_loss_at_min_val

    # Slope of a straight-line fit over the last `recent_epochs` epochs.
    # `>=` so that a run of exactly `recent_epochs` epochs is analysed too.
    recent_epochs = 20
    if len(val_loss) >= recent_epochs:
        epoch_axis = np.arange(recent_epochs)
        recent_val_trend = np.polyfit(epoch_axis, val_loss[-recent_epochs:], 1)[0]
        recent_train_trend = np.polyfit(epoch_axis, train_loss[-recent_epochs:], 1)[0]
    else:
        # Too few epochs for a trend estimate: treat both as flat
        recent_val_trend = 0
        recent_train_trend = 0

    print(f"\n{'='*50}")
    print("OVERFITTING ANALYSIS")
    print(f"{'='*50}")
    print(f"Final Training Loss:   {train_loss[-1]:.4f}")
    print(f"Final Validation Loss: {val_loss[-1]:.4f}")
    print(f"Overfitting Gap:       {overfitting_gap:.4f}")

    if overfitting_gap > 0.5:
        print("⚠️  HIGH OVERFITTING DETECTED")
        print("   - Validation loss significantly higher than training loss")
        print("   - Consider: More regularization, less model complexity, more data")
    elif overfitting_gap > 0.2:
        print("⚠️  MODERATE OVERFITTING")
        print("   - Some overfitting present but manageable")
        print("   - Consider: Slight increase in regularization")
    else:
        print("✅ GOOD GENERALIZATION")
        print("   - Training and validation losses are similar")

    # Validation loss rising while training loss keeps falling
    if recent_val_trend > 0 and recent_train_trend < 0:
        print("⚠️  DIVERGING TRENDS DETECTED")
        print("   - Validation loss increasing while training loss decreasing")

    return overfitting_gap

# Full candidate grid. Its size is reported below; the messages state that
# only the top combinations are actually tested.
hyperparameter_space = dict(
    architecture=[
        [128, 64, 32],           # three layers, narrowest option
        [256, 128, 64],          # three layers, medium width
        [512, 256, 128, 64],     # four layers, widest option
        [256, 128, 64, 32, 16],  # five layers, deepest option
    ],
    dropout_rate=[0.05, 0.1, 0.2],
    l2_reg=[0.0001, 0.001, 0.01],
    learning_rate=[0.0001, 0.001, 0.01],
    batch_size=[8, 16, 32],
)

# Count the Cartesian product of all candidate lists
grid_size = sum(1 for _ in product(*hyperparameter_space.values()))

print("Starting Hyperparameter Optimization...")
print(f"Total combinations to test: {grid_size}")
print("This will take a while - testing top combinations only...\n")

# Centered search around winning hyperparameters: [256,128,64], batch=32, dropout=0.1, l2=0.001, lr=0.001
results = []

# Architecture variations centered around winning [256, 128, 64]
winning_architectures = [
    [256, 128, 64],          # Original winner
    [224, 112, 56],          # Slightly smaller
    [288, 144, 72],          # Slightly larger
    [256, 128, 64, 32],      # Add one more layer
    [320, 160, 80],          # 25% larger
    [384, 192, 96],          # 50% larger
    [256, 128],              # Shallower version
    [256, 192, 128, 64],     # Different taper
]

# Batch sizes centered around winning 32
batch_sizes = [24, 28, 32, 36, 40]

# Dropout rates centered around winning 0.1
dropout_rates = [0.08, 0.1, 0.12, 0.15]

# L2 regularization centered around winning 0.001
l2_values = [0.0005, 0.001, 0.002, 0.003]

# Learning rates centered around winning 0.001
learning_rates = [0.0008, 0.001, 0.0012, 0.0015]

# The winning configuration; every sweep below changes exactly one field of it.
baseline_params = {
    'architecture': [256, 128, 64],
    'dropout_rate': 0.1,
    'l2_reg': 0.001,
    'learning_rate': 0.001,
    'batch_size': 32,
}

test_combinations = []


def _add_combination(**overrides):
    """Append baseline_params with `overrides` applied, skipping exact duplicates."""
    candidate = {**baseline_params, **overrides}
    # Give each entry its own list so entries never share an architecture object
    candidate['architecture'] = list(candidate['architecture'])
    if candidate not in test_combinations:
        test_combinations.append(candidate)


# Test each architecture with optimal baseline params
for arch in winning_architectures:
    _add_combination(architecture=arch)

# Test batch size variations with winning architecture
for bs in batch_sizes:
    _add_combination(batch_size=bs)

# Test dropout variations with winning architecture
for dropout in dropout_rates:
    _add_combination(dropout_rate=dropout)

# Test L2 regularization variations.
# The loop variable must NOT be called `l2`: that would rebind the imported
# Keras `l2` regularizer at module level.
for l2_value in l2_values:
    _add_combination(l2_reg=l2_value)

# Test learning rate fine-tuning
for lr in learning_rates:
    _add_combination(learning_rate=lr)

# Add a few promising combinations based on earlier results
# Larger model with more regularization
_add_combination(architecture=[384, 192, 96], dropout_rate=0.12, l2_reg=0.002)
# Deeper model with careful regularization
_add_combination(architecture=[256, 192, 128, 64], l2_reg=0.0015)
# Best combo with slightly different batch size (already produced by the
# batch-size sweep above, so it is skipped as a duplicate)
_add_combination(batch_size=28)

# Train and score every candidate configuration, collecting the metrics
n_combinations = len(test_combinations)
for run_number, params in enumerate(test_combinations, start=1):
    print(f"Testing combination {run_number}/{n_combinations}: {params}")
    result = evaluate_hyperparameters(params, X_train_full, y_train_full, X_val, y_val)
    results.append(result)
    summary = (
        f"  Val MSE: {result['val_mse']:.4f} | Val MAE: {result['val_mae']:.2f}"
        f" | R²: {result['val_r2']:.3f} | Overfitting: {result['overfitting_gap']:.4f}"
    )
    print(summary)
    print()

# Find best parameters: lowest validation MSE wins.
best_result = min(results, key=lambda x: x['val_mse'])
# Lowest validation MSE among runs whose train/validation gap is below 4.0,
# falling back to best_result when no run qualifies.
# NOTE(review): best_balanced is not used by the code visible in this
# notebook section; the final model is built from best_result.
best_balanced = min((r for r in results if r['overfitting_gap'] < 4.0),
                    key=lambda x: x['val_mse'], default=best_result)

print(f"\n{'='*60}")
print("HYPERPARAMETER OPTIMIZATION RESULTS")
print(f"{'='*60}")

# The "validation score" shown below is the validation MSE ('val_mse'),
# the same quantity used to pick best_result.
print("\nAll Results (sorted by validation score):")
for i, result in enumerate(sorted(results, key=lambda x: x['val_mse'])):
    print(f"{i+1}. Val Score: {result['val_mse']:.4f} | "
          f"Overfitting: {result['overfitting_gap']:.4f} | {result['params']}")

print("\n🏆 BEST PARAMETERS:")
print(f"   Architecture: {best_result['params']['architecture']}")
print(f"   Dropout Rate: {best_result['params']['dropout_rate']}")
print(f"   L2 Regularization: {best_result['params']['l2_reg']}")
print(f"   Learning Rate: {best_result['params']['learning_rate']}")
print(f"   Batch Size: {best_result['params']['batch_size']}")
print(f"   Validation Score: {best_result['val_mse']:.4f}")
print(f"   Overfitting Gap: {best_result['overfitting_gap']:.4f}")

# Rebuild the winning configuration from scratch, train it with longer
# patience than during the search, then report on overfitting.
section_rule = '=' * 60
print(f"\n{section_rule}", "TRAINING FINAL MODEL WITH BEST PARAMETERS", section_rule, sep="\n")

best_params = best_result['params']

final_model = create_model(
    architecture=best_params['architecture'],
    dropout_rate=best_params['dropout_rate'],
    l2_reg=best_params['l2_reg'],
    learning_rate=best_params['learning_rate'],
    input_shape=X_train_full.shape[1],
)

# Stop after 20 epochs without val_loss improvement and restore the best weights
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=20,
    restore_best_weights=True,
    verbose=1,
)
# Halve the learning rate after 10 epochs without val_loss improvement
lr_scheduler = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=10,
    min_lr=1e-7,
    verbose=1,
)

history = final_model.fit(
    X_train_full,
    y_train_full,
    validation_data=(X_val, y_val),
    epochs=500,
    batch_size=best_params['batch_size'],
    callbacks=[early_stop, lr_scheduler],
    verbose=1,
)

# Print the overfitting report and keep the measured gap
overfitting_gap = detect_overfitting(history)

# FINAL TEST EVALUATION (completely unseen data)
print(f"\n{'='*60}")
print("FINAL TEST EVALUATION (COMPLETELY UNSEEN DATA)")
print(f"{'='*60}")

# evaluate() returns [loss, mae] because the model is compiled with
# metrics=['mae']. The loss is the training objective, i.e. MSE plus the
# L2 regularization penalty, so it is not a pure MSE.
test_loss = final_model.evaluate(X_test, y_test, verbose=0)
test_total_loss = test_loss[0]
test_predictions = final_model.predict(X_test, verbose=0)

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
test_mse = mean_squared_error(y_test, test_predictions)
test_mae = mean_absolute_error(y_test, test_predictions)
test_r2 = r2_score(y_test, test_predictions)

print("Test Results (Never seen this data before!):")
print(f"  Test MSE: {test_mse:.4f}")
print(f"  Test Loss (MSE + L2 penalty): {test_total_loss:.4f}")
print(f"  Test MAE: {test_mae:.2f}")
print(f"  Test R² Score: {test_r2:.3f}")

# Compare with validation performance. Both numbers are Keras losses
# (including the regularization penalty), so the comparison is like-for-like.
val_loss = min(history.history['val_loss'])
loss_difference = abs(test_total_loss - val_loss)
print("\nValidation vs Test Comparison:")
print(f"  Validation Loss: {val_loss:.4f}")
print(f"  Test Loss:       {test_total_loss:.4f}")
print(f"  Difference:      {loss_difference:.4f}")

if loss_difference < 0.1:
    print("✅ EXCELLENT: Test and validation performance are very similar!")
    print("   Your model generalizes well to completely unseen data.")
elif loss_difference < 0.3:
    print("✅ GOOD: Test and validation performance are reasonably similar.")
    print("   Your model generalizes adequately.")
else:
    print("⚠️  WARNING: Large difference between test and validation performance.")
    print("   Your hyperparameter tuning may have overfit to the validation set.")

# Save the optimized model
# NOTE(review): the fitted `preprocessor` is not saved in the code visible
# here; predictions on new raw data need the same fitted transformer —
# confirm it is persisted elsewhere.
final_model.save('Unemployment_AI_Optimized.keras')

print(f"\n✅ Optimized model saved as 'Unemployment_AI_Optimized.keras'")
print(f"   Final test performance: MAE = {test_mae:.2f}, R² = {test_r2:.3f}")

: 