# Phase 4: Modeling

## Fannie Mae 2008Q1 Stress Testing - Credit Default Risk Modeling

---

### CRISP-DM Phase 4: Train Classification Models

**Goal**: Train multiple models to achieve AUC-ROC ≥ 0.70

In [None]:
# Import Libraries
import pickle
import warnings

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, 
    f1_score, roc_auc_score
)
# Silence only deprecation chatter from the libraries. A blanket
# filterwarnings('ignore') would also hide scikit-learn's ConvergenceWarning,
# which is the only signal that an iteration-capped estimator (Logistic
# Regression, MLP) stopped at max_iter before converging.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Single seed passed to every estimator so that reruns are reproducible.
RANDOM_STATE = 42
print("Libraries imported successfully!")

## 4.1 Load Prepared Data

In [None]:
# Load data from Phase 3
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load an artifact produced by your own Phase 3 run, never an untrusted copy.
PHASE3_DATA_PATH = 'phase3_prepared_data.pkl'
REQUIRED_KEYS = ('X_train', 'X_test', 'y_train', 'y_test', 'features')

with open(PHASE3_DATA_PATH, 'rb') as f:
    data = pickle.load(f)

# Fail here with a descriptive message instead of a bare KeyError further down.
missing_keys = [key for key in REQUIRED_KEYS if key not in data]
if missing_keys:
    raise KeyError(f"{PHASE3_DATA_PATH} is missing expected keys: {missing_keys}")

X_train = data['X_train']
X_test = data['X_test']
y_train = data['y_train']
y_test = data['y_test']
features = data['features']

# Catch misaligned splits now rather than as an opaque error inside model.fit().
assert len(X_train) == len(y_train), "X_train and y_train have different lengths"
assert len(X_test) == len(y_test), "X_test and y_test have different lengths"

print(f"Training set: {len(X_train):,} samples")
print(f"Test set: {len(X_test):,} samples")
print(f"Features: {len(features)}")
# Assumes y_train is a 0/1 label vector, so its mean is the default rate - TODO confirm.
print(f"Default rate: {y_train.mean()*100:.2f}%")

## 4.2 Define Models

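We'll train 4 models for credit risk prediction. Hyperparameters are fixed, hand-picked values (no hyperparameter search is run in this notebook); Logistic Regression and Random Forest additionally use `class_weight='balanced'` to account for class imbalance: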

In [None]:
# Define the candidate models.
# Hyperparameters are fixed, hand-set values; no search is performed in this notebook.
# Every estimator receives RANDOM_STATE so that reruns are reproducible.
models = {
    # Linear baseline. class_weight='balanced' reweights classes inversely to
    # their frequency; C=0.1 is a stronger-than-default L2 penalty.
    'Logistic Regression': LogisticRegression(
        random_state=RANDOM_STATE,
        max_iter=1000,
        class_weight='balanced',
        C=0.1,  # Regularization (inverse strength: smaller = stronger)
        solver='lbfgs'
    ),

    # Bagged trees; depth and leaf-size limits cap the size of each tree.
    'Random Forest': RandomForestClassifier(
        n_estimators=200,
        max_depth=15,
        min_samples_split=10,
        min_samples_leaf=5,
        random_state=RANDOM_STATE,
        class_weight='balanced',
        n_jobs=-1  # use all available cores
    ),

    # NOTE(review): GradientBoostingClassifier has no class_weight option, so
    # this model is trained without imbalance correction. Its thresholded
    # metrics (precision/recall/F1) are therefore not directly comparable with
    # the two class-weighted models above; AUC-ROC is threshold-free.
    'Gradient Boosting': GradientBoostingClassifier(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        min_samples_split=10,
        min_samples_leaf=5,
        subsample=0.8,  # each tree sees a random 80% of the rows
        random_state=RANDOM_STATE
    ),

    # The previous learning_rate='adaptive' argument was removed: scikit-learn
    # only honours that schedule when solver='sgd', so with solver='adam' it
    # had no effect and merely suggested a schedule that was never applied.
    # NOTE(review): like Gradient Boosting, the MLP is trained without class
    # weighting.
    'Neural Network (MLP)': MLPClassifier(
        hidden_layer_sizes=(100, 50),
        activation='relu',
        solver='adam',
        alpha=0.001,
        batch_size=256,
        max_iter=200,
        random_state=RANDOM_STATE
    )
}

print(f"Defined {len(models)} models for training:")
for name in models:
    print(f"  - {name}")

## 4.3 Train and Evaluate Models

In [None]:
# Train and evaluate each model
# Success threshold for AUC-ROC (the Phase 4 goal). Kept in one place so the
# comparison and the printed messages cannot drift apart.
TARGET_AUC = 0.70

# model name -> dict holding the fitted estimator, its test-set metrics and
# its test-set predictions.
results = {}

print("="*80)
print("TRAINING MODELS")
print("="*80)

# NOTE(review): every metric below is computed on X_test, and the best model is
# later chosen from these same numbers, so the winning score is an optimistic
# estimate. Consider a validation split or cross-validation for model selection.
for model_name, model in models.items():
    print(f"\n>>> {model_name}")
    print("    Training...")

    # Train on the training split only
    model.fit(X_train, y_train)

    # Hard class labels for the thresholded metrics
    y_pred = model.predict(X_test)
    # Column 1 is the probability of the second entry in model.classes_
    # (assumed to be the "default" class with 0/1 labels - TODO confirm).
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # zero_division=0 reports 0.0 instead of warning when a model predicts no
    # positives at all.
    metrics = {
        'model': model,
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, zero_division=0),
        'recall': recall_score(y_test, y_pred, zero_division=0),
        'f1_score': f1_score(y_test, y_pred, zero_division=0),
        'auc_roc': roc_auc_score(y_test, y_pred_proba),
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba
    }

    results[model_name] = metrics

    # Display results
    print(f"    Accuracy:  {metrics['accuracy']:.4f}")
    print(f"    Precision: {metrics['precision']:.4f}")
    print(f"    Recall:    {metrics['recall']:.4f}")
    print(f"    F1-Score:  {metrics['f1_score']:.4f}")
    print(f"    AUC-ROC:   {metrics['auc_roc']:.4f}")

    # Check if target achieved
    if metrics['auc_roc'] >= TARGET_AUC:
        print(f"    ✓ TARGET ACHIEVED (AUC ≥ {TARGET_AUC:.2f})")
    else:
        print(f"    ✗ Below target (need {TARGET_AUC - metrics['auc_roc']:.4f} more)")

print("\n" + "="*80)
print("TRAINING COMPLETE")
print("="*80)

## 4.4 Results Summary

In [None]:
# Create comparison table: one row per model, one column per metric.
# Maps the keys used in `results` to the column labels shown in the table.
METRIC_LABELS = {
    'accuracy': 'Accuracy',
    'precision': 'Precision',
    'recall': 'Recall',
    'f1_score': 'F1-Score',
    'auc_roc': 'AUC-ROC',
}

# orient='index' makes each outer key (model name) a row directly, so no
# transpose is needed afterwards.
comparison_df = pd.DataFrame.from_dict(
    {
        model_name: {label: res[key] for key, label in METRIC_LABELS.items()}
        for model_name, res in results.items()
    },
    orient='index',
)

print("\nModel Comparison:")
# Rounded copy for display only; comparison_df itself keeps full precision.
comparison_df.round(4)

In [None]:
# Find best model: the row of comparison_df with the highest AUC-ROC.
TARGET_AUC = 0.70  # success threshold for AUC-ROC (the Phase 4 goal)

best_model_name = comparison_df['AUC-ROC'].idxmax()
best_auc = comparison_df['AUC-ROC'].max()

print(f"\nBest Model: {best_model_name}")
print(f"Best AUC-ROC: {best_auc:.4f}")

if best_auc >= TARGET_AUC:
    print(f"\n🎉 SUCCESS! Target AUC-ROC ≥ {TARGET_AUC:.2f} achieved!")
else:
    print("\n⚠️ Target not reached. Need additional features or tuning.")
    print(f"   Gap to target: {TARGET_AUC - best_auc:.4f}")

In [None]:
# Save results for Phase 5
# Output path is held once so the file written and the confirmation message
# cannot drift apart.
PHASE4_RESULTS_PATH = 'phase4_results.pkl'

# NOTE: `results` embeds the fitted estimator objects (under the 'model' key),
# so Phase 5 must unpickle this file with a compatible scikit-learn version.
phase4_results = {
    'results': results,
    'comparison_df': comparison_df,
    'best_model_name': best_model_name,
    'y_test': y_test,
    'features': features
}

with open(PHASE4_RESULTS_PATH, 'wb') as f:
    pickle.dump(phase4_results, f)

print(f"\n✓ Results saved to {PHASE4_RESULTS_PATH}")

---
## ✅ Phase 4 Complete

**Models Trained**:
1. Logistic Regression (balanced, regularized)
2. Random Forest (200 trees, max_depth=15)
3. Gradient Boosting (200 trees, learning_rate=0.1)
4. Neural Network MLP (2 hidden layers)

**Next**: Phase 5 - Evaluation