# AI Decision Tree for Loan Default Prediction

**AN6001 - AI & Big Data in Business**  
**Author**: Mingkai Wang (#G2401001J)  
**Date**: December 2, 2024

---

## Project Overview

This project implements an intelligent loan default prediction system using decision tree algorithms. The system analyzes employment status, bank balance, and annual salary to assess default risk for financial institutions.

### Key Features:
- Binary classification for loan default prediction
- Feature importance analysis for risk factors
- Comprehensive model evaluation metrics
- Real-world financial dataset with 10,000+ records

---

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix, 
    roc_curve, auc, precision_recall_curve
)
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global random generator so random draws repeat from run to run
np.random.seed(42)

# Startup confirmation messages (one per line)
print(
    "Libraries imported successfully",
    "AI Decision Tree Loan Prediction System initialized",
    sep="\n",
)

## Data Loading and Exploration

Loading the loan default dataset with employment, financial, and default status information.

In [None]:
# Load the loan default dataset from the CSV in the working directory
df = pd.read_csv('loan_default_data.csv')

print(f"Dataset shape: {df.shape}")
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line.
df.info()

print("\nFirst 5 rows:")
# Left as the final bare expression so the notebook renders the preview table
df.head()

In [None]:
# Data exploration: descriptive statistics, target balance, employment mix, nulls
print("Summary Statistics:")
summary_stats = df.describe()
print(summary_stats)

# Target column: value counts, then its mean (reported as the default rate)
default_flags = df['Defaulted?']
print("\nDefault distribution:")
print(default_flags.value_counts())
print(f"\nDefault rate: {default_flags.mean():.4f}")

# Number of rows per employment value
print("\nEmployment status distribution:")
print(df['Employed'].value_counts())

# Null count for every column
print("\nMissing values:")
missing_per_column = df.isnull().sum()
print(missing_per_column)

## Exploratory Data Analysis (EDA)

Analyzing the relationships between features and loan defaults.

In [None]:
# Comprehensive EDA visualizations: a 2x3 grid relating each feature to default status
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# Default rate by employment status.
# Reindex to the codes [0, 1] so every bar is paired with its hard-coded label
# regardless of which groups (and in which order) groupby() returns.
default_by_employment = df.groupby('Employed')['Defaulted?'].mean().reindex([0, 1])
axes[0, 0].bar(['Unemployed', 'Employed'], default_by_employment.values,
               color=['red', 'green'], alpha=0.7)
axes[0, 0].set_title('Default Rate by Employment Status')
axes[0, 0].set_ylabel('Default Rate')

# Bank balance distribution by default status
df.boxplot(column='Bank Balance', by='Defaulted?', ax=axes[0, 1])
axes[0, 1].set_title('Bank Balance Distribution by Default Status')
axes[0, 1].set_xlabel('Defaulted? (0=No, 1=Yes)')

# Annual salary distribution by default status
df.boxplot(column='Annual Salary', by='Defaulted?', ax=axes[0, 2])
axes[0, 2].set_title('Annual Salary Distribution by Default Status')
axes[0, 2].set_xlabel('Defaulted? (0=No, 1=Yes)')

# DataFrame.boxplot(by=...) stamps a figure-wide "Boxplot grouped by ..." suptitle,
# which is misleading on a six-panel figure, so clear it.
fig.suptitle('')

# Correlation heatmap. Restrict to numeric/boolean columns: recent pandas
# versions raise instead of silently dropping non-numeric columns in corr().
correlation_matrix = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=axes[1, 0])
axes[1, 0].set_title('Feature Correlation Matrix')

# Default distribution.
# value_counts() orders by frequency, so pin the order to [0, 1] to keep the
# 'No Default' / 'Default' labels and colours attached to the right wedge.
default_counts = df['Defaulted?'].value_counts().reindex([0, 1], fill_value=0)
axes[1, 1].pie(default_counts.values, labels=['No Default', 'Default'],
               autopct='%1.1f%%', colors=['lightgreen', 'lightcoral'])
axes[1, 1].set_title('Overall Default Distribution')

# Scatter plot: Bank Balance vs Annual Salary colored by default
colors = ['green' if x == 0 else 'red' for x in df['Defaulted?']]
axes[1, 2].scatter(df['Bank Balance'], df['Annual Salary'], c=colors, alpha=0.6)
axes[1, 2].set_xlabel('Bank Balance')
axes[1, 2].set_ylabel('Annual Salary')
axes[1, 2].set_title('Bank Balance vs Annual Salary (Green=No Default, Red=Default)')

plt.tight_layout()
plt.show()

print("Exploratory Data Analysis completed")

## Feature Engineering and Data Preprocessing

Preparing the data for machine learning model training.

In [None]:
# Feature engineering
def create_features(data):
    """Return a copy of ``data`` with engineered loan-risk features added.

    Reads the ``'Annual Salary'`` and ``'Bank Balance'`` columns and adds:

    * ``Debt_to_Income``: ``(salary - balance) / salary`` clipped to [0, 1].
    * ``Financial_Stability``: ``balance / (salary + 1)``.
    * ``Income_Category``: ``'Low'`` / ``'Medium'`` / ``'High'``, split at the
      33rd and 67th salary percentiles of the rows passed in.
    * ``Income_Category_Encoded``: ordinal code for the category
      (Low=0, Medium=1, High=2; -1 when the salary is missing).

    Args:
        data: DataFrame holding at least the two source columns.

    Returns:
        A new DataFrame; ``data`` itself is not modified.

    Raises:
        KeyError: if either source column is absent.
        ValueError: raised by ``pd.cut`` when the two percentile cut points
            coincide, because bin edges must be unique.
    """
    df_features = data.copy()
    salary = df_features['Annual Salary']
    balance = df_features['Bank Balance']

    # "Debt-to-income" proxy (assumes bank balance represents available funds).
    # A zero salary with a zero balance is 0/0 -> NaN, which clip() would keep;
    # pin it to 0.0, the value other zero-salary rows already clip to.
    ratio = (salary - balance) / salary
    ratio = ratio.mask((salary == 0) & (balance == 0), 0.0)
    df_features['Debt_to_Income'] = ratio.clip(0, 1)  # Cap at 100%

    # Financial stability indicator; the +1 avoids dividing by a zero salary
    df_features['Financial_Stability'] = balance / (salary + 1)

    # Income terciles. The lowest edge is -inf rather than 0 because pd.cut
    # bins are open on the left, so a salary of exactly 0 would otherwise fall
    # outside every bin and become NaN.
    # NOTE(review): cut points come from whatever rows are passed in; when
    # called on the full dataset before the train/test split, test rows
    # influence them.
    low_cut, high_cut = salary.quantile([0.33, 0.67])
    df_features['Income_Category'] = pd.cut(
        salary,
        bins=[float('-inf'), low_cut, high_cut, float('inf')],
        labels=['Low', 'Medium', 'High'],
    )

    # Use the categorical's own codes so the encoding follows the label order
    # (Low < Medium < High). An alphabetical label encoder would yield
    # High=0, Low=1, Medium=2 and scramble the ordering a tree splits on.
    df_features['Income_Category_Encoded'] = df_features['Income_Category'].cat.codes

    return df_features

# Run the feature-engineering step on the loaded data
df_engineered = create_features(df)

print("Feature engineering completed")
print("New features created: Debt_to_Income, Financial_Stability, Income_Category")
print(f"Dataset shape after feature engineering: {df_engineered.shape}")

# Descriptive statistics for the two continuous engineered columns
print("\nNew feature statistics:")
engineered_numeric = df_engineered[['Debt_to_Income', 'Financial_Stability']]
print(engineered_numeric.describe())

In [None]:
# Model inputs: the original columns plus the engineered ones
feature_columns = [
    'Employed',
    'Bank Balance',
    'Annual Salary',
    'Debt_to_Income',
    'Financial_Stability',
    'Income_Category_Encoded',
]

# Feature matrix and target vector
X, y = df_engineered[feature_columns], df_engineered['Defaulted?']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeature columns: {feature_columns}")

# Hold out 20% of the rows for testing, stratified on the target and with a
# fixed random_state so the split is repeatable
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"\nTraining set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Training set default rate: {y_train.mean():.4f}")
print(f"Test set default rate: {y_test.mean():.4f}")

## Decision Tree Model Training and Optimization

Training and optimizing the decision tree classifier for loan default prediction.

In [None]:
# Candidate decision tree configurations to compare (same seed for each)
models = {
    'Default': DecisionTreeClassifier(random_state=42),
    'Max_Depth_5': DecisionTreeClassifier(max_depth=5, random_state=42),
    'Max_Depth_10': DecisionTreeClassifier(max_depth=10, random_state=42),
    'Min_Samples_Split_10': DecisionTreeClassifier(min_samples_split=10, random_state=42),
    'Balanced': DecisionTreeClassifier(class_weight='balanced', random_state=42)
}

# Per-model record: fitted estimator, test accuracy, CV mean/std,
# test-set predictions and default probabilities
results = {}

for name, model in models.items():
    print(f"\nTraining {name} Decision Tree...")

    # Fit on the training split
    model.fit(X_train, y_train)

    # Score the held-out test split
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is taken as the "default" class probability
    # (presumes the target is coded 0/1 -- confirm against the dataset)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)

    # 5-fold cross-validation restricted to the training split
    cv_scores = cross_val_score(model, X_train, y_train, cv=5)

    results[name] = {
        'model': model,
        'accuracy': accuracy,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'predictions': y_pred,
        'probabilities': y_pred_proba
    }

    print(f"   Accuracy: {accuracy:.4f}")
    print(f"   CV Score: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")

# Pick the winner by cross-validated score, not by test accuracy: choosing on
# the test split would leak it into model selection and make every test
# metric reported afterwards optimistically biased.
best_model_name = max(results, key=lambda candidate: results[candidate]['cv_mean'])
best_model = results[best_model_name]['model']

print(f"\nBest performing model: {best_model_name}")
print(f"Best CV score: {results[best_model_name]['cv_mean']:.4f}")
print(f"Test accuracy: {results[best_model_name]['accuracy']:.4f}")

## Model Evaluation and Performance Analysis

Comprehensive evaluation of the decision tree model performance.

In [None]:
# Detailed evaluation of the selected model, using its stored test-set outputs
best_run = results[best_model_name]
best_predictions = best_run['predictions']
best_probabilities = best_run['probabilities']

# Per-class precision / recall / F1
print("Classification Report:")
report_text = classification_report(
    y_test,
    best_predictions,
    target_names=['No Default', 'Default'],
)
print(report_text)

# Confusion matrix of actual vs predicted labels
cm = confusion_matrix(y_test, best_predictions)
print("\nConfusion Matrix:")
print(cm)

# Feature importances reported by the fitted tree, largest first
feature_importance = best_model.feature_importances_
importance_table = pd.DataFrame(
    {'feature': feature_columns, 'importance': feature_importance}
)
importance_df = importance_table.sort_values('importance', ascending=False)

print("\nFeature Importance:")
for feat_name, feat_weight in zip(importance_df['feature'], importance_df['importance']):
    print(f"   {feat_name}: {feat_weight:.4f}")

In [None]:
# Six-panel dashboard: model comparison on the top row (plus confusion matrix),
# best-model diagnostics on the bottom row
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
(ax_acc, ax_cv, ax_cm), (ax_imp, ax_roc, ax_pr) = axes

# Scores per model, in the order the models were registered
model_names = list(results.keys())
accuracies = [entry['accuracy'] for entry in results.values()]
cv_means = [entry['cv_mean'] for entry in results.values()]

# Two bar charts sharing the same layout: test accuracy and mean CV score
bar_panels = (
    (ax_acc, accuracies, 'skyblue', 'Model Accuracy Comparison', 'Accuracy'),
    (ax_cv, cv_means, 'lightgreen', 'Cross-Validation Score Comparison', 'CV Score'),
)
for panel, heights, bar_color, panel_title, panel_ylabel in bar_panels:
    panel.bar(model_names, heights, alpha=0.7, color=bar_color)
    panel.set_title(panel_title)
    panel.set_ylabel(panel_ylabel)
    panel.tick_params(axis='x', rotation=45)

# Confusion matrix heatmap for the selected model
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm)
ax_cm.set_title(f'Confusion Matrix - {best_model_name}')
ax_cm.set_xlabel('Predicted')
ax_cm.set_ylabel('Actual')

# Horizontal bars of feature importance
ax_imp.barh(importance_df['feature'], importance_df['importance'])
ax_imp.set_title('Feature Importance')
ax_imp.set_xlabel('Importance')

# ROC curve with its area, plus the diagonal reference line
fpr, tpr, _ = roc_curve(y_test, best_probabilities)
roc_auc = auc(fpr, tpr)
roc_label = f'ROC curve (AUC = {roc_auc:.3f})'
ax_roc.plot(fpr, tpr, color='darkorange', lw=2, label=roc_label)
ax_roc.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax_roc.set_xlim([0.0, 1.0])
ax_roc.set_ylim([0.0, 1.05])
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curve')
ax_roc.legend(loc="lower right")

# Precision-recall curve
precision, recall, _ = precision_recall_curve(y_test, best_probabilities)
ax_pr.plot(recall, precision, color='blue', lw=2)
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precision')
ax_pr.set_title('Precision-Recall Curve')
ax_pr.set_xlim([0.0, 1.0])
ax_pr.set_ylim([0.0, 1.05])

plt.tight_layout()
plt.show()

# Text recap of the headline numbers for the selected model
selected_run = results[best_model_name]
print("\nModel Performance Summary:")
print(f"   Best Model: {best_model_name}")
print(f"   Accuracy: {selected_run['accuracy']:.4f}")
print(f"   AUC Score: {roc_auc:.4f}")
print(f"   Cross-validation: {selected_run['cv_mean']:.4f}")

## Decision Tree Visualization

Visualizing the decision tree structure for interpretability.

In [None]:
# Fit a deliberately small tree (depth <= 4, >= 50 samples to split) that is
# readable when drawn; fit() returns the estimator itself
viz_tree = DecisionTreeClassifier(
    max_depth=4,
    min_samples_split=50,
    random_state=42,
).fit(X_train, y_train)

# Draw the tree
plt.figure(figsize=(20, 12))
plot_options = dict(
    feature_names=feature_columns,
    class_names=['No Default', 'Default'],
    filled=True,
    rounded=True,
    fontsize=10,
)
plot_tree(viz_tree, **plot_options)
plt.title('Decision Tree Visualization (Simplified)', fontsize=16)
plt.show()

# Size of the fitted tree
tree_depth = viz_tree.get_depth()
leaf_count = viz_tree.get_n_leaves()
node_count = viz_tree.tree_.node_count
print("Decision Tree Statistics:")
print(f"   Tree depth: {tree_depth}")
print(f"   Number of leaves: {leaf_count}")
print(f"   Number of nodes: {node_count}")

# Test-set accuracy of the simplified tree
viz_predictions = viz_tree.predict(X_test)
viz_accuracy = accuracy_score(y_test, viz_predictions)
print(f"   Simplified tree accuracy: {viz_accuracy:.4f}")

## Model Interpretation and Business Insights

Extracting actionable insights from the decision tree model.

In [None]:
# Business insights report, printed as five numbered sections
rule = "=" * 60
print(rule)
print("LOAN DEFAULT PREDICTION - BUSINESS INSIGHTS")
print(rule)

# Section 1: features ranked by the model's importance scores
print("\n1. KEY RISK FACTORS (by importance):")
ranked_features = zip(importance_df['feature'], importance_df['importance'])
for i, (feat_name, feat_weight) in enumerate(ranked_features, 1):
    print(f"   {i}. {feat_name}: {feat_weight:.3f}")

# Section 2: default rate and row count per employment code
employment_default_rates = df_engineered.groupby('Employed')['Defaulted?'].agg(['mean', 'count'])
print("\n2. EMPLOYMENT STATUS IMPACT:")
for employed_code, group_label in ((0, 'Unemployed'), (1, 'Employed')):
    group_rate = employment_default_rates.loc[employed_code, 'mean']
    group_size = employment_default_rates.loc[employed_code, 'count']
    print(f"   {group_label} default rate: {group_rate:.3f} ({group_size} cases)")

# Section 3: average balance and salary, defaulters vs non-defaulters
target_values = df_engineered['Defaulted?']
defaulters = df_engineered[target_values == 1]
non_defaulters = df_engineered[target_values == 0]

print("\n3. FINANCIAL PROFILE COMPARISON:")
for money_column in ('Bank Balance', 'Annual Salary'):
    print(f"   Average {money_column} - Defaulters: ${defaulters[money_column].mean():,.2f}")
    print(f"   Average {money_column} - Non-defaulters: ${non_defaulters[money_column].mean():,.2f}")

# Section 4: headline metrics of the selected model
selected_run = results[best_model_name]
print("\n4. MODEL PERFORMANCE:")
print(f"   Overall Accuracy: {selected_run['accuracy']:.3f}")
print(f"   Cross-validation Score: {selected_run['cv_mean']:.3f}")
print(f"   AUC Score: {roc_auc:.3f}")

# Section 5: five test rows drawn without replacement via NumPy's global RNG
print("\n5. SAMPLE PREDICTIONS:")
sample_indices = np.random.choice(X_test.index, 5, replace=False)
for idx in sample_indices:
    # Double brackets keep a one-row DataFrame, the 2-D input the model expects
    case_frame = X_test.loc[[idx]]
    actual = y_test.loc[idx]
    predicted = best_model.predict(case_frame)[0]
    probability = best_model.predict_proba(case_frame)[0][1]

    print(f"   Case {idx}: Actual={actual}, Predicted={predicted}, Default Prob={probability:.3f}")
    print(f"      Employment={X_test.loc[idx, 'Employed']}, Balance=${X_test.loc[idx, 'Bank Balance']:,.0f}, Salary=${X_test.loc[idx, 'Annual Salary']:,.0f}")

print("\n" + rule)
print("ANALYSIS COMPLETE")
print(rule)

## Model Deployment Recommendations

Practical recommendations for implementing the loan default prediction system.

In [None]:
# Deployment recommendations, followed by persisting the run's headline metrics
import json

print("DEPLOYMENT RECOMMENDATIONS")
print("=" * 40)

recommendations = [
    "1. RISK THRESHOLD SETTING:",
    "   - Set default probability threshold at 0.3 for high sensitivity",
    "   - Monitor false positive rates to balance risk and business impact",
    "",
    "2. FEATURE MONITORING:",
    # Top row of importance_df, which is sorted by importance descending
    f"   - Primary focus: {importance_df.iloc[0]['feature']} (highest importance)",
    "   - Regular updates needed for employment status verification",
    "   - Automated financial data validation",
    "",
    "3. MODEL MAINTENANCE:",
    "   - Retrain quarterly with new default data",
    "   - Monitor for concept drift in economic conditions",
    "   - A/B test model updates before full deployment",
    "",
    "4. BUSINESS INTEGRATION:",
    "   - Integrate with existing loan approval workflow",
    "   - Provide explanation capabilities for loan officers",
    "   - Implement feedback loop for prediction accuracy tracking"
]

for rec in recommendations:
    print(rec)

# Collect the model performance metrics; scores are cast to plain floats so
# the dictionary serialises cleanly to JSON
performance_summary = {
    'model_type': 'Decision Tree Classifier',
    'best_model': best_model_name,
    'accuracy': float(results[best_model_name]['accuracy']),
    'cv_score': float(results[best_model_name]['cv_mean']),
    'auc_score': float(roc_auc),
    'feature_importance': importance_df.to_dict('records'),
    'training_samples': len(X_train),
    'test_samples': len(X_test)
}

# Actually write the summary to disk: previously the message below claimed it
# had been saved although the dictionary was only held in memory.
summary_path = 'model_performance_summary.json'
with open(summary_path, 'w', encoding='utf-8') as summary_file:
    json.dump(performance_summary, summary_file, indent=2)

print(f"\nModel training completed successfully!")
print(f"Performance summary saved to {summary_path} for deployment reference.")