In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.ensemble import RandomForestRegressor

# Read the cleaned loan dataset and print its column / dtype / non-null summary.
DATA_PATH = "../Data/clean_data/cleaned_loan_updated.csv"
df = pd.read_csv(DATA_PATH)
df.info()

In [None]:
# Frequency of each distinct value in the log-income column.
df["person_income_log"].value_counts()

In [None]:
## One-hot encoding of the nominal (unordered) categorical features
nominal_columns = ['person_home_ownership', 'loan_purpose', 'cb_person_default_on_file']

# drop_first=True drops one dummy per feature (reduces multicollinearity);
# dtype='int' emits 0/1 integer columns instead of booleans.
df_encoded = pd.get_dummies(df, columns=nominal_columns, drop_first=True, dtype='int')

# Show which columns exist after encoding
print("\nNew columns after encoding:")
print(list(df_encoded.columns))

# Show the first rows as a visual check of the encoding
print("\nSample of encoded data:")
print(df_encoded.head())

In [None]:
# 2. Target variable (loan_status)
# Cast the target to integers (False -> 0, True -> 1); every other column keeps its dtype.
df_encoded = df_encoded.astype({'loan_status': int})

In [None]:
# Column dtypes, then the class counts of the target
print(df_encoded.dtypes)
target_counts = df_encoded['loan_status'].value_counts()
print(target_counts)  # Should show 0 (False) and 1 (True)

In [None]:
# Print dtypes and non-null counts of the encoded frame (shows which columns still contain missing values).
df_encoded.info()

In [None]:
## Impute the missing values of loan_interest_rate with a predictive model:
# a Random Forest regressor is trained on the rows where the rate is known and
# then predicts the rate for the rows where it is missing.

from sklearn.model_selection import train_test_split

# 3. PERFORM THE IMPUTATION
from sklearn.ensemble import RandomForestRegressor

# Split into complete/missing cases
rate_is_known = df_encoded['loan_interest_rate'].notna()
complete = df_encoded[rate_is_known]
missing = df_encoded[~rate_is_known]

# Features used by the imputation model.
# 'loan_status' is deliberately NOT in this list: it is the target of the
# classifiers trained later, so using it to fill in one of their input features
# would leak the label into the inputs (and it is unknown for new applications).
features = [
    'person_age', 'person_income', 'person_employment_length', 'loan_amount',
    'loan_to_income_ratio', 'cb_credit_history_length',
    'person_income_log', 'loan_amount_log', 'loan_grade_numeric',
    'person_home_ownership_other', 'person_home_ownership_own', 'person_home_ownership_rent',
    'loan_purpose_education', 'loan_purpose_home-improvement', 'loan_purpose_medical',
    'loan_purpose_personal', 'loan_purpose_venture', 'cb_person_default_on_file_Y'
]
# NOTE(review): the imputer is still fitted on all rows, i.e. before the
# train/test split further down - consider fitting it on training rows only.
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(complete[features], complete['loan_interest_rate'])

# Impute missing values; skipped when nothing is missing because predict()
# rejects an empty input.
if not missing.empty:
    df_encoded.loc[missing.index, 'loan_interest_rate'] = rf.predict(missing[features])

# 4. VERIFY RESULTS
print("\nMissing values after imputation:", df_encoded['loan_interest_rate'].isna().sum())
print("New value ranges:", df_encoded['loan_interest_rate'].describe())

In [None]:
# Print the frame summary again; the non-null count of loan_interest_rate shows the effect of the imputation.
df_encoded.info()

In [None]:
# Numeric columns (continuous, ordinal and 0/1 dummy columns); the target goes last
numerical_features = [
    'person_age', 'person_income', 'person_employment_length', 'loan_amount',
    'loan_interest_rate', 'loan_to_income_ratio', 'cb_credit_history_length',
    'person_income_log', 'loan_amount_log', 'loan_grade_numeric',
    'person_home_ownership_other', 'person_home_ownership_own', 'person_home_ownership_rent',
    'loan_purpose_education', 'loan_purpose_home-improvement', 'loan_purpose_medical',
    'loan_purpose_personal', 'loan_purpose_venture',
    'cb_person_default_on_file_Y',
    'loan_status',  # Target
]

# Pairwise correlations restricted to the columns listed above
corr_matrix = df_encoded[numerical_features].corr()

# Keep only the loan_status column, ordered from most positive to most negative
status_corr = corr_matrix[['loan_status']].sort_values('loan_status', ascending=False)

# One-column (vertical) heatmap on a fixed -1..1 colour scale
fig, ax = plt.subplots(figsize=(8, 10))
sns.heatmap(
    status_corr,
    annot=True, fmt=".2f",
    cmap='coolwarm', center=0, vmin=-1, vmax=1,
    ax=ax,
)
ax.set_title("Correlation with Loan Status (Default)", pad=20)
fig.tight_layout()
plt.show()

In [None]:
# Columns shown in the full correlation matrix: the continuous/ordinal features,
# three of the one-hot columns, and the target.
numerical_features = [
    'person_age',
    'person_income',
    'person_employment_length',
    'loan_amount',
    'loan_interest_rate',
    'loan_to_income_ratio',
    'cb_credit_history_length',
    'person_income_log',
    'loan_amount_log',
    'loan_grade_numeric',
    # One-hot encoded features:
    'person_home_ownership_rent',
    'person_home_ownership_own',
    'cb_person_default_on_file_Y',
    'loan_status',  # Target
]

# Pairwise correlation matrix of the selected columns
corr_matrix = df_encoded[numerical_features].corr()

# True on and above the diagonal: those cells are hidden so each pair appears once
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

# Lower-triangle heatmap on a fixed -1..1 colour scale
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True, fmt=".2f",
    cmap='coolwarm', center=0,
    vmin=-1, vmax=1,
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Feature Correlation Matrix", pad=20)
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts) before building the model inputs.
df_encoded.info()

In [None]:
# Data preperation to train the model to predict the target
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Columns that must not be fed to the classifiers
non_feature_columns = [
    'loan_status',          # Target variable
    'loan_grade',           # Already encoded as loan_grade_numeric
    'credit_history_bins',  # Categorical (optional: could encode if needed)
]

# Target (y) and feature matrix (X = every remaining column)
y = df_encoded['loan_status']
X = df_encoded.drop(columns=non_feature_columns)

# Show which columns ended up in X
print("Features being used:\n", X.columns.tolist())

In [None]:
# Train / test split:
# 20% of the rows are held out; stratify=y keeps the class proportions of y
# the same in both parts (important because the classes are imbalanced).
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Class counts per partition
print("\nClass counts in y_train:", y_train.value_counts())
print("Class counts in y_test:", y_test.value_counts())

In [None]:
# 3. Feature Scaling (Only Numerical Features)
# Continuous / ordinal columns to standardize; the 0/1 dummy columns are left as they are.
num_cols = [
    'person_age',
    'person_income',
    'person_employment_length',
    'loan_amount',
    'loan_interest_rate',
    'loan_to_income_ratio',
    'cb_credit_history_length',
    'person_income_log',
    'loan_amount_log',
    'loan_grade_numeric'  # Ordinal encoded
]

# X_train / X_test are row subsets of X returned by train_test_split; assigning
# columns into such subsets can trigger pandas' SettingWithCopyWarning, so work
# on explicit copies.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only and reuse its statistics for the
# test rows, so no test-set information enters the transformation.
# NOTE: re-running this cell standardizes the already-scaled columns again;
# re-run the split cell first.
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# Verify scaling
print("\nScaled features (sample):\n", X_train[num_cols].head())

In [None]:
## Models training and evaluation
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Candidate classifiers. Logistic Regression and Random Forest use
# class_weight='balanced' to compensate for the class imbalance; KNN has no such option.
knn_clf = KNeighborsClassifier(n_neighbors=5)
logreg_clf = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
forest_clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
models = {
    "KNN": knn_clf,
    "Logistic Regression": logreg_clf,
    "Random Forest": forest_clf,
}

# Fit every model on the training split, then score it on both splits
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)

    # Hard labels and class-1 probabilities on the held-out rows
    y_test_pred, y_test_proba = model.predict(X_test), model.predict_proba(X_test)[:, 1]

    # The same on the training rows (to see how much the model overfits)
    y_train_pred, y_train_proba = model.predict(X_train), model.predict_proba(X_train)[:, 1]

    results[name] = dict(
        test_report=classification_report(y_test, y_test_pred),
        test_roc_auc=roc_auc_score(y_test, y_test_proba),
        train_report=classification_report(y_train, y_train_pred),
        train_roc_auc=roc_auc_score(y_train, y_train_proba),
    )

# Report: train metrics first, then test metrics, for each model
print("\nModel Comparison Results:")
for name, metrics in results.items():
    print(f"\n----- {name} -----")
    for heading, split in (("TRAIN SET:", "train"), ("TEST SET:", "test")):
        print(heading)
        print(metrics[split + "_report"])
        print(f"ROC-AUC: {metrics[split + '_roc_auc']:.4f}")

In [None]:
# Print the frame summary before the binned features are added.
df_encoded.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of borrower age; used to choose the age bin edges.
df_encoded.person_age.describe()

In [None]:
## Why these bins?
## Most borrowers in the data are younger than 30, so ages are grouped into 5-year bands.

# Bin edges. pd.cut builds right-closed intervals by default, i.e. (20, 25], (25, 30], ...;
# include_lowest=True additionally closes the first interval on the left, so age 20
# falls into the '21-25' band.
bins = [20, 25, 30, 35, 40, 45, 50]
labels = ['21-25', '26-30', '31-35', '36-40', '41-45', '46-50']

# Assign every borrower to an age band and store it as a new categorical column
age_bands = pd.cut(
    df_encoded['person_age'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)
df_encoded['person_age_bins'] = age_bands

# Ages outside the 20-50 range get NaN here, so this count should be 0
print("Missing values in binned column:", age_bands.isna().sum())

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of borrower income; used to choose the income bin edges.
df_encoded.person_income.describe()

In [None]:
## Why these bins? (cut points: percentiles from describe() above plus round numbers)
## Very Low  (up to $38.5K):  bottom 25% - likely higher risk
## Low       ($38.5K-$55K):   below median income
## Medium    ($55K-$79.2K):   middle-income borrowers
## High      ($79.2K-$100K):  top 25% excluding the highest earners
## Very High ($100K-$225K):   the highest earners (roughly the top 15%)

income_labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
income_bins = [14400, 38542, 55000, 79218, 100000, 225000]  # Based on percentiles + round numbers

# Right-closed intervals; include_lowest=True also puts an income of exactly 14400
# into 'Very Low'. Incomes outside 14400-225000 get NaN.
income_group = pd.cut(
    df_encoded['person_income'],
    bins=income_bins,
    labels=income_labels,
    include_lowest=True,
)
df_encoded['income_group'] = income_group

In [None]:
# Number of borrowers in each income band, listed in band order
print("Income Group Distribution:")
print(df_encoded['income_group'].value_counts().sort_index())

# loan_status is 0/1, so its mean per band is that band's default rate.
# income_group is categorical: observed=False is passed explicitly so every band
# is listed (the long-standing behaviour) and recent pandas versions do not emit
# a FutureWarning about the changing default.
print("\nDefault Rates by Income Group:")
default_rate_by_group = df_encoded.groupby('income_group', observed=False)['loan_status'].mean()
print(default_rate_by_group.sort_values())

In [None]:
import matplotlib.pyplot as plt

# Overall default rate (mean of the 0/1 target), computed once and reused for
# both the reference line and its legend label.
overall_default_rate = df_encoded['loan_status'].mean()

# Default rate per income band, lowest first. observed=False is explicit because
# income_group is categorical and the groupby default is changing in pandas.
default_rate_by_group = (
    df_encoded.groupby('income_group', observed=False)['loan_status']
    .mean()
    .sort_values()
)

fig, ax = plt.subplots(figsize=(10, 6))
default_rate_by_group.plot(kind='bar', color='skyblue', edgecolor='black', ax=ax)
ax.set_title('Default Rate by Income Group')
ax.set_ylabel('Default Rate')
# Dashed red line marks the overall default rate for comparison
ax.axhline(y=overall_default_rate, color='red', linestyle='--',
           label=f'Overall Default Rate ({overall_default_rate:.1%})')
ax.legend()
plt.show()

In [None]:
# Remove an 'age_group' column if one exists; errors='ignore' makes this a no-op otherwise.
df_encoded = df_encoded.drop(columns='age_group', errors='ignore')


In [None]:
# Print the frame summary after the person_age_bins and income_group columns were added.
df_encoded.info()

In [None]:
# Summary statistics of borrower age (same call as earlier in the notebook).
df_encoded.person_age.describe()

In [None]:
# One-hot encode the binned categorical columns.
# dtype='int' matches the first encoding step (0/1 integers instead of booleans).
# Only columns that are still present are encoded, so re-running this cell after
# the columns have already been replaced by their dummies is a no-op, not a KeyError.
binned_columns = [
    col for col in ('income_group', 'person_age_bins') if col in df_encoded.columns
]
if binned_columns:
    df_encoded = pd.get_dummies(
        df_encoded, columns=binned_columns, drop_first=True, dtype='int'
    )


In [None]:
# Define features and target.
# pd.get_dummies replaces 'income_group' and 'person_age_bins' with their dummy
# columns, so after the encoding cell those two names no longer exist and listing
# them unconditionally in drop() raises a KeyError. They are dropped only if they
# are still present (i.e. the encoding cell has not been run).
non_feature_columns = ['loan_status', 'loan_grade', 'credit_history_bins']
leftover_binned = [
    col for col in ('income_group', 'person_age_bins') if col in df_encoded.columns
]
X = df_encoded.drop(columns=non_feature_columns + leftover_binned)
y = df_encoded['loan_status']


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Re-split using the feature matrix X defined just above. Without this step
# X_train / X_test still hold the columns of the first split, so the models
# trained below would never see the newly added binned features.
# Same test_size / random_state / stratification settings as the first split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Copies avoid pandas' SettingWithCopyWarning when the columns are overwritten below
X_train = X_train.copy()
X_test = X_test.copy()

# Standardize the continuous columns exactly as for the first split:
# fit on the training rows, reuse the statistics for the test rows.
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])


In [None]:

# Same three classifiers as in the first comparison, fitted on whatever
# X_train / y_train currently hold in the kernel.
models = dict([
    ("KNN", KNeighborsClassifier(n_neighbors=5)),
    ("Logistic Regression", LogisticRegression(
        max_iter=1000, class_weight='balanced', random_state=42)),
    ("Random Forest", RandomForestClassifier(
        n_estimators=100, class_weight='balanced', random_state=42)),
])

results = {}
for name, model in models.items():
    model.fit(X_train, y_train)

    # Predictions on the held-out rows
    y_test_pred = model.predict(X_test)
    y_test_proba = model.predict_proba(X_test)[:, 1]

    # Predictions on the training rows
    y_train_pred = model.predict(X_train)
    y_train_proba = model.predict_proba(X_train)[:, 1]

    model_metrics = {}
    model_metrics["test_report"] = classification_report(y_test, y_test_pred)
    model_metrics["test_roc_auc"] = roc_auc_score(y_test, y_test_proba)
    model_metrics["train_report"] = classification_report(y_train, y_train_pred)
    model_metrics["train_roc_auc"] = roc_auc_score(y_train, y_train_proba)
    results[name] = model_metrics

# Print results
print("\nModel Comparison Results:")
for name in results:
    metrics = results[name]
    train_auc, test_auc = metrics["train_roc_auc"], metrics["test_roc_auc"]
    print(f"\n----- {name} -----")
    print("TRAIN SET:")
    print(metrics["train_report"])
    print(f"ROC-AUC: {train_auc:.4f}")
    print("TEST SET:")
    print(metrics["test_report"])
    print(f"ROC-AUC: {test_auc:.4f}")

In [None]:
from sklearn.utils import resample

def manual_oversample(X_train, y_train, target_ratio=0.5):
    """
    Oversample the minority class (label 1) until it reaches a target ratio.

    Rows with label 1 are drawn with replacement (fixed seed 42) and appended to
    the training data until ``minority_count / majority_count`` reaches
    ``target_ratio``, where the majority class is label 0.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    y_train : pandas.Series
        Training labels (0 = majority, 1 = minority), indexed like ``X_train``.
    target_ratio : float, default 0.5
        Desired minority_count / majority_count after resampling.

    Returns
    -------
    (X_resampled, y_resampled)
        New objects holding the original rows followed by the duplicated
        minority rows. Features and labels of the added rows are drawn together,
        so both outputs carry the same index labels in the same order. If the
        minority class already meets the target ratio, unchanged copies of the
        inputs are returned.
    """
    # Minority rows are the only ones that get duplicated
    minority_mask = y_train == 1
    X_minority = X_train[minority_mask]
    y_minority = y_train[minority_mask]

    # How many extra minority rows are needed to hit the requested ratio
    n_majority = int((y_train == 0).sum())
    n_minority_target = int(n_majority * target_ratio)
    n_to_sample = n_minority_target - len(X_minority)

    # Nothing to add: resample() rejects a non-positive n_samples, so return early
    if n_to_sample <= 0:
        return X_train.copy(), y_train.copy()

    # Passing X and y together makes resample() pick the same rows from both,
    # so the labels keep the index of the features they belong to.
    X_minority_upsampled, y_minority_upsampled = resample(
        X_minority,
        y_minority,
        replace=True,  # Sample with replacement
        n_samples=n_to_sample,
        random_state=42
    )

    # Original rows first, duplicated minority rows appended at the end
    X_resampled = pd.concat([X_train, X_minority_upsampled])
    y_resampled = pd.concat([y_train, y_minority_upsampled])

    return X_resampled, y_resampled

# Oversample the training split to a minority:majority ratio of 0.5,
# then show the resulting class counts.
X_train_res, y_train_res = manual_oversample(X_train, y_train, target_ratio=0.5)
class_balance = y_train_res.value_counts()
print("New class balance:", class_balance)

In [None]:
from sklearn.metrics import classification_report, roc_auc_score


# Start from an empty results container
results = {}

# The same three classifiers as before, now fitted on the oversampled training data
models = {
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Logistic Regression": LogisticRegression(
        max_iter=1000, class_weight='balanced', random_state=42
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=100, class_weight='balanced', random_state=42
    ),
}

for name, model in models.items():
    # Fit on the manually resampled training data
    model.fit(X_train_res, y_train_res)

    # Score on the untouched test split ...
    y_test_pred, y_test_proba = model.predict(X_test), model.predict_proba(X_test)[:, 1]

    # ... and on the resampled training data itself
    y_train_res_pred = model.predict(X_train_res)
    y_train_res_proba = model.predict_proba(X_train_res)[:, 1]

    results[name] = dict(
        test_report=classification_report(y_test, y_test_pred),
        test_roc_auc=roc_auc_score(y_test, y_test_proba),
        train_resampled_report=classification_report(y_train_res, y_train_res_pred),
        train_resampled_roc_auc=roc_auc_score(y_train_res, y_train_res_proba),
    )

# Print results
print("\nModel Comparison (After Manual Oversampling):")
for name, metrics in results.items():
    train_auc = metrics['train_resampled_roc_auc']
    test_auc = metrics['test_roc_auc']
    print(f"\n----- {name} -----")
    print("TRAIN SET (Resampled):")
    print(metrics["train_resampled_report"])
    print(f"ROC-AUC: {train_auc:.4f}")
    print("\nTEST SET (Original):")
    print(metrics["test_report"])
    print(f"ROC-AUC: {test_auc:.4f}")
    print("-" * 50)  # Separator between models


In [None]:
## 1. KNN improvements
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# A. Feature scaling (critical for KNN, which is distance based):
# statistics are learned from the oversampled training rows and reused for the test rows.
scaler = StandardScaler().fit(X_train_res)
X_train_scaled = scaler.transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

# B. Search over the number of neighbours and the vote weighting, 5-fold CV scored by ROC-AUC.
# NOTE(review): X_train_res contains duplicated minority rows, so copies of the same
# row can land in both the fitting and validation folds - CV scores may be optimistic.
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 15],
    'weights': ['uniform', 'distance'],
}
knn = KNeighborsClassifier()
grid = GridSearchCV(knn, param_grid, scoring='roc_auc', cv=5)
grid.fit(X_train_scaled, y_train_res)
print(f"Best params: {grid.best_params_}")

In [None]:
## 2. Logistic Regression Improvements
## Weakness: Low precision for defaults (0.54) - too many false positives
# A. Threshold adjustment (trade recall for precision)
# 1. Model definition (same settings as in the earlier comparisons)
models = {
    "Logistic Regression": LogisticRegression(
        max_iter=1000,
        class_weight='balanced',
        random_state=42
    ),
    # ... other models
}

# 2. Fit on the standardized, oversampled training data
logreg = models["Logistic Regression"].fit(X_train_scaled, y_train_res)

# Class-1 probabilities for the test set. The model was fitted on X_train_scaled,
# so the test rows must go through the same scaling (X_test_scaled, not X_test).
y_proba = logreg.predict_proba(X_test_scaled)[:, 1]

# Precision / recall at every candidate threshold
from sklearn.metrics import precision_recall_curve
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)

# Find the first (lowest) threshold where precision >= 0.7.
# `precision` has one more entry than `thresholds` (its final value has no
# matching threshold), so the last entry is excluded to keep the index valid.
optimal_idx = next(
    (i for i, p in enumerate(precision[:-1]) if p >= 0.7),
    None,
)
if optimal_idx is None:
    raise ValueError("No decision threshold reaches precision >= 0.7 on this data")
optimal_threshold = thresholds[optimal_idx]

# Apply adjusted threshold
y_pred_adj = (y_proba >= optimal_threshold).astype(int)

# Evaluate
# NOTE(review): the threshold is selected on the same test set it is evaluated on,
# so this report is optimistic; a separate validation split would give a fairer estimate.
print("\nAdjusted Logistic Regression:")
print(classification_report(y_test, y_pred_adj))
print(f"New threshold: {optimal_threshold:.4f}")

In [None]:
## Random Forest Improvements
## Weakness: Overfitting (train AUC=1.0 vs test=0.938) and recall for defaults (0.73)
# A. Reduce overfitting
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(
    n_estimators=200,  # Increase from 100
    max_depth=10,       # Limit tree depth
    min_samples_leaf=5, # Require more samples per leaf
    class_weight='balanced_subsample',  # Class weights recomputed per bootstrap sample
    random_state=42
)

# feature_importances_ and estimators_ only exist on a fitted forest, so fit it
# first (on the same oversampled training data used by the evaluation cells).
rf.fit(X_train_res, y_train_res)

# B. Feature importance analysis
# Mean importance per feature plus its spread across the individual trees.
importances = rf.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)
# Feature names come from the training frame so they line up with the importances.
pd.DataFrame({'feature': X_train_res.columns, 'importance': importances, 'std': std})

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Start from an empty results container
results = {}

# Regularized forest: 200 trees, depth capped at 10, at least 5 samples per leaf
rf_settings = dict(
    n_estimators=200,
    max_depth=10,
    min_samples_leaf=5,
    class_weight='balanced_subsample',
    random_state=42,
)
rf = RandomForestClassifier(**rf_settings)

# Fit on the manually resampled training data
rf.fit(X_train_res, y_train_res)

# Labels and class-1 probabilities on the original test set ...
y_test_pred, y_test_proba = rf.predict(X_test), rf.predict_proba(X_test)[:, 1]

# ... and on the resampled training data
y_train_res_pred, y_train_res_proba = rf.predict(X_train_res), rf.predict_proba(X_train_res)[:, 1]

# Collect the metrics and file them under the model's name
metrics = {
    "test_report": classification_report(y_test, y_test_pred),
    "test_roc_auc": roc_auc_score(y_test, y_test_proba),
    "train_resampled_report": classification_report(y_train_res, y_train_res_pred),
    "train_resampled_roc_auc": roc_auc_score(y_train_res, y_train_res_proba),
}
results["Random Forest (Tuned)"] = metrics

# Report train metrics first, then test metrics
print("\nModel Evaluation: Random Forest (Tuned)")
for heading, prefix in (("TRAIN SET (Resampled):", "train_resampled"),
                        ("\nTEST SET (Original):", "test")):
    print(heading)
    print(metrics[prefix + "_report"])
    print(f"ROC-AUC: {metrics[prefix + '_roc_auc']:.4f}")
print("-" * 50)



In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Hyperparameter grid: 2 x 3 x 3 x 2 = 36 combinations
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_leaf=[1, 3, 5],
    class_weight=['balanced', 'balanced_subsample'],
)

# Base estimator; the seed is fixed so the search is repeatable
rf = RandomForestClassifier(random_state=42)

# Exhaustive search: 5-fold cross-validation, F1 as the selection metric, all CPU cores
grid_search = GridSearchCV(rf, param_grid, scoring='f1', cv=5, n_jobs=-1)

# Run the search on the oversampled training data
grid_search.fit(X_train_res, y_train_res)

# Best estimator (refitted on all of the training data) and its settings
best_rf = grid_search.best_estimator_
print("Best hyperparameters:", grid_search.best_params_)


In [None]:
# Start from an empty results container
results = {}

# Forest with unrestricted depth and leaf size, per-bootstrap class weighting
rf = RandomForestClassifier(
    n_estimators=200, max_depth=None, min_samples_leaf=1,
    class_weight='balanced_subsample', random_state=42,
)

# Fit on the manually resampled training data
rf.fit(X_train_res, y_train_res)

# Predicted labels for the original test set and the resampled training set
y_test_pred = rf.predict(X_test)
y_train_res_pred = rf.predict(X_train_res)

# Class-1 probabilities for the same two sets
y_test_proba = rf.predict_proba(X_test)[:, 1]
y_train_res_proba = rf.predict_proba(X_train_res)[:, 1]

# Store results
results["Random Forest (Tuned)"] = dict(
    test_report=classification_report(y_test, y_test_pred),
    test_roc_auc=roc_auc_score(y_test, y_test_proba),
    train_resampled_report=classification_report(y_train_res, y_train_res_pred),
    train_resampled_roc_auc=roc_auc_score(y_train_res, y_train_res_proba),
)

# Print results
metrics = results["Random Forest (Tuned)"]
train_auc = metrics['train_resampled_roc_auc']
test_auc = metrics['test_roc_auc']
print("\nModel Evaluation: Random Forest (Tuned)")
print("TRAIN SET (Resampled):")
print(metrics["train_resampled_report"])
print(f"ROC-AUC: {train_auc:.4f}")
print("\nTEST SET (Original):")
print(metrics["test_report"])
print(f"ROC-AUC: {test_auc:.4f}")
print("-" * 50)


In [None]:
# Start from an empty results container
results = {}

# Forest with explicit class weights: errors on class 1 count twice as much as on class 0
manual_class_weight = {0: 1, 1: 2}
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_leaf=1,
    class_weight=manual_class_weight,
    random_state=42,
)

# Fit on the manually resampled training data
rf.fit(X_train_res, y_train_res)

# Original test set: predicted labels and class-1 probabilities
y_test_pred = rf.predict(X_test)
y_test_proba = rf.predict_proba(X_test)[:, 1]

# Resampled training set: predicted labels and class-1 probabilities
y_train_res_pred = rf.predict(X_train_res)
y_train_res_proba = rf.predict_proba(X_train_res)[:, 1]

# Build the metrics record, then file it under the model's name
metrics = {}
metrics["test_report"] = classification_report(y_test, y_test_pred)
metrics["test_roc_auc"] = roc_auc_score(y_test, y_test_proba)
metrics["train_resampled_report"] = classification_report(y_train_res, y_train_res_pred)
metrics["train_resampled_roc_auc"] = roc_auc_score(y_train_res, y_train_res_proba)
results["Random Forest (Tuned)"] = metrics

# Print results
print("\nModel Evaluation: Random Forest (Tuned)")
print("TRAIN SET (Resampled):")
print(metrics["train_resampled_report"])
print(f"ROC-AUC: {metrics['train_resampled_roc_auc']:.4f}")
print("\nTEST SET (Original):")
print(metrics["test_report"])
print(f"ROC-AUC: {metrics['test_roc_auc']:.4f}")
print("-" * 50)
