## 1. Import Libraries and Load Preprocessed Data

In [2]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Machine Learning libraries
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import (classification_report, confusion_matrix, accuracy_score, 
                           balanced_accuracy_score, f1_score, precision_score, recall_score)
from sklearn.preprocessing import StandardScaler
import joblib

#Advance ML libraries
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False
    print("XGBoost not available. Install with: pip install xgboost")

try:
    import lightgbm as lgb
    LIGHTGBM_AVAILABLE = True
except ImportError:
    LIGHTGBM_AVAILABLE = False
    print("LightGBM not available. Install with: pip install lightgbm")

# Explainability
try:
    import shap
    SHAP_AVAILABLE = True
except ImportError:
    SHAP_AVAILABLE = False
    print("SHAP not available. Install with: pip install shap")

# Set style for better visualizations
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("Libraries imported successfully!")
print(f"XGBoost available: {XGBOOST_AVAILABLE}")
print(f"LightGBM available: {LIGHTGBM_AVAILABLE}")
print(f"SHAP available: {SHAP_AVAILABLE}")


Libraries imported successfully!
XGBoost available: True
LightGBM available: True
SHAP available: True


In [3]:
# Load preprocessed datasets
print("LOADING PREPROCESSED DATA")

try:
    # Load preprocessed datasets
    X_train = pd.read_csv('X_train_scaled.csv')
    X_val = pd.read_csv('X_val_scaled.csv')
    X_test = pd.read_csv('X_test_scaled.csv')
    
    y_train = pd.read_csv('y_train.csv').squeeze()
    y_val = pd.read_csv('y_val.csv').squeeze()
    y_test = pd.read_csv('y_test.csv').squeeze()
    ## The .squeeze() method removes dimensions of size 1, converting a DataFrame to a Series
    ## The .squeeze() nmethod is a clean, robust way to ensure your targer variables are in the correct 1D format that sklearn expects to prevent potential errors during model training and evaluation

    #Load preprocessing objects
    scaler = joblib.load("scaler.pkl")
    selected_features = joblib.load('selected_features.pkl')

    print('')
    print(f"Training set: {X_train.shape}")
    print(f"Validation set: {X_val.shape}")
    print(f"Test set: {X_test.shape}")

    #Display calss distribution


    print(y_train.value_counts().sort_index())
    print("\nValidation set:")
    print(y_val.value_counts().sort_index())
    print("\nTest set:")
    print(y_test.value_counts().sort_index())

    #Display selected features
    print(f"\nSelected features ({len(selected_features)}):")
    for i, feature in enumerate(selected_features, 1):
        print(f"{i:2d}, {feature}")
    
except FileNotFoundError as e:
    print(f"Error loading preprocessed dataL {e}")

LOADING PREPROCESSED DATA

Training set: (346, 7)
Validation set: (116, 7)
Test set: (116, 7)
Loan_Status
0    105
1    241
Name: count, dtype: int64

Validation set:
Loan_Status
0    35
1    81
Name: count, dtype: int64

Test set:
Loan_Status
0    35
1    81
Name: count, dtype: int64

Selected features (7):
 1, Credit_History
 2, ApplicantIncome_log
 3, CoapplicantIncome_log
 4, LoanAmount_log
 5, Total_Income
 6, Loan_Income_Ratio
 7, Monthly_Payment


## 2. Baseline Model - Logistic Regression

In [4]:
#Baseline Logistic Regression model


#Create and train baseline model
baseline_model = LogisticRegression(
    random_state= 234,
    max_iter = 1000,
    class_weight= 'balanced' #Handles class imbalance as recommended by EDA
)

#Train the modes
baseline_model.fit(X_train, y_train)

#Make predictions
y_train_pred = baseline_model.predict(X_train)
y_val_pred = baseline_model.predict(X_val)
y_test_pred = baseline_model.predict(X_test)

#Calculate metrics
train_accuracy = accuracy_score(y_train, y_train_pred)
val_accuracy = accuracy_score(y_val, y_val_pred)
test_accuracy = accuracy_score(y_test,  y_test_pred)

train_balanced_acc = balanced_accuracy_score(y_train, y_train_pred)
val_balanced_acc = balanced_accuracy_score(y_val, y_val_pred)
test_balanced_acc = balanced_accuracy_score(y_test, y_test_pred)

train_f1 = f1_score(y_train, y_train_pred, average = 'macro')
val_f1 = f1_score(y_val, y_val_pred, average='macro')
test_f1 = f1_score(y_test, y_test_pred, average='macro')

print(f"\nBaseline Model Performance:")
print(f"Training - Accuracy: {train_accuracy:.3f}, Balanced Acc: {train_balanced_acc:.3f}, Macro F1: {train_f1:.3f}")
print(f"Validation - Accuracy: {val_accuracy:.3f}, Balanced Acc: {val_balanced_acc:.3f}, Macro F1: {val_f1:.3f}")
print(f"Test - Accuracy: {test_accuracy:.3f}, Balanced Acc: {test_balanced_acc:.3f}, Macro F1: {test_f1:.3f}")

# Display feature coefficients (interpretability)
print(f"\nFeature Coefficients (Top 10):")
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'coefficient': baseline_model.coef_[0]
}).sort_values('coefficient', key=abs, ascending=False)

for i, (_, row) in enumerate(feature_importance.head(10).iterrows(), 1):
    print(f"{i:2d}. {row["feature"]}: {row["coefficient"]:3f}")

# Store baseline results
baseline_results = {
    'model': 'Logistic Regression',
    'train_accuracy': train_accuracy,
    'val_accuracy': val_accuracy,
    'test_accuracy': test_accuracy,
    'train_balanced_acc': train_balanced_acc,
    'val_balanced_acc': val_balanced_acc,
    'test_balanced_acc': test_balanced_acc,
    'train_f1': train_f1,
    'val_f1': val_f1,
    'test_f1': test_f1
}

print(".... Baseline model completed!")




Baseline Model Performance:
Training - Accuracy: 0.578, Balanced Acc: 0.549, Macro F1: 0.540
Validation - Accuracy: 0.629, Balanced Acc: 0.621, Macro F1: 0.601
Test - Accuracy: 0.517, Balanced Acc: 0.476, Macro F1: 0.472

Feature Coefficients (Top 10):
 1. Loan_Income_Ratio: -0.480667
 2. LoanAmount_log: 0.394713
 3. Total_Income: -0.370650
 4. CoapplicantIncome_log: 0.088604
 5. Monthly_Payment: -0.017892
 6. ApplicantIncome_log: -0.000623
 7. Credit_History: 0.000000
.... Baseline model completed!
