In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from imblearn.over_sampling import SMOTE

In [None]:
# 2. Preprocessing
# Remove the identifier column when it exists (it carries no predictive
# signal), then discard exact duplicate rows and any row with a missing value.
data = (
    data
    .drop(columns=["LoanID"], errors="ignore")
    .drop_duplicates()
    .dropna()
)

In [None]:
# Define target and features
# y is the label column; X is every remaining column of the frame.
target_column = "Default"
y = data[target_column]
X = data.drop(columns=[target_column])


In [None]:
# Encode categorical features
# Every object-dtype column of `data` is replaced by integer codes. The fitted
# encoder is stored per column name so the same mapping can be reused later
# (the dict is persisted with the model artifacts).
label_encoders = {}
for col in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# X and y were split off `data` before this cell ran, so they still hold the
# raw string columns. Re-derive them from the encoded frame; otherwise the
# scaler and the models downstream receive non-numeric values.
X = data.drop(target_column, axis=1)
y = data[target_column]

In [None]:
# Scale features
# Standardize every feature column; the fitted scaler is kept so it can be
# saved alongside the model.
# NOTE(review): the scaler is fitted on the full data set before the
# train/test split, so test-set statistics influence preprocessing. Consider
# fitting on the training split only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# NOTE(review): a stray fragment ".9)]" followed here and made the cell a
# SyntaxError. It looked like the tail of a truncated statement and was
# removed; restore the original statement if it is known.

In [None]:
# Train-test split
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both partitions; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, **split_options)



In [None]:
# 3. Baseline Random Forest
# -----------------------------
# Default-parameter forest trained on the untouched training split; this is
# the reference point for the class-imbalance variants that follow.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Hard labels for accuracy/report, positive-class probabilities for ROC-AUC.
y_pred_proba = rf.predict_proba(X_test)[:, 1]
y_pred = rf.predict(X_test)

baseline_accuracy = accuracy_score(y_test, y_pred)
baseline_auc = roc_auc_score(y_test, y_pred_proba)

print("\n Baseline Random Forest")
print("Accuracy:", baseline_accuracy)
print("ROC-AUC:", baseline_auc)
print(classification_report(y_test, y_pred))

In [None]:
# 4. Random Forest with Class Weights
# -----------------------------
# Same forest as the baseline, but with class_weight="balanced" so the
# classes are re-weighted during training instead of resampling the data.
rf_balanced = RandomForestClassifier(
    random_state=42,
    class_weight="balanced",
).fit(X_train, y_train)

# Hard labels for accuracy/report, positive-class probabilities for ROC-AUC.
y_pred_proba_balanced = rf_balanced.predict_proba(X_test)[:, 1]
y_pred_balanced = rf_balanced.predict(X_test)

balanced_accuracy = accuracy_score(y_test, y_pred_balanced)
balanced_auc = roc_auc_score(y_test, y_pred_proba_balanced)

print("\n🔹 Random Forest (Class Weight Balanced)")
print("Accuracy:", balanced_accuracy)
print("ROC-AUC:", balanced_auc)
print(classification_report(y_test, y_pred_balanced))

In [None]:
# 5. Random Forest with SMOTE
# -----------------------------
# Oversample the training split only; the test split is left untouched so the
# evaluation below uses the original class distribution.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

rf_smote = RandomForestClassifier(random_state=42)
rf_smote.fit(X_train_smote, y_train_smote)
y_pred_smote = rf_smote.predict(X_test)
y_pred_proba_smote = rf_smote.predict_proba(X_test)[:, 1]

print("\n🔹 Random Forest (SMOTE Oversampling)")
print("Accuracy:", accuracy_score(y_test, y_pred_smote))
# The original line was cut off mid-call (unterminated `roc_auc_score(...`),
# which made the whole cell a SyntaxError; the call is completed here.
print("ROC-AUC:", roc_auc_score(y_test, y_pred_proba_smote))
# Report per-class precision/recall as the baseline and class-weight cells do.
print(classification_report(y_test, y_pred_smote))

In [None]:
# 6. Confusion Matrices
# -----------------------------
# One heatmap per model, side by side, all scored against the same test labels.
fig, axes = plt.subplots(1, 3, figsize=(18,5))

# (test-set predictions, colour map, panel title) for each model, left to right.
panels = [
    (y_pred, 'Blues', "Baseline RF"),
    (y_pred_balanced, 'Greens', "RF (Class Weight)"),
    (y_pred_smote, 'Oranges', "RF (SMOTE)"),
]
for ax, (preds, cmap_name, panel_title) in zip(axes, panels):
    sns.heatmap(confusion_matrix(y_test, preds), annot=True, fmt='d', cmap=cmap_name, ax=ax)
    ax.set_title(panel_title)

plt.show()

In [None]:
# 7. ROC Curves
# -----------------------------
# Compute the ROC points for each model from its positive-class probabilities;
# the thresholds returned by roc_curve are not needed and are discarded.
(fpr, tpr, _), (fpr_b, tpr_b, _), (fpr_s, tpr_s, _) = (
    roc_curve(y_test, scores)
    for scores in (y_pred_proba, y_pred_proba_balanced, y_pred_proba_smote)
)

plt.figure(figsize=(8,6))
for curve_fpr, curve_tpr, curve_label in (
    (fpr, tpr, "Baseline RF"),
    (fpr_b, tpr_b, "RF (Class Weight)"),
    (fpr_s, tpr_s, "RF (SMOTE)"),
):
    plt.plot(curve_fpr, curve_tpr, label=curve_label)

# Dashed diagonal: reference line for a classifier with no discrimination.
plt.plot([0,1],[0,1],'k--')
plt.title("ROC Curve Comparison")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.show()

In [None]:
# 8. Feature Importance (from the SMOTE-trained model)
# -----------------------------
# Rank the forest's impurity-based importances from largest to smallest and
# plot them as horizontal bars labelled with the feature names.
importances = rf_smote.feature_importances_
indices = np.flip(np.argsort(importances))

ranked_importances = importances[indices]
ranked_features = X.columns[indices]

plt.figure(figsize=(10,6))
plt.title("Feature Importances - RF with SMOTE")
sns.barplot(x=ranked_importances, y=ranked_features)
plt.show()


In [None]:
# 9. Hyperparameter Tuning (on SMOTE data) - currently disabled: the grid search below is commented out and does not run
# -----------------------------
#param_grid = {
    #'n_estimators': [100, 200],
    #'max_depth': [None, 10, 20],
    #'min_samples_split': [2, 5],
    #'min_samples_leaf': [1, 2]
#}
#grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring='roc_auc', n_jobs=-1)
#grid.fit(X_train_smote, y_train_smote)

#print("\nBest Parameters (RF with SMOTE):", grid.best_params_)
#print("Best CV ROC-AUC:", grid.best_score_)


In [None]:
# Persist the preprocessed frame as CSV in the working directory, without
# writing the row index as a column.
data.to_csv(path_or_buf="KGpreprocessed_data.csv", index=False)

In [None]:
import os
import joblib

# Make sure the output folder exists (no error if it is already there).
os.makedirs("model_artifacts", exist_ok=True)

# Everything needed to reuse the model later: the SMOTE-trained forest, the
# fitted scaler, the per-column label encoders, and the feature column order.
artifacts = {
    "model_artifacts/loan_rf_model.pkl": rf_smote,
    "model_artifacts/scaler.pkl": scaler,
    "model_artifacts/label_encoders.pkl": label_encoders,
    "model_artifacts/features.pkl": list(X.columns),
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

print("✅ Model and artifacts saved successfully!")