In [1]:
# Required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    roc_auc_score, roc_curve, classification_report,
    confusion_matrix, precision_recall_curve
)
import joblib

In [2]:
# ---------- 1) Load data ----------
# The CSV path is resolved relative to the notebook's working directory; adjust path if needed.
FILE = "Task 3 and 4_Loan_Data.csv"
df = pd.read_csv(filepath_or_buffer=FILE)

In [3]:
# Quick sanity check: report the frame's dimensions, then show its first five rows.
print("Rows, cols:", df.shape)
print(df.head(n=5))

Rows, cols: (10000, 8)
   customer_id  credit_lines_outstanding  loan_amt_outstanding  \
0      8153374                         0           5221.545193   
1      7442532                         5           1958.928726   
2      2256073                         0           3363.009259   
3      4885975                         0           4766.648001   
4      4700614                         1           1345.827718   

   total_debt_outstanding       income  years_employed  fico_score  default  
0             3915.471226  78039.38546               5         605        0  
1             8228.752520  26648.43525               2         572        1  
2             2027.830850  65866.71246               4         602        0  
3             2501.730397  74356.88347               5         612        0  
4             1768.826187  23448.32631               6         631        0  


In [5]:
# Column roles: the label to predict and the per-row identifier.
TARGET = "default"
ID_COL = "customer_id"

# Feature matrix = every column except the label and the identifier;
# the label column on its own becomes the target vector.
X = df.drop(labels=[TARGET, ID_COL], axis="columns")
y = df.loc[:, TARGET]

In [6]:
# ---------- 3) Train/test split ----------
# 20% of the rows are held out for testing. The split is stratified on y and
# uses a fixed seed so it is repeatable across runs.
holdout_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **holdout_options)

In [7]:
# ---------- 4) Preprocessing: scaling numeric features ----------
scaler = StandardScaler()
numeric_cols = list(X.columns)   # all are numeric here

In [8]:
# ---------- 5) Model pipelines ----------
# Each pipeline gets its OWN StandardScaler instance. Pipeline does not clone
# its steps when no `memory` cache is configured, so handing one scaler object
# to both pipelines would make them hold (and refit in place) the very same
# transformer: fitting either pipeline on different data would silently change
# the scaling statistics used by the other, including a pipeline already
# selected or saved. Default-constructed scalers keep the configuration unchanged.
log_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(class_weight="balanced", solver="liblinear", random_state=42))
])

rf_pipe = Pipeline([
    ("scaler", StandardScaler()),  # RF doesn't strictly need scaling but leaving it is fine in a pipeline
    ("clf", RandomForestClassifier(n_estimators=200, class_weight="balanced", random_state=42))
])

In [9]:
# ---------- 6) Quick baseline training ----------
# Both candidates are fitted on the training split only.
print("\nTraining Logistic Regression...")
log_pipe.fit(X=X_train, y=y_train)

print("Training Random Forest...")
# Left as the cell's last expression, so the fitted pipeline is displayed as the cell output.
rf_pipe.fit(X=X_train, y=y_train)


Training Logistic Regression...
Training Random Forest...


0,1,2
,steps,"[('scaler', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [10]:
# ---------- 7) Evaluate helper ----------
def evaluate_model(pipe, X_t, y_t, name="model"):
    """Print evaluation metrics for a classifier and return its raw scores.

    Parameters
    ----------
    pipe : object exposing ``predict_proba`` and ``predict``.
    X_t : features to score.
    y_t : true labels matching ``X_t``.
    name : label shown in the printed header.

    Returns
    -------
    dict
        ``"auc"``   -- ROC AUC computed from the column-1 probabilities,
        ``"probs"`` -- column 1 of ``predict_proba``'s output,
        ``"preds"`` -- the output of ``predict``.
    """
    # Column 1 of predict_proba is used as the positive-class score.
    positive_scores = pipe.predict_proba(X_t)[:, 1]
    hard_labels = pipe.predict(X_t)
    roc_auc = roc_auc_score(y_t, positive_scores)

    print(f"===== {name} =====")
    print("ROC AUC:", round(roc_auc, 4))

    report_text = classification_report(y_t, hard_labels, digits=4)
    print("Classification report:\n", report_text)

    matrix = confusion_matrix(y_t, hard_labels)
    print("Confusion matrix:\n", matrix)

    return {"auc": roc_auc, "probs": positive_scores, "preds": hard_labels}

# Score both fitted pipelines on the held-out test split.
res_log = evaluate_model(pipe=log_pipe, X_t=X_test, y_t=y_test, name="LogisticRegression")
res_rf = evaluate_model(pipe=rf_pipe, X_t=X_test, y_t=y_test, name="RandomForest")

===== LogisticRegression =====
ROC AUC: 1.0
Classification report:
               precision    recall  f1-score   support

           0     1.0000    0.9896    0.9948      1630
           1     0.9561    1.0000    0.9775       370

    accuracy                         0.9915      2000
   macro avg     0.9780    0.9948    0.9862      2000
weighted avg     0.9919    0.9915    0.9916      2000

Confusion matrix:
 [[1613   17]
 [   0  370]]
===== RandomForest =====
ROC AUC: 0.9999
Classification report:
               precision    recall  f1-score   support

           0     0.9969    0.9975    0.9972      1630
           1     0.9892    0.9865    0.9878       370

    accuracy                         0.9955      2000
   macro avg     0.9930    0.9920    0.9925      2000
weighted avg     0.9955    0.9955    0.9955      2000

Confusion matrix:
 [[1626    4]
 [   5  365]]


In [11]:
# ---------- 8) Choose best model by AUC ----------
# Ties go to logistic regression (the comparison is >=).
# NOTE(review): the AUCs compared here were computed on the test split, so the
# reported test metrics also drove model selection -- consider choosing on a
# validation split or via cross-validation instead.
if res_log["auc"] >= res_rf["auc"]:
    best_model = log_pipe
else:
    best_model = rf_pipe

chosen_label = "LogisticRegression" if best_model is log_pipe else "RandomForest"
print("\nSelected best model:", chosen_label)


Selected best model: LogisticRegression


In [12]:
# ---------- 9) Save the model ----------
# Persist the selected pipeline (scaler + classifier) as a single joblib file
# in the current working directory.
joblib.dump(value=best_model, filename="best_default_model.joblib")
print("Saved best model to best_default_model.joblib")

Saved best model to best_default_model.joblib


In [13]:
# ---------- 10) Example: predict probabilities for new customers ----------
# Two hypothetical applicants, one record each, using the same feature columns as X.
# NOTE(review): these loan/debt amounts are several times larger than the values
# visible in the data preview, so the near-1.0 probabilities printed below may
# be an extrapolation artefact -- confirm against the training ranges.
new_customers = pd.DataFrame([
    {
        "credit_lines_outstanding": 3,
        "loan_amt_outstanding": 20000.0,
        "total_debt_outstanding": 15000.0,
        "income": 45000.0,
        "years_employed": 2,
        "fico_score": 580,
    },
    {
        "credit_lines_outstanding": 8,
        "loan_amt_outstanding": 50000.0,
        "total_debt_outstanding": 65000.0,
        "income": 120000.0,
        "years_employed": 8,
        "fico_score": 720,
    },
])

# Column 1 of predict_proba is taken as the default probability.
class_probs = best_model.predict_proba(new_customers)
probs = class_probs[:, 1]
print("\nNew customer default probabilities:\n", probs)


New customer default probabilities:
 [0.99951052 1.        ]
