# Project Introduction

This project builds a Credit Risk Scoring system using Logistic Regression to predict whether a borrower is likely to default on a loan.

Instead of just showing historical data, this model learns patterns from past borrowers and assigns each customer a probability of default. This allows banks to move from reactive reporting to proactive risk management.

The final output is a risk score for every customer, which is then visualized in Power BI for business users.

In [2]:
# Upload the dataset file into the Colab runtime (google.colab is only importable on Colab).
from google.colab import files
# The returned mapping is read by the next cell, which takes its first key as the file name.
uploaded = files.upload()

Saving Credit_Risk_Dataset_Onyx_Data_September_25.xlsx to Credit_Risk_Dataset_Onyx_Data_September_25.xlsx


In [3]:
import pandas as pd

# The first uploaded file name is taken as the workbook to load.
uploaded_names = list(uploaded)
FILE_NAME = uploaded_names[0]

# Read the Excel workbook into the raw DataFrame used by the rest of the notebook.
df = pd.read_excel(FILE_NAME)

In [4]:
# Print the (rows, columns) shape, then show the first rows as the cell's rich output.
print(df.shape)
df.head()

(32581, 29)


Unnamed: 0,client_ID,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,...,city_latitude,city_longitude,employment_type,loan_term_months,loan_to_income_ratio,other_debt,debt_to_income_ratio,open_accounts,credit_utilization_ratio,past_delinquencies
0,CUST_00001,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,...,43.6532,-79.3832,Self-employed,36,0.59322,8402.45385,0.735635,14,0.495557,0
1,CUST_00002,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,...,43.6532,-79.3832,Full-time,36,0.104167,1607.802794,0.271646,10,0.585436,3
2,CUST_00003,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,...,51.6214,-3.9436,Full-time,36,0.572917,2760.505633,0.860469,14,0.750732,0
3,CUST_00004,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,...,49.2827,-123.1207,Part-time,12,0.534351,7155.28615,0.643592,15,0.379333,0
4,CUST_00005,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,...,42.8864,-78.8784,Part-time,36,0.643382,15626.153439,0.930628,4,0.228103,0


In [5]:
# Column roles: row identifier, label to predict, and model inputs.
ID_COL = "client_ID"
TARGET = "loan_status"

# Order matters: it fixes the column order of `data` and of everything derived from it.
FEATURES = [
    "person_age",
    "person_income",
    "person_home_ownership",
    "person_emp_length",
    "employment_type",
    "loan_intent",
    "loan_grade",
    "loan_amnt",
    "loan_int_rate",
    "loan_term_months",
    "loan_to_income_ratio",
    "debt_to_income_ratio",
    "credit_utilization_ratio",
    "open_accounts",
    "past_delinquencies",
    "cb_person_cred_hist_length",
]

# Independent copy of the raw frame restricted to id, features and target (in that order).
modelling_columns = [ID_COL, *FEATURES, TARGET]
data = df.loc[:, modelling_columns].copy()

# Sanity checks on the label: count of missing values and its mean (reported as the default rate).
print("Missing target:", data[TARGET].isna().sum())
print("Default rate:", data[TARGET].mean())
data.head()

Missing target: 0
Default rate: 0.21816396059052823


Unnamed: 0,client_ID,person_age,person_income,person_home_ownership,person_emp_length,employment_type,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_term_months,loan_to_income_ratio,debt_to_income_ratio,credit_utilization_ratio,open_accounts,past_delinquencies,cb_person_cred_hist_length,loan_status
0,CUST_00001,22,59000,RENT,123.0,Self-employed,PERSONAL,D,35000,16.02,36,0.59322,0.735635,0.495557,14,0,3,1
1,CUST_00002,21,9600,OWN,5.0,Full-time,EDUCATION,B,1000,11.14,36,0.104167,0.271646,0.585436,10,3,2,0
2,CUST_00003,25,9600,MORTGAGE,1.0,Full-time,MEDICAL,C,5500,12.87,36,0.572917,0.860469,0.750732,14,0,3,1
3,CUST_00004,23,65500,RENT,4.0,Part-time,MEDICAL,C,35000,15.23,12,0.534351,0.643592,0.379333,15,0,2,1
4,CUST_00005,24,54400,RENT,8.0,Part-time,MEDICAL,C,35000,14.27,36,0.643382,0.930628,0.228103,4,0,4,1


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

# Feature matrix and label vector for modelling.
X = data[FEATURES]
y = data[TARGET]


# Identify numeric vs categorical.
# "number" selects every numeric dtype; an explicit ["int64", "float64"] list would
# let other numeric widths (e.g. int32 / float32) fall through to the categorical
# branch below and be one-hot encoded by mistake.
numeric_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = [c for c in FEATURES if c not in numeric_features]


print("Numeric:", numeric_features)
print("Categorical:", categorical_features)

# Hold out 20% of the rows for evaluation; stratify on the label and fix the seed
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.2,
    random_state = 42,
    stratify = y
)

# Numeric columns: fill missing values with the median, then standardise.
# NOTE(review): the first row shown by data.head() has person_emp_length = 123.0 for a
# 22-year-old, which looks like a data-entry outlier; imputation does not address
# such values -- confirm whether they should be cleaned before fitting.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical columns: fill missing values with the most frequent level, then one-hot
# encode; levels unseen during fit are ignored at predict time instead of raising.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Route each column group through its own transformer. The "num" block comes first and
# the "cat" block second; the coefficient cell later relies on this order when it
# concatenates feature names.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

model = LogisticRegression(
    max_iter=2000,
    class_weight="balanced",  # helpful when defaults are minority
    solver="lbfgs"
)

# Single estimator: preprocessing followed by the classifier, so the same
# transformations are applied at fit and predict time.
clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])

clf

Numeric: ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_term_months', 'loan_to_income_ratio', 'debt_to_income_ratio', 'credit_utilization_ratio', 'open_accounts', 'past_delinquencies', 'cb_person_cred_hist_length']
Categorical: ['person_home_ownership', 'employment_type', 'loan_intent', 'loan_grade']


In [7]:
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

# Fit the preprocessing + logistic-regression pipeline on the training split.
clf.fit(X_train, y_train)

# Probability of default = column 1 of predict_proba; scores at or above 0.5 are labelled 1.
proba_test = clf.predict_proba(X_test)[:, 1]
pred_test = (proba_test >= 0.5).astype(int)

# Compute the hold-out metrics first, then print them together.
auc = roc_auc_score(y_test, proba_test)
test_confusion = confusion_matrix(y_test, pred_test)
test_report = classification_report(y_test, pred_test)

print("ROC-AUC:", round(auc, 4))
print("\nConfusion Matrix:\n", test_confusion)
print("\nClassification Report:\n", test_report)


ROC-AUC: 0.8735

Confusion Matrix:
 [[4203  892]
 [ 309 1113]]

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.82      0.87      5095
           1       0.56      0.78      0.65      1422

    accuracy                           0.82      6517
   macro avg       0.74      0.80      0.76      6517
weighted avg       0.85      0.82      0.83      6517



In [8]:
import numpy as np

# Score every row of X (training and test rows alike) with the fitted pipeline.
proba_all = clf.predict_proba(X)[:, 1]

# Identifier column plus the predicted probability of default, sharing data's index.
scored = data[[ID_COL]].assign(prob_default=proba_all)

# Optional risk buckets for dashboard.
# The first edge sits just below 0 so that a probability of exactly 0 still falls
# inside the first (left-open) interval.
bucket_edges = [-0.001, 0.2, 0.4, 0.6, 0.8, 1.0]
bucket_labels = ["Very Low", "Low", "Medium", "High", "Very High"]
scored["risk_bucket"] = pd.cut(
    scored["prob_default"],
    bins=bucket_edges,
    labels=bucket_labels,
)

scored.head()


Unnamed: 0,client_ID,prob_default,risk_bucket
0,CUST_00001,0.99229,Very High
1,CUST_00002,0.049293,Very Low
2,CUST_00003,0.994465,Very High
3,CUST_00004,0.895957,Very High
4,CUST_00005,0.972766,Very High


In [9]:
# Write the scored table (id, probability, risk bucket) to CSV without the index column.
OUT_FILE = "credit_risk_logistic_scored.csv"
scored.to_csv(OUT_FILE, index=False)
print("Saved:", OUT_FILE)


Saved: credit_risk_logistic_scored.csv


In [10]:
# Download the scored CSV written by the previous cell from the Colab runtime.
from google.colab import files
files.download(OUT_FILE)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [11]:
import pandas as pd
import numpy as np

# Walk the fitted pipeline down to the one-hot encoder to recover the expanded
# categorical column names.
fitted_preprocessor = clf.named_steps["preprocessor"]
cat_pipeline = fitted_preprocessor.named_transformers_["cat"]
ohe = cat_pipeline.named_steps["onehot"]
cat_feature_names = ohe.get_feature_names_out(categorical_features)

# Numeric names first, then one-hot names: the same order in which the
# ColumnTransformer lists its "num" and "cat" blocks.
all_feature_names = np.concatenate([numeric_features, cat_feature_names])

# Coefficients of the fitted logistic regression (first row of coef_).
coefs = clf.named_steps["model"].coef_[0]

# One row per model input, ranked by absolute coefficient size.
coef_df = (
    pd.DataFrame({"feature": all_feature_names, "coef": coefs})
    .assign(abs_coef=lambda frame: frame["coef"].abs())
    .sort_values("abs_coef", ascending=False)
)

coef_df.head(20)


Unnamed: 0,feature,coef,abs_coef
32,loan_grade_G,2.760337,2.760337
26,loan_grade_A,-1.982441,1.982441
27,loan_grade_B,-1.701126,1.701126
28,loan_grade_C,-1.488015,1.488015
6,loan_to_income_ratio,1.420318,1.420318
14,person_home_ownership_OWN,-1.374247,1.374247
31,loan_grade_F,1.25595,1.25595
15,person_home_ownership_RENT,0.92432,0.92432
30,loan_grade_E,0.839297,0.839297
3,loan_amnt,-0.701731,0.701731


In [12]:
# Import at the top of the cell rather than between statements, so the cell's
# Colab dependency is visible up front.
from google.colab import files

# Write the coefficient table to CSV without the index column, then download it
# from the Colab runtime.
COEF_FILE = "logistic_coefficients.csv"
coef_df.to_csv(COEF_FILE, index=False)
files.download(COEF_FILE)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>