In [None]:
# import neccessary libraries

from sklearn.datasets import fetch_openml
import pandas as pd

# Load dataset

data = fetch_openml("credit-g", as_frame=True, parser="auto")
df = pd.concat([data.data, data.target.rename("class")], axis=1)

X = df.drop(columns=["class"])
y = df["class"]

print(df.shape)                 # expect (1000, 21)
print(X.dtypes.value_counts())  # see mix of categories/numerics
print(y.value_counts())         # counts
print((y.value_counts(normalize=True)*100).round(2))  # percentages

- version 1, status: active
  url: https://www.openml.org/search?type=data&id=31
- version 2, status: active
  url: https://www.openml.org/search?type=data&id=44096



(1000, 21)
int64       7
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
category    1
Name: count, dtype: int64
class
good    700
bad     300
Name: count, dtype: int64
class
good    70.0
bad     30.0
Name: proportion, dtype: float64


In [None]:
# stratified train/validation split (80/20)

from sklearn.model_selection import train_test_split


X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

for name, arr in [("X_train", X_train), ("X_valid", X_valid),
                  ("y_train", y_train), ("y_valid", y_valid)]:
    print(name, arr.shape)

X_train (800, 20)
X_valid (200, 20)
y_train (800,)
y_valid (200,)


In [None]:
# Identifying which features are categorical vs numeric for X_train

cat_cols = X_train.select_dtypes(include=['category','object']).columns.tolist()
num_cols = X_train.select_dtypes(include=['number']).columns.tolist()

print(len(cat_cols), "categorical"); print(len(num_cols), "numeric")
print(cat_cols)

13 categorical
7 numeric
['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']


In [14]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import numpy as np

cat_cols = ['checking_status','credit_history','purpose','savings_status','employment',
            'personal_status','other_parties','property_magnitude','other_payment_plans',
            'housing','job','own_telephone','foreign_worker']
num_cols = [c for c in X_train.columns if c not in cat_cols]

preprocess = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
])

clf = Pipeline([
    ("prep", preprocess),
    ("model", LogisticRegression(max_iter=1000, solver="liblinear"))
])

clf.fit(X_train, y_train)

proba_valid = clf.predict_proba(X_valid)
idx_bad = list(clf.named_steps["model"].classes_).index("bad")
p_bad = proba_valid[:, idx_bad]

auc = roc_auc_score((y_valid=="bad").astype(int), p_bad)
print("Validation ROC-AUC:", round(auc, 3))

Validation ROC-AUC: 0.76


In [24]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

y_bad = (y == "bad").astype(int)  # 1 = bad (positive class), 0 = good
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(clf, X, y_bad, cv=cv, scoring="roc_auc")
print(scores.mean().round(3), scores.std().round(3))


0.786 0.019


In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

rf = RandomForestClassifier(
    n_estimators=400, random_state=42, class_weight="balanced")

rf_clf = Pipeline([("prep", preprocess), ("model", rf)])

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(rf_clf, X, y_bad, cv=cv, scoring="roc_auc")
print(scores.mean().round(3), scores.std().round(3))

0.791 0.022


In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, precision_score, recall_score

y_true = (y_valid=="bad").astype(int)
best = (None, 1e18)  # (t, cost)
for t in np.linspace(0,1,201):
    y_pred = (p_bad >= t).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    cost = fp*1 + fn*5
    if cost < best[1]:
        best = (t, cost)
t_star = best[0]
y_hat = (p_bad >= t_star).astype(int)
print(t_star, confusion_matrix(y_true, y_hat))
print("Precision:", precision_score(y_true, y_hat))
print("Recall:", recall_score(y_true, y_hat))


0.20500000000000002 [[77 63]
 [ 6 54]]
Precision: 0.46153846153846156
Recall: 0.9


In [41]:
from sklearn.metrics import confusion_matrix
y_true = (y_valid=="bad").astype(int)
y_hat_05 = (p_bad >= 0.5).astype(int)
tn, fp, fn, tp = confusion_matrix(y_true, y_hat_05).ravel()
print("cost@0.5 =", fp + 5*fn, "cmat:", [[tn,fp],[fn,tp]])
cost_default = 190
cost_opt = 93
reduction_pct = (cost_default - cost_opt) / cost_default * 100
print(round(reduction_pct, 1))  # â†’ 51.1


cost@0.5 = 190 cmat: [[np.int64(115), np.int64(25)], [np.int64(33), np.int64(27)]]
51.1


In [42]:
import numpy as np
import pandas as pd

pre = clf.named_steps["prep"]
ohe = pre.named_transformers_["cat"]
# Feature names (version-safe)
try:
    cat_feats = ohe.get_feature_names_out(cat_cols)
except:
    cat_feats = ohe.get_feature_names(cat_cols)
feat_names = list(num_cols) + list(cat_feats)

coef = clf.named_steps["model"].coef_.ravel()
or_df = pd.DataFrame({"feature": feat_names, "odds_ratio": np.exp(coef)})

top_risk = or_df.sort_values("odds_ratio", ascending=False).head(5)
top_protect = or_df.sort_values("odds_ratio", ascending=True).head(5)

print("Top risk-increasing (OR>1):")
print(top_risk.to_string(index=False))
print("\nTop protective (OR<1):")
print(top_protect.to_string(index=False))


Top risk-increasing (OR>1):
                                      feature  odds_ratio
                             purpose_used car    3.871150
                  checking_status_no checking    2.712682
                      other_parties_guarantor    2.670500
credit_history_critical/other existing credit    2.324341
                            foreign_worker_no    2.053430

Top protective (OR<1):
                   feature  odds_ratio
         purpose_education    0.493475
           purpose_new car    0.506342
        checking_status_<0    0.508079
       savings_status_<100    0.587095
other_parties_co applicant    0.599541
