### Imports

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, RocCurveDisplay, auc, roc_auc_score, f1_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

### Data

In [2]:
# Load data
df = pd.read_csv("german_credit_data.csv", sep=';', skiprows=1) 
df = df.iloc[0:1000]

In [3]:
# Setting types
df["duration_t"] = df["duration"].astype('int64')
df["credit_amount_t"] = df["credit_amount"].astype('float64')/1000
df["installment_commitment_t"] = df["installment_commitment"].astype('int64')
df["age_t"] = df["age"].astype('int64')
df["existing_credits_t"] = df["existing_credits"].astype('int64')
df["residence_since_t"] = df["residence_since"].astype('int64')
df["num_dependents_t"] = df["num_dependents"].astype('int64')

In [4]:
# Transformations
df["class_t"] = 1
df.loc[df["class"] == "good", ["class_t"]] = 0
df["checking_status_t"] = 0
df.loc[df["checking_status"] == "<0", ["checking_status_t"]] = 1
df.loc[df["checking_status"] == "0<=X<200", ["checking_status_t"]] = 2
df.loc[df["checking_status"] == ">=200", ["checking_status_t"]] = 3
df["savings_status_t"] = 1
df.loc[df["savings_status"] == "no known savings", ["savings_status_t"]] = 0
df.loc[df["savings_status"] == "100<=X<500", ["savings_status_t"]] = 2
df.loc[df["savings_status"] == "500<=X<1000", ["savings_status_t"]] = 3
df.loc[df["savings_status"] == ">=1000", ["savings_status_t"]] = 4
df["employment_t"] = 1
df.loc[df["employment"] == "unemployed", ["employment_t"]] = 0
df.loc[df["employment"] == "1<=X<4", ["employment_t"]] = 2
df.loc[df["employment"] == "4<=X<7", ["employment_t"]] = 3
df.loc[df["employment"] == ">=7", ["employment_t"]] = 4
df["personal_status_t"] = 0
df.loc[df["personal_status"] == "male single", ["personal_status_t"]] = 1
df.loc[df["personal_status"] == "male mar/wid", ["personal_status_t"]] = 2
df.loc[df["personal_status"] == "male div/sep", ["personal_status_t"]] = 3
df["property_magnitude_t"] = 0
df.loc[df["property_magnitude"] == "car", ["property_magnitude_t"]] = 1
df.loc[df["property_magnitude"] == "life insurance", ["property_magnitude_t"]] = 2
df.loc[df["property_magnitude"] == "real estate", ["property_magnitude_t"]] = 3
df["housing_t"] = 0
df.loc[df["housing"] == "for free", ["housing_t"]] = 1
df.loc[df["housing"] == "own", ["housing_t"]] = 2
df["job_t"] = 0
df.loc[df["job"] == "unskilled resident", ["job_t"]] = 1
df.loc[df["job"] == "skilled", ["job_t"]] = 2
df.loc[df["job"] == "high qualif/self emp/mgmt", ["job_t"]] = 3
df["other_parties_t"] = 0
df.loc[df["other_parties"] == "co applicant", ["other_parties_t"]] = 1
df.loc[df["other_parties"] == "guarantor", ["other_parties_t"]] = 2
df["other_payment_plans_t"] = 0
df.loc[df["other_payment_plans"] == "stores", ["other_payment_plans_t"]] = 1
df.loc[df["other_payment_plans"] == "bank", ["other_payment_plans_t"]] = 2
df["credit_history_t"] = 0
df.loc[df["credit_history"] == "all paid", ["credit_history_t"]] = 1
df.loc[df["credit_history"] == "existing paid", ["credit_history_t"]] = 2
df.loc[df["credit_history"] == "delayed previously", ["credit_history_t"]] = 3
df.loc[df["credit_history"] == "critical/other existing credit", ["credit_history_t"]] = 4
df["own_telephone_t"] = 1
df.loc[df["own_telephone"] == "no", ["own_telephone_t"]] = 0
df["foreign_worker_t"] = 1
df.loc[df["foreign_worker"] == "no", ["foreign_worker_t"]] = 0
# Dummies
df = pd.get_dummies(df,prefix=["purpose"], columns = ["purpose"], drop_first=True)

### Data split

In [5]:
start = ['duration_t', 'credit_amount_t',
       'installment_commitment_t', 'age_t', 'existing_credits_t',
       'residence_since_t', 'num_dependents_t', 'checking_status_t',
       'savings_status_t', 'employment_t', 'personal_status_t',
       'property_magnitude_t', 'housing_t', 'job_t', 'other_parties_t',
       'other_payment_plans_t', 'credit_history_t', 'own_telephone_t',
       'foreign_worker_t', 'purpose_domestic appliance', 'purpose_education',
       'purpose_furniture/equipment', 'purpose_new car', 'purpose_other',
       'purpose_radio/tv', 'purpose_repairs', 'purpose_retraining',
       'purpose_used car']

In [6]:
X, X_test, y, y_test = train_test_split(df[start], df["class_t"], test_size=0.2, random_state=22, stratify=df["class_t"])

# Logistic regression

In [7]:
lr = LogisticRegression(random_state=22, max_iter=800)
lr.fit(X, y)

In [8]:
cmlr = confusion_matrix(y_test, lr.predict(X_test))
print(cmlr)

[[120  20]
 [ 34  26]]


In [9]:
print(classification_report(y_test, lr.predict(X_test)))

              precision    recall  f1-score   support

           0       0.78      0.86      0.82       140
           1       0.57      0.43      0.49        60

    accuracy                           0.73       200
   macro avg       0.67      0.65      0.65       200
weighted avg       0.72      0.73      0.72       200



In [10]:
param_grid = {"solver": ['sag', 'saga', 'lbfgs', 'newton-cg', 'newton-cholesky'], #faktycznie sprawdziłem różne solvery i kary, tu zostawiam tylko wybrane
              "penalty": ['l2'], 
              "random_state": [22],
              "max_iter": [10000]
             }
print(param_grid)

{'solver': ['sag', 'saga', 'lbfgs', 'newton-cg', 'newton-cholesky'], 'penalty': ['l2'], 'random_state': [22], 'max_iter': [10000]}


In [11]:
classifier = LogisticRegression()
grid_search = GridSearchCV(classifier, param_grid=param_grid, cv=5, verbose=0)
grid_search.fit(X, y)

In [12]:
best_param_no = grid_search.best_index_ 
print('Best params set:', grid_search.best_params_, '(no. {0})'.format(best_param_no))
print('Best params score:', grid_search.best_score_ )

Best params set: {'max_iter': 10000, 'penalty': 'l2', 'random_state': 22, 'solver': 'lbfgs'} (no. 2)
Best params score: 0.7387499999999999


In [13]:
best_lr = LogisticRegression(C= 4, max_iter= 10000, penalty= 'l2', random_state= 22, solver= 'lbfgs')
best_lr.fit(X, y)

# Drzewo decyzyjne

In [14]:
tree = DecisionTreeClassifier(random_state=22, criterion='gini')
tree.fit(X, y)

In [15]:
cmt = confusion_matrix(y_test, tree.predict(X_test))
print(cmt)

[[109  31]
 [ 22  38]]


In [16]:
print(classification_report(y_test, tree.predict(X_test)))

              precision    recall  f1-score   support

           0       0.83      0.78      0.80       140
           1       0.55      0.63      0.59        60

    accuracy                           0.73       200
   macro avg       0.69      0.71      0.70       200
weighted avg       0.75      0.73      0.74       200



In [17]:
param_grid = {"max_depth": [20, 15, 10, 5, 3, None],
              "max_features": list(range(1,6)),
              "min_samples_leaf": [1,3,5],
              "min_samples_split": list(range(2,20,2)),
              "random_state": [22],
              "max_leaf_nodes":[None, 2, 3, 4, 5],
              "random_state" : [22]
             }

print(param_grid)

{'max_depth': [20, 15, 10, 5, 3, None], 'max_features': [1, 2, 3, 4, 5], 'min_samples_leaf': [1, 3, 5], 'min_samples_split': [2, 4, 6, 8, 10, 12, 14, 16, 18], 'random_state': [22], 'max_leaf_nodes': [None, 2, 3, 4, 5]}


In [18]:
classifier = DecisionTreeClassifier()
random_search = RandomizedSearchCV(classifier,
                                   param_distributions=param_grid,
                                   cv=4,
                                   n_iter=30, #!
                                   verbose=1, 
                                   scoring='accuracy',
                                   random_state=22
                                   )
random_search.fit(X, y)

Fitting 4 folds for each of 30 candidates, totalling 120 fits


In [19]:
random_search.best_params_

{'random_state': 22,
 'min_samples_split': 10,
 'min_samples_leaf': 5,
 'max_leaf_nodes': None,
 'max_features': 5,
 'max_depth': 5}

In [20]:
random_search.best_score_

0.7012499999999999

In [21]:
param_grid = {"max_depth": list(range(2,10)),
              "max_features": list(range(1,10)),
              "min_samples_leaf": list(range(1,6)),
              "min_samples_split": list(range(4,11)),
              "random_state": [22],
              "max_leaf_nodes": list(range(2,8)),
              "random_state" : [22]
             }

print(param_grid)

{'max_depth': [2, 3, 4, 5, 6, 7, 8, 9], 'max_features': [1, 2, 3, 4, 5, 6, 7, 8, 9], 'min_samples_leaf': [1, 2, 3, 4, 5], 'min_samples_split': [4, 5, 6, 7, 8, 9, 10], 'random_state': [22], 'max_leaf_nodes': [2, 3, 4, 5, 6, 7]}


In [22]:
classifier = DecisionTreeClassifier()

grid_search = GridSearchCV(classifier, param_grid=param_grid, cv=5, verbose=0)
grid_search.fit(X, y)

In [23]:
grid_search.best_params_

{'max_depth': 4,
 'max_features': 7,
 'max_leaf_nodes': 7,
 'min_samples_leaf': 3,
 'min_samples_split': 4,
 'random_state': 22}

In [24]:
grid_search.best_score_

0.70875

In [25]:
best_tr = DecisionTreeClassifier(**grid_search.best_params_)
best_tr.fit(X,y)

## ***XGBoost***

In [26]:
train_valid_x, test_x, train_valid_y, test_y = train_test_split(X, y, test_size=0.2)
data_dm = xgb.DMatrix(data=train_valid_x.values, label=train_valid_y)

In [27]:
params = {
    "objective": "binary:logistic", 
    "booster": "gbtree",
    "learning_rate": 0.2  # DEFAULT: 0.3
}

In [28]:
for max_depth in range(2,15):
    my_params = {"max_depth": max_depth}
    my_params.update(params)
    cv_results = xgb.cv(
        dtrain=data_dm, 
        params=my_params, 
        nfold=4, 
        num_boost_round=100, metrics=["error"], as_pandas=True)
    best_rounds = cv_results["test-error-mean"].argmin()
    score = cv_results["test-error-mean"].min()
    print("For max_depth {} the best result was {} after {} rounds".format(
        max_depth, score, best_rounds
    ))

For max_depth 2 the best result was 0.27031249999999996 after 50 rounds
For max_depth 3 the best result was 0.26718749999999997 after 24 rounds
For max_depth 4 the best result was 0.2671875 after 37 rounds
For max_depth 5 the best result was 0.2640625 after 64 rounds
For max_depth 6 the best result was 0.25625 after 46 rounds
For max_depth 7 the best result was 0.259375 after 79 rounds
For max_depth 8 the best result was 0.265625 after 14 rounds
For max_depth 9 the best result was 0.26406250000000003 after 48 rounds
For max_depth 10 the best result was 0.2625 after 77 rounds
For max_depth 11 the best result was 0.2734375 after 18 rounds
For max_depth 12 the best result was 0.26718749999999997 after 26 rounds
For max_depth 13 the best result was 0.275 after 96 rounds
For max_depth 14 the best result was 0.2734375 after 29 rounds


In [29]:
my_params = {"max_depth": 3, "n_estimators": 39}
my_params.update(params)

best_xgb = xgb.XGBClassifier(**my_params)
best_xgb.fit(train_valid_x, train_valid_y)


## ***Random forest***

In [30]:
rf = RandomForestClassifier()
rf.fit(X,y)
cmrf = confusion_matrix(y_test, rf.predict(X_test))
print(cmrf)

[[126  14]
 [ 33  27]]


In [31]:
param_grid = {"max_depth": [15, 20, 25, 30, 55, 60, 65, None],
             "n_estimators": [39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53],
             "min_samples_leaf": [2, 3, 4, 5],
             "max_features": ['auto', 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24],
             "criterion" :["gini", "entropy", "log_loss"],
             }

In [32]:
rfc = RandomForestClassifier()
random_search = RandomizedSearchCV(rfc, param_distributions=param_grid, cv=4, n_iter=200, verbose=1, n_jobs=-1)
random_search.fit(X, y)

Fitting 4 folds for each of 200 candidates, totalling 800 fits


In [33]:
score = random_search.cv_results_['mean_test_score']
score

array([0.73   , 0.75625, 0.73625, 0.72875, 0.74625, 0.7575 , 0.74375,
       0.745  , 0.74375, 0.75625, 0.73875, 0.7375 , 0.74125, 0.74   ,
       0.74125, 0.735  , 0.74   , 0.74375, 0.73875, 0.74   , 0.74625,
       0.7325 , 0.75625, 0.745  , 0.74125, 0.7375 , 0.76125, 0.75   ,
       0.735  , 0.7425 , 0.74375, 0.75   , 0.74375, 0.735  , 0.73375,
       0.7375 , 0.74375, 0.75125, 0.74875, 0.75   , 0.7525 , 0.7275 ,
       0.75125, 0.73   , 0.74375, 0.7575 , 0.7525 , 0.75625, 0.74875,
       0.745  , 0.74875, 0.74375, 0.74625, 0.745  , 0.75   , 0.74625,
       0.74125, 0.7475 , 0.75125, 0.75375, 0.735  , 0.7475 , 0.75   ,
       0.75   , 0.7425 , 0.74125, 0.75125, 0.735  , 0.75   , 0.745  ,
       0.745  , 0.74375, 0.73625, 0.75   , 0.7325 , 0.76375, 0.7425 ,
       0.74875, 0.735  , 0.735  , 0.74   , 0.73375, 0.74125, 0.74125,
       0.75375, 0.75375, 0.74   , 0.73875, 0.73625, 0.74375, 0.74625,
       0.7375 , 0.755  , 0.74   , 0.75125, 0.75   , 0.745  , 0.75375,
       0.74   , 0.74

In [34]:
ranks = random_search.cv_results_['rank_test_score']
score = random_search.cv_results_['mean_test_score']
n = 8
k = []

for i in range(n):
    for j in np.where(ranks == i + 1):
        k.extend(j)

for idx in k:
    print(random_search.cv_results_['params'][idx])
    print(score[idx])

{'n_estimators': 42, 'min_samples_leaf': 5, 'max_features': 15, 'max_depth': 15, 'criterion': 'entropy'}
0.76375
{'n_estimators': 41, 'min_samples_leaf': 2, 'max_features': 12, 'max_depth': 15, 'criterion': 'gini'}
0.76125
{'n_estimators': 50, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 60, 'criterion': 'entropy'}
0.76125
{'n_estimators': 40, 'min_samples_leaf': 3, 'max_features': 17, 'max_depth': 55, 'criterion': 'entropy'}
0.76
{'n_estimators': 51, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 30, 'criterion': 'gini'}
0.76
{'n_estimators': 44, 'min_samples_leaf': 4, 'max_features': 19, 'max_depth': 20, 'criterion': 'entropy'}
0.75875
{'n_estimators': 51, 'min_samples_leaf': 5, 'max_features': 13, 'max_depth': 65, 'criterion': 'gini'}
0.75875
{'n_estimators': 50, 'min_samples_leaf': 4, 'max_features': 17, 'max_depth': 20, 'criterion': 'log_loss'}
0.7575000000000001
{'n_estimators': 44, 'min_samples_leaf': 4, 'max_features': 15, 'max_depth': 65, 'criterion

In [35]:
param_grid2 = {"max_depth": [15, 25, 30],
             "n_estimators": [41, 42, 43, 44, 45, 51],
             "min_samples_leaf": [2, 3, 4, 5],
             "max_features": ['auto', 13, 14, 15,16],
             "criterion" :["log_loss", "gini", "entropy"], 
             }

In [36]:
rfcg = RandomForestClassifier()
grid_search = GridSearchCV(rfcg, param_grid=param_grid2, cv=4, verbose=1, n_jobs=-1)
grid_search.fit(X, y)

Fitting 4 folds for each of 1080 candidates, totalling 4320 fits


  warn(


In [37]:
grid_search.best_params_

{'criterion': 'log_loss',
 'max_depth': 15,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'n_estimators': 45}

Result:
{'criterion': 'log_loss',
 'max_depth': None,
 'max_features': 13,
 'min_samples_leaf': 2,
 'n_estimators': 39}

In [38]:
grid_search.best_score_
#0.768749

0.7675

In [39]:
best_rf = RandomForestClassifier(**grid_search.best_params_)
best_rf.fit(X, y)

  warn(


# ***SGD***

In [40]:
stds = StandardScaler()
XS = X.copy()
X_testS = X_test.copy()
stds.fit_transform(XS)
stds.transform(X_testS)

array([[ 0.31422566, -0.27830485,  0.90205822, ..., -0.14285714,
        -0.09395338, -0.34705645],
       [ 1.86118277,  1.15568489, -0.88862807, ..., -0.14285714,
        -0.09395338, -0.34705645],
       [ 1.3455304 ,  2.70951387, -0.88862807, ..., -0.14285714,
        -0.09395338,  2.88137564],
       ...,
       [-0.71707907, -0.22153889, -0.88862807, ..., -0.14285714,
        -0.09395338, -0.34705645],
       [ 1.3455304 , -0.15883662,  0.90205822, ..., -0.14285714,
        -0.09395338, -0.34705645],
       [-0.71707907, -0.70497709,  0.00671507, ..., -0.14285714,
        -0.09395338, -0.34705645]])

In [41]:
sgd = SGDClassifier(max_iter=1000, tol=1e-3, random_state=22)

In [42]:
sgd.fit(X,y)

In [43]:
cmsgd = confusion_matrix(y_test, sgd.predict(X_test))
print(cmsgd)

[[51 89]
 [ 7 53]]


In [110]:
param_grid_SGD = {"loss": ["hinge", "log_loss", "log", "modified_huber", "squared_hinge", "perceptron", "squared_error", "huber", "epsilon_insensitive", "squared_epsilon_insensitive"],
                  "penalty": ["l2", "l1", "elasticnet"],
                  "max_iter" : [10000], 
                  "tol" : [1e-4], 
                  "random_state" : [22],
             }

In [111]:
sgdg = SGDClassifier()
grid_search_SGD = GridSearchCV(sgdg, param_grid_SGD, cv=2, verbose=1, n_jobs=-1)
grid_search_SGD.fit(X,y)

Fitting 2 folds for each of 30 candidates, totalling 60 fits


In [106]:
grid_search_SGD.best_params_

{'loss': 'log_loss', 'max_iter': 10000, 'random_state': 22, 'tol': 0.0001}

In [112]:
grid_search_SGD.best_score_

0.71625

In [113]:
best_sgd = SGDClassifier(**grid_search_SGD.best_params_)
best_sgd.fit(X,y)

# ***Podsumowanie i wnioski***

In [117]:
models = {'Logistic regression': best_lr,
            'Decision tree 1': tree, 
            'Decision tree 2': best_tr, 
            'XGBoost': best_xgb, 
            'Random forest 1': rf,
            'Random forest 2': best_rf, 
            'SGD 1': sgd,
            'SGD 2': best_sgd}

In [118]:
for i in models:
    print(i)
    print(confusion_matrix(y_test, models[i].predict(X_test)))

Logistic regression
[[120  20]
 [ 34  26]]
Decision tree 1
[[109  31]
 [ 22  38]]
Decision tree 2
[[114  26]
 [ 34  26]]
XGBoost
[[118  22]
 [ 28  32]]
Random forest 1
[[126  14]
 [ 33  27]]
Random forest 2
[[123  17]
 [ 27  33]]
SGD 1
[[51 89]
 [ 7 53]]
SGD 2
[[133   7]
 [ 49  11]]


In [116]:
for i in models:
    print(i)
    print(classification_report(y_test, models[i].predict(X_test)))

Linear regression
              precision    recall  f1-score   support

           0       0.78      0.86      0.82       140
           1       0.57      0.43      0.49        60

    accuracy                           0.73       200
   macro avg       0.67      0.65      0.65       200
weighted avg       0.72      0.73      0.72       200

Decision tree 1
              precision    recall  f1-score   support

           0       0.83      0.78      0.80       140
           1       0.55      0.63      0.59        60

    accuracy                           0.73       200
   macro avg       0.69      0.71      0.70       200
weighted avg       0.75      0.73      0.74       200

Decision tree 2
              precision    recall  f1-score   support

           0       0.77      0.81      0.79       140
           1       0.50      0.43      0.46        60

    accuracy                           0.70       200
   macro avg       0.64      0.62      0.63       200
weighted avg       0.69 

# Wnioski

Z punktu widzenia banku dużo większe skutki przynosi brak spłaty kredytu niż utrata pojedynczego klienta, bo w pierwszym przypadku traci kapitał a w drugim odsetki od tego kapitału. 

Z tego powodu pierwszym kryterium byłoby Precision, bo pokazuje ilościowo jakość portfela po zastosowaniu modelu. Ilu mamy dobrych klientów wśród nowo przyjętych klientów, bo bank przyjmuje tylko dobrych według modelu. Jeśli byłoby to jedyne kryterium to najlepiej wypada SGD w defaultowych ustawieniach z wynikiem 0.88. Natomiast jest to wynik osiągnięty przy skrajnie niskim recallu. Zbyt wielu klientów jest odrzucanych.

Drugim kryterium byłaby liczba dobrych klientów. Każdemu bankowi zależy żeby mieć jak najwięcej dobrych klientów. Uwzględniając ten warunek najlepszym modelem byłby random forest z numerem 2 czyli po zoptymalizowaniu hiperparametrów z precyzją na poziomie 0.82 we wskazywaniu dobrych klientów i z najwyższą w zestawieniu precyzją we wskazywaniu złych klientów na poziomie 0.66. 

Zastosowanie modelu znacząco poprawi jakość portfela banku.