In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB,  BernoulliNB
from sklearn.svm import SVC
%config InlineBackend.figure_format = 'svg'
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import tree

In [46]:
credit = pd.read_csv("../data/preprocessed_credit_risk.csv")
credit.head(10)

Unnamed: 0.1,Unnamed: 0,clientid,income,age,loan,default
0,0,1,66155,59,8106,0
1,1,2,34415,48,6564,0
2,2,3,57317,63,8020,0
3,3,4,42709,45,6103,0
4,4,5,66952,18,8770,1
5,5,6,24904,57,15,0
6,6,7,48430,26,5722,0
7,7,8,24500,32,2971,1
8,8,9,40654,55,4755,0
9,9,10,25075,39,1409,0


In [28]:
y = credit["default"]
y

0       0
1       0
2       0
3       0
4       1
       ..
1989    0
1990    0
1991    1
1992    0
1993    0
Name: default, Length: 1994, dtype: int64

In [29]:
credit1 = credit[["clientid", "default", "income", "age", "loan"]]
credit1


Unnamed: 0,clientid,default,income,age,loan
0,1,0,66155,59,8106
1,2,0,34415,48,6564
2,3,0,57317,63,8020
3,4,0,42709,45,6103
4,5,1,66952,18,8770
...,...,...,...,...,...
1989,1996,0,59221,48,1926
1990,1997,0,69516,23,3503
1991,1998,1,44311,28,5522
1992,1999,0,43756,63,1622


In [30]:
X = credit1[credit1.columns[2:credit1.shape[1]]]
X

Unnamed: 0,income,age,loan
0,66155,59,8106
1,34415,48,6564
2,57317,63,8020
3,42709,45,6103
4,66952,18,8770
...,...,...,...
1989,59221,48,1926
1990,69516,23,3503
1991,44311,28,5522
1992,43756,63,1622


In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)

In [32]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [33]:
parameters = {'n_neighbors': np.arange(1,15,1)}

In [34]:
 knn = KNeighborsClassifier()
clf = GridSearchCV(knn, parameters)
clf.fit(X_train, y_train)
clf.best_params_

{'n_neighbors': 6}

In [35]:
 y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(clf.score(X_test, y_test))

[[425   3]
 [ 10  61]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       428
           1       0.95      0.86      0.90        71

    accuracy                           0.97       499
   macro avg       0.97      0.93      0.94       499
weighted avg       0.97      0.97      0.97       499

0.9739478957915831


In [36]:
parameters = {'max_depth': np.arange(1,15,1),
              'max_features': np.arange(5,X.shape[1]-1,1)}

In [54]:
clf = tree.DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(clf.score(X_test, y_test))

[[421   7]
 [  3  68]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       428
           1       0.91      0.96      0.93        71

    accuracy                           0.98       499
   macro avg       0.95      0.97      0.96       499
weighted avg       0.98      0.98      0.98       499

0.9799599198396793


In [55]:
nb_gaussian = GaussianNB()
nb_bernoulli = BernoulliNB()
nb_gaussian.fit(X_train, y_train)
y_pred = nb_gaussian.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(nb_gaussian.score(X_test, y_test))

[[422   6]
 [ 25  46]]
              precision    recall  f1-score   support

           0       0.94      0.99      0.96       428
           1       0.88      0.65      0.75        71

    accuracy                           0.94       499
   macro avg       0.91      0.82      0.86       499
weighted avg       0.94      0.94      0.93       499

0.9378757515030061


In [56]:
nb_bernoulli.fit(X_train, y_train)
y_pred = nb_bernoulli.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(nb_bernoulli.score(X_test, y_test))

[[428   0]
 [ 71   0]]
              precision    recall  f1-score   support

           0       0.86      1.00      0.92       428
           1       0.00      0.00      0.00        71

    accuracy                           0.86       499
   macro avg       0.43      0.50      0.46       499
weighted avg       0.74      0.86      0.79       499

0.8577154308617234


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [57]:
parameters = {'C': np.arange(0.1,1.1,0.1),
              'degree': np.arange(2,5,1)}

In [58]:
svc = SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(X_train, y_train)
clf.best_params_

{'C': 1.0, 'degree': 2}

In [59]:
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(clf.score(X_test, y_test))

[[421   7]
 [  6  65]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.98       428
           1       0.90      0.92      0.91        71

    accuracy                           0.97       499
   macro avg       0.94      0.95      0.95       499
weighted avg       0.97      0.97      0.97       499

0.9739478957915831


In [60]:
parameters = {'C': np.arange(0.1,1.1,0.1)}

In [61]:
lr = LogisticRegression()
clf = GridSearchCV(lr, parameters)
clf.fit(X_train, y_train)
clf.best_params_

{'C': 0.6}

In [62]:
 y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(clf.score(X_test, y_test))

[[416  12]
 [ 16  55]]
              precision    recall  f1-score   support

           0       0.96      0.97      0.97       428
           1       0.82      0.77      0.80        71

    accuracy                           0.94       499
   macro avg       0.89      0.87      0.88       499
weighted avg       0.94      0.94      0.94       499

0.9438877755511023


In [64]:
skf = StratifiedKFold(n_splits=5)
for train_index, test_index in skf.split(X, y):
  X_train, X_test = X.iloc[train_index], X.iloc[test_index]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]
  clf = tree.DecisionTreeClassifier(max_depth=dtc_max_depth, max_features=dtc_max_features)
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)
  print(confusion_matrix(y_test, y_pred))
  print(classification_report(y_test, y_pred))
  print(clf.score(X_test, y_test))

NameError: name 'dtc_max_depth' is not defined