In [1]:
import pandas as pd
import numpy as np

In [2]:
credit = pd.read_csv('credit_history.csv')
credit.head()

Unnamed: 0,default,amount,grade,years,ownership,income,age
0,0,1000,B,2.0,RENT,19200.0,24
1,1,6500,A,2.0,MORTGAGE,66000.0,28
2,0,2400,A,2.0,RENT,60000.0,36
3,0,10000,C,3.0,RENT,62000.0,24
4,1,4000,C,2.0,RENT,20000.0,28


In [3]:
credit.describe()

Unnamed: 0,default,amount,years,income,age
count,7727.0,7727.0,7448.0,7727.0,7727.0
mean,0.478452,9453.345412,6.086332,63371.97,27.542125
std,0.499568,6298.5958,6.700758,46871.95,6.132121
min,0.0,500.0,0.0,4000.0,20.0
25%,0.0,5000.0,2.0,37000.0,23.0
50%,0.0,8000.0,4.0,54000.0,26.0
75%,1.0,12000.0,8.0,76796.5,30.0
max,1.0,35000.0,62.0,1200000.0,94.0


In [4]:
credit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7727 entries, 0 to 7726
Data columns (total 7 columns):
default      7727 non-null int64
amount       7727 non-null int64
grade        7727 non-null object
years        7448 non-null float64
ownership    7727 non-null object
income       7727 non-null float64
age          7727 non-null int64
dtypes: float64(2), int64(3), object(2)
memory usage: 422.6+ KB


In [6]:
credit.isnull().sum()/7727

default      0.000000
amount       0.000000
grade        0.000000
years        0.036107
ownership    0.000000
income       0.000000
age          0.000000
dtype: float64

In [7]:
credit['years'].fillna(credit['years'].median(), inplace = True)

In [8]:
credit.isnull().sum()/7727

default      0.0
amount       0.0
grade        0.0
years        0.0
ownership    0.0
income       0.0
age          0.0
dtype: float64

In [10]:
from sklearn.model_selection import train_test_split

In [20]:
X = credit.drop('default', axis = 1)

In [21]:
X = pd.get_dummies(X,drop_first=True )

In [12]:
y = credit['default']

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [9]:
from sklearn.linear_model import LogisticRegression

In [14]:
clf = LogisticRegression(random_state=0, solver='lbfgs')

In [29]:
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [33]:
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

0.542344674556213
0.5653298835705045


In [36]:
#from sklearn.tree import DecisionTreeClassifier
import sklearn.tree as tree
DT = tree.DecisionTreeClassifier(max_depth=4,random_state=200, criterion = 'gini')

In [37]:
DT.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=200, splitter='best')

In [38]:
print(DT.score(X_train, y_train))
print(DT.score(X_test, y_test))

0.634430473372781
0.6123329021129797


In [40]:
for depth in range(3,15):
    DT = tree.DecisionTreeClassifier(max_depth=depth,random_state=200, criterion = 'entropy')
    DT.fit(X_train, y_train)
    print(depth)
    print(DT.score(X_train, y_train))
    print(DT.score(X_test, y_test))

3
0.6274038461538461
0.610608020698577
4
0.6342455621301775
0.6127641224665804
5
0.6433062130177515
0.6209573091849935
6
0.6601331360946746
0.6278568348426046
7
0.6740014792899408
0.628719275549806
8
0.6850961538461539
0.6304441569642087
9
0.7013683431952663
0.6330314790858128
10
0.7165310650887574
0.6265631737818025
11
0.7366863905325444
0.6347563605002156
12
0.7538831360946746
0.6239758516601983
13
0.7723742603550295
0.6067270375161707
14
0.7990014792899408
0.6093143596377749


In [45]:
from sklearn.model_selection import GridSearchCV

In [51]:
DT = tree.DecisionTreeClassifier(random_state=200)

In [52]:
mod = GridSearchCV(DT,cv=5
,param_grid={'max_depth':[5,10,15,20,25,30,35,40]
             , 'criterion': ['gini', 'entropy'], })

In [None]:
mod.fit(X_train,y_train)

In [54]:
mod.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=200, splitter='best')

In [55]:
mod.best_score_

0.6264792899408284