In [103]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier


In [79]:
cs = pd.read_csv("CreditScoring.csv")

In [80]:
cs.head()

Unnamed: 0,Status,Seniority,Home,Time,Age,Marital,Records,Job,Expenses,Income,Assets,Debt,Amount,Price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [81]:
cs.columns.values

array(['Status', 'Seniority', 'Home', 'Time', 'Age', 'Marital', 'Records',
       'Job', 'Expenses', 'Income', 'Assets', 'Debt', 'Amount', 'Price'],
      dtype=object)

In [82]:
cs.columns = cs.columns.str.lower()

In [83]:
cs.columns.values

array(['status', 'seniority', 'home', 'time', 'age', 'marital', 'records',
       'job', 'expenses', 'income', 'assets', 'debt', 'amount', 'price'],
      dtype=object)

In [84]:
status_values = {1: 'ok', 2: 'default', 3: "unk"}
home_values = {1: 'rent', 2: 'owner', 3: 'private', 4: 'ignore', 5: 'parents', 6: 'other', 0: 'unk'}
marital_values = {1: 'single', 2: 'married', 3: 'widow', 4: 'separated', 5: 'divorced', 6: 'unk'}
records_values = {1: 'no', 2: 'yes', 3: 'unk'}
job_values = {1: 'fixed', 2: 'parttime', 3: 'freelance', 4: 'others', 0: 'unk'}

In [85]:
cs.status = cs.status.map(status_values)
cs.home = cs.home.map(home_values)
cs.marital = cs.marital.map(marital_values)
cs.records = cs.records.map(records_values)
cs.job = cs.job.map(job_values)

In [86]:
cs.head(10)

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,rent,60,30,married,no,freelance,73,129,0,0,800,846
1,ok,17,rent,60,58,widow,no,fixed,48,131,0,0,1000,1658
2,default,10,owner,36,46,married,yes,freelance,90,200,3000,0,2000,2985
3,ok,0,rent,60,24,single,no,fixed,63,182,2500,0,900,1325
4,ok,0,rent,36,26,single,no,fixed,46,107,0,0,310,910
5,ok,1,owner,60,36,married,no,fixed,75,214,3500,0,650,1645
6,ok,29,owner,60,44,married,no,fixed,75,125,10000,0,1600,1800
7,ok,9,parents,12,27,single,no,fixed,35,80,0,0,200,1093
8,ok,0,owner,60,32,married,no,freelance,90,107,15000,0,1200,1957
9,default,0,parents,48,41,married,no,parttime,90,80,0,0,1200,1468


In [87]:
cs.describe().round().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
seniority,4455.0,8.0,8.0,0.0,2.0,5.0,12.0,48.0
time,4455.0,46.0,15.0,6.0,36.0,48.0,60.0,72.0
age,4455.0,37.0,11.0,18.0,28.0,36.0,45.0,68.0
expenses,4455.0,56.0,20.0,35.0,35.0,51.0,72.0,180.0
income,4455.0,763317.0,8703625.0,0.0,80.0,120.0,166.0,99999999.0
assets,4455.0,1060341.0,10217569.0,0.0,0.0,3500.0,6000.0,99999999.0
debt,4455.0,404382.0,6344253.0,0.0,0.0,0.0,0.0,99999999.0
amount,4455.0,1039.0,475.0,100.0,700.0,1000.0,1300.0,5000.0
price,4455.0,1463.0,628.0,105.0,1118.0,1400.0,1692.0,11140.0


In [88]:
for col in ['income', 'assets','debt']:
    cs[col] = cs[col].replace(to_replace=99999999, value=np.nan)

In [89]:
cs.describe().round().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
seniority,4455.0,8.0,8.0,0.0,2.0,5.0,12.0,48.0
time,4455.0,46.0,15.0,6.0,36.0,48.0,60.0,72.0
age,4455.0,37.0,11.0,18.0,28.0,36.0,45.0,68.0
expenses,4455.0,56.0,20.0,35.0,35.0,51.0,72.0,180.0
income,4421.0,131.0,86.0,0.0,80.0,120.0,165.0,959.0
assets,4408.0,5403.0,11573.0,0.0,0.0,3000.0,6000.0,300000.0
debt,4437.0,343.0,1246.0,0.0,0.0,0.0,0.0,30000.0
amount,4455.0,1039.0,475.0,100.0,700.0,1000.0,1300.0,5000.0
price,4455.0,1463.0,628.0,105.0,1118.0,1400.0,1692.0,11140.0


In [90]:
cs.status.value_counts()

ok         3200
default    1254
Name: status, dtype: int64

In [91]:
cs_train_full, cs_test = train_test_split(cs, test_size=0.2, random_state=42)
cs_train, cs_val = train_test_split(cs_train_full, test_size= 0.25, random_state=42)

In [92]:
len(cs_train), len(cs_val), len(cs_test)

(2673, 891, 891)

In [93]:
y_train = (cs_train.status == 'default').values
y_test = (cs_test.status == 'default').values

del(cs_train['status'])
del(cs_test['status'])

print(cs_train.head(10))
print(y_train[0:10])

      seniority     home  time  age    marital records        job  expenses  \
353          14    owner    60   30    married      no      fixed        60   
990           2  parents    60   35    married      no      fixed        75   
3289          8     rent    36   61     single      no      fixed        42   
4002         14    owner    60   40    married      no      fixed        45   
1273          2    other    60   41  separated      no  freelance        35   
2019         15    owner    60   52    married     yes      fixed        60   
894          15     rent    60   37    married      no      fixed        58   
147          14    owner    60   64    married      no  freelance        60   
2839          2    owner    60   32    married     yes      fixed        60   
397           1  private    48   34    married     yes  freelance        45   

      income   assets    debt  amount  price  
353     70.0   4000.0  2800.0     600   1125  
990    104.0      0.0     0.0    120

In [94]:
cs_train = cs_train.fillna(0)
cs_test = cs_test.fillna(0)

In [97]:
cs_train_dict = cs_train.to_dict(orient='records')
cs_val_dict = cs_val.to_dict(orient='records')
cs_val_dict


[{'status': 'ok',
  'seniority': 7,
  'home': 'rent',
  'time': 36,
  'age': 49,
  'marital': 'married',
  'records': 'no',
  'job': 'fixed',
  'expenses': 57,
  'income': 115.0,
  'assets': 0.0,
  'debt': 0.0,
  'amount': 675,
  'price': 675},
 {'status': 'ok',
  'seniority': 19,
  'home': 'owner',
  'time': 48,
  'age': 52,
  'marital': 'married',
  'records': 'no',
  'job': 'fixed',
  'expenses': 60,
  'income': 68.0,
  'assets': 6000.0,
  'debt': 0.0,
  'amount': 850,
  'price': 1393},
 {'status': 'default',
  'seniority': 2,
  'home': 'rent',
  'time': 48,
  'age': 36,
  'marital': 'divorced',
  'records': 'no',
  'job': 'fixed',
  'expenses': 45,
  'income': 99.0,
  'assets': 0.0,
  'debt': 0.0,
  'amount': 750,
  'price': 1350},
 {'status': 'default',
  'seniority': 1,
  'home': 'owner',
  'time': 48,
  'age': 42,
  'marital': 'divorced',
  'records': 'no',
  'job': 'fixed',
  'expenses': 35,
  'income': 210.0,
  'assets': 6500.0,
  'debt': 1500.0,
  'amount': 1650,
  'price': 1

In [101]:
dv = DictVectorizer(sparse=False)

X_train = dv.fit_transform(cs_train_dict)
X_val = dv.fit_transform(cs_val_dict)

In [102]:
def access_risk(client):
    if client['records'] == 'yes':
        if client['job'] == 'parttime':
            return 'default'
        else:
            return 'ok'
    else:
        if client['assets'] > 6000:
            return 'ok'
        else:
            return 'default'

In [104]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)