In [17]:
import wget
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
# name = 'creditscore.csv'
# url = 'https://github.com/alexeygrigorev/mlbookcamp-code/raw/master/chapter-06-trees/CreditScoring.csv'
# wget.download(url, name)

In [18]:
df = pd.read_csv('creditscore.csv')
#df.head()

In [19]:
df.columns = df.columns.str.lower().str.replace(' ', '_')
cat_cols = df.dtypes[df.dtypes == 'object'].index.tolist()

for col in cat_cols:
    df[col] = df[col].str.lower().str.replace(' ', '_')

In [20]:
cat_col = ['home','marital','records', 'job']
num_col = ['seniority', 'time', 'age', 'expenses', 
            'income', 'assets', 'debt', 'amount', 'price']

status_mapping = {1:'ok', 2:'default', 0:'unk'}
df['status'] = df['status'].map(status_mapping)

home_mapping = {1:'rent', 2:'owner', 3:'priv', 4:'ignore', 5:'parents', 6:'others', 0:'unk'}
df['home'] = df['home'].map(home_mapping)

marital_mapping = {1:'single', 2:'married', 3:'widow', 4:'seperated', 5:'divorced', 0:'unk'}
df['marital'] = df['marital'].map(marital_mapping)

record_mapping = {1:'no_rec', 2:'yes_rec'}
df['records'] = df['records'].map(record_mapping)

job_mapping = {1:'fixed', 2:'parttime', 3:'freelance', 4:'others', 0:'unk'}

In [21]:
#df.describe().round()

In [22]:
for c in ['income', 'assets', 'debt']:
    df[c] = df[c].replace(to_replace =99999999.0, value = np.nan)

df.fillna(0, inplace = True)
#df.isnull().sum()

In [11]:
df = df[df['status'] != 0].reset_index(drop = True)
df.to_csv('transformed_data.csv')

In [23]:
full_train_df, full_test_df = train_test_split(df, test_size =0.2, random_state=11)
full_train_df, full_val_df = train_test_split(full_train_df, test_size = 0.25, random_state=11)

train_df = full_train_df.reset_index(drop = True)
test_df = full_test_df.reset_index(drop = True)
val_df = full_val_df.reset_index(drop = True)

y_train = (train_df['status'] == 'default').astype('int')
y_test = (test_df['status'] == 'default').astype('int')
y_val = (val_df['status'] == 'default').astype('int')

dv = DictVectorizer(sparse = False)
dv.fit(train_df[cat_col + num_col].to_dict(orient = 'records'))
feature_names = dv.get_feature_names()

X_train = dv.transform(train_df[cat_col + num_col].to_dict(orient = 'records'))
X_test = dv.transform(test_df[cat_col + num_col].to_dict(orient = 'records'))
X_val = dv.transform(val_df[cat_col + num_col].to_dict(orient = 'records'))


del train_df['status']
del test_df['status']
del val_df['status']

In [24]:
def risk(data):
    if data['records'] == 'yes':
        if data['job'] == 'parttime':
            return 'default'
        else:
            return 'ok'

    else:
        if data['assets'] > 6000:
            return 'ok'
        else:
            return 'default'

x = train_df.iloc[0].to_dict()
risk(x)

'ok'

In [25]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

train_pred = model.predict_proba(X_train)[:,1]
val_pred = model.predict_proba(X_val)[:,1]

print(roc_auc_score(y_train, train_pred))
print(roc_auc_score(y_val, val_pred))

1.0
0.6416797488226059


In [26]:
model = DecisionTreeClassifier(max_depth=3)
model.fit(X_train, y_train)

train_pred = model.predict_proba(X_train)[:,1]
val_pred = model.predict_proba(X_val)[:,1]

print(roc_auc_score(y_train, train_pred))
print(roc_auc_score(y_val, val_pred))

0.7741250716750189
0.7376574494122301


In [27]:
model = DecisionTreeClassifier(max_depth=4)
model.fit(X_train, y_train)

train_pred = model.predict_proba(X_train)[:,1]
val_pred = model.predict_proba(X_val)[:,1]

print(roc_auc_score(y_train, train_pred))
print(roc_auc_score(y_val, val_pred))

0.8036576448892013
0.7706090310139804


In [28]:
#print(export_text(model))

In [29]:
depths = [1,2,3,4,5,6,7,9,45,200,None]

for d in depths:
    model = DecisionTreeClassifier(max_depth=d)
    model.fit(X_train, y_train)

    train_pred = model.predict_proba(X_train)[:,1]
    val_pred = model.predict_proba(X_val)[:,1]

    print('Validation data %4s == %.3f' % (d, roc_auc_score(y_val, val_pred)))
    print()

Validation data    1 == 0.613

Validation data    2 == 0.690

Validation data    3 == 0.738

Validation data    4 == 0.771

Validation data    5 == 0.769

Validation data    6 == 0.749

Validation data    7 == 0.738

Validation data    9 == 0.699

Validation data   45 == 0.650

Validation data  200 == 0.640

Validation data None == 0.632



In [33]:
leaves = [2,7,8,9,15,23,34,40]

for l in leaves:
            
    model = DecisionTreeClassifier(max_leaf_nodes= l)
    model.fit(X_train, y_train)

    val_pred = model.predict_proba(X_val)[:,1]
    auc = roc_auc_score(y_val, val_pred)
    print('Validation data %4s == %.3f' % (l, auc))
    print()

Validation data    2 == 0.613

Validation data    7 == 0.736

Validation data    8 == 0.742

Validation data    9 == 0.745

Validation data   15 == 0.762

Validation data   23 == 0.769

Validation data   34 == 0.770

Validation data   40 == 0.767



In [34]:
depths = [4,5,6]
leaves = [15,23,34]
scores = []

for d in depths:
    for l in leaves:
            
        model = DecisionTreeClassifier(max_depth=d, max_leaf_nodes= l)
        model.fit(X_train, y_train)

        val_pred = model.predict_proba(X_val)[:,1]
        auc = roc_auc_score(y_val, val_pred)

        scores.append([d,l,auc])

output = pd.DataFrame(scores, columns = ['Depth', 'Leaf', 'Auc Score'])
output = output.sort_values('Auc Score', ascending=False)
output = output.reset_index(drop=True)
output.head()

Unnamed: 0,Depth,Leaf,Auc Score
0,6,23,0.771703
1,4,15,0.770609
2,4,23,0.770609
3,4,34,0.770609
4,5,23,0.770179


In [35]:
output_pivot = output.pivot(index = 'Leaf', columns=['Depth'], values = ['Auc Score'])
output_pivot

Unnamed: 0_level_0,Auc Score,Auc Score,Auc Score
Depth,4,5,6
Leaf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
15,0.770609,0.767859,0.761647
23,0.770609,0.770179,0.771703
34,0.770609,0.769395,0.769932


In [36]:
model = DecisionTreeClassifier(max_depth=6, max_leaf_nodes=23)
model.fit(X_train, y_train)

prediction = model.predict_proba(X_val)[:,1]
auc = roc_auc_score(y_val, prediction)
print(auc)

0.7717029876759911
