In [49]:
# Необходимый импорт
import pandas as pd
import numpy as np

import seaborn as sns
sns.set_style('whitegrid')
sns.set_palette("Set2")
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text

RUNDOM_SEED = 42

In [50]:
df = pd.read_csv('data\CreditScoring.csv')
df.columns = df.columns.str.lower()
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4455 entries, 0 to 4454
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   status     4455 non-null   int64
 1   seniority  4455 non-null   int64
 2   home       4455 non-null   int64
 3   time       4455 non-null   int64
 4   age        4455 non-null   int64
 5   marital    4455 non-null   int64
 6   records    4455 non-null   int64
 7   job        4455 non-null   int64
 8   expenses   4455 non-null   int64
 9   income     4455 non-null   int64
 10  assets     4455 non-null   int64
 11  debt       4455 non-null   int64
 12  amount     4455 non-null   int64
 13  price      4455 non-null   int64
dtypes: int64(14)
memory usage: 487.4 KB


In [52]:
# Восстановим данные при помощи map

status_values = {
    1: 'ok',
    2: 'default',
    3: 'unk'
}
df['status'] = df['status'].map(status_values)

home_values = {
    1: 'rent',
    2: 'owner',
    3: 'private',
    4: 'ignore',
    5: 'parents',
    6: 'other',
    0: 'unk'
}
df['home'] = df['home'].map(home_values)

marital_values = {
    1: 'single',
    2: 'marrier',
    3: 'widow',
    4: 'separated',
    5: 'divorsed',
    0: 'unk'
}
df['marital'] = df['marital'].map(marital_values)

records_values = {
    1: 'no',
    2: 'yes',
    3: 'unk'
}
df['records'] = df['records'].map(records_values)

job_values = {
    1: 'fixed',
    2: 'parttime',
    3: 'freelance',
    4: 'others',
    0: 'unk'
}
df['job'] = df['job'].map(job_values)

In [53]:
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,rent,60,30,marrier,no,freelance,73,129,0,0,800,846
1,ok,17,rent,60,58,widow,no,fixed,48,131,0,0,1000,1658
2,default,10,owner,36,46,marrier,yes,freelance,90,200,3000,0,2000,2985
3,ok,0,rent,60,24,single,no,fixed,63,182,2500,0,900,1325
4,ok,0,rent,36,26,single,no,fixed,46,107,0,0,310,910


In [54]:
df.describe().round()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,763317.0,1060341.0,404382.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,8703625.0,10217569.0,6344253.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3500.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,166.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,99999999.0,99999999.0,99999999.0,5000.0,11140.0


In [55]:
# заменим выбросы в столбцах income assets debt на Nan

for c in ['income', 'assets', 'debt']:
    df[c] = df[c].replace(to_replace=99999999.0, value=np.nan)

In [56]:
df.describe().round()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4421.0,4408.0,4437.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,131.0,5403.0,343.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,86.0,11573.0,1246.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3000.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,165.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,959.0,300000.0,30000.0,5000.0,11140.0


In [57]:
# рассмотрим целевую переменную
df['status'].value_counts()

ok         3200
default    1254
Name: status, dtype: int64

## Подготовка данных

In [58]:
# Разделим данные на тренировочыне, валидационные и тестовые
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=RUNDOM_SEED)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=RUNDOM_SEED)

# Выделим целевую переменную
y_train = (df_train['status'] == 'default').values
y_val = (df_val['status'] == 'default').values

# Удалим ненужные столбцы
del df_train['status']
del df_val['status']

# заполним недостающие значения нулями
df_train = df_train.fillna(0)
df_val = df_val.fillna(0)

In [59]:
# Закодируем переменные при помощи DictVectorizer
dict_train = df_train.to_dict(orient='records')
dict_val = df_val.to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(dict_train)
X_val = dv.transform(dict_val)

## Деревья решений

In [60]:
# обучаем модель
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

In [61]:
y_pred = dt.predict_proba(X_train)[:, 1]
roc_auc_score(y_train,y_pred)

0.9999996450431698

In [62]:
y_pred = dt.predict_proba(X_val)[:, 1]
roc_auc_score(y_val,y_pred)

0.658573605759104

Модель переобучена

In [63]:
export_text

<function sklearn.tree._export.export_text(decision_tree, *, feature_names=None, max_depth=10, spacing=3, decimals=2, show_weights=False)>

In [64]:
# испарвим глубину
dt = DecisionTreeClassifier(max_depth=3)
dt.fit(X_train, y_train)

y_pred = dt.predict_proba(X_train)[:, 1]
print(f'roc auc score in train: {roc_auc_score(y_train,y_pred)}\n')

tree_text = export_text(dt, feature_names=dv.feature_names_)
print(tree_text)

roc auc score in train: 0.7783638903836515

|--- records=yes <= 0.50
|   |--- job=parttime <= 0.50
|   |   |--- income <= 83.50
|   |   |   |--- class: False
|   |   |--- income >  83.50
|   |   |   |--- class: False
|   |--- job=parttime >  0.50
|   |   |--- assets <= 8500.00
|   |   |   |--- class: True
|   |   |--- assets >  8500.00
|   |   |   |--- class: False
|--- records=yes >  0.50
|   |--- seniority <= 6.50
|   |   |--- seniority <= 1.50
|   |   |   |--- class: True
|   |   |--- seniority >  1.50
|   |   |   |--- class: True
|   |--- seniority >  6.50
|   |   |--- income <= 106.00
|   |   |   |--- class: True
|   |   |--- income >  106.00
|   |   |   |--- class: False



In [65]:
y_pred = dt.predict_proba(X_val)[:, 1]
print(f'roc auc score in val: {roc_auc_score(y_val,y_pred)}')

roc auc score in val: 0.7377396012737434


## Подбор гиперпараметров дерева решений

In [69]:
for depth in [1, 2, 3, 4, 5, 6, 10, 15, 20, None]:
    dt = DecisionTreeClassifier(max_depth=depth)

    dt.fit(X_train, y_train)
    y_pred = dt.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_pred)
    print(f'{depth} - {auc}')

1 - 0.5989505247376311
2 - 0.6701804056649362
3 - 0.7377396012737434
4 - 0.7359089050516064
5 - 0.754339772262629
6 - 0.7557677772683906
10 - 0.6578642496933351
15 - 0.6632355309948332
20 - 0.6589979803486687
None - 0.6668938258143655


In [None]:
for m in [5, 6]:
    print(f'Depth: {m}')

    for s in [1, 5, 10, 15, 20, 50, 100, 200]:
        dt = DecisionTreeClassifier(max_depth=m, min_samples_leaf=s)

        dt.fit(X_train, y_train)
        y_pred = dt.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_pred)
        print(f'{s} - {auc}')
    print('\n')

Depth: 5
1 - 0.7506257201977524
5 - 0.755067714076846
10 - 0.760030108912486
15 - 0.7535963423247054
20 - 0.759336240970424
50 - 0.7620683459922932
100 - 0.7633786412578836
200 - 0.7625763564498742


Depth: 6
1 - 0.7535622684525507
5 - 0.7591534811106843
10 - 0.7620931269902239
15 - 0.7604204096298959
20 - 0.7624462562107376
50 - 0.7705434472846223
100 - 0.7678516113843905
200 - 0.7628861189240091




In [76]:
dt = DecisionTreeClassifier(max_depth=6, min_samples_leaf=50)
dt.fit(X_train, y_train)

y_pred = dt.predict_proba(X_val)[:, 1]
print(f'roc auc score in val: {roc_auc_score(y_val,y_pred)}')

roc auc score in val: 0.7705434472846223


## Случайный лес