In [275]:
import pandas as pd

In [276]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import seaborn as sns
from scipy import stats
from scipy.stats import norm
from scipy.stats import boxcox

In [277]:
data = pd.read_csv('cs-training.csv', encoding='cp1251', sep=',')

In [278]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 12 columns):
Unnamed: 0                              150000 non-null int64
SeriousDlqin2yrs                        150000 non-null int64
RevolvingUtilizationOfUnsecuredLines    150000 non-null float64
age                                     150000 non-null int64
NumberOfTime30-59DaysPastDueNotWorse    150000 non-null int64
DebtRatio                               150000 non-null float64
MonthlyIncome                           120269 non-null float64
NumberOfOpenCreditLinesAndLoans         150000 non-null int64
NumberOfTimes90DaysLate                 150000 non-null int64
NumberRealEstateLoansOrLines            150000 non-null int64
NumberOfTime60-89DaysPastDueNotWorse    150000 non-null int64
NumberOfDependents                      146076 non-null float64
dtypes: float64(4), int64(8)
memory usage: 13.7 MB


In [279]:
data.head()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766,45,2,0.803,9120.0,13,0,6,0,2.0
1,2,0,0.957,40,0,0.122,2600.0,4,0,0,0,1.0
2,3,0,0.658,38,1,0.085,3042.0,2,1,0,0,0.0
3,4,0,0.234,30,0,0.036,3300.0,5,0,0,0,0.0
4,5,0,0.907,49,1,0.025,63588.0,7,0,1,0,0.0


In [280]:
data['MonthlyIncome'].isnull().sum()

29731

In [281]:
data['SeriousDlqin2yrs'] = data['SeriousDlqin2yrs'].astype('object')

In [282]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [283]:
data.describe()

Unnamed: 0.1,Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,150000.0,150000.0,150000.0,150000.0,150000.0,120269.0,150000.0,150000.0,150000.0,150000.0,146076.0
mean,75000.5,6.048,52.295,0.421,353.005,6670.221,8.453,0.266,1.018,0.24,0.757
std,43301.415,249.755,14.772,4.193,2037.819,14384.674,5.146,4.169,1.13,4.155,1.115
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,37500.75,0.03,41.0,0.0,0.175,3400.0,5.0,0.0,0.0,0.0,0.0
50%,75000.5,0.154,52.0,0.0,0.367,5400.0,8.0,0.0,1.0,0.0,0.0
75%,112500.25,0.559,63.0,0.0,0.868,8249.0,11.0,0.0,2.0,0.0,1.0
max,150000.0,50708.0,109.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0


In [284]:
data['DebtRatio'] = data['DebtRatio'].map(lambda x: float('0.0' + ''.join(str(x).split('.'))) if x > 1 and x < 100000 else x)

In [285]:
data['DebtRatio'] = data['DebtRatio'].map(lambda x: float('0.0' + ''.join(str(x).split('.'))) if x >= 100000 else x)

In [286]:
print(data['MonthlyIncome'].nsmallest(10))
print(data['MonthlyIncome'].nlargest(10))

14    0.000
50    0.000
73    0.000
90    0.000
298   0.000
326   0.000
508   0.000
537   0.000
605   0.000
629   0.000
Name: MonthlyIncome, dtype: float64
73763    3008750.000
137140   1794060.000
111365   1560100.000
50640    1072500.000
122543    835040.000
123291    730483.000
93564     702500.000
96549     699530.000
119136    649587.000
37078     629000.000
Name: MonthlyIncome, dtype: float64


In [287]:
data['MonthlyIncome'] = np.where((data['MonthlyIncome'] < 1000) | (data['MonthlyIncome'] > 1000000), np.NaN, data['MonthlyIncome']) 

In [288]:
data.drop('Unnamed: 0', axis=1, inplace=True)

In [289]:
data['age'] = np.where((data['age'] < 21) | (data['age'] > 80), np.NaN, data['age'])

In [290]:
a = 0.01
data['Ratio'] = (data['NumberOfTimes90DaysLate'] + a) / (data['NumberOfTimes90DaysLate'] + 
                                                         data['NumberOfTime30-59DaysPastDueNotWorse'] + 
                                                        data['NumberOfTime60-89DaysPastDueNotWorse']) + a
data['Ratio'].replace([np.inf, -np.inf], 0, inplace=True)

In [291]:
data['NumberOfTimes90DaysLate_is_0'] = np.where(data['NumberOfTimes90DaysLate'] == 0, 1, 0).astype('object')
data['RevolvingUtilizationOfUnsecuredLines_is_0'] = np.where(data['RevolvingUtilizationOfUnsecuredLines'] == 0, 1, 0).astype('object')

In [292]:
for i in ['NumberOfTime30-59DaysPastDueNotWorse', 
          'NumberOfTime60-89DaysPastDueNotWorse',
          'NumberOfTimes90DaysLate',
          'NumberOfDependents']:
    print(data[i].value_counts(dropna=False))

0     126018
1      16033
2       4598
3       1754
4        747
5        342
98       264
6        140
7         54
8         25
9         12
96         5
10         4
12         2
13         1
11         1
Name: NumberOfTime30-59DaysPastDueNotWorse, dtype: int64
0     142396
1       5731
2       1118
3        318
98       264
4        105
5         34
6         16
7          9
96         5
8          2
11         1
9          1
Name: NumberOfTime60-89DaysPastDueNotWorse, dtype: int64
0     141662
1       5243
2       1555
3        667
4        291
98       264
5        131
6         80
7         38
8         21
9         19
10         8
11         5
96         5
13         4
12         2
14         2
15         2
17         1
Name: NumberOfTimes90DaysLate, dtype: int64
0.000     86902
1.000     26316
2.000     19522
3.000      9483
nan        3924
4.000      2862
5.000       746
6.000       158
7.000        51
8.000        24
9.000         5
10.000        5
13.000        1
20.000    

In [293]:
data.at[data['NumberOfTime30-59DaysPastDueNotWorse'].isin([96,98]), 'NumberOfTime30-59DaysPastDueNotWorse'] = -1
data.loc[data['NumberOfTime30-59DaysPastDueNotWorse'] > 3, 'NumberOfTime30-59DaysPastDueNotWorse'] = 'more_3'
data.at[data['NumberOfTime60-89DaysPastDueNotWorse'].isin([96,98]), 'NumberOfTime60-89DaysPastDueNotWorse'] = -1
data.loc[data['NumberOfTime60-89DaysPastDueNotWorse'] > 3, 'NumberOfTime60-89DaysPastDueNotWorse'] = 'more_3'
data.at[data['NumberOfTimes90DaysLate'].isin([96,98]), 'NumberOfTimes90DaysLate'] = -1
data.loc[data['NumberOfTimes90DaysLate'] > 3, 'NumberOfTimes90DaysLate'] = 'more_3'

In [294]:
for i in ['NumberOfTime30-59DaysPastDueNotWorse', 
          'NumberOfTime60-89DaysPastDueNotWorse',
          'NumberOfTimes90DaysLate',
          'NumberOfDependents']:
    print(data[i].value_counts(dropna=False))

0         126018
1          16033
2           4598
3           1754
more_3      1328
-1           269
Name: NumberOfTime30-59DaysPastDueNotWorse, dtype: int64
0         142396
1           5731
2           1118
3            318
-1           269
more_3       168
Name: NumberOfTime60-89DaysPastDueNotWorse, dtype: int64
0         141662
1           5243
2           1555
3            667
more_3       604
-1           269
Name: NumberOfTimes90DaysLate, dtype: int64
0.000     86902
1.000     26316
2.000     19522
3.000      9483
nan        3924
4.000      2862
5.000       746
6.000       158
7.000        51
8.000        24
9.000         5
10.000        5
13.000        1
20.000        1
Name: NumberOfDependents, dtype: int64


In [295]:
train = data.sample(frac=0.7, random_state=200)
test = data.drop(train.index)

In [296]:
train['age'].fillna(train['age'].median(), inplace=True)
test['age'].fillna(test['age'].median(), inplace=True)

In [297]:
train['age'].isnull().sum()

0

In [298]:
train['MonthlyIncome'].fillna(train['MonthlyIncome'].median(), inplace=True)
test['MonthlyIncome'].fillna(test['MonthlyIncome'].median(), inplace=True)

In [299]:
train.at[train['NumberOfDependents'].isnull(), 'NumberOfDependents'] = 0
train.loc[train['NumberOfDependents'] > 4, 'NumberOfDependents'] = 'more_4'
test.at[test['NumberOfDependents'].isnull(), 'NumberOfDependents'] = 0
test.loc[test['NumberOfDependents'] > 4, 'NumberOfDependents'] = 'more_4'

In [300]:
train['NumberOfDependents'].value_counts(dropna=False)

0.0       63701
1.0       18344
2.0       13663
3.0        6582
4.0        2021
more_4      689
Name: NumberOfDependents, dtype: int64

In [301]:
train['SeriousDlqin2yrs'].value_counts(dropna=False)

0    97985
1     7015
Name: SeriousDlqin2yrs, dtype: int64

In [302]:
train['RevolvingUtilizationOfUnsecuredLines'] = np.where(train['RevolvingUtilizationOfUnsecuredLines'] > 2, np.NaN,
                                                        train['RevolvingUtilizationOfUnsecuredLines'])
test['RevolvingUtilizationOfUnsecuredLines'] = np.where(test['RevolvingUtilizationOfUnsecuredLines'] > 2, np.NaN,
                                                        test['RevolvingUtilizationOfUnsecuredLines'])
train['RevolvingUtilizationOfUnsecuredLines'].fillna(train['RevolvingUtilizationOfUnsecuredLines'].median(), inplace=True)
test['RevolvingUtilizationOfUnsecuredLines'].fillna(test['RevolvingUtilizationOfUnsecuredLines'].median(), inplace=True)

In [303]:
for i in ['DebtRatio', 
          'MonthlyIncome', 
          'NumberOfOpenCreditLinesAndLoans', 
          'NumberRealEstateLoansOrLines']:
    train[i].replace({0:0.01}, inplace=True)
    test[i].replace({0:0.01}, inplace=True)

In [304]:
for i in ['DebtRatio', 
          'MonthlyIncome', 
          'NumberOfOpenCreditLinesAndLoans', 
          'NumberRealEstateLoansOrLines']:
    train[i], fitted_lambda = boxcox(train[i])
    test[i] =  boxcox(test[i], fitted_lambda)

In [305]:
# пишем функцию, вычисляющую IV по всем 
# количественным предикторам
def numeric_IV(df): 
    # создаем список, в который будем записывать IV
    iv_list = []
    # создаем копию датафрейма
    df = df.copy()
    # записываем константу, которую будем добавлять,
    # чтобы избежать деления на 0
    a= 0.0001
    # задаем зависимую переменную
    target = df['SeriousDlqin2yrs'].astype('str')
    # отбираем столбцы, у которых больше 10 уникальных значений
    df = df.loc[:, df.apply(pd.Series.nunique) > 5]
    # из этих столбцов отбираем только количественные
    numerical_columns = df.select_dtypes(include=['number']).columns
    # запускаем цикл, который вычисляет IV по каждой 
    # выбранной переменной
    for var_name in numerical_columns:
        # разбиваем переменную на 10 квантилей
        df[var_name] = pd.qcut(df[var_name].values, 10, duplicates='drop').codes
        # строим таблицу сопряженности между категоризированной 
        # переменной и зависимой переменной
        biv = pd.crosstab(df[var_name], target)        
        # вычисляем IV на основе таблицы сопряженности
        IV = sum(((1.0 * biv['0'] / sum(biv['0']) + a) - (1.0 * biv['1'] / sum(biv['1']) + a)) *
                 np.log((1.0 * biv['0'] / sum(biv['0']) + a) / (1.0 * 
                                                                biv['1'] / sum(biv['1']) + a)))
        # добавляем вычисленное IV в список, где хранятся IV
        iv_list.append(IV)
    # создаем список с названиями столбцов
    col_list = list(numerical_columns)
    # создаем датафрейм с двумя столбцами, в одном - названия переменных,
    # в другом - IV этих переменных
    result = pd.DataFrame({'Название переменной' : col_list,'IV' : iv_list})
    # добавляем дополнительный столбец "Полезность", задаем строковые значения,
    # которые будут выводится в зависимости от величины IV
    result['Полезность'] = ['Подозрительно высокая' if x > 0.5 else 'Сильная' 
                            if x <= 0.5 and x > 0.3 else 'Средняя'
                            if x <= 0.3 and x > 0.1 else 'Слабая' 
                            if x <= 0.1 and x > 0.02 else 'Бесполезная' 
                            for x in result['IV']]  # по Наиму Сиддики
    # возвращаем датафрейм, отсортированный по убыванию IV
    return(result.sort_values(by='IV', ascending=False))

In [306]:
numeric_IV(train)

Unnamed: 0,Название переменной,IV,Полезность
6,Ratio,1.259,Подозрительно высокая
0,RevolvingUtilizationOfUnsecuredLines,1.099,Подозрительно высокая
1,age,0.236,Средняя
3,MonthlyIncome,0.084,Слабая
4,NumberOfOpenCreditLinesAndLoans,0.067,Слабая
2,DebtRatio,0.043,Слабая
5,NumberRealEstateLoansOrLines,0.012,Бесполезная


In [307]:
def user_bin(df, number):    
    # увеличиваем максимальную ширину столбца
    pd.set_option('max_colwidth', 800) 
    # задаем список, где будут хранится IV
    iv_list = []
    # задаем список, где будет хранится информация о количестве бинов
    bins_list = [] 
    # задаем список, где будет хранится информация о бинах
    groups_list = []  
    # записываем константу, которую будем добавлять,
    # чтобы избежать деления на 0
    a = 0.0001
    # задаем зависимую переменную
    target = df['SeriousDlqin2yrs'].astype('str')
    # отбираем столбцы, у которых больше 10 уникальных значений
    df = df.loc[:, df.apply(pd.Series.nunique) > 5]
    # из этих столбцов отбираем только количественные
    numerical_columns = df.select_dtypes(include=['number']).columns    
    # запускаем цикл, который вычисляет IV по каждой 
    # выбранной переменной
    for var_name in numerical_columns:  
        # фиксируем количество бинов
        num = number
        # создаем точки разбиения
        bins = np.linspace(df[var_name].min(), df[var_name].max(), num) 
        # округляем значения точек разбиения
        rounded_bins = np.round(bins, 2)
        # создаем бины
        groups = np.digitize(df[var_name], rounded_bins)
        # строим таблицу сопряженности между категоризированной 
        # переменной и зависимой переменной
        biv = pd.crosstab(groups, target)
        # вычисляем IV на основе таблицы сопряженности        
        IV = sum(((1.0 * biv['0'] / sum(biv['0']) + a) - (1.0 * biv['1'] / sum(biv['1']) + a)) * 
                 np.log((1.0 * biv['0'] / sum(biv['0']) + a) / (1.0 * 
                                                                biv['1'] / sum(biv['1']) + a)))
        # добавляем вычисленное IV в список, где хранятся IV
        iv_list.append(IV) 
        # добавляем информацию о количестве бинов в список, где хранится
        # информация о количестве бинов
        bins_list.append(num)
        # добавляем бины в список, где хранится 
        # информация о бинах
        groups_list.append(rounded_bins)
    # создаем список с названиями столбцов    
    col_list = list(numerical_columns) 
    # создаем датафрейм с четырьмя столбцами, в первом - названия переменных,
    # во втором - бины, в третьем - IV, в четвертом - количество бинов
    result = pd.DataFrame({'Переменная' : col_list, 
                           'Бины': groups_list, 
                           'IV' : iv_list,
                           'Количество_бинов' : bins_list})    
    # возвращаем датафрейм, отсортированный по убыванию IV
    return(result.sort_values(by='IV', ascending=False))

In [308]:
user_bin(train,8)

Unnamed: 0,Переменная,Бины,IV,Количество_бинов
0,RevolvingUtilizationOfUnsecuredLines,"[0.0, 0.29, 0.57, 0.86, 1.14, 1.43, 1.71, 2.0]",1.062,8
6,Ratio,"[0.0, 0.15, 0.29, 0.44, 0.58, 0.73, 0.87, 1.02]",0.845,8
1,age,"[21.0, 29.43, 37.86, 46.29, 54.71, 63.14, 71.57, 80.0]",0.23,8
4,NumberOfOpenCreditLinesAndLoans,"[-1.78, 0.41, 2.59, 4.78, 6.97, 9.15, 11.34, 13.52]",0.092,8
3,MonthlyIncome,"[5.86, 6.43, 7.0, 7.57, 8.14, 8.71, 9.28, 9.85]",0.074,8
5,NumberRealEstateLoansOrLines,"[-2.94, -1.63, -0.31, 1.01, 2.32, 3.64, 4.95, 6.27]",0.062,8
2,DebtRatio,"[-3.45, -2.96, -2.46, -1.97, -1.48, -0.99, -0.49, 0.0]",0.045,8


In [309]:
bins = [-np.inf,  6.43, 7.0, 7.57, 8.14, 8.71, 9.28, np.inf]
train['incomecat'] = pd.cut(train['MonthlyIncome'], bins).astype('object')
test['incomecat'] = pd.cut(test['MonthlyIncome'], bins).astype('object')

In [310]:
bins = [-np.inf, -2.96, -2.46, -1.97, -1.48, -0.99, -0.49, np.inf]
train['debtcat'] = pd.cut(train['DebtRatio'], bins).astype('object')
test['debtcat'] = pd.cut(test['DebtRatio'], bins).astype('object')

In [311]:
bins = [-np.inf,  -1.63, -0.31, 1.01, 2.32, 3.64, 4.95, np.inf]
train['linescat'] = pd.cut(train['NumberRealEstateLoansOrLines'], bins).astype('object')
test['linescat'] = pd.cut(test['NumberRealEstateLoansOrLines'], bins).astype('object')

In [312]:
num_cols = [c for c in train.columns if train[c].dtype.name != 'object']
train_copy = train.copy()
for i in num_cols:
    train[i] = (train[i] - train[i].mean()) / train[i].std()
    test[i] = (test[i] - train_copy[i].mean()) / train_copy[i].std()

In [313]:
print(train.info())
print(test.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 105000 entries, 58185 to 126885
Data columns (total 17 columns):
SeriousDlqin2yrs                             105000 non-null object
RevolvingUtilizationOfUnsecuredLines         105000 non-null float64
age                                          105000 non-null float64
NumberOfTime30-59DaysPastDueNotWorse         105000 non-null object
DebtRatio                                    105000 non-null float64
MonthlyIncome                                105000 non-null float64
NumberOfOpenCreditLinesAndLoans              105000 non-null float64
NumberOfTimes90DaysLate                      105000 non-null object
NumberRealEstateLoansOrLines                 105000 non-null float64
NumberOfTime60-89DaysPastDueNotWorse         105000 non-null object
NumberOfDependents                           105000 non-null object
Ratio                                        105000 non-null float64
NumberOfTimes90DaysLate_is_0                 105000 non-null o

In [314]:
print('Исходные переменные', list(train.columns), '\n')
train_dumm = pd.get_dummies(train)
print('Переменные после дамми', list(train_dumm.columns), '\n')
print('Исходные переменные', list(test.columns), '\n')
test_dumm = pd.get_dummies(test)
print('Переменные после дамми', list(test_dumm.columns))

Исходные переменные ['SeriousDlqin2yrs', 'RevolvingUtilizationOfUnsecuredLines', 'age', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents', 'Ratio', 'NumberOfTimes90DaysLate_is_0', 'RevolvingUtilizationOfUnsecuredLines_is_0', 'incomecat', 'debtcat', 'linescat'] 

Переменные после дамми ['RevolvingUtilizationOfUnsecuredLines', 'age', 'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberRealEstateLoansOrLines', 'Ratio', 'SeriousDlqin2yrs_0', 'SeriousDlqin2yrs_1', 'NumberOfTime30-59DaysPastDueNotWorse_-1', 'NumberOfTime30-59DaysPastDueNotWorse_0', 'NumberOfTime30-59DaysPastDueNotWorse_1', 'NumberOfTime30-59DaysPastDueNotWorse_2', 'NumberOfTime30-59DaysPastDueNotWorse_3', 'NumberOfTime30-59DaysPastDueNotWorse_more_3', 'NumberOfTimes90DaysLate_-1', 'NumberOfTimes90DaysLate_0', 'NumberOfTimes90DaysLate

In [315]:
y_tr = train_dumm.loc[:, 'SeriousDlqin2yrs_1']
y_tst = test_dumm.loc[:, 'SeriousDlqin2yrs_1']

In [316]:
train_dumm = train_dumm.drop(['SeriousDlqin2yrs_0', 'SeriousDlqin2yrs_1'], axis=1)
test_dumm = test_dumm.drop(['SeriousDlqin2yrs_0', 'SeriousDlqin2yrs_1'], axis=1)

In [317]:
X_tr = train_dumm.loc[:, 'RevolvingUtilizationOfUnsecuredLines':'linescat_(3.64, 4.95]']
X_tst = test_dumm.loc[:, 'RevolvingUtilizationOfUnsecuredLines':'linescat_(3.64, 4.95]']

In [318]:
logreg = LogisticRegression(solver='saga', max_iter=500, random_state=42).fit(X_tr, y_tr)
print('AUC на обучающей выборке: {:.5f}'.format(roc_auc_score(y_tr, logreg.predict_proba(X_tr)[:,1])))
print('AUC на тестовой выборке: {:.5f}'.format(roc_auc_score(y_tst, logreg.predict_proba(X_tst)[:,1])))

AUC на обучающей выборке: 0.86190
AUC на тестовой выборке: 0.86204


In [319]:
from sklearn.ensemble import RandomForestClassifier

In [320]:
#forest = RandomForestClassifier(n_estimators=800, max_depth=17, random_state=152, n_jobs=-1)

In [321]:
"""forest.fit(X_tr, y_tr)
print('AUC на обучающей выборке по обычному методу: {:.5f}'.
      format(roc_auc_score(y_tr, forest.predict_proba(X_tr)[:, 1])))
print('AUC на контрольной выборке: {:.5f}'.
      format(roc_auc_score(y_tst, forest.predict_proba(X_tst)[:,1])))"""

"forest.fit(X_tr, y_tr)\nprint('AUC на обучающей выборке по обычному методу: {:.5f}'.\n      format(roc_auc_score(y_tr, forest.predict_proba(X_tr)[:, 1])))\nprint('AUC на контрольной выборке: {:.5f}'.\n      format(roc_auc_score(y_tst, forest.predict_proba(X_tst)[:,1])))"

In [322]:
#Случайный лес сработал на обучающей лучше чем логистическая регрессия, но на тестовой чуть хуже

In [323]:
#import h2o # установить Java8

In [324]:
#h2o.init(nthreads=-1, max_mem_size=8)

In [325]:
from sklearn.neural_network import MLPClassifier

In [326]:
#clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(20, 8), random_state=42)

In [327]:
#clf.fit(X_tr, y_tr)  

In [328]:
#clf.predict(X_tst)

In [329]:
#print('AUC на выборке: {:.5f}'.format(roc_auc_score(y_tst, clf.predict_proba(X_tst)[:,1])))

In [330]:
#print('AUC на выборке: {:.5f}'.format(roc_auc_score(y_tr, clf.predict_proba(X_tr)[:,1])))

##### 86216 at 5,2 
##### 0.86362 at 10,4

In [331]:
#import h2o - лучше в colab

In [332]:
data_test = pd.read_csv('cs-test.csv', encoding='cp1251', sep=',')

In [335]:
#for i in numerical_columns:
#    data_test[i] = (train[i] - train_copy.mean()) / train_copy.std()

In [336]:
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101503 entries, 0 to 101502
Data columns (total 12 columns):
Unnamed: 0                              101503 non-null int64
SeriousDlqin2yrs                        0 non-null float64
RevolvingUtilizationOfUnsecuredLines    101503 non-null float64
age                                     101503 non-null int64
NumberOfTime30-59DaysPastDueNotWorse    101503 non-null int64
DebtRatio                               101503 non-null float64
MonthlyIncome                           81400 non-null float64
NumberOfOpenCreditLinesAndLoans         101503 non-null int64
NumberOfTimes90DaysLate                 101503 non-null int64
NumberRealEstateLoansOrLines            101503 non-null int64
NumberOfTime60-89DaysPastDueNotWorse    101503 non-null int64
NumberOfDependents                      98877 non-null float64
dtypes: float64(5), int64(7)
memory usage: 9.3 MB
