# Imports

In [94]:
import pandas as pd
import numpy as np
from numpy import mean
from numpy import std
import math
import time

import warnings
warnings.filterwarnings('ignore')

from sklearnex import patch_sklearn
patch_sklearn()

# from lazypredict.Supervised import LazyRegressor

# Visualization
import matplotlib.pyplot as plt
from matplotlib import rcParams
%matplotlib inline
import seaborn as sns
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

# Feature and Model Selection
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler

# Models
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier
from xgboost.sklearn import XGBClassifier

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


# EDA

In [95]:
# !wget https://raw.githubusercontent.com/BKHV/risk_models/master/data/PD-data-train.csv
# !wget https://raw.githubusercontent.com/BKHV/risk_models/master/data/PD-data-test.csv
# !wget https://raw.githubusercontent.com/BKHV/risk_models/master/data/PD-data-desc.csv

In [96]:
train_df = pd.read_csv('PD-data-train.csv', sep=';')
test_df = pd.read_csv('PD-data-test.csv', sep=';')
desc_df = pd.read_csv('PD-data-desc.csv', sep=';')

## Exploring Train Data

In [97]:
print('Number of rows in train_df: ', train_df.shape[0])
print('Number of columns in train_df: ', train_df.shape[1])
print('Number of values in train_df: ', train_df.count().sum())
print('Number of NaNs in train_df: ', sum(train_df.isna().sum()))
print('Number of NaNs in train_df in all columns:\n')
print(train_df.isna().sum())

Number of rows in train_df:  32395
Number of columns in train_df:  37
Number of values in train_df:  835135
Number of NaNs in train_df:  363480
Number of NaNs in train_df in all columns:

record_id                           0
ar_revenue                      15145
ar_total_expenses               15145
ar_sale_cost                    15145
ar_selling_expenses             15145
ar_management_expenses          15145
ar_sale_profit                  15145
ar_balance_of_rvns_and_expns    15145
ar_profit_before_tax            15145
ar_taxes                        15145
ar_other_profit_and_losses      15145
ar_net_profit                   15145
ab_immobilized_assets           15145
ab_mobile_current_assets        15145
ab_inventory                    15145
ab_accounts_receivable          15145
ab_other_current_assets         15145
ab_cash_and_securities          15145
ab_losses                       15145
ab_own_capital                  15145
ab_borrowed_capital             15145
ab_long_term_l

In [98]:
train_df.describe()

Unnamed: 0,record_id,ar_revenue,ar_total_expenses,ar_sale_cost,ar_selling_expenses,ar_management_expenses,ar_sale_profit,ar_balance_of_rvns_and_expns,ar_profit_before_tax,ar_taxes,...,ogrn_age,adr_actual_age,head_actual_age,cap_actual_age,ul_capital_sum,ul_founders_cnt,ul_branch_cnt,ul_strategic_flg,ul_systematizing_flg,default_12m
count,32395.0,17250.0,17250.0,17250.0,17250.0,17250.0,17250.0,17250.0,17250.0,17250.0,...,32395.0,32395.0,32395.0,32395.0,32395.0,32395.0,32395.0,32395.0,32395.0,32395.0
mean,136955.533601,81537760.0,76784740.0,73203280.0,2499198.0,1082263.0,4753016.0,-642477.3,4110538.0,418807.1,...,78.644822,15.010341,15.010341,15.010341,1366509.0,11.946535,0.02167,9.3e-05,0.0,0.064547
std,79370.573366,127440200.0,122722700.0,115665100.0,12317980.0,7848929.0,19410270.0,6014715.0,19380210.0,1565298.0,...,46.522534,9.546511,9.546511,9.546511,15024380.0,86.341847,0.342715,0.009623,0.0,0.245729
min,7.0,0.0,-701984000.0,-507278000.0,-35429000.0,-192167000.0,-92100000.0,-75818000.0,-59466000.0,-16810000.0,...,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,67912.0,10199500.0,9130250.0,8844000.0,0.0,0.0,211250.0,-940000.0,142000.0,0.0,...,35.0,6.0,6.0,6.0,10000.0,1.0,0.0,0.0,0.0,0.0
50%,137527.0,32335000.0,29703500.0,28408000.0,0.0,0.0,1231500.0,-126000.0,850000.0,41000.0,...,77.0,14.0,14.0,14.0,10000.0,1.0,0.0,0.0,0.0,0.0
75%,205633.5,88876750.0,83095000.0,79914250.0,0.0,0.0,4574750.0,0.0,3543750.0,291000.0,...,133.0,24.0,24.0,24.0,30000.0,2.0,0.0,0.0,0.0,0.0
max,274623.0,794684000.0,794936000.0,762493000.0,274656000.0,302355000.0,1434511000.0,158425000.0,1481526000.0,54202000.0,...,149.0,33.0,33.0,33.0,1584979000.0,998.0,20.0,1.0,0.0,1.0


In [99]:
train_df.dtypes

record_id                         int64
ar_revenue                      float64
ar_total_expenses               float64
ar_sale_cost                    float64
ar_selling_expenses             float64
ar_management_expenses          float64
ar_sale_profit                  float64
ar_balance_of_rvns_and_expns    float64
ar_profit_before_tax            float64
ar_taxes                        float64
ar_other_profit_and_losses      float64
ar_net_profit                   float64
ab_immobilized_assets           float64
ab_mobile_current_assets        float64
ab_inventory                    float64
ab_accounts_receivable          float64
ab_other_current_assets         float64
ab_cash_and_securities          float64
ab_losses                       float64
ab_own_capital                  float64
ab_borrowed_capital             float64
ab_long_term_liabilities        float64
ab_short_term_borrowing         float64
ab_accounts_payable             float64
ab_other_borrowings             float64


## Exploring Test Data

In [100]:
test_df.head()

Unnamed: 0,record_id,ar_revenue,ar_total_expenses,ar_sale_cost,ar_selling_expenses,ar_management_expenses,ar_sale_profit,ar_balance_of_rvns_and_expns,ar_profit_before_tax,ar_taxes,...,ogrn_age,adr_actual_age,head_actual_age,cap_actual_age,ul_staff_range,ul_capital_sum,ul_founders_cnt,ul_branch_cnt,ul_strategic_flg,ul_systematizing_flg
0,196,39103000.0,38913000.0,38913000.0,0.0,0.0,190000.0,141000.0,331000.0,284000.0,...,135,3,3,3,[1-100],100000.0,1,0,0,0
1,1196,,,,,,,,,,...,138,5,5,5,[1-100],100000.0,1,0,0,0
2,2813,54174000.0,50929000.0,50929000.0,0.0,0.0,3245000.0,-1278000.0,1967000.0,500000.0,...,138,20,20,20,[1-100],250000.0,2,0,0,0
3,4385,1904000.0,1679000.0,1679000.0,0.0,0.0,225000.0,-88000.0,137000.0,0.0,...,139,9,9,9,[1-100],10000.0,2,0,0,0
4,6479,225584000.0,210685000.0,210685000.0,0.0,0.0,14899000.0,-12715000.0,2184000.0,445000.0,...,139,3,3,3,(100-500],1700598.0,84,0,0,0


In [101]:
print('Number of rows in test_df: ', test_df.shape[0])
print('Number of columns in test_df: ', test_df.shape[1])
print('Number of values in test_df: ', test_df.count().sum())
print('Number of NaNs in test_df: ', sum(test_df.isna().sum()))
print('Number of NaNs in test_df in all columns:\n')
print(test_df.isna().sum())

Number of rows in test_df:  200
Number of columns in test_df:  36
Number of values in test_df:  4368
Number of NaNs in test_df:  2832
Number of NaNs in test_df in all columns:

record_id                         0
ar_revenue                      118
ar_total_expenses               118
ar_sale_cost                    118
ar_selling_expenses             118
ar_management_expenses          118
ar_sale_profit                  118
ar_balance_of_rvns_and_expns    118
ar_profit_before_tax            118
ar_taxes                        118
ar_other_profit_and_losses      118
ar_net_profit                   118
ab_immobilized_assets           118
ab_mobile_current_assets        118
ab_inventory                    118
ab_accounts_receivable          118
ab_other_current_assets         118
ab_cash_and_securities          118
ab_losses                       118
ab_own_capital                  118
ab_borrowed_capital             118
ab_long_term_liabilities        118
ab_short_term_borrowing        

In [102]:
test_df.describe()

Unnamed: 0,record_id,ar_revenue,ar_total_expenses,ar_sale_cost,ar_selling_expenses,ar_management_expenses,ar_sale_profit,ar_balance_of_rvns_and_expns,ar_profit_before_tax,ar_taxes,...,bus_age,ogrn_age,adr_actual_age,head_actual_age,cap_actual_age,ul_capital_sum,ul_founders_cnt,ul_branch_cnt,ul_strategic_flg,ul_systematizing_flg
count,200.0,82.0,82.0,82.0,82.0,82.0,82.0,82.0,82.0,82.0,...,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0
mean,135340.925,86259710.0,82002180.0,76948630.0,2780183.0,2273366.0,4257524.0,-688085.4,3569439.0,441987.8,...,84.595,65.68,14.32,14.32,14.32,726356.0,5.07,0.0,0.0,0.0
std,80177.086856,158903000.0,153409200.0,137265100.0,11806920.0,17528200.0,7752518.0,5253569.0,8232787.0,1294710.0,...,80.585115,46.096442,9.697308,9.697308,9.697308,4043191.0,32.669582,0.0,0.0,0.0
min,196.0,0.0,0.0,0.0,0.0,0.0,-4898000.0,-23281000.0,-6390000.0,-11000.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
25%,68367.5,6779250.0,5397500.0,5397500.0,0.0,0.0,155750.0,-657000.0,126500.0,0.0,...,25.0,25.0,5.0,5.0,5.0,10000.0,1.0,0.0,0.0,0.0
50%,127585.5,18997000.0,16456000.0,15878500.0,0.0,0.0,1184000.0,-58000.0,545000.0,73500.0,...,57.0,54.5,12.5,12.5,12.5,10000.0,1.0,0.0,0.0,0.0
75%,202665.0,83882500.0,78313500.0,76511750.0,0.0,0.0,3200000.0,0.0,2630500.0,352500.0,...,106.0,101.25,24.0,24.0,24.0,20555.75,2.0,0.0,0.0,0.0
max,271584.0,770724000.0,745881000.0,669601000.0,93976000.0,156456000.0,38051000.0,28586000.0,54481000.0,7690000.0,...,460.0,145.0,33.0,33.0,33.0,40170000.0,420.0,0.0,0.0,0.0


In [103]:
test_df.dtypes

record_id                         int64
ar_revenue                      float64
ar_total_expenses               float64
ar_sale_cost                    float64
ar_selling_expenses             float64
ar_management_expenses          float64
ar_sale_profit                  float64
ar_balance_of_rvns_and_expns    float64
ar_profit_before_tax            float64
ar_taxes                        float64
ar_other_profit_and_losses      float64
ar_net_profit                   float64
ab_immobilized_assets           float64
ab_mobile_current_assets        float64
ab_inventory                    float64
ab_accounts_receivable          float64
ab_other_current_assets         float64
ab_cash_and_securities          float64
ab_losses                       float64
ab_own_capital                  float64
ab_borrowed_capital             float64
ab_long_term_liabilities        float64
ab_short_term_borrowing         float64
ab_accounts_payable             float64
ab_other_borrowings             float64


In [104]:
train_df['default_12m'].mean()

0.06454699799351751

In [105]:
train_features = ['default_12m', 'ar_revenue', 'ar_total_expenses', 'ar_sale_cost', 'ar_profit_before_tax', 'ar_net_profit', 'ab_inventory',
    'ab_accounts_receivable', 'ab_own_capital', 'ab_borrowed_capital', 'ab_accounts_payable', 'ogrn_age', 'adr_actual_age',
    'head_actual_age', 'cap_actual_age']

test_features = ['ar_revenue', 'ar_total_expenses', 'ar_sale_cost', 'ar_profit_before_tax', 'ar_net_profit', 'ab_inventory',
    'ab_accounts_receivable', 'ab_own_capital', 'ab_borrowed_capital', 'ab_accounts_payable', 'ogrn_age', 'adr_actual_age',
    'head_actual_age', 'cap_actual_age']

# test_record_id = test_df['record_id']
# train_df = train_df[train_features]
# test_df = test_df[test_features]

# train_df = train_df.drop('ul_staff_range', axis=1)
# test_df = test_df.drop('ul_staff_range', axis=1)

# Data Preprocessing and Feature Selection

In [106]:
df = pd.concat([train_df, test_df], sort=False)

In [107]:
TL = df.ab_long_term_liabilities + df.ab_other_borrowings + df.ab_short_term_borrowing
TA = df.ab_own_capital + df.ab_borrowed_capital
STD = df.ab_short_term_borrowing
STFD = df.ab_short_term_borrowing
CA = df.ab_mobile_current_assets
FCA = df.ab_mobile_current_assets - df.ab_inventory
GY = df.ar_sale_profit

df['r_1_a'] =  df.ar_revenue / (df.ab_accounts_receivable/12) 
df['r_2_a'] = df.ar_sale_cost / (df.ab_inventory/12)
df['r_3_a'] = df.ar_selling_expenses / (df.ar_total_expenses/12)
df['r_4_a'] = df.ar_revenue / TA -TL
df['r_5_a'] = df.ar_revenue / (df.ab_immobilized_assets/12)
df['r_6_a'] = df.ar_revenue / (df.ab_mobile_current_assets + df.ab_cash_and_securities)
df['r_7_a'] = df.ar_sale_profit / df.ar_revenue
df['r_8_a'] = df.ar_profit_before_tax / df.ar_revenue
df['r_9_a'] = df.ar_net_profit / df.ar_revenue

df['r_10_a'] = df.ab_short_term_borrowing / (df.ab_short_term_borrowing + df.ab_accounts_payable + df.ab_other_borrowings)
df['r_11_a'] = df.ab_accounts_payable / (df.ab_short_term_borrowing + df.ab_accounts_payable + df.ab_other_borrowings)
df['r_12_a'] = df.ab_inventory / df.ar_revenue
df['r_13_a'] = df.ab_long_term_liabilities / df.ar_revenue
df['r_15_a'] = df.ar_taxes / df.ar_revenue

df['r_16_a'] = df.ab_inventory / df.ab_borrowed_capital
df['r_17_a'] = df.ab_inventory / df.ab_mobile_current_assets
df['r_18_a'] = df.ab_inventory / df.ab_accounts_payable

df['r_19_a'] = df.ab_accounts_receivable / (df.ab_cash_and_securities + df.ab_accounts_receivable)
df['r_20_a'] = df.ab_cash_and_securities / df.ab_borrowed_capital
df['r_21_a'] = df.ab_cash_and_securities / df.ab_short_term_borrowing
df['r_22_a'] = df.ab_cash_and_securities / (df.ab_short_term_borrowing + df.ab_accounts_payable)
df['r_23_a'] = df.ab_cash_and_securities / (df.ab_short_term_borrowing + df.ab_accounts_payable + df. ab_other_borrowings)

df['r_24_a'] = df.ar_profit_before_tax / df.ar_net_profit

df['r_25_a'] = TL / TA
df['r_26_a'] = (df.ab_accounts_receivable + df.ab_cash_and_securities) / TA
df['r_27_a'] = CA / STD

In [108]:
df_short = df.drop(['ar_revenue', 'ar_total_expenses', 'ar_sale_cost', 'ar_selling_expenses', 'ar_management_expenses',
         'ar_sale_profit', 'ar_balance_of_rvns_and_expns', 'ar_profit_before_tax', 'ar_taxes', 
         'ar_other_profit_and_losses', 'ul_systematizing_flg'], axis = 1)

In [109]:
df_short['ul_staff_range'].unique()

array(['[1-100]', '(100-500]', '> 500'], dtype=object)

In [110]:
df_short.loc[df_short['ul_staff_range'] == '[1-100]', 'ul_staff_range'] = 1
df_short.loc[df_short['ul_staff_range'] == '(100-500]', 'ul_staff_range'] = 2
df_short.loc[df_short['ul_staff_range'] == '> 500', 'ul_staff_range'] = 3

In [111]:
test_df = df_short.iloc[-200:,:]
train_df = df_short.iloc[:-200, :]
test_record_id = test_df['record_id']

In [112]:
# MinMaxScaler for data normalization
# KNNImputer is a distance-based imputation method,
# so it needs normalization before imputing
train_df.dropna(axis=1, inplace=True)
test_df.dropna(axis=1, inplace=True)

# scaler = MinMaxScaler()
# default = train_df['default_12m']
# train_df = pd.DataFrame(scaler.fit_transform(train_df.drop('default_12m', axis=1)), columns = train_df.drop('default_12m', axis=1).columns)
# train_df['default_12m'] = default

# test_df = pd.DataFrame(scaler.fit_transform(test_df), columns = test_df.columns)

# Cross-Validation

In [113]:
# Get a list of models to evaluate
def get_models():
    models = dict()
    models['LogisticRegression'] = LogisticRegression()
    models['SVC'] = SVC()
    models['GaussianNB'] = GaussianNB()
    models['MultinomialNB'] = MultinomialNB()
    models['SGDClassifier'] = SGDClassifier()
    models['DecisionTreeClassifier'] = DecisionTreeClassifier()
    models['RandomForestClassifier'] = RandomForestClassifier()
    models['GradientBoostingClassifier'] = GradientBoostingClassifier()
    # models['LGBMClassifier'] = LGBMClassifier()
    models['CatBoostClassifier'] = CatBoostClassifier()
    models['XGBClassifier'] = XGBClassifier()
    # models['Stacking'] = get_stacking()
    return models

In [114]:
# Evaluate a given model using cross-validation
def roc_auc_evaluate_model(model, x, y):
    start_time = time.time()
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, x, y, scoring='roc_auc', cv=cv, n_jobs=-1, error_score='raise')
    run_time = time.time() - start_time
    return scores, run_time

In [115]:
# Get a stacking ensemble of models
def get_stacking():
    # Defining the base models
    level0 = list()
    level0.append(('LogisticRegression', LogisticRegression()))
    level0.append(('GradientBoosting', GradientBoostingClassifier()))
    level0.append(('CatBoost', CatBoostClassifier()))
    level0.append(('XGBClassifier', XGBClassifier()))
    level0.append(('GaussianNB', GaussianNB()))
    level0.append(('SGDClassifier', SGDClassifier()))
    # Defining meta learner model
    level1 = LogisticRegression()
    # Defining the stacking ensemble
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
    return model

In [116]:
# Get the models to evaluate
models = get_models()
roc_auc_results, names, runs = list(), list(), list()

# Evaluate the models and store results
for name, model in models.items():
    roc_auc_scores, run_time = roc_auc_evaluate_model(model, train_df.drop('default_12m', axis=1), train_df['default_12m'])
    roc_auc_results.append(roc_auc_scores)
    names.append(name)
    runs.append(run_time)
    print(name, "| ROC AUC:", mean(roc_auc_scores), "| Standard Deviation:", std(roc_auc_scores),"| Time Taken:", run_time)

LogisticRegression | ROC AUC: 0.6326650438267056 | Standard Deviation: 0.01808234797749254 | Time Taken: 9.96429991722107
SVC | ROC AUC: 0.5033462842391001 | Standard Deviation: 0.02733217431142682 | Time Taken: 30.161293029785156
GaussianNB | ROC AUC: 0.6060522857467511 | Standard Deviation: 0.03156594290812751 | Time Taken: 0.6699907779693604
MultinomialNB | ROC AUC: 0.5240800648428106 | Standard Deviation: 0.006885708258675605 | Time Taken: 0.48747801780700684
SGDClassifier | ROC AUC: 0.5252807136581185 | Standard Deviation: 0.017496306918552224 | Time Taken: 1.003403902053833
DecisionTreeClassifier | ROC AUC: 0.5276306253599868 | Standard Deviation: 0.012919250271418877 | Time Taken: 1.777923822402954
RandomForestClassifier | ROC AUC: 0.6120741654172482 | Standard Deviation: 0.02036794920930717 | Time Taken: 12.506797075271606
GradientBoostingClassifier | ROC AUC: 0.6728411175690563 | Standard Deviation: 0.017057336106610893 | Time Taken: 27.354887008666992
Learning rate set to 0.0

ValueError: DataFrame.dtypes for data must be int, float, bool or category.  When
categorical type is supplied, DMatrix parameter `enable_categorical` must
be set to `True`. Invalid columns:ul_staff_range

# Model Selection

In [None]:
roc_auc_mean_results, roc_auc_std_results = list(), list()
r2_mean_results, r2_std_results = list(), list()
for value in range(len(roc_auc_results)):
    roc_auc_mean_results.append(mean(roc_auc_results[value]))
    roc_auc_std_results.append(std(roc_auc_results[value]))

In [None]:
data = {'Name':names, 'ROC AUC':roc_auc_mean_results, 'Standart Deviation':roc_auc_std_results, 'Time Taken':runs}
df_models = pd.DataFrame(data)
df_models.set_index('Name', inplace=True)
df_models

Unnamed: 0_level_0,ROC AUC,Standart Deviation,Time Taken
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogisticRegression,0.655508,0.017119,10.073706
SVC,0.517618,0.045267,147.647699
GaussianNB,0.65705,0.015889,0.275462
MultinomialNB,0.596276,0.016266,0.273948
SGDClassifier,0.616795,0.03268,0.748358
DecisionTreeClassifier,0.526692,0.011479,1.424347
RandomForestClassifier,0.61284,0.02062,12.976867
GradientBoostingClassifier,0.669721,0.017826,30.268051
CatBoostClassifier,0.669929,0.016743,123.993093
XGBClassifier,0.661791,0.014156,31.663199


In [None]:
line = px.line(data_frame = df_models.sort_values('Time Taken', ascending=True),y =['Time Taken'], markers = True)
line.update_xaxes(title="Model", rangeslider_visible = False)
line.update_yaxes(title = "Time (s)")
line.update_traces(line_color="red")
line.update_layout(showlegend = True,
                   title = {
                       'text': 'Time Taken vs Model',
                       'y':0.94,
                       'x':0.5,
                       'xanchor': 'center',
                       'yanchor': 'top'})

line.show()

In [None]:
line = px.line(data_frame = df_models.sort_values('ROC AUC', ascending=False),y =['ROC AUC'], markers = True)
line.update_xaxes(title="Model", rangeslider_visible = False)
line.update_yaxes(title = "ROC AUC Score")
line.update_traces(line_color="blue")
line.update_layout(showlegend = True,
                   title = {
                       'text': 'ROC AUC vs Model',
                       'y':0.94,
                       'x':0.5,
                       'xanchor': 'center',
                       'yanchor': 'top'})

line.show()

In [None]:
line = px.line(data_frame = df_models.sort_values('Standart Deviation', ascending=True),y =['Standart Deviation'], markers = True)
line.update_xaxes(title="Model", rangeslider_visible = False)
line.update_yaxes(title = "Standart Deviation")
line.update_traces(line_color="purple")
line.update_layout(showlegend = True,
                   title = {
                       'text': 'Standart Deviation vs Model',
                       'y':0.94,
                       'x':0.5,
                       'xanchor': 'center',
                       'yanchor': 'top'})

line.show()

# Modeling

## Catboost

In [None]:
model = CatBoostClassifier()

parameters = {
    'depth': [4,5,6,7,8,9, 10],
    'learning_rate' : [0.01,0.02,0.03,0.04],
    'iterations'    : [10, 20,30,40,50,60,70,80,90, 100]
}

Grid = GridSearchCV(estimator=model, param_grid = parameters, cv = 2, n_jobs=-1)
Grid.fit(train_df.drop('default_12m', axis=1), train_df['default_12m'])

print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",Grid.best_estimator_)
print("\n The best score across ALL searched params:\n",Grid.best_score_)
print("\n The best parameters across ALL searched params:\n",Grid.best_params_)

0:	learn: 0.6855884	total: 65.5ms	remaining: 590ms
1:	learn: 0.6782046	total: 69.7ms	remaining: 279ms
2:	learn: 0.6709632	total: 73.2ms	remaining: 171ms
0:	learn: 0.6856142	total: 71.2ms	remaining: 640ms
3:	learn: 0.6638880	total: 76.2ms	remaining: 114ms
1:	learn: 0.6782260	total: 75.1ms	remaining: 301ms
4:	learn: 0.6568936	total: 81.2ms	remaining: 81.2ms
2:	learn: 0.6710089	total: 79.4ms	remaining: 185ms
5:	learn: 0.6500802	total: 85.3ms	remaining: 56.9ms
3:	learn: 0.6639238	total: 84ms	remaining: 126ms
6:	learn: 0.6433678	total: 89.1ms	remaining: 38.2ms
4:	learn: 0.6569655	total: 87.4ms	remaining: 87.4ms
5:	learn: 0.6501099	total: 89.9ms	remaining: 60ms
7:	learn: 0.6368154	total: 92.8ms	remaining: 23.2ms
6:	learn: 0.6434308	total: 98.2ms	remaining: 42.1ms
8:	learn: 0.6303995	total: 97.1ms	remaining: 10.8ms
7:	learn: 0.6368799	total: 102ms	remaining: 25.6ms
9:	learn: 0.6240904	total: 101ms	remaining: 0us
8:	learn: 0.6304442	total: 107ms	remaining: 11.9ms
9:	learn: 0.6241010	total: 111

In [None]:
import numpy as np

model = CatBoostClassifier(depth=4, iterations=10, learning_rate=0.01)

model.fit(train_df.drop('default_12m', axis=1), train_df['default_12m'])

predictions = np.around(model.predict(test_df))
predictions

0:	learn: 0.6855727	total: 2.7ms	remaining: 24.3ms
1:	learn: 0.6781563	total: 5.1ms	remaining: 20.4ms
2:	learn: 0.6709020	total: 7.1ms	remaining: 16.6ms
3:	learn: 0.6637972	total: 9.29ms	remaining: 13.9ms
4:	learn: 0.6567939	total: 11.7ms	remaining: 11.7ms
5:	learn: 0.6499577	total: 13.8ms	remaining: 9.23ms
6:	learn: 0.6432360	total: 15.5ms	remaining: 6.66ms
7:	learn: 0.6366666	total: 18.4ms	remaining: 4.59ms
8:	learn: 0.6302246	total: 21.3ms	remaining: 2.37ms
9:	learn: 0.6239013	total: 22.9ms	remaining: 0us


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [None]:
answ_df = pd.DataFrame()
answ_df['id'] = test_record_id
predict = pd.DataFrame([math.floor(float(x)) for x in (predictions*100)])
answ_df['predict'] = predict
answ_df.to_csv('PD-submit.csv',index=False, sep=';')