# Imports

In [214]:
import pandas as pd
import numpy as np
from numpy import mean
from numpy import std
import math
import time

import warnings
warnings.filterwarnings('ignore')

from sklearnex import patch_sklearn
patch_sklearn()

# from lazypredict.Supervised import LazyRegressor

# Visualization
import matplotlib.pyplot as plt
from matplotlib import rcParams
%matplotlib inline
import seaborn as sns
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

# Feature and Model Selection
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler

# Models
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost.sklearn import XGBClassifier

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


# EDA

In [215]:
# !wget https://raw.githubusercontent.com/BKHV/risk_models/master/data/PD-data-train.csv
# !wget https://raw.githubusercontent.com/BKHV/risk_models/master/data/PD-data-test.csv
# !wget https://raw.githubusercontent.com/BKHV/risk_models/master/data/PD-data-desc.csv

In [216]:
# Read the three ';'-separated CSV files: training data, test data and the
# column-description table.
train_df, test_df, desc_df = [
    pd.read_csv(csv_name, sep=';')
    for csv_name in ('PD-data-train.csv', 'PD-data-test.csv', 'PD-data-desc.csv')
]

## Exploring Train Data

In [219]:
# Size and missing-value overview of the training frame.
train_rows, train_cols = train_df.shape
train_nan_counts = train_df.isna().sum()  # NaN count per column

print('Number of rows in train_df: ', train_rows)
print('Number of columns in train_df: ', train_cols)
print('Number of values in train_df: ', train_df.count().sum())
print('Number of NaNs in train_df: ', sum(train_nan_counts))
print('Number of NaNs in train_df in all columns:\n')
print(train_nan_counts)

Number of rows in train_df:  32395
Number of columns in train_df:  37
Number of values in train_df:  835135
Number of NaNs in train_df:  363480
Number of NaNs in train_df in all columns:

record_id                           0
ar_revenue                      15145
ar_total_expenses               15145
ar_sale_cost                    15145
ar_selling_expenses             15145
ar_management_expenses          15145
ar_sale_profit                  15145
ar_balance_of_rvns_and_expns    15145
ar_profit_before_tax            15145
ar_taxes                        15145
ar_other_profit_and_losses      15145
ar_net_profit                   15145
ab_immobilized_assets           15145
ab_mobile_current_assets        15145
ab_inventory                    15145
ab_accounts_receivable          15145
ab_other_current_assets         15145
ab_cash_and_securities          15145
ab_losses                       15145
ab_own_capital                  15145
ab_borrowed_capital             15145
ab_long_term_l

In [220]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training frame.
train_df.describe()

Unnamed: 0,record_id,ar_revenue,ar_total_expenses,ar_sale_cost,ar_selling_expenses,ar_management_expenses,ar_sale_profit,ar_balance_of_rvns_and_expns,ar_profit_before_tax,ar_taxes,...,ogrn_age,adr_actual_age,head_actual_age,cap_actual_age,ul_capital_sum,ul_founders_cnt,ul_branch_cnt,ul_strategic_flg,ul_systematizing_flg,default_12m
count,32395.0,17250.0,17250.0,17250.0,17250.0,17250.0,17250.0,17250.0,17250.0,17250.0,...,32395.0,32395.0,32395.0,32395.0,32395.0,32395.0,32395.0,32395.0,32395.0,32395.0
mean,136955.533601,81537760.0,76784740.0,73203280.0,2499198.0,1082263.0,4753016.0,-642477.3,4110538.0,418807.1,...,78.644822,15.010341,15.010341,15.010341,1366509.0,11.946535,0.02167,9.3e-05,0.0,0.064547
std,79370.573366,127440200.0,122722700.0,115665100.0,12317980.0,7848929.0,19410270.0,6014715.0,19380210.0,1565298.0,...,46.522534,9.546511,9.546511,9.546511,15024380.0,86.341847,0.342715,0.009623,0.0,0.245729
min,7.0,0.0,-701984000.0,-507278000.0,-35429000.0,-192167000.0,-92100000.0,-75818000.0,-59466000.0,-16810000.0,...,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,67912.0,10199500.0,9130250.0,8844000.0,0.0,0.0,211250.0,-940000.0,142000.0,0.0,...,35.0,6.0,6.0,6.0,10000.0,1.0,0.0,0.0,0.0,0.0
50%,137527.0,32335000.0,29703500.0,28408000.0,0.0,0.0,1231500.0,-126000.0,850000.0,41000.0,...,77.0,14.0,14.0,14.0,10000.0,1.0,0.0,0.0,0.0,0.0
75%,205633.5,88876750.0,83095000.0,79914250.0,0.0,0.0,4574750.0,0.0,3543750.0,291000.0,...,133.0,24.0,24.0,24.0,30000.0,2.0,0.0,0.0,0.0,0.0
max,274623.0,794684000.0,794936000.0,762493000.0,274656000.0,302355000.0,1434511000.0,158425000.0,1481526000.0,54202000.0,...,149.0,33.0,33.0,33.0,1584979000.0,998.0,20.0,1.0,0.0,1.0


In [221]:
# Data type of every column in the training frame.
train_df.dtypes

record_id                         int64
ar_revenue                      float64
ar_total_expenses               float64
ar_sale_cost                    float64
ar_selling_expenses             float64
ar_management_expenses          float64
ar_sale_profit                  float64
ar_balance_of_rvns_and_expns    float64
ar_profit_before_tax            float64
ar_taxes                        float64
ar_other_profit_and_losses      float64
ar_net_profit                   float64
ab_immobilized_assets           float64
ab_mobile_current_assets        float64
ab_inventory                    float64
ab_accounts_receivable          float64
ab_other_current_assets         float64
ab_cash_and_securities          float64
ab_losses                       float64
ab_own_capital                  float64
ab_borrowed_capital             float64
ab_long_term_liabilities        float64
ab_short_term_borrowing         float64
ab_accounts_payable             float64
ab_other_borrowings             float64


## Exploring Test Data

In [222]:
# Preview the first rows of the test frame.
test_df.head()

Unnamed: 0,record_id,ar_revenue,ar_total_expenses,ar_sale_cost,ar_selling_expenses,ar_management_expenses,ar_sale_profit,ar_balance_of_rvns_and_expns,ar_profit_before_tax,ar_taxes,...,ogrn_age,adr_actual_age,head_actual_age,cap_actual_age,ul_staff_range,ul_capital_sum,ul_founders_cnt,ul_branch_cnt,ul_strategic_flg,ul_systematizing_flg
0,196,39103000.0,38913000.0,38913000.0,0.0,0.0,190000.0,141000.0,331000.0,284000.0,...,135,3,3,3,[1-100],100000.0,1,0,0,0
1,1196,,,,,,,,,,...,138,5,5,5,[1-100],100000.0,1,0,0,0
2,2813,54174000.0,50929000.0,50929000.0,0.0,0.0,3245000.0,-1278000.0,1967000.0,500000.0,...,138,20,20,20,[1-100],250000.0,2,0,0,0
3,4385,1904000.0,1679000.0,1679000.0,0.0,0.0,225000.0,-88000.0,137000.0,0.0,...,139,9,9,9,[1-100],10000.0,2,0,0,0
4,6479,225584000.0,210685000.0,210685000.0,0.0,0.0,14899000.0,-12715000.0,2184000.0,445000.0,...,139,3,3,3,(100-500],1700598.0,84,0,0,0


In [223]:
# Size and missing-value overview of the test frame.
test_rows, test_cols = test_df.shape
test_nan_counts = test_df.isna().sum()  # NaN count per column

print('Number of rows in test_df: ', test_rows)
print('Number of columns in test_df: ', test_cols)
print('Number of values in test_df: ', test_df.count().sum())
print('Number of NaNs in test_df: ', sum(test_nan_counts))
print('Number of NaNs in test_df in all columns:\n')
print(test_nan_counts)

Number of rows in test_df:  200
Number of columns in test_df:  36
Number of values in test_df:  4368
Number of NaNs in test_df:  2832
Number of NaNs in test_df in all columns:

record_id                         0
ar_revenue                      118
ar_total_expenses               118
ar_sale_cost                    118
ar_selling_expenses             118
ar_management_expenses          118
ar_sale_profit                  118
ar_balance_of_rvns_and_expns    118
ar_profit_before_tax            118
ar_taxes                        118
ar_other_profit_and_losses      118
ar_net_profit                   118
ab_immobilized_assets           118
ab_mobile_current_assets        118
ab_inventory                    118
ab_accounts_receivable          118
ab_other_current_assets         118
ab_cash_and_securities          118
ab_losses                       118
ab_own_capital                  118
ab_borrowed_capital             118
ab_long_term_liabilities        118
ab_short_term_borrowing        

In [224]:
# Summary statistics (count, mean, std, min, quartiles, max) of the test frame.
test_df.describe()

Unnamed: 0,record_id,ar_revenue,ar_total_expenses,ar_sale_cost,ar_selling_expenses,ar_management_expenses,ar_sale_profit,ar_balance_of_rvns_and_expns,ar_profit_before_tax,ar_taxes,...,bus_age,ogrn_age,adr_actual_age,head_actual_age,cap_actual_age,ul_capital_sum,ul_founders_cnt,ul_branch_cnt,ul_strategic_flg,ul_systematizing_flg
count,200.0,82.0,82.0,82.0,82.0,82.0,82.0,82.0,82.0,82.0,...,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0
mean,135340.925,86259710.0,82002180.0,76948630.0,2780183.0,2273366.0,4257524.0,-688085.4,3569439.0,441987.8,...,84.595,65.68,14.32,14.32,14.32,726356.0,5.07,0.0,0.0,0.0
std,80177.086856,158903000.0,153409200.0,137265100.0,11806920.0,17528200.0,7752518.0,5253569.0,8232787.0,1294710.0,...,80.585115,46.096442,9.697308,9.697308,9.697308,4043191.0,32.669582,0.0,0.0,0.0
min,196.0,0.0,0.0,0.0,0.0,0.0,-4898000.0,-23281000.0,-6390000.0,-11000.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
25%,68367.5,6779250.0,5397500.0,5397500.0,0.0,0.0,155750.0,-657000.0,126500.0,0.0,...,25.0,25.0,5.0,5.0,5.0,10000.0,1.0,0.0,0.0,0.0
50%,127585.5,18997000.0,16456000.0,15878500.0,0.0,0.0,1184000.0,-58000.0,545000.0,73500.0,...,57.0,54.5,12.5,12.5,12.5,10000.0,1.0,0.0,0.0,0.0
75%,202665.0,83882500.0,78313500.0,76511750.0,0.0,0.0,3200000.0,0.0,2630500.0,352500.0,...,106.0,101.25,24.0,24.0,24.0,20555.75,2.0,0.0,0.0,0.0
max,271584.0,770724000.0,745881000.0,669601000.0,93976000.0,156456000.0,38051000.0,28586000.0,54481000.0,7690000.0,...,460.0,145.0,33.0,33.0,33.0,40170000.0,420.0,0.0,0.0,0.0


In [225]:
# Data type of every column in the test frame.
test_df.dtypes

record_id                         int64
ar_revenue                      float64
ar_total_expenses               float64
ar_sale_cost                    float64
ar_selling_expenses             float64
ar_management_expenses          float64
ar_sale_profit                  float64
ar_balance_of_rvns_and_expns    float64
ar_profit_before_tax            float64
ar_taxes                        float64
ar_other_profit_and_losses      float64
ar_net_profit                   float64
ab_immobilized_assets           float64
ab_mobile_current_assets        float64
ab_inventory                    float64
ab_accounts_receivable          float64
ab_other_current_assets         float64
ab_cash_and_securities          float64
ab_losses                       float64
ab_own_capital                  float64
ab_borrowed_capital             float64
ab_long_term_liabilities        float64
ab_short_term_borrowing         float64
ab_accounts_payable             float64
ab_other_borrowings             float64


In [227]:
# Feature columns kept in both frames; every other column is discarded below.
test_features = [
    'ar_revenue', 'ar_total_expenses', 'ar_sale_cost', 'ar_profit_before_tax', 'ar_net_profit',
    'ab_inventory', 'ab_accounts_receivable', 'ab_own_capital', 'ab_borrowed_capital',
    'ab_accounts_payable', 'ogrn_age', 'adr_actual_age', 'head_actual_age', 'cap_actual_age',
]
# The training frame additionally keeps the target column, placed first.
train_features = ['default_12m', *test_features]

# Save the record ids before 'record_id' is removed from test_df by the selection.
test_record_id = test_df['record_id']
train_df = train_df.loc[:, train_features]
test_df = test_df.loc[:, test_features]

# test_record_id = test_df['record_id']
# train_df = train_df.drop('ul_staff_range', axis=1)
# test_df = test_df.drop('ul_staff_range', axis=1)

# Data Preprocessing and Feature Selection

In [228]:
# Min-max scale the feature columns to [0, 1].
# The scaler is fitted on the TRAINING features only and then applied unchanged
# to the test features. Re-fitting it on the test set (as `fit_transform` would)
# maps the two frames with different min/max values, so the model would see
# test inputs on a different scale than the one it was trained on.
scaler = MinMaxScaler()
default = train_df['default_12m']  # target is set aside so it is not scaled
train_feature_frame = train_df.drop('default_12m', axis=1)

# Keep the original index so the target re-attaches row-by-row below.
train_df = pd.DataFrame(
    scaler.fit_transform(train_feature_frame),
    columns=train_feature_frame.columns,
    index=train_feature_frame.index,
)
train_df['default_12m'] = default

# transform (not fit_transform): reuse the min/max learned from the training data.
test_df = pd.DataFrame(
    scaler.transform(test_df),
    columns=test_df.columns,
    index=test_df.index,
)

In [229]:
# Drop every column that contains a NaN in EITHER frame.
# Dropping independently per frame can leave train_df and test_df with
# different feature sets (a column may be complete in one frame and not in the
# other), which would make model.predict(test_df) fail or silently misalign.
train_nan_cols = set(train_df.columns[train_df.isna().any()])
test_nan_cols = set(test_df.columns[test_df.isna().any()])
nan_cols = train_nan_cols | test_nan_cols

train_df = train_df.drop(columns=[col for col in train_df.columns if col in nan_cols])
test_df = test_df.drop(columns=[col for col in test_df.columns if col in nan_cols])

# Cross-Validation

In [230]:
# Build the candidate models to evaluate
def get_models():
    """Return a dict mapping a display name to a freshly constructed classifier.

    All estimators use their default hyper-parameters. The last entry,
    'Stacking', is the ensemble produced by get_stacking().
    """
    return {
        'LogisticRegression': LogisticRegression(),
        'SVC': SVC(),
        'GaussianNB': GaussianNB(),
        'MultinomialNB': MultinomialNB(),
        'SGDClassifier': SGDClassifier(),
        'DecisionTreeClassifier': DecisionTreeClassifier(),
        'RandomForestClassifier': RandomForestClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'LGBMClassifier': LGBMClassifier(),
        'XGBClassifier': XGBClassifier(),
        'Stacking': get_stacking(),
    }

In [231]:
# Evaluate a given model using repeated k-fold cross-validation
def mape_evaluate_model(model, x, y, scoring='neg_mean_absolute_percentage_error'):
    """Cross-validate `model` on (x, y) and time the evaluation.

    Parameters
    ----------
    model : estimator passed to sklearn's cross_val_score.
    x : feature matrix.
    y : target vector.
    scoring : str, optional
        Scorer name forwarded to cross_val_score. Defaults to the negated MAPE
        so existing callers are unaffected.
        NOTE: MAPE is a regression metric; on a 0/1 target every false positive
        is divided by a near-zero true value, which yields astronomically large
        scores. For a classifier pass e.g. 'roc_auc' or 'f1' instead.

    Returns
    -------
    scores : array with one score per fold (10 splits x 3 repeats = 30 values).
    run_time : float, elapsed seconds spent in cross_val_score.

    Raises
    ------
    Any exception raised while fitting/scoring a fold (error_score='raise').
    """
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # elapsed time; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    scores = cross_val_score(model, x, y, scoring=scoring, cv=cv, n_jobs=-1, error_score='raise')
    run_time = time.perf_counter() - start_time
    return scores, run_time

In [232]:
# Build the stacking ensemble
def get_stacking():
    """Return a StackingClassifier with three base learners and a logistic meta-learner.

    Base (level-0) estimators: MultinomialNB, GaussianNB and SGDClassifier.
    Their cross-validated (cv=5) predictions feed a LogisticRegression
    final estimator.
    """
    base_learners = [
        ('MultinomialNB', MultinomialNB()),
        ('GaussianNB', GaussianNB()),
        ('SGDClassifier', SGDClassifier()),
    ]
    meta_learner = LogisticRegression()
    return StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=5)

In [233]:
# Build the candidate models and the parallel result lists
models = get_models()
mape_results, names, runs = [], [], []

# Cross-validate each candidate, record its scores and print a one-line summary
for name, model in models.items():
    mape_scores, run_time = mape_evaluate_model(
        model,
        train_df.drop(columns='default_12m'),
        train_df['default_12m'],
    )
    names.append(name)
    runs.append(run_time)
    mape_results.append(mape_scores)
    print(name, "| MAPE:", mean(mape_scores), "| Standard Deviation:", std(mape_scores),"| Time Taken:", run_time)

LogisticRegression | MAPE: -0.06454705829289892 | Standard Deviation: 0.003842818698104729 | Time Taken: 0.8251864910125732
SVC | MAPE: -22430495519825.793 | Standard Deviation: 45628838265558.96 | Time Taken: 67.21057724952698
GaussianNB | MAPE: -0.06454705829289892 | Standard Deviation: 0.003842818698104729 | Time Taken: 0.12402796745300293
MultinomialNB | MAPE: -0.06454705829289892 | Standard Deviation: 0.003842818698104729 | Time Taken: 0.12502813339233398
SGDClassifier | MAPE: -0.06454705829289892 | Standard Deviation: 0.003842818698104729 | Time Taken: 0.14303159713745117
DecisionTreeClassifier | MAPE: -14319516464678.25 | Standard Deviation: 5041002228153.881 | Time Taken: 0.15703511238098145
RandomForestClassifier | MAPE: -17980364468868.996 | Standard Deviation: 5799014941073.912 | Time Taken: 1.3983149528503418
GradientBoostingClassifier | MAPE: -0.06454705829289892 | Standard Deviation: 0.003842818698104729 | Time Taken: 2.964679479598999
LGBMClassifier | MAPE: -0.0645470582

# Model Selection

In [234]:
# Collapse each model's per-fold scores into a mean and a standard deviation.
mape_mean_results = [mean(fold_scores) for fold_scores in mape_results]
mape_std_results = [std(fold_scores) for fold_scores in mape_results]
# Placeholders for R2 aggregates; nothing in this cell fills them.
r2_mean_results, r2_std_results = [], []

In [235]:
# Assemble the per-model summary table, indexed by model name.
data = {
    'Name': names,
    'MAPE': mape_mean_results,
    'Standart Deviation': mape_std_results,
    'Time Taken': runs,
}
df_models = pd.DataFrame(data).set_index('Name')
df_models

Unnamed: 0_level_0,MAPE,Standart Deviation,Time Taken
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogisticRegression,-0.06454706,0.003842819,0.825186
SVC,-22430500000000.0,45628840000000.0,67.210577
GaussianNB,-0.06454706,0.003842819,0.124028
MultinomialNB,-0.06454706,0.003842819,0.125028
SGDClassifier,-0.06454706,0.003842819,0.143032
DecisionTreeClassifier,-14319520000000.0,5041002000000.0,0.157035
RandomForestClassifier,-17980360000000.0,5799015000000.0,1.398315
GradientBoostingClassifier,-0.06454706,0.003842819,2.964679
LGBMClassifier,-0.06454706,0.003842819,0.549124
XGBClassifier,-185376200000.0,472618500000.0,2.483559


In [236]:
# Line chart of evaluation time per model, fastest first.
line = (
    px.line(
        data_frame=df_models.sort_values('Time Taken', ascending=True),
        y=['Time Taken'],
        markers=True,
    )
    .update_xaxes(title="Model", rangeslider_visible=False)
    .update_yaxes(title="Time (s)")
    .update_traces(line_color="red")
    .update_layout(
        showlegend=True,
        title=dict(text='Time Taken vs Model', y=0.94, x=0.5, xanchor='center', yanchor='top'),
    )
)

line.show()

In [237]:
# Line chart of the mean score per model, highest (closest to zero) first.
line = (
    px.line(
        data_frame=df_models.sort_values('MAPE', ascending=False),
        y=['MAPE'],
        markers=True,
    )
    .update_xaxes(title="Model", rangeslider_visible=False)
    .update_yaxes(title="MAPE Score")
    .update_traces(line_color="blue")
    .update_layout(
        showlegend=True,
        title=dict(text='MAPE vs Model', y=0.94, x=0.5, xanchor='center', yanchor='top'),
    )
)

line.show()

In [238]:
# Line chart of the score standard deviation per model, most stable first.
line = (
    px.line(
        data_frame=df_models.sort_values('Standart Deviation', ascending=True),
        y=['Standart Deviation'],
        markers=True,
    )
    .update_xaxes(title="Model", rangeslider_visible=False)
    .update_yaxes(title="Standart Deviation")
    .update_traces(line_color="purple")
    .update_layout(
        showlegend=True,
        title=dict(text='Standart Deviation vs Model', y=0.94, x=0.5, xanchor='center', yanchor='top'),
    )
)

line.show()

# Modeling

In [239]:
# Defining the base models
# level0 = list()
# level0.append(('MultinomialNB', MultinomialNB()))
# level0.append(('GaussianNB', GaussianNB()))
# level0.append(('SGDClassifier', SGDClassifier()))
# level1 = LogisticRegression()
# model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)


# Hyper-parameter search for LogisticRegression over solver and regularisation
# strength C, scored with repeated stratified 10-fold cross-validation.
model = LogisticRegression()

solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# NOTE(review): the target is heavily imbalanced, so plain accuracy can be
# matched by always predicting the majority class; consider a metric such as
# 'roc_auc' when comparing configurations.
# error_score=0 scores a failing parameter combination as 0 instead of raising.
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(train_df.drop('default_12m', axis=1), train_df['default_12m'])

# Summarize results: best configuration first, then every configuration tried
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
# The loop variables must NOT be called `mean`/`std`: those names are the numpy
# functions imported at the top of the notebook, and rebinding them to floats
# breaks any earlier cell that is re-run afterwards.
for mean_score, std_score, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean_score, std_score, param))

Best: 0.935453 using {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.935453 (0.000090) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.935453 (0.000090) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.935453 (0.000090) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.935453 (0.000090) with: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.935453 (0.000090) with: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.935453 (0.000090) with: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.935453 (0.000090) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
0.935453 (0.000090) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
0.935453 (0.000090) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.935453 (0.000090) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
0.935453 (0.000090) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
0.935453 (0.000090) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.935453 (0.000090) wit

In [240]:
# Fit the final logistic regression on the full training frame with fixed
# hyper-parameters, then predict class labels for the test frame.
model = LogisticRegression(solver='newton-cg', penalty='l2', C=100)
model.fit(train_df.drop(columns='default_12m'), train_df['default_12m'])

predictions = model.predict(test_df)
print("Predicted Values:", predictions)

Predicted Values: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [241]:
# Build the submission table: one row per test record id with the prediction
# scaled by 100 and floored to an integer, then write it as a ';'-separated CSV.
answ_df = pd.DataFrame()
answ_df['id'] = test_record_id

scaled_predictions = predictions * 100
predict = pd.DataFrame([math.floor(float(value)) for value in scaled_predictions])
answ_df['predict'] = predict

answ_df.to_csv('PD-submit.csv', sep=';', index=False)