In [159]:
import data_handler
import machine_learning as ml
import cost_benefit as cb

import numpy as np
from tabulate import tabulate
import graphviz 

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Main results for seminar paper
This notebook is created to estimate the machine learning models for the chosen hyperparameters. See other notebook for invokement of the grid search algorithm to choose the optimal hyperparameters.

In [189]:
# First, define wrapper to call all necessary functions.
def result_wrapper(model, parameters, train, test, name, scoring='recall'):
    # I have to create a fake class in order to make validate model work.
    # This is because it was created with the purpose to work with the
    # grid search algorithm.
    x, y = train[0], train[1]

    class Object(object):
        pass
    model_obj = Object()
    model_obj.best_estimator_ = model(**parameters).fit(x, y)
    best_model = ml.validate_model(model_obj, train, test, scoring=scoring, supress_print=True)
    benefits = cb.cost_benefit_analysis(best_model, test)
    print(f"{name} & {benefits[3]} & {benefits[0]:.2f} & {benefits[1]/benefits[0]*100:.2f} & {benefits[2]/benefits[0]*100:.2f} \\\\")
    print()
    print('\\begin{table}[!hb]')
    print('\\centering')
    print(f'\\caption{{Confusion matrix for {name} model}} ')
    print('\\begin{minipage}{.5\\linewidth}')
    print('\\caption*{Train results}')
    cb.latex_printout(best_model, train)
    print('\\end{minipage}')
    print('\\begin{minipage}{.5\\linewidth}')
    print('\\caption*{Test results}')
    cb.latex_printout(best_model, test)
    print('\\end{minipage}')
    print('\\end{table}')
    return best_model

### Ohlson's Logit
I first define a function to run Ohlson's logit function, with the specified variables. I then evaluate it both in it's success at predicting bankruptcies, but also how much savings we expect.

In [179]:
def ohlson_data(year):
    x_train, x_test, y_train, y_test = data_handler.load_data(year)
    n = y_train.size
    x_train.columns = data_handler.ohlson_varnames()
    x_test.columns = data_handler.ohlson_varnames()

    # Ohlson use a dummy if liabilites are greater than assets
    x_train['liabilites > assets'] = (x_train['total liabilities / total assets'] > 1)*1.0
    x_test['liabilites > assets'] = (x_test['total liabilities / total assets'] > 1)*1.0

    # Try to replicate the variables as closely as possible,
    # some of the variables are inverted, but that should not affect
    # the predictability, only the sign of the coefficient.
    ohlson_vars = [
        'logarithm of total assets', 
        'total liabilities / total assets', 
        'working capital / total assets', 
        'current assets / short-term liabilities',
        'liabilites > assets',
        'net profit / total assets',
        'total liabilities / ((profit on operating activities + depreciation) * (12/365))',
        'sales (n) / sales (n-1)'
    ]
    x_ohlson_train = x_train[ohlson_vars]
    x_ohlson_test = x_test[ohlson_vars]
    return x_ohlson_train, x_ohlson_test, y_train, y_test

In [186]:
x_train, x_test, y_train, y_test = ohlson_data(5)
ohlson_parameters = {'penalty': 'none', 'max_iter': 1000}
result_wrapper(LogisticRegression, ohlson_parameters, (x_train, y_train), (x_test, y_test), "Ohlson's logit") ;

Ohlson's logit & 3 & 68.71 & 0.10 & 0.17 \\

\begin{table}[!hb]
\centering
\caption{Confusion matrix for Ohlson's logit} model
\begin{minipage}{.5\linewidth}
\caption*{Train results}
\begin{tabular}{lrr}
\hline
              &   Non-Bankrupt &   Bankrupt \\
\hline
 Non-Bankrupt &           4379 &         21 \\
 Bankrupt     &            306 &         22 \\
\hline
\end{tabular}
\end{minipage}
\begin{minipage}{.5\linewidth}
\caption*{Test results}
\begin{tabular}{lrr}
\hline
              &   Non-Bankrupt &   Bankrupt \\
\hline
 Non-Bankrupt &           1095 &          5 \\
 Bankrupt     &             79 &          3 \\
\hline
\end{tabular}
\end{minipage}
\end{table}


### Logit with elastic net
This is an extension of Ohlson's logit model. This time we include all available variables. Since this can lead to overfitting, we use elastic-net to reduce this problem.

In [190]:
x_train, x_test, y_train, y_test = data_handler.load_data(5, out_frame=False)
logit_parameters = {
    'C': 1.2, 
    'l1_ratio': 0,
    'penalty': 'elasticnet', 
    'solver': 'saga',
    'max_iter': 500
}
logit_best_model = result_wrapper(LogisticRegression, logit_parameters, (x_train, y_train), (x_test, y_test), "Logit")

Logit & 9 & 68.71 & 0.46 & 0.77 \\

\begin{table}[!hb]
\centering
\caption{Confusion matrix for Logit model} 
\begin{minipage}{.5\linewidth}
\caption*{Train results}
\begin{tabular}{lrr}
\hline
              &   Non-Bankrupt &   Bankrupt \\
\hline
 Non-Bankrupt &           4378 &         22 \\
 Bankrupt     &            301 &         27 \\
\hline
\end{tabular}
\end{minipage}
\begin{minipage}{.5\linewidth}
\caption*{Test results}
\begin{tabular}{lrr}
\hline
              &   Non-Bankrupt &   Bankrupt \\
\hline
 Non-Bankrupt &           1094 &          6 \\
 Bankrupt     &             73 &          9 \\
\hline
\end{tabular}
\end{minipage}
\end{table}


This prints the most important logit coefficients

In [132]:
def print_important_coeff(model, n_coeffs):
    important_coeff_arg = np.flip(
        np.argsort(np.abs(logit_best_model['estimator'].coef_))
    )
    coeff_names = data_handler.ohlson_varnames()

    coeff_list = []
    name_list = []
    for i in range(0, n_coeffs + 1):
        coeff_arg = important_coeff_arg[0, i]
        coeff_list.append(round(logit_best_model['estimator'].coef_[0, coeff_arg], 3))
        name_list.append(coeff_names[i])
    return coeff_list, name_list

def print_two_rows(coeff_list, name_list):
    # Split list in two and rezip them
    zipped_list = list(zip(coeff_list, name_list))
    zipped1 = zipped_list[:5]
    zipped2 = zipped_list[5:]
    zipped_list2 = zip(zipped1, zipped2)
    table = tabulate(zipped_list2, tablefmt='latex')

    # Remove tuple and list specifiers
    for char in ["'", "(", ")", "[", "]"]:
        table = table.replace(char, '')

    # Make some small adjustments to get a pretty list
    table = table.replace("ll", "llll")
    table = table.replace(",", " &")
    table = table.replace("cash + short-term securities + receivables - short-term liabilities / operating expenses - depreciation * 365", "defensive interval ratio")
    # table = table.replace("'", '').replace("(", '').replace(")", '').replace("[","").replace("[","")
    print(table)

In [133]:
coeff_list, name_list = print_important_coeff(logit_best_model, 10)
print_two_rows(name_list, coeff_list)

\begin{tabular}{llll}
\hline
 net profit / total assets & -0.39                                                                                            & retained earnings / total assets & 0.267          \\
 total liabilities / total assets & -0.369                                                                                    & EBIT / total assets & -0.171                      \\
 working capital / total assets & -0.366                                                                                      & book value of equity / total liabilities & -0.131 \\
 current assets / short-term liabilities & -0.311                                                                             & sales / total assets & -0.128                     \\
 defensive interval ratio & -0.298 & equity / total assets & 0.122                     \\
\hline
\end{tabular}


In [191]:
dt_parameters = {
    'max_leaf_nodes': 11,  #19
    'max_depth': 5,  #7
    'min_samples_leaf': 0.01,
    'min_samples_split': 0.01,
    'min_weight_fraction_leaf': 0,
    'min_impurity_decrease': 0
}
dt_best_model = result_wrapper(
    DecisionTreeClassifier, dt_parameters, (x_train, y_train), (x_test, y_test), "Decision Tree", scoring='roc_auc')

Decision Tree & 37 & 68.71 & 5.45 & 9.08 \\

\begin{table}[!hb]
\centering
\caption{Confusion matrix for Decision Tree model} 
\begin{minipage}{.5\linewidth}
\caption*{Train results}
\begin{tabular}{lrr}
\hline
              &   Non-Bankrupt &   Bankrupt \\
\hline
 Non-Bankrupt &           4337 &         63 \\
 Bankrupt     &            156 &        172 \\
\hline
\end{tabular}
\end{minipage}
\begin{minipage}{.5\linewidth}
\caption*{Test results}
\begin{tabular}{lrr}
\hline
              &   Non-Bankrupt &   Bankrupt \\
\hline
 Non-Bankrupt &           1072 &         28 \\
 Bankrupt     &             45 &         37 \\
\hline
\end{tabular}
\end{minipage}
\end{table}


In [200]:
dot_data = tree.export_graphviz(
    dt_best_model['estimator'], 
    out_file=None, 
    feature_names=data_handler.ohlson_varnames(),
    class_names=['non-bankrupt', 'bankrupt'],
    rounded=True, filled=True
) 
graph = graphviz.Source(dot_data) 
graph.render("bankruptcy") 

'bankruptcy.pdf'

### Gradient Descent classifier

In [192]:
GBC_parameters = {
    "learning_rate": 0.9,
    "min_samples_split": 0.005,
    "min_samples_leaf": 0.005,
    "max_depth": 8,
    'random_state': 42
}
result_wrapper(GradientBoostingClassifier, GBC_parameters, (x_train, y_train), (x_test, y_test), "Gradient Boosting", scoring='roc_auc')

Gradient Boosting & 49 & 68.71 & 6.69 & 11.15 \\

\begin{table}[!hb]
\centering
\caption{Confusion matrix for Gradient Boosting model} 
\begin{minipage}{.5\linewidth}
\caption*{Train results}
\begin{tabular}{lrr}
\hline
              &   Non-Bankrupt &   Bankrupt \\
\hline
 Non-Bankrupt &           4391 &          9 \\
 Bankrupt     &             28 &        300 \\
\hline
\end{tabular}
\end{minipage}
\begin{minipage}{.5\linewidth}
\caption*{Test results}
\begin{tabular}{lrr}
\hline
              &   Non-Bankrupt &   Bankrupt \\
\hline
 Non-Bankrupt &           1081 &         19 \\
 Bankrupt     &             33 &         49 \\
\hline
\end{tabular}
\end{minipage}
\end{table}


{'fit_time': 18.790141820907593,
 'score_time': 0.01628708839416504,
 'estimator': GradientBoostingClassifier(learning_rate=0.9, max_depth=8,
                            min_samples_leaf=0.005, min_samples_split=0.005,
                            random_state=42),
 'test_score': 0.9325757575757576,
 'train_score': 1.0}

### Neural network


In [193]:
nn_parameters = {
    'solver': 'lbfgs',
    'random_state': 42,
    'warm_start': False,
    'hidden_layer_sizes': 190,
    'max_iter': 200
}
result_wrapper(MLPClassifier, nn_parameters, (x_train, y_train), (x_test, y_test), "Neural network") ;

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
Neural network & 42 & 68.71 & 6.70 & 11.17 \\

\begin{table}[!hb]
\centering
\caption{Confusion matrix for Neural network model} 
\begin{minipage}{.5\linewidth}
\caption*{Train results}
\begin{tabular}{lrr}
\hline
              &   Non-Bankrupt &   Bankrupt \\
\hline
 Non-Bankrupt &           4366 &         34 \\
 Bankrupt     &             52 &        276 \\
\hline
\end{tabular}
\end{minipage}
\begin{minipage}{.5\linewidth}
\caption*{Test results}
\begin{tabular}{lrr}
\hline
              &   Non-Bankrupt &   Bankrupt \\
\hline
 Non-Bankrupt &           1070 &         30 \\
 Bankrupt     &             40 &         42 \\
\hline
\end{tabular}
\end{minipage}
\end{table}
