In [10]:
from types import SimpleNamespace

import numpy as np

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

import cost_benefit as cb
import data_handler
import machine_learning as ml


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Main results for seminar paper
This notebook estimates the machine learning models using the chosen hyperparameters. See the other notebook for the grid search that was used to select the optimal hyperparameters.

In [5]:
# First, define wrapper to call all necessary functions.
def result_wrapper(model, parameters, train, test, name):
    """Fit one model with fixed hyperparameters and print its results.

    Parameters
    ----------
    model
        Estimator class (not an instance). It is instantiated as
        ``model(**parameters)`` and then fitted with ``.fit(x, y)``.
    parameters : dict
        Keyword arguments forwarded to the estimator constructor.
    train, test
        Indexable ``(x, y)`` pairs. ``train[0]`` and ``train[1]`` are used
        for fitting; both pairs are passed unchanged to
        ``ml.validate_model``, and ``test`` is also passed to the ``cb``
        helpers.
    name : str
        Label printed at the start of the LaTeX table row.

    Returns
    -------
    None
        All results are reported through ``print`` and
        ``cb.latex_printout``.
    """
    x, y = train[0], train[1]

    # ml.validate_model was written for the grid-search output, so the fitted
    # model is handed over on a ``best_estimator_`` attribute. A plain
    # namespace is enough to carry that single attribute.
    model_obj = SimpleNamespace(best_estimator_=model(**parameters).fit(x, y))
    best_model = ml.validate_model(model_obj, train, test, supress_print=True)

    benefits = cb.cost_benefit_analysis(best_model, test)
    # LaTeX row: benefits[0] as-is, then benefits[1] and benefits[2] expressed
    # as a percentage of benefits[0].
    print(f"{name} & {benefits[0]:.2f} & {benefits[1]/benefits[0]*100:.2f} & {benefits[2]/benefits[0]*100:.2f} \\\\")
    print()
    cb.latex_printout(best_model, test)

### Ohlson's Logit
I first define a function that prepares the data for Ohlson's logit model, using the specified variables. I then evaluate the model both on its success at predicting bankruptcies and on the savings we expect from using it.

In [6]:
def ohlson_data(year):
    """Load the data for ``year`` and keep only the Ohlson-logit regressors.

    Parameters
    ----------
    year
        Passed straight through to ``data_handler.load_data``.

    Returns
    -------
    tuple
        ``(x_ohlson_train, x_ohlson_test, y_train, y_test)``. The two feature
        frames contain exactly the columns in ``ohlson_vars`` below, in that
        order; the targets are returned as loaded.
    """
    x_train, x_test, y_train, y_test = data_handler.load_data(year)

    # Try to replicate the variables as closely as possible,
    # some of the variables are inverted, but that should not affect
    # the predictability, only the sign of the coefficient.
    ohlson_vars = [
        'logarithm of total assets',
        'total liabilities / total assets',
        'working capital / total assets',
        'current assets / short-term liabilities',
        'liabilites > assets',
        'net profit / total assets',
        'total liabilities / ((profit on operating activities + depreciation) * (12/365))',
        'sales (n) / sales (n-1)'
    ]

    # Fetch the column labels once and apply the same preparation to both
    # splits so that train and test cannot drift apart.
    varnames = data_handler.ohlson_varnames()
    prepared = []
    for features in (x_train, x_test):
        features.columns = varnames
        # Ohlson uses a dummy that is 1.0 when liabilities exceed assets.
        # The misspelt key is kept: it is the column name callers receive.
        features['liabilites > assets'] = (features['total liabilities / total assets'] > 1)*1.0
        prepared.append(features[ohlson_vars])

    x_ohlson_train, x_ohlson_test = prepared
    return x_ohlson_train, x_ohlson_test, y_train, y_test

In [7]:
# Fit Ohlson's logit without a penalty term on the data for year 5 and print
# the LaTeX result row and the table produced by result_wrapper.
# NOTE(review): the string 'none' is the older scikit-learn spelling; newer
# releases expect penalty=None - confirm against the installed version.
ohlson_parameters = dict(penalty='none', max_iter=1000)
x_train, x_test, y_train, y_test = ohlson_data(5)
result_wrapper(
    LogisticRegression,
    ohlson_parameters,
    (x_train, y_train),
    (x_test, y_test),
    "Ohlson's logit",
)

Ohlson's logit & 68.71 & 0.10 & 0.17 \\

\begin{tabular}{lrr}
\hline
              &   Non-Bankrupt &   Bankrupt \\
\hline
 Non-Bankrupt &           1095 &          5 \\
 Bankrupt     &             79 &          3 \\
\hline
\end{tabular}


### Logit with elastic net
This is an extension of Ohlson's logit model. This time we include all available variables. Because this can lead to overfitting, we use elastic-net regularization to mitigate the problem.

In [8]:
# Logit on the full variable set for year 5, with the hyperparameters fixed
# to the values chosen beforehand.
# NOTE(review): per the scikit-learn docs, l1_ratio=0 reduces the elastic-net
# penalty to a pure L2 penalty - confirm this is the intended setting.
logit_parameters = dict(
    C=1.2,
    l1_ratio=0,
    penalty='elasticnet',
    solver='saga',
    max_iter=10000,
)
x_train, x_test, y_train, y_test = data_handler.load_data(5, out_frame=False)
result_wrapper(
    LogisticRegression,
    logit_parameters,
    (x_train, y_train),
    (x_test, y_test),
    "Logit",
)

Ohlson's logit & 68.71 & 0.46 & 0.77 \\

\begin{tabular}{lrr}
\hline
              &   Non-Bankrupt &   Bankrupt \\
\hline
 Non-Bankrupt &           1092 &          8 \\
 Bankrupt     &             73 &          9 \\
\hline
\end{tabular}


### Gradient Boosting classifier

In [18]:
# Gradient boosting with fixed hyperparameters. This cell does not load data
# itself: it uses whatever x_train / x_test / y_train / y_test are currently
# defined, which in top-to-bottom order are the ones loaded for the
# elastic-net logit.
GBC_parameters = dict(
    learning_rate=0.9,
    min_samples_split=0.005,
    min_samples_leaf=0.005,
    max_depth=8,
    min_weight_fraction_leaf=0.0,
    min_impurity_decrease=1e-07,
)
result_wrapper(
    GradientBoostingClassifier,
    GBC_parameters,
    (x_train, y_train),
    (x_test, y_test),
    "Gradient Boosting",
)

SyntaxError: invalid syntax (<ipython-input-18-9fd9446bf735>, line 6)