# My seminar - Applications of Machine Learning in Economics
### This notebook implements our baseline models: multiple discriminant analysis (MDA) as proposed by Altman (1968) and logit analysis as used by Ohlson (1980)


Importing packages

In [28]:
# common
import pandas as pd
import numpy as np
import os

# imputation
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer

from functions import load_data
from sklearn import model_selection
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis  # can we use this for Altman?
# https://python-for-multivariate-analysis.readthedocs.io/a_little_book_of_python_for_multivariate_analysis.html#linear-discriminant-analysis

In [35]:
# Load the training features and labels from the `*_5year` CSV files; the first column of
# each file is used as the row index.
# The frames are bound to x_train / y_train because those are the names the imputation and
# model-fitting cells below read; binding only `x` / `y` leaves them undefined on a fresh kernel.
x_train = pd.read_csv('data/train/x_5year.txt', delimiter=',', index_col=0)
y_train = pd.read_csv('data/train/y_5year.txt', delimiter=',', index_col=0)
# Short aliases kept so that any code still referring to `x` / `y` continues to work.
x, y = x_train, y_train
# alternatively, if we want to tweak the settings for creating dataset on the fly
# x_train, y_train, _, _ = load_data(year = 5, share = 0.2, shuffle = True, seed = 418409376)

First, we implement a multiple discriminant analysis (MDA) using sklearn's `LinearDiscriminantAnalysis`.
Altman (1968) uses 5 features: X1 (Working Capital / Total Assets); X2 (Retained Earnings / Total Assets); X3 (Earnings Before Interest and Taxes / Total Assets); X4 (Market Value of Equity / Book Value of Total Debt) and X5 (Sales / Total Assets). In our dataset these correspond to features X3, X6, X7, ??? and X9 respectively (columns `v3`, `v6`, `v7` and `v9`). **TODO:** our data has no direct counterpart to Altman's X4, so we still have to find a similar feature; until then the model below is fitted on the remaining four.

In [26]:
# Missing values have to be filled in before the discriminant analysis can be fitted.
#
# TODO: move this into functions.py as `impute_data(df, strategy: str, indicator: bool)`.
#       The helper should take the imputation strategy ("mean", "median", KNN, ...) and a
#       flag saying whether an indicator matrix of missing values is stacked onto the
#       transformed matrix.
#
# For now only IterativeImputer (multivariate feature imputation) is used.
imp = IterativeImputer(max_iter=10, random_state=0)

# fit_transform hands back a bare array, so the original row index and column labels are
# re-attached while building the frame.
x_train_imputed = pd.DataFrame(
    imp.fit_transform(x_train),
    index=x_train.index,
    columns=x_train.columns,
)

# Feature subset fed to the discriminant analysis.
X = x_train_imputed[['v3', 'v6', 'v7', 'v9']]

In [27]:
# Fit the linear discriminant model on the selected features.
# fit() expects y with shape (n_samples,). The labels are presumably a single-column
# DataFrame (loaded via pd.read_csv) -- TODO confirm -- which would trigger a
# DataConversionWarning, so they are flattened to 1-D first. np.ravel leaves labels that
# are already 1-D unchanged.
lda = LinearDiscriminantAnalysis().fit(X, np.ravel(y_train))

In [None]:
def pretty_scalings(lda, X, out=False):
    """Return the fitted discriminant scalings as a labelled DataFrame.

    Parameters
    ----------
    lda : object
        Fitted model exposing a 2-D ``scalings_`` array (one row per feature,
        one column per discriminant).
    X : pandas.DataFrame
        Frame whose column labels are used as the row index of the result.
    out : bool, default False
        When True, additionally print a heading and display the table.

    Returns
    -------
    pandas.DataFrame
        ``lda.scalings_`` indexed by ``X.columns`` with columns named
        ``LD1``, ``LD2``, ...
    """
    n_discriminants = lda.scalings_.shape[1]
    labels = [f"LD{k}" for k in range(1, n_discriminants + 1)]
    table = pd.DataFrame(lda.scalings_, index=X.columns, columns=labels)

    if not out:
        return table

    print("Coefficients of linear discriminants:")
    display(table)
    return table

# Tabulate the fitted coefficients per feature, show them, and keep the table for later use.
pretty_scalings_ = pretty_scalings(lda=lda, X=X, out=True)