In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the merged Compustat + labels CSV (path is relative to the working directory).
df = pd.read_csv('./data/merged_compustat_and_labels.csv')

In [3]:
# Show the column labels of the loaded frame.
df.columns

Index(['gvkey', 'datadate', 'fyear', 'indfmt', 'consol', 'popsrc', 'datafmt',
       'tic', 'cusip', 'conm', 'curcd', 'fyr', 'act', 'ap', 'at', 'ceq', 'che',
       'cogs', 'csho', 'dlc', 'dltis', 'dltt', 'dp', 'ib', 'invt', 'ivao',
       'ivst', 'lct', 'lt', 'ni', 'ppegt', 'ppent', 'pstk', 're', 'rect',
       'sale', 'sstk', 'txp', 'txt', 'xint', 'cik', 'costat', 'prcc_f',
       'conml', 'sic', 'Bank', 'dch_wc', 'ch_rsst', 'dch_rec', 'dch_inv',
       'soft_assets', 'ch_cs', 'ch_cm', 'ch_roa', 'issue', 'bm', 'dpi', 'reoa',
       'EBIT', 'ch_fcf', 'misstate'],
      dtype='object')

In [4]:
# Preview the first three rows.
df.head(3)

Unnamed: 0,gvkey,datadate,fyear,indfmt,consol,popsrc,datafmt,tic,cusip,conm,...,ch_cs,ch_cm,ch_roa,issue,bm,dpi,reoa,EBIT,ch_fcf,misstate
0,1003,1990-01-31,1989,INDL,C,D,STD,ANTQ,354100,A.A. IMPORTING CO INC,...,,,,1,-1.240403,,-0.403403,-0.087941,,0
1,1004,1990-05-31,1989,INDL,C,D,STD,AIR,361105,AAR CORP,...,,,,1,0.554652,5.380405,0.24052,0.123916,27.257486,0
2,1004,1991-05-31,1990,INDL,C,D,STD,AIR,361105,AAR CORP,...,0.32954,0.297848,-0.090196,0,0.863306,0.882711,0.262695,0.082704,-10.87404,0


In [5]:
# Count rows per value of the `misstate` label. Selecting the column directly
# avoids first building an intermediate two-column copy of the frame.
df['misstate'].value_counts()

0    266737
1      1376
Name: misstate, dtype: int64

In [6]:
# Total of the 0/1 `misstate` label within each `fyear` value,
# i.e. the number of flagged rows per year.
df['misstate'].groupby(df['fyear']).sum()

fyear
1989      5
1990     22
1991     38
1992     36
1993     40
1994     31
1995     36
1996     42
1997     57
1998     72
1999    103
2000    126
2001    124
2002    104
2003     87
2004     70
2005     55
2006     37
2007     36
2008     29
2009     36
2010     34
2011     27
2012     32
2013     26
2014     20
2015     14
2016     18
2017     11
2018      6
2019      2
2020      0
2021      0
2022      0
2023      0
Name: misstate, dtype: int64

#### Retrain all models

In [1]:
import pandas as pd
import numpy as np
from MLP.utils import datasets,plot_table,train_model
from sklearn import metrics
from imblearn.under_sampling import RandomUnderSampler
from tabulate import tabulate
import matplotlib.pyplot as plt
from results.data_processing import DataProcessor
from results.utils import evaluate,null_check,results
from results.models import rus_boost, svm_model, xgb_model, logistic_regression_model,probit_regression_model,MLP,mlp_grid_search, random_forests
import json
from collections import defaultdict

In [2]:
# Load the modelling dataset.
data = pd.read_csv('./data/merged_compustat_and_labels.csv')
# JSON is UTF-8 by specification; pin the encoding so the read does not depend
# on the platform's locale default.
with open('MLP/features.json', encoding='utf-8') as json_file:
    # Used further down as a mapping: its keys name the feature sets and its
    # values are what gets passed to `results` for each run.
    features_comp = json.load(json_file)

In [3]:
# Registry of the estimators to benchmark: display name -> object imported
# from results.models that is later handed to `results`.
# numpy is already imported in the notebook's import cell, so it is not
# re-imported here. svm_model, mlp_grid_search and random_forests are imported
# as well but are not part of this benchmark.
models = {
    "MLP": MLP,
    "RUS BOOST": rus_boost,
    "Logit": logistic_regression_model,
    "Probit": probit_regression_model,
    "Xg Boost": xgb_model,
}

In [4]:
# Label distribution of the freshly loaded data; the saved output shows
# 1,376 rows with misstate == 1 out of 268,113.
data['misstate'].value_counts()

0    266737
1      1376
Name: misstate, dtype: int64

In [5]:
# Turn +/-inf into NaN so they are handled together with the missing values.
# Rebinding (instead of `inplace=True`) matches the `data = data.fillna(0)`
# step that follows.
data = data.replace([np.inf, -np.inf], np.nan)

In [6]:
# Replace every remaining NaN (including the former +/-inf values) with 0.
# NOTE(review): this applies to all columns, identifier and text fields
# included — confirm that zero-imputation is intended for the model inputs.
data = data.fillna(0)

### Batch Processing

Batch:
		Train : (1990,2002)
		Test :  (2003,2019)
		
		Run on undersampled data: Bao, XgBoost, probit, logit, MLP
		
		Oversample data and run all models: Bao, XgBoost,  probit, logit, MLP

In [7]:
# Test windows to evaluate; each entry is presumably a (start_year, end_year) tuple.
test_periods = [(2003,2019)]
# NOTE(review): the three period tuples are presumably train / validation /
# test windows and 5 a further setting — confirm against DataProcessor's
# signature. The (2003,2014) tuple also differs from the (2003,2019) window in
# test_periods; verify which one is actually used.
data_obj = DataProcessor(data,(1990,2002), (2002,2002), (2003,2014), 5)

In [8]:
# Run every (model, feature set) combination for each test window and store
# what `results` returns under res[test_period][feature_set_name][model_name].
# The saved output suggests the stored value is a test AUC.
data_items = features_comp
train_period = (1990,2002)
res = defaultdict(lambda: defaultdict(dict))
for test_period in test_periods:
    for model, model_fn in models.items():
        for data_item, feature_cols in data_items.items():
            auc = results(data_obj, train_period, test_period, feature_cols, model_fn)
            res[test_period][data_item][model] = auc

Optimization terminated successfully.
         Current function value: 0.579011
         Iterations 10
Optimization terminated successfully.
         Current function value: 0.598600
         Iterations 10
Optimization terminated successfully.
         Current function value: 0.624152
         Iterations 8
Test AUC Score: 0.6701160290397631
Test AUC Score: 0.6575388747299562
Test AUC Score: 0.6679662459356145


In [9]:
res

defaultdict(<function __main__.<lambda>()>,
            {(2003,
              2019): defaultdict(dict,
                         {'features': {'MLP': 0.6138772530364529,
                           'RUS BOOST': 0.7103304849241503,
                           'Logit': 0.6299340912072732,
                           'Probit': 0.6401794666980123,
                           'Xg Boost': 0.6701160290397631},
                          'raw_financial_items_28': {'MLP': 0.6099790495289557,
                           'RUS BOOST': 0.6944388268804814,
                           'Logit': 0.6217356536172955,
                           'Probit': 0.6248934369636598,
                           'Xg Boost': 0.6575388747299562},
                          'financial_ratios_14': {'MLP': 0.6017435226593385,
                           'RUS BOOST': 0.6789267736808892,
                           'Logit': 0.6319123493795301,
                           'Probit': 0.6327112753191103,
                           'Xg Boos