In [1]:
import pandas as pd
import numpy as np


In [2]:
df = pd.read_csv('./data/merged_compustat_and_labels.csv')

In [3]:
df.columns

Index(['gvkey', 'datadate', 'fyear', 'indfmt', 'consol', 'popsrc', 'datafmt',
       'tic', 'cusip', 'conm', 'curcd', 'fyr', 'act', 'ap', 'at', 'ceq', 'che',
       'cogs', 'csho', 'dlc', 'dltis', 'dltt', 'dp', 'ib', 'invt', 'ivao',
       'ivst', 'lct', 'lt', 'ni', 'ppegt', 'ppent', 'pstk', 're', 'rect',
       'sale', 'sstk', 'txp', 'txt', 'xint', 'cik', 'costat', 'prcc_f',
       'conml', 'sic', 'Bank', 'dch_wc', 'ch_rsst', 'dch_rec', 'dch_inv',
       'soft_assets', 'ch_cs', 'ch_cm', 'ch_roa', 'issue', 'bm', 'dpi', 'reoa',
       'EBIT', 'ch_fcf', 'misstate'],
      dtype='object')

In [4]:
df.head(3)

Unnamed: 0,gvkey,datadate,fyear,indfmt,consol,popsrc,datafmt,tic,cusip,conm,...,ch_cs,ch_cm,ch_roa,issue,bm,dpi,reoa,EBIT,ch_fcf,misstate
0,1003,1990-01-31,1989,INDL,C,D,STD,ANTQ,354100,A.A. IMPORTING CO INC,...,,,,1,-1.240403,,-0.403403,-0.087941,,0
1,1004,1990-05-31,1989,INDL,C,D,STD,AIR,361105,AAR CORP,...,,,,1,0.554652,5.380405,0.24052,0.123916,27.257486,0
2,1004,1991-05-31,1990,INDL,C,D,STD,AIR,361105,AAR CORP,...,0.32954,0.297848,-0.090196,0,0.863306,0.882711,0.262695,0.082704,-10.87404,0


In [5]:
df[['fyear','misstate']]['misstate'].value_counts()

0    266737
1      1376
Name: misstate, dtype: int64

In [6]:
df.groupby('fyear')['misstate'].sum()

fyear
1989      5
1990     22
1991     38
1992     36
1993     40
1994     31
1995     36
1996     42
1997     57
1998     72
1999    103
2000    126
2001    124
2002    104
2003     87
2004     70
2005     55
2006     37
2007     36
2008     29
2009     36
2010     34
2011     27
2012     32
2013     26
2014     20
2015     14
2016     18
2017     11
2018      6
2019      2
2020      0
2021      0
2022      0
2023      0
Name: misstate, dtype: int64

#### Retrain all of them

In [1]:
import pandas as pd
import numpy as np
from MLP.utils import datasets,plot_table,train_model
from sklearn import metrics
from imblearn.under_sampling import RandomUnderSampler
from tabulate import tabulate
import matplotlib.pyplot as plt
from results.data_processing import DataProcessor
from results.utils import evaluate,null_check,results
from results.models import rus_boost, svm_model, xgb_model, logistic_regression_model,probit_regression_model,MLP,mlp_grid_search, random_forests
import json
from collections import defaultdict

In [2]:
data = pd.read_csv('./data/merged_compustat_and_labels.csv')
with open('MLP/features.json') as json_file:
    features_comp = json.load(json_file)

In [3]:
import numpy as np
models = {"MLP": MLP,
         "RUS BOOST": rus_boost,
          "Logit":logistic_regression_model,
         "Probit":probit_regression_model}

In [4]:
data['misstate'].value_counts()

0    266737
1      1376
Name: misstate, dtype: int64

In [5]:
data.replace([np.inf, -np.inf], np.nan, inplace=True)

In [6]:
data = data.fillna(0)

In [7]:
data['misstate'].value_counts()

0    266737
1      1376
Name: misstate, dtype: int64

In [8]:
### Batch Processing

test_periods = [(2007,2019)]
data_obj = DataProcessor(data,(1990,1999), (2000,2001), (2003,2014), 5)

data_items = features_comp
train_period = (1989,2006)
res = defaultdict(lambda: defaultdict(dict))
for test_period in test_periods:
    for model in models.keys():
        for data_item in data_items.keys():
            auc = results(data_obj,train_period,test_period,data_items[data_item],models[model])
            res[test_period][data_item][model] = auc

Train AUC: 1.0
Train AUC: 1.0
Train AUC: 1.0
Train AUC: 0.7265008813098601
Train AUC: 0.7272713372549853
Train AUC: 0.6832483170167131
Optimization terminated successfully.
         Current function value: 0.596644
         Iterations 10
Optimization terminated successfully.
         Current function value: 0.619102
         Iterations 10
Optimization terminated successfully.
         Current function value: 0.619523
         Iterations 10


In [9]:
for key in res.keys():
    columns = ["Test Period : "+str(key), 'MLP', 'RUS BOOST', 'Logit', 'Probit']
    df = pd.DataFrame.from_dict(res[key], orient='index').reset_index()
    df.columns = columns
    for col in ['MLP', 'RUS BOOST', 'Logit', 'Probit']:
        df[col] = df[col].round(3)
    df.set_index("Test Period : "+str(key), inplace=True)
    print(tabulate(df, headers='keys', tablefmt='pretty'))

+----------------------------+-------+-----------+-------+--------+
| Test Period : (2007, 2019) |  MLP  | RUS BOOST | Logit | Probit |
+----------------------------+-------+-----------+-------+--------+
|          features          | 0.622 |   0.657   | 0.615 | 0.622  |
|   raw_financial_items_28   | 0.635 |   0.654   | 0.617 | 0.603  |
|    financial_ratios_14     | 0.597 |   0.646   | 0.637 | 0.634  |
+----------------------------+-------+-----------+-------+--------+


In [13]:
import pandas as pd
import json

In [14]:
data = pd.read_csv('data/Bao_28_items_1990_2023_v3.csv')

In [15]:
data.columns

Index(['gvkey', 'datadate', 'fyear', 'indfmt', 'consol', 'popsrc', 'datafmt',
       'tic', 'cusip', 'conm', 'curcd', 'fyr', 'act', 'ap', 'at', 'ceq', 'che',
       'cogs', 'csho', 'dlc', 'dltis', 'dltt', 'dp', 'ib', 'invt', 'ivao',
       'ivst', 'lct', 'lt', 'ni', 'ppegt', 'ppent', 'pstk', 're', 'rect',
       'sale', 'sstk', 'txp', 'txt', 'xint', 'cik', 'costat', 'prcc_f',
       'conml', 'sic'],
      dtype='object')

In [16]:
data.isna().sum()

gvkey            0
datadate         0
fyear            0
indfmt           0
consol           0
popsrc           0
datafmt          0
tic             96
cusip           93
conm             0
curcd            0
fyr              0
act         107807
ap           61018
at           58645
ceq          59271
che          58984
cogs         60155
csho         20139
dlc          59393
dltis        79229
dltt         59532
dp           69479
ib           59843
invt         62697
ivao         69326
ivst         63544
lct         106185
lt           59135
ni           59846
ppegt        93112
ppent        65412
pstk         59431
re           66839
rect         61019
sale         60116
sstk         75498
txp          87206
txt          60116
xint         97742
cik          38466
costat           0
prcc_f       48082
conml            0
sic              0
dtype: int64

In [17]:
with open('./MLP/features.json') as json_file:
    features = json.load(json_file)

In [18]:
for feat in features['raw_financial_items_28']:
    if feat not in data.columns:
        print(feat)

In [19]:
data[features['raw_financial_items_28']+['ppent']].isna().sum()

act       107807
ap         61018
at         58645
ceq        59271
che        58984
cogs       60155
csho       20139
dlc        59393
dltis      79229
dltt       59532
dp         69479
ib         59843
invt       62697
ivao       69326
ivst       63544
lct       106185
lt         59135
ni         59846
ppegt      93112
pstk       59431
re         66839
rect       61019
sale       60116
sstk       75498
txp        87206
txt        60116
xint       97742
prcc_f     48082
ppent      65412
dtype: int64

In [20]:
data = data.dropna(subset=['at'])

In [21]:
data = data[data['at'] != 0]

In [22]:
data[features['raw_financial_items_28']+['ppent']].isna().sum()

act       49158
ap         2372
at            0
ceq         636
che         340
cogs       1582
csho       6192
dlc         749
dltis     20567
dltt        891
dp        10879
ib         1322
invt       4052
ivao      10680
ivst       4901
lct       47535
lt          491
ni         1326
ppegt     34462
pstk        796
re         8199
rect       2376
sale       1583
sstk      16841
txp       28561
txt        1550
xint      39101
prcc_f    36558
ppent      6770
dtype: int64

In [23]:
data['Bank'] = (data['sic'] >= 6000) & (data['sic'] <= 6999)
data['Bank'] = data['Bank'].astype(int)

In [24]:
data['Bank'].value_counts()

0    215127
1     52977
Name: Bank, dtype: int64

In [25]:
data[features['raw_financial_items_28']+['ppent']] = \
    data[features['raw_financial_items_28']+['ppent']].fillna(0)

In [26]:
df = data.copy()

In [27]:
df.columns

Index(['gvkey', 'datadate', 'fyear', 'indfmt', 'consol', 'popsrc', 'datafmt',
       'tic', 'cusip', 'conm', 'curcd', 'fyr', 'act', 'ap', 'at', 'ceq', 'che',
       'cogs', 'csho', 'dlc', 'dltis', 'dltt', 'dp', 'ib', 'invt', 'ivao',
       'ivst', 'lct', 'lt', 'ni', 'ppegt', 'ppent', 'pstk', 're', 'rect',
       'sale', 'sstk', 'txp', 'txt', 'xint', 'cik', 'costat', 'prcc_f',
       'conml', 'sic', 'Bank'],
      dtype='object')

In [28]:
#1 changes in working capital accruals
df['wc'] = (df['act'] - df['che']) - (df['lct'] - df['dlc'] - df['txp'])
df['ch_wc'] = df['wc'] - df['wc'].shift(1)
df['dch_wc'] = df['ch_wc'] * 2 / (df['at'] - df['at'].shift(1))

#2 changes in RSST_accruals
df['nco'] = (df['at'] - df['act'] - df['ivao']) - (df['lt'] - df['lct'] - df['dltt'])
df['ch_nco'] = df['nco'] - df['nco'].shift(1)

df['fin'] = (df['ivst'] + df['ivao']) - (df['dltt'] + df['dlc'] + df['pstk'])
df['ch_fin'] = df['fin'] - df['fin'].shift(1)

df['ch_rsst'] = (df['ch_wc'] + df['ch_nco'] + df['ch_fin']) * 2 / (df['at'] + df['at'].shift(1))

#3 changes in receivables
df['ch_rec'] = df['rect'] - df['rect'].shift(1)
df['dch_rec'] = df['ch_rec'] * 2 / (df['at'] + df['at'].shift(1))

#4 changes in inventories
df['ch_inv'] = df['invt'] - df['invt'].shift(1)
df['dch_inv'] = df['ch_inv'] * 2 / (df['at'] + df['at'].shift(1))

#5 percentage of soft assets
df['soft_assets'] = (df['at'] - df['ppent'] - df['che']) / df['at']

#6 percentage change in cash sales
df['cs'] = df['sale'] - (df['rect'] - df['rect'].shift(1))
df['ch_cs'] = (df['cs'] - df['cs'].shift(1)) / df['cs'].shift(1)

#7 change in cash margin
df['cmm'] = (df['cogs'] - (df['invt'] - df['invt'].shift(1)) + (df['ap'] - df['ap'].shift(1))) / (df['sale'] - (df['rect'] - df['rect'].shift(1)))
df['ch_cm'] = (df['cmm'] - df['cmm'].shift(1)) / df['cmm'].shift(1)

#8 change in return on assets
df['roa'] = (df['ni'] * 2) / (df['at'] + df['at'].shift(1))

df['ch_roa'] = df['roa'] - df['roa'].shift(1)

#9 actual issuance
df['issue'] = ((df['sstk'] > 0) | (df['dltis'] > 0)).astype(int)

#10 Book-to-market
df['bm'] = df['ceq'] / (df['prcc_f'] * df['csho'])

#11 Depreciation Index (Ratio from Beneish 1999)
df['dpi'] = (df['dp'].shift(1) / (df['dp'].shift(1) + df['ppent'].shift(1))) / (df['dp'] / (df['dp'] + df['ppent']))

#12 Retained earnings over assets
df['reoa'] = df['re'] / df['at']

#13 Earnings before interest and tax (Ratios from Summers and Sweeney, 1998)
df['EBIT'] = (df['ni'] + df['xint'] + df['txt']) / df['at']

#14 changes in free cash flow
df['ch_ib'] = df['ib'] - df['ib'].shift(1)
df['ch_fcf'] = df['ch_ib'] - df['ch_rsst']


In [29]:
selected_columns = ['dch_wc', 'ch_rsst', 'dch_rec', 'dch_inv', 'soft_assets', 'ch_cs', 'ch_cm', 'ch_roa', 'issue', 'bm', 'dpi', 'reoa', 'EBIT', 'ch_fcf']
selected_df = df[selected_columns]

In [30]:
selected_df.isna().sum()

dch_wc            21
ch_rsst            1
dch_rec            1
dch_inv            1
soft_assets        0
ch_cs           4126
ch_cm           7500
ch_roa             2
issue              0
bm               327
dpi            21291
reoa               0
EBIT               0
ch_fcf             1
dtype: int64

In [31]:
data[selected_columns] = df[selected_columns]


In [32]:
data.columns

Index(['gvkey', 'datadate', 'fyear', 'indfmt', 'consol', 'popsrc', 'datafmt',
       'tic', 'cusip', 'conm', 'curcd', 'fyr', 'act', 'ap', 'at', 'ceq', 'che',
       'cogs', 'csho', 'dlc', 'dltis', 'dltt', 'dp', 'ib', 'invt', 'ivao',
       'ivst', 'lct', 'lt', 'ni', 'ppegt', 'ppent', 'pstk', 're', 'rect',
       'sale', 'sstk', 'txp', 'txt', 'xint', 'cik', 'costat', 'prcc_f',
       'conml', 'sic', 'Bank', 'dch_wc', 'ch_rsst', 'dch_rec', 'dch_inv',
       'soft_assets', 'ch_cs', 'ch_cm', 'ch_roa', 'issue', 'bm', 'dpi', 'reoa',
       'EBIT', 'ch_fcf'],
      dtype='object')

In [42]:
df_labels = pd.read_csv('data/Annual_Labels2.csv')

In [36]:
# Merge dataframes on specified columns
merged_df = pd.merge(data, df_labels, left_on=['cik', 'fyear'], right_on=['CIK', 'YEARA'], how='left')

In [37]:
# Add a column with 1 for each record that is in df_labels
merged_df['misstate'] = merged_df['CIK'].notnull().astype(int)

In [40]:
merged_df.isna().sum()

gvkey               0
datadate            0
fyear               0
indfmt              0
consol              0
                ...  
P_AAER         266737
Unnamed: 9     268113
Unnamed: 10    268113
Unnamed: 11    268113
misstate            0
Length: 73, dtype: int64

In [44]:
len(df_labels)

1842

In [46]:
merged_df['misstate'].value_counts()

0    266737
1      1376
Name: misstate, dtype: int64

In [47]:
merged_df.columns

Index(['gvkey', 'datadate', 'fyear', 'indfmt', 'consol', 'popsrc', 'datafmt',
       'tic', 'cusip', 'conm', 'curcd', 'fyr', 'act', 'ap', 'at', 'ceq', 'che',
       'cogs', 'csho', 'dlc', 'dltis', 'dltt', 'dp', 'ib', 'invt', 'ivao',
       'ivst', 'lct', 'lt', 'ni', 'ppegt', 'ppent', 'pstk', 're', 'rect',
       'sale', 'sstk', 'txp', 'txt', 'xint', 'cik', 'costat', 'prcc_f',
       'conml', 'sic', 'Bank', 'dch_wc', 'ch_rsst', 'dch_rec', 'dch_inv',
       'soft_assets', 'ch_cs', 'ch_cm', 'ch_roa', 'issue', 'bm', 'dpi', 'reoa',
       'EBIT', 'ch_fcf', 'ID', 'CONAME', 'CIK', 'GVKEY', 'PERMNO', 'TICKER',
       'CNUM', 'YEARA', 'P_AAER', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11',
       'misstate'],
      dtype='object')