In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the merged Compustat + misstatement-label CSV for exploratory inspection.
df = pd.read_csv('./data/merged_compustat_and_labels.csv')

In [3]:
# List the column labels of the loaded frame.
df.columns

Index(['gvkey', 'datadate', 'fyear', 'indfmt', 'consol', 'popsrc', 'datafmt',
       'tic', 'cusip', 'conm', 'curcd', 'fyr', 'act', 'ap', 'at', 'ceq', 'che',
       'cogs', 'csho', 'dlc', 'dltis', 'dltt', 'dp', 'ib', 'invt', 'ivao',
       'ivst', 'lct', 'lt', 'ni', 'ppegt', 'ppent', 'pstk', 're', 'rect',
       'sale', 'sstk', 'txp', 'txt', 'xint', 'cik', 'costat', 'prcc_f',
       'conml', 'sic', 'Bank', 'dch_wc', 'ch_rsst', 'dch_rec', 'dch_inv',
       'soft_assets', 'ch_cs', 'ch_cm', 'ch_roa', 'issue', 'bm', 'dpi', 'reoa',
       'EBIT', 'ch_fcf', 'misstate'],
      dtype='object')

In [4]:
# Preview the first three rows.
df.head(3)

Unnamed: 0,gvkey,datadate,fyear,indfmt,consol,popsrc,datafmt,tic,cusip,conm,...,ch_cs,ch_cm,ch_roa,issue,bm,dpi,reoa,EBIT,ch_fcf,misstate
0,1003,1990-01-31,1989,INDL,C,D,STD,ANTQ,354100,A.A. IMPORTING CO INC,...,,,,1,-1.240403,,-0.403403,-0.087941,,0
1,1004,1990-05-31,1989,INDL,C,D,STD,AIR,361105,AAR CORP,...,,,,1,0.554652,5.380405,0.24052,0.123916,27.257486,0
2,1004,1991-05-31,1990,INDL,C,D,STD,AIR,361105,AAR CORP,...,0.32954,0.297848,-0.090196,0,0.863306,0.882711,0.262695,0.082704,-10.87404,0


In [5]:
# Class balance of the `misstate` label (count of rows per label value).
df['misstate'].value_counts()

0    266737
1      1376
Name: misstate, dtype: int64

In [6]:
# Sum of the `misstate` column for each distinct `fyear` value.
df.groupby('fyear')['misstate'].sum()

fyear
1989      5
1990     22
1991     38
1992     36
1993     40
1994     31
1995     36
1996     42
1997     57
1998     72
1999    103
2000    126
2001    124
2002    104
2003     87
2004     70
2005     55
2006     37
2007     36
2008     29
2009     36
2010     34
2011     27
2012     32
2013     26
2014     20
2015     14
2016     18
2017     11
2018      6
2019      2
2020      0
2021      0
2022      0
2023      0
Name: misstate, dtype: int64

#### Retrain all of them

In [5]:
import pandas as pd
import numpy as np
from MLP.utils import datasets,plot_table,train_model
from sklearn import metrics
from imblearn.under_sampling import RandomUnderSampler
from tabulate import tabulate
import matplotlib.pyplot as plt
from fraudDetec.data_processing import DataProcessor
from fraudDetec.utils import evaluate,null_check,results
from fraudDetec.models import rus_boost, svm_model, xgb_model, logistic_regression_model,probit_regression_model,MLP,mlp_grid_search, random_forests
import json
from collections import defaultdict

In [6]:
# Load the modelling frame. This reads the same CSV path that the exploratory
# cells load into `df`; `data` is the copy that gets cleaned and modelled below.
data = pd.read_csv('./data/merged_compustat_and_labels.csv')

In [7]:
# Load the feature-set definitions. Later cells call .keys() on this object and
# index it by key, passing each value to results() as the feature list.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open('MLP/features.json', encoding='utf-8') as json_file:
    features_comp = json.load(json_file)

In [17]:
# Alias: `data_items` is the name the training loops iterate over.
data_items = features_comp

In [5]:
import numpy as np

# Display label -> model object imported from fraudDetec.models.
# The batch loop looks each entry up by label and hands it to results().
models = dict([
    ("MLP", MLP),
    ("RUS BOOST", rus_boost),
    ("Logit", logistic_regression_model),
    ("Probit", probit_regression_model),
    ("Xg Boost", xgb_model),
])

In [6]:
# Class balance of the `misstate` label in the modelling frame.
data['misstate'].value_counts()

0    266737
1      1376
Name: misstate, dtype: int64

In [9]:
# Convert +inf / -inf to NaN so they are handled together with other missing
# values. The result is reassigned instead of using inplace=True, so the cell
# does not mutate the frame object in place.
data = data.replace([np.inf, -np.inf], np.nan)

In [10]:
# Replace every NaN with 0. This applies to all columns of `data`, not only to
# the columns later used as model features.
data = data.fillna(0)

In [11]:
# Wrap the cleaned frame in the project's DataProcessor; `data_obj` is what the
# later cells pass to results() and use for create_batches().
# NOTE(review): the positional tuples look like train / validation / test year
# ranges and the trailing 5 like a window length (create_batches() later yields
# 5-year windows) — confirm against DataProcessor's signature.
# NOTE(review): the (2003,2014) range here differs from the (2003,2019) test
# period used by the cells below — confirm which one is intended.
data_obj = DataProcessor(data,(1990,2002), (2002,2002), (2003,2014), 5)

In [12]:
# Ask the processor for train / validation / test frames for the (1990,2002)
# and (2003,2019) periods.
# NOTE(review): none of these three frames is referenced by the later cells
# visible here (they pass `data_obj` to results() instead) — confirm this cell
# is still needed.
train_data, validation_data, test_data = data_obj.split_data_periods((1990,2002), (2003,2019))

### Batch Processing

Batch:
		Train : (1990,2002)
		Test :  (2003,2019)
		
		Run on undersampled data: Bao, XgBoost, probit, logit, MLP
		
		Oversample data and run all models: Bao, XgBoost,  probit, logit, MLP

In [9]:
# Batch experiment periods: one training period and a list of test periods.
# The batch cells below iterate over `test_periods` and read `train_period`
# as a global.
test_periods = [(2003,2019)]
train_period = (1990,2002)

In [20]:
# Batch run: for every test period, model and feature set, call results()
# with sample='under' and store the returned score as
# res[test_period][feature_set][model].
data_items = features_comp
res = defaultdict(lambda: defaultdict(dict))
for test_period in test_periods:
    for model, model_fn in models.items():
        for data_item, feature_cols in data_items.items():
            auc = results(
                data_obj,
                train_period,
                test_period,
                feature_cols,
                model_fn,
                sample='under',
            )
            res[test_period][data_item][model] = auc

Optimization terminated successfully.
         Current function value: 0.560725
         Iterations 10
Optimization terminated successfully.
         Current function value: 0.598600
         Iterations 10
Optimization terminated successfully.
         Current function value: 0.624152
         Iterations 8
Test AUC Score: 0.6701160290397631
Test AUC Score: 0.6575388747299562
Test AUC Score: 0.6679662459356145


In [21]:
# Display the nested scores: test period -> feature set -> model.
res

defaultdict(<function __main__.<lambda>()>,
            {(2003,
              2019): defaultdict(dict,
                         {'features': {'MLP': 0.6298421471947206,
                           'RUS BOOST': 0.7045458782070577,
                           'Logit': 0.628926424902149,
                           'Probit': 0.649334366905736,
                           'Xg Boost': 0.6701160290397631},
                          'raw_financial_items_28': {'MLP': 0.6099790495289557,
                           'RUS BOOST': 0.697365035985061,
                           'Logit': 0.6217356536172955,
                           'Probit': 0.6248934369636598,
                           'Xg Boost': 0.6575388747299562},
                          'financial_ratios_14': {'MLP': 0.6017435226593385,
                           'RUS BOOST': 0.6727191206145153,
                           'Logit': 0.6319123493795301,
                           'Probit': 0.6327112753191103,
                           'Xg Boost':

In [22]:
## MLP

# Grid search over MLP hyperparameters for each feature set, with
# sample='under'. Scores are stored as res[test_period][feature_set].
# Every candidate architecture starts with a layer as wide as the feature set.
# NOTE(review): if mlp_grid_search forwards this grid to scikit-learn's
# MLPClassifier, hidden_layer_sizes describes hidden layers only, so the
# leading element adds an extra hidden layer rather than declaring the input
# width — confirm intent.
hidden_tails = [
    (40,),
    (50,),
    (40, 50),
    (30, 40),
    (40, 30, 50),
    (30, 40, 30),
    (40, 50, 60, 40),
    (50, 50, 50, 50),
    (30, 30, 30, 30),
]

res = defaultdict(lambda: defaultdict(dict))
for test_period in test_periods:
    for data_item, feature_cols in data_items.items():
        n_features = len(feature_cols)
        param_grid = {
            'activation': ['logistic', 'tanh', 'relu'],
            'hidden_layer_sizes': [(n_features, *tail) for tail in hidden_tails],
            'learning_rate_init': [0.001, 0.01, 0.1],
        }
        auc = results(
            data_obj,
            train_period,
            test_period,
            feature_cols,
            mlp_grid_search,
            param_grid,
            sample='under',
        )
        res[test_period][data_item] = auc

Best Hyperparameters: {'activation': 'logistic', 'hidden_layer_sizes': (42, 40, 50), 'learning_rate_init': 0.001}
Best AUC Score: 0.6759879935295615
Test AUC Score: 0.6645966500024637
Best Hyperparameters: {'activation': 'logistic', 'hidden_layer_sizes': (28, 30, 30, 30, 30), 'learning_rate_init': 0.001}
Best AUC Score: 0.6916270449375599
Test AUC Score: 0.6498537355538816
Best Hyperparameters: {'activation': 'logistic', 'hidden_layer_sizes': (14, 30, 40), 'learning_rate_init': 0.001}
Best AUC Score: 0.6887016357762049
Test AUC Score: 0.6433061938948582


In [23]:
# Display the grid-search scores: test period -> feature set.
res

defaultdict(<function __main__.<lambda>()>,
            {(2003,
              2019): defaultdict(dict,
                         {'features': 0.6645966500024637,
                          'raw_financial_items_28': 0.6498537355538816,
                          'financial_ratios_14': 0.6433061938948582})})

In [24]:
#MLP2

# Second MLP grid search: different hidden-layer candidates and a wider
# learning-rate list. Scores are stored as res[test_period][feature_set].
# Unlike the preceding grid-search cell, this call does not pass `sample=`,
# so it relies on whatever default results() uses.
hidden_tails = [
    (20,),
    (30,),
    (40, 50, 10),
    (30, 40, 10),
    (40, 30, 50, 20),
    (30, 40),
    (40, 50, 40, 40, 20),
    (30, 50, 50, 40, 45),
    (20, 30, 40, 50, 20),
]

res = defaultdict(lambda: defaultdict(dict))
for test_period in test_periods:
    for data_item, feature_cols in data_items.items():
        n_features = len(feature_cols)
        param_grid = {
            'activation': ['logistic', 'tanh', 'relu'],
            'hidden_layer_sizes': [(n_features, *tail) for tail in hidden_tails],
            'learning_rate_init': [0.001, 0.01, 0.05, 0.09, 0.1, 0.5],
        }
        auc = results(
            data_obj,
            train_period,
            test_period,
            feature_cols,
            mlp_grid_search,
            param_grid,
        )
        res[test_period][data_item] = auc

Best Hyperparameters: {'activation': 'logistic', 'hidden_layer_sizes': (42, 20), 'learning_rate_init': 0.001}
Best AUC Score: 0.6806870364366346
Test AUC Score: 0.6416253475172
Best Hyperparameters: {'activation': 'logistic', 'hidden_layer_sizes': (28, 20), 'learning_rate_init': 0.001}
Best AUC Score: 0.7010594212938617
Test AUC Score: 0.6536590897082667
Best Hyperparameters: {'activation': 'logistic', 'hidden_layer_sizes': (14, 30, 40), 'learning_rate_init': 0.001}
Best AUC Score: 0.6887016357762049
Test AUC Score: 0.6433061938948582


In [25]:
# Display the second grid-search scores: test period -> feature set.
res

defaultdict(<function __main__.<lambda>()>,
            {(2003,
              2019): defaultdict(dict,
                         {'features': 0.6416253475172,
                          'raw_financial_items_28': 0.6536590897082667,
                          'financial_ratios_14': 0.6433061938948582})})

In [26]:
#XgBoost
# Grid over tree depth and learning rate for xgb_model, once per feature set.
# Scores are stored as res[test_period][feature_set].
res = defaultdict(lambda: defaultdict(dict))
print("XgBoost Results:))")
for test_period in test_periods:
    for data_item, feature_cols in data_items.items():
        param_grid = dict(
            max_depth=[3, 4, 5],
            learning_rate=[0.001, 0.01, 0.05, 0.09, 0.1, 0.5],
        )
        auc = results(
            data_obj,
            train_period,
            test_period,
            feature_cols,
            xgb_model,
            param_grid,
        )
        res[test_period][data_item] = auc

XgBoost Results:))
Best Hyperparameters: {'learning_rate': 0.05, 'max_depth': 4}
Best AUC Score: 0.7338142991126182
Test AUC Score: 0.6544875579492166
Best Hyperparameters: {'learning_rate': 0.05, 'max_depth': 4}
Best AUC Score: 0.7245860362306307
Test AUC Score: 0.6473188042499356
Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 4}
Best AUC Score: 0.7334656351399393
Test AUC Score: 0.6770338995125409


In [27]:
# Display the XgBoost grid scores: test period -> feature set.
res

defaultdict(<function __main__.<lambda>()>,
            {(2003,
              2019): defaultdict(dict,
                         {'features': 0.6544875579492166,
                          'raw_financial_items_28': 0.6473188042499356,
                          'financial_ratios_14': 0.6770338995125409})})

In [28]:
#RandomForest
# Grid over forest size, depth and split/leaf minimums for random_forests,
# once per feature set. Scores are stored as res[test_period][feature_set].
# NOTE(review): the saved output of this cell reports min_samples_leaf values
# (2, 4, 1) and a min_samples_split of 2 that are not in this grid, so the
# output was presumably produced with a different grid, or random_forests does
# not use the grid passed here — confirm and re-run.
res = defaultdict(lambda: defaultdict(dict))
print("Random Forest Results:))")
for test_period in test_periods:
    for data_item, feature_cols in data_items.items():
        param_grid = dict(
            n_estimators=[100, 200, 300, 400, 500],  # number of trees
            max_depth=[10, 20],                      # maximum tree depth
            min_samples_split=[5, 10],               # min samples to split a node
            min_samples_leaf=[5, 6],                 # min samples at a leaf
        )
        auc = results(
            data_obj,
            train_period,
            test_period,
            feature_cols,
            random_forests,
            param_grid,
        )
        res[test_period][data_item] = auc

Random Forest Results:))
Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Best AUC Score: 0.7545736803356562
Test AUC Score: 0.7089564154213039
Best Hyperparameters: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Best AUC Score: 0.7515501150406402
Test AUC Score: 0.700872533095392
Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Best AUC Score: 0.7422057467297949
Test AUC Score: 0.6946625573110257


In [29]:
# Display the Random Forest grid scores: test period -> feature set.
res

defaultdict(<function __main__.<lambda>()>,
            {(2003,
              2019): defaultdict(dict,
                         {'features': 0.7089564154213039,
                          'raw_financial_items_28': 0.700872533095392,
                          'financial_ratios_14': 0.6946625573110257})})

### Batch - Undersample

|        Model           |  MLP   |  MLP2  |  MLP3  | RUS BOOST |  Logit |  Probit | Xg Boost |  Xgb2  | Random Forest |
|------------------------|--------|--------|--------|-----------|--------|---------|----------|--------|---------------|
|      Features          | 0.6298 | 0.6646 | 0.6416 |   0.7045  | 0.6289 | 0.6493  |  0.6701  | 0.6545 |    0.7090     |
| raw_financial_items_28 | 0.6100 | 0.6499 | 0.6537 |   0.6974  | 0.6217 | 0.6249  |  0.6575  | 0.6473 |    0.7009     |
| financial_ratios_14    | 0.6017 | 0.6433 | 0.6433 |   0.6727  | 0.6319 | 0.6327  |  0.6680  | 0.6770 |    0.6947     |


---
## Window

In [12]:
# Build the rolling train/test windows and list the pairs that will be used.
# The final entry of each list is skipped ([:-1]), matching the training loop.
train_batches, test_batches = data_obj.create_batches()

# Dedicated loop names are used so this preview does not overwrite the
# batch-level `train_period` global. The batch cells read that global, so
# re-running one of them after this cell would otherwise silently train on
# the last window.
for window_train, window_test in zip(train_batches[:-1], test_batches[:-1]):
    print(window_train, window_test)

(1990, 1994) (1995, 1999)
(1995, 1999) (2000, 2004)
(2000, 2004) (2005, 2009)
(2005, 2009) (2010, 2014)
(2010, 2014) (2015, 2019)


In [13]:
# Display label -> model object imported from fraudDetec.models, used by the
# rolling-window loop. Same entries as the batch mapping plus "RF".
models = dict([
    ("MLP", MLP),
    ("RUS BOOST", rus_boost),
    ("Logit", logistic_regression_model),
    ("Probit", probit_regression_model),
    ("Xg Boost", xgb_model),
    ("RF", random_forests),
])

In [18]:
res = defaultdict(lambda: defaultdict(dict))
for train_period,test_period in zip(train_batches[:-1],test_batches[:-1]):
    for model in models.keys():
        for data_item in data_items.keys():
            auc = results(data_obj,train_period,test_period,data_items[data_item],models[model])
            res[str(train_period) + '-' +str(test_period)][data_item][model] = auc

Train Shape:  (334, 42) (334,)
Test Shape:  (51517, 42) (51517,)
Train Shape:  (334, 28) (334,)
Test Shape:  (51517, 28) (51517,)
Train Shape:  (334, 14) (334,)
Test Shape:  (51517, 14) (51517,)
Train Shape:  (334, 42) (334,)
Test Shape:  (51517, 42) (51517,)
Train Shape:  (334, 28) (334,)
Test Shape:  (51517, 28) (51517,)
Train Shape:  (334, 14) (334,)
Test Shape:  (51517, 14) (51517,)
Train Shape:  (334, 42) (334,)
Test Shape:  (51517, 42) (51517,)
Train Shape:  (334, 28) (334,)
Test Shape:  (51517, 28) (51517,)
Train Shape:  (334, 14) (334,)
Test Shape:  (51517, 14) (51517,)
Train Shape:  (334, 42) (334,)
Test Shape:  (51517, 42) (51517,)
Optimization terminated successfully.
         Current function value: 0.532998
         Iterations 13
Train Shape:  (334, 28) (334,)
Test Shape:  (51517, 28) (51517,)
Optimization terminated successfully.
         Current function value: 0.581524
         Iterations 12
Train Shape:  (334, 14) (334,)
Test Shape:  (51517, 14) (51517,)
Optimization t

Test AUC Score: 0.6750154731407216
Train Shape:  (278, 14) (278,)
Test Shape:  (33759, 14) (33759,)
Test AUC Score: 0.6623350598100876
Train Shape:  (278, 42) (278,)
Test Shape:  (33759, 42) (33759,)
Random Forest Test AUC Score: 0.7175846427333245
Train Shape:  (278, 28) (278,)
Test Shape:  (33759, 28) (33759,)
Random Forest Test AUC Score: 0.7020713649171547
Train Shape:  (278, 14) (278,)
Test Shape:  (33759, 14) (33759,)
Random Forest Test AUC Score: 0.6733093557821848


In [19]:
# Display the rolling-window scores: "<train>-<test>" -> feature set -> model.
res

defaultdict(<function __main__.<lambda>()>,
            {'(1990, 1994)-(1995, 1999)': defaultdict(dict,
                         {'features': {'MLP': 0.575236941521982,
                           'RUS BOOST': 0.6982953439455417,
                           'Logit': 0.6122691454104373,
                           'Probit': 0.6090246923146219,
                           'Xg Boost': 0.6629685835542898,
                           'RF': 0.6694691753962568},
                          'raw_financial_items_28': {'MLP': 0.6270016952067415,
                           'RUS BOOST': 0.6999190193881002,
                           'Logit': 0.5899124804635455,
                           'Probit': 0.5879531339276322,
                           'Xg Boost': 0.6248497716731016,
                           'RF': 0.6640008264999051},
                          'financial_ratios_14': {'MLP': 0.5,
                           'RUS BOOST': 0.6008389100028536,
                           'Logit': 0.6180479672323026,
 

In [20]:
# Print one table per train/test window: rows are feature sets, columns are
# models, values are the stored scores rounded to 3 decimals.
#
# Changes from the previous version of this cell:
# - Columns are relabelled by name ('Xg Boost' -> 'XgBoost') instead of by
#   position, so adding or removing a model no longer raises a length mismatch.
# - DataFrame.round() replaces the per-column try / bare-except, so unexpected
#   errors are no longer silently swallowed.
# - The table is bound to `auc_table`, so the `df` frame loaded at the top of
#   the notebook is no longer overwritten.
for key, window_scores in res.items():
    title = "Train - Test : " + str(key)
    auc_table = (
        pd.DataFrame.from_dict(window_scores, orient='index')
        .rename(columns={'Xg Boost': 'XgBoost'})
        .rename_axis(title)  # the index name becomes the first header cell
        .round(3)
    )
    print(tabulate(auc_table, headers='keys', tablefmt='fancy_grid'))

╒════════════════════════════════════════════╤═══════╤═════════════╤═════════╤══════════╤═══════════╤═══════╕
│ Train - Test : (1990, 1994)-(1995, 1999)   │   MLP │   RUS BOOST │   Logit │   Probit │   XgBoost │    RF │
╞════════════════════════════════════════════╪═══════╪═════════════╪═════════╪══════════╪═══════════╪═══════╡
│ features                                   │ 0.575 │       0.698 │   0.612 │    0.609 │     0.663 │ 0.669 │
├────────────────────────────────────────────┼───────┼─────────────┼─────────┼──────────┼───────────┼───────┤
│ raw_financial_items_28                     │ 0.627 │       0.7   │   0.59  │    0.588 │     0.625 │ 0.664 │
├────────────────────────────────────────────┼───────┼─────────────┼─────────┼──────────┼───────────┼───────┤
│ financial_ratios_14                        │ 0.5   │       0.601 │   0.618 │    0.612 │     0.572 │ 0.617 │
╘════════════════════════════════════════════╧═══════╧═════════════╧═════════╧══════════╧═══════════╧═══════╛
╒═════════

### Window - Undersample

| Time Period                    | Subcategory           | MLP            | RUS BOOST      | Logit          | Probit         | Xg Boost       | RF             |
|--------------------------------|-----------------------|----------------|----------------|----------------|----------------|----------------|----------------|
| (1990, 1994)-(1995, 1999)      | Features              | 0.5752369415   | 0.6982953439   | 0.6122691454   | 0.6090246923   | 0.6629685836   | 0.6694691754   |
|                                | Raw Financial Items 28| 0.6270016952   | 0.6999190194   | 0.5899124805   | 0.5879531339   | 0.6248497717   | 0.6640008265   |
|                                | Financial Ratios 14   | 0.5            | 0.6008389100   | 0.6180479672   | 0.6115873775   | 0.5721028249   | 0.6170945316   |
| (1995, 1999)-(2000, 2004)      | Features              | 0.6133018418   | 0.7955036145   | 0.7258620291   | 0.7281387068   | 0.7759764111   | 0.7944790597   |
|                                | Raw Financial Items 28| 0.6175584749   | 0.8060060218   | 0.7141303553   | 0.7095692220   | 0.7912637284   | 0.7992256011   |
|                                | Financial Ratios 14   | 0.5            | 0.6727730345   | 0.7021960930   | 0.6989039813   | 0.7006060843   | 0.7150423176   |
| (2000, 2004)-(2005, 2009)      | Features              | 0.6012367986   | 0.6962965495   | 0.6066864940   | 0.6050117178   | 0.6589791928   | 0.7047521398   |
|                                | Raw Financial Items 28| 0.5278267771   | 0.6974136570   | 0.5814215878   | 0.6025402723   | 0.6422978887   | 0.7041807168   |
|                                | Financial Ratios 14   | 0.5744455555   | 0.6816953016   | 0.5994519187   | 0.5972939660   | 0.6583317797   | 0.6724830115   |
| (2005, 2009)-(2010, 2014)      | Features              | 0.5350654922   | 0.7141296804   | 0.5708756098   | 0.5718106416   | 0.7009566325   | 0.6983130426   |
|                                | Raw Financial Items 28| 0.6042448443   | 0.6821197921   | 0.5449914297   | 0.5537132262   | 0.6609496723   | 0.6895493447   |
|                                | Financial Ratios 14   | 0.5            | 0.6363468358   | 0.6209751132   | 0.6054491853   | 0.6207767064   | 0.6629255395   |
| (2010, 2014)-(2015, 2019)      | Features              | 0.5            | 0.7250859166   | 0.5696692703   | 0.5839441152   | 0.7176320510   | 0.7175846427   |
|                                | Raw Financial Items 28| 0.6537774241   | 0.6599806411   | 0.6158595039   | 0.6166753340   | 0.6750154731   | 0.7020713649   |
|                                | Financial Ratios 14   | 0.5            | 0.6706600167   | 0.6341044309   | 0.6476748407   | 0.6623350598   | 0.6733093558   |
