In [1]:
import json

import numpy as np
import pandas as pd

from results.data_processing import DataProcessor
from results.utils import evaluate,null_check,results
from results.models import rus_boost, svm_model, xgb_model, logistic_regression_model, probit_regression_model

In [2]:
# Read the fraud-detection dataset into a DataFrame. The path is relative, so the
# notebook has to be run from the directory that contains `data/`.
data = pd.read_csv('data/data_FraudDetection_JAR2020.csv')

In [3]:
# Wrap the raw frame in the project's DataProcessor; every results() call below takes it.
# NOTE(review): the three year tuples are presumably the train / validation / test ranges
# and 5 the window length in years — confirm against results.data_processing.
data_obj = DataProcessor(data,(1991,1999), (2000,2001), (2003,2014), 5)

In [4]:
# Load the feature-set definitions: a mapping whose keys are printed as feature-set
# labels and whose values are passed to results() as the feature selection.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the file is decoded
# the same way on every platform instead of depending on the locale default.
with open('results/features.json', encoding='utf-8') as features_file:
    features_comp = json.load(features_file)

## Bao RusBoost

In [10]:
# Test periods to evaluate; all four start in 2003 and end progressively later.
test_periods = [
    (2003, 2005),
    (2003, 2008),
    (2003, 2011),
    (2003, 2014),
]

# Display label -> model callable handed to results().
models = {
    "RUS BOOST": rus_boost,
    "Logit": logistic_regression_model,
}

# Feature sets loaded from results/features.json (label -> feature selection).
data_items = features_comp

In [11]:
# Evaluate every model on every (test period, feature set) combination while keeping
# the training window fixed, printing whatever results() returns for each run.
# The recorded output shows the train/test shapes being printed during each results() call.
# NOTE(review): the printed number is presumably an AUC score — confirm in results.utils.
train_period = (1991, 1999)

for model, model_fn in models.items():
    print("MODEL: ", model)
    print("---"*25)
    for test_period in test_periods:
        print("Test period: ", test_period)
        print("---"*10)
        for data_item, feature_cols in data_items.items():
            print(data_item)
            print("---"*5)
            # Run first, then print: keeps the shape lines above the returned value.
            outcome = results(data_obj, train_period, test_period, feature_cols, model_fn)
            print(outcome)
            print("---"*20)
        print("----"*20)

MODEL:  RUS BOOST
---------------------------------------------------------------------------
Test period:  (2003, 2005)
------------------------------
28 Raw Financial Items + 14 Financial Ratios
---------------
Training data Shape after sampling: (596, 42)
Test data Shape: (15724, 42)
0.6567676897518147
------------------------------------------------------------
28 Raw Financial Items
---------------
Training data Shape after sampling: (664, 28)
Test data Shape: (17778, 28)
0.7166485262687932
------------------------------------------------------------
14 Financial Ratios
---------------
Training data Shape after sampling: (596, 14)
Test data Shape: (15724, 14)
0.6226126552497141
------------------------------------------------------------
--------------------------------------------------------------------------------
Test period:  (2003, 2008)
------------------------------
28 Raw Financial Items + 14 Financial Ratios
---------------
Training data Shape after sampling: (596, 42)
T



0.5678999000870587
------------------------------------------------------------
28 Raw Financial Items
---------------
Training data Shape after sampling: (664, 28)
Test data Shape: (17778, 28)
0.6520753693904563
------------------------------------------------------------
14 Financial Ratios
---------------
Training data Shape after sampling: (596, 14)
Test data Shape: (15724, 14)
0.6108498053353121
------------------------------------------------------------
--------------------------------------------------------------------------------
Test period:  (2003, 2008)
------------------------------
28 Raw Financial Items + 14 Financial Ratios
---------------
Training data Shape after sampling: (596, 42)
Test data Shape: (30777, 42)




0.5673432698922265
------------------------------------------------------------
28 Raw Financial Items
---------------
Training data Shape after sampling: (664, 28)
Test data Shape: (35166, 28)
0.6012622657777734
------------------------------------------------------------
14 Financial Ratios
---------------
Training data Shape after sampling: (596, 14)
Test data Shape: (30777, 14)
0.6239855865299572
------------------------------------------------------------
--------------------------------------------------------------------------------
Test period:  (2003, 2011)
------------------------------
28 Raw Financial Items + 14 Financial Ratios
---------------
Training data Shape after sampling: (596, 42)
Test data Shape: (44674, 42)




0.5658260961568815
------------------------------------------------------------
28 Raw Financial Items
---------------
Training data Shape after sampling: (664, 28)
Test data Shape: (51326, 28)
0.5785436775977312
------------------------------------------------------------
14 Financial Ratios
---------------
Training data Shape after sampling: (596, 14)
Test data Shape: (44674, 14)
0.6251586281539605
------------------------------------------------------------
--------------------------------------------------------------------------------
Test period:  (2003, 2014)
------------------------------
28 Raw Financial Items + 14 Financial Ratios
---------------
Training data Shape after sampling: (596, 42)
Test data Shape: (58405, 42)




0.5611472124682287
------------------------------------------------------------
28 Raw Financial Items
---------------
Training data Shape after sampling: (664, 28)
Test data Shape: (68230, 28)
0.5688908717160726
------------------------------------------------------------
14 Financial Ratios
---------------
Training data Shape after sampling: (596, 14)
Test data Shape: (58405, 14)
0.6376537315587756
------------------------------------------------------------
--------------------------------------------------------------------------------


------------

## SVM - Beneish

In [12]:
# Load the pre-computed Beneish ratios and wrap them in a DataProcessor.
# numpy comes from the notebook's import cell, so it is no longer re-imported here.
# NOTE(review): this path starts with "Data/" while the fraud dataset is read from
# "data/" — confirm the directory's case; it matters on case-sensitive filesystems.
ben_data = (
    pd.read_csv("Data/Beneish_scores_final.csv")
    # The CSV spells the column 'Mistate'; rename it to 'misstate'
    # (presumably the name DataProcessor expects — TODO confirm).
    .rename(columns={'Mistate':'misstate'})
    # Replace +/-infinity with NaN so infinite values are treated as missing.
    .replace([np.inf,-np.inf],np.nan)
)

# Same constructor arguments as the DataProcessor built for the main dataset.
ben_obj = DataProcessor(ben_data,(1991,1999), (2000,2001), (2003,2014), 5)

In [13]:
# Feature columns taken from the Beneish data (the eight M-score ratios).
m_col = [
    'dsri', 'gmi', 'aqi', 'sgi',
    'depi', 'sgai', 'lvgi', 'tata',
]

# Test periods to evaluate; all four start in 2003 and end progressively later.
test_periods = [
    (2003, 2005),
    (2003, 2008),
    (2003, 2011),
    (2003, 2014),
]

# Display label -> model callable handed to results().
models = {
    "SVM": svm_model,
    "LOGIT": logistic_regression_model,
    "PROBIT": probit_regression_model,
}

# A single feature set for this section: the M-score columns above.
data_items = {"Calculated M score Feat": m_col}

# Fixed training window; every model is run once per test period on the Beneish data.
train_period = (1991, 1999)

for model, model_fn in models.items():
    print("MODEL: ", model)
    print("---"*25)
    for test_period in test_periods:
        print("Test period: ", test_period)
        print("---"*10)
        for data_item, feature_cols in data_items.items():
            print(data_item)
            print("---"*5)
            # Run first, then print the returned value.
            outcome = results(ben_obj, train_period, test_period, feature_cols, model_fn)
            print(outcome)
            print("---"*20)
        print("----"*20)

MODEL:  SVM
---------------------------------------------------------------------------
Test period:  (2003, 2005)
------------------------------
Calculated M score Feat
---------------
Training data Shape after sampling: (572, 8)
Test data Shape: (17602, 8)
0.5015771061535814
------------------------------------------------------------
--------------------------------------------------------------------------------
Test period:  (2003, 2008)
------------------------------
Calculated M score Feat
---------------
Training data Shape after sampling: (572, 8)
Test data Shape: (33425, 8)
0.5015824215571029
------------------------------------------------------------
--------------------------------------------------------------------------------
Test period:  (2003, 2011)
------------------------------
Calculated M score Feat
---------------
Training data Shape after sampling: (572, 8)
Test data Shape: (47800, 8)
0.5016321974643023
----------------------------------------------------------

## MLP

* Refer to the other notebooks for the code.

| Input Var | Method  | Neurons        | Activation Func | Learning rate | AUC     |
|-----------|---------|----------------|-----------------|---------------|---------|
| 28 Raw Financial Items | MLP - 1 | 70 | Logistic | 0.003 | 0.6627 |
|  | MLP - 2 | (40,60) | Logistic | 0.003 | 0.627567 |
|  | MLP - 4 | (40,50,60,40) | Logistic | 0.003 | 0.629537 |
|  | CNN |  |  |  | 0.6512 |
| 28 Raw Financial Items + 14 Financial Ratios | MLP - 1 | 70 | Logistic | 0.005 | 0.648682 |
|  | MLP - 2 | (40,60) | Logistic | 0.003 | 0.62822 |
|  | MLP - 4 | (40,60,50,40) | Logistic | 0.003 | 0.657703 |
|  | CNN |  |  |  | 0.5679 |


-------
------------

## Window Processing

## Bao RusBoost

In [7]:
# Ask the DataProcessor for its train and test windows; the two sequences are
# zipped together below, one (train, test) pair per run.
# NOTE(review): window length presumably comes from the 5 passed to DataProcessor — confirm.
tr_batches,test_batches = data_obj.create_batches()

In [8]:
# Display the generated training windows as a quick sanity check.
tr_batches

[(1991, 1996), (1996, 2001), (2001, 2006), (2006, 2011), (2011, 2016)]

In [9]:
# Display label -> model callable handed to results().
models = {
    "RUS BOOST": rus_boost,
    "Logit": logistic_regression_model,
}

# Feature sets loaded from results/features.json (label -> feature selection).
data_items = features_comp

# Rolling-window evaluation: each training window is paired with the test window at
# the same position, and every model / feature-set combination is run on that pair.
# NOTE(review): the recorded output includes a (2018, 2023) test window with 0 test
# rows, where "Done" is printed instead of a score — consider skipping empty windows.
for train_period, test_period in zip(tr_batches, test_batches):
    for model, model_fn in models.items():
        print("MODEL: ", model)
        print("---"*25)
        print("Train period: ",train_period," Test period: ",  test_period)
        print("---"*10)
        for data_item, feature_cols in data_items.items():
            print(data_item)
            print("---"*5)
            # Run first, then print the returned value.
            outcome = results(data_obj, train_period, test_period, feature_cols, model_fn)
            print(outcome)
            print("---"*20)
        print("----"*20)

MODEL:  RUS BOOST
---------------------------------------------------------------------------
Train period:  (1991, 1996)  Test period:  (2003, 2008)
------------------------------
28 Raw Financial Items + 14 Financial Ratios
---------------
Training data Shape after sampling: (278, 42)
Test data Shape: (30777, 42)
0.6069344514691912
------------------------------------------------------------
28 Raw Financial Items
---------------
Training data Shape after sampling: (322, 28)
Test data Shape: (35166, 28)
0.5738300071183908
------------------------------------------------------------
14 Financial Ratios
---------------
Training data Shape after sampling: (278, 14)
Test data Shape: (30777, 14)
0.5917679431323091
------------------------------------------------------------
--------------------------------------------------------------------------------
MODEL:  Logit
---------------------------------------------------------------------------
Train period:  (1991, 1996)  Test period:  (200



0.500885707734137
------------------------------------------------------------
28 Raw Financial Items
---------------
Training data Shape after sampling: (322, 28)
Test data Shape: (35166, 28)
0.46391903365511533
------------------------------------------------------------
14 Financial Ratios
---------------
Training data Shape after sampling: (278, 14)
Test data Shape: (30777, 14)
0.6070844825891832
------------------------------------------------------------
--------------------------------------------------------------------------------
MODEL:  RUS BOOST
---------------------------------------------------------------------------
Train period:  (1996, 2001)  Test period:  (2008, 2013)
------------------------------
28 Raw Financial Items + 14 Financial Ratios
---------------
Training data Shape after sampling: (706, 42)
Test data Shape: (27771, 42)
0.682075647974187
------------------------------------------------------------
28 Raw Financial Items
---------------
Training data Shape



0.5918161392826911
------------------------------------------------------------
28 Raw Financial Items
---------------
Training data Shape after sampling: (742, 28)
Test data Shape: (33049, 28)




0.6302554521936875
------------------------------------------------------------
14 Financial Ratios
---------------
Training data Shape after sampling: (706, 14)
Test data Shape: (27771, 14)
0.6650692095335862
------------------------------------------------------------
--------------------------------------------------------------------------------
MODEL:  RUS BOOST
---------------------------------------------------------------------------
Train period:  (2001, 2006)  Test period:  (2013, 2018)
------------------------------
28 Raw Financial Items + 14 Financial Ratios
---------------
Training data Shape after sampling: (706, 42)
Test data Shape: (9255, 42)
0.7244345850016232
------------------------------------------------------------
28 Raw Financial Items
---------------
Training data Shape after sampling: (726, 28)
Test data Shape: (11274, 28)
0.7340912455221008
------------------------------------------------------------
14 Financial Ratios
---------------
Training data Shape af



0.7278123888880301
------------------------------------------------------------
28 Raw Financial Items
---------------
Training data Shape after sampling: (726, 28)
Test data Shape: (11274, 28)




0.6937561062261302
------------------------------------------------------------
14 Financial Ratios
---------------
Training data Shape after sampling: (706, 14)
Test data Shape: (9255, 14)
0.7907075610246262
------------------------------------------------------------
--------------------------------------------------------------------------------
MODEL:  RUS BOOST
---------------------------------------------------------------------------
Train period:  (2006, 2011)  Test period:  (2018, 2023)
------------------------------
28 Raw Financial Items + 14 Financial Ratios
---------------
Training data Shape after sampling: (320, 42)
Test data Shape: (0, 42)
Done
------------------------------------------------------------
28 Raw Financial Items
---------------
Training data Shape after sampling: (334, 28)
Test data Shape: (0, 28)
Done
------------------------------------------------------------
14 Financial Ratios
---------------
Training data Shape after sampling: (320, 14)
Test data Sh

## SVM - Beneish

In [5]:
# Load the pre-computed Beneish ratios and wrap them in a DataProcessor for the
# window-based runs. numpy comes from the notebook's import cell, so it is no
# longer re-imported here.
# NOTE(review): this repeats the loading cell of the earlier Beneish section; on a
# top-to-bottom run `ben_obj` already exists by this point — confirm whether the
# reload is still wanted.
# NOTE(review): this path starts with "Data/" while the fraud dataset is read from
# "data/" — confirm the directory's case; it matters on case-sensitive filesystems.
ben_data = (
    pd.read_csv("Data/Beneish_scores_final.csv")
    # The CSV spells the column 'Mistate'; rename it to 'misstate'
    # (presumably the name DataProcessor expects — TODO confirm).
    .rename(columns={'Mistate':'misstate'})
    # Replace +/-infinity with NaN so infinite values are treated as missing.
    .replace([np.inf,-np.inf],np.nan)
)

# Same constructor arguments as the DataProcessor built for the main dataset.
ben_obj = DataProcessor(ben_data,(1991,1999), (2000,2001), (2003,2014), 5)

In [6]:
# Feature columns taken from the Beneish data (the eight M-score ratios).
m_col = [
    'dsri', 'gmi', 'aqi', 'sgi',
    'depi', 'sgai', 'lvgi', 'tata',
]

# Train and test windows generated from the Beneish DataProcessor.
tr_batches, test_batches = ben_obj.create_batches()

# Display label -> model callable handed to results().
models = {
    "SVM": svm_model,
    "LOGIT": logistic_regression_model,
    "PROBIT": probit_regression_model,
}

# A single feature set for this section: the M-score columns above.
data_items = {"Calculated M score Feat": m_col}

# Rolling-window evaluation: each training window is paired with the test window at
# the same position, and every model is run on that pair.
# NOTE(review): windows that extend past the available years may have no test rows — confirm.
for train_period, test_period in zip(tr_batches, test_batches):
    for model, model_fn in models.items():
        print("MODEL: ", model)
        print("---"*25)
        print("Train period: ",train_period," Test period: ",  test_period)
        print("---"*10)
        for data_item, feature_cols in data_items.items():
            print(data_item)
            print("---"*5)
            # Run first, then print the returned value.
            outcome = results(ben_obj, train_period, test_period, feature_cols, model_fn)
            print(outcome)
            print("---"*20)
        print("----"*20)

MODEL:  SVM
---------------------------------------------------------------------------
Train period:  (1991, 1996)  Test period:  (2003, 2008)
------------------------------
Calculated M score Feat
---------------
Training data Shape after sampling: (264, 8)
Test data Shape: (33425, 8)
0.5012367681847218
------------------------------------------------------------
--------------------------------------------------------------------------------
MODEL:  LOGIT
---------------------------------------------------------------------------
Train period:  (1991, 1996)  Test period:  (2003, 2008)
------------------------------
Calculated M score Feat
---------------
Training data Shape after sampling: (264, 8)
Test data Shape: (33425, 8)
0.5984930412343569
------------------------------------------------------------
--------------------------------------------------------------------------------
MODEL:  PROBIT
---------------------------------------------------------------------------
Train per