In [32]:
from results.data_processing import DataProcessor
from results.utils import evaluate,null_check,results
from results.models import rus_boost, svm_model, xgb_model, logistic_regression_model, probit_regression_model
import pandas as pd
import json

In [33]:
data = pd.read_csv('data/data_FraudDetection_JAR2020.csv')

In [34]:
data_obj = DataProcessor(data,(1991,1999), (2000,2001), (2003,2014), 5)

In [4]:
with open('results/features.json') as json_file:
    features_comp = json.load(json_file)

## Bao RusBoost

In [5]:
test_periods = [(2003,2005), (2003,2008), (2003,2011), (2003,2014)]
models = {"RUS BOOST": rus_boost,
          "Logit":logistic_regression_model}
data_items = features_comp

In [6]:
train_period = (1991,1999)
for model in models.keys():
    print("MODEL: ", model)
    print("---"*25)
    for test_period in test_periods:
        print("Test period: ", test_period)
        print("---"*10)
        for data_item in data_items.keys():
            print(data_item)
            print("---"*5)
            print(results(data_obj,train_period,test_period,data_items[data_item],models[model]))
            print("---"*20)
        print("----"*20)

MODEL:  RUS BOOST
---------------------------------------------------------------------------
Test period:  (2003, 2005)
------------------------------
28 Raw Financial Items + 14 Financial Ratios
---------------
Training data Shape after sampling: (596, 42)
Test data Shape: (15724, 42)
0.6614186079575887
------------------------------------------------------------
28 Raw Financial Items
---------------
Training data Shape after sampling: (664, 28)
Test data Shape: (17778, 28)
0.709340631761371
------------------------------------------------------------
14 Financial Ratios
---------------
Training data Shape after sampling: (596, 14)
Test data Shape: (15724, 14)
0.6156265398376074
------------------------------------------------------------
--------------------------------------------------------------------------------
Test period:  (2003, 2008)
------------------------------
28 Raw Financial Items + 14 Financial Ratios
---------------
Training data Shape after sampling: (596, 42)
Te



0.5678999000870587
------------------------------------------------------------
28 Raw Financial Items
---------------
Training data Shape after sampling: (664, 28)
Test data Shape: (17778, 28)
0.6520753693904563
------------------------------------------------------------
14 Financial Ratios
---------------
Training data Shape after sampling: (596, 14)
Test data Shape: (15724, 14)
0.6108498053353121
------------------------------------------------------------
--------------------------------------------------------------------------------
Test period:  (2003, 2008)
------------------------------
28 Raw Financial Items + 14 Financial Ratios
---------------
Training data Shape after sampling: (596, 42)
Test data Shape: (30777, 42)




0.5673432698922265
------------------------------------------------------------
28 Raw Financial Items
---------------
Training data Shape after sampling: (664, 28)
Test data Shape: (35166, 28)
0.6012622657777734
------------------------------------------------------------
14 Financial Ratios
---------------
Training data Shape after sampling: (596, 14)
Test data Shape: (30777, 14)
0.6239855865299572
------------------------------------------------------------
--------------------------------------------------------------------------------
Test period:  (2003, 2011)
------------------------------
28 Raw Financial Items + 14 Financial Ratios
---------------
Training data Shape after sampling: (596, 42)
Test data Shape: (44674, 42)




0.5658260961568815
------------------------------------------------------------
28 Raw Financial Items
---------------
Training data Shape after sampling: (664, 28)
Test data Shape: (51326, 28)
0.5785436775977312
------------------------------------------------------------
14 Financial Ratios
---------------
Training data Shape after sampling: (596, 14)
Test data Shape: (44674, 14)
0.6251586281539605
------------------------------------------------------------
--------------------------------------------------------------------------------
Test period:  (2003, 2014)
------------------------------
28 Raw Financial Items + 14 Financial Ratios
---------------
Training data Shape after sampling: (596, 42)
Test data Shape: (58405, 42)




0.5611472124682287
------------------------------------------------------------
28 Raw Financial Items
---------------
Training data Shape after sampling: (664, 28)
Test data Shape: (68230, 28)
0.5688908717160726
------------------------------------------------------------
14 Financial Ratios
---------------
Training data Shape after sampling: (596, 14)
Test data Shape: (58405, 14)
0.6376537315587756
------------------------------------------------------------
--------------------------------------------------------------------------------


------------

## SVM -Beneish

In [9]:
import numpy as np
ben_data = pd.read_csv("Data/Beneish_scores_final.csv")
ben_data = ben_data.rename(columns={'Mistate':'misstate'})
ben_data = ben_data.replace([np.inf,-np.inf],np.nan)
ben_obj = DataProcessor(ben_data,(1991,1999), (2000,2001), (2003,2014), 5)

In [10]:
m_col = ['dsri','gmi','aqi','sgi','depi','sgai','lvgi','tata']

test_periods = [(2003,2005), (2003,2008), (2003,2011), (2003,2014)]
models = {"SVM": svm_model,
          "LOGIT":logistic_regression_model,
          "PROBIT": probit_regression_model}
data_items = {"Calculated M score Feat": m_col}

train_period = (1991,1999)
for model in models.keys():
    print("MODEL: ", model)
    print("---"*25)
    for test_period in test_periods:
        print("Test period: ", test_period)
        print("---"*10)
        for data_item in data_items.keys():
            print(data_item)
            print("---"*5)
            print(results(ben_obj,train_period,test_period,data_items[data_item],models[model]))
            print("---"*20)
        print("----"*20)

MODEL:  SVM
---------------------------------------------------------------------------
Test period:  (2003, 2005)
------------------------------
Calculated M score Feat
---------------
Training data Shape after sampling: (572, 8)
Test data Shape: (17602, 8)
0.5015771061535814
------------------------------------------------------------
--------------------------------------------------------------------------------
Test period:  (2003, 2008)
------------------------------
Calculated M score Feat
---------------
Training data Shape after sampling: (572, 8)
Test data Shape: (33425, 8)
0.5015824215571029
------------------------------------------------------------
--------------------------------------------------------------------------------
Test period:  (2003, 2011)
------------------------------
Calculated M score Feat
---------------
Training data Shape after sampling: (572, 8)
Test data Shape: (47800, 8)
0.5016321974643023
----------------------------------------------------------

-------
------------

## Window Processing

## Bao RusBoost

In [35]:
tr_batches,test_batches = data_obj.create_batches()

In [36]:
tr_batches

[       fyear   gvkey  p_aaer  misstate      act       ap        at      ceq  \
 4582    1991    1004     NaN         0  289.537   43.416   395.351  196.737   
 4583    1991    1009     NaN         0   12.911    6.549    35.559    6.538   
 4584    1991    1011     NaN         0    3.163    1.121     8.720    0.810   
 4585    1991    1013     NaN         0  119.530   10.167   247.169  158.374   
 4586    1991    1014     NaN         0    6.826    0.126    14.909   10.107   
 ...      ...     ...     ...       ...      ...      ...       ...      ...   
 38296   1996  220507     NaN         0  496.000  179.000  1430.000  710.000   
 38297   1996  220508     NaN         0  191.898    8.325   549.449  391.416   
 38298   1996  220748     NaN         0   79.753   12.913   103.795   75.641   
 38299   1996  223007     NaN         0  279.910   10.983   821.528  454.853   
 38300   1996  277918     NaN         0    3.440    2.868     4.363  -18.894   
 
           che     cogs  ...  soft_ass

In [25]:
models = {"RUS BOOST": rus_boost,
          "Logit":logistic_regression_model}
data_items = features_comp

for (train_period,test_period) in zip(tr_batches,test_batches):
    print(train_period,test_period)
    for model in models.keys():
        print("MODEL: ", model)
        print("---"*25)
        for test_period in test_periods:
            print("Test period: ", test_period)
            print("---"*10)
            for data_item in data_items.keys():
                print(data_item)
                print("---"*5)
                print(results(data_obj,train_period,test_period,data_items[data_item],models[model]))
                print("---"*20)
            print("----"*20)

       fyear   gvkey  p_aaer  misstate      act       ap        at      ceq  \
4582    1991    1004     NaN         0  289.537   43.416   395.351  196.737   
4583    1991    1009     NaN         0   12.911    6.549    35.559    6.538   
4584    1991    1011     NaN         0    3.163    1.121     8.720    0.810   
4585    1991    1013     NaN         0  119.530   10.167   247.169  158.374   
4586    1991    1014     NaN         0    6.826    0.126    14.909   10.107   
...      ...     ...     ...       ...      ...      ...       ...      ...   
38296   1996  220507     NaN         0  496.000  179.000  1430.000  710.000   
38297   1996  220508     NaN         0  191.898    8.325   549.449  391.416   
38298   1996  220748     NaN         0   79.753   12.913   103.795   75.641   
38299   1996  223007     NaN         0  279.910   10.983   821.528  454.853   
38300   1996  277918     NaN         0    3.440    2.868     4.363  -18.894   

          che     cogs  ...  soft_assets     ch_cs 

KeyError: 0