In [1]:
import pandas as pd
import numpy as np
from MLP.utils import datasets,plot_table,train_model
from sklearn import metrics
from imblearn.under_sampling import RandomUnderSampler
from tabulate import tabulate
import matplotlib.pyplot as plt
from results.data_processing import DataProcessor
from results.utils import evaluate,null_check,results
from results.models import rus_boost, svm_model, xgb_model, logistic_regression_model, probit_regression_model,MLP
import json
from collections import defaultdict

In [2]:
# Feature-set definitions: label -> feature specification that later cells
# hand to `results` (used below as `data_items`).
with open('results/features.json') as json_file:
    features_comp = json.load(json_file)

# Raw panel wrapped in the project DataProcessor.
# The three year ranges are presumably the train / validation / test periods
# and the trailing 5 a window or gap length -- TODO confirm against DataProcessor.
# NOTE(review): this path uses a lower-case 'data/' directory while the Beneish
# cell reads from 'Data/'; one of them will fail on a case-sensitive filesystem.
data = pd.read_csv('data/data_FraudDetection_JAR2020.csv')
data_obj = DataProcessor(data, (1990, 1999), (2000, 2001), (2003, 2014), 5)

In [3]:
# Beneish M-score inputs.
# `np` comes from the notebook's import cell; it is not re-imported here so
# that all dependencies stay declared in one place.
# NOTE(review): this path uses 'Data/' while the main dataset is read from
# 'data/'; confirm the directory name on a case-sensitive filesystem.
ben_data = (
    pd.read_csv("Data/Beneish_scores_final.csv")
    # Align the label column name with the one DataProcessor is given elsewhere
    # (presumably 'misstate' -- confirm against DataProcessor).
    .rename(columns={'Mistate': 'misstate'})
    # Infinite values are turned into NaN so they are handled as missing values.
    .replace([np.inf, -np.inf], np.nan)
)
ben_obj = DataProcessor(ben_data, (1990, 1999), (2000, 2001), (2003, 2014), 5)

# The eight Beneish ratio columns used as the model features for this dataset.
m_col = ['dsri', 'gmi', 'aqi', 'sgi', 'depi', 'sgai', 'lvgi', 'tata']
ben_items = {"Calculated M score Feat": m_col}

In [4]:
# Display name -> model callable passed to `results`; the names become the
# column headers of the result tables printed further down.
models = {
    "MLP": MLP,
    "RUS BOOST": rus_boost,
    "Logit": logistic_regression_model,
    "Probit": probit_regression_model,
}

---

-----

## Batch Processing

In [5]:
# Expanding test windows: every window starts in 2003 and ends three years
# later than the previous one.
test_periods = [(2003, end_year) for end_year in (2005, 2008, 2011, 2014)]

In [6]:
# Fixed training period; each model is scored on every test window.
# Result layout: res[test_period][feature_set_label][model_name] = value
# returned by `results` (stored as `auc`).
data_items = features_comp
train_period = (1990,1999)
res = defaultdict(lambda: defaultdict(dict))
for test_period in test_periods:
    for model, model_fn in models.items():
        # Feature sets drawn from the main dataset.
        for data_item, feature_cols in data_items.items():
            auc = results(data_obj, train_period, test_period, feature_cols, model_fn)
            res[test_period][data_item][model] = auc
        # Feature sets drawn from the Beneish dataset.
        for ben_item, ben_cols in ben_items.items():
            auc = results(ben_obj, train_period, test_period, ben_cols, model_fn)
            res[test_period][ben_item][model] = auc

Training data Shape after sampling: (694, 42)
Test data Shape: (17778, 42)
Training data Shape after sampling: (694, 28)
Test data Shape: (17778, 28)
Training data Shape after sampling: (694, 14)
Test data Shape: (17778, 14)
Training data Shape after sampling: (580, 8)
Test data Shape: (17906, 8)
Training data Shape after sampling: (694, 42)
Test data Shape: (17778, 42)
Training data Shape after sampling: (694, 28)
Test data Shape: (17778, 28)
Training data Shape after sampling: (694, 14)
Test data Shape: (17778, 14)
Training data Shape after sampling: (580, 8)
Test data Shape: (17906, 8)
Training data Shape after sampling: (694, 42)
Test data Shape: (17778, 42)
Training data Shape after sampling: (694, 28)
Test data Shape: (17778, 28)
Training data Shape after sampling: (694, 14)
Test data Shape: (17778, 14)
Training data Shape after sampling: (580, 8)
Test data Shape: (17906, 8)
Training data Shape after sampling: (694, 42)
Test data Shape: (17778, 42)
Optimization terminated success

In [7]:
# One table per test period: rows are feature sets, columns are models.
model_columns = ['MLP', 'RUS BOOST', 'Logit', 'Probit']
for key, scores in res.items():
    label = "Test Period : " + str(key)
    df = pd.DataFrame.from_dict(scores, orient='index').reset_index()
    # Positional relabel: the first column holds the feature-set names.
    columns = [label] + model_columns
    df.columns = columns
    for col in model_columns:
        df[col] = df[col].round(3)
    # The label column becomes the index so it is rendered as the row header.
    df = df.set_index(label)
    print(tabulate(df, headers='keys', tablefmt='pretty'))

+----------------------------------------------+-------+-----------+-------+--------+
|          Test Period : (2003, 2005)          |  MLP  | RUS BOOST | Logit | Probit |
+----------------------------------------------+-------+-----------+-------+--------+
| 28 Raw Financial Items + 14 Financial Ratios | 0.647 |   0.695   | 0.596 | 0.609  |
|            28 Raw Financial Items            | 0.64  |   0.686   | 0.526 | 0.529  |
|             14 Financial Ratios              | 0.637 |   0.607   | 0.648 | 0.645  |
|           Calculated M score Feat            |  0.5  |   0.586   | 0.537 | 0.562  |
+----------------------------------------------+-------+-----------+-------+--------+
+----------------------------------------------+-------+-----------+-------+--------+
|          Test Period : (2003, 2008)          |  MLP  | RUS BOOST | Logit | Probit |
+----------------------------------------------+-------+-----------+-------+--------+
| 28 Raw Financial Items + 14 Financial Ratios | 0.634

----

------

## Window Processing

In [8]:
# Ask the DataProcessor for its train and test batches; each entry is a
# (start_year, end_year) pair, as the printout in the next cell shows.
train_batches,test_batches = data_obj.create_batches()

In [9]:
# Show the train/test windows that will be evaluated; the last entry of each
# list is skipped.
window_pairs = zip(train_batches[:-1], test_batches[:-1])
for train_period, test_period in window_pairs:
    print(train_period, test_period)

(1990, 1995) (1996, 2001)
(1996, 2001) (2002, 2006)
(2002, 2006) (2007, 2011)
(2007, 2011) (2012, 2016)
(2012, 2016) (2017, 2021)


In [13]:
data_items = features_comp
res = defaultdict(lambda: defaultdict(dict))
for train_period,test_period in zip(train_batches[:-1],test_batches[:-1]):
    for model in models.keys():
        for data_item in data_items.keys():
            auc = results(data_obj,train_period,test_period,data_items[data_item],models[model])
            res[str(train_period) + '-' +str(test_period)][data_item][model] = auc
        for ben_item in ben_items.keys():
            auc = results(ben_obj,train_period,test_period,ben_items[ben_item],models[model])
            res[str(train_period) + '-' +str(test_period)][ben_item][model] = auc

Training data Shape after sampling: (286, 42)
Test data Shape: (40192, 42)
Training data Shape after sampling: (286, 28)
Test data Shape: (40192, 28)
Training data Shape after sampling: (286, 14)
Test data Shape: (40192, 14)
Training data Shape after sampling: (216, 8)
Test data Shape: (42423, 8)
Training data Shape after sampling: (286, 42)
Test data Shape: (40192, 42)
Training data Shape after sampling: (286, 28)
Test data Shape: (40192, 28)
Training data Shape after sampling: (286, 14)
Test data Shape: (40192, 14)
Training data Shape after sampling: (216, 8)
Test data Shape: (42423, 8)
Training data Shape after sampling: (286, 42)
Test data Shape: (40192, 42)
Training data Shape after sampling: (286, 28)
Test data Shape: (40192, 28)
Training data Shape after sampling: (286, 14)
Test data Shape: (40192, 14)
Training data Shape after sampling: (216, 8)
Test data Shape: (42423, 8)
Training data Shape after sampling: (286, 42)
Test data Shape: (40192, 42)
Optimization terminated success

In [15]:
# Raw nested results (window -> feature set -> model -> score) shown for
# inspection; the formatted tables are printed in the following cell.
res

defaultdict(<function __main__.<lambda>()>,
            {'(1990, 1995)-(1996, 2001)': defaultdict(dict,
                         {'28 Raw Financial Items + 14 Financial Ratios': {'MLP': 0.5,
                           'RUS BOOST': 0.7046322725463294,
                           'Logit': 0.561341721183428,
                           'Probit': 0.5767141516236642},
                          '28 Raw Financial Items': {'MLP': 0.5,
                           'RUS BOOST': 0.67387705534829,
                           'Logit': 0.5481236078621643,
                           'Probit': 0.5495846947434784},
                          '14 Financial Ratios': {'MLP': 0.5,
                           'RUS BOOST': 0.6482069931406658,
                           'Logit': 0.635494579483079,
                           'Probit': 0.6406290115923745},
                          'Calculated M score Feat': {'MLP': 0.5,
                           'RUS BOOST': 0.5224153607774116,
                           'Logit': 0.

In [20]:
# One table per train/test window: rows are feature sets, columns are models.
model_columns = ['MLP', 'RUS BOOST', 'Logit', 'Probit']
for key, scores in res.items():
    label = "Train - Test : " + str(key)
    df = pd.DataFrame.from_dict(scores, orient='index').reset_index()
    # Positional relabel: the first column holds the feature-set names.
    columns = [label] + model_columns
    df.columns = columns
    for col in model_columns:
        # Round only columns that hold numbers. A column that is not numeric is
        # shown unchanged. This replaces a bare `except: pass`, which hid every
        # error (including KeyboardInterrupt) raised while rounding.
        if pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].round(3)
    # The label column becomes the index so it is rendered as the row header.
    df = df.set_index(label)
    print(tabulate(df, headers='keys', tablefmt='fancy_grid'))

╒══════════════════════════════════════════════╤═══════╤═════════════╤═════════╤══════════╕
│ Train - Test : (1990, 1995)-(1996, 2001)     │   MLP │   RUS BOOST │   Logit │   Probit │
╞══════════════════════════════════════════════╪═══════╪═════════════╪═════════╪══════════╡
│ 28 Raw Financial Items + 14 Financial Ratios │   0.5 │       0.705 │   0.561 │    0.577 │
├──────────────────────────────────────────────┼───────┼─────────────┼─────────┼──────────┤
│ 28 Raw Financial Items                       │   0.5 │       0.674 │   0.548 │    0.55  │
├──────────────────────────────────────────────┼───────┼─────────────┼─────────┼──────────┤
│ 14 Financial Ratios                          │   0.5 │       0.648 │   0.635 │    0.641 │
├──────────────────────────────────────────────┼───────┼─────────────┼─────────┼──────────┤
│ Calculated M score Feat                      │   0.5 │       0.522 │   0.537 │    0.539 │
╘══════════════════════════════════════════════╧═══════╧═════════════╧═════════╧