In [2]:
import pandas as pd
import numpy as np
from MLP.utils import datasets,plot_table,train_model
from sklearn import metrics
from imblearn.under_sampling import RandomUnderSampler
from tabulate import tabulate
import matplotlib.pyplot as plt
from results.data_processing import DataProcessor
from results.utils import evaluate,null_check,results
from results.models import rus_boost, svm_model, xgb_model, logistic_regression_model, probit_regression_model,MLP,mlp_grid_search
import json
from collections import defaultdict

In [3]:
data = pd.read_csv('data/data_FraudDetection_JAR2020.csv')
data_obj = DataProcessor(data,(1990,1999), (2000,2001), (2003,2014), 5)
with open('results/features.json') as json_file:
    features_comp = json.load(json_file)

In [4]:
import numpy as np
ben_data = pd.read_csv("Data/Beneish_scores_final.csv")
ben_data = ben_data.rename(columns={'Mistate':'misstate'})
ben_data = ben_data.replace([np.inf,-np.inf],np.nan)
ben_obj = DataProcessor(ben_data,(1990,1999), (2000,2001), (2003,2014), 5)
m_col = ['dsri','gmi','aqi','sgi','depi','sgai','lvgi','tata']
ben_items = {"Calculated M score Feat": m_col}

In [5]:
models = {"MLP": MLP,
         "RUS BOOST": rus_boost,
          "Logit":logistic_regression_model,
         "Probit":probit_regression_model}

---

-----

### Batch Processing

In [23]:
test_periods = [(2003,2005), (2003,2008), (2003,2011), (2003,2014)]

In [6]:
data_items = features_comp
train_period = (1990,1999)
res = defaultdict(lambda: defaultdict(dict))
for test_period in test_periods:
    for model in models.keys():
        for data_item in data_items.keys():
            auc = results(data_obj,train_period,test_period,data_items[data_item],models[model])
            res[test_period][data_item][model] = auc
        for ben_item in ben_items.keys():
            auc = results(ben_obj,train_period,test_period,ben_items[ben_item],models[model])
            res[test_period][ben_item][model] = auc

Optimization terminated successfully.
         Current function value: 0.539626
         Iterations 10
Optimization terminated successfully.
         Current function value: 0.602770
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.619036
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.682725
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.539626
         Iterations 10
Optimization terminated successfully.
         Current function value: 0.602770
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.619036
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.682725
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.539626
         Iterations 10
Optimization terminated successfully.
         Current function value: 0.602770

In [8]:
for key in res.keys():
    columns = ["Test Period : "+str(key), 'MLP', 'RUS BOOST', 'Logit', 'Probit']
    df = pd.DataFrame.from_dict(res[key], orient='index').reset_index()
    df.columns = columns
    for col in ['MLP', 'RUS BOOST', 'Logit', 'Probit']:
        df[col] = df[col].round(3)
    df.set_index("Test Period : "+str(key), inplace=True)
    print(tabulate(df, headers='keys', tablefmt='pretty'))

+----------------------------------------------+-------+-----------+-------+--------+
|          Test Period : (2003, 2005)          |  MLP  | RUS BOOST | Logit | Probit |
+----------------------------------------------+-------+-----------+-------+--------+
| 28 Raw Financial Items + 14 Financial Ratios | 0.647 |   0.712   | 0.596 | 0.609  |
|            28 Raw Financial Items            | 0.64  |   0.694   | 0.526 | 0.529  |
|             14 Financial Ratios              | 0.637 |   0.61    | 0.648 | 0.645  |
|           Calculated M score Feat            |  0.5  |   0.579   | 0.537 | 0.562  |
+----------------------------------------------+-------+-----------+-------+--------+
+----------------------------------------------+-------+-----------+-------+--------+
|          Test Period : (2003, 2008)          |  MLP  | RUS BOOST | Logit | Probit |
+----------------------------------------------+-------+-----------+-------+--------+
| 28 Raw Financial Items + 14 Financial Ratios | 0.634

## Batch - Tuning

In [13]:
test_periods = [(2003,2005), (2003,2008), (2003,2011), (2003,2014)]

In [14]:
data_items = features_comp
train_period = (1990,1999)
res = defaultdict(lambda: defaultdict(dict))
for test_period in test_periods:
    for data_item in data_items.keys():
        param_grid = {
            'activation': ['logistic', 'tanh', 'relu'],
            'hidden_layer_sizes': [
                (len(data_items[data_item]), 40),
                (len(data_items[data_item]), 50),
                (len(data_items[data_item]), 40, 50),
                (len(data_items[data_item]), 30, 40),
                (len(data_items[data_item]), 40, 30, 50),
                (len(data_items[data_item]), 30, 40, 30),
                (len(data_items[data_item]), 40, 50, 60, 40),
                (len(data_items[data_item]), 50, 50, 50, 50),
                (len(data_items[data_item]), 30, 30, 30, 30)
                ],
            'learning_rate_init': [0.001, 0.01, 0.1]
            }
        auc = results(data_obj,train_period,test_period,data_items[data_item],mlp_grid_search,param_grid)
        res[test_period][data_item] = auc

Best Hyperparameters: {'activation': 'logistic', 'hidden_layer_sizes': (42, 50), 'learning_rate_init': 0.001}
Best AUC Score: 0.7013556575749392
Test AUC Score: 0.653978955377263
Best Hyperparameters: {'activation': 'relu', 'hidden_layer_sizes': (28, 40, 50, 60, 40), 'learning_rate_init': 0.001}
Best AUC Score: 0.7185012152308939
Test AUC Score: 0.6463963130962225
Best Hyperparameters: {'activation': 'logistic', 'hidden_layer_sizes': (14, 50), 'learning_rate_init': 0.001}
Best AUC Score: 0.6952212920454887
Test AUC Score: 0.687706556168748
Best Hyperparameters: {'activation': 'logistic', 'hidden_layer_sizes': (42, 50), 'learning_rate_init': 0.001}
Best AUC Score: 0.7013556575749392
Test AUC Score: 0.6338119723979867
Best Hyperparameters: {'activation': 'relu', 'hidden_layer_sizes': (28, 40, 50, 60, 40), 'learning_rate_init': 0.001}
Best AUC Score: 0.7185012152308939
Test AUC Score: 0.61288467164021
Best Hyperparameters: {'activation': 'logistic', 'hidden_layer_sizes': (14, 50), 'learni

In [16]:
res

defaultdict(<function __main__.<lambda>()>,
            {(2003,
              2005): defaultdict(dict,
                         {'28 Raw Financial Items + 14 Financial Ratios': 0.653978955377263,
                          '28 Raw Financial Items': 0.6463963130962225,
                          '14 Financial Ratios': 0.687706556168748}),
             (2003,
              2008): defaultdict(dict,
                         {'28 Raw Financial Items + 14 Financial Ratios': 0.6338119723979867,
                          '28 Raw Financial Items': 0.61288467164021,
                          '14 Financial Ratios': 0.6960195736539408}),
             (2003,
              2011): defaultdict(dict,
                         {'28 Raw Financial Items + 14 Financial Ratios': 0.6218777613103184,
                          '28 Raw Financial Items': 0.5998332445548472,
                          '14 Financial Ratios': 0.6928441416005573}),
             (2003,
              2014): defaultdict(dict,
             

In [17]:
data_items = features_comp
train_period = (1990,1999)
res = defaultdict(lambda: defaultdict(dict))
for test_period in test_periods:
    for data_item in data_items.keys():
        param_grid = {
            'activation': ['logistic', 'tanh', 'relu'],
            'hidden_layer_sizes': [
                (len(data_items[data_item]), 20),
                (len(data_items[data_item]), 30),
                (len(data_items[data_item]), 40, 50, 10),
                (len(data_items[data_item]), 30, 40, 10),
                (len(data_items[data_item]), 40, 30, 50, 20),
                (len(data_items[data_item]), 30, 40),
                (len(data_items[data_item]), 40, 50, 40, 40,20),
                (len(data_items[data_item]), 30, 50, 50, 40,45),
                (len(data_items[data_item]), 20,30,40,50,20)
                ],
            'learning_rate_init': [0.001, 0.01, 0.05, 0.09, 0.1, 0.5]
            }
        auc = results(data_obj,train_period,test_period,data_items[data_item],mlp_grid_search,param_grid)
        res[test_period][data_item] = auc

Best Hyperparameters: {'activation': 'relu', 'hidden_layer_sizes': (42, 40, 50, 40, 40, 20), 'learning_rate_init': 0.001}
Best AUC Score: 0.694273712005281
Test AUC Score: 0.6562173902131673
Best Hyperparameters: {'activation': 'relu', 'hidden_layer_sizes': (28, 40, 50, 40, 40, 20), 'learning_rate_init': 0.001}
Best AUC Score: 0.7143835328712455
Test AUC Score: 0.6166769917232233
Best Hyperparameters: {'activation': 'logistic', 'hidden_layer_sizes': (14, 30), 'learning_rate_init': 0.001}
Best AUC Score: 0.6937276082455666
Test AUC Score: 0.6892830536101593
Best Hyperparameters: {'activation': 'relu', 'hidden_layer_sizes': (42, 40, 50, 40, 40, 20), 'learning_rate_init': 0.001}
Best AUC Score: 0.694273712005281
Test AUC Score: 0.6347803918792168
Best Hyperparameters: {'activation': 'relu', 'hidden_layer_sizes': (28, 40, 50, 40, 40, 20), 'learning_rate_init': 0.001}
Best AUC Score: 0.7143835328712455
Test AUC Score: 0.6085774688934004
Best Hyperparameters: {'activation': 'logistic', 'hidd

In [18]:
res

defaultdict(<function __main__.<lambda>()>,
            {(2003,
              2005): defaultdict(dict,
                         {'28 Raw Financial Items + 14 Financial Ratios': 0.6562173902131673,
                          '28 Raw Financial Items': 0.6166769917232233,
                          '14 Financial Ratios': 0.6892830536101593}),
             (2003,
              2008): defaultdict(dict,
                         {'28 Raw Financial Items + 14 Financial Ratios': 0.6347803918792168,
                          '28 Raw Financial Items': 0.6085774688934004,
                          '14 Financial Ratios': 0.6992850325541522}),
             (2003,
              2011): defaultdict(dict,
                         {'28 Raw Financial Items + 14 Financial Ratios': 0.6014966623744048,
                          '28 Raw Financial Items': 0.5949930669469625,
                          '14 Financial Ratios': 0.6950892045881555}),
             (2003,
              2014): defaultdict(dict,
         

In [None]:
pd.DataFrame.from_dict(res[key], orient='index')

------------
_____________

## XgBoost Tuning

In [5]:
test_periods = [(2003,2005), (2003,2008), (2003,2011), (2003,2014)]

In [6]:
data_items = features_comp
train_period = (1990,1999)
val_period = (2000,2002)
res = defaultdict(lambda: defaultdict(dict))
print("XgBoost Results:))")
for test_period in test_periods:
    for data_item in data_items.keys():
        param_grid = {
                    'max_depth': [3, 4, 5],
                    'learning_rate': [0.001, 0.01, 0.05, 0.09, 0.1, 0.5]
                    }
        auc = results(data_obj,train_period,test_period,data_items[data_item],xgb_model,param_grid)
        res[test_period][data_item] = auc


XgBoost Results:))
Best Hyperparameters: {'learning_rate': 0.01, 'max_depth': 3}
Best AUC Score: 0.7680541302847541
Test AUC Score: 0.7068586554795009
Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 3}
Best AUC Score: 0.7508008521619107
Test AUC Score: 0.6918923318953105
Best Hyperparameters: {'learning_rate': 0.09, 'max_depth': 5}
Best AUC Score: 0.7067278782968763
Test AUC Score: 0.6446801301881758
Best Hyperparameters: {'learning_rate': 0.01, 'max_depth': 3}
Best AUC Score: 0.7670189335973835
Test AUC Score: 0.6886457000693178
Best Hyperparameters: {'learning_rate': 0.05, 'max_depth': 5}
Best AUC Score: 0.7361730728838479
Test AUC Score: 0.6650736179921308
Best Hyperparameters: {'learning_rate': 0.09, 'max_depth': 5}
Best AUC Score: 0.7066864704293817
Test AUC Score: 0.6437164696074347
Best Hyperparameters: {'learning_rate': 0.01, 'max_depth': 3}
Best AUC Score: 0.7670189335973835
Test AUC Score: 0.6773202007128545
Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 

In [7]:
res

defaultdict(<function __main__.<lambda>()>,
            {(2003,
              2005): defaultdict(dict,
                         {'28 Raw Financial Items + 14 Financial Ratios': 0.7068586554795009,
                          '28 Raw Financial Items': 0.6918923318953105,
                          '14 Financial Ratios': 0.6446801301881758}),
             (2003,
              2008): defaultdict(dict,
                         {'28 Raw Financial Items + 14 Financial Ratios': 0.6886457000693178,
                          '28 Raw Financial Items': 0.6650736179921308,
                          '14 Financial Ratios': 0.6437164696074347}),
             (2003,
              2011): defaultdict(dict,
                         {'28 Raw Financial Items + 14 Financial Ratios': 0.6773202007128545,
                          '28 Raw Financial Items': 0.6654288591001246,
                          '14 Financial Ratios': 0.6434718769484477}),
             (2003,
              2014): defaultdict(dict,
         

-----------
----

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV
# def random_forests(data_obj,train_period,test_period,item):
#     # Define the Random Forest classifier
#     rf_model = RandomForestClassifier(random_state=42)

#     # Define the parameter grid to search
#     param_grid = {
#         'n_estimators': [100, 200, 300],  # Number of trees in the forest
#         'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
#         'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
#         'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required to be at a leaf node
#     }

#     # Perform grid search using cross-validation
#     grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, scoring='roc_auc', cv=5)
#     grid_search.fit(X_train, y_train)

#     # Get the best hyperparameters and the corresponding AUC score
#     best_params = grid_search.best_params_
#     best_auc = grid_search.best_score_
#     test_auc = grid_search.score(X_test, y_test)

#     # Print the best hyperparameters, the corresponding AUC score, and test AUC Score
#     print("Best Hyperparameters:", best_params)
#     print("Best AUC Score:", best_auc)
#     print("Test AUC Score:", test_auc)

#     # Return the test AUC score
#     return test_auc

    

In [None]:
# random_forests

In [None]:
test_periods = [(2003,2005), (2003,2008), (2003,2011), (2003,2014)]

data_items = features_comp
train_period = (1990,1999)
val_period = (2000,2002)

res = defaultdict(lambda: defaultdict(dict))
print("Random Forest Results:))")
for test_period in test_periods:
    for data_item in data_items.keys():
        param_grid = {
                    'n_estimators': [100, 200, 300,400, 500],  # Number of trees in the forest
                    'max_depth': [ 10, 20],  # Maximum depth of the tree
                    'min_samples_split': [5, 10],  # Minimum number of samples required to split an internal node
                    'min_samples_leaf': [5,6]  # Minimum number of samples required to be at a leaf node
                    }
        auc = results(data_obj,train_period,test_period,data_items[data_item],random_forests,param_grid)
        res[test_period][data_item] = auc


----

------

## Window Processing

In [9]:
train_batches,test_batches = data_obj.create_batches()

In [10]:
for train_period,test_period in zip(train_batches[:-1],test_batches[:-1]):
    print(train_period,test_period)

(1990, 1995) (1996, 2001)
(1996, 2001) (2002, 2006)
(2002, 2006) (2007, 2011)
(2007, 2011) (2012, 2016)
(2012, 2016) (2017, 2021)


In [11]:
data_items = features_comp
res = defaultdict(lambda: defaultdict(dict))
for train_period,test_period in zip(train_batches[:-1],test_batches[:-1]):
    for model in models.keys():
        for data_item in data_items.keys():
            auc = results(data_obj,train_period,test_period,data_items[data_item],models[model])
            res[str(train_period) + '-' +str(test_period)][data_item][model] = auc
        for ben_item in ben_items.keys():
            auc = results(ben_obj,train_period,test_period,ben_items[ben_item],models[model])
            res[str(train_period) + '-' +str(test_period)][ben_item][model] = auc

Optimization terminated successfully.
         Current function value: 0.498506
         Iterations 10
Optimization terminated successfully.
         Current function value: 0.580061
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.608704
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.669697
         Iterations 10
Optimization terminated successfully.
         Current function value: 0.536646
         Iterations 10
Optimization terminated successfully.
         Current function value: 0.585333
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.603220
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.678238
         Iterations 13
Optimization terminated successfully.
         Current function value: 0.511275
         Iterations 12
Optimization terminated successfully.
         Current function value: 0.5700

In [12]:
for key in res.keys():
    columns = ["Train - Test : "+str(key), 'MLP', 'RUS BOOST', 'Logit', 'Probit']
    df = pd.DataFrame.from_dict(res[key], orient='index').reset_index()
    df.columns = columns
    for col in ['MLP', 'RUS BOOST', 'Logit', 'Probit']:
         
        try:
            df[col] = df[col].round(3)
        except:
            pass
    df.set_index("Train - Test : "+str(key), inplace=True)
    print(tabulate(df, headers='keys', tablefmt='fancy_grid'))

╒══════════════════════════════════════════════╤═══════╤═════════════╤═════════╤══════════╕
│ Train - Test : (1990, 1995)-(1996, 2001)     │   MLP │   RUS BOOST │   Logit │   Probit │
╞══════════════════════════════════════════════╪═══════╪═════════════╪═════════╪══════════╡
│ 28 Raw Financial Items + 14 Financial Ratios │   0.5 │       0.698 │   0.561 │    0.577 │
├──────────────────────────────────────────────┼───────┼─────────────┼─────────┼──────────┤
│ 28 Raw Financial Items                       │   0.5 │       0.697 │   0.548 │    0.55  │
├──────────────────────────────────────────────┼───────┼─────────────┼─────────┼──────────┤
│ 14 Financial Ratios                          │   0.5 │       0.651 │   0.635 │    0.641 │
├──────────────────────────────────────────────┼───────┼─────────────┼─────────┼──────────┤
│ Calculated M score Feat                      │   0.5 │       0.516 │   0.537 │    0.539 │
╘══════════════════════════════════════════════╧═══════╧═════════════╧═════════╧

## MLP Param Tuning

In [5]:
train_batches,test_batches = data_obj.create_batches()

for train_period,test_period in zip(train_batches[:-1],test_batches[:-1]):
    print(train_period,test_period)

(1990, 1995) (1996, 2001)
(1996, 2001) (2002, 2006)
(2002, 2006) (2007, 2011)
(2007, 2011) (2012, 2016)
(2012, 2016) (2017, 2021)


In [6]:
data_items = features_comp

In [7]:
from results.models import mlp_grid_search
model = mlp_grid_search
res = defaultdict(lambda: defaultdict(dict))


for train_period,test_period in zip(train_batches[:-1],test_batches[:-1]):

    for data_item in data_items.keys():
        
        param_grid = {
            'activation': ['logistic', 'tanh', 'relu'],
            'hidden_layer_sizes': [
                (len(data_items[data_item]), 40),
                (len(data_items[data_item]), 50),
                (len(data_items[data_item]), 40, 50),
                (len(data_items[data_item]), 30, 40),
                (len(data_items[data_item]), 40, 30, 50),
                (len(data_items[data_item]), 30, 40, 30),
                (len(data_items[data_item]), 40, 50, 60, 40),
                (len(data_items[data_item]), 50, 50, 50, 50),
                (len(data_items[data_item]), 30, 30, 30, 30)
                ],
            'learning_rate_init': [0.001, 0.01, 0.1]
            }
        auc = results(data_obj,train_period,test_period,data_items[data_item],model,param_grid)
        res[str(train_period) + '-' +str(test_period)][data_item] = auc
    for ben_item in ben_items.keys():
        auc = results(ben_obj,train_period,test_period,ben_items[ben_item],model,param_grid)
        res[str(train_period) + '-' +str(test_period)][ben_item] = auc
        
        

Best Hyperparameters: {'activation': 'tanh', 'hidden_layer_sizes': (42, 40, 50, 60, 40), 'learning_rate_init': 0.001}
Best AUC Score: 0.6932138610497707
Test AUC Score: 0.6094481700488392
Best Hyperparameters: {'activation': 'tanh', 'hidden_layer_sizes': (28, 30, 30, 30, 30), 'learning_rate_init': 0.001}
Best AUC Score: 0.6785586886359776
Test AUC Score: 0.6249807849696124
Best Hyperparameters: {'activation': 'logistic', 'hidden_layer_sizes': (14, 40), 'learning_rate_init': 0.01}
Best AUC Score: 0.6883302191268899
Test AUC Score: 0.5937202403938217
Best Hyperparameters: {'activation': 'tanh', 'hidden_layer_sizes': (14, 40), 'learning_rate_init': 0.001}
Best AUC Score: 0.6509248327430145
Test AUC Score: 0.4641166715014932
Best Hyperparameters: {'activation': 'logistic', 'hidden_layer_sizes': (42, 30, 40), 'learning_rate_init': 0.001}
Best AUC Score: 0.7779707815924033
Test AUC Score: 0.7358577767638388
Best Hyperparameters: {'activation': 'logistic', 'hidden_layer_sizes': (28, 40), 'lea



Best Hyperparameters: {'activation': 'logistic', 'hidden_layer_sizes': (14, 40, 50), 'learning_rate_init': 0.001}
Best AUC Score: 0.7115028000973946
Test AUC Score: 0.6765969719962719
Best Hyperparameters: {'activation': 'tanh', 'hidden_layer_sizes': (14, 40), 'learning_rate_init': 0.01}
Best AUC Score: 0.6072459952611434
Test AUC Score: 0.6031902992528666
Best Hyperparameters: {'activation': 'logistic', 'hidden_layer_sizes': (42, 30, 40), 'learning_rate_init': 0.001}
Best AUC Score: 0.7450904045828857
Test AUC Score: 0.6893268063087458
Best Hyperparameters: {'activation': 'relu', 'hidden_layer_sizes': (28, 40, 50), 'learning_rate_init': 0.01}
Best AUC Score: 0.7618863005728608
Test AUC Score: 0.648777987109461
Best Hyperparameters: {'activation': 'relu', 'hidden_layer_sizes': (14, 40, 50), 'learning_rate_init': 0.001}
Best AUC Score: 0.6519949427139278
Test AUC Score: 0.6287276805820385
Best Hyperparameters: {'activation': 'tanh', 'hidden_layer_sizes': (14, 30, 40), 'learning_rate_ini



Best Hyperparameters: {'activation': 'relu', 'hidden_layer_sizes': (14, 40, 30, 50), 'learning_rate_init': 0.001}
Best AUC Score: 0.6264799999999999
Test AUC Score: 0.5940969885203554




Best Hyperparameters: {'activation': 'tanh', 'hidden_layer_sizes': (14, 40, 30, 50), 'learning_rate_init': 0.1}
Best AUC Score: 0.6059259259259259
Test AUC Score: 0.39256489021375596


In [9]:
res

defaultdict(<function __main__.<lambda>()>,
            {'(1990, 1995)-(1996, 2001)': defaultdict(dict,
                         {'28 Raw Financial Items + 14 Financial Ratios': 0.6094481700488392,
                          '28 Raw Financial Items': 0.6249807849696124,
                          '14 Financial Ratios': 0.5937202403938217,
                          'Calculated M score Feat': 0.4641166715014932}),
             '(1996, 2001)-(2002, 2006)': defaultdict(dict,
                         {'28 Raw Financial Items + 14 Financial Ratios': 0.7358577767638388,
                          '28 Raw Financial Items': 0.7378827268830929,
                          '14 Financial Ratios': 0.6765969719962719,
                          'Calculated M score Feat': 0.6031902992528666}),
             '(2002, 2006)-(2007, 2011)': defaultdict(dict,
                         {'28 Raw Financial Items + 14 Financial Ratios': 0.6893268063087458,
                          '28 Raw Financial Items': 0.64877798

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split



for train_period,test_period in zip(train_batches[:-1],test_batches[:-1]):
    train_data, validation_data, test_data = data_obj.split_data_periods(train_period, test_period)
    for item in data_items.keys():
        
        param_grid = {
                    'activation': ['logistic', 'tanh', 'relu'],
                    'hidden_layer_sizes': [
                        (len(data_items[item]), 40),
                        (len(data_items[item]), 50),
                        (len(data_items[item]), 40, 50),
                        (len(data_items[item]), 30, 40),
                        (len(data_items[item]), 40, 30, 50),
                        (len(data_items[item]), 30, 40, 30),
                        (len(data_items[item]), 40, 50, 60, 40),
                        (len(data_items[item]), 50, 50, 50, 50),
                        (len(data_items[item]), 30, 30, 30, 30)
                    ],
                    'learning_rate_init': [0.001, 0.01, 0.1]
                }

        
        train_data, validation_data, test_data = null_check(data_items[item], train_data, validation_data, test_data)
    
        X_train, y_train = train_data[data_items[item]], train_data['misstate']
        X_test, y_test = test_data[data_items[item]], test_data['misstate']
        rus = RandomUnderSampler(random_state=42)
        X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)
        mlp_model = MLPClassifier(max_iter=10000, random_state=42, verbose=False,early_stopping=True)
        grid_search = GridSearchCV(estimator=mlp_model, param_grid=param_grid, scoring='roc_auc', cv=5)
        grid_search.fit(X_train_resampled, y_train_resampled)

        best_params = grid_search.best_params_
        best_auc = grid_search.best_score_
        test_auc = grid_search.score(X_test, y_test)
        print("Best Hyperparameters:", best_params)
        print("Best AUC Score:", best_auc)
        print("Test AUC Score:", test_auc)
