In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from imblearn.under_sampling import RandomUnderSampler
from tabulate import tabulate
import matplotlib.pyplot as plt

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# read the CSV file
df = pd.read_csv('data_FraudDetection_JAR2020.csv')

In [4]:
df.head(3)

Unnamed: 0,fyear,gvkey,p_aaer,misstate,act,ap,at,ceq,che,cogs,...,soft_assets,ch_cs,ch_cm,ch_roa,issue,bm,dpi,reoa,EBIT,ch_fcf
0,1990,1009,,0,10.047,3.736,32.335,6.262,0.002,30.633,...,0.312448,0.095082,0.082631,-0.019761,1,0.41317,0.873555,0.16762,0.161961,-0.04214
1,1990,1011,,0,1.247,0.803,7.784,0.667,0.171,1.125,...,0.315904,0.188832,-0.211389,-0.117832,1,0.157887,0.745139,-0.428957,-0.157888,0.100228
2,1990,1017,,0,55.04,3.601,118.12,44.393,3.132,107.343,...,0.605342,0.097551,-0.10578,0.091206,1,2.231337,1.015131,0.394768,0.063681,0.066348


In [5]:
df.columns

Index(['fyear', 'gvkey', 'p_aaer', 'misstate', 'act', 'ap', 'at', 'ceq', 'che',
       'cogs', 'csho', 'dlc', 'dltis', 'dltt', 'dp', 'ib', 'invt', 'ivao',
       'ivst', 'lct', 'lt', 'ni', 'ppegt', 'pstk', 're', 'rect', 'sale',
       'sstk', 'txp', 'txt', 'xint', 'prcc_f', 'dch_wc', 'ch_rsst', 'dch_rec',
       'dch_inv', 'soft_assets', 'ch_cs', 'ch_cm', 'ch_roa', 'issue', 'bm',
       'dpi', 'reoa', 'EBIT', 'ch_fcf'],
      dtype='object')

In [100]:
# select the columns to use as features
features = ['act', 'ap', 'at', 'ceq', 'che', 'cogs', 'csho', 'dlc', 'dltis', 'dltt', 'dp', 'ib', 'invt', 'ivao', 'ivst', 'lct', 'lt', 'ni', 'ppegt', 'pstk', 're', 'rect',
            'sale', 'sstk', 'txp', 'txt', 'xint', 'prcc_f', 'dch_wc', 'ch_rsst', 'dch_rec', 'dch_inv', 'soft_assets', 'ch_cs', 'ch_cm', 'ch_roa', 'bm', 'dpi', 'reoa', 'EBIT', 'ch_fcf','issue']

raw_financial_items_28 = ['act', 'ap', 'at', 'ceq', 'che', 'cogs', 'csho', 'dlc', 'dltis', 'dltt', 'dp', 'ib', 'invt', 'ivao', 'ivst', 'lct', 'lt', 'ni', 'ppegt', 'pstk', 're', 'rect',
            'sale', 'sstk', 'txp', 'txt', 'xint', 'prcc_f']

financial_ratios_14 = ['dch_wc', 'ch_rsst', 'dch_rec', 'dch_inv', 'soft_assets', 'ch_cs', 'ch_cm', 'ch_roa', 'bm', 'dpi', 'reoa', 'EBIT', 'ch_fcf','issue']

In [101]:
df[raw_financial_items_28].isna().sum()

act       0
ap        0
at        0
ceq       0
che       0
cogs      0
csho      0
dlc       0
dltis     0
dltt      0
dp        0
ib        0
invt      0
ivao      0
ivst      0
lct       0
lt        0
ni        0
ppegt     0
pstk      0
re        0
rect      0
sale      0
sstk      0
txp       0
txt       0
xint      0
prcc_f    0
dtype: int64

In [102]:
df[financial_ratios_14].isna().sum()

dch_wc          4759
ch_rsst         4851
dch_rec         4743
dch_inv         4615
soft_assets      592
ch_cs          15918
ch_cm          17107
ch_roa         12678
bm                18
dpi             9228
reoa             591
EBIT             591
ch_fcf          5407
issue              0
dtype: int64

In [103]:
df[financial_ratios_14][df[financial_ratios_14]['ch_rsst'].isna()]

Unnamed: 0,dch_wc,ch_rsst,dch_rec,dch_inv,soft_assets,ch_cs,ch_cm,ch_roa,bm,dpi,reoa,EBIT,ch_fcf,issue
506,,,0.072195,0.093950,0.309468,2.255857,-0.351043,0.026031,0.460195,1.180657,0.250241,0.115694,,1
595,,,,,0.902506,,,,-7.034304,,-2.791459,-1.671689,,0
1200,,,,0.252746,0.991532,,,-0.034184,0.219229,0.151757,-0.015701,0.019983,,1
1601,,,-0.084244,-0.018568,0.790887,0.987211,-0.165315,-0.023623,1.054388,0.826361,-0.405198,0.031624,,1
2151,,,0.127701,0.000000,0.765263,-0.998884,-2.122524,-0.542159,0.887674,1.216388,-12.407675,-0.616995,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145774,,,,,0.004515,,,,4.937457,,-0.105019,-0.009988,,0
145821,,,,,0.993599,,,,1.101880,,0.050989,0.059279,,1
145924,,,,,0.835009,,,,0.300968,,-4.599718,-0.876563,,1
145974,,,,,0.776411,,,,0.402487,,-0.212378,-0.089157,,1


In [104]:
df.shape

(146045, 46)

### Distinct Training and Testing Periods

In [233]:
data =  df.fillna(0)

In [234]:
train_data = data[(data['fyear'] >=1991 ) & (data['fyear'] <= 1999)]
validation_data = data[(data['fyear'] >= 2000 ) & (data['fyear'] <= 2001)]
test_data = data[(data['fyear'] >= 2003 ) & (data['fyear'] <= 2008)]

In [235]:
# Count positive and negative cases
train_misstate_1 = train_data['misstate'].value_counts()[1]
test_misstate_1 = test_data['misstate'].value_counts()[1]
train_misstate_0 = train_data['misstate'].value_counts()[0]
test_misstate_0 = test_data['misstate'].value_counts()[0]

In [236]:
print(f"Positives cases training: ", train_misstate_1)
print(f"Positives cases test: ", test_misstate_1)
print(f"Negative case training: ", train_misstate_0)
print(f"Negative case test: ", test_misstate_0)

Positives cases training:  332
Positives cases test:  261
Negative case training:  53720
Negative case test:  34905


#### For all features


In [237]:
y_train_resampled.value_counts()

0    332
1    332
Name: misstate, dtype: int64

## For 2 layers

In [238]:
hidden_layer_neurons = [(40,50),(40,60),(50,70),(40,70),(50,90)]


In [239]:
# Split the training and testing data into features and labels
X_train = train_data[features]
y_train = train_data['misstate']

X_valid = validation_data[features]
y_val = validation_data['misstate']

X_test = test_data[features]
y_test = test_data['misstate'] 

In [240]:
X_train.shape

rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

print(f"Number of observations in train set: {len(X_train_resampled)}")

Number of observations in train set: 664


### For all features

In [241]:
def two_layer_mlp(inputs,i,j,X_train, y_train):
    clf = MLPClassifier(hidden_layer_sizes=(inputs, i,j),
                            random_state=42,
                            verbose=False,
                            learning_rate_init=0.003,
                            activation='logistic')

    # Fit data onto the model
    clf.fit(X_train, y_train)

    # Make prediction on test dataset
    ypred = clf.predict(X_test)
    auc = metrics.roc_auc_score(y_test, ypred)

    # Calculate the confusion matrix
    cm = confusion_matrix(y_test, ypred)
    TN, FP, FN, TP = cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]  # True negatives ,False positives, False negatives, True positives
    
    return (auc,(TN, FP, FN, TP))



In [242]:
column1,column2,column3,column4,column5,column6, column7= [],[],[],[],[],[],[]

for i,j in hidden_layer_neurons:
    auc,params = two_layer_mlp(42,i,j,X_train_resampled, y_train_resampled)
    column1.append(i)
    column2.append(j)
    column3.append(auc)
    column4.append(params[0])
    column5.append(params[1])
    column6.append(params[2])
    column7.append(params[3])
    
#list of lists for the rows
columns = list(zip(column1, column2, column3, column4, column5, column6, column7))

#headers for each column
headers = ['Neurons in HL-1', 'Neurons in HL-2', 'AUC', 'TN', 'FP', 'FN','TP']

#table using the tabulate function
table = tabulate(columns, headers, tablefmt="fancy_grid")

print(table)

╒═══════════════════╤═══════════════════╤══════════╤═══════╤═══════╤══════╤══════╕
│   Neurons in HL-1 │   Neurons in HL-2 │      AUC │    TN │    FP │   FN │   TP │
╞═══════════════════╪═══════════════════╪══════════╪═══════╪═══════╪══════╪══════╡
│                40 │                50 │ 0.615292 │ 19951 │ 14954 │   89 │  172 │
├───────────────────┼───────────────────┼──────────┼───────┼───────┼──────┼──────┤
│                40 │                60 │ 0.62822  │ 21121 │ 13784 │   91 │  170 │
├───────────────────┼───────────────────┼──────────┼───────┼───────┼──────┼──────┤
│                50 │                70 │ 0.617557 │ 25726 │  9179 │  131 │  130 │
├───────────────────┼───────────────────┼──────────┼───────┼───────┼──────┼──────┤
│                40 │                70 │ 0.624743 │ 21012 │ 13893 │   92 │  169 │
├───────────────────┼───────────────────┼──────────┼───────┼───────┼──────┼──────┤
│                50 │                90 │ 0.602639 │ 22946 │ 11959 │  118 │  143 │
╘═══

### For raw financial data

In [243]:
# Split the training and testing data into features and labels
X_train = train_data[raw_financial_items_28]
y_train = train_data['misstate']

X_valid = validation_data[raw_financial_items_28]
y_val = validation_data['misstate']

X_test = test_data[raw_financial_items_28]
y_test = test_data['misstate'] 

X_train.shape

rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

print(f"Number of observations in train set: {len(X_train_resampled)}")

Number of observations in train set: 664


In [244]:
column1,column2,column3,column4,column5,column6, column7= [],[],[],[],[],[],[]

for i,j in hidden_layer_neurons:
    auc,params = two_layer_mlp(28,i,j,X_train_resampled, y_train_resampled)
    column1.append(i)
    column2.append(j)
    column3.append(auc)
    column4.append(params[0])
    column5.append(params[1])
    column6.append(params[2])
    column7.append(params[3])
    
#list of lists for the rows
columns = list(zip(column1, column2, column3, column4, column5, column6, column7))

#headers for each column
headers = ['Neurons in HL-1', 'Neurons in HL-2', 'AUC', 'TN', 'FP', 'FN','TP']

#table using the tabulate function
table = tabulate(columns, headers, tablefmt="fancy_grid")

print(table)

╒═══════════════════╤═══════════════════╤══════════╤═══════╤═══════╤══════╤══════╕
│   Neurons in HL-1 │   Neurons in HL-2 │      AUC │    TN │    FP │   FN │   TP │
╞═══════════════════╪═══════════════════╪══════════╪═══════╪═══════╪══════╪══════╡
│                40 │                50 │ 0.624215 │ 22045 │ 12860 │  100 │  161 │
├───────────────────┼───────────────────┼──────────┼───────┼───────┼──────┼──────┤
│                40 │                60 │ 0.605036 │ 26858 │  8047 │  146 │  115 │
├───────────────────┼───────────────────┼──────────┼───────┼───────┼──────┼──────┤
│                50 │                70 │ 0.627567 │ 24820 │ 10085 │  119 │  142 │
├───────────────────┼───────────────────┼──────────┼───────┼───────┼──────┼──────┤
│                40 │                70 │ 0.605795 │ 23835 │ 11070 │  123 │  138 │
├───────────────────┼───────────────────┼──────────┼───────┼───────┼──────┼──────┤
│                50 │                90 │ 0.607225 │ 24871 │ 10034 │  130 │  131 │
╘═══

#### Financial ratios only

In [222]:
# Split the training and testing data into features and labels
X_train = train_data[financial_ratios_14]
y_train = train_data['misstate']

X_valid = validation_data[financial_ratios_14]
y_val = validation_data['misstate']

X_test = test_data[financial_ratios_14]
y_test = test_data['misstate'] 

X_train.shape

rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

print(f"Number of observations in train set: {len(X_train_resampled)}")

Number of observations in train set: 664


In [223]:
column1,column2,column3,column4,column5,column6, column7= [],[],[],[],[],[],[]

for i,j in hidden_layer_neurons:
    auc,params = two_layer_mlp(14,i,j,X_train_resampled, y_train_resampled)
    column1.append(i)
    column2.append(j)
    column3.append(auc)
    column4.append(params[0])
    column5.append(params[1])
    column6.append(params[2])
    column7.append(params[3])
    
#list of lists for the rows
columns = list(zip(column1, column2, column3, column4, column5, column6, column7))

#headers for each column
headers = ['Neurons in HL-1', 'Neurons in HL-2', 'AUC', 'TN', 'FP', 'FN','TP']

#table using the tabulate function
table = tabulate(columns, headers, tablefmt="fancy_grid")

print(table)

╒═══════════════════╤═══════════════════╤══════════╤═══════╤═══════╤══════╤══════╕
│   Neurons in HL-1 │   Neurons in HL-2 │      AUC │    TN │    FP │   FN │   TP │
╞═══════════════════╪═══════════════════╪══════════╪═══════╪═══════╪══════╪══════╡
│                40 │                50 │ 0.626636 │ 26761 │  8144 │  134 │  127 │
├───────────────────┼───────────────────┼──────────┼───────┼───────┼──────┼──────┤
│                40 │                60 │ 0.630992 │ 23588 │ 11317 │  108 │  153 │
├───────────────────┼───────────────────┼──────────┼───────┼───────┼──────┼──────┤
│                50 │                70 │ 0.623654 │ 24948 │  9957 │  122 │  139 │
├───────────────────┼───────────────────┼──────────┼───────┼───────┼──────┼──────┤
│                40 │                70 │ 0.594693 │ 27607 │  7298 │  157 │  104 │
├───────────────────┼───────────────────┼──────────┼───────┼───────┼──────┼──────┤
│                50 │                90 │ 0.621879 │ 25894 │  9011 │  130 │  131 │
╘═══

#### for 4 layers

In [256]:
hidden_layer_neurons = [(40,50,70,20),(40,50,60,40),(50,70,90,40),(40,70,40,50),(50,90,50,60)]


In [257]:
# Split the training and testing data into features and labels
X_train = train_data[features]
y_train = train_data['misstate']

X_valid = validation_data[features]
y_val = validation_data['misstate']

X_test = test_data[features]
y_test = test_data['misstate'] 

In [258]:
X_train.shape

rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

print(f"Number of observations in train set: {len(X_train_resampled)}")

Number of observations in train set: 664


### For all features

In [259]:
def two_layer_mlp(inputs,i,j,k,l,X_train, y_train):
    clf = MLPClassifier(hidden_layer_sizes=(inputs, i,j,k,l),
                            random_state=42,
                            verbose=False,
                            learning_rate_init=0.005,
                            activation='logistic')

    # Fit data onto the model
    clf.fit(X_train, y_train)

    # Make prediction on test dataset
    ypred = clf.predict(X_test)
    auc = metrics.roc_auc_score(y_test, ypred)

    # Calculate the confusion matrix
    cm = confusion_matrix(y_test, ypred)
    TN, FP, FN, TP = cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]  # True negatives ,False positives, False negatives, True positives
    
    return (auc,(TN, FP, FN, TP))



In [260]:
column1,column2,column3,column4,column5,column6, column7= [],[],[],[],[],[],[]
column23,column34 = [],[]
for i,j,k,l in hidden_layer_neurons:
    auc,params = two_layer_mlp(42,i,j,k,l,X_train_resampled, y_train_resampled)
    column1.append(i)
    column2.append(j)
    column23.append(k)
    column34.append(l)
    column3.append(auc)
    column4.append(params[0])
    column5.append(params[1])
    column6.append(params[2])
    column7.append(params[3])
    
#list of lists for the rows
columns = list(zip(column1, column2,column23,column34, column3, column4, column5, column6, column7))

#headers for each column
headers = ['Neurons in HL-1', 'Neurons in HL-2','Neurons in HL-3', 'Neurons in HL-4', 'AUC', 'TN', 'FP', 'FN','TP']

#table using the tabulate function
table = tabulate(columns, headers, tablefmt="fancy_grid")

print(table)

╒═══════════════════╤═══════════════════╤═══════════════════╤═══════════════════╤══════════╤═══════╤═══════╤══════╤══════╕
│   Neurons in HL-1 │   Neurons in HL-2 │   Neurons in HL-3 │   Neurons in HL-4 │      AUC │    TN │    FP │   FN │   TP │
╞═══════════════════╪═══════════════════╪═══════════════════╪═══════════════════╪══════════╪═══════╪═══════╪══════╪══════╡
│                40 │                50 │                70 │                20 │ 0.635429 │ 21758 │ 13147 │   92 │  169 │
├───────────────────┼───────────────────┼───────────────────┼───────────────────┼──────────┼───────┼───────┼──────┼──────┤
│                40 │                50 │                60 │                40 │ 0.657703 │ 20237 │ 14668 │   69 │  192 │
├───────────────────┼───────────────────┼───────────────────┼───────────────────┼──────────┼───────┼───────┼──────┼──────┤
│                50 │                70 │                90 │                40 │ 0.64207  │ 20483 │ 14422 │   79 │  182 │
├───────────────

### For raw financial data

In [261]:
# Split the training and testing data into features and labels
X_train = train_data[raw_financial_items_28]
y_train = train_data['misstate']

X_valid = validation_data[raw_financial_items_28]
y_val = validation_data['misstate']

X_test = test_data[raw_financial_items_28]
y_test = test_data['misstate'] 

X_train.shape

rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

print(f"Number of observations in train set: {len(X_train_resampled)}")

Number of observations in train set: 664


In [262]:
column1,column2,column3,column4,column5,column6, column7= [],[],[],[],[],[],[]
column23,column34 = [],[]
for i,j,k,l in hidden_layer_neurons:
    auc,params = two_layer_mlp(28,i,j,k,l,X_train_resampled, y_train_resampled)
    column1.append(i)
    column2.append(j)
    column23.append(k)
    column34.append(l)
    column3.append(auc)
    column4.append(params[0])
    column5.append(params[1])
    column6.append(params[2])
    column7.append(params[3])
    
#list of lists for the rows
columns = list(zip(column1, column2,column23,column34, column3, column4, column5, column6, column7))

#headers for each column
headers = ['Neurons in HL-1', 'Neurons in HL-2','Neurons in HL-3', 'Neurons in HL-4', 'AUC', 'TN', 'FP', 'FN','TP']

#table using the tabulate function
table = tabulate(columns, headers, tablefmt="fancy_grid")

print(table)

╒═══════════════════╤═══════════════════╤═══════════════════╤═══════════════════╤══════════╤═══════╤═══════╤══════╤══════╕
│   Neurons in HL-1 │   Neurons in HL-2 │   Neurons in HL-3 │   Neurons in HL-4 │      AUC │    TN │    FP │   FN │   TP │
╞═══════════════════╪═══════════════════╪═══════════════════╪═══════════════════╪══════════╪═══════╪═══════╪══════╪══════╡
│                40 │                50 │                70 │                20 │ 0.607497 │ 20343 │ 14562 │   96 │  165 │
├───────────────────┼───────────────────┼───────────────────┼───────────────────┼──────────┼───────┼───────┼──────┼──────┤
│                40 │                50 │                60 │                40 │ 0.629537 │ 25225 │  9680 │  121 │  140 │
├───────────────────┼───────────────────┼───────────────────┼───────────────────┼──────────┼───────┼───────┼──────┼──────┤
│                50 │                70 │                90 │                40 │ 0.621505 │ 22792 │ 12113 │  107 │  154 │
├───────────────

#### Financial ratios only

In [263]:
# Split the training and testing data into features and labels
X_train = train_data[financial_ratios_14]
y_train = train_data['misstate']

X_valid = validation_data[financial_ratios_14]
y_val = validation_data['misstate']

X_test = test_data[financial_ratios_14]
y_test = test_data['misstate'] 

X_train.shape

rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

print(f"Number of observations in train set: {len(X_train_resampled)}")

Number of observations in train set: 664


In [264]:
column1,column2,column3,column4,column5,column6, column7= [],[],[],[],[],[],[]
column23,column34 = [],[]
for i,j,k,l in hidden_layer_neurons:
    auc,params = two_layer_mlp(14,i,j,k,l,X_train_resampled, y_train_resampled)
    column1.append(i)
    column2.append(j)
    column23.append(k)
    column34.append(l)
    column3.append(auc)
    column4.append(params[0])
    column5.append(params[1])
    column6.append(params[2])
    column7.append(params[3])
    
#list of lists for the rows
columns = list(zip(column1, column2,column23,column34, column3, column4, column5, column6, column7))

#headers for each column
headers = ['Neurons in HL-1', 'Neurons in HL-2','Neurons in HL-3', 'Neurons in HL-4', 'AUC', 'TN', 'FP', 'FN','TP']

#table using the tabulate function
table = tabulate(columns, headers, tablefmt="fancy_grid")

print(table)

╒═══════════════════╤═══════════════════╤═══════════════════╤═══════════════════╤══════════╤═══════╤═══════╤══════╤══════╕
│   Neurons in HL-1 │   Neurons in HL-2 │   Neurons in HL-3 │   Neurons in HL-4 │      AUC │    TN │    FP │   FN │   TP │
╞═══════════════════╪═══════════════════╪═══════════════════╪═══════════════════╪══════════╪═══════╪═══════╪══════╪══════╡
│                40 │                50 │                70 │                20 │ 0.5      │     0 │ 34905 │    0 │  261 │
├───────────────────┼───────────────────┼───────────────────┼───────────────────┼──────────┼───────┼───────┼──────┼──────┤
│                40 │                50 │                60 │                40 │ 0.5      │     0 │ 34905 │    0 │  261 │
├───────────────────┼───────────────────┼───────────────────┼───────────────────┼──────────┼───────┼───────┼──────┼──────┤
│                50 │                70 │                90 │                40 │ 0.605232 │ 25668 │  9237 │  137 │  124 │
├───────────────

-------------------------
* Build New

In [169]:
# Split the training and testing data into features and labels
X_train = train_data[features]
y_train = train_data['misstate']

X_valid = validation_data[features]
y_val = validation_data['misstate']

X_test = test_data[features]
y_test = test_data['misstate'] 

In [170]:
X_train.shape

(54052, 42)

In [171]:
rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

print(f"Number of observations in train set: {len(X_train_resampled)}")

Number of observations in train set: 664


In [172]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import layers, models
from tensorflow.keras.metrics import AUC
from sklearn.metrics import roc_auc_score


scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_resampled)
X_test = scaler.transform(X_test)

X_train = X_train.reshape(X_train.shape[0], 7, 6, 1)
X_test = X_test.reshape(X_test.shape[0], 7, 6, 1)

# Build the CNN model
model = models.Sequential()
model.add(layers.Conv2D(32, (2, 2), activation='relu', input_shape=(7, 6, 1)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (2, 2), activation='relu'))
model.add(layers.Flatten())
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=[AUC()])

# Train the model
model.fit(X_train, y_train_resampled, epochs=100, batch_size=16)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1d801566c80>

In [173]:
y_pred = model.predict(X_test)
auc = roc_auc_score(y_test, y_pred)
print(f'AUC: {auc:.4f}')

AUC: 0.5679


In [45]:
print(X_train.shape)
print(X_train.size)


(664, 28)
18592


---------------

In [183]:
# Split the training and testing data into features and labels
X_train = train_data[raw_financial_items_28]
y_train = train_data['misstate']

X_valid = validation_data[raw_financial_items_28]
y_val = validation_data['misstate']

X_test = test_data[raw_financial_items_28]
y_test = test_data['misstate'] 

In [184]:
X_train.shape

(54052, 28)

In [185]:
rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

print(f"Number of observations in train set: {len(X_train_resampled)}")

Number of observations in train set: 664


In [186]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import layers, models
from tensorflow.keras.metrics import AUC
from sklearn.metrics import roc_auc_score


scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_resampled)
X_test = scaler.transform(X_test)

X_train = X_train.reshape(X_train.shape[0], 7, 4, 1)
X_test = X_test.reshape(X_test.shape[0], 7, 4, 1)

# Build the CNN model
model = models.Sequential()
model.add(layers.Conv2D(32, (1, 2), activation='relu', input_shape=(7, 4, 1)))
model.add(layers.ZeroPadding2D(padding=((0, 0), (0, 1))))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (1, 2), activation='relu'))
model.add(layers.Flatten())
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=[AUC()])

# Train the model
model.fit(X_train, y_train_resampled, epochs=100, batch_size=16)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1d802347dc0>

In [187]:
y_pred = model.predict(X_test)
auc = roc_auc_score(y_test, y_pred)
print(f'AUC: {auc:.4f}')

AUC: 0.6512


#### Performance Evaluation over 2003-2008 sample

| Input Var | Method | Neurons | Activation Func| Learning rate  | AUC |
|----------|----------|----------|----------|----------|----------|
|   28 Raw Financial Items  |   MLP - 1  |  70   |   Logistic  |   0.003 | 0.6627|
|     |   MLP - 2 |   (40,60)  |   Logistic  |   0.003 |  0.627567  |
|    |   MLP - 4  |   (40,50,60,40)  |   Logitsic  |   0.003  |  0.629537  |
|    |   CNN  |     |     |  |   0.6512  |
|   28 Raw + 14 Finan Ratios |   MLP - 1  |  70   |   Logistic  |   0.005|   0.648682|
|     |   MLP - 2 |   (40,60)  |   Logistic  |   0.003 | 0.62822  |
|    |   MLP - 4  |   (40,60,50,40)  |   Logitsic  |   0.003  |  0.657703  |
|    |   CNN  |     |     |     |    0.5679  |

