# Research - Results

* The dataset utilized in this study has been subjected to undersampling as a mitigation strategy to address the issue of class imbalance.

In [2]:
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, ndcg_score, precision_score, recall_score
import time
from sklearn import metrics
from imblearn.under_sampling import RandomUnderSampler
from tabulate import tabulate
import matplotlib.pyplot as plt
from sklearn.svm import SVC
import xgboost as xgb
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression

In [23]:
class DataProcessor:
    def __init__(self, data, train_period, val_period, test_period, window_size):
        self.data = data
        self.train_period = train_period
        self.val_period = val_period
        self.test_period = test_period
        self.window_size = window_size


    def split_data_periods(self,train_period,test_period):
        train_data = self.data[(self.data['fyear'] >= train_period[0] ) & (self.data['fyear'] <= train_period[1])]
        validation_data = self.data[(self.data['fyear'] >= self.val_period[0] ) & (self.data['fyear'] <= self.val_period[1])]
        test_data = self.data[(self.data['fyear'] >= test_period[0] ) & (self.data['fyear'] <= test_period[1])]
        return train_data, validation_data, test_data

    def create_batches(self):
        train_batches,test_batches =[], []
        train_start,train_end = self.train_period[0], self.train_period[0]+self.window_size
        test_start,test_end = self.test_period[0], self.test_period[0]+self.window_size

        while test_start <= 2023: 
            train_batches.append((train_start,train_end))
            test_batches.append((test_start,test_end))
 
            train_start += self.window_size
            train_end += self.window_size
            test_start += self.window_size
            test_end += self.window_size

            if test_end > 2023:
                test_end==2023
        return train_batches, test_batches

In [12]:
# select the columns to use as features
features = ['act', 'ap', 'at', 'ceq', 'che', 'cogs', 'csho', 'dlc', 'dltis', 'dltt', 'dp', 'ib', 'invt', 'ivao', 'ivst', 'lct', 'lt', 'ni', 'ppegt', 'pstk', 're', 'rect',
            'sale', 'sstk', 'txp', 'txt', 'xint', 'prcc_f', 'dch_wc', 'ch_rsst', 'dch_rec', 'dch_inv', 'soft_assets', 'ch_cs', 'ch_cm', 'ch_roa', 'bm', 'dpi', 'reoa', 'EBIT', 'ch_fcf']

raw_financial_items_28 = ['act', 'ap', 'at', 'ceq', 'che', 'cogs', 'csho', 'dlc', 'dltis', 'dltt', 'dp', 'ib', 'invt', 'ivao', 'ivst', 'lct', 'lt', 'ni', 'ppegt', 'pstk', 're', 'rect',
            'sale', 'sstk', 'txp', 'txt', 'xint', 'prcc_f']

financial_ratios_14 = ['dch_wc', 'ch_rsst', 'dch_rec', 'dch_inv', 'soft_assets', 'ch_cs', 'ch_cm', 'ch_roa', 'bm', 'dpi', 'reoa', 'EBIT', 'ch_fcf','issue']


In [13]:
import pandas as pd
data = pd.read_csv('Data/data_FraudDetection_JAR2020.csv')

In [24]:
data_obj = DataProcessor(data,(1991,1999), (2000,2001), (2003,2014), 5)


## Results - Data [Batch Processing]

### Reproduce Bao

In [20]:
def rus_boost(X_train, y_train, X_test, y_test):
    metrics = {}
    base_model = DecisionTreeClassifier(min_samples_leaf=5)
    rusboost = AdaBoostClassifier(base_model, n_estimators=300, learning_rate=0.1)
    rusboost.fit(X_train, y_train)

    y_pred = rusboost.predict(X_test)
    y_scores = rusboost.decision_function(X_test)

    
    metrics['auc'] = roc_auc_score(y_test, y_scores)

    return metrics['auc']

def svm_model(X_train,y_train,X_test,y_test):
    svc=SVC(kernel='linear', probability=True, class_weight='balanced', random_state=42)
    svc.fit(X_train,y_train)
    y_scores = svc.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, y_scores)

def xgb_model(X_train,y_train,X_test,y_test):
    xgb_clf = xgb.XGBClassifier()
    xgb_clf.fit(X_train,y_train)
    y_scores = xgb_clf.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, y_scores)

def logistic_regression_model(X_train, y_train, X_test, y_test):

    logit_clf = LogisticRegression(solver='liblinear') 
    logit_clf.fit(X_train, y_train)
    y_scores = logit_clf.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, y_scores)

def probit_regression_model(X_train, y_train, X_test, y_test):
    X_train = sm.add_constant(X_train)
    X_test = sm.add_constant(X_test)
    probit_model = sm.Probit(y_train, X_train)
    probit_result = probit_model.fit()
    y_scores = probit_result.predict(X_test)
    return roc_auc_score(y_test, y_scores)
    

def evaluate(item,train_data,test_data,model_name):
    X_train, y_train = train_data[item], train_data['misstate']
    X_test, y_test = test_data[item], test_data['misstate'] 

    rus = RandomUnderSampler(random_state=42)
    X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

    print("Training data Shape after sampling:", X_train_resampled.shape)
    print("Test data Shape:", X_test.shape)
    return model_name(X_train_resampled, y_train_resampled, X_test, y_test)

def null_check(item,train_data,val_data,test_data):
    train_data = train_data.dropna(subset=item)
    val_data = val_data.dropna(subset=item)
    test_data = test_data.dropna(subset=item)
    return train_data,val_data,test_data

def results(obj,train_period,test_period,item,model_name):
    train_data, validation_data, test_data = obj.split_data_periods(train_period,test_period)
    train_data, validation_data, test_data = null_check(item,train_data, validation_data, test_data)
    return evaluate(item,train_data,test_data,model_name)

In [116]:
test_periods = [(2003,2005), (2003,2008), (2003,2011), (2003,2014)]
models = {"RUS BOOST": rus_boost,
          "Logit":logistic_regression_model,
         "SVM": svm_model}
data_items = {"28 Raw Financial Items": raw_financial_items_28,
              "14 Financial Ratios": financial_ratios_14,
              "28 Raw Financial Items + 14 Financial Ratios": features}

In [None]:
train_period = (1991,1999)
for model in models.keys():
    print("MODEL: ", model)
    print("---"*25)
    for test_period in test_periods:
        print("Test period: ", test_period)
        print("---"*10)
        for data_item in data_items.keys():
            print(data_item)
            print("---"*5)
            print(results(data_obj,train_period,test_period,data_items[data_item],models[model]))
            print("---"*20)

MODEL:  RUS BOOST
---------------------------------------------------------------------------
Test period:  (2003, 2005)
------------------------------
28 Raw Financial Items
---------------
Training data Shape after sampling: (664, 28)
Test data Shape: (17778, 28)
0.7169351621672315
------------------------------------------------------------
14 Financial Ratios
---------------
Training data Shape after sampling: (596, 14)
Test data Shape: (15724, 14)
0.6180801523818429
------------------------------------------------------------
28 Raw Financial Items + 14 Financial Ratios
---------------
Training data Shape after sampling: (596, 41)
Test data Shape: (15724, 41)
0.6500865717396342
------------------------------------------------------------
Test period:  (2003, 2008)
------------------------------
28 Raw Financial Items
---------------
Training data Shape after sampling: (664, 28)
Test data Shape: (35166, 28)
0.6838289588434069
--------------------------------------------------------

## Reproduce Beneish - SVM

In [None]:
ben_data = pd.read_csv("Beneish_scores_final.csv")
ben_data = ben_data.rename(columns={'Mistate':'misstate'})
ben_data = ben_data.replace([np.inf,-np.inf],np.nan)
ben_obj = DataProcessor(ben_data,(1991,1999), (2000,2001), (2003,2014), 5)

In [None]:
m_col = ['dsri','gmi','aqi','sgi','depi','sgai','lvgi','tata']

In [None]:
test_periods = [(2003,2005), (2003,2008), (2003,2011), (2003,2014)]
models = {"SVM": svm_model}
data_items = {"Calculated M score Feat": m_col}

train_period = (1991,1999)
for model in models.keys():
    print("MODEL: ", model)
    print("---"*25)
    for test_period in test_periods:
        print("Test period: ", test_period)
        print("---"*10)
        for data_item in data_items.keys():
            print(data_item)
            print("---"*5)
            print(results(ben_obj,train_period,test_period,data_items[data_item],models[model]))
            print("---"*20)

### Reproduce Beneish - Probit/Logit

In [109]:
test_periods = [(2003,2005), (2003,2008), (2003,2011), (2003,2014)]
models = {"Logit": logistic_regression_model,
         "Probit": probit_regression_model}
data_items = {"Calculated M score Feat": m_col}

train_period = (1991,1999)
for model in models.keys():
    print("MODEL: ", model)
    print("---"*25)
    for test_period in test_periods:
        print("Test period: ", test_period)
        print("---"*10)
        for data_item in data_items.keys():
            print(data_item)
            print("---"*5)
            print(results(ben_obj,train_period,test_period,data_items[data_item],models[model]))
            print("---"*20)

MODEL:  Logit
---------------------------------------------------------------------------
Test period:  (2003, 2005)
------------------------------
Calculated M score Feat
---------------
Training data Shape after sampling: (572, 8)
Test data Shape: (17602, 8)
0.6243783247396255
------------------------------------------------------------
Test period:  (2003, 2008)
------------------------------
Calculated M score Feat
---------------
Training data Shape after sampling: (572, 8)
Test data Shape: (33425, 8)
0.6046082740958321
------------------------------------------------------------
Test period:  (2003, 2011)
------------------------------
Calculated M score Feat
---------------
Training data Shape after sampling: (572, 8)
Test data Shape: (47800, 8)
0.6033893280710942
------------------------------------------------------------
Test period:  (2003, 2014)
------------------------------
Calculated M score Feat
---------------
Training data Shape after sampling: (572, 8)
Test data Shap