In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from collections import Counter
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
import random
from sklearn.utils import resample
import math
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost classifier built with the library's default settings.
# NOTE(review): none of the visible cells read `clf` (they construct their own
# AdaBoostClassifier instances) - confirm whether it is still needed.
clf = AdaBoostClassifier()

In [None]:
# Column names of the 31 audit-related variables (ARV).
# Blank lines keep the groupings used when the list was first written.
ARV = [
    'Industry Specialization (sale)_national',
    'Industry Specialization (sale)_city',

    'Office Size_sales',
    'Big4',

    'New Client',
    'Tenure',

    'Same MSA',
    'INTEGRATED AUDIT',
    'Historical is accelerated filer',
    'Busy',
    'WorkloadCompression',
    'HI_pre_sq',
    'Auditor resigned_revised',

    'Log_AuditFee',
    'Log_TaxFee',
    'Log_AuditRelatedFee',
    'Log_OtherFee',
    'NonAuditFeeRatio',
    'Influence',
    'AbnormalLAF',
    'Log Audit Report Lag',
    'Due_to_Auditor',
    'Going concern',
    'SOX404auditorWeak',

    'Disc. Accruals',
    'Abs (Disc. Accruals)',
    'Abs(Accruals)',
    'Abs(Accruals/CFO)',
    'DD Residual',
    'Small Profit',
    'Prior ROA meet',
]


# Subset of the ARV column names that hold continuous variables
# (these are the columns that get standardized).
Cont_arv = [
    'Industry Specialization (sale)_national',
    'Industry Specialization (sale)_city',

    'Office Size_sales',
    'Tenure',

    'WorkloadCompression',
    'HI_pre_sq',

    'Log_AuditFee',
    'Log_TaxFee',
    'Log_AuditRelatedFee',
    'Log_OtherFee',
    'NonAuditFeeRatio',
    'Influence',
    'AbnormalLAF',
    'Log Audit Report Lag',

    'Disc. Accruals',
    'Abs (Disc. Accruals)',
    'Abs(Accruals)',
    'Abs(Accruals/CFO)',
    'DD Residual',
]

In [None]:
# Feature columns selected from each train/validation/test frame.
# This is the same list object as ARV (an alias, not a copy).
predictors=ARV

# Standardize continuous variables
def Transform(X, cont_cols=None, arv_cols=None):
    """Return a column-trimmed copy of ``X`` with continuous columns z-scored.

    The result keeps the identifier/label columns ``'CIK Number_x'``,
    ``'Data Year - Fiscal_x'``, ``'Label'`` and ``'RES'`` followed by the
    audit-related variables. Every continuous column is replaced by
    ``(x - mean) / std`` using pandas' default sample standard deviation.
    The input frame is left untouched, so the call can safely be repeated.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing the four identifier/label columns and every column
        named in ``arv_cols``.
    cont_cols : iterable of str, optional
        Columns to standardize. Defaults to the module-level ``Cont_arv``.
    arv_cols : iterable of str, optional
        Audit-related variable columns to keep. Defaults to the module-level
        ``ARV``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``X`` itself is not modified.

    Raises
    ------
    KeyError
        If a required column is missing from ``X`` or a name in ``cont_cols``
        is not among the kept columns.

    Notes
    -----
    NOTE(review): the mean and standard deviation are taken over every row of
    ``X``. When ``X`` mixes training and test years, the test rows therefore
    influence the scaling of the training rows - confirm this is intended.
    """
    if cont_cols is None:
        cont_cols=Cont_arv
    if arv_cols is None:
        arv_cols=ARV

    # Select the output columns once, up front, and work on a copy so that
    # neither the caller's frame nor a slice of it is assigned into.
    keep=['CIK Number_x','Data Year - Fiscal_x','Label','RES']+list(arv_cols)
    out=X[keep].copy()

    for name in cont_cols:
        out[name]=(out[name]-out[name].mean())/out[name].std()
    return out


# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
SEED=42      # seeds the random undersampling partitions and AdaBoost so a fresh run reproduces the table
N_ROUNDS=3   # number of independent random partitions evaluated per (test year, cost ratio)

testyears=list(range(2015, 2018))
costratios=[1,20,30,40,50]  # each value is also the number of batches the non-RES rows are split into

MasterPath='/Users/username/'#user name is masked for double-blind review

# Grid of hyperparameters for AdaBoost (the same for every test year, so it is built once)
param_grid = {'base_estimator': [DecisionTreeClassifier(max_depth=1),DecisionTreeClassifier(max_depth=3),DecisionTreeClassifier(max_depth=5)],
              'n_estimators': [30, 50, 70],
              'learning_rate': [0.5, 1, 1.5]}

# scikit-learn 1.2 renamed AdaBoost's `base_estimator` argument to `estimator`
# and 1.4 removed the old name; use whichever keyword this installation accepts.
_ESTIMATOR_KW='estimator' if 'estimator' in AdaBoostClassifier().get_params(deep=False) else 'base_estimator'

# Single random stream for every partition drawn below.
rng=np.random.RandomState(SEED)


def _make_adaboost(base, n_estimators, learning_rate):
    """Build an unfitted, seeded AdaBoostClassifier for one hyperparameter combination."""
    return AdaBoostClassifier(n_estimators=n_estimators,
                              learning_rate=learning_rate,
                              random_state=SEED,
                              **{_ESTIMATOR_KW: base})


def _ou_average_scores(model, fit_X, fit_y, pos_indices, neg_batches, eval_X):
    """Average positive-class probabilities over one model fit per negative batch.

    For each batch of non-RES index labels, ``model`` is refit on all RES rows
    plus that batch (the OU method from Perols et al. (2017)) and scored on
    ``eval_X``. The per-batch probabilities of class 1 are averaged row-wise.

    Parameters
    ----------
    model : estimator with ``fit`` / ``predict_proba``; refit in place for every batch.
    fit_X, fit_y : training features / labels, indexed by the labels in the batches.
    pos_indices : index labels of the RES == 1 rows (used in every batch).
    neg_batches : sequence of arrays of index labels of RES == 0 rows.
    eval_X : features of the rows to score.

    Returns
    -------
    numpy.ndarray with one averaged probability per row of ``eval_X``.
    """
    batch_scores=[]
    for neg_indices in neg_batches:
        under_sample_indices=np.concatenate([pos_indices,neg_indices])
        model.fit(fit_X.loc[under_sample_indices], fit_y.loc[under_sample_indices])
        batch_scores.append(model.predict_proba(eval_X)[:, 1])
    # One vectorized mean instead of recomputing it for every test row.
    return np.mean(batch_scores, axis=0)


# ---------------------------------------------------------------------------
# Result accumulators (one entry per test year x cost ratio x round)
# ---------------------------------------------------------------------------
Test_Year=[]
Cost=[]
Round=[]
BestModel=[]
AUC=[]

for TestYear in testyears:
    FilePath="FolderPath/Analysis/Correct consecutive restatement/Train and Test/Set_"+str(TestYear)+".xlsx" #FolderPath is masked for double-blind review
    Path=MasterPath+FilePath
    df=pd.read_excel(Path)
    trans_df=Transform(df)

    fiscal_year=trans_df['Data Year - Fiscal_x']

    #train and test for final results (note: year TestYear-1 is in neither set)
    test=trans_df[fiscal_year==TestYear]
    train=trans_df[fiscal_year<=TestYear-2]

    #split training data into training for validation and testing for validation
    vali_test=trans_df[fiscal_year==TestYear-2]
    vali_train=trans_df[fiscal_year<=TestYear-4]

    test_X=test[predictors]
    test_y=test['RES']
    train_X=train[predictors]
    train_y=train['RES']

    vali_test_X=vali_test[predictors]
    vali_test_y=vali_test['RES']
    vali_train_X=vali_train[predictors]
    vali_train_y=vali_train['RES']

    # Index labels of the RES / NON-RES instances; they do not depend on the cost ratio.
    vali_train_RES_indices = vali_train[vali_train.RES==1].index
    vali_train_NORES_indices = vali_train[vali_train.RES==0].index
    train_RES_indices = train[train.RES==1].index
    train_NORES_indices = train[train.RES==0].index

    for num_split in costratios:

        #randomly split the NON-RES instances in vali-train into num_split batches following Perols et al. (2017)
        #(one partition shared by every hyperparameter combination so they are compared on equal footing)
        splitted_NORES=np.array_split(rng.permutation(np.asarray(vali_train_NORES_indices)),num_split)

        #Begin Hyperparameter Tuning under each cost ratio
        # The records are reset here so that the best model for this cost ratio
        # is chosen only from AUCs obtained under this cost ratio.
        tuning_records=[]
        for estimator in param_grid['base_estimator']:
            for number in param_grid['n_estimators']:
                for rate in param_grid['learning_rate']:
                    tune=_make_adaboost(estimator, number, rate)
                    y_average_score=_ou_average_scores(tune, vali_train_X, vali_train_y,
                                                       vali_train_RES_indices, splitted_NORES,
                                                       vali_test_X)
                    tuning_records.append({'Base_Estimator': estimator,
                                           'N_Estimators': number,
                                           'Learning_Rate': rate,
                                           'AUC': roc_auc_score(vali_test_y, y_average_score)})

        HyperTune_Results=pd.DataFrame(tuning_records,
                                       columns=['Base_Estimator','N_Estimators','Learning_Rate','AUC'])

        #The tuned model is the one with the combination that maximizes AUC in the hold-out validation set
        #(the first such combination in grid order when several tie)
        tunedresults=HyperTune_Results[HyperTune_Results['AUC']==HyperTune_Results['AUC'].max()].reset_index()

        bestmodel=_make_adaboost(tunedresults['Base_Estimator'][0],
                                 tunedresults['N_Estimators'][0],
                                 tunedresults['Learning_Rate'][0]) #use the hyperparameters selected above

        print("for test year "+str(TestYear)+" the best model is "+ str(bestmodel))

        #End hyperparameter tuning under each cost ratio

        #Use the best model to make classification - trained on the training set and test on the hold-out test set
        for eachround in range(N_ROUNDS):

            # fresh random partition of the training NON-RES instances for every round
            splitted_NORES=np.array_split(rng.permutation(np.asarray(train_NORES_indices)),num_split)

            # average of the probability scores from all splits
            y_average_score=_ou_average_scores(bestmodel, train_X, train_y,
                                               train_RES_indices, splitted_NORES,
                                               test_X)

            auc=roc_auc_score(test_y, y_average_score)

            AUC.append(auc)
            Test_Year.append(TestYear)
            BestModel.append(bestmodel)  # same estimator object for every round of this cost ratio
            Round.append(eachround)
            Cost.append(num_split)

results=pd.DataFrame()
results['Test Year']=Test_Year
results['AUC']=AUC
results['Misclassification Cost Ratio']=Cost
results['Round']=Round
results['Tuned Model']=BestModel

results