In [None]:
# Mount Google Drive under /content/drive so later cells can read files from it
# (google.colab is only importable inside a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Change the working directory to the project folder on the mounted Drive.
cd /content/drive/My Drive/newdata

In [None]:
!pip install fancyimpute
!pip install impyute

In [None]:
# To suppress warnings
import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Basic Libraries for Data organization, Statistical operations and Plotting
import numpy as np
import pandas as pd
%matplotlib inline
# For loading .arff files
from scipy.io import arff
# To analyze the type of missing data
import missingno as msno
# Library for performing k-NN and MICE imputations 
import fancyimpute
# Library to perform Expectation-Maximization (EM) imputation
import impyute as impy
# To perform mean imputation
#To perform kFold Cross Validation
from sklearn.model_selection import KFold
# Formatted counter of class labels
from collections import Counter
# Ordered Dictionary
from collections import OrderedDict
# Library imbalanced-learn to deal with the data imbalance. To use SMOTE oversampling
from imblearn.over_sampling import SMOTE 
from scipy.stats.mstats import winsorize


# Importing classification models
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

import random

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve

In [None]:
# Load the raw training and test tables from the mounted Drive.
# NOTE(review): these paths spell the folder 'MyDrive' while the `cd` cell uses
# 'My Drive' — confirm both resolve to the same location.
df_train = pd.read_csv('/content/drive/MyDrive/newdata/train_bankcruptcy.csv')
df_test = pd.read_csv('/content/drive/MyDrive/newdata/test_bankcruptcy.csv')

In [None]:
# Preview the first rows of the raw training table.
df_train.head()

# Module

In [None]:
def load_dataframes(a):
    """Return one DataFrame per entry of ``a``, built from each entry's first element."""
    frames = []
    for data_i_year in a:
        frames.append(pd.DataFrame(data_i_year[0]))
    return frames
def set_new_headers(dataframes):
    """Rename the columns of every frame in place to X1..X(n-1) followed by Y.

    The header list is derived from the width of the first frame and assigned
    to each frame, so all frames are expected to have the same number of
    columns. An empty list is accepted and leaves nothing to rename.

    Parameters
    ----------
    dataframes : list of pandas.DataFrame
        Frames whose ``columns`` attribute is overwritten.
    """
    if not dataframes:
        # Nothing to rename; previously this raised IndexError on dataframes[0].
        return
    n_cols = len(dataframes[0].columns)
    cols = ['X' + str(i + 1) for i in range(n_cols - 1)]
    cols.append('Y')
    for df in dataframes:
        df.columns = cols


# Preprocess

In [None]:
# Turn the '?' placeholder into NaN in both tables; the training table's
# 'class' column is additionally renamed to 'TARGET'.
df_train = df_train.replace({'?': np.nan}).rename(columns={"class": "TARGET"})
df_test = df_test.replace({'?': np.nan})

In [None]:
# (rows, columns) of the test table.
df_test.shape

In [None]:
# Cast the first 66 columns of the training table to float.
# Slicing the column index copes with tables narrower than 66 columns, and
# selecting by label avoids attribute-style lookup, which returns the wrong
# object when a column name matches a DataFrame attribute.
train_float_cols = list(df_train.columns[:66])
df_train = df_train.astype({name: float for name in train_float_cols})

In [None]:
# Cast the first 66 columns of the test table to float.
# Slicing the column index copes with tables narrower than 66 columns, and
# selecting by label avoids attribute-style lookup, which returns the wrong
# object when a column name matches a DataFrame attribute.
test_float_cols = list(df_test.columns[:66])
df_test = df_test.astype({name: float for name in test_float_cols})

In [None]:
# One training subset per value (1-4) of the 'forecasting period' column,
# then uniform X1..Xn / Y headers on each subset.
dataframes = [df_train[df_train['forecasting period'] == period] for period in (1, 2, 3, 4)]
df_train_1, df_train_2, df_train_3, df_train_4 = dataframes
set_new_headers(dataframes)


In [None]:
# One test subset per value (1-4) of the 'forecasting period' column,
# then uniform X1..Xn / Y headers on each subset.
dataframes_test = [df_test[df_test['forecasting period'] == period] for period in (1, 2, 3, 4)]
df_test_1, df_test_2, df_test_3, df_test_4 = dataframes_test
set_new_headers(dataframes_test)

In [None]:
# Summary statistics of the last column (renamed to Y) for the 4th training
# subset, i.e. the rows with forecasting period == 4.
dataframes[3].Y.describe()

# Data preprocessing


In [None]:
############################################################
# Build cleaned copies of the dataframes by dropping every row that has a missing value
def drop_nan_rows(dataframes, verbose=False):
    """Return a list with, for each input frame, a copy without rows containing NaN.

    With ``verbose=True`` one line per frame is printed showing the original
    length, the cleaned length and the number of rows removed. The input
    frames are not modified.
    """
    clean_dataframes = []
    for df in dataframes:
        clean_dataframes.append(df.dropna(axis=0, how='any'))
    if verbose:
        for i, (raw, cleaned) in enumerate(zip(dataframes, clean_dataframes)):
            print(str(i+1)+'year:','Original Length=', len(raw), '\tCleaned Length=', len(cleaned), '\tMissing Data=', len(raw)-len(cleaned))
    return clean_dataframes

# Quick analysis of how many rows contain missing values in each of the 4
# subsets, first for the training data and then for the test data.
# The two results are kept under separate names so the test call no longer
# overwrites the training result.
nan_dropped_dataframes = drop_nan_rows(dataframes, verbose=True)
nan_dropped_dataframes_test = drop_nan_rows(dataframes_test, verbose=True)

In [None]:
# generate the sparsity matrix (figure) for all the dataframes
def generate_sparsity_matrix(dfs):
    """Draw one missingno matrix per frame, restricted to columns that contain NaN.

    Every frame in ``dfs`` is processed, however many there are. A frame
    without any missing value has nothing to plot and is skipped.
    """
    for frame in dfs:
        missing_cols = frame.columns[frame.isnull().any()].tolist()
        if not missing_cols:
            continue
        msno.matrix(frame[missing_cols], figsize=(20,5))

generate_sparsity_matrix(dataframes)

In [None]:
# generate the heatmap for all the dataframes
def generate_heatmap(dfs):
    """Draw one missingno heatmap per frame, restricted to columns that contain NaN.

    Every frame in ``dfs`` is processed, however many there are. A frame
    without any missing value has nothing to plot and is skipped.
    """
    for frame in dfs:
        missing_cols = frame.columns[frame.isnull().any()].tolist()
        if not missing_cols:
            continue
        msno.heatmap(frame[missing_cols], figsize=(20,20))

generate_heatmap(dataframes)

*Data imputation*

In [None]:
# k-NN imputation of the missing values
def perform_knn_imputation(dfs):
    """Impute each frame with fancyimpute's KNN (k=100) and return new DataFrames.

    The returned frames wrap the imputer's output directly, so they carry
    default integer column labels and a fresh default index.
    """
    imputed_frames = []
    for frame in dfs:
        filled = fancyimpute.KNN(k=100,verbose=True).fit_transform(frame)
        imputed_frames.append(pd.DataFrame(data=filled))
    return imputed_frames

# Test subsets are imputed first, then the training subsets; afterwards the
# X1..Xn / Y headers are restored on both lists.
knn_imputed_dataframes_test = perform_knn_imputation(dataframes_test)
knn_imputed_dataframes = perform_knn_imputation(dataframes)
set_new_headers(knn_imputed_dataframes)
set_new_headers(knn_imputed_dataframes_test)

In [None]:
# Short aliases for the imputed lists. Both names refer to the same list
# objects, so item assignments through `df` / `test_frames` also change
# knn_imputed_dataframes / knn_imputed_dataframes_test.
df = knn_imputed_dataframes
test_frames = knn_imputed_dataframes_test

# Handling Outliers


In [None]:
# Keep a copy of the X1/X2 columns of every training frame before they are
# dropped further down. All frames are covered (the loop used to stop after 3).
train_index = [frame[['X1','X2']] for frame in df]

In [None]:
# Keep a copy of the X1/X2 columns of every test frame before they are dropped
# further down. The columns are read from test_frames (previously the training
# list was read by mistake) and all frames are covered.
test_index = [frame[['X1','X2']] for frame in test_frames]

In [None]:
from sklearn import preprocessing


In [None]:
# Robust-scale every training frame after removing X1 and X2.
# The drop happens in place on the frame held in the list; the scaled array is
# wrapped back into a DataFrame carrying the remaining column names.
# NOTE(review): every remaining column is passed to the scaler, including Y —
# confirm that scaling the label column is intended.
scaler = preprocessing.RobustScaler()
for i in range(4):
  df[i].drop(columns=['X1', 'X2'], inplace=True)
  col = df[i].columns
  df[i] = pd.DataFrame(scaler.fit_transform(df[i]), columns = col)

In [None]:
# Robust-scale every test frame after removing X1 and X2 (in-place drop, then
# the scaled array is wrapped back into a DataFrame with the same columns).
# NOTE(review): the scaler is re-fitted on each test frame rather than reusing
# the statistics of the matching training frame — confirm this is intended.
for i in range(4):
  test_frames[i].drop(columns=['X1', 'X2'], inplace=True)
  col = test_frames[i].columns
  test_frames[i] = pd.DataFrame(scaler.fit_transform(test_frames[i]), columns = col)

# train

In [None]:
# Clip every feature of the 1st training frame to its own 2nd-98th percentile
# range. The bounds are computed on the whole column: Series.apply hands the
# function one scalar at a time, so clipping inside apply changed nothing.
# The label column 'Y' is skipped (compared as a string, not a list).
a = df[0]
for i in a.columns:
  if i == 'Y':
    continue
  lower = a[i].quantile(0.02)
  upper = a[i].quantile(0.98)
  a[i] = a[i].clip(lower=lower, upper=upper)

In [None]:
# Clip every feature of the 2nd training frame to its own 2nd-98th percentile
# range. The bounds are computed on the whole column: Series.apply hands the
# function one scalar at a time, so clipping inside apply changed nothing.
# The loop runs over this frame's own columns and skips the label column 'Y'.
b = df[1]
for i in b.columns:
  if i == 'Y':
    continue
  lower = b[i].quantile(0.02)
  upper = b[i].quantile(0.98)
  b[i] = b[i].clip(lower=lower, upper=upper)

In [None]:
# Clip every feature of the 3rd training frame to its own 2nd-98th percentile
# range. `c` is bound to df[2]; it used to point at df[1], so the 3rd period
# silently reused the 2nd period's data.
# The bounds are computed on the whole column: Series.apply hands the function
# one scalar at a time, so clipping inside apply changed nothing.
# The loop runs over this frame's own columns and skips the label column 'Y'.
c = df[2]
for i in c.columns:
  if i == 'Y':
    continue
  lower = c[i].quantile(0.02)
  upper = c[i].quantile(0.98)
  c[i] = c[i].clip(lower=lower, upper=upper)

In [None]:
# Clip every feature of the 4th training frame to its own 2nd-98th percentile
# range. `d` is bound to df[3]; it used to point at df[1], so the 4th period
# silently reused the 2nd period's data.
# The bounds are computed on the whole column: Series.apply hands the function
# one scalar at a time, so clipping inside apply changed nothing.
# The loop runs over this frame's own columns and skips the label column 'Y'.
d = df[3]
for i in d.columns:
  if i == 'Y':
    continue
  lower = d[i].quantile(0.02)
  upper = d[i].quantile(0.98)
  d[i] = d[i].clip(lower=lower, upper=upper)

In [None]:
# Collect the four training frames bound to a, b, c and d in the cells above.
clean_dataframes = [a, b, c ,d]

# Test

In [None]:
# Clip every feature of the 1st test frame to its own 2nd-98th percentile
# range. The bounds are computed on the whole column: Series.apply hands the
# function one scalar at a time, so clipping inside apply changed nothing.
# The last column 'Y' is skipped (compared as a string, not a list).
a_test = test_frames[0]
for i in a_test.columns:
  if i == 'Y':
    continue
  lower = a_test[i].quantile(0.02)
  upper = a_test[i].quantile(0.98)
  a_test[i] = a_test[i].clip(lower=lower, upper=upper)

In [None]:
# Clip every feature of the 2nd test frame to its own 2nd-98th percentile
# range. The bounds are computed on the whole column: Series.apply hands the
# function one scalar at a time, so clipping inside apply changed nothing.
# The last column 'Y' is skipped (compared as a string, not a list).
b_test = test_frames[1]
for i in b_test.columns:
  if i == 'Y':
    continue
  lower = b_test[i].quantile(0.02)
  upper = b_test[i].quantile(0.98)
  b_test[i] = b_test[i].clip(lower=lower, upper=upper)

In [None]:
# Clip every feature of the 3rd test frame to its own 2nd-98th percentile
# range. The bounds are computed on the whole column: Series.apply hands the
# function one scalar at a time, so clipping inside apply changed nothing.
# The last column 'Y' is skipped (compared as a string, not a list).
c_test = test_frames[2]
for i in c_test.columns:
  if i == 'Y':
    continue
  lower = c_test[i].quantile(0.02)
  upper = c_test[i].quantile(0.98)
  c_test[i] = c_test[i].clip(lower=lower, upper=upper)

In [None]:
# Clip every feature of the 4th test frame to its own 2nd-98th percentile
# range. The bounds are computed on the whole column: Series.apply hands the
# function one scalar at a time, so clipping inside apply changed nothing.
# The last column 'Y' is skipped (compared as a string, not a list).
d_test = test_frames[3]
for i in d_test.columns:
  if i == 'Y':
    continue
  lower = d_test[i].quantile(0.02)
  upper = d_test[i].quantile(0.98)
  d_test[i] = d_test[i].clip(lower=lower, upper=upper)

In [None]:
# Ten feature columns used as input for the polynomial expansion below.
# NOTE(review): how these ten were selected is not shown in this notebook.
imp_feat_list = ['X5','X31','X51','X14','X59','X42','X53','X36','X6','X48']
# Per-period sub-frames holding only those columns, for train and test.
poly = [a[imp_feat_list], b[imp_feat_list], c[imp_feat_list], d[imp_feat_list]]
poly_test = [a_test[imp_feat_list], b_test[imp_feat_list], c_test[imp_feat_list], d_test[imp_feat_list]]

In [None]:
from sklearn.preprocessing import PolynomialFeatures
poly_transform = PolynomialFeatures(degree=2)

# Expand the selected features of every training frame to degree-2 polynomial
# terms and wrap the result in a DataFrame labelled with the generated names.
for i in range(4):
    poly[i] = poly_transform.fit_transform(poly[i])
    # get_feature_names() is gone from newer scikit-learn releases; prefer
    # get_feature_names_out() and fall back only when it does not exist.
    if hasattr(poly_transform, 'get_feature_names_out'):
        poly_feat_name_list = list(poly_transform.get_feature_names_out(imp_feat_list))
    else:
        poly_feat_name_list = poly_transform.get_feature_names(imp_feat_list)
    poly[i] = pd.DataFrame(poly[i],columns=poly_feat_name_list)
    

In [None]:
# Attach the polynomial features to the matching training frame.
# Rows are paired by position instead of an inner merge on the ten float key
# columns: the merge multiplied rows whenever two rows shared the same key
# values and relied on exact float equality. The column order matches what the
# merge produced: polynomial columns first, then the remaining original ones.
for i in range(4):
    assert len(poly[i]) == len(clean_dataframes[i]), 'polynomial features and frame must have the same number of rows'
    remaining = clean_dataframes[i].drop(columns=imp_feat_list).reset_index(drop=True)
    clean_dataframes[i] = pd.concat([poly[i].reset_index(drop=True), remaining], axis=1)

# Dealing with imbalanced data

In [None]:
def check_data_imbalance(dfs):
    """Print the class distribution of column 'Y' for every frame in ``dfs``.

    For each frame the per-label row counts are printed, followed by the
    percentage of rows whose label equals 1. An empty frame reports 0 instead
    of dividing by zero; any other error is no longer silently swallowed.
    """
    for i in range(len(dfs)):
        print('Dataset: '+str(i+1)+'year')
        print(dfs[i].groupby('Y').size())
        labels = dfs[i]['Y']
        total = len(labels)
        if total:
            minority_percent = (int((labels == 1).sum()) / total)*100
        else:
            minority_percent = 0
        print('Minority (label 1) percentage: '+  str(minority_percent) + '%')
        print('-'*64)
        
# Report the label distribution of the four prepared training frames.
check_data_imbalance(clean_dataframes)

In [None]:
def prepare_kfold_cv_data(k, X, y, verbose=False):
    """Split features and labels into ``k`` shuffled folds (seed 42).

    Returns four lists of length ``k``: training features, training labels,
    held-out features and held-out labels, each entry a NumPy array for one
    fold. ``verbose`` is accepted but not used.
    """
    features = X.values
    labels = y.values
    splitter = KFold(n_splits=k, shuffle=True, random_state=42)
    folds = list(splitter.split(features))

    X_train = [features[train_rows] for train_rows, _ in folds]
    y_train = [labels[train_rows] for train_rows, _ in folds]
    X_test = [features[held_out_rows] for _, held_out_rows in folds]
    y_test = [labels[held_out_rows] for _, held_out_rows in folds]
    return X_train, y_train, X_test, y_test

In [None]:
from xgboost import XGBClassifier


In [None]:
# Keyword arguments for XGBClassifier, one dictionary per dataset
# (param_a -> 1st frame ... param_d -> 4th frame).
# NOTE(review): the values look like the output of an automated
# hyper-parameter search — confirm their origin.
param_a = {'booster': 'gbtree',
 'lambda': 1.433876126076925e-05,
 'alpha': 5.541294137451031e-06,
 'scale_pos_weight': 0.7194659025661956,
 'max_depth': 9,
 'eta': 0.7248402612848088,
 'gamma': 0.0002853050773396981,
 'grow_policy': 'depthwise'}
# The 'dart' configurations additionally carry the dropout-related keys
# (sample_type, normalize_type, rate_drop, skip_drop).
param_b = {'booster': 'dart',
 'lambda': 2.7422411585568494e-06,
 'alpha': 0.02194295318858507,
 'scale_pos_weight': 0.9991694953298548,
 'max_depth': 9,
 'eta': 0.4081155559171879,
 'gamma': 0.0002291703800060232,
 'grow_policy': 'depthwise',
 'sample_type': 'weighted',
 'normalize_type': 'forest',
 'rate_drop': 8.15129169703931e-07,
 'skip_drop': 8.733696949171181e-07}
param_c = {'booster': 'gbtree',
 'lambda': 6.9404480293999284e-06,
 'alpha': 2.0844411765906503e-05,
 'scale_pos_weight': 0.7518281741167833,
 'max_depth': 7,
 'eta': 0.4964415154222043,
 'gamma': 6.187888926971437e-07,
 'grow_policy': 'lossguide'}
param_d = {'booster': 'dart',
 'lambda': 0.0004246095494636485,
 'alpha': 0.0018844772828474623,
 'scale_pos_weight': 0.9840603537740624,
 'max_depth': 7,
 'eta': 0.6859005312049079,
 'gamma': 0.02316741557339404,
 'grow_policy': 'lossguide',
 'sample_type': 'weighted',
 'normalize_type': 'tree',
 'rate_drop': 4.589378989920688e-05,
 'skip_drop': 2.653220112029294e-07}
# Indexed by dataset position in the training loop.
param = [param_a, param_b, param_c, param_d]

In [None]:
# Preview the 2nd prepared training frame.
clean_dataframes[1].head()

In [None]:
def split_dataframes_features_labels(dfs):
    """Split every frame into a feature frame and a label series.

    Features are the columns from position 1 up to, but not including, the
    second-to-last column; labels are the last column.
    NOTE(review): the first column and the column just before the label are
    both left out of the features — confirm that this is intended.
    """
    feature_dfs = []
    label_dfs = []
    for frame in dfs:
        feature_dfs.append(frame.iloc[:, 1:-2])
        label_dfs.append(frame.iloc[:, -1])
    return feature_dfs, label_dfs

# Fresh RobustScaler bound to `scaler`, replacing the instance created earlier.
# NOTE(review): the only later references to it in the visible cells are
# commented out — confirm it is still needed.
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()

In [None]:
# NOTE(review): `random_state` is not referenced in the visible cells — confirm
# whether this import is needed.
from pandas.core.common import random_state
feature_dfs, label_dfs = split_dataframes_features_labels(clean_dataframes)
verbose= False
k_folds = 5
# One summary per dataset. The per-fold arrays below are re-created for every
# dataset, so without this list only the last dataset's numbers would survive.
results_per_year = []
for df_index in range(len(clean_dataframes)):
  if verbose: print('\t\tDataset: ' + '\033[1m' + str(df_index+1) + 'year' + '\033[0m')

  # Calling the 'prepare_kfold_cv_data' returns lists of features and labels
  # for train and test sets respectively.
  # The number of items in the list is equal to k_folds
  X_train_list, y_train_list, X_test_list, y_test_list = prepare_kfold_cv_data(k_folds, feature_dfs[df_index], label_dfs[df_index], verbose)

  # Per-fold metric buffers for the current dataset; precision and recall hold
  # one value per class (two columns).
  metrics_results = OrderedDict()
  accuracy_list = np.zeros([k_folds])
  precision_list = np.zeros([k_folds,2])
  recall_list = np.zeros([k_folds,2])
  TN_list = np.zeros([k_folds])
  FP_list = np.zeros([k_folds])
  FN_list = np.zeros([k_folds])
  TP_list = np.zeros([k_folds])
  # The classifier is configured with the parameter set of this dataset.
  clf = XGBClassifier(**param[df_index])
  # Iterate over all the k-folds
  for k_index in range(k_folds):
    X_train = X_train_list[k_index]
    y_train = y_train_list[k_index]
    X_test = X_test_list[k_index]
    y_test = y_test_list[k_index]

    # Fit the model on the training part of the fold and predict the held-out part.
    # NOTE(review): the held-out fold is also passed as eval_set for early
    # stopping — confirm this is acceptable for the reported scores.
    clf.fit(X_train, y_train, verbose=True, early_stopping_rounds=1000, eval_metric='logloss', eval_set=[[X_test,y_test]])
    y_test_predicted = clf.predict(X_test)

    # Accuracy
    _accuracy_ = accuracy_score(y_test, y_test_predicted, normalize=True)
    accuracy_list[k_index] = _accuracy_

    # Recall, one value per class
    _recalls_ = recall_score(y_test, y_test_predicted, average=None)
    recall_list[k_index] = _recalls_

    # Precision, one value per class
    _precisions_ = precision_score(y_test, y_test_predicted, average=None)
    precision_list[k_index] = _precisions_

    # Confusion matrix cells (rows = true label, columns = predicted label)
    _confusion_matrix_ = confusion_matrix(y_test, y_test_predicted)
    TN_list[k_index] = _confusion_matrix_[0][0]
    FP_list[k_index] = _confusion_matrix_[0][1]
    FN_list[k_index] = _confusion_matrix_[1][0]
    TP_list[k_index] = _confusion_matrix_[1][1]

  # Store the fold averages of this dataset before the buffers are reset.
  results_per_year.append(OrderedDict([
      ('Dataset', str(df_index+1) + 'year'),
      ('Accuracy', np.mean(accuracy_list)),
      ('Precisions', np.mean(precision_list, axis=0)),
      ('Recalls', np.mean(recall_list, axis=0)),
      ('TN', np.mean(TN_list)),
      ('FP', np.mean(FP_list)),
      ('FN', np.mean(FN_list)),
      ('TP', np.mean(TP_list)),
  ]))

In [None]:
# Average the fold-level metrics left behind by the cross-validation cell.
# Those buffers are re-created for each dataset there, so the numbers below
# describe the last dataset that was processed.
metrics_results['Accuracy'] = np.mean(accuracy_list)
metrics_results['Precisions'] = np.mean(precision_list, axis=0)
metrics_results['Recalls'] = np.mean(recall_list, axis=0)
# F1 is derived per class from the averaged precision and recall.
metrics_results['F1-score'] = (2*metrics_results['Precisions']*metrics_results['Recalls'])/(metrics_results['Precisions']+metrics_results['Recalls'])
metrics_results['TN'] = np.mean(TN_list)
metrics_results['FP'] = np.mean(FP_list)
metrics_results['FN'] = np.mean(FN_list)
metrics_results['TP'] = np.mean(TP_list)

# One-row table of the metrics (previously this name was bound to the
# pandas.DataFrame class itself instead of an actual table).
year_results = pd.DataFrame([metrics_results])

if verbose:
  print('\t\t\tAccuracy:', metrics_results['Accuracy'])
  print('\t\t\tPrecision:', metrics_results['Precisions'])
  print('\t\t\tRecall:', metrics_results['Recalls'])

metrics_results