In [None]:
# Mount Google Drive at /content/drive so the notebook can read files stored there (Colab-only step).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
cd /content/drive/My Drive/newdata

In [None]:
# Data imputation
!pip install imbalanced-learn
from imblearn.over_sampling import SMOTE
import pandas as pd
import sys
import seaborn as sns
import scipy as sp
import numpy as np
import shelve
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# ANN
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout
from tensorflow.keras.callbacks import EarlyStopping

# model evaluation
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.metrics import plot_confusion_matrix
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Load the two CSV files from the mounted Drive folder into DataFrames.
# NOTE(review): the file names spell "bankcruptcy"; left untouched because they must match the files on disk.
df_train = pd.read_csv('/content/drive/MyDrive/newdata/train_bankcruptcy.csv')
df_test = pd.read_csv('/content/drive/MyDrive/newdata/test_bankcruptcy.csv')

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
def extract_cat_var(df):
    """Print a short report for every object-dtype column of ``df``.

    For each such column the report shows its name, the number of distinct
    values, the distinct values themselves and the value counts (NaN
    included in the counts), followed by a separator line.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-typed columns are described.

    Returns
    -------
    None
    """
    separator = '---------------------------------------------------------'

    for name in df.select_dtypes('object').columns.tolist():
        series = df[name]
        report_lines = (
            'Categorical column: {}\n'.format(name),
            'Number of unique entries: {}\n'.format(series.nunique()),
            'Unique entry names:\n{}\n'.format(series.unique()),
            'Value counts of each entry:\n{}\n'.format(series.value_counts(dropna=False)),
            separator,
        )
        for line in report_lines:
            print(line)

    return None

# modules

In [None]:
def plot_kde_hist_var(df,varList,calcStat = True, drawAll = False):
    """Compare the distribution of each variable between TARGET==1 and TARGET==0 rows.

    For every name in ``varList`` the function prints the per-class median and
    the Pearson correlation with ``TARGET``, draws both class distributions in
    one subplot and (optionally) runs a two-sample Kolmogorov-Smirnov test.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``TARGET`` column plus every column named in ``varList``.
    varList : sequence of str
        Columns to analyse. An empty sequence returns ``([], [])`` without plotting.
    calcStat : bool, default True
        Run the KS test for each variable.
    drawAll : bool, default False
        Forwarded to seaborn as both ``rug`` and ``kde``.

    Returns
    -------
    (ks_hval_list, ks_pval_list) : tuple of lists
        ``ks_pval_list`` holds one p-value per processed variable and
        ``ks_hval_list`` the matching ``p < 0.05`` flag, so both lists are
        aligned with ``varList``. Both are empty when ``calcStat`` is false.
        A variable whose class sample is empty after dropping NaN gets a NaN
        p-value and a ``False`` flag.
    """
    # Import the submodule explicitly: `sp.stats` only resolves if scipy.stats has
    # already been loaded by some other import.
    from scipy import stats

    varList = list(varList)
    numVar = len(varList)
    ks_pval_list = []
    if numVar == 0:
        # Nothing to analyse; also avoids creating a zero-height figure.
        return [], ks_pval_list

    plt.figure(figsize=(10,numVar*4))
    try:
        for i,var in enumerate(varList):
            # Drop NaN up front (without mutating slices of df) so the KS test and the
            # plots see the same clean samples.
            tgt_true = df.loc[df['TARGET']==1,var].dropna()
            tgt_false = df.loc[df['TARGET']==0,var].dropna()

            # calculate statistical significance between both populations
            if calcStat:
                if tgt_true.empty or tgt_false.empty:
                    ks_pval = float('nan')  # the test is undefined for an empty sample
                else:
                    _, ks_pval = stats.ks_2samp(tgt_true,tgt_false)
                ks_pval_list.append(ks_pval)

            median_tgt_true = tgt_true.median()
            median_tgt_false = tgt_false.median()
            corrVal = df['TARGET'].corr(df[var])
            print('Median Value of {} when Target (True): {:.6f}'.format(var,median_tgt_true))
            print('Median Value of {} when Target (False): {:.6f}'.format(var,median_tgt_false))
            print('Pearson Correlation of {} with Target (True): {:.6f}'.format(var,corrVal))

            # drawing KDE distributions
            plt.subplot(numVar,1,i+1)
            sns.distplot(tgt_true,rug=drawAll,kde=drawAll,label='Target: True')
            sns.distplot(tgt_false,rug=drawAll,kde=drawAll,label='Target: False')
            plt.legend()
    except TypeError as error:
        # Best effort: report the problem and return whatever was computed so far.
        print(error)
        print('Features are objects.  Need ints/floats')

    # One flag per p-value (NaN compares as False), always defined.
    ks_hval_list = [bool(pval < 0.05) for pval in ks_pval_list]
    return ks_hval_list, ks_pval_list
def label_encoding_df(df,cat_limit = 2):
    """Label-encode low-cardinality object columns of ``df`` in place.

    An object-dtype column is encoded when its number of distinct values
    (NaN counted as a value) does not exceed ``cat_limit``. The name of each
    encoded column is printed, followed by a summary line.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; its columns are overwritten.
    cat_limit : int, default 2
        Maximum number of distinct values for a column to be encoded.

    Returns
    -------
    (df, label_encode_list) : tuple
        The same frame object and the list of encoded column names.
    """
    encoder = preprocessing.LabelEncoder()
    label_encode_list = []

    for name in df:
        series = df[name]
        if series.dtype != 'object':
            continue
        if series.nunique(dropna=False) > cat_limit:
            continue

        print(name)
        # The single encoder is re-fitted for every column.
        encoder.fit(series)
        df[name] = encoder.transform(series)
        label_encode_list.append(name)

    print('{0} columns were label encoded'.format(len(label_encode_list)))

    return df, label_encode_list
def print_tab_miss_val(df,miss_val_thresh=50,numColPrint=10,printData=False):
    """Tabulate missing values per column and summarise them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    miss_val_thresh : number, default 50
        A column is "critical" when its missing percentage is strictly above this.
    numColPrint : int, default 10
        Number of table rows printed when ``printData`` is True.
    printData : bool, default False
        Print the summary and the head of the table.

    Returns
    -------
    (info_miss_val, tab_miss_val) : tuple
        ``info_miss_val`` is a Series with the number / rounded percentage of
        columns that have missing values and of critical columns;
        ``tab_miss_val`` lists, for each column with at least one missing
        value, the count and percentage (1 decimal), sorted by percentage
        descending.
    """
    total_rows, numCol_total = df.shape

    # Per-column missing counts and their share of all rows
    missing_counts = df.isnull().sum()
    missing_share = missing_counts/total_rows*100

    full_table = pd.concat([missing_counts,missing_share],axis=1)
    full_table.columns = ['Missing Values','Percentage']

    # Keep only affected columns, round, and order by severity
    tab_miss_val = full_table[full_table['Missing Values']>0]
    tab_miss_val = tab_miss_val.assign(Percentage=tab_miss_val['Percentage'].round(1))
    tab_miss_val = tab_miss_val.sort_values(['Percentage'],ascending=False)

    numCol_miss_val = len(tab_miss_val)
    pctCol_miss_val = round((numCol_miss_val/numCol_total)*100)

    is_critical = tab_miss_val['Percentage'] > miss_val_thresh
    numCol_crit_miss_val = tab_miss_val[is_critical].shape[0]
    pctCol_crit_miss_val = round(numCol_crit_miss_val/numCol_total*100)

    summary_labels = ['Cols Missing Values','Cols Missing Values (%)',
                      'Cols Critical Missing Values', 'Cols Critical Missing Values (%)']
    summary_values = [numCol_miss_val,pctCol_miss_val,numCol_crit_miss_val,pctCol_crit_miss_val]
    info_miss_val = pd.Series(data=summary_values,index=summary_labels)

    if printData==True:
        print(info_miss_val)
        print('\n Top {} columns with missing values is as follows:'.format(numColPrint))
        print(tab_miss_val['Percentage'].head(numColPrint))

    return info_miss_val, tab_miss_val
def convSeries2Str(seriesData):
    """Render a Series as ``'index(value), index(value), '``.

    Parameters
    ----------
    seriesData : pandas.Series
        Series whose index/value pairs are formatted in order.

    Returns
    -------
    str
        One ``'{index}({value}), '`` fragment per entry (trailing separator
        included); the empty string for an empty Series.
    """
    # Series.items() replaces iteritems(), which no longer exists in pandas 2.x.
    return ''.join('{}({}), '.format(idx,val) for idx,val in seriesData.items())
def print_basic_info_df(df,bal_thresh=30):
    """Collect basic facts about ``df`` into a Series.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe. When it has a ``TARGET`` column, the share of
        positive rows is reported as well.
    bal_thresh : number, default 30
        The frame counts as balanced when the positive share (in percent,
        truncated to an int) lies within ``[bal_thresh, 100 - bal_thresh]``.

    Returns
    -------
    pandas.Series
        Indexed by 'Num rows', 'Num cols', 'Dtype', 'Memory (MB)', 'True (%)',
        'Is Balanced' and 'Categorical cols'. 'True (%)' and 'Is Balanced' are
        'N/A' when there is no ``TARGET`` column or the frame has no rows.
    """
    (numRow,numCol) = df.shape
    memory = int(sys.getsizeof(df)/(10**6))

    # Number of columns per dtype
    dtypeStr = convSeries2Str(df.dtypes.value_counts())

    # Number of distinct values (NaN included) of every object-typed column
    catStr = convSeries2Str(df.select_dtypes('object').nunique(dropna=False))

    # Is the dataframe balanced?  A zero-row frame has no defined positive share, so it is
    # reported as 'N/A' instead of failing on int(nan).
    if 'TARGET' in df and numRow > 0:
        pctTarget_true = int(df['TARGET'].sum()/numRow*100)
        if pctTarget_true > 100-bal_thresh or pctTarget_true < bal_thresh:
            isBalanced='No'
        else:
            # The 'True'/'No' labels are kept as they are for output compatibility.
            isBalanced='True'
    else:
        isBalanced='N/A'
        pctTarget_true='N/A'

    series_data = [numRow, numCol, dtypeStr,memory,pctTarget_true,isBalanced,catStr]
    series_idx = ['Num rows','Num cols','Dtype','Memory (MB)','True (%)','Is Balanced','Categorical cols']

    return pd.Series(series_data,index = series_idx)
def print_compare_df(df1,df2,miss_val_thresh=50,bal_thresh=30,printCompareData=False):
    """Summarise two DataFrames side by side.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to compare (e.g. a training and a test set).
    miss_val_thresh : number, default 50
        Forwarded to ``print_tab_miss_val``.
    bal_thresh : number, default 30
        Forwarded to ``print_basic_info_df``.
    printCompareData : bool, default False
        When true and both frames contain the same set of dtypes, print and
        bar-plot the value counts of every shared column whose number of
        distinct values differs between the frames by 1 to 4.

    Returns
    -------
    tuple
        ``(comb_basic_info, comb_miss_val_info, miss_val_tab_df1, miss_val_tab_df2)``:
        the two basic-info Series and the two missing-value summaries, each
        concatenated column-wise, plus the per-column missing-value table of
        each frame.
    """
    # Combined basic data of each dataframe. The thresholds are forwarded to the helpers;
    # previously they were accepted here but never used.
    df1_basicinfo = print_basic_info_df(df1,bal_thresh=bal_thresh)
    df2_basicinfo = print_basic_info_df(df2,bal_thresh=bal_thresh)
    comb_basic_info = pd.concat([df1_basicinfo,df2_basicinfo],axis=1)

    # Compare missing value data
    miss_val_info_df1, miss_val_tab_df1 = print_tab_miss_val(df1,miss_val_thresh=miss_val_thresh)
    miss_val_info_df2, miss_val_tab_df2 = print_tab_miss_val(df2,miss_val_thresh=miss_val_thresh)
    comb_miss_val_info = pd.concat([miss_val_info_df1,miss_val_info_df2],axis=1)

    # As the two frames differ in size, one may contain values in a column that the other
    # lacks. For every dtype, find the columns whose distinct-value counts differ by a small
    # non-zero amount (fewer than 5) and show their value counts side by side. The result is
    # only ever printed/plotted, so the work is skipped unless output was requested.
    s1 = set(df1.dtypes)
    s2 = set(df2.dtypes)
    if printCompareData and s1 == s2:
        for x in s1:
            df1_catCols = df1.select_dtypes(x).nunique(dropna=False)
            df2_catCols = df2.select_dtypes(x).nunique(dropna=False)
            # Columns present in only one frame become NaN here and fail every comparison below.
            diff_catColsList = df1_catCols - df2_catCols
            diff_catCols = diff_catColsList[(diff_catColsList<5) & (diff_catColsList>-5) & (diff_catColsList!=0)]
            for y in diff_catCols.index:
                # Label by column name explicitly: the name pandas gives to the result of
                # value_counts() is not the same across pandas versions.
                df1_valCnt = df1[y].value_counts().rename('{}_DF1'.format(y))
                df2_valCnt = df2[y].value_counts().rename('{}_DF2'.format(y))
                comb_valCnt = pd.concat([df1_valCnt,df2_valCnt],axis=1)

                print(comb_valCnt)
                # DataFrame.plot creates its own figure, so no separate plt.figure() call
                # (which only left an empty figure behind).
                comb_valCnt.plot.bar(rot=60,title=y)

    return comb_basic_info, comb_miss_val_info, miss_val_tab_df1, miss_val_tab_df2

# Preprocess

In [None]:
# '?' marks a missing entry in the raw files: turn it into NaN in both frames, and give the
# training label column its working name.
df_train = df_train.replace('?', np.nan).rename(columns={"class":"TARGET"})
df_test = df_test.replace('?', np.nan)

In [None]:
# Convert every column of the training frame to a numeric dtype (raises on unparseable text).
df_train = df_train.apply(pd.to_numeric)
  

In [None]:
# Report any columns of df_train that are still object-typed.
extract_cat_var(df_train)

In [None]:
# Compare the train and test frames. The helper returns, in order: combined basic info,
# combined missing-value summary, and the per-column missing-value table of each frame.
comb_basic_info,comb_miss_val,comb_miss_val_app_train,comb_miss_val_app_test = print_compare_df(df_train,df_test)

print('\nCombined basic info:\n{}'.format(comb_basic_info))
print('\nCombined missing info:\n{}'.format(comb_miss_val))

In [None]:
# NOTE(review): repeats the comparison from the previous cell (printCompareData=False is already
# the default); as the last expression of the cell, its 4-tuple result is shown as cell output.
print_compare_df(df_train,df_test,printCompareData=False)

In [None]:
# Summary statistics of a single feature.
df_train['total_liabilities_minus_cash_over_sales'].describe()

In [None]:
# Pairwise correlations between all columns, then the TARGET column of that matrix sorted
# ascending: head() holds the most negative values and tail() the most positive ones.
df_train_corr = df_train.corr()
df_app_train_corr_target = df_train_corr['TARGET'].sort_values()
print('+ve corr: \n{0}'.format(df_app_train_corr_target.tail(32)))
print('-ve corr: \n{0}'.format(df_app_train_corr_target.head(32)))

In [None]:
# df_app_train_corr_target is sorted ascending, so its head holds the most NEGATIVE
# correlations and its tail the most POSITIVE ones (the two names used to be swapped).
# TARGET's correlation with itself and undefined (NaN) correlations are removed explicitly
# instead of assuming TARGET is the last entry of the sorted Series.
feature_corr = df_app_train_corr_target.drop('TARGET').dropna()
var_neg_corr_list = feature_corr.head(10).index.values
var_pos_corr_list = feature_corr.tail(8).index.values[::-1]  # strongest first

print(var_pos_corr_list)
print(var_neg_corr_list)

# Same variables as plotted before the rename: the most positively correlated features.
#plot_kde_hist_var(df_train,var_neg_corr_list,drawAll=True)
plot_kde_hist_var(df_train,var_pos_corr_list,drawAll=True)

In [None]:
# Keep rows whose liabilities/assets ratio lies in (0, 1]; rows with a missing ratio fail
# every comparison and are dropped too.
# The third condition must be a comparison (==): the former `condi3 = df_train[...] = 1`
# was a chained assignment that overwrote the whole column with the constant 1.
ratio = df_train['total_liabilities_over_total_assets']
condi1 = ratio < 1
condi2 = ratio > 0
condi3 = ratio == 1
# .copy() makes the filtered frame independent, so later column edits do not act on a slice.
df_train = df_train[(condi1|condi3)&condi2].copy()

In [None]:
# Correlation matrix of the filtered training frame
corr_df = df_train.corr()

# Absolute correlations with the diagonal (self-correlation) blanked out
off_diag_abs = corr_df.mask(np.eye(len(corr_df), dtype=bool)).abs()

# Despite its name, `high_corr` is True for the columns that have NO partner with an
# absolute correlation above 0.5, i.e. it selects the weakly correlated columns.
high_corr = ~(off_diag_abs > 0.5).any()

corr_df = corr_df.loc[high_corr, high_corr]
print(corr_df.columns)

In [None]:
#['TARGET','logarithm_of_total_assets','working_capital_over_total_assets','retained_earnings_over_total_assets','profit_on_sales_over_sales']

In [None]:
df_ext_src = df_train[list(corr_df.columns)]
df_ext_src_corr = df_ext_src.corr()

# Size this figure explicitly, before drawing, and scale it with the number of variables so
# the annotations stay readable. The previous code changed the global default figure size
# only after the heatmap had been drawn, so it did not affect the heatmap and instead made
# every later figure 160.7 x 80.27 inches.
heatmap_side = max(8, 0.6 * len(df_ext_src_corr))
fig, ax = plt.subplots(figsize=(1.25 * heatmap_side, heatmap_side))
sns.heatmap(df_ext_src_corr,vmin=-1.0,vmax=1.0,annot=True,ax=ax)

# Keep applying the seaborn theme for the following cells, without the oversized default.
sns.set()

In [None]:
# Summary statistics of two profitability features.
df_train[['EBIT_over_total_assets', 'gross_profit_plus_interest_over_total_assets']].describe()

In [None]:
# Draw at most 5000 complete rows: sample() raises when asked for more rows than exist,
# and a fixed random_state makes the plot reproducible.
df_ext_src_complete = df_ext_src[['total_liabilities_over_total_assets','TARGET']].dropna()
df_ext_src_sample = df_ext_src_complete.sample(n=min(5000, len(df_ext_src_complete)), random_state=42)

grid = sns.PairGrid(data = df_ext_src_sample, diag_sharey=True,
                    hue = 'TARGET',
                    vars = [x for x in list(df_ext_src_sample.columns) if x != 'TARGET'])

grid.map_upper(plt.scatter, alpha = 0.2)
grid.map_diag(sns.kdeplot)
grid.map_lower(sns.kdeplot, cmap = plt.cm.OrRd_r);

In [None]:
# NOTE(review): var_list is not referenced again in the visible part of the notebook.
var_list = ['total_liabilities_over_total_assets']

In [None]:
# Re-bind instead of dropping in place, and ignore an already-missing column, so the cell
# can be re-run without a KeyError and never mutates a filtered slice of the original frame.
df_train = df_train.drop(columns=['total_assets_over_total_liabilities'], errors='ignore')

In [None]:
# Summary statistics of a single feature.
df_train['sales_over_fixed_assets'].describe()

In [None]:
# Histogram of a single feature.
df_train['net_profit_over_total_assets'].hist()

In [None]:
# Histogram of the liabilities/assets ratio.
df_train['total_liabilities_over_total_assets'].hist()

Dealing with missing values using KNN imputation

In [None]:
# NOTE(review): the next three lines register sklearn's private `neighbors._base` module under
# the name `sklearn.neighbors.base`. Nothing visible in this notebook imports that name, and
# KNNImputer comes from sklearn.impute directly — presumably a leftover compatibility shim for
# a third-party package; confirm before removing.
import sklearn.neighbors._base
import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from sklearn.impute import KNNImputer

# Imputer configured with 6 neighbours.
imputer = KNNImputer(n_neighbors=6)

In [None]:
# Preview the training frame before imputation.
df_train.head()

In [None]:
# (rows, columns) of the training frame before imputation.
df_train.shape

In [None]:
# Impute the FEATURES only. Passing TARGET to the imputer would let the label influence the
# filled-in feature values (label leakage) and would "impute" missing labels with non-binary
# neighbour averages. Rows without a label cannot be used for supervised training, so they
# are dropped first.
# NOTE(review): the imputer is still fitted on all rows before the train/test split; fitting
# it on the training split only would remove the remaining leakage.
col = list(df_train.columns)
feature_cols = [c for c in col if c != 'TARGET']
labeled = df_train.dropna(subset=['TARGET'])

X = labeled[feature_cols]
X_new = imputer.fit_transform(X)

df_train_full = pd.DataFrame(X_new, columns=feature_cols)
df_train_full['TARGET'] = labeled['TARGET'].to_numpy()
df_train_full = df_train_full[col]  # restore the original column order
train = df_train_full.copy()

In [None]:
# Stratified hold-out split. A fixed random_state makes the split — and every metric computed
# from it — reproducible across runs (42 matches the seed used for the XGBoost model).
X = train.drop(columns='TARGET')
y = train.TARGET
X_train, X_test,y_train,y_test = train_test_split(X, y,stratify=y,random_state=42)


In [None]:
# Class distribution of the training labels. No resampling step (e.g. SMOTE) is applied in
# the visible notebook before this point, so the plot shows the classes as they are; the
# former title claimed they were balanced.
sns.set_style('white');
sns.set_context(context='notebook',font_scale=1.2)
sns.countplot(x=y_train);
plt.title('Target class distribution (training split)');

In [None]:
from sklearn.preprocessing import RobustScaler

# Learn the scaling statistics from the training split only, then apply them to both splits.
scaler = RobustScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)


# PCA


In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=33)
X_train = pca.fit_transform(X_train)
# Project the test split with the components learned from the training split. Re-fitting on
# the test data (fit_transform) would put it in a different component space than the one the
# models are trained on.
X_test = pca.transform(X_test)

# ANN model

In [None]:
# Early stopping on validation AUC. The AUC metric is registered in compile() under the name
# 'auc' so that Keras actually reports 'val_auc'; with only 'accuracy' compiled, the monitored
# value was never available and the callback could neither stop training nor restore the
# best weights.
early_stop =  EarlyStopping(monitor='val_auc',mode='max', verbose=1, patience=27,restore_best_weights=True)

# ANN: two small hidden layers with light dropout, sigmoid output for binary classification
model =  Sequential()

model.add(Dense(units=8,activation='relu'))
model.add(Dropout(0.10))

model.add(Dense(units=4,activation='relu'))

model.add(Dense(units=1,activation='sigmoid'))

# compile ANN
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics = ['accuracy', tf.keras.metrics.AUC(name='auc')])

In [None]:
# Fit the network for up to 120 epochs; the hold-out split is passed as validation data and
# the early-stopping callback watches it.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=120,
    callbacks=[early_stop],
    verbose=1,
)

In [None]:
# model history to df
loss_plot = pd.DataFrame(model.history.history)
accuracy_plot = pd.DataFrame(model.history.history)

# Pick the seaborn style sheet under whichever name this matplotlib version provides (newer
# releases only ship 'seaborn-v0_8'), and apply it BEFORE the figure is created so that it
# takes effect on this figure.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

#  accuracy and loss plot
fig, (ax1, ax2) = plt.subplots(1, 2,figsize=(14,4))
ax1.plot(loss_plot.loc[:, ['loss']], label='Training loss');
ax1.plot(loss_plot.loc[:, ['val_loss']],label='Validation loss');
ax1.set_title('Training and Validation loss')
ax1.set_xlabel('epochs')
ax1.set_ylabel('Loss')
ax1.legend(loc="best");

ax2.plot(accuracy_plot.loc[:, ['accuracy']],label='Training_accuracy');
ax2.plot(accuracy_plot.loc[:, ['val_accuracy']], label='Validation_accuracy');
ax2.set_title('Training_and_Validation_accuracy');
ax2.set_xlabel('epochs')
ax2.set_ylabel('accuracy')
ax2.legend(loc="best");

In [None]:
# Turn the network outputs into boolean class predictions using a 0.1 cut-off.
y_pred = model.predict(X_test) > 0.1

In [None]:
# Per-class precision / recall / F1 of the ANN predictions on the hold-out split.
print(classification_report(y_test, y_pred))


In [None]:
# Confusion matrix normalised over the true labels (each row sums to 1), drawn as a heatmap.
sns.heatmap(confusion_matrix(y_test,y_pred,normalize='true'), annot=True);#


# XGB model


In [None]:
from xgboost import XGBClassifier
import xgboost as xgb

In [None]:
# Gradient-boosted classifier with a fixed seed.
# NOTE(review): scale_pos_weight=2.2775 is hard-coded; presumably tuned to this data set's
# class ratio — confirm, or derive it from y_train.
clf = XGBClassifier(objective='binary:logistic',seed=42,scale_pos_weight=2.2775)

In [None]:
# Older XGBoost releases take early_stopping_rounds / eval_metric as fit() arguments; recent
# releases reject them there (TypeError) and expect them as estimator parameters instead.
# Try the original call first and fall back to the estimator-parameter form.
# NOTE(review): the hold-out split is used both for early stopping and for the final report.
try:
    clf.fit(X_train, y_train, verbose=True, early_stopping_rounds=10, eval_metric='aucpr', eval_set=[(X_test,y_test)])
except TypeError:
    clf.set_params(early_stopping_rounds=10, eval_metric='aucpr')
    clf.fit(X_train, y_train, verbose=True, eval_set=[(X_test,y_test)])

In [None]:
# Class predictions of the XGBoost model on the hold-out split.
y_pred1 = clf.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the XGBoost predictions on the hold-out split.
print(classification_report(y_test, y_pred1))
