## Now let's choose the best techniques from our trials and build the top 3 models for further tuning and ensembling

In [34]:
# Evaluation criteria: recall score and accuracy.
# Combinations tried: (PCA, LowVar, SelKbest, tree) x (SMOTE, ADASYN).
# Out of the 8 possible cases, univariate feature selection with the chi2 test + ADASYN
# performed the best, so we will proceed with this combination.

In [3]:
# Data Prep
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
# Load the raw table and strip leading whitespace from the column labels.
data = pd.read_csv('data.csv')
data.columns = data.columns.str.lstrip()

# Count missing values per column, then keep the names of the columns that
# contain at least one (the full per-column listing is too long to eyeball).
missing_counts = data.isna().sum()
non_check = [col for col in data if missing_counts[col] > 0]

# Assignment of X and y: the label column is renamed first, then split off.
data = data.rename(columns={'Bankrupt?': 'Bankruptcy'})
y = data['Bankruptcy']
X = data.drop(columns='Bankruptcy')

In [4]:
# Libraries for model building 
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier,GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score,roc_curve
from sklearn.model_selection import cross_validate
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
#pipeline building
from imblearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedKFold

In [5]:
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE 
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import chi2, SelectKBest

In [6]:
## The top performing models are selected for further analysis: 
# xgb_clf, cat_clf, lgb_clf
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline
# Baseline (mostly default-parameter) instances of the three selected boosters.
# The same three names are bound again further down with explicit hyperparameters.
xgb_model, catboost_model, lightgbm_model = (
    XGBClassifier(use_label_encoder=False),
    CatBoostClassifier(),
    LGBMClassifier(),
)

In [33]:
 # Split
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import ADASYN

# Hold out 25% of the rows, stratified on the label; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=0
)

# Remove features with low variance. The selector is fitted on the training
# split only and then applied unchanged to the test split.
# NOTE(review): the 0.1 threshold is applied before rescaling, so it depends on
# each feature's original scale - confirm this is intended.
sel = VarianceThreshold(threshold=0.1)
X_train = sel.fit_transform(X_train)
X_test = sel.transform(X_test)

# Min-max scaling (this is normalization, not standardization); again fitted on
# the training split only.
min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)

# Oversample the training split only, leaving the test split untouched.
# ADASYN generates synthetic samples at random, so it is seeded like the split
# above to make this cell give the same result on every run.
oversample = ADASYN(random_state=0)
X_train, y_train = oversample.fit_resample(X_train, y_train)

In [1]:
#pipe = Pipeline([
#    ('feature sel', VarianceThreshold(0.1)),
#    ('scaler', MinMaxScaler()),
#    ('selection', ADASYN()), 
#    ('classifier', catboost_model)
#])

#params_xgb = {'classifier__gamma' : [0.5, 1], 
#             'classifier__learning_rate' : [ 0.01, 0.05], 
#             'classifier__max_depth' : [2, 3]}

#params_cat = {'classifier__learning_rate' : [0.01, 0.02, 0.005], 
#             'classifier__max_depth' : [1, 2, 3]}
#params_lgb = {'classifier__num_leaves' : [2, 4, 6], 
#             'classifier__max_depth' : [2,4,6], 
#             'classifier__learning_rate' : [0.01, 0.03, 0.05]}



#grid = GridSearchCV(pipe,param_grid=params_cat, cv=cv, scoring='recall', return_train_score=True).fit(X, y)

In [7]:
# Re-create the three boosters with explicit hyperparameters.
xgb_model = XGBClassifier(
    use_label_encoder=False,
    gamma=1,
    learning_rate=0.05,
    max_depth=2,
    eval_metric='error',
)
catboost_model = CatBoostClassifier(
    learning_rate=0.01,
    max_depth=1,
    silent=True,
)
lightgbm_model = LGBMClassifier(
    learning_rate=0.05,
    max_depth=4,
    num_leaves=4,
)

# Display names and estimator objects, kept in the same order.
models_names = ['xgb_model', 'catboost_model', 'lightgbm_model']
built_models = [xgb_model, catboost_model, lightgbm_model]

# Column labels for the score table: a test/train pair for each metric.
metrics = [
    f'{metric}_{split}'
    for metric in ('acc', 'f1', 'rec', 'prec', 'auc')
    for split in ('test', 'train')
]

# Seeded stratified 10-fold splitter (a single repetition).
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=0)

In [65]:
# Evaluation metrics of our top performing three algorithms after hyperparameter tuning.

# Scorers requested from cross_validate. With return_train_score=True each key
# comes back twice, as 'test_<key>' and 'train_<key>'. Defined once, outside
# the loop, and under its own name so it is not overwritten by the results.
scoring = {'acc': 'accuracy',
           'f1_score': 'f1',
           'recall': 'recall',
           'precision': 'precision',
           'roc_auc': 'roc_auc'}

# One row per model (labelled with its display name), one column per metric.
df_scores = pd.DataFrame(np.zeros((len(built_models), len(metrics))),
                         index=models_names, columns=metrics)

for index, model in enumerate(built_models):
    # Feature selection, scaling and oversampling are pipeline steps, so they
    # are fitted inside every cross-validation fold rather than on all of X.
    # NOTE(review): chi2 runs before MinMaxScaler, so it needs the raw features
    # to be non-negative - confirm for this dataset.
    pipe = Pipeline([
        ('feature sel', SelectKBest(chi2, k=30)),
        ('scaler', MinMaxScaler()),
        ('selection', ADASYN(random_state=0)),  # seeded for reproducible folds
        ('classifier', model)
    ])

    scores = cross_validate(pipe, X, y, scoring=scoring, cv=cv, return_train_score=True)
    val_scores = pd.DataFrame(scores).drop(['fit_time', 'score_time'], axis=1)

    # The columns of val_scores ('test_acc', 'train_acc', ...) are named
    # differently from `metrics` ('acc_test', 'acc_train', ...), and the two
    # only correspond by position. Assign a plain array so pandas cannot try to
    # align on the mismatching labels (which would yield NaN where it aligns).
    df_scores.iloc[index, :] = val_scores.mean(axis=0).to_numpy()

In [66]:
# Mean cross-validated test/train scores: one row per model, one column per metric.
df_scores

Unnamed: 0,acc_test,acc_train,f1_test,f1_train,rec_test,rec_train,prec_test,prec_train,auc_test,auc_train
xgb_model,0.898077,0.907367,0.335171,0.376596,0.795455,0.866667,0.212593,0.240606,0.931053,0.955313
catboost_model,0.896319,0.898894,0.328469,0.351742,0.790909,0.85,0.207791,0.221766,0.9256,0.942887
lightgbm_model,0.903798,0.910218,0.354229,0.386365,0.809091,0.875758,0.227592,0.24789,0.931082,0.955064


In [8]:
# Stratified 10-fold splitter for the ensemble evaluation. It is seeded with the
# same random_state as the splitter used for the individual models, so the
# folds are reproducible and comparable between the two evaluations.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=0)

In [14]:
### Trying ensemble techniques: max-vote (hard voting) classifier
from sklearn.ensemble import VotingClassifier

# (name, estimator) pairs for the three boosters that take part in the vote.
base_estimators = [
    ('xgb', xgb_model),
    ('cat', catboost_model),
    ('lgbm', lightgbm_model),
]
ensemble_model = VotingClassifier(estimators=base_estimators, voting='hard')

In [17]:
# Ensemble (max voting classifier - XGBoost, Catboost and Light GBM)

# Scorers to compute. Kept under its own name so the list is not overwritten by
# the cross_validate result below.
ensemble_scoring = ['accuracy', 'recall']

# Same preprocessing steps as for the individual models, with the voting
# ensemble as the final estimator. ADASYN is seeded so reruns give the same scores.
pipe = Pipeline([
    ('feature sel', SelectKBest(chi2, k=30)),
    ('scaler', MinMaxScaler()),
    ('selection', ADASYN(random_state=0)),
    ('classifier', ensemble_model)
])

scores = cross_validate(pipe, X, y, scoring=ensemble_scoring, cv=cv, return_train_score=True)
# Drop the timing columns and average each remaining score over the folds.
val_scores = pd.DataFrame(scores).drop(['fit_time', 'score_time'], axis=1)
ensemble_scores = val_scores.mean(axis=0)

In [70]:
# Result of the ensemble model when using the max (hard) voting classifier:
# accuracy and recall averaged over the cross-validation folds, for test and train.
ensemble_scores

test_accuracy     0.902479
train_accuracy    0.907073
test_recall       0.813636
train_recall      0.868182
dtype: float64