In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline


In [2]:
# preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import pandas_profiling as pp

In [3]:
# models
from sklearn.linear_model import LogisticRegression, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier 
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, VotingClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier

In [4]:
# NN models
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import optimizers
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [5]:
# model tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [6]:
data = pd.read_csv("/content/sample_data/column_2C_weka.csv")

In [7]:
data.head()

Unnamed: 0,pelvic_incidence,pelvic_tilt numeric,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis,class
0,63.027818,22.552586,39.609117,40.475232,98.672917,-0.2544,Abnormal
1,39.056951,10.060991,25.015378,28.99596,114.405425,4.564259,Abnormal
2,68.832021,22.218482,50.092194,46.613539,105.985135,-3.530317,Abnormal
3,69.297008,24.652878,44.311238,44.64413,101.868495,11.211523,Abnormal
4,49.712859,9.652075,28.317406,40.060784,108.168725,7.918501,Abnormal


In [8]:
data.tail()

Unnamed: 0,pelvic_incidence,pelvic_tilt numeric,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis,class
305,47.903565,13.616688,36.0,34.286877,117.449062,-4.245395,Normal
306,53.936748,20.721496,29.220534,33.215251,114.365845,-0.42101,Normal
307,61.446597,22.694968,46.170347,38.751628,125.670725,-2.70788,Normal
308,45.252792,8.693157,41.583126,36.559635,118.545842,0.21475,Normal
309,33.841641,5.073991,36.641233,28.767649,123.945244,-0.199249,Normal


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310 entries, 0 to 309
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   pelvic_incidence          310 non-null    float64
 1   pelvic_tilt numeric       310 non-null    float64
 2   lumbar_lordosis_angle     310 non-null    float64
 3   sacral_slope              310 non-null    float64
 4   pelvic_radius             310 non-null    float64
 5   degree_spondylolisthesis  310 non-null    float64
 6   class                     310 non-null    object 
dtypes: float64(6), object(1)
memory usage: 17.1+ KB


In [10]:
data.describe()

Unnamed: 0,pelvic_incidence,pelvic_tilt numeric,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis
count,310.0,310.0,310.0,310.0,310.0,310.0
mean,60.496653,17.542822,51.93093,42.953831,117.920655,26.296694
std,17.23652,10.00833,18.554064,13.423102,13.317377,37.559027
min,26.147921,-6.554948,14.0,13.366931,70.082575,-11.058179
25%,46.430294,10.667069,37.0,33.347122,110.709196,1.603727
50%,58.691038,16.357689,49.562398,42.404912,118.268178,11.767934
75%,72.877696,22.120395,63.0,52.695888,125.467674,41.287352
max,129.834041,49.431864,125.742385,121.429566,163.071041,418.543082


In [11]:
data.isna().sum()

pelvic_incidence            0
pelvic_tilt numeric         0
lumbar_lordosis_angle       0
sacral_slope                0
pelvic_radius               0
degree_spondylolisthesis    0
class                       0
dtype: int64

In [12]:
data.shape

(310, 7)

In [13]:
data.columns

Index(['pelvic_incidence', 'pelvic_tilt numeric', 'lumbar_lordosis_angle',
       'sacral_slope', 'pelvic_radius', 'degree_spondylolisthesis', 'class'],
      dtype='object')

In [14]:
data.drop(columns=['degree_spondylolisthesis'])

Unnamed: 0,pelvic_incidence,pelvic_tilt numeric,lumbar_lordosis_angle,sacral_slope,pelvic_radius,class
0,63.027818,22.552586,39.609117,40.475232,98.672917,Abnormal
1,39.056951,10.060991,25.015378,28.995960,114.405425,Abnormal
2,68.832021,22.218482,50.092194,46.613539,105.985135,Abnormal
3,69.297008,24.652878,44.311238,44.644130,101.868495,Abnormal
4,49.712859,9.652075,28.317406,40.060784,108.168725,Abnormal
...,...,...,...,...,...,...
305,47.903565,13.616688,36.000000,34.286877,117.449062,Normal
306,53.936748,20.721496,29.220534,33.215251,114.365845,Normal
307,61.446597,22.694968,46.170347,38.751628,125.670725,Normal
308,45.252792,8.693157,41.583126,36.559635,118.545842,Normal


EDA

In [15]:
!pip install pandas



# Preparing to modeling 

Encoding categorical features

In [16]:
# Determination categorical features
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
categorical_columns = []
features = data.columns.values.tolist()
for col in features:
    if data[col].dtype in numerics: continue
    categorical_columns.append(col)
categorical_columns

['class']

In [17]:
# Encoding categorical features
for col in categorical_columns:
    if col in data.columns:
        le = LabelEncoder()
        le.fit(list(data[col].astype(str).values))
        data[col] = le.transform(list(data[col].astype(str).values))

In [18]:
target_name = 'class'
data_target = data[target_name]
data = data.drop([target_name], axis=1)

In [19]:
train, test, target, target_test = train_test_split(data, data_target, test_size=0.3, random_state=1)

In [20]:
print(target.shape)
print(target_test.shape)
print(train.shape)
print(test.shape)

(217,)
(93,)
(217, 6)
(93, 6)


Creation of training and validation sets

In [21]:
#%% split training set to validation set
Xtrain, Xval, Ztrain, Zval = train_test_split(train, target, test_size=0.3, random_state=1)

Tuning models and test for all features 

# Logistic Regression 

In [22]:
# Logistic Regression

logreg = LogisticRegression()
logreg.fit(train, target)
acc_log = round(logreg.score(train, target) * 100, 2)
acc_log

84.79

In [23]:
acc_test_log = round(logreg.score(test, target_test) * 100, 2)
acc_test_log

87.1

# Support Vector Machines

In [24]:
svc = SVC()
svc.fit(train, target)
acc_svc = round(svc.score(train, target) * 100, 2)
acc_svc

84.33

In [25]:
acc_test_svc = round(svc.score(test, target_test) * 100, 2)
acc_test_svc

84.95

# Linear SVC

In [26]:
linear_svc = LinearSVC(dual=False)  # dual=False when n_samples > n_features.
linear_svc.fit(train, target)
acc_linear_svc = round(linear_svc.score(train, target) * 100, 2)
acc_linear_svc

84.79

In [27]:
acc_test_linear_svc = round(linear_svc.score(test, target_test) * 100, 2)
acc_test_linear_svc

86.02

# k-Nearest Neighbors algorithm

In [28]:
knn = GridSearchCV(estimator=KNeighborsClassifier(), param_grid={'n_neighbors': [2, 3]}, cv=10).fit(train, target)
acc_knn = round(knn.score(train, target) * 100, 2)
print(acc_knn, knn.best_params_)

90.32 {'n_neighbors': 3}


In [29]:
acc_test_knn = round(knn.score(test, target_test) * 100, 2)
acc_test_knn

86.02

# Gaussian Naive Bayes

In [30]:
gaussian = GaussianNB()
gaussian.fit(train, target)
acc_gaussian = round(gaussian.score(train, target) * 100, 2)
acc_gaussian

76.96

In [31]:
acc_test_gaussian = round(gaussian.score(test, target_test) * 100, 2)
acc_test_gaussian

81.72

# Perceptron

In [32]:
perceptron = Perceptron()
perceptron.fit(train, target)
acc_perceptron = round(perceptron.score(train, target) * 100, 2)
acc_perceptron

82.03

In [33]:
acc_test_perceptron = round(perceptron.score(test, target_test) * 100, 2)
acc_test_perceptron

84.95

# Stochastic Gradient Descent

In [34]:
sgd = SGDClassifier()
sgd.fit(train, target)
acc_sgd = round(sgd.score(train, target) * 100, 2)
acc_sgd

77.88

In [35]:
acc_test_sgd = round(perceptron.score(test, target_test) * 100, 2)
acc_test_sgd

84.95

# Decision Tree Classifier

In [36]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(train, target)
acc_decision_tree = round(decision_tree.score(train, target) * 100, 2)
acc_decision_tree

100.0

In [37]:
acc_test_decision_tree = round(decision_tree.score(test, target_test) * 100, 2)
acc_test_decision_tree

80.65

# Random Forest

In [38]:
random_forest = GridSearchCV(estimator=RandomForestClassifier(), param_grid={'n_estimators': [100, 300]}, cv=5).fit(train, target)
random_forest.fit(train, target)
acc_random_forest = round(random_forest.score(train, target) * 100, 2)
print(acc_random_forest,random_forest.best_params_)

100.0 {'n_estimators': 100}


In [39]:
acc_test_random_forest = round(random_forest.score(test, target_test) * 100, 2)
acc_test_random_forest

83.87

XGB

In [40]:
def hyperopt_xgb_score(params):
    clf = XGBClassifier(**params)
    current_score = cross_val_score(clf, train, target, cv=10).mean()
    print(current_score, params)
    return current_score 
 
space_xgb = {
            'learning_rate': hp.quniform('learning_rate', 0, 0.05, 0.0001),
            'n_estimators': hp.choice('n_estimators', range(100, 1000)),
            'eta': hp.quniform('eta', 0.025, 0.5, 0.005),
            'max_depth':  hp.choice('max_depth', np.arange(2, 12, dtype=int)),
            'min_child_weight': hp.quniform('min_child_weight', 1, 9, 0.025),
            'subsample': hp.quniform('subsample', 0.5, 1, 0.005),
            'gamma': hp.quniform('gamma', 0.5, 1, 0.005),
            'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.005),
            'eval_metric': 'auc',
            'objective': 'binary:logistic',
            'booster': 'gbtree',
            'tree_method': 'exact',
            'silent': 1,
            'missing': None
        }
 
best = fmin(fn=hyperopt_xgb_score, space=space_xgb, algo=tpe.suggest, max_evals=10)
print('best:')
print(best)


0.7928571428571429
{'booster': 'gbtree', 'colsample_bytree': 0.97, 'eta': 0.325, 'eval_metric': 'auc', 'gamma': 0.975, 'learning_rate': 0.0134, 'max_depth': 2, 'min_child_weight': 8.35, 'missing': None, 'n_estimators': 612, 'objective': 'binary:logistic', 'silent': 1, 'subsample': 0.705, 'tree_method': 'exact'}
0.8112554112554111
{'booster': 'gbtree', 'colsample_bytree': 0.54, 'eta': 0.03, 'eval_metric': 'auc', 'gamma': 0.6950000000000001, 'learning_rate': 0.004200000000000001, 'max_depth': 4, 'min_child_weight': 5.65, 'missing': None, 'n_estimators': 268, 'objective': 'binary:logistic', 'silent': 1, 'subsample': 0.505, 'tree_method': 'exact'}
0.8112554112554111
{'booster': 'gbtree', 'colsample_bytree': 0.865, 'eta': 0.135, 'eval_metric': 'auc', 'gamma': 0.555, 'learning_rate': 0.030100000000000002, 'max_depth': 4, 'min_child_weight': 8.450000000000001, 'missing': None, 'n_estimators': 609, 'objective': 'binary:logistic', 'silent': 1, 'subsample': 0.555, 'tree_method': 'exact'}
0.82489

In [41]:
params = space_eval(space_xgb, best)
params

{'booster': 'gbtree',
 'colsample_bytree': 0.97,
 'eta': 0.325,
 'eval_metric': 'auc',
 'gamma': 0.975,
 'learning_rate': 0.0134,
 'max_depth': 2,
 'min_child_weight': 8.35,
 'missing': None,
 'n_estimators': 612,
 'objective': 'binary:logistic',
 'silent': 1,
 'subsample': 0.705,
 'tree_method': 'exact'}

In [42]:
XGB_Classifier = XGBClassifier(**params)
XGB_Classifier.fit(train, target)
acc_XGB_Classifier = round(XGB_Classifier.score(train, target) * 100, 2)
acc_XGB_Classifier

84.33

In [43]:
acc_test_XGB_Classifier = round(XGB_Classifier.score(test, target_test) * 100, 2)
acc_test_XGB_Classifier

87.1

In [44]:
fig =  plt.figure(figsize = (15,15))
axes = fig.add_subplot(111)
xgb.plot_importance(XGB_Classifier,ax = axes,height =0.5)
plt.show();
plt.close()

LGBM Classifier 

In [45]:
def hyperopt_lgb_score(params):
    clf = LGBMClassifier(**params)
    current_score = cross_val_score(clf, train, target, cv=10).mean()
    print(current_score, params)
    return current_score 
 
space_lgb = {
            'learning_rate': hp.quniform('learning_rate', 0, 0.05, 0.0001),
            'n_estimators': hp.choice('n_estimators', range(100, 1000)),
            'max_depth':  hp.choice('max_depth', np.arange(2, 12, dtype=int)),
            'num_leaves': hp.choice('num_leaves', 2*np.arange(2, 2**11, dtype=int)),
            'min_child_weight': hp.quniform('min_child_weight', 1, 9, 0.025),
            'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.005),
            'objective': 'binary',
            'boosting_type': 'gbdt',
            }
 
best = fmin(fn=hyperopt_lgb_score, space=space_lgb, algo=tpe.suggest, max_evals=10)
print('best:')
print(best)

0.8478354978354978
{'boosting_type': 'gbdt', 'colsample_bytree': 0.58, 'learning_rate': 0.006900000000000001, 'max_depth': 3, 'min_child_weight': 3.4250000000000003, 'n_estimators': 638, 'num_leaves': 286, 'objective': 'binary'}
0.8341991341991342
{'boosting_type': 'gbdt', 'colsample_bytree': 0.64, 'learning_rate': 0.012100000000000001, 'max_depth': 3, 'min_child_weight': 6.525, 'n_estimators': 831, 'num_leaves': 3388, 'objective': 'binary'}
0.8432900432900432
{'boosting_type': 'gbdt', 'colsample_bytree': 0.655, 'learning_rate': 0.0063, 'max_depth': 10, 'min_child_weight': 2.85, 'n_estimators': 788, 'num_leaves': 2844, 'objective': 'binary'}
0.8480519480519482
{'boosting_type': 'gbdt', 'colsample_bytree': 0.615, 'learning_rate': 0.049, 'max_depth': 11, 'min_child_weight': 1.1500000000000001, 'n_estimators': 710, 'num_leaves': 2298, 'objective': 'binary'}
0.8476190476190476
{'boosting_type': 'gbdt', 'colsample_bytree': 0.625, 'learning_rate': 0.0397, 'max_depth': 4, 'min_child_weight': 

In [46]:
params = space_eval(space_lgb, best)
params


{'boosting_type': 'gbdt',
 'colsample_bytree': 0.875,
 'learning_rate': 0.047,
 'max_depth': 8,
 'min_child_weight': 8.725,
 'n_estimators': 635,
 'num_leaves': 2106,
 'objective': 'binary'}

In [47]:
LGB_Classifier = LGBMClassifier(**params)
LGB_Classifier.fit(train, target)
acc_LGB_Classifier = round(LGB_Classifier.score(train, target) * 100, 2)
acc_LGB_Classifier

88.02

In [48]:
acc_test_LGB_Classifier = round(LGB_Classifier.score(test, target_test) * 100, 2)
acc_test_LGB_Classifier

87.1

In [49]:
fig =  plt.figure(figsize = (15,15))
axes = fig.add_subplot(111)
lgb.plot_importance(LGB_Classifier,ax = axes,height = 0.5)
plt.show();
plt.close()

GradientBoosting

In [50]:
def hyperopt_gb_score(params):
    clf = GradientBoostingClassifier(**params)
    current_score = cross_val_score(clf, train, target, cv=10).mean()
    print(current_score, params)
    return current_score 
 
space_gb = {
            'n_estimators': hp.choice('n_estimators', range(100, 1000)),
            'max_depth': hp.choice('max_depth', np.arange(2, 10, dtype=int))            
        }
 
best = fmin(fn=hyperopt_gb_score, space=space_gb, algo=tpe.suggest, max_evals=10)
print('best:')
print(best)

0.8337662337662337
{'max_depth': 2, 'n_estimators': 291}
0.8339826839826839
{'max_depth': 4, 'n_estimators': 937}
0.824025974025974
{'max_depth': 6, 'n_estimators': 643}
0.82012987012987
{'max_depth': 5, 'n_estimators': 190}
0.8525974025974026
{'max_depth': 7, 'n_estimators': 627}
0.8158008658008657
{'max_depth': 5, 'n_estimators': 467}
0.8255411255411256
{'max_depth': 8, 'n_estimators': 727}
0.8385281385281385
{'max_depth': 2, 'n_estimators': 638}
0.8478354978354978
{'max_depth': 7, 'n_estimators': 590}
0.8196969696969697
{'max_depth': 3, 'n_estimators': 605}
100%|██████████| 10/10 [00:32<00:00,  3.26s/it, best loss: 0.8158008658008657]
best:
{'max_depth': 3, 'n_estimators': 367}


In [51]:
params = space_eval(space_gb, best)
params

{'max_depth': 5, 'n_estimators': 467}

In [52]:
# Gradient Boosting Classifier

gradient_boosting = GradientBoostingClassifier(**params)
gradient_boosting.fit(train, target)
acc_gradient_boosting = round(gradient_boosting.score(train, target) * 100, 2)
acc_gradient_boosting

100.0

In [53]:
acc_test_gradient_boosting = round(gradient_boosting.score(test, target_test) * 100, 2)
acc_test_gradient_boosting

79.57

# Ridge Classifier

In [54]:
ridge_classifier = RidgeClassifier()
ridge_classifier.fit(train, target)
acc_ridge_classifier = round(ridge_classifier.score(train, target) * 100, 2)
acc_ridge_classifier

83.87

In [55]:
acc_test_ridge_classifier = round(ridge_classifier.score(test, target_test) * 100, 2)
acc_test_ridge_classifier

81.72

# Bagging Classifier

In [56]:
bagging_classifier = BaggingClassifier()
bagging_classifier.fit(train, target)
Y_pred = bagging_classifier.predict(test).astype(int)
acc_bagging_classifier = round(bagging_classifier.score(train, target) * 100, 2)
acc_bagging_classifier

99.54

In [57]:
acc_test_bagging_classifier = round(bagging_classifier.score(test, target_test) * 100, 2)
acc_test_bagging_classifier

82.8

ExtraTreesClassifier

In [58]:
def hyperopt_etc_score(params):
    clf = ExtraTreesClassifier(**params)
    current_score = cross_val_score(clf, train, target, cv=10).mean()
    print(current_score, params)
    return current_score 
 
space_etc = {
            'n_estimators': hp.choice('n_estimators', range(100, 1000)),
            'max_features': hp.choice('max_features', np.arange(2, 17, dtype=int)),
            'min_samples_leaf': hp.choice('min_samples_leaf', np.arange(1, 5, dtype=int)),
            'max_depth':  hp.choice('max_depth', np.arange(2, 12, dtype=int)),
            'max_features': None # for small number of features
        }
 
best = fmin(fn=hyperopt_etc_score, space=space_etc, algo=tpe.suggest, max_evals=10)
print('best:')
print(best)

0.8614718614718614
{'max_depth': 5, 'max_features': None, 'min_samples_leaf': 1, 'n_estimators': 152}
0.7874458874458875
{'max_depth': 2, 'max_features': None, 'min_samples_leaf': 3, 'n_estimators': 124}
0.791991341991342
{'max_depth': 2, 'max_features': None, 'min_samples_leaf': 1, 'n_estimators': 198}
0.8380952380952381
{'max_depth': 3, 'max_features': None, 'min_samples_leaf': 2, 'n_estimators': 623}
0.8567099567099566
{'max_depth': 6, 'max_features': None, 'min_samples_leaf': 4, 'n_estimators': 570}
0.8755411255411255
{'max_depth': 9, 'max_features': None, 'min_samples_leaf': 4, 'n_estimators': 612}
0.8244588744588744
{'max_depth': 3, 'max_features': None, 'min_samples_leaf': 1, 'n_estimators': 478}
0.866017316017316
{'max_depth': 7, 'max_features': None, 'min_samples_leaf': 4, 'n_estimators': 766}
0.8616883116883116
{'max_depth': 7, 'max_features': None, 'min_samples_leaf': 3, 'n_estimators': 481}
0.7779220779220778
{'max_depth': 2, 'max_features': None, 'min_samples_leaf': 1, 'n_

In [59]:
params = space_eval(space_etc, best)
params

{'max_depth': 2,
 'max_features': None,
 'min_samples_leaf': 1,
 'n_estimators': 371}

In [60]:
# Extra Trees Classifier

extra_trees_classifier = ExtraTreesClassifier(**params)
extra_trees_classifier.fit(train, target)
acc_etc = round(extra_trees_classifier.score(train, target) * 100, 2)
acc_etc

80.65

In [61]:
acc_test_etc = round(extra_trees_classifier.score(test, target_test) * 100, 2)
acc_test_etc

76.34

NN

In [62]:
def build_ann(optimizer='adam'):
    
    # Initializing the ANN
    ann = Sequential()
    
    # Adding the input layer and the first hidden layer of the ANN with dropout
    ann.add(Dense(units=32, kernel_initializer='glorot_uniform', activation='relu', input_shape=(len(train.columns),)))
    
    # Add other layers, it is not necessary to pass the shape because there is a layer before
    ann.add(Dense(units=64, kernel_initializer='glorot_uniform', activation='relu'))
    ann.add(Dropout(rate=0.5))
    ann.add(Dense(units=64, kernel_initializer='glorot_uniform', activation='relu'))
    ann.add(Dropout(rate=0.5))
    
    # Adding the output layer
    ann.add(Dense(units=1, kernel_initializer='glorot_uniform', activation='sigmoid'))
    
    # Compiling the ANN
    ann.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    
    return ann

In [63]:
opt = optimizers.Adam(lr=0.001)
ann = build_ann(opt)
# Training the ANN
history = ann.fit(Xtrain, Ztrain, batch_size=16, epochs=100, validation_data=(Xval, Zval))

  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [64]:
# Predicting the Train set results
ann_prediction = ann.predict(train)
ann_prediction = (ann_prediction > 0.5)*1 # convert probabilities to binary output

# Compute error between predicted data and true response and display it in confusion matrix
acc_ann1 = round(metrics.accuracy_score(target, ann_prediction) * 100, 2)
acc_ann1

81.57

In [65]:
# Predicting the Test set results
ann_prediction_test = ann.predict(test)
ann_prediction_test = (ann_prediction_test > 0.5)*1 # convert probabilities to binary output

# Compute error between predicted data and true response and display it in confusion matrix
acc_test_ann1 = round(metrics.accuracy_score(target_test, ann_prediction_test) * 100, 2)
acc_test_ann1

83.87

 Neural Network 2 

In [66]:
# Model
model = Sequential()
model.add(Dense(16, input_dim = train.shape[1], activation = 'relu'))
model.add(Dropout(0.3))
model.add(Dense(64, activation = 'relu'))
model.add(Dropout(0.3))
model.add(Dense(32, activation = 'relu'))
model.add(Dense(1, activation = 'sigmoid'))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 16)                112       
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 64)                1088      
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 33        
Total params: 3,313
Trainable params: 3,313
Non-trainable params: 0
____________________________________________________

In [67]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [68]:
es = EarlyStopping(monitor='val_accuracy', patience=20, mode='max')
hist = model.fit(train, target, batch_size=64, validation_data=(Xval, Zval), 
               epochs=500, verbose=1, callbacks=[es])

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500


In [69]:
plt.plot(hist.history['accuracy'], label='acc')
plt.plot(hist.history['val_accuracy'], label='val_acc')
# plt.plot(hist.history['acc'], label='acc')
# plt.plot(hist.history['val_acc'], label='val_acc')
plt.ylim((0, 1))
plt.legend()

<matplotlib.legend.Legend at 0x7fee49694350>

In [70]:
# Predicting the Train set results
nn_prediction = model.predict(train)
nn_prediction = (nn_prediction > 0.5)*1 # convert probabilities to binary output

# Compute error between predicted data and true response
acc_ann2 = round(metrics.accuracy_score(target, nn_prediction) * 100, 2)
acc_ann2

75.12

In [71]:
# Predicting the Test set results
nn_prediction_test = model.predict(test)
nn_prediction_test = (nn_prediction_test > 0.5)*1 # convert probabilities to binary output

# Compute error between predicted data and true response
acc_test_ann2 = round(metrics.accuracy_score(target_test, nn_prediction_test) * 100, 2)
acc_test_ann2

75.27

Voting_Classifier(hars_Voting)

In [72]:
Voting_Classifier_hard = VotingClassifier(estimators=[('lr', logreg), ('rf', random_forest), ('gbc', gradient_boosting)], voting='hard')
for clf, label in zip([logreg, random_forest, gradient_boosting, Voting_Classifier_hard], 
                      ['Logistic Regression', 'Random Forest', 'Gradient Boosting Classifier', 'Ensemble']):
    scores = cross_val_score(clf, train, target, cv=10, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.85 (+/- 0.09) [Logistic Regression]
Accuracy: 0.86 (+/- 0.09) [Random Forest]
Accuracy: 0.82 (+/- 0.11) [Gradient Boosting Classifier]
Accuracy: 0.86 (+/- 0.10) [Ensemble]


In [73]:
Voting_Classifier_hard.fit(train, target)
acc_VC_hard = round(Voting_Classifier_hard.score(train, target) * 100, 2)
acc_VC_hard

100.0

In [74]:
acc_test_VC_hard = round(Voting_Classifier_hard.score(test, target_test) * 100, 2)
acc_test_VC_hard

86.02

VotingClassifier (soft voting) 

In [75]:
eclf = VotingClassifier(estimators=[('lr', logreg), ('rf', random_forest), ('gbc', gradient_boosting)], voting='soft')
params = {'lr__C': [1.0, 100.0], 'gbc__learning_rate': [0.05, 1]}
Voting_Classifier_soft = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
Voting_Classifier_soft.fit(train, target)
acc_VC_soft = round(Voting_Classifier_soft.score(train, target) * 100, 2)
acc_VC_soft

100.0

In [76]:
acc_test_VC_soft = round(Voting_Classifier_soft.score(test, target_test) * 100, 2)
acc_test_VC_soft

82.8

AdaBoost Classifier 

In [77]:
def hyperopt_ab_score(params):
    clf = AdaBoostClassifier(**params)
    current_score = cross_val_score(clf, train, target, cv=10).mean()
    print(current_score, params)
    return current_score 
 
space_ab = {
            'n_estimators': hp.choice('n_estimators', range(50, 1000)),
            'learning_rate': hp.quniform('learning_rate', 0, 0.05, 0.0001)       
        }
 
best = fmin(fn=hyperopt_ab_score, space=space_ab, algo=tpe.suggest, max_evals=10)
print('best:')
print(best)

0.8432900432900432
{'learning_rate': 0.0439, 'n_estimators': 816}
0.8432900432900432
{'learning_rate': 0.0287, 'n_estimators': 960}
0.825108225108225
{'learning_rate': 0.0148, 'n_estimators': 581}
0.8296536796536795
{'learning_rate': 0.023, 'n_estimators': 259}
0.8203463203463203
{'learning_rate': 0.0097, 'n_estimators': 505}
0.8385281385281385
{'learning_rate': 0.031400000000000004, 'n_estimators': 935}
0.7833333333333334
{'learning_rate': 0.0166, 'n_estimators': 106}
0.8437229437229437
{'learning_rate': 0.045200000000000004, 'n_estimators': 289}
0.7831168831168831
{'learning_rate': 0.0038, 'n_estimators': 472}
0.7831168831168831
{'learning_rate': 0.0036000000000000003, 'n_estimators': 334}
100%|██████████| 10/10 [01:25<00:00,  8.53s/it, best loss: 0.7831168831168831]
best:
{'learning_rate': 0.0038, 'n_estimators': 422}


In [78]:
params = space_eval(space_ab, best)
params

{'learning_rate': 0.0038, 'n_estimators': 472}

In [79]:
# AdaBoost Classifier

Ada_Boost = AdaBoostClassifier(**params)
Ada_Boost.fit(train, target)
Ada_Boost.score(train, target)
acc_AdaBoost = round(Ada_Boost.score(train, target) * 100, 2)
acc_AdaBoost

85.25

In [80]:
acc_test_AdaBoost = round(Ada_Boost.score(test, target_test) * 100, 2)
acc_test_AdaBoost

82.8

 Models evaluation

In [81]:
models = pd.DataFrame({
    'Model': ['Logistic Regression', 'Support Vector Machines', 'Linear SVC', 'k-Nearest Neighbors', 'Naive Bayes', 
              'Perceptron', 'Stochastic Gradient Decent', 
              'Decision Tree Classifier', 'Random Forest',  'XGBClassifier', 'LGBMClassifier',
              'GradientBoostingClassifier', 'RidgeClassifier', 'BaggingClassifier', 'ExtraTreesClassifier', 
              'Neural Network 1', 'Neural Network 2', 
              'VotingClassifier-hard voiting', 'VotingClassifier-soft voting',
              'AdaBoostClassifier'],
    
    'Score_train': [acc_log, acc_svc, acc_linear_svc, acc_knn, acc_gaussian, 
              acc_perceptron, acc_sgd, 
              acc_decision_tree, acc_random_forest, acc_XGB_Classifier, acc_LGB_Classifier,
              acc_gradient_boosting, acc_ridge_classifier, acc_bagging_classifier, acc_etc, 
              acc_ann1, acc_ann2, 
              acc_VC_hard, acc_VC_soft,
              acc_AdaBoost],
    'Score_test': [acc_test_log, acc_test_svc, acc_test_linear_svc, acc_test_knn, acc_test_gaussian, 
              acc_test_perceptron, acc_test_sgd, 
              acc_test_decision_tree, acc_test_random_forest, acc_test_XGB_Classifier, acc_test_LGB_Classifier,
              acc_test_gradient_boosting, acc_test_ridge_classifier, acc_test_bagging_classifier, acc_test_etc, 
              acc_test_ann1, acc_test_ann2, 
              acc_test_VC_hard, acc_test_VC_soft,
              acc_test_AdaBoost]
                    })

In [82]:
models.sort_values(by=['Score_train', 'Score_test'], ascending=False)

Unnamed: 0,Model,Score_train,Score_test
17,VotingClassifier-hard voiting,100.0,86.02
8,Random Forest,100.0,83.87
18,VotingClassifier-soft voting,100.0,82.8
7,Decision Tree Classifier,100.0,80.65
11,GradientBoostingClassifier,100.0,79.57
13,BaggingClassifier,99.54,82.8
3,k-Nearest Neighbors,90.32,86.02
10,LGBMClassifier,88.02,87.1
19,AdaBoostClassifier,85.25,82.8
0,Logistic Regression,84.79,87.1


In [83]:
models.sort_values(by=['Score_test', 'Score_train'], ascending=False)

Unnamed: 0,Model,Score_train,Score_test
10,LGBMClassifier,88.02,87.1
0,Logistic Regression,84.79,87.1
9,XGBClassifier,84.33,87.1
17,VotingClassifier-hard voiting,100.0,86.02
3,k-Nearest Neighbors,90.32,86.02
2,Linear SVC,84.79,86.02
1,Support Vector Machines,84.33,84.95
5,Perceptron,82.03,84.95
6,Stochastic Gradient Decent,77.88,84.95
8,Random Forest,100.0,83.87


In [84]:
models['Score_diff'] = abs(models['Score_train'] - models['Score_test'])
models.sort_values(by=['Score_diff'], ascending=True)

Unnamed: 0,Model,Score_train,Score_test,Score_diff
16,Neural Network 2,75.12,75.27,0.15
1,Support Vector Machines,84.33,84.95,0.62
10,LGBMClassifier,88.02,87.1,0.92
2,Linear SVC,84.79,86.02,1.23
12,RidgeClassifier,83.87,81.72,2.15
15,Neural Network 1,81.57,83.87,2.3
0,Logistic Regression,84.79,87.1,2.31
19,AdaBoostClassifier,85.25,82.8,2.45
9,XGBClassifier,84.33,87.1,2.77
5,Perceptron,82.03,84.95,2.92


In [86]:
# Plot
import matplotlib.pyplot as plt
plt.figure(figsize=[25,6])
xx = models['Model']
plt.tick_params(labelsize=14)
plt.plot(xx, models['Score_train'], label = 'Score_train')
plt.plot(xx, models['Score_test'], label = 'Score_test')
plt.legend()
plt.title('Score of 20 popular models for train and test datasets')
plt.xlabel('Models')
plt.ylabel('Score, %')
plt.xticks(xx, rotation='vertical')
plt.savefig('graph.png')
plt.show()