In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Series, DataFrame
import sklearn.utils
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import imblearn
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, LeaveOneOut, RandomizedSearchCV
from sklearn.metrics import recall_score, classification_report, accuracy_score, precision_score, make_scorer, f1_score, confusion_matrix 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import  ColumnTransformer
%matplotlib inline

In [46]:
#loading dataset
dataset = pd.read_csv('https://raw.githubusercontent.com/gogzicole/stage-f-07-heart-failure/master/data/heart_failure_clinical_records_dataset.csv')
dataset.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [47]:

# selecting feature matrix and target variable
X = dataset.drop(columns = 'DEATH_EVENT')
y = dataset['DEATH_EVENT']
y.value_counts()

0    203
1     96
Name: DEATH_EVENT, dtype: int64

In [48]:

make_column_transformer = [2,4,6,7,8,11] #columns to apply min max scaling
#Create set of hyperparameters to search through randomly
n_estimators = [50, 100, 300, 500, 1000] 

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None] 

hyperparameter_grid = {'n_estimators': n_estimators,

                       'min_samples_leaf': min_samples_leaf,

                       'min_samples_split': min_samples_split,

                       'max_features': max_features}
#Integrate everything into a pipeline
pipe = Pipeline([('col_transform', ColumnTransformer(remainder = 'passthrough',
                transformers = [('scaler',MinMaxScaler(),make_column_transformer)])),
                ('random_search',RandomizedSearchCV(ExtraTreesClassifier(random_state = 1),
                hyperparameter_grid, scoring = 'f1', verbose = 1))])

In [49]:
step_1 = pipe.named_steps['col_transform'].fit_transform(X)

In [50]:
x_train, x_test,y_train, y_test = train_test_split(step_1, y, test_size = 0.2, random_state = 0)

In [51]:
pipe.fit(x_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   14.4s finished


Pipeline(steps=[('col_transform',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('scaler', MinMaxScaler(),
                                                  [2, 4, 6, 7, 8, 11])])),
                ('random_search',
                 RandomizedSearchCV(estimator=ExtraTreesClassifier(random_state=1),
                                    param_distributions={'max_features': ['auto',
                                                                          'sqrt',
                                                                          'log2',
                                                                          None],
                                                         'min_samples_leaf': [1,
                                                                              2,
                                                                              4,
                                                                   

In [52]:
model1 = pipe.named_steps['random_search'].best_estimator_
model1.fit(x_train, y_train)

ExtraTreesClassifier(max_features=None, min_samples_leaf=2, min_samples_split=7,
                     n_estimators=1000, random_state=1)

In [53]:

y_pred1 = model1.predict(x_test)

In [54]:
report1 = classification_report(y_test,y_pred1)
print('The classification report for the Dataset as is with Extra trees classifier:')
print(report1)

The classification report for the Dataset as is with Extra trees classifier:
              precision    recall  f1-score   support

           0       0.80      0.97      0.88        37
           1       0.93      0.61      0.74        23

    accuracy                           0.83        60
   macro avg       0.87      0.79      0.81        60
weighted avg       0.85      0.83      0.82        60



In [55]:
#upsample
import imblearn
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=1)
X_balanced, y_balanced = smote.fit_sample(X, y)
X_balanced = DataFrame(X_balanced, columns = X.columns)

In [56]:
#train
step_bal = pipe.named_steps['col_transform'].fit_transform(X_balanced)
x_train, x_test,y_train, y_test = train_test_split(step_bal, y_balanced, test_size = 0.2, random_state = 0)
pipe.fit(x_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   27.3s finished


Pipeline(steps=[('col_transform',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('scaler', MinMaxScaler(),
                                                  [2, 4, 6, 7, 8, 11])])),
                ('random_search',
                 RandomizedSearchCV(estimator=ExtraTreesClassifier(random_state=1),
                                    param_distributions={'max_features': ['auto',
                                                                          'sqrt',
                                                                          'log2',
                                                                          None],
                                                         'min_samples_leaf': [1,
                                                                              2,
                                                                              4,
                                                                   

In [57]:
#predict
model_bal = pipe.named_steps['random_search'].best_estimator_
model_bal.fit(x_train, y_train)
y_pred_bal = model_bal.predict(x_test)

In [58]:
#metrics
recall_bal = recall_score(y_test,y_pred_bal)
accuracy_bal = accuracy_score(y_test,y_pred_bal)
precision_bal = precision_score(y_test,y_pred_bal)
f1score_bal = f1_score(y_test,y_pred_bal)
print('the Recall for smoted Extra Trees with upsampling is:{}'.format(round(recall_bal,4)))
print(f'the Accuracy for smoted Extra Trees with upsampling is:{round(accuracy_bal,4)}')
print('the Precision for smoted Extra Trees with upsampling  is: %s' %(round(precision_bal,4)))
print(f'the F1_score for smoted Extra Trees with upsampling  is: {round(f1score_bal,4)}')

the Recall for smoted Extra Trees with upsampling is:0.9149
the Accuracy for smoted Extra Trees with upsampling is:0.878
the Precision for smoted Extra Trees with upsampling  is: 0.8776
the F1_score for smoted Extra Trees with upsampling  is: 0.8958


In [59]:
#Build new pipeline with XGB classifier
pipe_XG = Pipeline([('col_transform', ColumnTransformer(remainder = 'passthrough',
                transformers = [('scaler',MinMaxScaler(),make_column_transformer)])),
                ('random_search',RandomizedSearchCV(XGBClassifier(random_state = 1),
                hyperparameter_grid, scoring = 'f1', verbose = 1))])

In [60]:
#fit upsampled data to XGB pipeline
step_XG = pipe_XG.named_steps['col_transform'].fit_transform(X_balanced)
x_train, x_test,y_train, y_test = train_test_split(step_XG, y_balanced, test_size = 0.2, random_state = 0)
pipe_XG.fit(x_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    5.6s finished


Pipeline(steps=[('col_transform',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('scaler', MinMaxScaler(),
                                                  [2, 4, 6, 7, 8, 11])])),
                ('random_search',
                 RandomizedSearchCV(estimator=XGBClassifier(random_state=1),
                                    param_distributions={'max_features': ['auto',
                                                                          'sqrt',
                                                                          'log2',
                                                                          None],
                                                         'min_samples_leaf': [1,
                                                                              2,
                                                                              4,
                                                                          

In [61]:
#predict
model_XG = pipe_XG.named_steps['random_search'].best_estimator_
model_XG.fit(x_train, y_train)
y_pred_XG = model_XG.predict(x_test)

In [62]:
#evaluate
recall_XG = recall_score(y_test,y_pred_XG)
accuracy_XG = accuracy_score(y_test,y_pred_XG)
precision_XG = precision_score(y_test,y_pred_XG)
f1score_XG = f1_score(y_test,y_pred_XG)
print('the Recall for smoted XGB is:{}'.format(round(recall_XG,4)))
print(f'the Accuracy for smoted XGB is:{round(accuracy_XG,4)}')
print('the Precision for smoted XGB is: %s' %(round(precision_XG,4)))
print(f'the F1_score for smoted XGB is: {round(f1score_XG,4)}')

the Recall for smoted XGB is:0.9362
the Accuracy for smoted XGB is:0.8902
the Precision for smoted XGB is: 0.88
the F1_score for smoted XGB is: 0.9072
