## This project analyzes the Heart Failure Prediction Dataset using XGBoost, Logistic Regression and Random Forests, and then compares the results obtained. These methods were used because the output should be binary (0/1). Link to the dataset: https://www.kaggle.com/fedesoriano/heart-failure-prediction.
### Finished on 20/01/2022.

### First, we need to read the comma-separated file using pandas. The custom short headers below are left commented out for reference only; the file's own header row is used instead.

In [4]:
import pandas as pd
# headers = ["age", "sex", "cpa", "rbp", "chol",
#            "fbs", "recg", "maxhr", "EA",
#            "oldpeak", "ST_slope", "HD"]
# # cpa = ChestPainType
# # rbp = resting blood pressure
# # chol = cholesterol
# # fbs = fasting blood sugar
# # recg = resting ECG
# # EA = exercise angina
# # HD = heart disease

In [5]:
df = pd.read_csv("../input/heart-failure-prediction/heart.csv", header=0)
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


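### For some reason, the previous headers had been read in as an extra row of data, which had to be removed. With `header=0` in `read_csv` above, the frame below starts directly with the data rows.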

In [6]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


## Handling missing values

In [7]:
# Names of every column that contains at least one missing (NaN) value.
cols_with_missing = df.columns[df.isnull().any()].tolist()

In [8]:
cols_with_missing

[]

In [9]:
df.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

## Handling categoricals

In [10]:
# Show each column's name next to its dtype.
for column_name, column_dtype in df.dtypes.items():
    print(column_name, '\t\t', column_dtype)

Age 		 int64
Sex 		 object
ChestPainType 		 object
RestingBP 		 int64
Cholesterol 		 int64
FastingBS 		 int64
RestingECG 		 object
MaxHR 		 int64
ExerciseAngina 		 object
Oldpeak 		 float64
ST_Slope 		 object
HeartDisease 		 int64


In [11]:
# Boolean mask over the columns: True where the column has dtype 'object'.
s = df.dtypes == 'object'
# Keep only the labels of the flagged columns, as a plain list.
object_cols = s.index[s].tolist()

In [12]:
object_cols

['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

In [13]:
# Print the frequency table of every object-typed column, separated by blank lines.
for column_name in object_cols:
    print(df[column_name].value_counts(), '\n\n')

M    725
F    193
Name: Sex, dtype: int64 


ASY    496
NAP    203
ATA    173
TA      46
Name: ChestPainType, dtype: int64 


Normal    552
LVH       188
ST        178
Name: RestingECG, dtype: int64 


N    547
Y    371
Name: ExerciseAngina, dtype: int64 


Flat    460
Up      395
Down     63
Name: ST_Slope, dtype: int64 




# Train/val/test split: 60/20/20

In [14]:
# Shuffle all rows (frac=1 keeps every row); the fixed random_state makes the
# resulting order reproducible.
df = df.sample(frac=1, random_state=42)

In [15]:
import numpy as np

In [16]:
# Split the shuffled rows 60/20/20 by position.  Plain ``iloc`` slicing is
# used instead of ``np.split``: handing a DataFrame to ``np.split`` goes
# through a code path that newer pandas versions deprecate, while slicing
# yields the same three frames.
# NOTE: ``df`` has already been shuffled once before this cell; the shuffle
# here is kept so that the rows land in the same splits as before.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(.6 * len(df))
val_end = int(.8 * len(df))
train = shuffled.iloc[:train_end]
val = shuffled.iloc[train_end:val_end]
test = shuffled.iloc[val_end:]

In [17]:
# Report the (rows, columns) shape of each split, one per line.
print(*(part.shape for part in (train, val, test)), sep=' \n ')

(550, 12) 
 (184, 12) 
 (184, 12)


### One-hot encoding categoricals

In [18]:
# One-hot encode each split on its own, then force the validation and test
# frames onto the training columns (same set, same order).  A dummy column
# that a split lacks -- a category that never occurs in it -- is filled with 0
# instead of NaN, so every split stays a complete numeric feature matrix.
# Columns that exist only in val/test are dropped, as they were before.
train = pd.get_dummies(train)
val = pd.get_dummies(val).reindex(columns=train.columns, fill_value=0)
test = pd.get_dummies(test).reindex(columns=train.columns, fill_value=0)

In [19]:
train

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_F,Sex_M,ChestPainType_ASY,...,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
183,46,110,238,0,140,1.0,0,0,1,1,...,0,0,0,0,1,0,1,0,1,0
72,52,120,182,0,150,0.0,1,0,1,1,...,0,0,0,1,0,1,0,0,1,0
512,35,123,161,0,153,-0.1,0,0,1,0,...,1,0,0,0,1,1,0,0,0,1
770,46,105,204,0,172,0.0,0,1,0,0,...,0,0,0,1,0,1,0,0,0,1
187,41,120,237,1,138,1.0,1,0,1,1,...,0,0,0,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
690,45,142,309,0,147,0.0,1,0,1,1,...,0,0,1,0,0,0,1,0,1,0
489,54,136,220,0,140,3.0,1,0,1,1,...,0,0,0,1,0,0,1,0,1,0
33,41,130,172,0,130,2.0,1,0,1,1,...,0,0,0,0,1,1,0,0,1,0
90,49,110,208,0,160,0.0,0,1,0,0,...,0,0,0,1,0,1,0,0,0,1


In [20]:
train.dtypes

Age                    int64
RestingBP              int64
Cholesterol            int64
FastingBS              int64
MaxHR                  int64
Oldpeak              float64
HeartDisease           int64
Sex_F                  uint8
Sex_M                  uint8
ChestPainType_ASY      uint8
ChestPainType_ATA      uint8
ChestPainType_NAP      uint8
ChestPainType_TA       uint8
RestingECG_LVH         uint8
RestingECG_Normal      uint8
RestingECG_ST          uint8
ExerciseAngina_N       uint8
ExerciseAngina_Y       uint8
ST_Slope_Down          uint8
ST_Slope_Flat          uint8
ST_Slope_Up            uint8
dtype: object

## X/y split

In [21]:
# Target vectors: the 'HeartDisease' column of each split.
y_train, y_val, y_test = (part['HeartDisease'] for part in (train, val, test))

# Feature matrices: everything except the target column.
X_train, X_val, X_test = (
    part.drop(columns='HeartDisease') for part in (train, val, test)
)

In [22]:
X_train

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
183,46,110,238,0,140,1.0,0,1,1,0,0,0,0,0,1,0,1,0,1,0
72,52,120,182,0,150,0.0,0,1,1,0,0,0,0,1,0,1,0,0,1,0
512,35,123,161,0,153,-0.1,0,1,0,0,1,0,0,0,1,1,0,0,0,1
770,46,105,204,0,172,0.0,1,0,0,1,0,0,0,1,0,1,0,0,0,1
187,41,120,237,1,138,1.0,0,1,1,0,0,0,0,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
690,45,142,309,0,147,0.0,0,1,1,0,0,0,1,0,0,0,1,0,1,0
489,54,136,220,0,140,3.0,0,1,1,0,0,0,0,1,0,0,1,0,1,0
33,41,130,172,0,130,2.0,0,1,1,0,0,0,0,0,1,1,0,0,1,0
90,49,110,208,0,160,0.0,1,0,0,1,0,0,0,1,0,1,0,0,0,1


# XGBoost

In [23]:
from xgboost import XGBClassifier

In [24]:
model_XGB = XGBClassifier(n_estimators = 250, learning_rate = 0.1, n_jobs = 4, use_label_encoder = False)

In [25]:
# Fit on the training split.  The validation split is passed as eval_set, so
# its log loss is printed after every boosting round (verbose=True) and is the
# quantity that early stopping watches.
# NOTE(review): passing early_stopping_rounds to fit() depends on the XGBoost
# version in use -- confirm before upgrading the library.
model_XGB.fit(X_train, y_train,
               early_stopping_rounds = 5,
               eval_set = [(X_val, y_val)],
               verbose = True)

[0]	validation_0-logloss:0.64178
[1]	validation_0-logloss:0.60345
[2]	validation_0-logloss:0.57079
[3]	validation_0-logloss:0.54283
[4]	validation_0-logloss:0.51953
[5]	validation_0-logloss:0.50146
[6]	validation_0-logloss:0.48068
[7]	validation_0-logloss:0.46684
[8]	validation_0-logloss:0.45421
[9]	validation_0-logloss:0.44337
[10]	validation_0-logloss:0.43292
[11]	validation_0-logloss:0.42267
[12]	validation_0-logloss:0.41591
[13]	validation_0-logloss:0.41164
[14]	validation_0-logloss:0.40538
[15]	validation_0-logloss:0.40453
[16]	validation_0-logloss:0.39983
[17]	validation_0-logloss:0.39674
[18]	validation_0-logloss:0.39510
[19]	validation_0-logloss:0.39381
[20]	validation_0-logloss:0.39214
[21]	validation_0-logloss:0.39149
[22]	validation_0-logloss:0.39166
[23]	validation_0-logloss:0.39073
[24]	validation_0-logloss:0.39024
[25]	validation_0-logloss:0.38872
[26]	validation_0-logloss:0.38874
[27]	validation_0-logloss:0.38842
[28]	validation_0-logloss:0.38566
[29]	validation_0-loglos

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=250, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [26]:
predictions_XGB = model_XGB.predict(X_test)

In [27]:
predictions_XGB

array([0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 1])

In [28]:
from sklearn.metrics import mean_squared_error as mse

In [29]:
# With 0/1 labels and 0/1 predictions every squared error is either 0 or 1, so
# this MSE is the fraction of misclassified test samples (1 - accuracy).
error_XGB = mse(y_test, predictions_XGB)

In [30]:
print('XGB MSE : ', error_XGB)

XGB MSE :  0.125


In [31]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [32]:
print(classification_report(y_test, predictions_XGB))

              precision    recall  f1-score   support

           0       0.86      0.85      0.86        81
           1       0.88      0.89      0.89       103

    accuracy                           0.88       184
   macro avg       0.87      0.87      0.87       184
weighted avg       0.87      0.88      0.87       184



In [33]:
confusion_matrix(y_test, predictions_XGB)

array([[69, 12],
       [11, 92]])

In [34]:
# Test accuracy = correctly classified samples (the confusion-matrix diagonal)
# divided by all samples.  The diagonal is read from the matrix itself rather
# than typed in by hand, so the figure stays correct if the model or data change.
cm_XGB_test = confusion_matrix(y_test, predictions_XGB)
cm_XGB_test.trace() / cm_XGB_test.sum()

0.875

In [35]:
1-error_XGB

0.875

In [36]:
from sklearn.model_selection import cross_val_score

In [37]:
# Stack the three splits back together row-wise (features and target
# separately) so cross-validation can run on the whole data set.
X = pd.concat((X_train, X_val, X_test))
y = pd.concat((y_train, y_val, y_test))

In [40]:
import sklearn

In [41]:
# List, alphabetically, the scorer names registered in scikit-learn.
# NOTE(review): `SCORERS` is not available in recent scikit-learn releases,
# where `sklearn.metrics.get_scorer_names()` serves this purpose -- confirm
# against the installed version.
sorted(sklearn.metrics.SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_weighted',
 'v_measure_score']

In [91]:
# 5-fold cross-validation of the XGBoost model on the full data set (X, y).
# The 'neg_log_loss' scorer returns the negated log loss, so it is multiplied
# by -1 to obtain the ordinary positive log loss per fold.
scores_XGB = -1 * cross_val_score(model_XGB, X, y,
                             cv=5,
                             scoring='neg_log_loss')



In [98]:
scores_XGB

array([0.39354286, 0.50860647, 0.36087099, 0.41417193, 0.31555153])

In [99]:
scores_XGB.mean()

0.39854875560235287

In [102]:
train_predictions_XGB = model_XGB.predict(X_train)

In [104]:
confusion_matrix(y_train, train_predictions_XGB)

array([[236,  13],
       [  5, 296]])

In [106]:
# Training accuracy = confusion-matrix diagonal divided by all samples.  The
# diagonal is read from the matrix itself rather than typed in by hand, so the
# figure stays correct if the model or data change.
cm_XGB_train = confusion_matrix(y_train, train_predictions_XGB)
cm_XGB_train.trace() / cm_XGB_train.sum()

0.9672727272727273

In [67]:
# XGB Classifier turned out to work better than XGB Regressor (87.5 vs 84.2)

# Logistic Regression

### Here, specific parameters (`solver="lbfgs"`, `max_iter=1000`) were set to avoid an error that kept showing up. An alternative solution would be to scale the inputs so that they have much smaller values, but setting a large maximum number of iterations worked as well.

In [48]:
from sklearn.linear_model import LogisticRegression

In [57]:
model_LR = LogisticRegression(solver="lbfgs", max_iter=1000)

### Training the model on the combined train and validation sets.

In [58]:
# Merge the train and validation splits row-wise; the test split is left out.
X_log = pd.concat((X_train, X_val))
y_log = pd.concat((y_train, y_val))

In [59]:
model_LR.fit(X_log, y_log)

LogisticRegression(max_iter=1000)

In [60]:
predictions_LR = model_LR.predict(X_test)

In [61]:
print(classification_report(y_test, predictions_LR))

              precision    recall  f1-score   support

           0       0.86      0.83      0.84        81
           1       0.87      0.89      0.88       103

    accuracy                           0.86       184
   macro avg       0.86      0.86      0.86       184
weighted avg       0.86      0.86      0.86       184



In [62]:
confusion_matrix(y_test, predictions_LR)

array([[67, 14],
       [11, 92]])

In [55]:
# Test accuracy of the logistic regression = confusion-matrix diagonal divided
# by all samples.  The diagonal is read from the matrix itself rather than
# typed in by hand, so the figure stays correct if the model or data change.
cm_LR_test = confusion_matrix(y_test, predictions_LR)
cm_LR_test.trace() / cm_LR_test.sum()

0.8641304347826086

In [112]:
# 5-fold cross-validation of the logistic regression on the full data set
# (X, y).  The 'neg_log_loss' scorer returns the negated log loss, so it is
# multiplied by -1 to obtain the ordinary positive log loss per fold.
scores_LR = -1 * cross_val_score(model_LR, X, y,
                             cv=5,
                             scoring='neg_log_loss')

In [113]:
scores_LR

array([0.33014229, 0.39364662, 0.30977191, 0.32825496, 0.33166144])

In [114]:
scores_LR.mean()

0.3386954428886635

In [107]:
train_predictions_LR = model_LR.predict(X_log)

In [108]:
confusion_matrix(y_log, train_predictions_LR)

array([[275,  54],
       [ 45, 360]])

In [109]:
# Accuracy of the logistic regression on its own training data (train + val)
# = confusion-matrix diagonal divided by all samples.  The diagonal is read
# from the matrix itself rather than typed in by hand, so the figure stays
# correct if the model or data change.
cm_LR_train = confusion_matrix(y_log, train_predictions_LR)
cm_LR_train.trace() / cm_LR_train.sum()

0.8651226158038147

In [None]:
def sigmoid(z):
    """Return the logistic sigmoid 1 / (1 + exp(-z)) of *z*.

    The value is computed as ``exp(-log(1 + exp(-z)))`` with
    ``np.logaddexp``, which is mathematically the same expression but does
    not overflow for large negative ``z`` (where ``exp(-z)`` itself would).

    Parameters
    ----------
    z : scalar or array-like supporting unary minus
        Input value(s); the function is applied element-wise.

    Returns
    -------
    Same shape as ``z``, with values in [0, 1].
    """
    # logaddexp(0, -z) == log(exp(0) + exp(-z)) == log(1 + exp(-z)),
    # evaluated without ever forming the possibly huge exp(-z).
    return np.exp(-np.logaddexp(0.0, -z))

In [None]:
def cross_entropy_loss(yHat, y, eps=1e-15):
    """Return the binary cross-entropy (log loss) of predictions against labels.

    For a label equal to 1 the loss is ``-log(yHat)``; for any other label it
    is ``-log(1 - yHat)``.  Works element-wise, so scalars and arrays (or
    anything ``np.asarray`` accepts) can be passed.

    Parameters
    ----------
    yHat : scalar or array-like of float
        Predicted probability of the positive class.
    y : scalar or array-like
        True labels; only the value 1 is treated as the positive class.
    eps : float, optional
        Probabilities are clipped to ``[eps, 1 - eps]`` before the logarithm
        so that a prediction of exactly 0 or 1 gives a large finite loss
        instead of ``inf``.

    Returns
    -------
    NumPy scalar for scalar inputs, otherwise an array of per-sample losses.
    """
    labels = np.asarray(y)
    # Keep probabilities strictly inside (0, 1) so np.log never receives 0.
    probs = np.clip(np.asarray(yHat, dtype=float), eps, 1.0 - eps)
    losses = np.where(labels == 1, -np.log(probs), -np.log(1.0 - probs))
    # Scalar inputs yield a 0-d array; unwrap it into a plain NumPy scalar.
    return losses[()] if losses.ndim == 0 else losses

In [None]:
# Positive-class cross-entropy term for every test sample.  The logarithm is
# taken of the predicted probability of class 1 rather than of the hard 0/1
# labels in predictions_LR: log(0) is -inf, which would turn the product into
# inf or nan wherever the predicted label is 0.
-(y_test * np.log(model_LR.predict_proba(X_test)[:, 1]))

# Random Forest

In [120]:
from sklearn.ensemble import RandomForestRegressor

In [121]:
model_RF = RandomForestRegressor(random_state=1)

### Training the Random Forest on the same combined train and validation data (`X_log`, `y_log`) used for Logistic Regression.

In [122]:
model_RF.fit(X_log, y_log)

RandomForestRegressor(random_state=1)

In [123]:
predictions_RF = model_RF.predict(X_test)

In [124]:
predictions_RF

array([0.24, 0.34, 0.73, 0.58, 0.65, 0.  , 0.81, 0.51, 0.53, 0.  , 0.96,
       0.63, 0.24, 0.58, 0.99, 0.77, 0.  , 0.57, 0.47, 0.  , 1.  , 0.63,
       0.9 , 0.  , 0.  , 0.27, 0.97, 0.  , 0.47, 0.17, 0.  , 0.48, 1.  ,
       0.  , 0.82, 0.88, 0.97, 0.12, 0.92, 0.78, 0.82, 0.05, 0.79, 0.97,
       0.7 , 0.55, 0.98, 0.98, 0.85, 0.  , 0.98, 0.66, 0.09, 1.  , 0.06,
       0.83, 0.21, 0.  , 0.98, 0.01, 0.  , 0.  , 0.  , 0.08, 0.  , 1.  ,
       0.83, 0.35, 0.97, 0.  , 0.72, 0.99, 0.  , 0.  , 0.  , 1.  , 0.89,
       0.96, 0.57, 1.  , 0.93, 0.98, 0.42, 1.  , 1.  , 0.1 , 0.96, 0.  ,
       0.03, 0.33, 1.  , 0.06, 0.04, 0.85, 0.09, 0.82, 1.  , 0.79, 0.8 ,
       0.32, 0.85, 0.83, 0.07, 0.88, 0.99, 0.99, 0.64, 0.86, 0.94, 0.99,
       1.  , 0.06, 0.07, 0.97, 0.84, 0.7 , 0.66, 0.85, 0.96, 0.35, 0.02,
       0.05, 0.12, 0.06, 0.98, 0.74, 0.  , 0.1 , 0.95, 0.93, 0.89, 0.  ,
       0.84, 0.27, 0.  , 1.  , 0.  , 0.49, 1.  , 0.  , 0.67, 0.  , 0.  ,
       0.98, 0.58, 1.  , 0.32, 0.98, 0.42, 0.94, 0.

In [125]:
# Turn the regressor's continuous outputs into hard class labels:
# 1 where the score is at least 0.5, otherwise 0.
predictions_RF = (predictions_RF >= 0.5).astype(int)

In [126]:
predictions_RF

array([0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 1, 1])

In [127]:
print(classification_report(y_test, predictions_RF))

              precision    recall  f1-score   support

           0       0.86      0.80      0.83        81
           1       0.85      0.89      0.87       103

    accuracy                           0.85       184
   macro avg       0.85      0.85      0.85       184
weighted avg       0.85      0.85      0.85       184



In [128]:
confusion_matrix(y_test, predictions_RF)

array([[65, 16],
       [11, 92]])

In [129]:
# Test accuracy of the random forest = confusion-matrix diagonal divided by
# all samples.  The diagonal is read from the matrix itself rather than typed
# in by hand, so the figure stays correct if the model or data change.
cm_RF_test = confusion_matrix(y_test, predictions_RF)
cm_RF_test.trace() / cm_RF_test.sum()

0.8532608695652174

In [130]:
# 5-fold cross-validation of the random forest regressor on the full data set
# (X, y).  The 'neg_mean_squared_error' scorer returns the negated MSE, so it
# is multiplied by -1 to obtain the positive MSE per fold.
# NOTE: this is a mean squared error, whereas scores_XGB and scores_LR are log
# losses -- the numbers are on different scales.
scores_RF = -1 * cross_val_score(model_RF, X, y,
                             cv=5,
                             scoring='neg_mean_squared_error')

In [131]:
scores_RF

array([0.10215326, 0.12923967, 0.10190652, 0.11107432, 0.10288962])

In [132]:
scores_RF.mean()

0.10945267818959373

In [133]:
train_predictions_RF = model_RF.predict(X_log)

In [134]:
# Turn the regressor's continuous outputs on the training data into hard
# class labels: 1 where the score is at least 0.5, otherwise 0.
train_predictions_RF = (train_predictions_RF >= 0.5).astype(int)

In [135]:
confusion_matrix(y_log, train_predictions_RF)

array([[329,   0],
       [  0, 405]])

In [136]:
# Accuracy of the random forest on its own training data (train + val)
# = confusion-matrix diagonal divided by all samples.  The diagonal is read
# from the matrix itself rather than typed in by hand, so the figure stays
# correct if the model or data change.
cm_RF_train = confusion_matrix(y_log, train_predictions_RF)
cm_RF_train.trace() / cm_RF_train.sum()

1.0