# Importing Libraries

In [0]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, f1_score

# Loading Data

In [2]:
!unzip heart-disease-uci.zip

Archive:  heart-disease-uci.zip
  inflating: heart.csv               


In [0]:
DataSet = pd.read_csv("heart.csv")

In [5]:
DataSet.shape

(303, 14)

In [6]:
DataSet.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


refrence - https://www.kaggle.com/tentotheminus9/what-causes-heart-disease-explaining-the-model

The meaning of some of the column headers are not obvious. Here's what they mean,

- age: The person's age in years
- sex: The person's sex (1 = male, 0 = female)
- cp: The chest pain experienced (Value 1: typical angina, Value 2: atypical angina, Value 3: non-anginal pain, Value 4: asymptomatic)
- trestbps: The person's resting blood pressure (mm Hg on admission to the hospital)
- chol: The person's cholesterol measurement in mg/dl
- fbs: The person's fasting blood sugar (> 120 mg/dl, 1 = true; 0 = false)
- restecg: Resting electrocardiographic measurement (0 = normal, 1 = having ST-T wave abnormality, 2 = showing probable or definite left ventricular hypertrophy by Estes' criteria)
- thalach: The person's maximum heart rate achieved
exang: Exercise induced angina (1 = yes; 0 = no)
- oldpeak: ST depression induced by exercise relative to rest ('ST' relates to positions on the ECG plot. See more here)
- slope: the slope of the peak exercise ST segment (Value 1: upsloping, Value 2: flat, Value 3: downsloping)
- ca: The number of major vessels (0-3)
- thal: A blood disorder called thalassemia (3 = normal; 6 = fixed defect; 7 = reversable defect)
- target: Heart disease (0 = no, 1 = yes)

**The diagnosis of heart disease is done on a combination of clinical signs and test results.**

From the  information of heart disease risk factors led me to the following: 
- high cholesterol, high blood pressure, diabetes, weight, family history and smoking 
-  increasing age, male gender and heredity. 
- Note that thalassemia, one of the variables in this dataset, is heredity. 
- Major factors that can be modified are: Smoking, high cholesterol, high blood pressure, physical inactivity, and being overweight and having diabetes.
-  Other factors include stress, alcohol and poor diet/nutrition.

# Data Preprocessing

In [0]:
DataSet.columns = ['age', 'sex', 'chest_pain_type', 'resting_blood_pressure', 'cholesterol', 'fasting_blood_sugar', 'rest_ecg', 'max_heart_rate_achieved',
       'exercise_induced_angina', 'st_depression', 'st_slope', 'num_major_vessels', 'thalassemia', 'target']

In [8]:
DataSet['sex'][DataSet['sex'] == 0] = 'female'
DataSet['sex'][DataSet['sex'] == 1] = 'male'

DataSet['chest_pain_type'][DataSet['chest_pain_type'] == 1] = 'typical angina'
DataSet['chest_pain_type'][DataSet['chest_pain_type'] == 2] = 'atypical angina'
DataSet['chest_pain_type'][DataSet['chest_pain_type'] == 3] = 'non-anginalpain'
DataSet['chest_pain_type'][DataSet['chest_pain_type'] == 4] = 'asymptomatic'

DataSet['fasting_blood_sugar'][DataSet['fasting_blood_sugar'] == 0] = 'lower than 120mg/ml'
DataSet['fasting_blood_sugar'][DataSet['fasting_blood_sugar'] == 1] = 'greater than 120mg/ml'

DataSet['rest_ecg'][DataSet['rest_ecg'] == 0] = 'normal'
DataSet['rest_ecg'][DataSet['rest_ecg'] == 1] = 'ST-T wave abnormality'
DataSet['rest_ecg'][DataSet['rest_ecg'] == 2] = 'left ventricular hypertrophy'

DataSet['exercise_induced_angina'][DataSet['exercise_induced_angina'] == 0] = 'no'
DataSet['exercise_induced_angina'][DataSet['exercise_induced_angina'] == 1] = 'yes'

DataSet['st_slope'][DataSet['st_slope'] == 1] = 'upsloping'
DataSet['st_slope'][DataSet['st_slope'] == 2] = 'flat'
DataSet['st_slope'][DataSet['st_slope'] == 3] = 'downsloping'

DataSet['thalassemia'][DataSet['thalassemia'] == 1] = 'normal'
DataSet['thalassemia'][DataSet['thalassemia'] == 2] = 'fixed defect'
DataSet['thalassemia'][DataSet['thalassemia'] == 3] = 'reversable defect'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See 

In [0]:
DataSet['sex'] = DataSet['sex'].astype('object')
DataSet['chest_pain_type'] = DataSet['chest_pain_type'].astype('object')
DataSet['fasting_blood_sugar'] = DataSet['fasting_blood_sugar'].astype('object')
DataSet['rest_ecg'] = DataSet['rest_ecg'].astype('object')
DataSet['exercise_induced_angina'] = DataSet['exercise_induced_angina'].astype('object')
DataSet['st_slope'] = DataSet['st_slope'].astype('object')
DataSet['thalassemia'] = DataSet['thalassemia'].astype('object')

In [10]:
DataSet.target.value_counts()

1    165
0    138
Name: target, dtype: int64

In [11]:
DataSet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age                        303 non-null int64
sex                        303 non-null object
chest_pain_type            303 non-null object
resting_blood_pressure     303 non-null int64
cholesterol                303 non-null int64
fasting_blood_sugar        303 non-null object
rest_ecg                   303 non-null object
max_heart_rate_achieved    303 non-null int64
exercise_induced_angina    303 non-null object
st_depression              303 non-null float64
st_slope                   303 non-null object
num_major_vessels          303 non-null int64
thalassemia                303 non-null object
target                     303 non-null int64
dtypes: float64(1), int64(6), object(7)
memory usage: 33.2+ KB


In [0]:
DataSet = pd.get_dummies(DataSet, drop_first=True)

In [13]:
DataSet.shape

(303, 20)

In [0]:
DataSet_features = DataSet.drop(['target'], axis=1)
DataSet_Labels = DataSet['target']

In [0]:
X_train,X_test,Y_train,Y_test=train_test_split(DataSet_features, DataSet_Labels ,test_size=0.2,random_state=32, stratify=DataSet_Labels)

In [16]:
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(242, 19)
(242,)
(61, 19)
(61,)


# Base Models

In [0]:
lr=LogisticRegression(class_weight='balanced', tol=1e-10, verbose=2, penalty='l2')
dtc=DecisionTreeClassifier(max_depth=3)
rf=RandomForestClassifier(n_estimators=11, n_jobs=-1, random_state=0, verbose=2)

In [18]:
lr.fit(X_train,Y_train)
dtc.fit(X_train,Y_train)
rf.fit(X_train,Y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:    0.0s finished


[LibLinear]building tree 1 of 11
building tree 2 of 11
building tree 3 of 11
building tree 4 of 11
building tree 5 of 11building tree 6 of 11

building tree 7 of 11building tree 9 of 11

building tree 8 of 11
building tree 10 of 11
building tree 11 of 11


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=11, n_jobs=-1,
                       oob_score=False, random_state=0, verbose=2,
                       warm_start=False)

In [19]:
confusion_matrix(Y_test,lr.predict(X_test))

array([[23,  5],
       [ 4, 29]])

In [20]:
confusion_matrix(Y_test,dtc.predict(X_test))

array([[25,  3],
       [ 5, 28]])

In [21]:
confusion_matrix(Y_test,rf.predict(X_test))

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  11 out of  11 | elapsed:    0.0s finished


array([[24,  4],
       [ 5, 28]])

# Ensembel Learning

In [22]:
vc=VotingClassifier(estimators=[('dtc',dtc),('lr',lr),('rf',rf)],
                    voting='soft')
vc.fit(X_train,Y_train)

[LibLinear]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:    1.4s finished


VotingClassifier(estimators=[('dtc',
                              DecisionTreeClassifier(class_weight=None,
                                                     criterion='gini',
                                                     max_depth=3,
                                                     max_features=None,
                                                     max_leaf_nodes=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
                                                     presort=False,
                                                     random_state=None,
                                                     splitter='best')),
        

# Evaluation Metrics

In [23]:
print(classification_report(Y_test, vc.predict(X_test)))

              precision    recall  f1-score   support

           0       0.89      0.86      0.87        28
           1       0.88      0.91      0.90        33

    accuracy                           0.89        61
   macro avg       0.89      0.88      0.88        61
weighted avg       0.89      0.89      0.89        61



[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  11 out of  11 | elapsed:    0.0s finished


In [24]:
print(f1_score(Y_test, vc.predict(X_test)))

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  11 out of  11 | elapsed:    0.0s finished


0.8955223880597014
