In [1]:
# importing requirements
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the heart-disease records from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='heart.csv')

In [3]:
# Preview the first five records as a quick sanity check of the load
df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


Dataset column details, copied from: https://archive.ics.uci.edu/ml/datasets/Heart+Disease
- age : patient's age in years
- sex : defines gender (1 = male; 0 = female)
- cp : chest pain type
- trestbps : resting blood pressure (in mm Hg on admission to the hospital)
- chol : serum cholesterol in mg/dl
- fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
- restecg : resting electrocardiographic results
- thalach : maximum heart rate achieved
- exang : exercise induced angina (1 = yes; 0 = no)
- oldpeak : ST depression induced by exercise relative to rest
- slope : the slope of the peak exercise ST segment
- ca : number of major vessels (0-3) colored by fluoroscopy
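- thal : 3 = normal; 6 = fixed defect; 7 = reversible defect (UCI coding; in this CSV the column takes the values 0-3, as the thal_0 ... thal_3 dummy columns below show)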
- target : has heart disease or not (1 = yes; 0 = no)

The cp, thal and slope columns are treated as categorical variables because they take a fixed and limited number of possible values. As a result, we need to convert them into dummy variables.

In [4]:
# One-hot encode each categorical column; using the column name as the prefix
# keeps the origin of every new indicator column visible (cp_0, thal_1, ...)
cp_dummy, thal_dummy, slope_dummy = (
    pd.get_dummies(df[column], prefix=column)
    for column in ('cp', 'thal', 'slope')
)

In [5]:
# The cp, thal and slope columns are replaced by their dummy variables:
# drop the original columns, then append the one-hot frames on the right.
df = pd.concat(
    [df.drop(columns=['cp', 'thal', 'slope']), cp_dummy, thal_dummy, slope_dummy],
    axis=1,
)
df.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,ca,...,cp_1,cp_2,cp_3,thal_0,thal_1,thal_2,thal_3,slope_0,slope_1,slope_2
0,63,1,145,233,1,0,150,0,2.3,0,...,0,0,1,0,1,0,0,1,0,0
1,37,1,130,250,0,1,187,0,3.5,0,...,0,1,0,0,0,1,0,1,0,0
2,41,0,130,204,0,0,172,0,1.4,0,...,1,0,0,0,0,1,0,0,0,1
3,56,1,120,236,0,1,178,0,0.8,0,...,1,0,0,0,0,1,0,0,0,1
4,57,0,120,354,0,1,163,1,0.6,0,...,0,0,0,0,0,1,0,0,0,1


In [6]:
# The 'target' column is stored in a separate variable so it can be used as the
# labels for the supervised learning models; every other column is a feature.
data_labels = df['target'].values
X = df.drop(columns=['target'])

In [7]:
# Rescale every feature with min-max scaling.
# NOTE(review): the scaler is fitted on all rows of X, i.e. before the
# train/test split, so the held-out rows influence the min/max used for scaling.

# the scaler object
scaler = MinMaxScaler()

# learn the per-column min/max, apply the scaling, and keep the column names
normalized_data = pd.DataFrame(
    data=scaler.fit(X).transform(X),
    columns=X.columns,
)


In [8]:
# Preview the first five rows of the scaled feature table
normalized_data.head(n=5)

Unnamed: 0,age,sex,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,ca,...,cp_1,cp_2,cp_3,thal_0,thal_1,thal_2,thal_3,slope_0,slope_1,slope_2
0,0.708333,1.0,0.481132,0.244292,1.0,0.0,0.603053,0.0,0.370968,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.166667,1.0,0.339623,0.283105,0.0,0.5,0.885496,0.0,0.564516,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.25,0.0,0.339623,0.178082,0.0,0.0,0.770992,0.0,0.225806,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.5625,1.0,0.245283,0.251142,0.0,0.5,0.816794,0.0,0.129032,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.583333,0.0,0.245283,0.520548,0.0,0.5,0.70229,1.0,0.096774,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [9]:
# Hold out 20% of the rows for testing; the remaining 80% are used for training.
# random_state fixes the shuffle so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    normalized_data,
    data_labels,
    test_size=0.2,
    random_state=0,
)

# Logistic Regression
In this part we use logistic regression to predict a discrete (binary) outcome, disease or no disease, from the independent variables, which here are a mix of numeric and categorical features.

In [10]:
# create the model and fit it in one step (fit returns the estimator itself)
logistic_regression_model = LogisticRegression().fit(x_train, y_train)
# mean accuracy on the held-out split, expressed as a percentage
accuracy = 100 * logistic_regression_model.score(x_test, y_test)

print("Logistic Regression Accuracy is {}".format(accuracy))

Logistic Regression Accuracy is 86.88524590163934


# KNN
In this algorithm, the process of choosing the optimal value for K is called "parameter tuning", and it is critical for achieving higher accuracy. Since there is no pre-defined statistical method for this process, we have to try and evaluate different values of K, either randomly or by iterating over a range.
Choosing a small value of K leads to unstable decision boundaries, while with a large value most records will be assigned to the most probable class.

In [11]:
# Candidate neighbourhood sizes; list_of_accuracy[i] holds the score for k_values[i]
k_values = range(1, 30)
list_of_accuracy = []  # accuracy for different values of K will be stored in a list

# Using iteration to find the right value for K
# NOTE(review): every K is scored on the test split, so the best score below is
# selected on the same data it is reported on and is therefore optimistic.
for k in k_values:
    KNN_classifier = KNeighborsClassifier(n_neighbors=k)
    KNN_classifier.fit(x_train, y_train)
    list_of_accuracy.append(KNN_classifier.score(x_test, y_test))

# selection of maximum accuracy (computed once) and converting it to percentage
best_score = max(list_of_accuracy)
accuracy = best_score * 100
# smallest K reaching the best score: list.index returns the first match, and
# list positions start at 0 while K starts at 1, hence the lookup in k_values
K = k_values[list_of_accuracy.index(best_score)]

# After the loop KNN_classifier holds the last K tried (29); refit it with the
# selected K so the variable matches the result reported below.
KNN_classifier = KNeighborsClassifier(n_neighbors=K).fit(x_train, y_train)

print("KNN optimal accuracy is {}, and K is {}".format(accuracy, K))

KNN optimal accuracy is 88.52459016393442, and K is 3


# SVM
Support Vector Machine (SVM) is a supervised learning algorithm used for classification or regression, although it is more commonly used for classification. In simple terms, this algorithm tries to separate the different classes in a plot by a line or lines.

In [12]:
# creating the classifier
SVM = SVC(random_state = 1) # random_state controls the pseudo random number generation for shuffling the data for probability estimates.
# fitting the model
SVM.fit(x_train, y_train)
# testing and calculating accuracy
svm_accuracy = SVM.score(x_test, y_test)*100

print("SVM Accuracy is {}".format(svm_accuracy))

SVM Accuracy is 88.52459016393442


# Random Forest
A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. 

In [13]:
# creating the classifier
random_forest = RandomForestClassifier(n_estimators = 1000)  # n_estimators is the number of trees in the forest
# fitting the model
random_forest.fit(x_train, y_train)
# testing and calculating accuracy
accuracy = random_forest.score(x_test, y_test)*100

print("Random Forest Accuracy is {}".format(accuracy))

Random Forest Accuracy is 91.80327868852459


# Decision Tree
In general, decision tree analysis is a predictive modelling tool that can be applied across many areas. Decision trees can be used for both classification and regression tasks.

In [14]:
# creating the tree
decision_tree = DecisionTreeClassifier()
# fitting the model
decision_tree.fit(x_train, y_train)
# testing and calculating accuracy
accuracy = decision_tree.score(x_test, y_test)*100

print("Decision Tree Accuracy is {}".format(accuracy))

Decision Tree Accuracy is 80.32786885245902
