In [3]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# Blanket filter: no category or module is specified, so every warning
# raised from here on is hidden (including library deprecation notices).
warnings.filterwarnings('ignore')

In [4]:
# Load the dataset from a path relative to the kernel's working directory.
# The recorded output of this cell is a FileNotFoundError, so the CSV has to
# sit next to the notebook (or the path be adjusted) before running.
hrt = pd.read_csv('heart.csv')
# Preview the first three rows.
hrt.head(3)

FileNotFoundError: [Errno 2] No such file or directory: 'heart.csv'

In [None]:
# (number of rows, number of columns) of the loaded frame.
hrt.shape

In [None]:
# Summary statistics for every column; include='all' covers non-numeric
# columns as well as numeric ones.
hrt.describe(include='all')

In [None]:
# Per-column count of missing entries
hrt.isnull().sum()

In [None]:
# Impute missing Age values with the column mean (NaNs are skipped when the
# mean is computed).
# The result is assigned back to the frame instead of calling
# fillna(..., inplace=True) on the hrt['Age'] selection: that form is chained
# assignment, which does not update the frame under pandas Copy-on-Write, and
# the corresponding warning is hidden by the notebook's global warnings filter.
hrt['Age'] = hrt['Age'].fillna(hrt['Age'].mean())

In [None]:
# Tally the categories of Sex so the most frequent one (the mode) can be
# identified for imputing its missing values
hrt.Sex.value_counts()

In [None]:
# Impute missing Sex values with the column's most frequent category.
# The mode is computed from the data rather than hard-coded, and the result is
# assigned back to the frame: fillna(..., inplace=True) on the hrt['Sex']
# selection is chained assignment, which does not update the frame under
# pandas Copy-on-Write.
hrt['Sex'] = hrt['Sex'].fillna(hrt['Sex'].mode()[0])

## Exploratory Data Analysis

In [None]:
# Number of patients in each Sex category
gender = hrt['Sex'].value_counts()
gender

In [None]:
# Bar chart of patient counts per sex.
# Tick labels are derived from the value_counts index so each bar is always
# named after the category it actually measures, whatever order the counts
# come back in. Codes without a display name fall back to the raw code.
# NOTE(review): 'M' is grounded in the imputation step; 'F' is assumed to be
# the other code used by the dataset - confirm.
sex_display = {'M': 'Male', 'F': 'Female'}
bar_labels = [sex_display.get(code, str(code)) for code in gender.index]
plt.bar(bar_labels, gender.values, color = 'Maroon')
plt.xlabel("Gender")
plt.ylabel('Count of person')
plt.show()

In [None]:
# Bar chart of how often each chest-pain category occurs
cp = hrt['ChestPainType'].value_counts()
fig, ax = plt.subplots()
ax.bar(cp.index, cp.values, color='Salmon')
ax.set_xlabel("Chest Pain Type")
ax.set_ylabel('Count of patients')
plt.show()

In [None]:
#Heart disease distribution
# Slice labels are looked up from the value_counts index instead of being
# listed in a fixed order, so a slice can never be paired with the wrong
# label if the class balance changes.
# NOTE(review): assumes HeartDisease is coded 1 = disease, 0 = no disease -
# confirm against the dataset description.
hd = hrt.HeartDisease.value_counts()
hd_display = {1: "With Heart disease", 0: "W/O heart disease"}
slice_labels = [hd_display.get(code, str(code)) for code in hd.index]
plt.pie(hd.values, labels=slice_labels, autopct = "%0.1f%%",
       explode=[0.01] * len(hd), colors=['SkyBlue','Salmon'])
plt.show()

In [None]:
# Histogram of the Cholesterol column over 15 bins
fig, ax = plt.subplots()
ax.hist(hrt.Cholesterol, bins=15, edgecolor='Skyblue')
plt.show()

In [None]:
# Box plot of Cholesterol to show its spread and any outlying values
chol_values = hrt.Cholesterol
plt.boxplot(chol_values)
plt.show()

In [None]:
# Histogram of the MaxHR (heart rate) column over 15 bins
fig, ax = plt.subplots()
ax.hist(hrt['MaxHR'], bins=15, color='Salmon', edgecolor='Pink')
plt.show()

In [None]:
# Age distribution within each chest-pain category
ax = sns.boxplot(data=hrt, x='ChestPainType', y='Age', hue='ChestPainType')
ax.set_xlabel('Type of chest pain')
ax.set_ylabel('Age')
plt.show()

In [None]:
# Scatter of Cholesterol (x axis) against RestingBP (y axis)
fig, ax = plt.subplots()
ax.scatter(hrt['Cholesterol'], hrt['RestingBP'], color='SkyBlue')
ax.set_xlabel('Cholesterol value')
ax.set_ylabel('BP at rest')
plt.show()

The graph shows that there is no conclusive evidence that resting BP changes with cholesterol.

In [None]:
# Scatter of Cholesterol (x axis) against MaxHR (y axis)
plt.scatter(x=hrt['Cholesterol'], y=hrt['MaxHR'], color='Green')
plt.xlabel('Cholesterol value')
plt.ylabel('Heart Rate')
plt.show()

In [None]:
# Record counts per HeartDisease value, split by chest-pain category
sns.set_style('darkgrid')
sns.countplot(data=hrt, x='HeartDisease', hue='ChestPainType')
plt.show()

In [None]:
# Record counts per HeartDisease value, split by ExerciseAngina
sns.set_style('darkgrid')
sns.countplot(data=hrt, x='HeartDisease', hue='ExerciseAngina')
plt.show()

In [None]:
# Average Cholesterol within each HeartDisease group
hrt.groupby(by='HeartDisease')['Cholesterol'].mean()

In [None]:
# Average RestingBP within each HeartDisease group
hrt.groupby(by='HeartDisease')['RestingBP'].mean()

In [None]:
# Replace each categorical column with integer codes.
# One encoder object is refit on every column in turn, so after the loop it
# only holds the class mapping of the last column processed.
encode = LabelEncoder()
for col in ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']:
    hrt[col] = encode.fit_transform(hrt[col])

In [None]:
# Split the frame into the feature matrix and the target vector.
# The target is removed by name rather than by position, so X can never
# contain HeartDisease (and never loses a real feature) if the column order
# of the CSV differs from what positional slicing assumed.
X = hrt.drop(columns='HeartDisease')
y = hrt['HeartDisease']

In [None]:
# Hold out 20% of the rows for testing.
# random_state pins the shuffle so every accuracy computed below is
# reproducible across kernel restarts; stratify=y keeps the class ratio of
# HeartDisease the same in the train and test portions.
x_tr, x_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

In [None]:
# Fit a logistic-regression baseline on the training split.
# The features are not scaled and the notebook silences all warnings, so a
# solver that stops at its default iteration cap would go unnoticed; a larger
# max_iter gives it room to converge.
model = LogisticRegression(max_iter=1000)
model.fit(x_tr, y_tr)

In [None]:
# Score the fitted model on the held-out rows and report accuracy in percent
y_pr = model.predict(x_te)
lr_accuracy_pct = accuracy_score(y_te, y_pr) * 100
print("The accuracy is:", lr_accuracy_pct)

In [None]:
# Start the per-classifier accuracy list with the score (in percent) of the
# predictions currently held in y_pr
accuracies = [accuracy_score(y_te, y_pr) * 100]

In [None]:
# Hyperparameter tuning using grid search CV
# Candidate values: fraction of features considered per split, and tree depth.
max_feat = [0.2,0.5,1.0]
max_dep = [2,3,4,5,6,7]
param_grid = {'max_features':max_feat, 'max_depth':max_dep}

# With max_features below 1.0 the tree samples features at random, so a fixed
# random_state is needed for the search to pick the same winner on every run.
model = DecisionTreeClassifier(random_state=42)
cls = GridSearchCV(estimator=model, param_grid=param_grid, cv = 5, n_jobs=-1, verbose=2)

cls.fit(x_tr, y_tr)

In [None]:
#Getting best parameter combination
# Display the parameter combination selected by the fitted grid search.
cls.best_params_

In [None]:
# Tuning Decision tree with best parameters
# The winning combination is read from the fitted grid search instead of
# being typed in by hand, so this cell stays correct whenever the search
# result changes. random_state keeps the fitted tree reproducible.
model = DecisionTreeClassifier(**cls.best_params_, random_state=42)
model.fit(x_tr, y_tr)
y_pr = model.predict(x_te)
acc = accuracy_score(y_te, y_pr)*100
accuracies.append(acc)

In [None]:
# Accuracy scores (in percent) collected so far, in the order they were appended.
accuracies

In [None]:
# K-NN: fit one classifier per neighbour count from 1 to 10 and record the
# test accuracy (in percent) of each
k_vals = list(range(1, 11))
accus = []
for k in k_vals:
    model = KNeighborsClassifier(n_neighbors=k).fit(x_tr, y_tr)
    y_pred = model.predict(x_te)
    accus.append(accuracy_score(y_te, y_pred) * 100)

In [None]:
# Test accuracy (in percent) for each K tried, in the same order as k_vals.
accus

In [None]:
# Refit K-NN with the neighbour count that scored highest in the sweep.
# The value is looked up from accus (first maximum wins on ties) rather than
# hard-coded, so it always agrees with the accuracies actually measured.
# NOTE(review): K is chosen on the test split, so this score is optimistic.
best_k = k_vals[accus.index(max(accus))]
model = KNeighborsClassifier(n_neighbors=best_k)
model.fit(x_tr, y_tr)
y_pred = model.predict(x_te)
accuracies.append(accuracy_score(y_te, y_pred)*100)

In [None]:
# Support vector classifier with a polynomial kernel; its test accuracy
# (in percent) is added to the running list, which is then displayed
model = SVC(kernel='poly').fit(x_tr, y_tr)
y_pred = model.predict(x_te)
acc = accuracy_score(y_te, y_pred) * 100
accuracies.append(acc)
accuracies

In [None]:
# Gaussian Naive Bayes; its test accuracy (in percent) is added to the
# running list, which is then displayed
model = GaussianNB().fit(x_tr, y_tr)
y_pred = model.predict(x_te)
nb_accuracy_pct = accuracy_score(y_te, y_pred) * 100
accuracies.append(nb_accuracy_pct)
accuracies

In [None]:
#Using Ensemble Random Forest model
# RandomForestClassifier is imported with the other dependencies in the
# notebook's import cell. random_state fixes the forest's internal
# randomness so the reported accuracy is reproducible.
model = RandomForestClassifier(random_state=42)
model.fit(x_tr, y_tr)
y_pred = model.predict(x_te)
accuracies.append(accuracy_score(y_te, y_pred)*100)
accuracies

In [None]:
# Bar chart comparing the test accuracy of every classifier tried above.
classifiers = ['Log. Reg.','Decision Tree','K-NN','SVM','Naive Bayes','Random Forest']

# accuracies is filled by append() calls spread over several cells, so
# re-running any of them (or running cells out of order) leaves it with the
# wrong number of entries. Fail with an actionable message instead of a
# shape-mismatch error from matplotlib.
if len(accuracies) != len(classifiers):
    raise ValueError(
        f"Expected {len(classifiers)} accuracy values (one per classifier) "
        f"but found {len(accuracies)}; restart the kernel and run all cells in order."
    )

fig = plt.figure(figsize=(8,4))
plt.bar(classifiers,accuracies, color = 'Lightblue',edgecolor = 'Black')
plt.xlabel("Classifiers used")
plt.ylabel("Accuracies obtained")
plt.title("Comparison of various ML classification Algorithms")
plt.show()