In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the heart-disease dataset. The path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv('Heart_Disease_Prediction.csv')

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Column labels of the loaded frame.
df.columns

In [None]:
# Data type pandas inferred for each column.
df.dtypes

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Preview the last rows of the raw data.
df.tail()

In [None]:
# Per column: True if the column contains at least one missing value.
df.isnull().any()

In [None]:
# Print the frame summary: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().T

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Histogram of every numeric column, one subplot per column.
# DataFrame.hist builds its own subplot grid, so only the figure size is
# passed. Handing it a single pre-made Axes for several columns makes pandas
# clear that figure and emit a UserWarning before drawing.
g = df.hist(figsize=(15, 15))
# Keep `ax` and `fig` bound, as this cell always did, for any later reuse.
ax = g.flat[0]
fig = ax.get_figure()

In [None]:
# Bar chart of how many rows fall into each 'Heart Disease' class.
g = sns.countplot(data=df, x='Heart Disease')
g.set_xlabel('Heart Disease')
g.set_ylabel('Count')

In [None]:
# Selecting correlated features using Heatmap

# Pairwise correlation of the numeric (and boolean) columns only.
# Calling df.corr() on the whole frame raises on pandas >= 2.0 as soon as one
# column holds non-numeric values, so restrict the input explicitly.
# NOTE(review): the target column may be stored as text in this CSV - confirm.
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()
top_corr_features = corr_matrix.index

# Plotting the heatmap. The matrix computed above is reused rather than
# recomputing the same correlations a second time.
plt.figure(figsize=(20,20))
sns.heatmap(data=corr_matrix, annot=True, cmap='viridis')
plt.show()

In [None]:
# One-hot encode the listed columns; every other column of df is carried
# over unchanged into `dataset`.
dataset = pd.get_dummies(
    df,
    columns=[
        'Sex',
        'Chest pain type',
        'FBS over 120',
        'EKG results',
        'Exercise angina',
        'Slope of ST',
        'Number of vessels fluro',
        'Thallium',
    ],
)

In [None]:
# Column labels after one-hot encoding.
dataset.columns

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the continuous columns (zero mean, unit variance); the one-hot
# columns are left as they are.
# NOTE(review): the scaler is fitted on every row before any cross-validation
# split, so fold statistics leak into training - consider fitting it inside a
# Pipeline instead.
columns_to_scale = ['Age', 'BP', 'Cholesterol', 'Max HR', 'ST depression']
standScaler = StandardScaler()
standScaler.fit(dataset[columns_to_scale])
dataset[columns_to_scale] = standScaler.transform(dataset[columns_to_scale])

In [None]:
# Preview the first rows after encoding and scaling.
dataset.head()

In [None]:
# Separate the model inputs from the label.
# y is the 'Heart Disease' column; X is every other column of `dataset`.
y = dataset['Heart Disease']
X = dataset.drop(columns=['Heart Disease'])

In [None]:
# MODEL BUILDING
# The cells below evaluate seven classifiers: K-Neighbours, Decision Tree,
# Random Forest, SVM, Logistic Regression, GaussianNB and Gradient Boosting.
# Display the feature matrix that all of them are scored on.
X

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test partition (fixed seed for repeatability).
# stratify=y keeps the class proportions of the target the same in the train
# and test partitions, which a plain random split does not guarantee.
# NOTE(review): the cells visible below score models with cross_val_score on
# the full X / y and never read these four variables.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
#K-NEIGHBOURS CLASSIFIER
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [None]:
# Mean 10-fold cross-validated accuracy for every neighbour count 1..20.
# knn_scores[k - 1] is the score for k neighbours, rounded to 3 decimals.
knn_scores = []
for n_neighbors in range(1, 21):
    knn_classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
    cvs_scores = cross_val_score(knn_classifier, X, y, cv=10)
    mean_accuracy = cvs_scores.mean()
    knn_scores.append(round(mean_accuracy, 3))


In [None]:
# Plotting the results of knn_scores: accuracy against the number of neighbours.
k_values = list(range(1, 21))
plt.figure(figsize=(20,15))
plt.plot(k_values, knn_scores, color = 'orange')
for k, score in zip(k_values, knn_scores):
    # Build the label text explicitly. Passing a tuple relies on its repr,
    # which shows NumPy scalars as "np.float64(0.8)" on NumPy >= 2.
    plt.text(k, score, f'({k}, {score})')
plt.xticks(k_values)
plt.xlabel('Number of Neighbors (K)')
plt.ylabel('Scores')
plt.title('K Neighbors Classifier scores for different K values')
plt.show()

In [None]:
# Score the KNN classifier with k = 12 using 10-fold cross-validation.
knn_classifier = KNeighborsClassifier(n_neighbors=12)
cvs_scores = cross_val_score(knn_classifier, X, y, cv=10)
# Convert to percent before rounding. round(mean, 4) * 100 multiplies after
# rounding, which can print a long floating-point tail instead of 2 decimals.
res1 = round(float(cvs_scores.mean()) * 100, 2)
print("KNeighbours Classifier Accuracy with K=12 is: {}%".format(res1))

In [None]:
#DECISION TREE CLASSIFIER
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Mean 10-fold cross-validated accuracy for tree depths 1..10.
# decision_scores[d - 1] is the score for max_depth=d, rounded to 3 decimals.
decision_scores = []
for depth in range(1, 11):
    # A fixed random_state makes the sweep repeatable; without it the scores,
    # and therefore the "best" depth, can change from run to run.
    decision_classifier = DecisionTreeClassifier(max_depth=depth, random_state=42)
    cvs_scores = cross_val_score(decision_classifier, X, y, cv=10)
    decision_scores.append(round(cvs_scores.mean(), 3))

In [None]:
# Plotting the results of decision_scores: accuracy against tree depth.
depth_values = list(range(1, 11))
plt.figure(figsize=(20,15))
plt.plot(depth_values, decision_scores, color = 'green')
for depth, score in zip(depth_values, decision_scores):
    # Build the label text explicitly. Passing a tuple relies on its repr,
    # which shows NumPy scalars as "np.float64(0.8)" on NumPy >= 2.
    plt.text(depth, score, f'({depth}, {score})')
plt.xticks(depth_values)
plt.xlabel('Depth of Decision Tree (N)')
plt.ylabel('Scores')
plt.title('Decision Tree Classifier scores for different depth values')
plt.show()

In [None]:
# Score the decision tree with max_depth = 3 using 10-fold cross-validation.
# random_state is fixed so the reported accuracy is the same on every run.
decision_classifier = DecisionTreeClassifier(max_depth=3, random_state=42)
cvs_scores = cross_val_score(decision_classifier, X, y, cv=10)
# Convert to percent before rounding. round(mean, 4) * 100 multiplies after
# rounding, which can print a long floating-point tail instead of 2 decimals.
res2 = round(float(cvs_scores.mean()) * 100, 2)
print("Decision Tree Classifier Accuracy with max_depth=3 is: {}%".format(res2))

In [None]:
#RANDOM FOREST CLASSIFIER
# Importing essential libraries
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Mean 5-fold cross-validated accuracy for forests of 10, 20, ..., 100 trees.
# forest_scores[j] is the score for n_estimators = 10 * (j + 1), rounded to 3 decimals.
forest_scores = []
for n_trees in range(10, 101, 10):
    # An unseeded forest gives different scores on every run, so the "best"
    # tree count would not be reproducible. Fix the seed.
    forest_classifier = RandomForestClassifier(n_estimators=n_trees, random_state=42)
    cvs_scores = cross_val_score(forest_classifier, X, y, cv=5)
    forest_scores.append(round(cvs_scores.mean(), 3))

In [None]:
# Plotting the results of forest_scores: accuracy against the number of trees.
estimator_counts = list(range(10, 101, 10))
plt.figure(figsize=(20,15))
plt.plot(estimator_counts, forest_scores, color = 'turquoise')
for n_trees, score in zip(estimator_counts, forest_scores):
    # Build the label text explicitly. Passing a tuple relies on its repr,
    # which shows NumPy scalars as "np.float64(0.8)" on NumPy >= 2.
    plt.text(n_trees, score, f'({n_trees}, {score})')
plt.xticks(estimator_counts)
plt.xlabel('Number of Estimators (N)')
plt.ylabel('Scores')
plt.title('Random Forest Classifier scores for different N values')
plt.show()

In [None]:
# Score the random forest with 90 trees using 5-fold cross-validation.
# random_state is fixed so the reported accuracy is the same on every run.
forest_classifier = RandomForestClassifier(n_estimators=90, random_state=42)
cvs_scores = cross_val_score(forest_classifier, X, y, cv=5)
# Convert to percent before rounding. round(mean, 4) * 100 multiplies after
# rounding, which can print a long floating-point tail instead of 2 decimals.
res3 = round(float(cvs_scores.mean()) * 100, 2)
print("Random Forest Classifier Accuracy with n_estimators=90 is: {}%".format(res3))

In [None]:
#SVM ALGORITHM

In [None]:
# Importing essential libraries
from sklearn.svm import SVC

# Sweep the regularisation parameter C of a linear-kernel SVM and record the
# mean 5-fold cross-validated accuracy for each value.
# The previous loop never used its loop variable, so it fitted the same
# C=1.0 model ten times and plotted a flat line.
svm_c_values = [step / 100 for step in range(10, 101, 10)]  # 0.1, 0.2, ..., 1.0
sv_scores = []
for c_value in svm_c_values:
    sv_classifier = SVC(kernel='linear', C=c_value)
    cvs_scores = cross_val_score(sv_classifier, X, y, cv=5)
    sv_scores.append(round(cvs_scores.mean(), 3))

# Plotting the results of sv_scores
plt.figure(figsize=(20,15))
plt.plot(svm_c_values, sv_scores, color = 'red')
for c_value, score in zip(svm_c_values, sv_scores):
    # Explicit label text: a tuple's repr shows "np.float64(...)" on NumPy >= 2.
    plt.text(c_value, score, f'({c_value}, {score})')
plt.xticks(svm_c_values)
plt.xlabel('Regularization parameter C (linear kernel)')
plt.ylabel('Scores')
plt.title('SVM Classifier scores for different C values')
plt.show()

# Score the reference configuration (linear kernel, C=1.0) with 5-fold CV.
sv_classifier = SVC(kernel='linear', C=1.0)
cvs_scores = cross_val_score(sv_classifier, X, y, cv=5)
# Percent first, then round, so the printed value has at most two decimals.
print("Support Vector Machine Accuracy with linear kernel is: {}%".format(round(float(cvs_scores.mean()) * 100, 2)))


In [None]:
#LOGISTIC REGRESSION

In [None]:
# Importing essential libraries
from sklearn.linear_model import LogisticRegression

# Sweep the inverse regularisation strength C and record the mean 5-fold
# cross-validated accuracy for each value.
# The previous loop never used its loop variable, so it fitted the same
# model ten times and plotted a flat line against a 'Random State' axis that
# was never varied.
lr_c_values = [step / 100 for step in range(10, 101, 10)]  # 0.1, 0.2, ..., 1.0
lr_scores = []
for c_value in lr_c_values:
    lr_classifier = LogisticRegression(random_state=16, C=c_value)
    cvs_scores = cross_val_score(lr_classifier, X, y, cv=5)
    lr_scores.append(round(cvs_scores.mean(), 3))

# Plotting the results of lr_scores
plt.figure(figsize=(20,15))
plt.plot(lr_c_values, lr_scores, color = 'red')
for c_value, score in zip(lr_c_values, lr_scores):
    # Explicit label text: a tuple's repr shows "np.float64(...)" on NumPy >= 2.
    plt.text(c_value, score, f'({c_value}, {score})')
plt.xticks(lr_c_values)
plt.xlabel('Inverse regularization strength C')
plt.ylabel('Scores')
plt.title('Logistic Regression scores for different C values')
plt.show()

# Score the reference configuration (library-default C, random_state=16) with 5-fold CV.
lr_classifier = LogisticRegression(random_state=16)
cvs_scores = cross_val_score(lr_classifier, X, y, cv=5)
# Percent first, then round, so the printed value has at most two decimals.
print("Logistic Regression Classifier Accuracy with random state 16 is: {}%".format(round(float(cvs_scores.mean()) * 100, 2)))


In [None]:
#GAUSSIAN NB ALGORITHM

In [None]:
# Importing essential libraries
from sklearn.naive_bayes import GaussianNB

# Sweep GaussianNB's var_smoothing over powers of ten and record the mean
# 5-fold cross-validated accuracy for each value.
# The previous loop never used its loop variable, so it fitted the same
# model ten times and plotted a flat line against a 'Random State' axis,
# a parameter GaussianNB was never given.
gnb_smoothing_values = [10.0 ** exponent for exponent in range(-9, 1)]  # 1e-9 ... 1
gnb_scores = []
for smoothing in gnb_smoothing_values:
    gnb_classifier = GaussianNB(var_smoothing=smoothing)
    cvs_scores = cross_val_score(gnb_classifier, X, y, cv=5)
    gnb_scores.append(round(cvs_scores.mean(), 3))

# Plotting the results of gnb_scores (log x-axis: the values span 9 decades)
plt.figure(figsize=(20,15))
plt.plot(gnb_smoothing_values, gnb_scores, color = 'red')
for smoothing, score in zip(gnb_smoothing_values, gnb_scores):
    # Explicit label text: a tuple's repr shows "np.float64(...)" on NumPy >= 2.
    plt.text(smoothing, score, f'({smoothing:.0e}, {score})')
plt.xscale('log')
plt.xticks(gnb_smoothing_values)
plt.xlabel('var_smoothing')
plt.ylabel('Scores')
plt.title('GaussianNB scores for different var_smoothing values')
plt.show()

# Score the reference configuration (library defaults) with 5-fold CV.
gnb_classifier = GaussianNB()
cvs_scores = cross_val_score(gnb_classifier, X, y, cv=5)
# Percent first, then round, so the printed value has at most two decimals.
print("GaussianNB Classifier Accuracy is: {}%".format(round(float(cvs_scores.mean()) * 100, 2)))


In [None]:
#GRADIENT BOOSTING CLASSIFIER
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Importing essential libraries
from sklearn.ensemble import GradientBoostingClassifier

# Sweep the learning rate from 0.1 to 1.0 and record the mean 5-fold
# cross-validated accuracy for each value.
# The previous loop never used its loop variable, so it fitted the same
# learning_rate=0.1 model ten times although the axis is labelled
# 'Learning Rate'.
gbc_learning_rates = [step / 100 for step in range(10, 101, 10)]  # 0.1, 0.2, ..., 1.0
gbc_scores = []
for rate in gbc_learning_rates:
    # Fixed seed so the sweep is repeatable.
    gbc_classifier = GradientBoostingClassifier(learning_rate=rate, random_state=42)
    cvs_scores = cross_val_score(gbc_classifier, X, y, cv=5)
    gbc_scores.append(round(cvs_scores.mean(), 3))

# Plotting the results of gbc_scores
plt.figure(figsize=(20,15))
plt.plot(gbc_learning_rates, gbc_scores, color = 'red')
for rate, score in zip(gbc_learning_rates, gbc_scores):
    # Explicit label text: a tuple's repr shows "np.float64(...)" on NumPy >= 2.
    plt.text(rate, score, f'({rate}, {score})')
plt.xticks(gbc_learning_rates)
plt.xlabel('Learning Rate')
plt.ylabel('Scores')
plt.title('GradientBoostingClassifier scores for different learning rates')
plt.show()

# Score the reference configuration (learning_rate=0.1) with 5-fold CV.
gbc_classifier = GradientBoostingClassifier(learning_rate=0.1, random_state=42)
cvs_scores = cross_val_score(gbc_classifier, X, y, cv=5)
# The message previously claimed "learning rate 1" while the model uses 0.1.
# Percent first, then round, so the printed value has at most two decimals.
print("GradientBoostingClassifier Accuracy with learning rate 0.1 is: {}%".format(round(float(cvs_scores.mean()) * 100, 2)))


In [None]:
# Pick the model with the highest cross-validated accuracy (in percent).
# Previously only the first three models were compared, so SVM, logistic
# regression, GaussianNB and gradient boosting could never be reported as
# best even when they scored higher.
model_scores = {
    'K-Neighbours Classifier': res1,
    'Decision Tree Classifier': res2,
    'Random Forest Classifier': res3,
}
# The remaining four cells only print their accuracy, so re-score the final
# estimator each of them leaves behind, with the same 5-fold CV they use.
for model_name, estimator in [
    ('Support Vector Machine Classifier', sv_classifier),
    ('Logistic Regression Classifier', lr_classifier),
    ('GaussianNB Classifier', gnb_classifier),
    ('Gradient Boosting Classifier', gbc_classifier),
]:
    fold_scores = cross_val_score(estimator, X, y, cv=5)
    model_scores[model_name] = round(float(fold_scores.mean()) * 100, 2)

accu = max(model_scores.values())
# On a tie, max() keeps the first entry in insertion order, which preserves
# the original KNN > Decision Tree > Random Forest preference.
s = max(model_scores, key=model_scores.get) + '\n'
print('The most accurate model is the '+s)