In [1]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

# required dependancy
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

In [2]:
df = pd.read_csv("/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv")

## Visualizing the data

In [3]:
df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trtbps    303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalachh  303 non-null    int64  
 8   exng      303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slp       303 non-null    int64  
 11  caa       303 non-null    int64  
 12  thall     303 non-null    int64  
 13  output    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [5]:
df['output'].value_counts()

1    165
0    138
Name: output, dtype: int64

In [6]:
df.corr()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
age,1.0,-0.098447,-0.068653,0.279351,0.213678,0.121308,-0.116211,-0.398522,0.096801,0.210013,-0.168814,0.276326,0.068001,-0.225439
sex,-0.098447,1.0,-0.049353,-0.056769,-0.197912,0.045032,-0.058196,-0.04402,0.141664,0.096093,-0.030711,0.118261,0.210041,-0.280937
cp,-0.068653,-0.049353,1.0,0.047608,-0.076904,0.094444,0.044421,0.295762,-0.39428,-0.14923,0.119717,-0.181053,-0.161736,0.433798
trtbps,0.279351,-0.056769,0.047608,1.0,0.123174,0.177531,-0.114103,-0.046698,0.067616,0.193216,-0.121475,0.101389,0.06221,-0.144931
chol,0.213678,-0.197912,-0.076904,0.123174,1.0,0.013294,-0.15104,-0.00994,0.067023,0.053952,-0.004038,0.070511,0.098803,-0.085239
fbs,0.121308,0.045032,0.094444,0.177531,0.013294,1.0,-0.084189,-0.008567,0.025665,0.005747,-0.059894,0.137979,-0.032019,-0.028046
restecg,-0.116211,-0.058196,0.044421,-0.114103,-0.15104,-0.084189,1.0,0.044123,-0.070733,-0.05877,0.093045,-0.072042,-0.011981,0.13723
thalachh,-0.398522,-0.04402,0.295762,-0.046698,-0.00994,-0.008567,0.044123,1.0,-0.378812,-0.344187,0.386784,-0.213177,-0.096439,0.421741
exng,0.096801,0.141664,-0.39428,0.067616,0.067023,0.025665,-0.070733,-0.378812,1.0,0.288223,-0.257748,0.115739,0.206754,-0.436757
oldpeak,0.210013,0.096093,-0.14923,0.193216,0.053952,0.005747,-0.05877,-0.344187,0.288223,1.0,-0.577537,0.222682,0.210244,-0.430696


## Preprocessing Data

In [7]:
# dropping columns with poor co-relation
df.drop(['chol', 'fbs'], axis=1, inplace=True)

In [8]:
X = df.drop(['output'], axis=1)
y = df['output']

In [9]:
for i in y:
    print(i, end=" ")

1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
std_scaler = StandardScaler()
X_train_sc = std_scaler.fit_transform(X_train)
X_test_sc = std_scaler.fit_transform(X_test)

## Training SVC

In [12]:
# Finding the best parameters for svc
svc_clf_trial = SVC()
param = {
    'C' : [0.1, 0.4, 0.8, 1, 1.2, 1.4, 1.6, 2, 2.2, 2.4],
    'kernel' : ['rbf', 'poly', 'linear']
}
grid_svc = GridSearchCV(svc_clf_trial,param)
grid_svc.fit(X_train_sc, y_train)
grid_svc.best_params_

{'C': 2, 'kernel': 'rbf'}

In [13]:
svc_clf = SVC(C=2, kernel='rbf', random_state=42)
svc_clf.fit(X_train_sc, y_train)
y_pred_svc = svc_clf.predict(X_test_sc)
accuracy_score(y_test, y_pred_svc)

0.8688524590163934

## Training LogisticRegression

In [14]:
log_reg_trial = LogisticRegression(random_state=42)
param = {
    'solver' : ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    'penalty' : ['l2', 'none'],
    'C' : [0.1, 0.3, 0.8, 1, 1.2, 1.4, 1.6, 1.8]
}
grid_log_reg = GridSearchCV(log_reg_trial, param)
grid_log_reg.fit(X_train_sc, y_train)
grid_log_reg.best_params_

{'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}

In [15]:
log_reg = LogisticRegression(C=0.1, penalty='l2', solver='liblinear', random_state=42)
log_reg.fit(X_train_sc, y_train)
y_pred_log = log_reg.predict(X_test_sc)
accuracy_score(y_test, y_pred_log)

0.8852459016393442

## Training KNeighborsClassifier

In [16]:
knn_clf_trial = KNeighborsClassifier()
param = {
    'n_neighbors' : [1,2,3,4,5,6,7]
}
grid_knn = GridSearchCV(knn_clf_trial, param)
grid_knn.fit(X_train_sc, y_train)
grid_knn.best_params_

{'n_neighbors': 5}

In [17]:
knn_clf = KNeighborsClassifier(n_neighbors=5)
knn_clf.fit(X_train_sc, y_train)
y_pred_knn = knn_clf.predict(X_test_sc)
accuracy_score(y_test, y_pred_knn)

0.8852459016393442

## Training RandomForestRegressor

In [18]:
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train_sc, y_train)
y_pred_rf = rf_clf.predict(X_test_sc)
accuracy_score(y_test, y_pred_rf)

0.8524590163934426

## Trying VotingClassifier

In [19]:
voting_clf = VotingClassifier(
    estimators=[('svc', svc_clf), ('log_reg', log_reg),
                ('knn', knn_clf), ('rf', rf_clf)],
    voting='hard'
)
voting_clf.fit(X_train_sc, y_train)
y_pred_voting = voting_clf.predict(X_test_sc)
accuracy_score(y_test, y_pred_voting)

0.8688524590163934

### Since Kneighbors and LogisticRegression both have same accuracy score lets use cross_val_score to further evaluate them

In [20]:
cross_knn = cross_val_score(estimator = knn_clf, X = X_train_sc, y = y_train, cv=10)
cross_logreg = cross_val_score(estimator = log_reg, X = X_train_sc, y = y_train, cv=10)

print("KNN: ", cross_knn.mean())
print("Log_reg: ", cross_logreg.mean())

KNN:  0.8180000000000002
Log_reg:  0.8226666666666667


### Since we now know that knn and log_reg are overfitting, lets try cross_val_score on all models

In [21]:
cross_rf = cross_val_score(estimator = rf_clf, X = X_train_sc, y = y_train, cv=10)
cross_svc = cross_val_score(estimator = svc_clf, X = X_train_sc, y = y_train, cv=10)
cross_vote = cross_val_score(estimator = voting_clf, X = X_train_sc, y = y_train, cv=10)

print("RF: ", cross_rf.mean())
print("SVC: ", cross_svc.mean())
print("Vote: ", cross_vote.mean())

RF:  0.8181666666666667
SVC:  0.8220000000000001
Vote:  0.8305


## Thus the Voting Classifier gives the best accuracy

#### Note: The overfitting of dataset can be reduced by introduction of noise in dataset.