In [38]:
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import matplotlib.pyplot as plt

In [39]:
df = pd.read_csv('heart.csv')
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [40]:
for i in ['RestingBP','Cholesterol','MaxHR','Oldpeak']:
    df = df[(df[i]>=(df[i].mean()-3*df[i].std())) & (df[i]<=(df[i].mean()+3*df[i].std()))]

for i in ['Sex','ChestPainType','RestingECG','ST_Slope','ExerciseAngina']:
    df[i] = df[i].apply(lambda x: x.strip())
    
df.reset_index(drop=True,inplace=True)

In [41]:
ohe = OneHotEncoder()
encoded_df = pd.DataFrame((ohe.fit_transform(df[['Sex','ChestPainType','RestingECG','ST_Slope','ExerciseAngina']])).toarray(),columns=ohe.get_feature_names_out(input_features=['Sex', 'ChestPainType', 'RestingECG', 'ST_Slope', 'ExerciseAngina']))

In [42]:
df = pd.concat([df,encoded_df],axis=1)
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,...,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,ExerciseAngina_N,ExerciseAngina_Y
0,40,M,ATA,140,289,0,Normal,172,N,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
2,37,M,ATA,130,283,0,ST,98,N,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4,54,M,NAP,150,195,0,Normal,122,N,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
894,45,M,TA,110,264,0,Normal,132,N,1.2,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
895,68,M,ASY,144,193,1,Normal,141,N,3.4,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
896,57,M,ASY,130,131,0,Normal,115,Y,1.2,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
897,57,F,ATA,130,236,0,LVH,174,N,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [43]:
X = df.drop(['HeartDisease','Sex','ChestPainType','RestingECG','ST_Slope','ExerciseAngina'],axis=1)
y = df['HeartDisease']

In [44]:
from sklearn.preprocessing import scale
X = scale(X)
X

array([[-1.42815446,  0.46590022,  0.84963584, ...,  1.13469459,
         0.8229452 , -0.8229452 ],
       [-0.47585532,  1.63471366, -0.16812204, ..., -0.88129441,
         0.8229452 , -0.8229452 ],
       [-1.7455875 , -0.1185065 ,  0.79361247, ...,  1.13469459,
         0.8229452 , -0.8229452 ],
       ...,
       [ 0.3706328 , -0.1185065 , -0.62564622, ..., -0.88129441,
        -1.21514774,  1.21514774],
       [ 0.3706328 , -0.1185065 ,  0.35476274, ..., -0.88129441,
         0.8229452 , -0.8229452 ],
       [-1.63977649,  0.34901888, -0.21480818, ...,  1.13469459,
         0.8229452 , -0.8229452 ]])

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [46]:
models = [{
    'model':SVC(),
    'params':{
        'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
        'gamma':[1,2,3]
        }
    },{
    'model': LogisticRegression(),
    'params':{}
    },{
    'model':RandomForestClassifier(),
    'params':{
        'n_estimators':[50,75,100],
        'criterion':['gini', 'entropy']
    }
    }]

models_stats = []
best_score = 0

for i in models:
    clf = GridSearchCV(i['model'],i['params'],cv=5)
    clf.fit(X,y)
    models_stats.append({
        'Model':str(i['model']).replace('()',''),
        'Best_Params':clf.best_params_,
        'Best_scores':clf.best_score_
    })

    if clf.best_score_ > best_score:
        model = clf.best_estimator_
        best_score = clf.best_score_

models_stats

[{'Model': 'SVC',
  'Best_Params': {'gamma': 1, 'kernel': 'linear'},
  'Best_scores': 0.8263749224084419},
 {'Model': 'LogisticRegression',
  'Best_Params': {},
  'Best_scores': 0.8263997517070143},
 {'Model': 'RandomForestClassifier',
  'Best_Params': {'criterion': 'entropy', 'n_estimators': 50},
  'Best_scores': 0.8352948479205462}]

In [48]:
model

In [47]:
model.fit(X,y)
model.score(X_test,y_test)

1.0

In [54]:
pca = PCA(0.95)
X_pca =  pca.fit_transform(X,y)

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X_pca,y,test_size=0.2)

In [56]:
new_model = RandomForestClassifier(50,criterion='entropy')
new_model.fit(X_pca,y)
new_model.score(X_test,y_test)

0.9944444444444445

PCA reduce the model accuracy and reduce X dimension / feature but make the computation lighter