# Heart Disease Prediction

In [67]:
import pandas as pd
import numpy as np

In [68]:
df = pd.read_csv("heart.csv")
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [69]:
import warnings
warnings.filterwarnings("ignore")

In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [71]:
df.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [72]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [73]:
np.unique(df["RestingECG"])

array(['LVH', 'Normal', 'ST'], dtype=object)

In [74]:
np.unique(df["ExerciseAngina"])

array(['N', 'Y'], dtype=object)

In [75]:
np.unique(df["ST_Slope"])

array(['Down', 'Flat', 'Up'], dtype=object)

In [76]:
df["Sex"] = le.fit_transform(df["Sex"])
df["ExerciseAngina"] = le.fit_transform(df["ExerciseAngina"])
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,ATA,140,289,0,Normal,172,0,0.0,Up,0
1,49,0,NAP,160,180,0,Normal,156,0,1.0,Flat,1
2,37,1,ATA,130,283,0,ST,98,0,0.0,Up,0
3,48,0,ASY,138,214,0,Normal,108,1,1.5,Flat,1
4,54,1,NAP,150,195,0,Normal,122,0,0.0,Up,0


In [77]:
np.unique(df["ExerciseAngina"])

array([0, 1])

In [78]:
df = pd.get_dummies(df, drop_first=True, dtype = int)
df.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ST_Slope_Flat,ST_Slope_Up
0,40,1,140,289,0,172,0,0.0,0,1,0,0,1,0,0,1
1,49,0,160,180,0,156,0,1.0,1,0,1,0,1,0,1,0
2,37,1,130,283,0,98,0,0.0,0,1,0,0,0,1,0,1
3,48,0,138,214,0,108,1,1.5,1,0,0,0,1,0,1,0
4,54,1,150,195,0,122,0,0.0,0,0,1,0,1,0,0,1


In [79]:
X = df.drop(["HeartDisease"], axis = 1)
y = df["HeartDisease"]

In [80]:
X.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ST_Slope_Flat,ST_Slope_Up
0,40,1,140,289,0,172,0,0.0,1,0,0,1,0,0,1
1,49,0,160,180,0,156,0,1.0,0,1,0,1,0,1,0
2,37,1,130,283,0,98,0,0.0,1,0,0,0,1,0,1
3,48,0,138,214,0,108,1,1.5,0,0,0,1,0,1,0
4,54,1,150,195,0,122,0,0.0,0,1,0,1,0,0,1


In [81]:
X.shape

(918, 15)

In [82]:
y[:5]

0    0
1    1
2    0
3    1
4    0
Name: HeartDisease, dtype: int64

In [83]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)
X_scaled[:5]

array([[-1.4331398 ,  0.51595242,  0.41090889,  0.82507026, -0.55134134,
         1.38292822, -0.8235563 , -0.83243239,  2.07517671, -0.53283777,
        -0.22967867,  0.81427482, -0.49044933, -1.00218103,  1.15067399],
       [-0.47848359, -1.93816322,  1.49175234, -0.17196105, -0.55134134,
         0.75415714, -0.8235563 ,  0.10566353, -0.48188667,  1.87674385,
        -0.22967867,  0.81427482, -0.49044933,  0.99782372, -0.86905588],
       [-1.75135854,  0.51595242, -0.12951283,  0.7701878 , -0.55134134,
        -1.52513802, -0.8235563 , -0.83243239,  2.07517671, -0.53283777,
        -0.22967867, -1.22808661,  2.03894663, -1.00218103,  1.15067399],
       [-0.5845565 , -1.93816322,  0.30282455,  0.13903954, -0.55134134,
        -1.13215609,  1.21424608,  0.57471149, -0.48188667, -0.53283777,
        -0.22967867,  0.81427482, -0.49044933,  0.99782372, -0.86905588],
       [ 0.05188098,  0.51595242,  0.95133062, -0.0347549 , -0.55134134,
        -0.5819814 , -0.8235563 , -0.83243239, 

In [84]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.2, random_state = 42)

In [85]:
len(X_train)

734

In [86]:
len(X_test)

184

# Model Selection

In [88]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [89]:
model_params = {
    "logistic_regression" : {
        "model" : LogisticRegression(max_iter = 1000),
        "params" : {
            "C" : [0.1, 1, 10]
        }
    },
    "svc" : {
        "model" : SVC(gamma = "auto"),
        "params" : {
            "C" : [0.1, 1, 10],
            "kernel" : ["linear", "rbf"]
        }
    },
    "random_forest" : {
        "model" : RandomForestClassifier(),
        "params" : {
            "n_estimators" : [10,50,100]
        }
    },
    "decision_tree" : {
        "model" : DecisionTreeClassifier(),
        "params" : {
            "max_depth" : [None, 5, 10]
        }
    },
    "gaussianNB" : {
        "model" : GaussianNB(),
        "params" : {
            "var_smoothing" : [1e-8, 1e-9, 1e-10]
        }
    }
}

In [90]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

In [91]:
scores = []

for model, model_param in model_params.items():
    clf = RandomizedSearchCV(model_param["model"], model_param["params"], cv = 3)
    clf.fit(X_train, y_train)
    scores.append({
        "model_name" : model,
        "best_parameter" : clf.best_params_,
        "best_score" : clf.best_score_
    })
scores

[{'model_name': 'logistic_regression',
  'best_parameter': {'C': 0.1},
  'best_score': 0.8719192595070816},
 {'model_name': 'svc',
  'best_parameter': {'kernel': 'linear', 'C': 1},
  'best_score': 0.8705531392885023},
 {'model_name': 'random_forest',
  'best_parameter': {'n_estimators': 50},
  'best_score': 0.8610181777629085},
 {'model_name': 'decision_tree',
  'best_parameter': {'max_depth': 5},
  'best_score': 0.7983383517341363},
 {'model_name': 'gaussianNB',
  'best_parameter': {'var_smoothing': 1e-08},
  'best_score': 0.865110962417754}]

In [117]:
score_df = pd.DataFrame(scores)
score_df

Unnamed: 0,model_name,best_parameter,best_score
0,logistic_regression,{'C': 0.1},0.871919
1,svc,"{'kernel': 'linear', 'C': 1}",0.870553
2,random_forest,{'n_estimators': 50},0.861018
3,decision_tree,{'max_depth': 5},0.798338
4,gaussianNB,{'var_smoothing': 1e-08},0.865111


In [131]:
model = LogisticRegression(max_iter=1000, C=0.1)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.8532608695652174

In [129]:
cross_val_score(model, X_train, y_train, cv=3).mean()

0.8719192595070816

# Principal Component Analysis

In [134]:
from sklearn.decomposition import PCA

In [136]:
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

In [140]:
pca.n_components_

13

In [142]:
X_scaled.shape

(918, 15)

In [144]:
pca.explained_variance_ratio_

array([0.22586656, 0.11008684, 0.0946602 , 0.08202286, 0.07477575,
       0.0709148 , 0.06266947, 0.05485375, 0.05125547, 0.044232  ,
       0.04085286, 0.03082035, 0.02842801])

In [146]:
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y, test_size = 0.2, random_state = 42) 

In [148]:
model_pca = LogisticRegression(max_iter = 1000, C = 0.1)
model_pca.fit(X_train_pca, y_train_pca)

In [150]:
model_pca.score(X_test_pca, y_test_pca)

0.8532608695652174

# Using Replace Function

In [155]:
new_df = pd.read_csv("heart.csv")
new_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [159]:
np.unique(new_df["Sex"])

array(['F', 'M'], dtype=object)

In [163]:
np.unique(new_df["ChestPainType"])

array(['ASY', 'ATA', 'NAP', 'TA'], dtype=object)

In [165]:
np.unique(new_df["RestingECG"])

array(['LVH', 'Normal', 'ST'], dtype=object)

In [167]:
np.unique(new_df["ExerciseAngina"])

array(['N', 'Y'], dtype=object)

In [169]:
np.unique(new_df["ST_Slope"])

array(['Down', 'Flat', 'Up'], dtype=object)

In [173]:
new_df["Sex"].replace({
    "F" : 0,
    "M" : 1
}, inplace = True)

new_df["ChestPainType"].replace({
    "ASY" : 0,
    "ATA" : 1,
    "NAP" : 2,
    "TA" : 3
}, inplace = True)

new_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,Normal,172,N,0.0,Up,0
1,49,0,2,160,180,0,Normal,156,N,1.0,Flat,1
2,37,1,1,130,283,0,ST,98,N,0.0,Up,0
3,48,0,0,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,1,2,150,195,0,Normal,122,N,0.0,Up,0
