In [59]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix,ConfusionMatrixDisplay
from sklearn.decomposition import PCA


%matplotlib inline

In [60]:
df = pd.read_csv("/media/magesh/c9e890ad-c69f-4596-9b0f-c1eee81a1764/Magesh/Projects/ML/Data/heart.csv")
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [62]:
df.shape

(918, 12)

In [63]:
df.columns

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')

In [64]:
higher_val_columns = ['RestingBP','Cholesterol','MaxHR','Oldpeak']

limits={}
for col in higher_val_columns:
    upper_limit = df[col].mean() + 3 * df[col].std()
    lower_limit = df[col].mean() - 3 * df[col].std()
    limits[col] = {'upper':upper_limit,'lower':lower_limit}
print(limits)
    

{'RestingBP': {'upper': 188.22518968641754, 'lower': 76.7617384181576}, 'Cholesterol': {'upper': 526.9519979267626, 'lower': -129.35286938645763}, 'MaxHR': {'upper': 213.19037060647202, 'lower': 60.42836577697024}, 'Oldpeak': {'upper': 4.087074287570628, 'lower': -2.312346618725312}}


In [65]:
df_filtered = df.copy()

for col in higher_val_columns:
    df_filtered = df_filtered[(df_filtered[col] <= limits[col]['upper']) & (df_filtered[col] >= limits[col]['lower'])]

df_filtered

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [66]:
one_hot_encoded = pd.get_dummies(df_filtered[['Sex','ChestPainType','RestingECG','ExerciseAngina','ST_Slope']],
                                 prefix=['Sex','ChestPainType','RestingECG','ExerciseAngina','ST_Slope'])
one_hot_encoded = one_hot_encoded.astype(int)


# Concatenate the one-hot encoded columns with the original DataFrame
df_encoded = pd.concat([df_filtered, one_hot_encoded], axis=1)

df_encoded = df_encoded.drop(['Sex','ChestPainType','RestingECG','ExerciseAngina','ST_Slope'],axis=1)

df_encoded

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_F,Sex_M,ChestPainType_ASY,...,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,0,1,0,...,0,0,0,1,0,1,0,0,0,1
1,49,160,180,0,156,1.0,1,1,0,0,...,1,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,0,0,1,0,...,0,0,0,0,1,1,0,0,0,1
3,48,138,214,0,108,1.5,1,1,0,1,...,0,0,0,1,0,0,1,0,1,0
4,54,150,195,0,122,0.0,0,0,1,0,...,1,0,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,132,1.2,1,0,1,0,...,0,1,0,1,0,1,0,0,1,0
914,68,144,193,1,141,3.4,1,0,1,1,...,0,0,0,1,0,1,0,0,1,0
915,57,130,131,0,115,1.2,1,0,1,1,...,0,0,0,1,0,0,1,0,1,0
916,57,130,236,0,174,0.0,1,1,0,0,...,0,0,1,0,0,1,0,0,1,0


In [67]:
X = df_encoded.drop('HeartDisease',axis=1)
X

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,1,0,1,0,0,0,1,0,1,0,0,0,1
1,49,160,180,0,156,1.0,1,0,0,0,1,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,0,1,0,1,0,0,0,0,1,1,0,0,0,1
3,48,138,214,0,108,1.5,1,0,1,0,0,0,0,1,0,0,1,0,1,0
4,54,150,195,0,122,0.0,0,1,0,0,1,0,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,132,1.2,0,1,0,0,0,1,0,1,0,1,0,0,1,0
914,68,144,193,1,141,3.4,0,1,1,0,0,0,0,1,0,1,0,0,1,0
915,57,130,131,0,115,1.2,0,1,1,0,0,0,0,1,0,0,1,0,1,0
916,57,130,236,0,174,0.0,1,0,0,1,0,0,1,0,0,1,0,0,1,0


In [68]:
y = df_encoded['HeartDisease']
y

0      0
1      1
2      0
3      1
4      0
      ..
913    1
914    1
915    1
916    1
917    0
Name: HeartDisease, Length: 898, dtype: int64

In [69]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled

array([[-1.42957847,  0.46446475,  0.84944867, ..., -0.26033912,
        -1.        ,  1.13614098],
       [-0.47715466,  1.63491331, -0.16777385, ..., -0.26033912,
         1.        , -0.88017246],
       [-1.74705307, -0.12075953,  0.79345477, ..., -0.26033912,
        -1.        ,  1.13614098],
       ...,
       [ 0.36944427, -0.12075953, -0.62505737, ..., -0.26033912,
         1.        , -0.88017246],
       [ 0.36944427, -0.12075953,  0.35483588, ..., -0.26033912,
         1.        , -0.88017246],
       [-1.6412282 ,  0.34741989, -0.21443544, ..., -0.26033912,
        -1.        ,  1.13614098]])

In [70]:
X_train,X_test,y_train,y_test = train_test_split(X_scaled,y,test_size=0.20,random_state=42)

In [71]:
X_train

array([[ 0.79274374,  0.93264417,  0.04686943, ..., -0.26033912,
        -1.        ,  1.13614098],
       [ 1.11021835,  0.75707689,  0.13086028, ..., -0.26033912,
         1.        , -0.88017246],
       [-1.3237536 , -0.70598381,  1.28806755, ..., -0.26033912,
         1.        , -0.88017246],
       ...,
       [-0.58297953, -1.29120809,  0.28950966, ...,  3.84114384,
        -1.        , -0.88017246],
       [ 0.15779454,  0.23037504,  0.28017735, ..., -0.26033912,
         1.        , -0.88017246],
       [-0.79462927, -0.82302867, -0.11177995, ..., -0.26033912,
         1.        , -0.88017246]])

In [72]:
svm = SVC()
svm.fit(X_train,y_train)
svm.score(X_test,y_test)

0.85

In [73]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train,y_train)
lr.score(X_test,y_test)

0.8666666666666667

In [74]:
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
rf.score(X_test,y_test)

0.8611111111111112

In [76]:
pca = PCA()
X_pca = pca.fit_transform(X_scaled)
X_pca

array([[ 3.08353065e+00, -1.36852160e+00, -3.77661731e-01, ...,
         1.08443080e-15,  6.22477465e-16, -4.35999679e-16],
       [ 1.48622615e+00,  1.96176118e+00, -1.56570664e+00, ...,
         2.22504880e-16, -3.10496999e-16,  5.68659895e-16],
       [ 1.91660752e+00, -7.26393252e-01,  1.43110694e+00, ...,
         7.99399218e-16, -1.33052572e-17,  9.22668812e-16],
       ...,
       [-2.42260092e+00, -8.10363365e-01, -1.46690257e+00, ...,
         1.23703143e-16, -1.55111292e-16, -2.25378038e-17],
       [ 2.15909663e+00,  3.27768193e+00, -1.65325218e-01, ...,
         3.94265667e-17, -3.02901740e-16,  5.34290775e-17],
       [ 2.67122682e+00, -1.54614913e+00,  2.21266303e-01, ...,
         6.09105721e-17, -5.95321485e-17,  5.99086553e-17]])

In [77]:
X_train_pca,X_test_pca,y_train,y_test = train_test_split(X_pca,y,test_size=0.2,random_state=42)
X_train_pca

array([[ 1.09893997e+00, -1.14295635e+00,  1.26370477e-01, ...,
         2.87313604e-16,  5.50458735e-17,  1.51123449e-17],
       [-1.35395882e+00,  1.23839043e+00,  1.26645558e+00, ...,
         1.43120544e-16, -1.87283693e-16, -1.13027036e-16],
       [-2.26356655e+00, -3.98350641e-01, -2.20266989e+00, ...,
         2.00215799e-16, -8.32241931e-17,  1.39484866e-17],
       ...,
       [ 1.60370811e+00, -1.50152212e+00, -1.49003280e-01, ...,
         1.86922648e-16, -2.05622100e-16,  1.96072350e-16],
       [-1.89470814e+00,  7.30093279e-01,  9.66577881e-01, ...,
        -1.04992645e-16, -9.94266463e-17,  8.85535002e-17],
       [-1.36168148e-01, -1.40167681e+00, -9.99417240e-01, ...,
         2.76745685e-17, -2.77491266e-17,  8.82624301e-17]])

In [78]:
svm_p = SVC()
svm_p.fit(X_train_pca,y_train)
svm_p.score(X_test_pca,y_test)

0.85

In [79]:
lr_p = LogisticRegression(max_iter=1000)
lr_p.fit(X_train_pca,y_train)
lr_p.score(X_test_pca,y_test)

0.8666666666666667

In [80]:
rf_p = RandomForestClassifier()
rf_p.fit(X_train_pca,y_train)
rf_p.score(X_test_pca,y_test)

0.8444444444444444