In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
import os
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
## loading the data

for _,dirnames, filenames in os.walk('C:\\Users\\ajaym\\Desktop\\HeartDiseaseClassification\\Data'):
    for filename in filenames:
        print(filename)

EDA.ipynb
Model_training.ipynb
raw_data.csv
Untitled.ipynb
EDA-checkpoint.ipynb
Model_training-checkpoint.ipynb
Untitled-checkpoint.ipynb


In [3]:
mydf = pd.read_csv('raw_data.csv')

mydf.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
copydf = mydf.copy()

In [5]:
## extracting dependent and independent features

dependent_var = copydf[['target']]

dependent_var.head()

Unnamed: 0,target
0,1
1,1
2,1
3,1
4,1


In [6]:
independent_var = copydf.drop('target', axis = 1)

independent_var.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [7]:
independent_var.nunique()

age          41
sex           2
cp            4
trestbps     49
chol        152
fbs           2
restecg       3
thalach      91
exang         2
oldpeak      40
slope         3
ca            5
thal          4
dtype: int64

In [8]:
independent_var.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
dtype: object

In [9]:
independent_var[['cp', 'restecg', 'thal']] = independent_var[['cp', 'restecg', 'thal']].astype('O')

In [10]:
independent_var.dtypes

age           int64
sex           int64
cp           object
trestbps      int64
chol          int64
fbs           int64
restecg      object
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal         object
dtype: object

In [11]:
numeric_cols = [feature for feature in independent_var.columns if independent_var[feature].dtype != 'O']

In [12]:
numeric_cols

['age',
 'sex',
 'trestbps',
 'chol',
 'fbs',
 'thalach',
 'exang',
 'oldpeak',
 'slope',
 'ca']

In [13]:
len(numeric_cols)

10

In [14]:
categoric_cols = [feature for feature in independent_var.columns if independent_var[feature].dtypes == 'O']

In [15]:
categoric_cols

['cp', 'restecg', 'thal']

In [16]:
print('The number of numeric features are {} and they are: {}'.format(len(numeric_cols), numeric_cols))
print('The number of categoric_cols are {} and they are : {}'.format(len(categoric_cols), categoric_cols))

The number of numeric features are 10 and they are: ['age', 'sex', 'trestbps', 'chol', 'fbs', 'thalach', 'exang', 'oldpeak', 'slope', 'ca']
The number of categoric_cols are 3 and they are : ['cp', 'restecg', 'thal']


In [17]:
independent_var = pd.get_dummies(independent_var, columns = ['cp', 'restecg', 'thal'], drop_first = True)

In [18]:
independent_var.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,slope,ca,cp_1,cp_2,cp_3,restecg_1,restecg_2,thal_1,thal_2,thal_3
0,63,1,145,233,1,150,0,2.3,0,0,False,False,True,False,False,True,False,False
1,37,1,130,250,0,187,0,3.5,0,0,False,True,False,True,False,False,True,False
2,41,0,130,204,0,172,0,1.4,2,0,True,False,False,False,False,False,True,False
3,56,1,120,236,0,178,0,0.8,2,0,True,False,False,True,False,False,True,False
4,57,0,120,354,0,163,1,0.6,2,0,False,False,False,True,False,False,True,False


In [19]:
independent_var.dtypes

age            int64
sex            int64
trestbps       int64
chol           int64
fbs            int64
thalach        int64
exang          int64
oldpeak      float64
slope          int64
ca             int64
cp_1            bool
cp_2            bool
cp_3            bool
restecg_1       bool
restecg_2       bool
thal_1          bool
thal_2          bool
thal_3          bool
dtype: object

In [20]:
bool_features = [feature for feature in independent_var.columns if independent_var[feature].dtype =='bool']

bool_features

['cp_1',
 'cp_2',
 'cp_3',
 'restecg_1',
 'restecg_2',
 'thal_1',
 'thal_2',
 'thal_3']

In [21]:
independent_var[['cp_1',
 'cp_2',
 'cp_3',
 'restecg_1',
 'restecg_2',
 'thal_1',
 'thal_2',
 'thal_3']] = independent_var[['cp_1',
 'cp_2',
 'cp_3',
 'restecg_1',
 'restecg_2',
 'thal_1',
 'thal_2',
 'thal_3']].astype('int')

In [22]:
independent_var.dtypes

age            int64
sex            int64
trestbps       int64
chol           int64
fbs            int64
thalach        int64
exang          int64
oldpeak      float64
slope          int64
ca             int64
cp_1           int32
cp_2           int32
cp_3           int32
restecg_1      int32
restecg_2      int32
thal_1         int32
thal_2         int32
thal_3         int32
dtype: object

In [23]:
independent_var.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,slope,ca,cp_1,cp_2,cp_3,restecg_1,restecg_2,thal_1,thal_2,thal_3
0,63,1,145,233,1,150,0,2.3,0,0,0,0,1,0,0,1,0,0
1,37,1,130,250,0,187,0,3.5,0,0,0,1,0,1,0,0,1,0
2,41,0,130,204,0,172,0,1.4,2,0,1,0,0,0,0,0,1,0
3,56,1,120,236,0,178,0,0.8,2,0,1,0,0,1,0,0,1,0
4,57,0,120,354,0,163,1,0.6,2,0,0,0,0,1,0,0,1,0


In [25]:
numericFeatures = [feature for feature in independent_var.columns if independent_var[feature].dtype != 'O']

numericFeatures

['age',
 'sex',
 'trestbps',
 'chol',
 'fbs',
 'thalach',
 'exang',
 'oldpeak',
 'slope',
 'ca',
 'cp_1',
 'cp_2',
 'cp_3',
 'restecg_1',
 'restecg_2',
 'thal_1',
 'thal_2',
 'thal_3']

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
X_train, X_test, y_train, y_test = train_test_split(independent_var, dependent_var,test_size = 0.2)

In [28]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((242, 18), (61, 18), (242, 1), (61, 1))

In [29]:
continous_features_xtrain = X_train[['age', 'trestbps', 'chol', 'thalach', 'oldpeak']]

In [30]:
continous_features_xtrain.head()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak
65,35,138,183,182,1.4
15,50,120,219,158,1.6
256,58,128,259,130,3.0
78,52,128,205,184,0.0
180,55,132,353,132,1.2


In [39]:
continous_features_xtrain.shape

(242, 5)

In [31]:
continous_features_xtest = X_test[['age', 'trestbps', 'chol', 'thalach', 'oldpeak']]

In [32]:
continous_features_xtest.head()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak
298,57,140,241,123,0.2
20,59,135,234,161,0.5
22,42,140,226,178,0.0
176,60,117,230,160,1.4
293,67,152,212,150,0.8


In [40]:
from sklearn.preprocessing import PowerTransformer

In [41]:
pt = PowerTransformer(method='yeo-johnson')

In [42]:
continous_features_xtrain = pt.fit_transform(continous_features_xtrain)

In [43]:
continous_features_xtrain.shape

(242, 5)

In [44]:
continous_features_xtrain = pd.DataFrame(continous_features_xtrain)
continous_features_xtrain.head()

Unnamed: 0,0,1,2,3,4
0,-1.976647,0.463968,-1.435224,1.541221,0.705684
1,-0.489377,-0.660288,-0.493905,0.226512,0.828733
2,0.389145,-0.131202,0.355943,-0.973319,1.397133
3,-0.275985,-0.131202,-0.836083,1.663012,-1.191272
4,0.051771,0.115016,1.850926,-0.89837,0.564622


In [45]:
continous_features_xtrain.shape

(242, 5)

In [46]:
X_train[['age', 'trestbps', 'chol', 'thalach', 'oldpeak']] = continous_features_xtrain[[0,1,2,3,4]]

In [47]:
X_train.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,slope,ca,cp_1,cp_2,cp_3,restecg_1,restecg_2,thal_1,thal_2,thal_3
65,1.450885,0,1.568377,0.845673,0,-1.700599,0,0.769213,2,0,0,0,0,1,0,0,1,0
15,0.389145,0,0.115016,-0.378008,0,1.01649,0,1.452861,1,0,0,1,0,1,0,0,1,0
256,,1,,,0,,1,,1,2,0,0,0,0,0,0,0,1
78,-1.976647,1,-0.660288,-1.017914,1,-0.973319,0,0.828733,2,0,1,0,0,1,0,0,1,0
180,0.389145,1,-1.245808,-0.242936,0,0.582011,1,1.236265,1,1,0,0,0,1,0,0,0,1


In [49]:
continous_features_xtest = pt.transform(continous_features_xtest)

In [50]:
continous_features_xtest = pd.DataFrame(continous_features_xtest)
continous_features_xtest.head()

Unnamed: 0,0,1,2,3,4
0,66.057551,143.63811,260.504275,149.518328,0.242124
1,68.479138,138.47682,252.812146,198.295029,0.532341
2,48.030472,143.63811,244.025977,220.312611,0.043928
3,69.691357,119.906638,248.418407,197.003377,1.366756
4,78.202202,156.029938,228.663121,184.109433,0.815735


In [51]:
X_test[['age', 'trestbps', 'chol', 'thalach', 'oldpeak']] = continous_features_xtest[[0,1,2,3,4]]

In [54]:
X_test

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,slope,ca,cp_1,cp_2,cp_3,restecg_1,restecg_2,thal_1,thal_2,thal_3
298,,0,,,0,,1,,1,0,0,0,0,1,0,0,0,1
20,70.904509,1,153.964185,262.702735,0,167.411825,0,1.001520,1,0,0,0,0,1,0,0,0,1
22,78.202202,1,128.157953,274.799743,0,200.879534,0,0.242124,2,0,0,0,0,1,0,0,1,0
176,,1,,,1,,1,,2,2,0,0,0,1,0,0,0,1
293,,1,,,0,,0,,1,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,,1,,,0,,1,,1,2,0,0,0,0,0,0,0,1
289,,0,,,0,,1,,1,1,0,0,0,0,1,0,0,1
203,,1,,,1,,1,,1,0,0,1,0,0,0,0,0,1
111,,1,,,1,,0,,2,1,0,1,0,1,0,0,0,1


In [53]:
X_test.shape

(61, 18)

In [56]:
preprocessor = ColumnTransformer(
transformers = [
    ('standardscaler', standard_transformer, numericFeatures)
])