In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data/heart.csv')
df.shape

(918, 12)

### Outliers

In [3]:
columns = ['RestingBP', 'Cholesterol', 'MaxHR']
for x in columns:
    df['zscore_' + x] = (df[x] - df[x].mean()) / df[x].std()

In [4]:
outliers = []
for x in columns:
    outliers.append(df.loc[(df['zscore_' + x] > 3) | (df['zscore_' + x] < -3)])
    
outliers = pd.concat(outliers)

In [5]:
df.drop(outliers.index, inplace=True)
df.shape

(906, 15)

In [6]:
df.drop(['zscore_RestingBP', 'zscore_Cholesterol', 'zscore_MaxHR'], axis=1, inplace=True)
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


### Label Encoding - Hot encoding

In [7]:
df.dtypes

Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object

In [8]:
keys = df.select_dtypes(include='object').keys()

for i in keys:
    values = df[i].unique()
    print(i, values)

Sex ['M' 'F']
ChestPainType ['ATA' 'NAP' 'ASY' 'TA']
RestingECG ['Normal' 'ST' 'LVH']
ExerciseAngina ['N' 'Y']
ST_Slope ['Up' 'Flat' 'Down']


In [9]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

for i in keys:
    if len(df[i].unique()) > 2:
        df[i] = df[[i]].apply(le.fit_transform)
    else:
        df = pd.get_dummies(df, columns=[i])
        
df.head()

Unnamed: 0,Age,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,Oldpeak,ST_Slope,HeartDisease,Sex_F,Sex_M,ExerciseAngina_N,ExerciseAngina_Y
0,40,1,140,289,0,1,172,0.0,2,0,0,1,1,0
1,49,2,160,180,0,1,156,1.0,1,1,1,0,1,0
2,37,1,130,283,0,2,98,0.0,2,0,0,1,1,0
3,48,0,138,214,0,1,108,1.5,1,1,1,0,0,1
4,54,2,150,195,0,1,122,0.0,2,0,0,1,1,0


### Scaling

In [10]:
X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']

In [11]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X = scaler.fit_transform(X)

### Models and Ensemble

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=5)

In [14]:
svm = SVC()
svm.fit(X_train, y_train)
svm.score(X_test, y_test)

0.8571428571428571

In [15]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt.score(X_test, y_test)

0.7527472527472527

In [16]:
lg = LogisticRegression()
lg.fit(X_train, y_train)
lg.score(X_test, y_test)

0.8296703296703297

In [17]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.8736263736263736

In [18]:
bc = BaggingClassifier()
bc.fit(X_train, y_train)
bc.score(X_test, y_test)

0.8406593406593407

In [19]:
bc.get_params()

{'base_estimator': None,
 'bootstrap': True,
 'bootstrap_features': False,
 'max_features': 1.0,
 'max_samples': 1.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [20]:
abc = AdaBoostClassifier(n_estimators=10)
abc.fit(X_train, y_train)
abc.score(X_test, y_test)

0.8406593406593407

In [21]:
abc.get_params()

{'algorithm': 'SAMME.R',
 'base_estimator': None,
 'learning_rate': 1.0,
 'n_estimators': 10,
 'random_state': None}

In [22]:
vc = VotingClassifier(estimators=[['svm', svm], ['dt', dt], ['lg', lg]])
vc.fit(X_train, y_train)
vc.score(X_test, y_test)

0.8461538461538461