In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt   # use matplotlib for plotting with inline plots

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(0)
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

import warnings
# NOTE: called without a category or module filter, this suppresses *every*
# warning raised after this point, not only matplotlib deprecation notices.
warnings.filterwarnings('ignore') # for deprecated matplotlib functions

In [2]:
# Path of the heart-disease CSV, relative to the notebook's working directory.
heart_csv_path = 'data/heart.csv'

# Parse the CSV into the DataFrame used by the cells below.
data = pd.read_csv(heart_csv_path)

# Render the table exactly as it was read from heart.csv.
display(data)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2,1
7,44,1,1,120,263,0,1,173,0,0.0,2,0,3,1
8,52,1,2,172,199,1,1,162,0,0.5,2,0,3,1
9,57,1,2,150,168,0,1,174,0,1.6,2,0,2,1


In [3]:
# # Make Data Readable
# data.columns = ['age', 'sex', 'chest_pain_type', 'resting_blood_pressure', 'cholesterol', 'fasting_blood_sugar', 'rest_ecg', 'max_heart_rate_achieved',
#                 'exercise_induced_angina', 'st_depression', 'st_slope', 'num_major_vessels', 'thalassemia', 'target']


# data['sex'][data['sex'] == 0] = 'female'
# data['sex'][data['sex'] == 1] = 'male'

# data['chest_pain_type'][data['chest_pain_type'] == 1] = 'typical angina'
# data['chest_pain_type'][data['chest_pain_type'] == 2] = 'atypical angina'
# data['chest_pain_type'][data['chest_pain_type'] == 3] = 'non-anginal pain'
# data['chest_pain_type'][data['chest_pain_type'] == 4] = 'asymptomatic'

# data['fasting_blood_sugar'][data['fasting_blood_sugar'] == 0] = 'lower than 120mg/ml'
# data['fasting_blood_sugar'][data['fasting_blood_sugar'] == 1] = 'greater than 120mg/ml'

# data['rest_ecg'][data['rest_ecg'] == 0] = 'normal'
# data['rest_ecg'][data['rest_ecg'] == 1] = 'ST-T wave abnormality'
# data['rest_ecg'][data['rest_ecg'] == 2] = 'left ventricular hypertrophy'

# data['exercise_induced_angina'][data['exercise_induced_angina'] == 0] = 'no'
# data['exercise_induced_angina'][data['exercise_induced_angina'] == 1] = 'yes'

# data['st_slope'][data['st_slope'] == 1] = 'upsloping'
# data['st_slope'][data['st_slope'] == 2] = 'flat'
# data['st_slope'][data['st_slope'] == 3] = 'downsloping'

# data['thalassemia'][data['thalassemia'] == 1] = 'normal'
# data['thalassemia'][data['thalassemia'] == 2] = 'fixed defect'
# data['thalassemia'][data['thalassemia'] == 3] = 'reversable defect'

# # display cleaned columns
# display(data) 

In [4]:
# Labels are the 'target' column as a NumPy array; features are every other column.
y = data['target'].values
X = data.drop(columns=['target'])

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each of the four partitions.
for label, part in (
    ("X_train:", X_train),
    ("X_test:", X_test),
    ("y_train:", y_train),
    ("y_test:", y_test),
):
    print(label, part.shape)

X_train: (242, 13)
X_test: (61, 13)
y_train: (242,)
y_test: (61,)


# K-Nearest Neighbor

In [5]:
# Distance-weighted k-NN over 10 neighbours using the Minkowski metric with
# p=1 (i.e. Manhattan distance).
knn = KNeighborsClassifier(
    n_neighbors=10,
    weights='distance',
    algorithm='auto',
    leaf_size=30,
    p=1,
    metric='minkowski',
    metric_params=None,
    n_jobs=-1,
)

# Fit on the training split.
knn.fit(X_train, y_train)

# Probability scores for the held-out rows; column 1 is the score for class 1.
knn_test_proba = knn.predict_proba(X_test)
y_validation_hat = knn_test_proba[:, 1]

# ROC AUC of those scores against the held-out labels.
knn_classifier_roc = roc_auc_score(
    y_true=y_test,
    y_score=y_validation_hat,
    average='macro',
    sample_weight=None,
)
print("roc auc:", knn_classifier_roc)

# score() is mean accuracy, so 1 - score is the misclassification rate.
# NOTE: with weights='distance' every training row is its own zero-distance
# neighbour, so the training error is expected to be (near) zero and says
# little about generalisation.
# NOTE(review): features are fed to the distance metric unscaled -- consider
# standardising them; confirm whether that was intentional.
knn_train_accuracy = knn.score(X_train, y_train)
knn_test_accuracy = knn.score(X_test, y_test)
print("training error:", 1 - knn_train_accuracy)
print("validation error:", 1 - knn_test_accuracy)

# Disabled submission export (Xte is not defined in the cells shown here):
# build a two-column array of running row index and P(Y=1), then write it out
# as 'ID,Prob1' rows.
# Yte = np.vstack((np.arange(Xte.shape[0]), knn.predict_proba(Xte)[:,1])).T 
#np.savetxt('Y_submit.txt',Yte,'%d, %.2f',header='ID,Prob1',comments='',delimiter=',')

roc auc: 0.8372844827586207
training error: 0.0
validation error: 0.29508196721311475


# Random Forest

In [6]:
# Random forest of 400 gini trees, sqrt(n_features) candidate features per split.
#
# `min_impurity_split` is deliberately NOT passed: it was deprecated in
# scikit-learn 0.19 and removed in 1.0, where supplying it raises
# "TypeError: __init__() got an unexpected keyword argument". The original
# value was None (the old default), so omitting it leaves the model unchanged
# on older releases.
#
# random_state=None makes the forest draw from NumPy's global RNG, so
# repeatability depends on np.random.seed having been called beforehand.
rforest = RandomForestClassifier(
    n_estimators=400,
    criterion='gini',
    max_depth=90,
    min_samples_split=8,
    min_samples_leaf=5,
    min_weight_fraction_leaf=0.0,
    max_features='sqrt',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=None,
    verbose=0,
    warm_start=False,
    class_weight=None,
)

# Fit on the training split.
rforest.fit(X_train, y_train)

# Probability scores for the held-out rows; column 1 is the score for class 1.
y_validation_hat = rforest.predict_proba(X_test)[:, 1]

# ROC AUC of those scores against the held-out labels.
rforest_classifier_roc = roc_auc_score(y_test, y_validation_hat, average='macro', sample_weight=None)
print("roc auc:", rforest_classifier_roc)

# score() is mean accuracy, so 1 - score is the misclassification rate.
print("training error:", 1 - rforest.score(X_train, y_train))
print("validation error:", 1 - rforest.score(X_test, y_test))

# Disabled submission export (Xte is not defined in the cells shown here):
# build a two-column array of running row index and P(Y=1), then write it out
# as 'ID,Prob1' rows.
# Yte = np.vstack((np.arange(Xte.shape[0]), rforest.predict_proba(Xte)[:,1])).T 
#np.savetxt('Y_submit.txt',Yte,'%d, %.2f',header='ID,Prob1',comments='',delimiter=',')

roc auc: 0.9364224137931034
training error: 0.0826446280991735
validation error: 0.1311475409836066


# Decision Tree

In [7]:
# Single gini decision tree with balanced class weights. A node is only split
# when it holds at least 2**6 = 64 samples, and 6 features are considered per split.
#
# Two arguments from the original call are deliberately NOT passed because
# current scikit-learn rejects them with a TypeError:
#   * `min_impurity_split` -- deprecated in 0.19, removed in 1.0
#   * `presort`            -- deprecated in 0.22, removed in 0.24
# Both were set to their old defaults (None / False), so omitting them leaves
# the model unchanged on older releases.
#
# random_state=None makes the feature sub-sampling draw from NumPy's global
# RNG, so repeatability depends on np.random.seed having been called beforehand.
dtree = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=25,
    min_samples_split=2**6,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=6,
    random_state=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    class_weight='balanced',
)

# Fit on the training split.
dtree.fit(X_train, y_train)

# Probability scores for the held-out rows; column 1 is the score for class 1.
y_validation_hat = dtree.predict_proba(X_test)[:, 1]

# ROC AUC of those scores against the held-out labels.
dtree_classifier_roc = roc_auc_score(y_test, y_validation_hat, average='macro', sample_weight=None)
print("roc auc:", dtree_classifier_roc)

# score() is mean accuracy, so 1 - score is the misclassification rate.
print("training error:", 1 - dtree.score(X_train, y_train))
print("validation error:", 1 - dtree.score(X_test, y_test))

# Disabled submission export (Xte is not defined in the cells shown here):
# build a two-column array of running row index and P(Y=1), then write it out
# as 'ID,Prob1' rows.
# Yte = np.vstack((np.arange(Xte.shape[0]), dtree.predict_proba(Xte)[:,1])).T 
#np.savetxt('Y_submit.txt',Yte,'%d, %.2f',header='ID,Prob1',comments='',delimiter=',')

roc auc: 0.8701508620689655
training error: 0.26859504132231404
validation error: 0.34426229508196726


# Neural Network

In [9]:
# Multi-layer perceptron: one hidden layer of 100 ReLU units trained with Adam
# for at most 200 iterations.
# random_state=None means weight initialisation and shuffling draw from NumPy's
# global RNG, so repeatability depends on np.random.seed having been called
# beforehand.
nn = MLPClassifier(
    hidden_layer_sizes=(100, ),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    batch_size='auto',
    learning_rate='constant',
    learning_rate_init=0.001,
    power_t=0.5,
    max_iter=200,
    shuffle=True,
    random_state=None,
    tol=0.0001,
    verbose=False,
    warm_start=False,
    momentum=0.9,
    nesterovs_momentum=True,
    early_stopping=False,
    validation_fraction=0.1,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08,
    n_iter_no_change=10,
)

# Fit on the training split.
# NOTE(review): inputs are passed in unscaled -- confirm whether the optimiser
# actually converges within max_iter, since warnings may be silenced elsewhere.
nn.fit(X_train, y_train)

# Probability scores for the held-out rows; column 1 is the score for class 1.
nn_test_proba = nn.predict_proba(X_test)
y_validation_hat = nn_test_proba[:, 1]

# ROC AUC of those scores against the held-out labels.
nn_classifier_roc = roc_auc_score(
    y_true=y_test,
    y_score=y_validation_hat,
    average='macro',
    sample_weight=None,
)
print("roc auc:", nn_classifier_roc)

# score() is mean accuracy, so 1 - score is the misclassification rate.
nn_train_accuracy = nn.score(X_train, y_train)
nn_test_accuracy = nn.score(X_test, y_test)
print("training error:", 1 - nn_train_accuracy)
print("validation error:", 1 - nn_test_accuracy)

# Disabled submission export (Xte is not defined in the cells shown here):
# build a two-column array of running row index and P(Y=1), then write it out
# as 'ID,Prob1' rows. The snippet previously referenced `dtree` (a copy-paste
# slip); it now uses this cell's model.
# Yte = np.vstack((np.arange(Xte.shape[0]), nn.predict_proba(Xte)[:,1])).T 
#np.savetxt('Y_submit.txt',Yte,'%d, %.2f',header='ID,Prob1',comments='',delimiter=',')

roc auc: 0.9051724137931034
training error: 0.20661157024793386
validation error: 0.11475409836065575
