In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.core.interactiveshell import InteractiveShell
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

In [2]:
df = pd.read_csv('heart.csv')

In [3]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
df['age'].value_counts()

58    19
57    17
54    16
59    14
52    13
51    12
62    11
44    11
60    11
56    11
64    10
41    10
63     9
67     9
55     8
45     8
42     8
53     8
61     8
65     8
43     8
66     7
50     7
48     7
46     7
49     5
47     5
39     4
35     4
68     4
70     4
40     3
71     3
69     3
38     3
34     2
37     2
77     1
76     1
74     1
29     1
Name: age, dtype: int64

***Exploratory data analysis***

In [5]:
df = df[df['thal'] != 0]
df = df[df['age'] != 29]
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [6]:
df['thal'] = df['thal'].replace(1, 'fixed defect')
df['thal'] = df['thal'].replace(2, 'normal')
df['thal'] = df['thal'].replace(3, 'reversable defect')
df['cp'] = df['cp'].replace(0, 'asymptomatic')
df['cp'] = df['cp'].replace(1, 'atypical angina')
df['cp'] = df['cp'].replace(2, 'non-anginal pain')
df['cp'] = df['cp'].replace(3, 'typical angina')
df['restecg'] = df['restecg'].replace(0, 'ventricular hypertrophy')
df['restecg'] = df['restecg'].replace(1, 'normal')
df['restecg'] = df['restecg'].replace(2, 'ST-T wave abnormality')
df['slope'] = df['slope'].replace(0, 'downsloping')
df['slope'] = df['slope'].replace(1, 'flat')
df['slope'] = df['slope'].replace(2, 'upsloping')

In [7]:
temp = pd.get_dummies(df[['cp', 'restecg', 'slope', 'thal']])
df = df.join(temp, how='left')
df = df.drop(['cp','restecg', 'slope', 'thal'], axis=1)
df.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,ca,target,...,cp_typical angina,restecg_ST-T wave abnormality,restecg_normal,restecg_ventricular hypertrophy,slope_downsloping,slope_flat,slope_upsloping,thal_fixed defect,thal_normal,thal_reversable defect
0,63,1,145,233,1,150,0,2.3,0,1,...,1,0,0,1,1,0,0,1,0,0
1,37,1,130,250,0,187,0,3.5,0,1,...,0,0,1,0,1,0,0,0,1,0
2,41,0,130,204,0,172,0,1.4,0,1,...,0,0,0,1,0,0,1,0,1,0
3,56,1,120,236,0,178,0,0.8,0,1,...,0,0,1,0,0,0,1,0,1,0
4,57,0,120,354,0,163,1,0.6,0,1,...,0,0,1,0,0,0,1,0,1,0


In [8]:
df = df.drop(['restecg_ventricular hypertrophy', 'slope_upsloping', 'thal_fixed defect', 'cp_typical angina'], axis=1)
df.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,ca,target,cp_asymptomatic,cp_atypical angina,cp_non-anginal pain,restecg_ST-T wave abnormality,restecg_normal,slope_downsloping,slope_flat,thal_normal,thal_reversable defect
0,63,1,145,233,1,150,0,2.3,0,1,0,0,0,0,0,1,0,0,0
1,37,1,130,250,0,187,0,3.5,0,1,0,0,1,0,1,1,0,1,0
2,41,0,130,204,0,172,0,1.4,0,1,0,1,0,0,0,0,0,1,0
3,56,1,120,236,0,178,0,0.8,0,1,0,1,0,0,1,0,0,1,0
4,57,0,120,354,0,163,1,0.6,0,1,1,0,0,0,1,0,0,1,0


In [9]:
sns.countplot(df.target)
df.target.value_counts()

1    163
0    137
Name: target, dtype: int64

In [10]:
X = df.drop(['target'], axis=1)
Y = df['target']
train_features, test_features, train_labels, test_labels = train_test_split(X, Y, test_size=0.2, random_state=0)

In [11]:
rf = RandomForestClassifier()
dt = DecisionTreeClassifier()
lr = LogisticRegression()
knn = KNeighborsClassifier()

In [12]:
rf.fit(train_features, train_labels)
dt.fit(train_features, train_labels)
lr.fit(train_features, train_labels)
knn.fit(train_features, train_labels)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [13]:
rf_pred_train = rf.predict(train_features)
dt_pred_train = dt.predict(train_features)
lr_pred_train = lr.predict(train_features)
knn_pred_train = knn.predict(train_features)
rf_pred_test = rf.predict(test_features)
dt_pred_test = dt.predict(test_features)
lr_pred_test = lr.predict(test_features)
knn_pred_test = knn.predict(test_features)

In [14]:
rf_prob = rf.predict_proba(test_features)[:,1]
dt_prob = dt.predict_proba(test_features)[:,1]
lr_prob = lr.predict_proba(test_features)[:,1]
knn_prob = knn.predict_proba(test_features)[:,1]

In [15]:
rf_prob

array([ 0.7,  0.2,  0.9,  0.1,  0.8,  0.2,  0.1,  0.5,  0.7,  0.7,  0.5,
        1. ,  0. ,  0. ,  1. ,  0.9,  0.4,  0.8,  0. ,  0.1,  0.1,  0.9,
        0.7,  1. ,  0. ,  0.9,  0.4,  0. ,  0. ,  0.3,  0.8,  1. ,  0.9,
        0.8,  0.3,  0.4,  0.3,  0. ,  0.8,  0.5,  0.5,  0.3,  0.3,  0.8,
        0.9,  0.9,  0.5,  1. ,  0.6,  0.8,  0.9,  0.1,  0.3,  1. ,  0.9,
        0.1,  0.2,  1. ,  0.7,  0.8])

In [16]:
print(classification_report(test_labels,rf_pred_test))
print('Random Forest baseline: ' + str(roc_auc_score(train_labels, rf_pred_train)))
print('Random Forest: ' + str(roc_auc_score(test_labels, rf_pred_test)))
print(classification_report(test_labels,dt_pred_test))
print('Decision Tree baseline: ' + str(roc_auc_score(train_labels, dt_pred_train)))
print('Decision Tree: ' + str(roc_auc_score(test_labels, dt_pred_test)))
print(classification_report(test_labels,lr_pred_test))
print('Logistic Regression baseline: ' + str(roc_auc_score(train_labels, lr_pred_train)))
print('Logistic Regression: ' + str(roc_auc_score(test_labels, lr_pred_test)))
print(classification_report(test_labels,knn_pred_test))
print('KNN baseline: ' + str(roc_auc_score(train_labels, knn_pred_train)))
print('KNN: ' + str(roc_auc_score(test_labels, knn_pred_test)))

             precision    recall  f1-score   support

          0       0.83      0.86      0.85        29
          1       0.87      0.84      0.85        31

avg / total       0.85      0.85      0.85        60

Random Forest baseline: 0.981060606061
Random Forest: 0.850389321468
             precision    recall  f1-score   support

          0       0.78      0.72      0.75        29
          1       0.76      0.81      0.78        31

avg / total       0.77      0.77      0.77        60

Decision Tree baseline: 1.0
Decision Tree: 0.765294771969
             precision    recall  f1-score   support

          0       0.82      0.79      0.81        29
          1       0.81      0.84      0.83        31

avg / total       0.82      0.82      0.82        60

Logistic Regression baseline: 0.861952861953
Logistic Regression: 0.815906562848
             precision    recall  f1-score   support

          0       0.63      0.59      0.61        29
          1       0.64      0.68      0.