# Heart Disease Data Set -- Cleveland

In [3]:
# modules we will use
import pandas as pd
import numpy as np

# read in the Heart Disease Data Set--Cleveland from UCI online data base
dataset_url='http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'
heart_data_cle=pd.read_csv(dataset_url)

# set seed for reproducibility
np.random.seed(0)

In [11]:
# look at a few rows of the nfl_data file. I can see a handful of missing data already!
heart_data_cle.columns=['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal','num']
heart_data_cle.sample(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
183,60.0,0.0,4.0,158.0,305.0,0.0,2.0,161.0,0.0,0.0,1.0,0.0,3.0,1
190,51.0,1.0,4.0,140.0,298.0,0.0,0.0,122.0,1.0,4.2,2.0,3.0,7.0,3
277,57.0,1.0,2.0,154.0,232.0,0.0,2.0,164.0,0.0,0.0,1.0,1.0,3.0,1
73,44.0,1.0,4.0,110.0,197.0,0.0,2.0,177.0,0.0,0.0,1.0,1.0,3.0,1
278,58.0,0.0,4.0,130.0,197.0,0.0,0.0,131.0,0.0,0.6,2.0,0.0,3.0,0


In [13]:
heart_data_cle.shape

(302, 14)

In [28]:
# get the number of missing data points per column
heart_data_cle = heart_data_cle.replace({'?': np.nan})
missing_values_count = heart_data_cle.isnull().sum()
missing_values_count[:]

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
num         0
dtype: int64

In [29]:
# get the percentage of missing data points per column
missing_values_percentage = np.divide(heart_data_cle.isnull().sum(), heart_data_cle.count()) * 100
missing_values_percentage[:]

age         0.000000
sex         0.000000
cp          0.000000
trestbps    0.000000
chol        0.000000
fbs         0.000000
restecg     0.000000
thalach     0.000000
exang       0.000000
oldpeak     0.000000
slope       0.000000
ca          1.342282
thal        0.666667
num         0.000000
dtype: float64

In [32]:
# It seems in this dataset, the missing values are a negligible part. Therefore, let's just drop out them. 
heart_data_cle = heart_data_cle.dropna()
heart_data_cle.shape

(296, 14)

In [33]:
# Verify that all the missing data has been removed
missing_values_count_after_removed = heart_data_cle.isnull().sum()
missing_values_count_after_removed[:]

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64

In [35]:
# Now let's split the data set into training set and test set
from sklearn.model_selection import train_test_split
X, y = heart_data_cle.iloc[:, 0: -2], heart_data_cle.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,    # 20% of the data is used as testing data
                                                    random_state=0, 
                                                    stratify=y)


In [42]:
print("X_train.shape=" + str(X_train.shape))
print("X_test.shape=" + str(X_test.shape))
print("y_train.shape=" + str(y_train.shape))
print("y_test.shape=" + str(y_test.shape))

X_train.shape=(236, 12)
X_test.shape=(60, 12)
y_train.shape=(236,)
y_test.shape=(60,)


In [44]:
# Normalization of the dataframe
#from sklearn.preprocessing import MinMaxScaler
#mms = MinMaxScaler()
#X_train_norm = mms.fit_transform(X_train)
#X_test_norm = mms.transform(X_test)

# Standardization of the dataframe
from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)

In [84]:
# The data has been preprocessed, now let's first use linear Support Vector Machine to learn and predict this dataset
from sklearn.svm import SVC
svm = SVC(kernel='linear', C=10.0, random_state=0)
svm.fit(X_train_std, y_train)
#svm.predict(X_test_std)
svm.score(X_test_std, y_test)

0.55

In [85]:
# Try the Kernel SVM 
svm_kernel=SVC(kernel='rbf', random_state=0, gamma=0.2, C=10.0)
svm_kernel.fit(X_train_std, y_train)
#svm_kernel.predict(X_test_std)
svm_kernel.score(X_test_std, y_test)


0.5833333333333334

In [86]:
# Try Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=100.0, random_state=1)
lr.fit(X_train_std, y_train)
lr.score(X_test_std, y_test)

0.5666666666666667

In [91]:
# Try KNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
knn.fit(X_train_std, y_train)
lr.score(X_test_std, y_test)

0.5666666666666667