In [None]:
import numpy as np
import pandas as pd
import matplotlib as plt
import time as t

from sklearn import svm
from sklearn import metrics

In [None]:
# Read train.csv and test.csv (relative to the working directory) into DataFrames.
train, test = (pd.read_csv(name) for name in ('train.csv', 'test.csv'))

In [None]:
# Separate the 'HeartDisease' target column from the remaining feature columns.
# drop() returns a new frame, so `train` and `test` keep the target column.
train_set, train_label = train.drop(columns=['HeartDisease']), train['HeartDisease']
test_set, test_label = test.drop(columns=['HeartDisease']), test['HeartDisease']

In [None]:
# Display (rows, columns) of the full training frame, target column included.
train.shape

# CONTENTS
1. [Stratified Sample](#Stratified-Sample)
2. [Parameter Tuning](#Parameter-Tuning)
3. [SMOTEd Data](#SMOTEd-Data)
4. [50-50 Split Data](#50-50-Split-Data)

# Stratified Sample

### Linear Kernel

In [None]:
# Start a wall-clock timer; a later cell prints the seconds elapsed since this point.
ts = t.time()

In [None]:
# Linear-kernel SVM on the stratified split.
# kernel must be given explicitly: svm.SVC defaults to kernel='rbf', so the
# previous svm.SVC(C=1) was fitting an RBF model under the "Linear Kernel" heading.
# NOTE(review): a linear-kernel SVC can be slow on large data; confirm runtime is acceptable.
clf = svm.SVC(kernel='linear', C=1)
clf.fit(train_set, train_label)
pred_label = clf.predict(test_set)

# Test accuracy as a percentage, rounded to three decimals.
acc = round(metrics.accuracy_score(test_label, pred_label) * 100, 3)
print("Accuracy: ", acc, '%')

In [None]:
from sklearn.metrics import confusion_matrix

# Rows of the confusion matrix are true classes, columns are predicted classes,
# so diagonal / row total is the fraction of each true class predicted correctly.
# Assumes row 0 is the "No" class and row 1 the "Yes" class -- TODO confirm label order.
confuse = confusion_matrix(test_label, pred_label)
print("Class No Accuracy: %f" % (confuse[0, 0] / confuse[0].sum()))
print("Class Yes Accuracy: %f" % (confuse[1, 1] / confuse[1].sum()))

In [None]:
from sklearn.metrics import classification_report

# Get the per-class metrics as a dict, tabulate them with one row per
# class/average entry, and print the table as LaTeX source.
report = classification_report(test_label, pred_label, output_dict=True)
pdreport = pd.DataFrame(report).T
print(pdreport.style.to_latex())

In [None]:
# Seconds elapsed since `ts` was set (includes every cell run in between).
te = t.time() - ts
print(te)

### Polynomial Kernel -- takes too long

In [None]:
# Restart the wall-clock timer for the polynomial-kernel run.
ts = t.time()

In [None]:
# Degree-3 polynomial-kernel SVM on the stratified split.
poly_clf = svm.SVC(kernel='poly', gamma=0.1, C=1, degree=3)
poly_clf.fit(train_set, train_label)
poly_pred = poly_clf.predict(test_set)

# Test accuracy as a percentage, rounded to three decimals.
poly_acc = round(metrics.accuracy_score(test_label, poly_pred) * 100, 3)
# poly_acc is already a percentage; the old print multiplied it by 100 again
# and passed '%f' as a literal string instead of formatting with it.
print('Accuracy :', poly_acc, '%')

In [None]:
# Seconds elapsed since `ts` was set for the polynomial-kernel run.
te = t.time() - ts
print(te)

### RBF Kernel

In [None]:
# RBF-kernel SVM on the stratified split; fit() returns the fitted estimator.
rbf_clf = svm.SVC(C=1, kernel='rbf', gamma=0.1).fit(train_set, train_label)
rbf_pred = rbf_clf.predict(test_set)

# Test accuracy as a percentage, rounded to three decimals.
rbf_acc = round(100 * metrics.accuracy_score(test_label, rbf_pred), 3)
print('Accuracy :', rbf_acc, '%')

### Sigmoid Kernel

In [None]:
# Sigmoid-kernel SVM on the stratified split; fit() returns the fitted estimator.
sig_clf = svm.SVC(C=10, kernel='sigmoid', gamma=0.01).fit(train_set, train_label)
sig_pred = sig_clf.predict(test_set)

# Test accuracy as a percentage, rounded to three decimals.
sig_acc = round(100 * metrics.accuracy_score(test_label, sig_pred), 3)
print('Accuracy :', sig_acc, '%')

Linear, polynomial and RBF kernel SVMs all have similar accuracies; class boundaries might be linear.
<br>
We now consider only the linear SVM model, because of its lower computational cost and time to compute.

## Parameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Earlier, wider search space (C x gamma x kernel), kept for reference:
#parameter_grid = {'C': [0.01, 0.1, 1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf']} #, 'poly', 'sigmoid']}
# Only C is searched: five values from 0.01 to 100.
c_list = [0.01, 0.1, 1, 10, 100]
c_grid = dict(C=c_list)

In [None]:
# Cross-validated grid search over C on the 50-50 split training data.
# The train50_* frames are otherwise only created in the later "50-50 Split Data"
# section, so on a fresh kernel (Restart & Run All) this cell raised NameError.
# Load them here; the later section rebuilds the same objects from the same file.
train50 = pd.read_csv('train50.csv')
train50_label = train50['HeartDisease']
train50_set = train50.drop(['HeartDisease'], axis = 1)

# NOTE(review): svm.SVC() defaults to kernel='rbf', while the prose above says only
# the linear model is considered from here on -- confirm which kernel should be tuned.
grid = GridSearchCV(svm.SVC(), c_grid, refit=True, return_train_score = True, verbose=2)
grid.fit(train50_set, train50_label)

#print(grid.cv_results_)

# SMOTEd Data

In [None]:
# Read the SMOTE-oversampled train and test tables into DataFrames.
os_train, os_test = (pd.read_csv(name) for name in ('train_smoted.csv', 'test_smoted.csv'))

In [None]:
# Separate the 'HeartDisease' target (y) from the feature columns (X) for both
# SMOTEd frames; drop() returns new frames and leaves os_train/os_test intact.
os_trainX, os_trainy = os_train.drop(columns=['HeartDisease']), os_train['HeartDisease']
os_testX, os_testy = os_test.drop(columns=['HeartDisease']), os_test['HeartDisease']

### Linear Kernel

In [None]:
# Restart the wall-clock timer for the SMOTEd linear-kernel run.
ts = t.time()

In [None]:
# Linear-kernel SVM on the SMOTEd data.
# kernel must be given explicitly: svm.SVC defaults to kernel='rbf', so the
# previous svm.SVC(C=100) was fitting an RBF model under the "Linear Kernel" heading.
# `svm` and `metrics` are already imported in the first cell, so the in-cell
# re-imports were dropped.
# NOTE(review): a linear-kernel SVC with large C can be slow on large data; confirm runtime.
clf = svm.SVC(kernel='linear', C=100)
clf.fit(os_trainX, os_trainy)
pred_label = clf.predict(os_testX)

# Test accuracy as a percentage, rounded to three decimals.
lin_acc = round(metrics.accuracy_score(os_testy, pred_label) * 100, 3)
print('Accuracy :', lin_acc, '%')

In [None]:
# Seconds elapsed since `ts` was set for the SMOTEd linear-kernel run.
print(t.time() - ts)

In [None]:
from sklearn.metrics import classification_report

# Print the formatted text report, matching the other sections. With
# output_dict=True this printed a raw nested dict, which is only useful as
# input to the LaTeX-table cell that follows.
print(classification_report(os_testy, pred_label))

In [None]:
# LaTeX table of the classification report for the SMOTEd linear-kernel run.

# Dict form of the report -> DataFrame with one row per class/average entry.
rep = classification_report(os_testy, pred_label, output_dict=True)
pdrep = pd.DataFrame(rep).T
print(pdrep.style.to_latex())

In [None]:
from sklearn.metrics import confusion_matrix

# Rows of the confusion matrix are true classes, columns are predicted classes,
# so diagonal / row total is the fraction of each true class predicted correctly.
# Assumes row 0 is the "No" class and row 1 the "Yes" class -- TODO confirm label order.
confuse = confusion_matrix(os_testy, pred_label)
print("Class No Accuracy: %f" % (confuse[0, 0] / confuse[0].sum()))
print("Class Yes Accuracy: %f" % (confuse[1, 1] / confuse[1].sum()))

### Sigmoid Kernel

In [None]:
# Restart the wall-clock timer for the SMOTEd sigmoid-kernel run.
ts = t.time()

In [None]:
# Sigmoid-kernel SVM on the SMOTEd data; fit() returns the fitted estimator.
sig_clf = svm.SVC(C=1, kernel='sigmoid', gamma=0.01).fit(os_trainX, os_trainy)
sig_pred = sig_clf.predict(os_testX)

# Test accuracy as a percentage, rounded to three decimals.
sig_acc = round(100 * metrics.accuracy_score(os_testy, sig_pred), 3)
print('Accuracy :', sig_acc, '%')

In [None]:
# Seconds elapsed since `ts` was set for the SMOTEd sigmoid-kernel run.
print(t.time() - ts)

In [None]:
from sklearn.metrics import classification_report

# Text classification report for the sigmoid-kernel predictions on the SMOTEd test set.
print(classification_report(y_true=os_testy, y_pred=sig_pred))

In [None]:
from sklearn.metrics import confusion_matrix

# Rows of the confusion matrix are true classes, columns are predicted classes,
# so diagonal / row total is the fraction of each true class predicted correctly.
# Assumes row 0 is the "No" class and row 1 the "Yes" class -- TODO confirm label order.
confuse_sig = confusion_matrix(os_testy, sig_pred)
print("Class No Accuracy: %f" % (confuse_sig[0, 0] / confuse_sig[0].sum()))
print("Class Yes Accuracy: %f" % (confuse_sig[1, 1] / confuse_sig[1].sum()))

### RBF Kernel

In [None]:
# Restart the wall-clock timer for the SMOTEd RBF-kernel run.
ts = t.time()

In [None]:
# RBF-kernel SVM on the SMOTEd data; fit() returns the fitted estimator.
rbf_clf = svm.SVC(C=1, kernel='rbf', gamma=0.1).fit(os_trainX, os_trainy)
rbf_pred = rbf_clf.predict(os_testX)

# Test accuracy as a percentage, rounded to three decimals.
rbf_acc = round(100 * metrics.accuracy_score(os_testy, rbf_pred), 3)
print('Accuracy :', rbf_acc, '%')

In [None]:
# Seconds elapsed since `ts` was set for the SMOTEd RBF-kernel run.
print(t.time() - ts)

In [None]:
from sklearn.metrics import classification_report

# Text classification report for the RBF-kernel predictions on the SMOTEd test set.
print(classification_report(y_true=os_testy, y_pred=rbf_pred))

In [None]:
from sklearn.metrics import confusion_matrix

# Rows of the confusion matrix are true classes, columns are predicted classes,
# so diagonal / row total is the fraction of each true class predicted correctly.
# Assumes row 0 is the "No" class and row 1 the "Yes" class -- TODO confirm label order.
confusemat = confusion_matrix(os_testy, rbf_pred)
print("Class No Accuracy: %f" % (confusemat[0, 0] / confusemat[0].sum()))
print("Class Yes Accuracy: %f" % (confusemat[1, 1] / confusemat[1].sum()))

### Polynomial Kernel -- takes too long

In [None]:
# Restart the wall-clock timer for the SMOTEd polynomial-kernel run.
ts = t.time()

In [None]:
# Degree-3 polynomial-kernel SVM on the SMOTEd data.
poly_clf = svm.SVC(kernel='poly', gamma=0.1, C=1, degree=3)
poly_clf.fit(os_trainX, os_trainy)
poly_pred = poly_clf.predict(os_testX)

# Test accuracy as a percentage, rounded to three decimals.
poly_acc = round(metrics.accuracy_score(os_testy, poly_pred) * 100, 3)
# poly_acc is already a percentage; the old print multiplied it by 100 again
# and passed '%f' as a literal string instead of formatting with it.
print('Accuracy :', poly_acc, '%')

In [None]:
# Seconds elapsed since `ts` was set for the SMOTEd polynomial-kernel run.
print(t.time() - ts)

# 50-50 Split Data

In [None]:
# Read the 50-50 split train and test tables into DataFrames.
train50, test50 = (pd.read_csv(name) for name in ('train50.csv', 'test50.csv'))

In [None]:
# Separate the 'HeartDisease' target column from the feature columns of the
# 50-50 frames; drop() returns new frames and leaves train50/test50 intact.
train50_set, train50_label = train50.drop(columns=['HeartDisease']), train50['HeartDisease']
test50_set, test50_label = test50.drop(columns=['HeartDisease']), test50['HeartDisease']

### Linear Kernel

In [None]:
# Linear-kernel SVM on the 50-50 split.
# kernel must be given explicitly: svm.SVC defaults to kernel='rbf', so the
# previous svm.SVC(C=100) was fitting an RBF model under the "Linear Kernel" heading.
# NOTE(review): a linear-kernel SVC with large C can be slow on large data; confirm runtime.
clf = svm.SVC(kernel='linear', C=100)
clf.fit(train50_set, train50_label)
pred_label = clf.predict(test50_set)

# Test accuracy as a percentage, rounded to three decimals.
acc = round(metrics.accuracy_score(test50_label, pred_label) * 100, 3)
print("Accuracy: ", acc, '%')

In [None]:
from sklearn.metrics import confusion_matrix

# Rows of the confusion matrix are true classes, columns are predicted classes,
# so diagonal / row total is the fraction of each true class predicted correctly.
# Assumes row 0 is the "No" class and row 1 the "Yes" class -- TODO confirm label order.
confuse = confusion_matrix(test50_label, pred_label)
print("Class No Accuracy: %f" % (confuse[0, 0] / confuse[0].sum()))
print("Class Yes Accuracy: %f" % (confuse[1, 1] / confuse[1].sum()))

In [None]:
from sklearn.metrics import classification_report

# Text classification report for the predictions on the 50-50 test set.
print(classification_report(y_true=test50_label, y_pred=pred_label))

In [None]:
# LaTeX table of the classification report for the 50-50 split run.

# Dict form of the report -> DataFrame with one row per class/average entry.
rep = classification_report(test50_label, pred_label, output_dict=True)
pdrep = pd.DataFrame(rep).T
print(pdrep.style.to_latex())

In [None]:
# plot ROC

# `clf` is built without probability=True, so SVC.predict_proba is unavailable
# and the previous call raised AttributeError. decision_function returns signed
# margin scores, which roc_curve accepts as y_score. The `pred_prob` name is
# kept for compatibility even though it now holds scores, not probabilities.
pred_prob = clf.decision_function(test50_set)
fpr, tpr, threshold = metrics.roc_curve(test50_label, pred_prob)
roc_auc = metrics.auc(fpr, tpr)

In [None]:
import matplotlib.pyplot as pl

# ROC curve for the 50-50 split model, with the AUC shown in the legend.
pl.plot(fpr, tpr, 'g', label = 'AUC = %0.2f' % roc_auc)
pl.legend(loc = 'lower right')
pl.xlim([0, 1])
pl.ylim([0, 1])
pl.ylabel('True Positive Rate')
pl.xlabel('False Positive Rate')
# Save before show(): once show() has displayed the figure, a later savefig()
# writes a new, empty figure instead of the plot.
pl.savefig('SVM50_ROC.png')
pl.show()

[Back to Contents](#CONTENTS)