# Medical Diagnosis with Support Vector Machines

## Task 1: Import Libraries



In [1]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

## Task 1: Get Data

In [2]:
# Column labels for the CSV, in file order; the last one ("class") is the label column.
column_names = [
    "pregnancies", "glucose", "bpressure", "skinfold",
    "insulin", "bmi", "pedigree", "age",
    "class",
]

# Passing `names=` supplies the headers, so every row of the file is read as data.
df = pd.read_csv("data.csv", names=column_names)
print(df.shape)
df.head()

(768, 9)


Unnamed: 0,pregnancies,glucose,bpressure,skinfold,insulin,bmi,pedigree,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Task 1: Extract Features

In [3]:
# Feature matrix: every column except the last one (the nine-column frame ends with "class").
X = df.iloc[:, :-1]
X.head()

Unnamed: 0,pregnancies,glucose,bpressure,skinfold,insulin,bmi,pedigree,age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


## Task 1: Extract Class Labels

In [4]:
# Target vector: the "class" column as a Series.
y = df.loc[:, 'class']
y.head()

0    1
1    0
2    1
3    0
4    1
Name: class, dtype: int64

## Task 2: Split Dataset

In [5]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

# Report the shapes in the order: train features, train labels, test features, test labels.
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)
X_test.head()

(576, 8)
(576,)
(192, 8)
(192,)


Unnamed: 0,pregnancies,glucose,bpressure,skinfold,insulin,bmi,pedigree,age
285,7,136,74,26,135,26.0,0.647,51
101,1,151,60,0,0,26.1,0.179,22
581,6,109,60,27,0,25.0,0.206,27
352,3,61,82,28,0,34.4,0.243,46
726,1,116,78,29,180,36.1,0.496,25


## Task 2: Normalize Features

In [6]:
# Fit the scaler on the training rows only, so test data never influences the statistics.
scaler = StandardScaler().fit(X_train)

# NOTE: this rebinds X_train to the scaled array. Re-running this cell on its own
# would refit the scaler on already-scaled data; re-run the split cell first.
X_train = scaler.transform(X_train)
X_train[:5]

array([[ 0.06459212, -0.77092144, -0.43922309,  0.20311001, -0.69449068,
        -0.47467996, -0.07502996, -0.94438203],
       [ 0.36223259, -0.32186694,  0.15969386,  0.51828071, -0.69449068,
        -1.04451678, -0.18379061, -0.5204699 ],
       [-1.12596979,  1.89133023,  1.05806928,  0.39221243,  0.11717847,
         0.62523762, -0.46475564,  0.15778952],
       [ 0.65987307, -0.57846951, -0.13976462, -1.24667522, -0.69449068,
        -0.9915087 , -0.66112905, -0.35090504],
       [ 1.85043498,  0.03096159,  0.45915233,  0.70738313, -0.69449068,
        -0.55419207,  0.13342797,  1.00561378]])

## Task 3: Training a Support Vector Machine

In [7]:
# Build a sigmoid-kernel SVC and fit it on the scaled training data.
# The trailing fit call returns the estimator, which the cell displays.
clf = svm.SVC(kernel="sigmoid")
clf.fit(X=X_train, y=y_train)

SVC(kernel='sigmoid')

## Task 3: Decision Boundary (Predictions and Accuracy on the Training Set)

In [8]:
# Predict on the same rows the model was fitted on, so the score below is
# training accuracy, not an estimate of performance on unseen data.
y_pred = clf.predict(X_train)
print(y_pred)
print(accuracy_score(y_true=y_train, y_pred=y_pred))

[0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 1 0 1 0 0 0
 1 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 1 0 0 0 1 0 0 0 0
 0 0 0 1 1 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 0 0 1 1 0 1 0 0 1 1 0 0 0 0 0 0 0
 1 0 0 0 0 1 1 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 1 0 0 0
 1 0 0 0 0 0 0 1 1 0 0 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 1 1 0 1 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0
 0 1 0 0 0 1 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 1 1 0 1 0 1 0 1 1 1 0 1 0 0 0 0
 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 1 0 1 0
 1 1 1 0 1 1 0 0 1 1 0 1 0 0 0 1 0 0 0 0 1 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 0
 0 1 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 1 0
 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0
 1 1 0 0 0 1 1 1 0 0 1 0 0 0 1 0 0 1 0 0 0 1 1 1 0 1 0 0 0 1 0 0 1 0 1 0 0
 0 1 1 1 1 0 0 0 1 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0
 0 1 0 0 0 0 0 0 0 0 1 0 

## Task 3: SVM Kernels

In [9]:
# Fit one SVC per kernel and print the kernel name followed by its training accuracy.
# `clf` and `y_pred` are rebound on every pass, so they end up holding the last kernel's model.
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for k in kernels:
    clf = svm.SVC(kernel=k).fit(X_train, y_train)
    y_pred = clf.predict(X_train)
    print(k, accuracy_score(y_train, y_pred), sep="\n")

linear
0.7777777777777778
poly
0.7951388888888888
rbf
0.8246527777777778
sigmoid
0.6666666666666666


## Task 4: Instantiating the Best Model

In [10]:
# Rebind `clf` to an RBF-kernel SVC fitted on the scaled training data.
# A bare `clf` on the last line displays the fitted estimator.
clf = svm.SVC(kernel="rbf").fit(X_train, y_train)
clf

SVC()

## Task 4: Making a single prediction

In [11]:
# Input feature order (the first eight entries of column_names):
# "pregnancies", "glucose", "bpressure", "skinfold",
# "insulin", "bmi", "pedigree", "age"
# ("class" is the label being predicted, so it is not part of the input.)

# Build the patient as a one-row DataFrame carrying the training column names.
# The scaler was fitted on a DataFrame, and passing a bare ndarray makes
# recent scikit-learn versions warn about missing feature names.
patient = pd.DataFrame(
    [[1., 100., 75., 40., 0., 45., 1.5, 20.]],
    columns=column_names[:8],
)
# Apply the same scaling the model was trained with before predicting.
patient = scaler.transform(patient)
clf.predict(patient)

array([0])

## Task 4: Testing Set Prediction

In [12]:
# Select the test row at position 8 with a list indexer so it stays a one-row
# DataFrame. Keeping the column names avoids the feature-name warning the
# scaler raises when given a bare ndarray.
patient = X_test.iloc[[8]]
patient = scaler.transform(patient)

# Predicted label, followed by the true label for the same row.
print(clf.predict(patient))
print(y_test.iloc[8])

[0]
0


## Task 5: Accuracy on Testing Set

In [13]:
# Scale into a new variable instead of rebinding X_test. The cell can then be
# re-run without scaling the data twice, and X_test remains a DataFrame for any
# cell that indexes it with .iloc.
X_test_scaled = scaler.transform(X_test)
y_pred = clf.predict(X_test_scaled)
print(accuracy_score(y_test, y_pred))


0.796875


## Task 5: Comparison to All-Zero Prediction

In [14]:
# Baseline for comparison: predict class 0 for every test row and score that.
y_zero = np.zeros(len(y_test))
print(accuracy_score(y_true=y_test, y_pred=y_zero))

0.640625


## Task 5: Precision and Recall

In [15]:
# Per-class precision, recall, F1 and support for the test-set predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.80      0.91      0.85       123
           1       0.79      0.59      0.68        69

    accuracy                           0.80       192
   macro avg       0.79      0.75      0.76       192
weighted avg       0.80      0.80      0.79       192

