In [1]:
# importing libraries
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Getting Data
dataset = pd.read_csv("diabetes.csv")
dataset.shape

(768, 9)

In [3]:
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Extract Features
X = dataset.iloc[:, :8]
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [5]:
# Extract Class Labels
y = dataset["Outcome"]
y.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [6]:
# Split Dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=0)

In [7]:
print(X_train.shape)

(576, 8)


In [8]:
print(y_train.shape)

(576,)


In [9]:
print(X_test.shape)

(192, 8)


In [10]:
print(y_test.shape)

(192,)


In [11]:
X_train.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
762,9,89,62,0,0,22.5,0.142,33
127,1,118,58,36,94,33.3,0.261,23
564,0,91,80,0,0,32.4,0.601,27
375,12,140,82,43,325,39.2,0.528,58
663,9,145,80,46,130,37.9,0.637,40


In [12]:
# Normalize Features
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

In [13]:
# View first 5 rows
X_train[:5, :]

array([[ 1.50755225, -1.01521454, -0.40451932, -1.31921491, -0.71823753,
        -1.22070104, -0.98325882, -0.04863985],
       [-0.82986389, -0.09964691, -0.61509602,  0.9287299 ,  0.08374747,
         0.13719053, -0.62493647, -0.88246592],
       [-1.12204091, -0.95207195,  0.54307587, -1.31921491, -0.71823753,
         0.0240329 ,  0.39884168, -0.5489355 ],
       [ 2.38408331,  0.59492164,  0.64836422,  1.36583027,  2.05458297,
         0.87900167,  0.17903049,  2.03592532],
       [ 1.50755225,  0.75277813,  0.54307587,  1.55315901,  0.39089067,
         0.71555175,  0.50724171,  0.53503839]])

In [14]:
# SVM Kernels
for k in ('linear', 'poly', 'rbf', 'sigmoid'):
    model = svm.SVC(kernel=k)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_train)
    print(k)
    print(accuracy_score(y_train, y_pred))

linear
0.7638888888888888
poly
0.7934027777777778
rbf
0.8246527777777778
sigmoid
0.6510416666666666


In [15]:
# Using the best model
model = svm.SVC(kernel='rbf')
model.fit(X_train, y_train)

SVC()

In [16]:
# Making a Single Prediction
# 'pregnancies', 'glucose', 'bpressure', 'skinThickness'
# 'insulin', 'bml', 'pedigree', 'age'

patient = np.array([[ 1., 150., 70., 45., 0., 40., 1.5, 25]])

# Normalize the data
patient = scaler.transform(patient)

model.predict(patient)

array([1], dtype=int64)

In [17]:
patient = np.array([[ 1., 50., 70., 45., 0., 40., 1.5, 25]])

# Normalize the data
patient = scaler.transform(patient)

model.predict(patient)

array([0], dtype=int64)

In [18]:
# Viewing Test Set
X_test

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
661,1,199,76,43,0,42.9,1.394,22
122,2,107,74,30,100,33.6,0.404,23
113,4,76,62,0,0,34.0,0.391,25
14,5,166,72,19,175,25.8,0.587,51
529,0,111,65,0,0,24.6,0.660,31
...,...,...,...,...,...,...,...,...
366,6,124,72,0,0,27.6,0.368,29
301,2,144,58,33,135,31.6,0.422,25
382,1,109,60,8,182,25.4,0.947,21
140,3,128,78,0,0,21.1,0.268,55


In [19]:
# Checking the third patient in the test set with index 2
X_test.iloc[2]

Pregnancies                  4.000
Glucose                     76.000
BloodPressure               62.000
SkinThickness                0.000
Insulin                      0.000
BMI                         34.000
DiabetesPedigreeFunction     0.391
Age                         25.000
Name: 113, dtype: float64

In [1]:
# Convert dataframe to a numpy array
t_patient = np.array([ X_test.iloc[2]])

NameError: name 'np' is not defined

In [21]:
# Predicting on third patient in Test Set
t_patient = scaler.transform(t_patient)

print("Model's Prediction:", model.predict(t_patient))
print("Actual Prediction:", y_test.iloc[2])

Model's Prediction: [0]
Actual Prediction: 0


In [22]:
# Accuracy on Testing Set
X_test = scaler.transform(X_test)
y_pred = model.predict(X_test)
print("Accuracy Score:", accuracy_score(y_test, y_pred))

Accuracy Score: 0.7760416666666666


In [23]:
# Comparison to All-Zero Prediction
y_zero = np.zeros(y_test.shape)
print(accuracy_score(y_test, y_zero))

0.6770833333333334


In [24]:
# This means dataset is unbalanced
# More People do not have diabetes

In [25]:
# Precision and Recall
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.90      0.84       130
           1       0.71      0.52      0.60        62

    accuracy                           0.78       192
   macro avg       0.75      0.71      0.72       192
weighted avg       0.77      0.78      0.77       192

