## Simulation of Diabetes Dataset Using Support Vector Machine with Selected Features

TO CHECK THE VERSION OF LIBRARIES

In [None]:
# Import the interpreter info and every library whose version is reported.
# The bare module names stay bound in the notebook namespace after this cell.
import sys
import scipy
import numpy
import matplotlib
import pandas
import sklearn

# Interpreter first, then each library as "<module name>: <version>"
print('Python: {}'.format(sys.version))
for library in (scipy, numpy, matplotlib, pandas, sklearn):
    print('{}: {}'.format(library.__name__, library.__version__))

TO IMPORT LIBRARIES

In [None]:
import numpy as np
import matplotlib.pyplot as plot
import pandas as pd

# To allow plots to appear within the notebook
%matplotlib inline

TO LOAD THE DATASET

In [None]:
# Read the diabetes CSV into a DataFrame.
# Uses the `pd` alias bound by the imports cell, so this cell does not depend
# on the optional version-check cell having bound the bare `pandas` name.
dataset = pd.read_csv('diabetes.csv')

TO DETERMINE THE DIMENSIONS OF THE DATASET

In [None]:
# Report the dataset dimensions as a (rows, columns) tuple
dataset_dimensions = dataset.shape
print(dataset_dimensions)

TO PEEK AT THE DATA

In [None]:
# Show the first five rows; a bare last expression lets the notebook render
# the DataFrame as a rich table instead of plain printed text.
dataset.head(5)

TO SEE THE STATISTICAL SUMMARY

In [None]:
# Summary statistics per column; a bare last expression lets the notebook
# render the resulting DataFrame as a rich table instead of plain printed text.
dataset.describe()

TO SEE THE CLASS DISTRIBUTION

In [None]:
# Count how many rows fall under each value of the Outcome column
rows_per_outcome = dataset.groupby('Outcome').size()
print(rows_per_outcome)

TO SHOW THE UNIVARIATE PLOT (BOX and WHISKER PLOTS)

In [None]:
# One box-and-whisker subplot per column on a 3x3 grid, each with its own axes
box_plot_options = dict(
    kind='box',
    subplots=True,
    layout=(3, 3),
    sharex=False,
    sharey=False,
)
dataset.plot(**box_plot_options)
plot.show()

TO SHOW THE HISTOGRAM FOR THE DISTRIBUTION

In [None]:
# Draw one histogram per column, then render the figure
histogram_axes = dataset.hist()
plot.show()

FOR THE MULTIVARIATE PLOT

In [None]:
# Scatter plot matrix: pairwise scatter plots between the dataset columns
from pandas.plotting import scatter_matrix

scatter_axes = scatter_matrix(frame=dataset)
plot.show()

## To Create Support Vector Machine Models with Identified Features to be Used

### A. Using Univariate Selection

Based on the Univariate Selection, the four identified features that have the strongest relationship with the output variable are the following:
1. Glucose
2. Insulin
3. BMI and
4. Age

In [None]:
# Predictor matrix X for the univariate-selection model:
# the Glucose, Insulin, BMI and Age columns as a plain array
features = ['Glucose', 'Insulin', 'BMI', 'Age']
X_uni = dataset.loc[:, features].values

In [None]:
# To Create the Vector of the Dependent Variable, Y, that contains the Outcomes
# The label column is selected by name (as the feature columns are) instead of
# by the positional index 8, so it does not silently depend on column order.
Y_uni = dataset['Outcome'].values

In [None]:
# Bring in the support vector classifier
from sklearn.svm import SVC

# Build the model with a linear kernel and a fixed seed; every other
# parameter keeps its library default
svm_settings = {'kernel': 'linear', 'random_state': 0}
support_vector_machine = SVC(**svm_settings)

In [None]:
# To Apply K-fold Cross Validation for the Support Vector Machine Model Performance
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Ten consecutive folds; shuffling is off, so the folds follow the row order
k_Fold = KFold(n_splits=10, shuffle=False, random_state=None)

# One accuracy score per fold for the univariate-selection features
accuracies = cross_val_score(estimator=support_vector_machine, X=X_uni, y=Y_uni, cv=k_Fold, scoring='accuracy')
accuracies_average = accuracies.mean()
# .std() returns the standard deviation (not the variance) of the fold
# accuracies, so the variable and the printed label are named accordingly
accuracies_std = accuracies.std()
print("ACCURACIES IN K-FOLDS:")
print(accuracies)
print('')
print("AVERAGE ACCURACY OF K-FOLDS:")
print(accuracies_average)
print('')
print("ACCURACY STANDARD DEVIATION OF K-FOLDS:")
print(accuracies_std)
print('')

In [None]:
# Hold out 20% of the rows for testing (seeded so the split is repeatable)
from sklearn.model_selection import train_test_split

uni_split = train_test_split(X_uni, Y_uni, test_size=0.20, random_state=0)
X_uni_train, X_uni_test, Y_uni_train, Y_uni_test = uni_split

# Train on the training part, then predict the held-out part;
# fit() hands back the fitted model, so predict() is chained onto it
Y_predict_SVM_uni = support_vector_machine.fit(X_uni_train, Y_uni_train).predict(X_uni_test)

In [None]:
# Confusion matrix of true vs. predicted outcomes on the test set
from sklearn.metrics import confusion_matrix

print("CONFUSION MATRIX:")
uni_confusion = confusion_matrix(y_true=Y_uni_test, y_pred=Y_predict_SVM_uni)
print(uni_confusion)

In [None]:
# Fraction of test rows whose predicted outcome matches the true outcome
from sklearn.metrics import accuracy_score

classification_accuracy = accuracy_score(y_true=Y_uni_test, y_pred=Y_predict_SVM_uni)
accuracy_line = 'Classification Accuracy: %.4f' % classification_accuracy
print(accuracy_line)
print('')

In [None]:
# Text report of the per-class metrics on the test set
from sklearn.metrics import classification_report

print("CLASSIFICATION REPORT:")
uni_report = classification_report(y_true=Y_uni_test, y_pred=Y_predict_SVM_uni)
print(uni_report)

### B. Using Recursive Feature Elimination

Based on the Recursive Feature Elimination, the four attributes chosen are the following:
1. Pregnancies
2. Glucose
3. BMI, and 
4. DiabetesPedigreeFunction

In [None]:
# Predictor matrix X for the recursive-feature-elimination model:
# the Pregnancies, Glucose, BMI and DiabetesPedigreeFunction columns as a plain array
features = ['Pregnancies','Glucose', 'BMI', 'DiabetesPedigreeFunction']
X_rfe = dataset.loc[:, features].values

In [None]:
# To Create the Vector of the Dependent Variable, Y, that contains the Outcomes
# The label column is selected by name (as the feature columns are) instead of
# by the positional index 8, so it does not silently depend on column order.
Y_rfe = dataset['Outcome'].values

In [None]:
# Bring in the support vector classifier
from sklearn.svm import SVC

# Fresh model for this section: linear kernel and a fixed seed, with every
# other parameter left at its library default
svm_settings = {'kernel': 'linear', 'random_state': 0}
support_vector_machine = SVC(**svm_settings)

In [None]:
# To Apply K-fold Cross Validation for the Support Vector Machine Model Performance
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Ten consecutive folds; shuffling is off, so the folds follow the row order
k_Fold = KFold(n_splits=10, shuffle=False, random_state=None)

# One accuracy score per fold for the recursive-feature-elimination features
accuracies = cross_val_score(estimator=support_vector_machine, X=X_rfe, y=Y_rfe, cv=k_Fold, scoring='accuracy')
accuracies_average = accuracies.mean()
# .std() returns the standard deviation (not the variance) of the fold
# accuracies, so the variable and the printed label are named accordingly
accuracies_std = accuracies.std()
print("ACCURACIES IN K-FOLDS:")
print(accuracies)
print('')
print("AVERAGE ACCURACY OF K-FOLDS:")
print(accuracies_average)
print('')
print("ACCURACY STANDARD DEVIATION OF K-FOLDS:")
print(accuracies_std)
print('')

In [None]:
# Hold out 20% of the rows for testing (seeded so the split is repeatable)
from sklearn.model_selection import train_test_split

rfe_split = train_test_split(X_rfe, Y_rfe, test_size=0.20, random_state=0)
X_rfe_train, X_rfe_test, Y_rfe_train, Y_rfe_test = rfe_split

# Train on the training part, then predict the held-out part;
# fit() hands back the fitted model, so predict() is chained onto it
Y_predict_SVM_rfe = support_vector_machine.fit(X_rfe_train, Y_rfe_train).predict(X_rfe_test)

In [None]:
# Confusion matrix of true vs. predicted outcomes on the test set
from sklearn.metrics import confusion_matrix

print("CONFUSION MATRIX:")
rfe_confusion = confusion_matrix(y_true=Y_rfe_test, y_pred=Y_predict_SVM_rfe)
print(rfe_confusion)

In [None]:
# Fraction of test rows whose predicted outcome matches the true outcome
from sklearn.metrics import accuracy_score

classification_accuracy = accuracy_score(y_true=Y_rfe_test, y_pred=Y_predict_SVM_rfe)
accuracy_line = 'Classification Accuracy: %.4f' % classification_accuracy
print(accuracy_line)
print('')

In [None]:
# Text report of the per-class metrics on the test set
from sklearn.metrics import classification_report

print("CLASSIFICATION REPORT:")
rfe_report = classification_report(y_true=Y_rfe_test, y_pred=Y_predict_SVM_rfe)
print(rfe_report)

### C. Using Principal Component Analysis

A property of PCA is that you can choose the number of dimensions (principal components) in the transformed result.
In this simulation, we will select 4 principal components.

In [None]:
# Predictor matrix X holding the first eight columns of the dataset
# (positions 0 through 7) as a plain array
X = dataset.iloc[:, :8].values

In [None]:
# To Create the Vector of the Dependent Variable, Y, that contains the Outcomes
# The label column is selected by name instead of by the positional index 8,
# so it does not silently depend on column order.
Y_pca = dataset['Outcome'].values

In [None]:
# To Reduce the Features to 4 Principal Components with PCA
# (the resulting columns are computed components, not a subset of the original columns)
from sklearn.decomposition import PCA
selection_method_pca = PCA(n_components=4)

# To Apply the PCA in the Independent variable, X
# NOTE(review): fit_transform is given every row of X, and the K-fold cross
# validation and train/test split in the following cells consume X_pca, so
# held-out rows take part in fitting the components -- consider fitting PCA
# on the training rows only (e.g. PCA and SVC chained in one estimator).
X_pca = selection_method_pca.fit_transform(X)

In [None]:
# Bring in the support vector classifier
from sklearn.svm import SVC

# Fresh model for the PCA section: linear kernel and a fixed seed, with every
# other parameter left at its library default
svm_settings = {'kernel': 'linear', 'random_state': 0}
support_vector_machine = SVC(**svm_settings)

In [None]:
# To Apply K-fold Cross Validation for the Support Vector Machine Model Performance
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Ten consecutive folds; shuffling is off, so the folds follow the row order
k_Fold = KFold(n_splits=10, shuffle=False, random_state=None)

# One accuracy score per fold for the PCA-transformed features
accuracies = cross_val_score(estimator=support_vector_machine, X=X_pca, y=Y_pca, cv=k_Fold, scoring='accuracy')
accuracies_average = accuracies.mean()
# .std() returns the standard deviation (not the variance) of the fold
# accuracies, so the variable and the printed label are named accordingly
accuracies_std = accuracies.std()
print("ACCURACIES IN K-FOLDS:")
print(accuracies)
print('')
print("AVERAGE ACCURACY OF K-FOLDS:")
print(accuracies_average)
print('')
print("ACCURACY STANDARD DEVIATION OF K-FOLDS:")
print(accuracies_std)
print('')

In [None]:
# Hold out 20% of the rows for testing (seeded so the split is repeatable)
from sklearn.model_selection import train_test_split

pca_split = train_test_split(X_pca, Y_pca, test_size=0.20, random_state=0)
X_pca_train, X_pca_test, Y_pca_train, Y_pca_test = pca_split

# Train on the training part, then predict the held-out part;
# fit() hands back the fitted model, so predict() is chained onto it
Y_predict_SVM_pca = support_vector_machine.fit(X_pca_train, Y_pca_train).predict(X_pca_test)

In [None]:
# Confusion matrix of true vs. predicted outcomes on the test set
from sklearn.metrics import confusion_matrix

print("CONFUSION MATRIX:")
pca_confusion = confusion_matrix(y_true=Y_pca_test, y_pred=Y_predict_SVM_pca)
print(pca_confusion)

In [None]:
# Fraction of test rows whose predicted outcome matches the true outcome
from sklearn.metrics import accuracy_score

classification_accuracy = accuracy_score(y_true=Y_pca_test, y_pred=Y_predict_SVM_pca)
accuracy_line = 'Classification Accuracy: %.4f' % classification_accuracy
print(accuracy_line)
print('')

In [None]:
# Text report of the per-class metrics on the test set
from sklearn.metrics import classification_report

print("CLASSIFICATION REPORT:")
pca_report = classification_report(y_true=Y_pca_test, y_pred=Y_predict_SVM_pca)
print(pca_report)

### D. Using Feature Importance

Based on the Feature Importance Selection, the four attributes chosen are the following:
1. Glucose
2. BMI
3. DiabetesPedigreeFunction
4. Age

In [None]:
# Predictor matrix X for the feature-importance model:
# the Glucose, BMI, DiabetesPedigreeFunction and Age columns as a plain array
features = ['Glucose', 'BMI', 'DiabetesPedigreeFunction','Age']
X_fi = dataset.loc[:, features].values

In [None]:
# To Create the Vector of the Dependent Variable, Y, that contains the Outcomes
# The label column is selected by name (as the feature columns are) instead of
# by the positional index 8, so it does not silently depend on column order.
Y_fi = dataset['Outcome'].values

In [None]:
# Bring in the support vector classifier
from sklearn.svm import SVC

# Fresh model for the feature-importance section: linear kernel and a fixed
# seed, with every other parameter left at its library default
svm_settings = {'kernel': 'linear', 'random_state': 0}
support_vector_machine = SVC(**svm_settings)

In [None]:
# To Apply K-fold Cross Validation for the Support Vector Machine Model Performance
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Ten consecutive folds; shuffling is off, so the folds follow the row order
k_Fold = KFold(n_splits=10, shuffle=False, random_state=None)

# One accuracy score per fold for the feature-importance features
accuracies = cross_val_score(estimator=support_vector_machine, X=X_fi, y=Y_fi, cv=k_Fold, scoring='accuracy')
accuracies_average = accuracies.mean()
# .std() returns the standard deviation (not the variance) of the fold
# accuracies, so the variable and the printed label are named accordingly
accuracies_std = accuracies.std()
print("ACCURACIES IN K-FOLDS:")
print(accuracies)
print('')
print("AVERAGE ACCURACY OF K-FOLDS:")
print(accuracies_average)
print('')
print("ACCURACY STANDARD DEVIATION OF K-FOLDS:")
print(accuracies_std)
print('')

In [None]:
# Hold out 20% of the rows for testing (seeded so the split is repeatable)
from sklearn.model_selection import train_test_split

fi_split = train_test_split(X_fi, Y_fi, test_size=0.20, random_state=0)
X_fi_train, X_fi_test, Y_fi_train, Y_fi_test = fi_split

# Train on the training part, then predict the held-out part;
# fit() hands back the fitted model, so predict() is chained onto it
Y_predict_SVM_fi = support_vector_machine.fit(X_fi_train, Y_fi_train).predict(X_fi_test)

In [None]:
# Confusion matrix of true vs. predicted outcomes on the test set
from sklearn.metrics import confusion_matrix

print("CONFUSION MATRIX:")
fi_confusion = confusion_matrix(y_true=Y_fi_test, y_pred=Y_predict_SVM_fi)
print(fi_confusion)

In [None]:
# Fraction of test rows whose predicted outcome matches the true outcome
from sklearn.metrics import accuracy_score

classification_accuracy = accuracy_score(y_true=Y_fi_test, y_pred=Y_predict_SVM_fi)
accuracy_line = 'Classification Accuracy: %.4f' % classification_accuracy
print(accuracy_line)
print('')

In [None]:
# Text report of the per-class metrics on the test set
from sklearn.metrics import classification_report

print("CLASSIFICATION REPORT:")
fi_report = classification_report(y_true=Y_fi_test, y_pred=Y_predict_SVM_fi)
print(fi_report)

###### earoxas2019 & rgdeluna2018