In [94]:
"""
    Diabetes Classification
"""

'\n    Diabetes Classification\n'

In [95]:
#### Libreries needed
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import datasets, metrics

In [96]:
### load dataset

# must to exist on your directory
data = pd.read_csv("diabetes.csv")

# To see a quick view of the data
data.iloc[:5, : ]

# The outcome column is the class (if the person has diabetes (1) or not (0))

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [99]:
######## Preprocessing data #########

# Get all data from diabetes dataset
x = np.asanyarray(data.drop(columns=['Outcome']))[:, :]  # 8 inputs
y = np.asanyarray(data[['Outcome']])[:, :].ravel()       # the output class (0=No Diabetes, 1:Diabetes)

# Scale the data 
x = StandardScaler().fit_transform(x)

# Split the dataset in train and test datasets
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.25)

# Samples
print("Total Samples:\t\t", x.shape[0])
print("No. Samples [train]:\t", xtrain.shape[0])
print("No. Samples [test]:\t", xtest.shape[0])


Total Samples:		 768
No. Samples [train]:	 576
No. Samples [test]:	 192


In [100]:
################## KNeighbors ########################
## Config  with best permomance: Neighbors: 25-100
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(25)
knn.fit(xtrain, ytrain)

print("Train: ", knn.score(xtrain, ytrain))
print("Test: ", knn.score(xtest,ytest))


Train:  0.7552083333333334
Test:  0.7708333333333334


In [101]:
########################## Gaussian Process ###############################
from sklearn.naive_bayes import GaussianNB

GP = GaussianNB()

GP.fit(xtrain, ytrain)
# train & test score
print("Score Train: ",GP.score(xtrain, ytrain))
print("Score Test: ",GP.score(xtest, ytest))

Score Train:  0.7638888888888888
Score Test:  0.765625


In [102]:
########################## SVC ########################
from sklearn.svm import SVC

# parameters with the best results: gamma=0.01, C=1
svc = SVC(gamma=0.01,C=1)

# train the SVC model
svc.fit(xtrain, ytrain)

print("Train: ", svc.score(xtrain, ytrain))
print("Test: ", svc.score(xtest,ytest))

Train:  0.7847222222222222
Test:  0.765625


In [103]:
######################## Make predictions ##################################33

# I took only one model for doing predictions (you can choose: 'mlp.predict, svc.predict, knn.predict')
ypred = svc.predict(xtest)

In [104]:
# get a random index of the test dataset
index = np.random.randint(xtest.shape[0])

# print the row
print("-- Prediction --\n")
print("Row (Normalized): \n",xtest[index, :])
predicted = ypred[index]

print("\nNOTE: (1=Diabetes) (0=No Diabetes)")
print("\nPrediction:\t", ypred[index])
print("Expected:\t", ytest[index])

-- Prediction --

Row (Normalized): 
 [-0.54791859  0.19108374 -0.57412775  0.21726125  1.69490581 -0.54481078
  3.40706745 -0.70119842]

NOTE: (1=Diabetes) (0=No Diabetes)

Prediction:	 0
Expected:	 0


In [105]:
####################### Metrics ########################################


"""
Muestra el desempeño general de nuestro modelo, desempeños gobales y locales por cada clase. F1-score, accuracy
"""

# Classification report
print("Classification Report: \n\n", metrics.classification_report(ytest, ypred))



Classification Report: 

               precision    recall  f1-score   support

           0       0.78      0.88      0.83       124
           1       0.72      0.56      0.63        68

    accuracy                           0.77       192
   macro avg       0.75      0.72      0.73       192
weighted avg       0.76      0.77      0.76       192



In [106]:
# Confusion matrix

"""
 Lo mas recomedable es que haya la mayoria de los valores en la diagonal de la matriz y que haya 
 pocos valores en las otras posiciones. Si pasa esto es que nuestro modelo ha hecho una buena 
 prediccion
"""
print("Confusion Matrix: \n\n", metrics.confusion_matrix(ytest, ypred))


Confusion Matrix: 

 [[109  15]
 [ 30  38]]
