In [1]:
#importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [2]:
#loading dataset
dataset = pd.read_csv('diabetes.csv')

In [60]:
#length of dataset
print(len(dataset))
dataset.head(10)

768


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,155.0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.0,26.6,0.351,31,0
2,8,183.0,64.0,29.0,155.0,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
5,5,116.0,74.0,29.0,155.0,25.6,0.201,30,0
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
7,10,115.0,72.0,29.0,155.0,35.3,0.134,29,0
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
9,8,125.0,96.0,29.0,155.0,32.0,0.232,54,1


In [59]:
# Replace zeroes
zero_not_accepted = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']

In [5]:
for column in zero_not_accepted:
    dataset[column] = dataset[column].replace(0, np.NaN)
    mean = int(dataset[column].mean(skipna=True))
    dataset[column] = dataset[column].replace(np.NaN, mean)

In [6]:
# split dataset
X = dataset.iloc[:, 0:8]
y = dataset.iloc[:, 8]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

In [7]:
print(len(X_train))
print(len(y_train))
print(len(X_test))
print(len(y_test))

614
614
154
154


In [8]:
#Feature scaling
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

In [50]:
# Define the model: Init K-NN
model = KNeighborsClassifier()
param_grid = {'n_neighbors':[5,7,11] ,
              'p':[1 , 2 ],
              'metric':['manhattan','euclidean']}

In [51]:
# Fit Model
grid_search = GridSearchCV(model , param_grid)
grid_search.fit(X_train ,y_train)
#classifier.fit(X_train, y_train)

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'metric': ['manhattan', 'euclidean'],
                         'n_neighbors': [5, 7, 11], 'p': [2, 3]})

In [52]:
#Check best Param and best Model
print(grid_search.best_params_)
best_model = grid_search.best_estimator_
print(best_model)

{'metric': 'euclidean', 'n_neighbors': 7, 'p': 2}
KNeighborsClassifier(metric='euclidean', n_neighbors=7)


In [53]:
#Predictions
predictions = best_model.predict(X_test)
predictions

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

In [54]:
# Evaluate Model by Confusion Matrix
cm = confusion_matrix(y_test,predictions)
print (cm)

[[90 17]
 [15 32]]


In [55]:
# Evaluate Model by F1-score
print(f1_score(y_test, predictions))

0.6666666666666666


In [56]:
#Accuracy Traing Test
print("Training Set Accuracy: " , best_model.score(X_train ,y_train))

Training Set Accuracy:  0.7931596091205212


In [57]:
#Accuracy Testing Test
print("Testing Set Accuracy: " , best_model.score(X_test, y_test))

Testing Set Accuracy:  0.7922077922077922


In [112]:
#Check the Model SVM

In [24]:
#Define the Model :SVM
model = SVC(random_state=4)
param_grid = {'C':[0.3 , 1 , 3 , 5]}

In [25]:
#Fit the Model
grid_search = GridSearchCV(model , param_grid)
grid_search.fit(X_train ,y_train)

GridSearchCV(estimator=SVC(random_state=4), param_grid={'C': [0.3, 1, 3, 5]})

In [26]:
#Check best Param and best Model
print(grid_search.best_params_)
best_model = grid_search.best_estimator_
print(best_model)

{'C': 0.3}
SVC(C=0.3, random_state=4)


In [27]:
#Predictions
predictions = best_model.predict(X_test)
predictions

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
      dtype=int64)

In [28]:
# Evaluate Model by Confusion Matrix
cm = confusion_matrix(y_test, predictions)
print (cm)


[[97 10]
 [22 25]]


In [29]:
# Evaluate Model by F1-score
print(f1_score(y_test, predictions))

0.6097560975609756


In [30]:
#Accuracy Traing Test
print("Training Set Accuracy: " , best_model.score(X_train ,y_train))

Training Set Accuracy:  0.8013029315960912


In [31]:
#Accuracy Testing Test
print("Testing Set Accuracy: " , best_model.score(X_test, y_test))

Testing Set Accuracy:  0.7922077922077922


In [32]:
#Check the Model Random Forest

In [33]:
#Define the Model :RandomForestClassifier
model = RandomForestClassifier()
param_grid = {'n_estimators':[100,200 , 300] , 
             'min_samples_split':[5 , 7 , 9],
             'max_features':['sqrt' , 0.05]}

In [34]:
#Fit the Model
grid_search = GridSearchCV(model , param_grid)
grid_search.fit(X_train ,y_train)

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'max_features': ['sqrt', 0.05],
                         'min_samples_split': [5, 7, 9],
                         'n_estimators': [100, 200, 300]})

In [39]:
#Check best Param and best Model
print(grid_search.best_params_)
best_model = grid_search.best_estimator_
print(best_model)

{'max_features': 0.05, 'min_samples_split': 9, 'n_estimators': 100}
RandomForestClassifier(max_features=0.05, min_samples_split=9)


In [40]:
#Predictions
predictions = best_model.predict(X_test)
predictions

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

In [41]:
# Evaluate Model
cm = confusion_matrix(y_test, predictions)
print (cm)

[[96 11]
 [15 32]]
0.711111111111111


In [None]:
# Evaluate Model by F1-score
print(f1_score(y_test, predictions))

In [38]:
#Accuracy Traing Test
print("Training Set Accuracy: " , best_model.score(X_train ,y_train))

Training Set Accuracy:  0.9429967426710097


In [142]:
#Accuracy Testing Test
print("Testing Set Accuracy: " , best_model.score(X_test, y_test))

Testing Set Accuracy:  0.8116883116883117


In [None]:
#Conclusions
#1 ->Random Forest is over fitting
#2->KNN is best for this database since it's accurancy of training set and accuracy of testing set are same
#3->SVM is the second best choice with nearly same accurancy of training set and accuracy of testing set 