# To predict diabetes using PIMA diabetes dataset

**Importing the required libraries**

In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

**Importing the dataset from Google drive**

In [33]:
from google.colab import drive
drive.mount ('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [34]:
data = pd.read_csv('/content/drive/MyDrive/ML Datasets/diabetes.csv')
data.shape

(768, 9)

**Pre-processing the data**

In [35]:
data.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [36]:
diabetes_true_count= len(data.loc[data['Outcome'] == 1])
diabetes_false_count= len(data.loc[data['Outcome'] == 0])
(diabetes_true_count, diabetes_false_count)

(268, 500)

In [37]:
data.isnull().values.any()

False

In [38]:
data.corr()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,0.221898
Glucose,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.466581
BloodPressure,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.065068
SkinThickness,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,0.074752
Insulin,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163,0.130548
BMI,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242,0.292695
DiabetesPedigreeFunction,-0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561,0.173844
Age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0,0.238356
Outcome,0.221898,0.466581,0.065068,0.074752,0.130548,0.292695,0.173844,0.238356,1.0


**Splitting into Train and Test data**

In [39]:
from sklearn.model_selection import train_test_split
feature_columns=['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
predicted_class= ['Outcome']
X = data[feature_columns].values
y = data[predicted_class].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state = 10)

**Checking for missing (zero) values**

In [40]:
print("no of rows missing from Glucose Concentration: {0}".format(len(data.loc[data['Glucose'] == 0])))
print("no of rows missing from Blood Pressure: {0}".format(len(data.loc[data['BloodPressure'] == 0])))
print("no of rows missing from Skin Thickness: {0}".format(len(data.loc[data['SkinThickness'] == 0])))
print("no of rows missing from Insulin: {0}".format(len(data.loc[data['Insulin'] == 0])))
print("no of rows missing from BMI: {0}".format(len(data.loc[data['BMI'] == 0])))
print("no of rows missing from DPF: {0}".format(len(data.loc[data['DiabetesPedigreeFunction'] == 0])))
print("no of rows missing from Age: {0}".format(len(data.loc[data['Age'] == 0])))

no of rows missing from Glucose Concentration: 5
no of rows missing from Blood Pressure: 35
no of rows missing from Skin Thickness: 227
no of rows missing from Insulin: 374
no of rows missing from BMI: 11
no of rows missing from DPF: 0
no of rows missing from Age: 0


**Filling in missing (zero) values with mean**

In [41]:
from sklearn.impute import SimpleImputer
fill_values = SimpleImputer(missing_values=0, strategy="mean")
X_train = fill_values.fit_transform(X_train)
X_test = fill_values.fit_transform(X_test)

**Applying different algorithms and checking their accuracies**

In [42]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
bagging = BaggingClassifier(base_estimator= DecisionTreeClassifier(), max_samples = 0.5, max_features = 0.5,bootstrap = False, bootstrap_features = False)
bagging.fit(X_train, y_train.ravel())
bg_pred_diabetes = bagging.predict(X_test)
bg_dt_score = bagging.score(X_test, y_test)
print("Accuracy: %.3f" % bagging.score(X_test, y_test))

Accuracy: 0.719


In [43]:
from sklearn.neighbors import KNeighborsClassifier
bagging = BaggingClassifier(base_estimator= KNeighborsClassifier(), max_samples = 0.5, max_features = 0.5, bootstrap = False, bootstrap_features = False)
bagging.fit(X_train, y_train.ravel())
bg_pred_diabetes = bagging.predict(X_test)
bg_score = bagging.score(X_test, y_test)
print("Accuracy: %.3f" % bagging.score(X_test, y_test))

Accuracy: 0.710


In [44]:
from sklearn.ensemble import AdaBoostClassifier
ab_clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=100, learning_rate=0.5, random_state=100)
ab_clf.fit(X_train, y_train.ravel())
ab_pred_diabetes = ab_clf.predict(X_test)
ab_clf_score = ab_clf.score(X_test, y_test)
print("Accuracy: %.3f" % ab_clf.score(X_test, y_test))

Accuracy: 0.701


In [45]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=10)
rf_clf.fit(X_train, y_train.ravel())
predict_train_data = rf_clf.predict(X_test)
rf_clf_score = rf_clf.score(X_test, y_test)
print("Accuracy: %.3f" % rf_clf.score(X_test, y_test))

Accuracy: 0.766


In [46]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(1000, 300, 300), solver='adam', shuffle=False, tol = 0.0001)
mlp.fit(X_train, y_train.ravel())
mlp_pred_diabetes = mlp.predict(X_test)
mlp_score = mlp.score(X_test, y_test)
print("Accuracy = {0:.3f}".format(mlp.score(X_test, y_test)))

Accuracy = 0.701
