# Import Library

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

from mlxtend.plotting import plot_decision_regions
import missingno as msno
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

from imblearn.over_sampling import RandomOverSampler
from google.colab import drive
drive.mount('/content/gdrive')
drive.mount('/content/drive')

Mounted at /content/gdrive
Mounted at /content/drive


# Read Data

In [None]:
diabetes_df = pd.read_csv('/content/Dataset.csv')
diabetes_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# EDA (Exploratory Data Analysis)

In [None]:
diabetes_df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [None]:
diabetes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [None]:
diabetes_df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [None]:
diabetes_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
BloodPressure,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
SkinThickness,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
Insulin,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
BMI,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [None]:
diabetes_df_copy = diabetes_df.copy(deep = True)
diabetes_df_copy[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']] = diabetes_df_copy[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']].replace(0,np.NaN)

# Showing the Count of NANs
print(diabetes_df_copy.isnull().sum())

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64


In [None]:
diabetes_df_copy['Glucose'].fillna(diabetes_df_copy['Glucose'].mean(), inplace = True)
diabetes_df_copy['BloodPressure'].fillna(diabetes_df_copy['BloodPressure'].mean(), inplace = True)
diabetes_df_copy['SkinThickness'].fillna(diabetes_df_copy['SkinThickness'].mean(), inplace = True)
diabetes_df_copy['Insulin'].fillna(diabetes_df_copy['Insulin'].mean(), inplace = True)
diabetes_df_copy['BMI'].fillna(diabetes_df_copy['BMI'].mean(), inplace = True)

In [None]:
print(diabetes_df_copy.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


# Remove Outliers

In [None]:
outliers = diabetes_df_copy.quantile(.97) # dealing with the outliers seen in the boxplots above

diabetes_df_copy = diabetes_df_copy[(diabetes_df_copy['Pregnancies']<outliers['Pregnancies'])]
diabetes_df_copy = diabetes_df_copy[(diabetes_df_copy['Glucose']<outliers['Glucose'])]
diabetes_df_copy = diabetes_df_copy[(diabetes_df_copy['BloodPressure']<outliers['BloodPressure'])]
diabetes_df_copy = diabetes_df_copy[(diabetes_df_copy['SkinThickness']<outliers['SkinThickness'])]
diabetes_df_copy = diabetes_df_copy[(diabetes_df_copy['Insulin']<outliers['Insulin'])]
diabetes_df_copy = diabetes_df_copy[(diabetes_df_copy['BMI']<outliers['BMI'])]
diabetes_df_copy = diabetes_df_copy[(diabetes_df_copy['DiabetesPedigreeFunction']<outliers['DiabetesPedigreeFunction'])]
diabetes_df_copy = diabetes_df_copy[(diabetes_df_copy['Age']<outliers['Age'])]

# Scalling the Data

In [None]:
diabetes_df_copy.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,155.548223,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.548223,26.6,0.351,31,0
2,8,183.0,64.0,29.15342,155.548223,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
5,5,116.0,74.0,29.15342,155.548223,25.6,0.201,30,0


In [None]:
x = diabetes_df_copy.iloc[:,:-1]
x.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72.0,35.0,155.548223,33.6,0.627,50
1,1,85.0,66.0,29.0,155.548223,26.6,0.351,31
2,8,183.0,64.0,29.15342,155.548223,23.3,0.672,32
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21
5,5,116.0,74.0,29.15342,155.548223,25.6,0.201,30


In [None]:
y = diabetes_df_copy.iloc[:, -1]
y.head()

0    1
1    0
2    1
3    0
5    0
Name: Outcome, dtype: int64

In [None]:
col = x.columns
std = StandardScaler()

x = std.fit_transform(x)
x = pd.DataFrame(data = x, columns = col)

x.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.915471,1.158016,0.126554,0.988239,0.257525,0.36709,0.788819,1.957467
1,-0.862633,-1.166385,-0.441964,0.175128,0.257525,-0.806589,-0.303846,-0.006731
2,1.626713,2.449351,-0.63147,0.195919,0.257525,-1.359895,0.966971,0.096648
3,-0.862633,-1.018804,-0.441964,-0.637982,-0.900891,-0.555086,-1.03229,-1.040519
4,0.559851,-0.022632,0.31606,0.195919,0.257525,-0.974257,-0.897686,-0.11011


# Class Balancing

In [None]:
print(y.value_counts())

0    419
1    180
Name: Outcome, dtype: int64


In [None]:
over = RandomOverSampler()
x, y = over.fit_resample(x, y)

print(y.value_counts())

1    419
0    419
Name: Outcome, dtype: int64


# Model Building

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.25, random_state=0)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(X_train, y_train)

In [None]:
rfc_train = rfc.predict(X_train)
from sklearn import metrics

print("Accuracy_Score =", format(metrics.accuracy_score(y_train, rfc_train)))

Accuracy_Score = 1.0


In [None]:
from sklearn import metrics

predictionsrfc = rfc.predict(X_test)
print("Accuracy_Score =", format(metrics.accuracy_score(y_test, predictionsrfc)))

Accuracy_Score = 0.8904761904761904


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test, predictionsrfc))
print(classification_report(y_test,predictionsrfc))

[[ 87  19]
 [  4 100]]
              precision    recall  f1-score   support

           0       0.96      0.82      0.88       106
           1       0.84      0.96      0.90       104

    accuracy                           0.89       210
   macro avg       0.90      0.89      0.89       210
weighted avg       0.90      0.89      0.89       210



# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)

In [None]:
from sklearn import metrics

predictions = dtree.predict(X_test)
print("Accuracy Score =", format(metrics.accuracy_score(y_test,predictions)))

Accuracy Score = 0.819047619047619


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test, predictions))
print(classification_report(y_test,predictions))

[[78 28]
 [10 94]]
              precision    recall  f1-score   support

           0       0.89      0.74      0.80       106
           1       0.77      0.90      0.83       104

    accuracy                           0.82       210
   macro avg       0.83      0.82      0.82       210
weighted avg       0.83      0.82      0.82       210



# Support Vector Machine

In [None]:
from sklearn.svm import SVC

svc_model = SVC()
svc_model.fit(X_train, y_train)

In [None]:
from sklearn import metrics
svc_pred = svc_model.predict(X_test)
print("Accuracy Score =", format(metrics.accuracy_score(y_test, svc_pred)))

Accuracy Score = 0.8428571428571429


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test, svc_pred))
print(classification_report(y_test,svc_pred))

[[87 19]
 [14 90]]
              precision    recall  f1-score   support

           0       0.86      0.82      0.84       106
           1       0.83      0.87      0.85       104

    accuracy                           0.84       210
   macro avg       0.84      0.84      0.84       210
weighted avg       0.84      0.84      0.84       210



# XgBoost Classifier

In [None]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(gamma=0)
xgb_model.fit(X_train, y_train)

In [None]:
from sklearn import metrics

xgb_pred = xgb_model.predict(X_test)
print("Accuracy Score =", format(metrics.accuracy_score(y_test, xgb_pred)))

Accuracy Score = 0.8857142857142857


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test, xgb_pred))
print(classification_report(y_test,xgb_pred))

[[87 19]
 [ 5 99]]
              precision    recall  f1-score   support

           0       0.95      0.82      0.88       106
           1       0.84      0.95      0.89       104

    accuracy                           0.89       210
   macro avg       0.89      0.89      0.89       210
weighted avg       0.89      0.89      0.89       210

