In [None]:
#importing Libraries
import numpy as np
np.random.seed(42)   ## so that output would be same
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
#models
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
#Evaluation
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import RocCurveDisplay
# plot_roc_curve was removed in scikit-learn 1.2; keep the legacy import only
# where it still exists so this cell does not fail on current releases.
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    plot_roc_curve = None
#for warning
from warnings import filterwarnings
filterwarnings("ignore")  ## To remove any kind of warning

# Load the data set

In [None]:
# The CSV is fetched over the network each time this cell runs.
DIABETES_CSV_URL = "https://raw.githubusercontent.com/Abhayparashar31/Diabetes-prediction/master/diabetes.csv"
data = pd.read_csv(DIABETES_CSV_URL)

# EDA on Dataset


In [None]:
# Structural overview of the loaded frame.
print(data.shape)          # (rows, columns)
print(data.ndim)           # number of axes
print(data.size)           # total number of cells
print(data.isna().sum())   # missing values per column
# `info` is a method: it must be called. Printing the attribute only shows
# the bound-method repr. `info()` writes its summary to stdout itself.
data.info()


In [None]:
# Class balance of the target column.
# value_counts() orders by frequency, so the hard-coded tick labels below would
# be attached to the wrong bars if class 1 ever outnumbered class 0.
# sort_index() pins the bar order to Outcome 0, then Outcome 1.
outcome_counts = data['Outcome'].value_counts().sort_index()
outcome_counts.plot(kind='bar', color=['salmon', 'deeppink'])
plt.xticks(np.arange(2), ('No Diabetes', 'Diabetes'), rotation=0);

In [None]:
# Glucose vs. outcome: cross-tabulate every 15th glucose reading against the
# outcome column and draw the counts as grouped bars.
glucose_every_15th = data.Glucose[::15]
glucose_by_outcome = pd.crosstab(glucose_every_15th, data.Outcome)
crosstab_ax = glucose_by_outcome.plot(
    kind="bar",
    figsize=(18, 8),
    color=['yellow', 'deeppink'],
)
crosstab_ax.set_ylabel('people')
plt.setp(crosstab_ax.get_xticklabels(), rotation=0)
crosstab_ax.legend(['NO DIABETES', 'DIABETES']);


In [None]:
# Blood pressure against age, coloured by outcome.
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))

# Positive examples (Outcome == 1) are drawn first, then negative ones, so the
# legend entries below line up with the drawing order.
for outcome_value, colour in ((1, 'Red'), (0, 'purple')):
    subset = data[data.Outcome == outcome_value]
    scatter_ax.scatter(subset.Age, subset.BloodPressure, color=colour)

# Annotate the figure so it can be read on its own.
scatter_ax.set_title('Diabetes in function of Age and BloodPressure')
scatter_ax.set_xlabel("Age")
scatter_ax.set_ylabel("Blood Pressure")
scatter_ax.legend(["Diabetes", "No Diabetes"]);


In [None]:
## Pairwise scatter/distribution grid for every column, coloured by Outcome
import seaborn as sns
pairplot_theme = dict(style='ticks', color_codes=True)
sns.set(**pairplot_theme)
sns.pairplot(data=data, hue='Outcome', palette='gnuplot');

In [None]:
# Histograms of every feature, restricted to rows where Outcome == 1.
diabetic_rows = data[data.Outcome == 1]

# (panel title, column name) in row-major order of the 4x2 grid.
histogram_panels = [
    ('Glucose', 'Glucose'),
    ('Pregnancies', 'Pregnancies'),
    ('Age', 'Age'),
    ('Blood Pressure', 'BloodPressure'),
    ('Skin Thickness ', 'SkinThickness'),
    ('Insulin', 'Insulin'),
    ('BMI', 'BMI'),
    ('Diabetes Pedigree Function', 'DiabetesPedigreeFunction'),
]

fig, ax = plt.subplots(nrows=4, ncols=2, figsize=(12, 10))
fig.tight_layout(pad=3.0)
for panel, (panel_title, column_name) in zip(ax.flat, histogram_panels):
    panel.set_title(panel_title)
    panel.hist(diabetic_rows[column_name]);


In [None]:
# Correlation matrix between columns.
## Positive and negative pairwise correlations, drawn as an annotated heatmap.

corr_matrix = data.corr()
heatmap_options = dict(annot=True, linewidth=0.5, fmt='.2f', cmap='YlGnBu')
fig, ax = plt.subplots(figsize=(15, 10))
# Draw on the axes created above.
ax = sns.heatmap(corr_matrix, ax=ax, **heatmap_options)

# MODELLING AND TRAINING

In [None]:
# Random data shuffling.
# DataFrame.sample returns a NEW frame, so its result has to be kept; calling it
# and discarding the return value shuffles nothing. The raw `data` frame is
# left untouched so re-running earlier cells still shows the same thing.
data_shuffled = data.sample(frac=1, random_state=42)

# Splitting the data into features (x) and target (y), then train/test.
x = data_shuffled.drop("Outcome", axis=1)
y = data_shuffled['Outcome']
# An explicit random_state makes the split independent of how much of the
# global NumPy RNG earlier cells have consumed (cell re-runs, reordering).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)
# Sanity check: no rows lost or duplicated by the split.
assert len(x_train) + len(x_test) == len(data)

# We are going to train our model on 4 algorithms
## 1.Logistic Regression
## 2.KNN
## 3.Random Forest Classifier
## 4.Support Vector Machine

In [None]:
## Fit four baseline classifiers and record each one's accuracy on the test split
from sklearn.linear_model import LogisticRegression

# Order matters: the models are fitted one after another in this sequence.
baseline_estimators = [
    ("log_reg", LogisticRegression(random_state=0)),   # Logistic Regression
    ("knn", KNeighborsClassifier()),                   # KNN
    ("clf", RandomForestClassifier()),                 # Random forest classifier
    ("svm", SVC()),                                    # Support Vector Machine
]

baseline_accuracy = {}
for short_name, estimator in baseline_estimators:
    estimator.fit(x_train, y_train)
    ## Evaluating the model
    baseline_accuracy[short_name] = estimator.score(x_test, y_test)

# The comparison cell reads these names as plain accuracy floats.
log_reg = baseline_accuracy["log_reg"]
knn = baseline_accuracy["knn"]
clf = baseline_accuracy["clf"]
svm = baseline_accuracy["svm"]

## Let's visualize the test-set accuracy of all the models


In [None]:
# One-row table (index "accuracy") with one column per model, then a bar chart
# of the transposed table so each model gets its own bar.
accuracy_by_model = {
    "Logistic Regression": log_reg,
    "KNN": knn,
    "Random Forest Classifier": clf,
    "Support Vector Machine": svm,
}
model_compare = pd.DataFrame(accuracy_by_model, index=["accuracy"])
model_compare.transpose().plot.bar(figsize=(15, 10));

In [None]:
# Show the one-row accuracy table (one column per model) as the cell output.
model_compare

## Hyperparameter Tuning using GridSearchCV


In [None]:
# Search space: 30 log-spaced values of C between 1e-4 and 1e4, liblinear solver.
log_reg_grid = dict(
    C=np.logspace(-4, 4, 30),
    solver=["liblinear"],
)

# Set up the grid search with 5-fold cross-validation.
gs_log_reg = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=log_reg_grid,
    cv=5,
    verbose=True,
)

# Fit the grid search on the training split, then score on the test split.
gs_log_reg.fit(x_train, y_train)
score = gs_log_reg.score(x_test, y_test)
print(score * 100)

## Using Grid Search CV we have increased the accuracy by up to 2.5%.

## The best model is Logistic Regression, with 83% accuracy

# Evaluate the model

In [None]:
# Predict test-split labels with the fitted grid search; later evaluation
# cells compare `y_preds` against `y_test`.
y_preds = gs_log_reg.predict(x_test)
# Bare expression so the notebook displays the prediction array.
y_preds

## Let's see the confusion matrix, accuracy score, classification report, and ROC curve.

## confusion matrix


In [None]:
import seaborn as sns
sns.set(font_scale=2)

# The predictions are stored in `y_preds`; there is no `y_pred` in this
# notebook, so referencing it raises NameError.
test_confusion = confusion_matrix(y_test, y_preds)
sns.heatmap(test_confusion, annot=True, cbar=False, fmt='g')
# confusion_matrix puts true labels on rows and predicted labels on columns;
# the heatmap draws columns along x and rows along y.
plt.xlabel("Predicted label")
plt.ylabel("True label");

## accuracy score


In [None]:
from sklearn.metrics import accuracy_score
# Fraction of test rows predicted correctly. The predictions are stored in
# `y_preds`; `y_pred` is never defined and would raise NameError.
print(accuracy_score(y_test, y_preds))


## Classification Report


In [None]:
from sklearn.metrics import classification_report
# Text report comparing the test labels with the grid-search predictions.
report_text = classification_report(y_test, y_preds)
print(report_text)

## ROC Curve


In [None]:
# ROC curve of the tuned model on the test split.
# plot_roc_curve was removed in scikit-learn 1.2; RocCurveDisplay.from_estimator
# is its replacement and takes the same (estimator, X, y) arguments.
RocCurveDisplay.from_estimator(gs_log_reg, x_test, y_test);

## Save and Load the model


In [None]:
import pickle

MODEL_PATH = "Diabetes.pkl"

# Save trained model to file. `with` closes the handle (and flushes the data)
# even if pickling fails.
with open(MODEL_PATH, "wb") as model_file:
    pickle.dump(gs_log_reg, model_file)

# NOTE(security): pickle.load can execute arbitrary code. Only load files this
# notebook wrote itself or that come from a trusted source.
with open(MODEL_PATH, "rb") as model_file:
    loaded_model = pickle.load(model_file)

# Round-trip check: the reloaded model must predict exactly like the original.
assert (loaded_model.predict(x_test) == gs_log_reg.predict(x_test)).all()
loaded_model.score(x_test, y_test)