<a href="https://colab.research.google.com/github/HabiburRahman47/Diabetes-Prediction/blob/main/thesis_on_diabetes_detection_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Necessary Libraries

In [None]:
# Standard library
import warnings

# Data Manipulation and Linear Algebra
import pandas as pd
import numpy as np

# Plots
import seaborn as sns
import matplotlib.pyplot as plt

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Notebook-wide configuration
sns.set_style("darkgrid")

# ignore warning messages
warnings.filterwarnings('ignore')

## Loading Data

In [None]:
# Read the Pima Indians diabetes CSV into the notebook-wide `data` frame.
# NOTE(review): the "../input/<dataset>/" path looks like the Kaggle input layout, while the
# badge at the top of the notebook points at Colab — confirm the path for the target runtime.
data = pd.read_csv("../input/pima-indians-diabetes-database/diabetes.csv")

# Data Overview

In [None]:
# Column names, non-null counts and dtypes.
data.info()

In [None]:
# Peek at the first rows of the raw data.
data.head()

In [None]:
# Class balance of the target: absolute counts (left) and percentage share (right).
outcome_labels = ["Healthy", "Diabetic"]   # position 0 -> Outcome == 0, position 1 -> Outcome == 1
outcome_colors = ['#5bde54', "#de5454"]

fig, axes = plt.subplots(figsize=(20, 8), nrows=1, ncols=2)

sns.countplot(x="Outcome", data=data, palette=outcome_colors, ax=axes[0])
axes[0].set_title("Count of Outcome variable")
axes[0].set_ylabel("Count")
# Pin the tick positions before relabelling them; relabelling free-floating ticks is fragile.
axes[0].set_xticks([0, 1])
axes[0].set_xticklabels(outcome_labels)

# sort_index() orders the counts by class value (0, 1) so they line up with outcome_labels;
# value_counts() alone orders by frequency, which only matches while class 0 is the majority.
outcome_counts = data["Outcome"].value_counts().sort_index()
# Draw on axes[1] explicitly instead of relying on pyplot's implicit "current axes".
axes[1].pie(outcome_counts, autopct='%.1f%%', labels=outcome_labels, colors=outcome_colors)
axes[1].set_title("Count of Outcome variable")

plt.show()

## Missing Values

In [None]:
# Summary statistics per column; the `min` row is the one to look at here.
data.describe()

#### As you can see, the minimum value for Glucose, BloodPressure, SkinThickness, Insulin and BMI is 0. A reading of 0 is not physiologically possible for these measurements, which suggests that the zeros stand in for missing values.

In [None]:
# Replace 0 with NaN in the measurement columns so the zeros are counted as missing values.
# (np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.)
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data[zero_as_missing] = data[zero_as_missing].replace(0, np.nan)

In [None]:
# Number of missing (NaN) entries in each column.
data.isnull().sum()

#### To fill these NaN values sensibly, the distribution of each feature needs to be understood in relation to the target.

In [None]:
# Annotated heatmap of the pairwise column correlations.
# The rcParams change is global, so it also applies to figures drawn by later cells.
plt.rcParams.update({"figure.figsize": (10, 8), "figure.dpi": 80})

corr_matrix = data.corr()
sns.heatmap(corr_matrix, annot=True, cmap="viridis")
plt.show()

# Replace Missing Values and EDA

In [None]:
def distributon_plot(x, df=None):
    """Plot the distribution of one feature, split by Outcome class.

    Draws a histogram (left panel) and a KDE curve (right panel) of column
    ``x``, coloured by class, and shows the figure.

    Parameters
    ----------
    x : str
        Name of the column to plot.
    df : pandas.DataFrame, optional
        Frame holding ``x`` and an ``Outcome`` column coded 0 (healthy) /
        1 (diabetic). Defaults to the notebook-level ``data`` frame.
    """
    frame = data if df is None else df

    # Name the classes before plotting so seaborn builds the legend from the data.
    # Overwriting the legend afterwards with a hand-ordered label list only stays
    # correct as long as it happens to match the order in which the artists were drawn.
    class_names = {0: "Healthy", 1: "Diabetic"}
    class_colors = {"Healthy": '#5bde54', "Diabetic": "#de5454"}
    hue_order = list(class_colors)
    labelled = frame.assign(Outcome=frame["Outcome"].map(class_names))

    fig, axes = plt.subplots(figsize=(20, 8), nrows=1, ncols=2)

    sns.histplot(x=x, hue="Outcome", hue_order=hue_order, data=labelled,
                 palette=class_colors, ax=axes[0])
    axes[0].set_title(f"{x} Distribution Histplot")
    axes[0].set_ylabel("Density / Count")

    sns.kdeplot(x=x, hue="Outcome", hue_order=hue_order, data=labelled,
                palette=class_colors, ax=axes[1])
    axes[1].set_title(f"{x} Distribution Kdeplot")
    axes[1].set_ylabel("Density / Count")

    plt.show()

In [None]:
# Gets the median of a column separately for each Outcome class
def median_target(var, df=None):
    """Return the per-class median of ``var``, ignoring missing values.

    Parameters
    ----------
    var : str
        Name of the column to summarise.
    df : pandas.DataFrame, optional
        Frame holding ``var`` and ``Outcome``. Defaults to the notebook-level
        ``data`` frame, so existing ``median_target('Col')`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per ``Outcome`` value, with columns ``Outcome`` and ``var``.
    """
    frame = data if df is None else df
    # Keep only the rows where the feature was actually observed.
    observed = frame.loc[frame[var].notnull(), [var, 'Outcome']]
    return observed.groupby('Outcome')[[var]].median().reset_index()

## Insulin

In [None]:
# Histogram and KDE of Insulin, split by Outcome.
distributon_plot("Insulin")

In [None]:
# Per-class medians of the observed (non-null) Insulin values.
median_target('Insulin')

#### Insulin's medians differ markedly by target class: 102.5 for a healthy person and 169.5 for a diabetic person.

In [None]:
# Fill each missing Insulin value with the median of the observed values in the same
# Outcome class. The medians are computed here rather than hard-coded, so they cannot
# drift out of sync with the data.
# NOTE(review): the fill value is chosen using the target and on the full dataset, before
# the train/test split — this leaks label information into the features and makes the
# reported test accuracy optimistic.
insulin_class_median = data.groupby('Outcome')['Insulin'].transform('median')
data['Insulin'] = data['Insulin'].fillna(insulin_class_median)

## Glucose

In [None]:
# Histogram and KDE of Glucose, split by Outcome.
distributon_plot("Glucose")

In [None]:
# Per-class medians of the observed (non-null) Glucose values.
median_target('Glucose')

In [None]:
# Fill each missing Glucose value with the median of the observed values in the same
# Outcome class, computed from the data instead of hard-coded.
# NOTE(review): imputing with the target, before the train/test split, leaks label
# information into the features.
glucose_class_median = data.groupby('Outcome')['Glucose'].transform('median')
data['Glucose'] = data['Glucose'].fillna(glucose_class_median)

## Skin Thickness

In [None]:
# Histogram and KDE of SkinThickness, split by Outcome.
distributon_plot("SkinThickness")

In [None]:
# Per-class medians of the observed (non-null) SkinThickness values.
median_target("SkinThickness")

In [None]:
# Fill each missing SkinThickness value with the median of the observed values in the
# same Outcome class, computed from the data instead of hard-coded.
# NOTE(review): imputing with the target, before the train/test split, leaks label
# information into the features.
skin_class_median = data.groupby('Outcome')['SkinThickness'].transform('median')
data['SkinThickness'] = data['SkinThickness'].fillna(skin_class_median)

## Blood Pressure

In [None]:
# Histogram and KDE of BloodPressure, split by Outcome.
distributon_plot("BloodPressure")

In [None]:
# Per-class medians of the observed (non-null) BloodPressure values.
median_target('BloodPressure')

In [None]:
# Fill each missing BloodPressure value with the median of the observed values in the
# same Outcome class, computed from the data instead of hard-coded.
# NOTE(review): imputing with the target, before the train/test split, leaks label
# information into the features.
bp_class_median = data.groupby('Outcome')['BloodPressure'].transform('median')
data['BloodPressure'] = data['BloodPressure'].fillna(bp_class_median)

## BMI

In [None]:
# Histogram and KDE of BMI, split by Outcome.
distributon_plot("BMI")

In [None]:
# Per-class medians of the observed (non-null) BMI values.
median_target('BMI')

In [None]:
# Fill each missing BMI value with the median of the observed values in the same
# Outcome class, computed from the data instead of hard-coded.
# NOTE(review): imputing with the target, before the train/test split, leaks label
# information into the features.
bmi_class_median = data.groupby('Outcome')['BMI'].transform('median')
data['BMI'] = data['BMI'].fillna(bmi_class_median)

In [None]:
# Class-wise distributions of the features that were not part of the zero-recoding step.
for feature in ("Age", "Pregnancies", "DiabetesPedigreeFunction"):
    distributon_plot(feature)

## All Features Pair Plot

In [None]:
# Grid of pairwise plots over the columns of `data`, coloured by Outcome.
sns.pairplot(data, hue="Outcome")
plt.show()

### Final Check for Null Values in Data

In [None]:
# After the imputation steps every column should report zero missing values.
data.isnull().sum()

# Prepare Dataset

In [None]:
# Preview the frame that the model inputs are built from.
data.head()

In [None]:
# Feature matrix: every column except the target, as a plain NumPy array.
feature_frame = data.drop(columns="Outcome")
X = feature_frame.values

# Target vector, kept as a pandas Series.
y = data["Outcome"]

## Splitting the dataset into the Training set and Test set

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Feature Scaling - Scaling Data for Some Models

In [None]:
# Learn the scaling parameters from the training rows only, then apply the same
# transform to both splits so no test-set statistics leak into training.
# NOTE(review): no cell visible in this notebook reads X_train_scaled / X_test_scaled
# afterwards — confirm whether they are still needed.
sc = StandardScaler()
sc.fit(X_train)

X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)

#### CatBoostClassifier Performed the Best with 90.26% Accuracy and 88.6% CrossVal-Accuracy

# Using KNN, CatBoostClassifier and a Voting Ensemble

In [None]:
# KNN model
# k-NN classifies by distance, so features measured on larger numeric scales would
# dominate the neighbour search if left unscaled. Bundling the scaler with the classifier
# standardises the inputs (the scaler is fitted on the training rows only) and keeps doing
# so whenever this estimator is cloned and refitted, e.g. inside an ensemble.
neigh = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))
neigh.fit(X_train, y_train)
pred = neigh.predict(X_test)

In [None]:
# KNN test accuracy. accuracy_score takes (y_true, y_pred); the value is the same either
# way round, but the documented order keeps this call consistent with the other
# evaluations in this notebook.
score = accuracy_score(y_test, pred)
score

In [None]:
# CatBoost with default hyper-parameters, trained on the unscaled feature matrix.
# verbose=0 only silences the per-iteration training log that would otherwise flood
# the cell output; it does not change the fitted model.
classifier = CatBoostClassifier(verbose=0)
classifier.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

cm = confusion_matrix(y_test, y_pred)

In [None]:
print("Test Accuracy : ", accuracy, "\n")

print("Confusion Matrix \n", cm, "\n")

# Size this figure locally rather than changing the global rcParams for every later plot.
fig, ax = plt.subplots(figsize=(6, 5), dpi=100)
# Reuse the matrix printed above instead of recomputing it.
# scikit-learn's confusion_matrix puts the true classes on rows and predictions on columns.
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
ax.set_title("Confusion Matrix")
plt.show()

In [None]:
# CatBoostClassifier + KNN ensemble
# Each member is named after what it actually is, so lookups such as
# model.named_estimators_ are not misleading.
# NOTE(review): with hard voting and only two members, a 1-vs-1 disagreement is a tie, and
# scikit-learn resolves ties in favour of the lowest class label (here 0). The ensemble
# therefore predicts 1 only when both members do — consider voting='soft' or a third member.
model = VotingClassifier(
    estimators=[('catboost', classifier), ('knn', neigh)],
    voting='hard',
)
model.fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
# Evaluate the voting ensemble on the held-out rows.
# Note: this overwrites y_pred, accuracy and cm from the CatBoost evaluation.
y_pred=model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

cm = confusion_matrix(y_test, y_pred)

In [None]:
# Test accuracy of the voting ensemble.
accuracy

In [None]:
# Confusion matrix of the voting ensemble.
cm

In [None]:
print("Test Accuracy : ", accuracy, "\n")

print("Confusion Matrix \n", cm, "\n")

# Size this figure locally rather than changing the global rcParams.
fig, ax = plt.subplots(figsize=(6, 5), dpi=100)
# Reuse the matrix printed above instead of recomputing it.
# scikit-learn's confusion_matrix puts the true classes on rows and predictions on columns.
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
ax.set_title("Confusion Matrix")
plt.show()

![](http://)

# Credits
#### https://www.kaggle.com/vincentlugat/pima-indians-diabetes-eda-prediction-0-906