<a href="https://colab.research.google.com/github/Garima918/Diabetes-Prediction-based-on-Binary-Classification/blob/main/Diabetes_Prediction_using_Binary_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Dataset Loading**

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (this call only works inside Google Colab).
drive.mount('/content/drive')
# Optional: to change the working directory afterwards, uncomment the lines
# below and pass the target directory to os.chdir().
#import os
#os.chdir()
#print(os.getcwd())

**Importing Required Libraries**

In [None]:
import pandas as pd
import sklearn as sk
# Read the diabetes CSV from the /content/drive mount point into a DataFrame.
# The file could equally be read from a local path or fetched from a dataset
# source such as Kaggle.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Colab Notebooks/Datasets/Diabetes_dataset.csv'
)

**Describing Dataset**

In [None]:
# df.info() writes its report straight to stdout and returns None, so it must
# not be wrapped in print() (that would emit an extra "None" line).
# If the total entry count equals the non-null count of every column, the
# dataset contains no missing (NaN) values.
df.info()
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
print(df.describe())

**Data Visualization**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Overlaid histograms (with KDE curves) of Glucose, Age and BMI, one colour each.
# More dataframe columns can be added to the list (extend the palette to match),
# and the remaining parameters can be tuned as required.
sns.histplot(
    [df['Glucose'], df['Age'], df['BMI']],
    palette=['orange', 'blue', 'green'],
    bins=50,
    kde=True,
    alpha=0.8,
)

In [None]:
# Pairwise correlation between the dataframe's columns, shown as an annotated
# heatmap. Other colormaps that can be used: 'bwr', 'Reds', 'Oranges', etc.
# (matplotlib colormap names are case-sensitive).
k = df.corr()
sns.heatmap(data=k, cmap='Blues', annot=True)

In [None]:
# Scatter-plot matrix of the dataframe's columns, coloured by the 'Outcome' label.
sns.pairplot(data=df, hue='Outcome')

**Dataset Splitting**

In [None]:
# Target vector: the 'Outcome' column.
y = df['Outcome']
# Feature matrix: every column except 'Outcome'.
x = df.drop(columns='Outcome')

In [None]:
from sklearn.model_selection import train_test_split
# pmf = "performance metrics file": an initially empty dataframe that collects
# one column of scores per trained model, so the models can be compared easily.
pmf = pd.DataFrame()

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=9,
)

**Data Preprocessing**

**Handling Class Imbalance**

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Balance the classes by randomly duplicating minority-class rows.
# Only the TRAINING split is resampled: the train/test split has already been
# taken, so resampling the full x/y would have no effect on the models, and the
# test split must keep its original rows and class distribution so that the
# reported metrics stay honest.
# (imblearn.over_sampling.SMOTE(random_state=9) can be swapped in as an
# alternative resampler.)
ros = RandomOverSampler(random_state=9)
x_train, y_train = ros.fit_resample(x_train, y_train)

**Data Scaling**

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the features the models actually train and predict on.
# The scaler is fitted on the training split only and its mean/std are then
# reused for the test split, so no test-set statistics leak into training.
data_scaler = StandardScaler()
data_rescaled = data_scaler.fit_transform(x_train)
x_train = pd.DataFrame(data_rescaled, columns=x_train.columns, index=x_train.index)
x_test = pd.DataFrame(
    data_scaler.transform(x_test), columns=x_test.columns, index=x_test.index
)
# Show only the first rows instead of dumping the whole frame.
print(x_train.head())

**Performance Measures**


In [None]:
#Importing modules for Performance Metrics
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve,auc
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Row labels for the per-model score table. The order must match the order in
# which each model cell lists its scores. See README.md on GitHub for a fuller
# explanation of these metrics.
metrics_indicator = [
    'Precision',
    'Accuracy',
    'Recall',
    'F1 Score',
    'ROC AUC',
]

**Fitting Logistic Regression Model**

In [None]:
from sklearn.linear_model import LogisticRegression
# Train a logistic-regression classifier and evaluate it on the held-out split.
# max_iter is raised above the default of 100 so the solver is less likely to
# stop before converging when the features are on very different scales.
log_r = LogisticRegression(max_iter=1000)
log_r.fit(x_train, y_train)
log_r_pred = log_r.predict(x_test)
# Score for the class in log_r.classes_[1] (assumed to be the positive label 1).
# ROC analysis needs continuous scores: feeding it hard 0/1 predictions
# collapses the curve to a single operating point.
log_r_proba = log_r.predict_proba(x_test)[:, 1]

# Threshold-based metrics use the hard predictions.
log_r_precision_score = precision_score(y_test, log_r_pred)
log_r_accuracy_score = accuracy_score(y_test, log_r_pred)
log_r_recall_score = recall_score(y_test, log_r_pred)
log_r_f1_score = f1_score(y_test, log_r_pred)
# ROC curve / AUC use the predicted probabilities.
log_r_fpr, log_r_tpr, log_r_thresholds = roc_curve(y_test, log_r_proba)
log_r_roc_auc = auc(log_r_fpr, log_r_tpr)

# One column per model; values are listed in the same order as metrics_indicator.
pmf["Log_R"] = pd.Series(
    [log_r_precision_score, log_r_accuracy_score, log_r_recall_score, log_r_f1_score, log_r_roc_auc],
    index=metrics_indicator,
)

cm = confusion_matrix(y_test, log_r_pred, labels = log_r.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = log_r.classes_)
disp.plot()
plt.title('Logistic Regression')
plt.show()


**Fitting Decision Tree Model**

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Train a decision tree and evaluate it on the held-out split.
# The seed (same value as the train/test split) makes the fitted tree repeatable.
dt = DecisionTreeClassifier(random_state=9)
dt.fit(x_train, y_train)
dt_pred = dt.predict(x_test)
# Score for the class in dt.classes_[1] (assumed to be the positive label 1).
# ROC analysis needs continuous scores rather than hard 0/1 predictions.
dt_proba = dt.predict_proba(x_test)[:, 1]

# Threshold-based metrics use the hard predictions.
dt_precision_score = precision_score(y_test, dt_pred)
dt_accuracy_score = accuracy_score(y_test, dt_pred)
dt_recall_score = recall_score(y_test, dt_pred)
dt_f1_score = f1_score(y_test, dt_pred)
# ROC curve / AUC use the predicted probabilities.
dt_fpr, dt_tpr, dt_thresholds = roc_curve(y_test, dt_proba)
dt_roc_auc = auc(dt_fpr, dt_tpr)

# One column per model; values are listed in the same order as metrics_indicator.
pmf["DT"] = pd.Series(
    [dt_precision_score, dt_accuracy_score, dt_recall_score, dt_f1_score, dt_roc_auc],
    index=metrics_indicator,
)

cm1 = confusion_matrix(y_test, dt_pred,labels = dt.classes_)
disp1 = ConfusionMatrixDisplay(confusion_matrix = cm1, display_labels = dt.classes_)
disp1.plot()
plt.title('Decision Tree')
plt.show()


**Fitting Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Train a random forest and evaluate it on the held-out split.
# The seed (same value as the train/test split) makes the forest repeatable.
rfc = RandomForestClassifier(random_state=9)
rfc.fit(x_train,y_train)
rfc_pred = rfc.predict(x_test)
# Score for the class in rfc.classes_[1] (assumed to be the positive label 1).
# ROC analysis needs continuous scores rather than hard 0/1 predictions.
rfc_proba = rfc.predict_proba(x_test)[:, 1]

# Threshold-based metrics use the hard predictions.
rfc_precision_score = precision_score(y_test, rfc_pred)
rfc_accuracy_score = accuracy_score(y_test, rfc_pred)
rfc_recall_score = recall_score(y_test, rfc_pred)
rfc_f1_score = f1_score(y_test, rfc_pred)
# ROC curve / AUC use the predicted probabilities.
rfc_fpr, rfc_tpr, rfc_thresholds = roc_curve(y_test, rfc_proba)
rfc_roc_auc = auc(rfc_fpr, rfc_tpr)

# One column per model; values are listed in the same order as metrics_indicator.
pmf["RFC"] = pd.Series(
    [rfc_precision_score, rfc_accuracy_score, rfc_recall_score, rfc_f1_score, rfc_roc_auc],
    index=metrics_indicator,
)

cm2 = confusion_matrix(y_test, rfc_pred, labels = rfc.classes_)
disp2 = ConfusionMatrixDisplay(cm2, display_labels = rfc.classes_)
disp2.plot()
plt.title('Random Forest Classifier')
plt.show()


**Fitting KNN Model**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Train a k-nearest-neighbours classifier (default settings) and evaluate it on
# the held-out split.
knn = KNeighborsClassifier()
knn.fit(x_train,y_train)
knn_pred = knn.predict(x_test)
# Score for the class in knn.classes_[1] (assumed to be the positive label 1).
# ROC analysis needs continuous scores rather than hard 0/1 predictions.
knn_proba = knn.predict_proba(x_test)[:, 1]

# Threshold-based metrics use the hard predictions.
knn_precision_score = precision_score(y_test, knn_pred)
knn_accuracy_score = accuracy_score(y_test, knn_pred)
knn_recall_score = recall_score(y_test, knn_pred)
knn_f1_score = f1_score(y_test, knn_pred)
# ROC curve / AUC use the predicted probabilities.
knn_fpr, knn_tpr, knn_thresholds = roc_curve(y_test, knn_proba)
knn_roc_auc = auc(knn_fpr, knn_tpr)

# One column per model; values are listed in the same order as metrics_indicator.
pmf["KNN"] = pd.Series(
    [knn_precision_score, knn_accuracy_score, knn_recall_score, knn_f1_score, knn_roc_auc],
    index=metrics_indicator,
)

cm3 = confusion_matrix(y_test, knn_pred, labels = knn.classes_)
disp = ConfusionMatrixDisplay(cm3, display_labels = knn.classes_)
disp.plot()
plt.title('KNN')
plt.show()


**Fitting Naive Bayes Model**

In [None]:
from sklearn.naive_bayes import GaussianNB
# Train a Gaussian naive Bayes classifier and evaluate it on the held-out split.
nb = GaussianNB()
nb.fit(x_train, y_train)
nb_pred = nb.predict(x_test)
# Score for the class in nb.classes_[1] (assumed to be the positive label 1).
# ROC analysis needs continuous scores rather than hard 0/1 predictions.
nb_proba = nb.predict_proba(x_test)[:, 1]

# Threshold-based metrics use the hard predictions.
nb_precision_score = precision_score(y_test, nb_pred)
nb_accuracy_score = accuracy_score(y_test, nb_pred)
nb_recall_score = recall_score(y_test, nb_pred)
nb_f1_score = f1_score(y_test, nb_pred)
# ROC curve / AUC use the predicted probabilities.
nb_fpr, nb_tpr, nb_thresholds = roc_curve(y_test, nb_proba)
nb_roc_auc = auc(nb_fpr, nb_tpr)

# One column per model; values are listed in the same order as metrics_indicator.
pmf["NB"] = pd.Series(
    [nb_precision_score, nb_accuracy_score, nb_recall_score, nb_f1_score, nb_roc_auc],
    index=metrics_indicator,
)

cm4 = confusion_matrix(y_test, nb_pred, labels = nb.classes_)
disp = ConfusionMatrixDisplay(cm4, display_labels = nb.classes_)
disp.plot()
plt.title("Naive Bayes")
plt.show()


**Fitting Gradient Boosting Model**

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Train a gradient-boosting classifier and evaluate it on the held-out split.
# The seed (same value as the train/test split) makes the fitted model repeatable.
gb = GradientBoostingClassifier(random_state=9)
gb.fit(x_train,y_train)
gb_pred = gb.predict(x_test)
# Score for the class in gb.classes_[1] (assumed to be the positive label 1).
# ROC analysis needs continuous scores rather than hard 0/1 predictions.
gb_proba = gb.predict_proba(x_test)[:, 1]

# Threshold-based metrics use the hard predictions.
gb_precision_score = precision_score(y_test, gb_pred)
gb_accuracy_score = accuracy_score(y_test, gb_pred)
gb_recall_score = recall_score(y_test, gb_pred)
gb_f1_score = f1_score(y_test, gb_pred)
# ROC curve / AUC use the predicted probabilities.
gb_fpr, gb_tpr, gb_thresholds = roc_curve(y_test, gb_proba)
gb_roc_auc = auc(gb_fpr, gb_tpr)

# One column per model; values are listed in the same order as metrics_indicator.
pmf["GB"] = pd.Series(
    [gb_precision_score, gb_accuracy_score, gb_recall_score, gb_f1_score, gb_roc_auc],
    index=metrics_indicator,
)

cm5 = confusion_matrix(y_test, gb_pred, labels = gb.classes_)
disp= ConfusionMatrixDisplay(cm5, display_labels = gb.classes_)
disp.plot()
plt.title("Gradient Boosting")
plt.show()


**Fitting Ada Boost Model**

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# Train an AdaBoost classifier and evaluate it on the held-out split.
# The seed (same value as the train/test split) makes the fitted model repeatable.
ab = AdaBoostClassifier(random_state=9)
ab.fit(x_train, y_train)
ab_pred = ab.predict(x_test)
# Score for the class in ab.classes_[1] (assumed to be the positive label 1).
# ROC analysis needs continuous scores rather than hard 0/1 predictions.
ab_proba = ab.predict_proba(x_test)[:, 1]

# Threshold-based metrics use the hard predictions.
ab_precision_score = precision_score(y_test, ab_pred)
ab_accuracy_score = accuracy_score(y_test, ab_pred)
ab_recall_score = recall_score(y_test, ab_pred)
ab_f1_score = f1_score(y_test, ab_pred)
# ROC curve / AUC use the predicted probabilities.
ab_fpr, ab_tpr, ab_thresholds = roc_curve(y_test, ab_proba)
ab_roc_auc = auc(ab_fpr, ab_tpr)

# One column per model; values are listed in the same order as metrics_indicator.
pmf["AB"] = pd.Series(
    [ab_precision_score, ab_accuracy_score, ab_recall_score, ab_f1_score, ab_roc_auc],
    index=metrics_indicator,
)

cm6 = confusion_matrix(y_test, ab_pred, labels = ab.classes_)
disp= ConfusionMatrixDisplay(cm6, display_labels = ab.classes_)
disp.plot()
plt.title("Ada Boost")
plt.show()


**Setting Parameters to Display the ROC Curves**

In [None]:
# Draw every model's ROC curve on one figure for side-by-side comparison.
plt.figure()

# (display name, line colour, false-positive rates, true-positive rates, AUC)
roc_curves = [
    ('Logistic Regression', 'yellow', log_r_fpr, log_r_tpr, log_r_roc_auc),
    ('Decision Tree', 'red', dt_fpr, dt_tpr, dt_roc_auc),
    ('Random Forest', 'green', rfc_fpr, rfc_tpr, rfc_roc_auc),
    ('KNN', 'cyan', knn_fpr, knn_tpr, knn_roc_auc),
    ('Naive Bayes', 'blue', nb_fpr, nb_tpr, nb_roc_auc),
    ('Gradient Boosting', 'black', gb_fpr, gb_tpr, gb_roc_auc),
    ('Ada Boost', 'grey', ab_fpr, ab_tpr, ab_roc_auc),
]
for model_name, line_color, fpr, tpr, roc_auc in roc_curves:
    plt.plot(fpr, tpr, color = line_color, lw = 2,
             label = '%s (area = %0.3f)' % (model_name, roc_auc))

# Chance-level diagonal. It has to be drawn BEFORE plt.legend() is called,
# otherwise its 'Random Classifier' label is left out of the legend.
plt.plot([0,1], [0,1], linestyle = '--', color = 'black', lw = 2, label = 'Random Classifier')

plt.xlim([0.0,1.0])
plt.ylim([0.0,1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc=0)
plt.show()

**Writing Performance Metrics to File**

In [None]:
# Persist the collected per-model scores (one column per model) as a CSV file
# in the current working directory.
pmf.to_csv(path_or_buf="./Performance_Of_Models_After_Preprocessing.csv")