In [None]:
#pip install tensorflow

# Importing required libraries:

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
from scipy import stats

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from mlxtend.plotting import plot_confusion_matrix
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC
from sklearn import svm
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.utils import shuffle
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score , classification_report,ConfusionMatrixDisplay,precision_score,recall_score, f1_score,roc_auc_score,roc_curve, balanced_accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
import tensorflow as tf 
tf.random.set_seed(3)
from tensorflow import keras

# The dataset:

In [None]:
diabetes_data=pd.read_csv("Diabetes_dataset.csv") #loading the dataset

# Data Pre-processing:

In [None]:
diabetes_data.head() #printing first five rows

In [None]:
diabetes_data.shape #checking the shape of the dataframe

In [None]:
diabetes_data['Diabetes'].value_counts()

In [None]:
diabetes_data.describe() #descriptive statistic summary

In [None]:
diabetes_data.isna().sum() #Checking the missing values

In [None]:
diabetes_data.isnull().sum() #checking the null values

In [None]:
diabetes_data.info() #getting the datatypes info

In [None]:
diabetes_data.duplicated().sum() #finding the duplicated values

In [None]:
# Remove exact duplicate rows. drop_duplicates() returns a new frame, so the
# result has to be assigned back; the index is reset so that positional and
# label-based indexing stay aligned after rows are removed.
n_duplicates = diabetes_data.duplicated().sum()
if n_duplicates == 0:
    print('No Duplicated Values')
else:
    diabetes_data = diabetes_data.drop_duplicates().reset_index(drop=True)
    print('Duplicated data has been eliminated')

# EDA & Data visualization:

In [None]:
# Pie chart showing the share of each class in the target column
counts = diabetes_data['Diabetes'].value_counts()
plt.figure(figsize=[7, 5])
plt.pie(
    counts,
    labels=counts.index,
    autopct='%1.1f%%',
    textprops={"color": "azure"},
    colors=['darkblue', 'deepskyblue'],
)
plt.title('Target distribution in the dataset')
plt.show()


In [None]:
#Pairplot to find correlation btn all features:
sns.pairplot(diabetes_data)
plt.show()

In [None]:
#selecting variables that are likely to predict diabetes medically:
#dia_data = diabetes_data[["Age","Sex","HighChol","BMI","Smoker","PhysActivity","PhysHlth","Fruits","Veggies","HvyAlcoholConsump","GenHlth","Stroke","HighBP","Diabetes"]]
#dia_data.head()

In [None]:
# Number of distinct values per column, displayed as a one-column table
unique_values = {
    column: diabetes_data[column].value_counts().shape[0]
    for column in diabetes_data.columns
}

pd.DataFrame(unique_values, index=['unique value count']).T

In [None]:
# Frequency check: one histogram per column of the frame (target included)
feature_cols = list(diabetes_data.columns)
plt.figure(figsize=(25, 35))

# panels are laid out on an 8x5 grid, numbered from 1
for panel, column in enumerate(feature_cols, start=1):
    plt.subplot(8, 5, panel)
    plt.title(column)
    plt.xticks(rotation=90)
    plt.hist(diabetes_data[column], color="darkblue")

plt.tight_layout()

Now dropping the columns with a very small value range: 'HvyAlcoholConsump' and 'Stroke'.

In [None]:
diabetes_data.drop(['HvyAlcoholConsump','Stroke'], axis=1, inplace=True)

In [None]:
#Correlation of other features with Diabetes:
diabetes_data.drop('Diabetes', axis=1).corrwith(diabetes_data.Diabetes).plot(kind='bar', grid=True, figsize=(10, 6), title="Correlation with Diabetes",color="darkblue");

Findings: the variables with a correlation below 0.1 are Sex, Smoker, Fruits and Veggies.

# Correlation matrix:

In [None]:
# check for all possible co-variates:
sns.set(rc = {'figure.figsize':(10,10)})
sns.heatmap(diabetes_data.corr(),vmin=-1, vmax=1, annot = True, fmt='.1g',cmap= 'YlGnBu')

In [None]:
#dropping the variables with low correlations: 
diabetes_data.drop(['Sex','Fruits'], axis=1, inplace=True)

In [None]:
diabetes_data.head()

#narrowed down to 13 possible determinants 
#determine which predictors are more useful

In [None]:
# Count plots of every categorical feature, split by diabetes status
numeric_or_target = ['Age', 'BMI', 'PhysHlth', 'Diabetes']
features = [name for name in diabetes_data.columns if name not in numeric_or_target]
plt.figure(figsize=(42, 24))
plt.suptitle('Diabetes by categorical features', fontsize=24)

# One panel per feature on a 4x4 grid
for position, feature in enumerate(features):
    plt.subplot(4, 4, position + 1)
    axis = sns.countplot(
        data=diabetes_data,
        x=feature,
        hue='Diabetes',
        palette=['darkblue', 'red'],
    )
    # label each bar with its share of all rows in the dataset
    for bar in axis.patches:
        share = (bar.get_height() / diabetes_data.shape[0]) * 100
        axis.annotate(
            '{:.1f}'.format(share) + '%',
            (bar.get_x() + 0.25, bar.get_height() + 0.01)
        )


In [None]:
# Distribution of the numeric variables, one panel per diabetes class.
# sns.displot is a figure-level function that creates its own figure, so a
# preceding plt.figure() call would only produce an extra, empty figure.
sns.displot(data=diabetes_data, x='BMI', col='Diabetes', color='darkblue')
sns.displot(data=diabetes_data, x='Age', col='Diabetes', color='darkblue');

# Feature selection:

In [None]:
#Features selection -step 1
#1. Define X,y
Y = (diabetes_data['Diabetes']).astype(int)
X = diabetes_data.loc[:, diabetes_data.columns != 'Diabetes']  # everything except "Diabetes"

In [None]:
#step 2: rank the features by the importances of an extra-trees ensemble
# (seeded so that the ranking is the same on every run)
model = ExtraTreesClassifier(random_state=42)
model.fit(X, Y)
print(model.feature_importances_)

#plot the six most important features for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
plt.figure(figsize=(8, 6))
feat_importances.nlargest(6).plot(kind='barh', color='darkblue')
plt.show()

In [None]:
diabetes_data.head()

In [None]:
diabetes_data.tail()

# Splitting data into train data & test data:

In [None]:
Y = (diabetes_data['Diabetes']).astype(int)
# Features are every column except the target. The target must not remain in X,
# otherwise the label leaks into the scaling, the PCA and every model trained
# on the result.
X = diabetes_data.loc[:, diabetes_data.columns != 'Diabetes']

In [None]:
S= StandardScaler() 

In [None]:
S.fit(X) #standardising the features

In [None]:
Stdz_data=S.transform(X) #transforming the features

In [None]:
print(Stdz_data)

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=10)

In [None]:
pca.fit(Stdz_data)

In [None]:
x_pca=pca.transform(Stdz_data)

In [None]:
Stdz_data.shape

In [None]:
x_pca.shape

In [None]:
Stdz_data

In [None]:
x_pca

In [None]:
X= x_pca

In [None]:
Y= diabetes_data['Diabetes'] 

In [None]:
# 60/40 hold-out split, stratified on the target and seeded so that the split
# (and every score derived from it) is reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.4, stratify=Y, random_state=42
)

In [None]:
print(X.shape,X_train.shape,X_test.shape)

In [None]:
print(X)

In [None]:
print(Y)

In [None]:
print(type(Y))

In [None]:
# apply Linear Discriminant Analysis
lda = LinearDiscriminantAnalysis(n_components=1)
X_train = lda.fit_transform(X_train, Y_train)
X_test = lda.transform(X_test)

# Model building and testing:


# KNN:

In [None]:
knn=KNeighborsClassifier()
knn.fit(X_train, Y_train)

In [None]:
pred_knn = knn.predict(X_test)
pred_knn

In [None]:
def train_evaluate_modelX1(Y_test, y_pred=None):
    """Score the default KNN predictions against ``Y_test``.

    Nothing is trained here; the function only evaluates existing predictions.

    Args:
        Y_test: True binary class labels.
        y_pred: Predicted labels to evaluate. When omitted, the notebook-level
            ``pred_knn`` array is used.

    Returns:
        A one-row DataFrame with the columns accuracy, f1_score, precision,
        recall, balanced_accuracy and auc.
    """
    predictions = pred_knn if y_pred is None else y_pred

    #compute metrics for evaluation
    accuracy = accuracy_score(Y_test, predictions)
    f1 = f1_score(Y_test, predictions)
    precision = precision_score(Y_test, predictions)
    recall = recall_score(Y_test, predictions)
    balanced_accuracy = balanced_accuracy_score(Y_test, predictions)
    # AUC is computed from hard labels rather than scores, so for a binary
    # target it takes the same value as balanced accuracy.
    auc = roc_auc_score(Y_test, predictions)

    #create a dataframe to visualize the results
    eval_df = pd.DataFrame(
        [[accuracy, f1, precision, recall, balanced_accuracy, auc]],
        columns=['accuracy', 'f1_score', 'precision', 'recall', 'balanced_accuracy', 'auc'],
    )
    return eval_df

In [None]:
print(classification_report(Y_test,pred_knn ))

In [None]:
# Calculate the accuracy score for the default KNN model
accuracy_knn = accuracy_score(Y_test, pred_knn)
print("Accuracy score for default KNN model:", accuracy_knn)

In [None]:
R1 = train_evaluate_modelX1(Y_test)
R1.index = ['K Nearest Neighbors - Method 1']
R1.style.background_gradient(cmap = sns.color_palette("blend:darkblue,deepskyblue", as_cmap=True))

# Hyperparameter tuning for K Nearest Neighbors


In [None]:
from sklearn.model_selection import GridSearchCV
# defining parameter range
param_grid = {'n_neighbors': [1,3,5,7,9,11,13,15,17,19],  #odd numbers because there are 2 classes in target coulmn
              'weights': ['distance', 'uniform']}  
gridKNN = GridSearchCV(KNeighborsClassifier(), param_grid, refit = True, verbose = 3)
  
# fitting the model for grid search
gridKNN.fit(X_train, Y_train)

In [None]:
print(gridKNN.best_params_)

# KNN with best parameters:

In [None]:
# Refit KNN with the hyperparameters selected by the grid search (gridKNN)
# instead of hard-coded values that may not match the current search result.
knn2 = KNeighborsClassifier(**gridKNN.best_params_)
knn2.fit(X_train, Y_train)
pred_knn2 = knn2.predict(X_test)
print(classification_report(Y_test, pred_knn2))

# Confusion matrix:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Labels and predictions must describe the same samples; fail loudly instead
# of silently truncating one of them.
assert len(Y_test) == len(pred_knn2), "Y_test and pred_knn2 differ in length"

# Compute the confusion matrix
cm = confusion_matrix(Y_test, pred_knn2)

# Create a ConfusionMatrixDisplay object
disp = ConfusionMatrixDisplay(confusion_matrix=cm)

# Draw on an explicit 6x6 inch axes. Calling plt.figure() first would leave an
# empty figure behind, because disp.plot() otherwise creates its own.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(cmap=plt.cm.Blues, ax=ax)

# Switch the cell gridlines off explicitly (grid(which='major') without a
# visibility argument only toggles the current state).
ax.grid(False)

# Show the plot
plt.show()


# Accuracy score for KNN with the best parameters:

In [None]:
accuracy_knn2 = accuracy_score(Y_test, pred_knn2)
print("Accuracy score for KNN model with best parameters:", accuracy_knn2)

# Cross-validation using scikit

In [None]:
from sklearn.model_selection import KFold

# 5-fold cross-validation of the tuned KNN on the full feature matrix X.
# NOTE: X_train/X_test/Y_train/Y_test and `predictions` are overwritten on
# every iteration, so after this cell they hold the LAST fold; the cells that
# follow read them in that state.
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # seeded for reproducible folds

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    # KFold yields positions, so the label Series is indexed positionally
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
    knn2 = KNeighborsClassifier(n_neighbors = 19, weights= 'distance')
    knn2.fit(X_train, Y_train)
    predictions = knn2.predict(X_test)
    # classification_report expects (y_true, y_pred); swapping them swaps
    # precision and recall in the printed table
    print(classification_report(Y_test, predictions))

In [None]:
accuracy_knn3 = accuracy_score(Y_test, predictions)
print("Accuracy score for KNN model with best parameters:", accuracy_knn3)

# Evaluation metrics for KNN:

In [None]:
def train_evaluate_model1(Y_test, y_pred=None):
    """Score a set of predictions against ``Y_test``.

    Nothing is trained here; the function only evaluates existing predictions.

    Args:
        Y_test: True binary class labels.
        y_pred: Predicted labels to evaluate. When omitted, the notebook-level
            ``predictions`` variable is read, so the result then depends on
            which cell assigned it last.

    Returns:
        A one-row DataFrame with the columns accuracy, f1_score, precision,
        recall, balanced_accuracy and auc.
    """
    scored = predictions if y_pred is None else y_pred

    #compute metrics for evaluation
    accuracy = accuracy_score(Y_test, scored)
    f1 = f1_score(Y_test, scored)
    precision = precision_score(Y_test, scored)
    recall = recall_score(Y_test, scored)
    balanced_accuracy = balanced_accuracy_score(Y_test, scored)
    # AUC is computed from hard labels rather than scores, so for a binary
    # target it takes the same value as balanced accuracy.
    auc = roc_auc_score(Y_test, scored)

    #create a dataframe to visualize the results
    eval_df = pd.DataFrame(
        [[accuracy, f1, precision, recall, balanced_accuracy, auc]],
        columns=['accuracy', 'f1_score', 'precision', 'recall', 'balanced_accuracy', 'auc'],
    )
    return eval_df

In [None]:
results = train_evaluate_model1(Y_test)
results.index = ['K Nearest Neighbors - Method 1']
results.style.background_gradient(cmap = sns.color_palette("blend:darkblue,deepskyblue", as_cmap=True))

In [None]:
# Headline metrics for the current `predictions`
accuracy, f1, precision, recall = (
    scorer(Y_test, predictions)
    for scorer in (accuracy_score, f1_score, precision_score, recall_score)
)

# Long-format table: one row per metric
metrics_df = pd.DataFrame({
    'Metric': ['Accuracy', 'F1-Score', 'Precision', 'Recall'],
    'Value': [accuracy, f1, precision, recall],
})

# Bar chart of the four metrics
plt.figure(figsize=(8, 5))
plt.bar(metrics_df['Metric'], metrics_df['Value'], color='darkblue')
plt.xlabel('Metric')
plt.ylabel('Value')
plt.title('Evaluation Metrics for K-Nearest Neighbors')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
%matplotlib inline
fig, ax = plt.subplots(figsize=(5, 4))
fpr, tpr, thresholds = roc_curve(Y_test, predictions)

# Calculate the AUC
auc = roc_auc_score(Y_test, predictions)

# Plot the ROC curve
plt.plot(fpr, tpr, label='(AUC = %0.2f)' % auc)

# Print the AUC
print('AUC:', auc)

# Show the plot
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
print("")
print('False Positive Rates:', fpr)
print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
print('True Positive Rates:', tpr)

# Random Forest:

In [None]:
RF=RandomForestClassifier()
RF.fit(X_train, Y_train)

In [None]:
pred_RF = RF.predict(X_test)
pred_RF

In [None]:
print(classification_report(Y_test, pred_RF))

In [None]:
# Calculate the accuracy score for the default rf model
accuracy_RF = accuracy_score(Y_test, pred_RF)
print("Accuracy score for default Random Forest model:", accuracy_RF)

In [None]:
def train_evaluate_model2(Y_test, y_pred=None):
    """Score the default random forest predictions against ``Y_test``.

    Nothing is trained here; the function only evaluates existing predictions.

    Args:
        Y_test: True binary class labels.
        y_pred: Predicted labels to evaluate. When omitted, the notebook-level
            ``pred_RF`` array is used.

    Returns:
        A one-row DataFrame with the columns accuracy, f1_score, precision,
        recall, balanced_accuracy and auc.
    """
    predictions = pred_RF if y_pred is None else y_pred

    #compute metrics for evaluation
    accuracy = accuracy_score(Y_test, predictions)
    f1 = f1_score(Y_test, predictions)
    precision = precision_score(Y_test, predictions)
    recall = recall_score(Y_test, predictions)
    balanced_accuracy = balanced_accuracy_score(Y_test, predictions)
    # AUC is computed from hard labels rather than scores, so for a binary
    # target it takes the same value as balanced accuracy.
    auc = roc_auc_score(Y_test, predictions)

    #create a dataframe to visualize the results
    eval_df = pd.DataFrame(
        [[accuracy, f1, precision, recall, balanced_accuracy, auc]],
        columns=['accuracy', 'f1_score', 'precision', 'recall', 'balanced_accuracy', 'auc'],
    )
    return eval_df

# Tuning for RF

In [None]:
from sklearn.model_selection import GridSearchCV
# defining parameter range
# ('auto' is intentionally not listed: for classifiers it was an alias of
# 'sqrt', and current scikit-learn releases reject the value)
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_features': ['sqrt', 'log2'],
    'criterion' :['gini', 'entropy']
}

gridrf = GridSearchCV(RandomForestClassifier(), param_grid, refit = True, verbose = 5)

# fitting the model for grid search
gridrf.fit(X_train, Y_train)


In [None]:
print(gridrf.best_params_)

In [None]:
# Refit the random forest with the hyperparameters selected by the grid search
# (gridrf) instead of hard-coded values that may not match the current result.
rf2 = RandomForestClassifier(**gridrf.best_params_)
rf2.fit(X_train, Y_train)
pred_rf2 = rf2.predict(X_test)
print(classification_report(Y_test, pred_rf2))

In [None]:
accuracy_rf2 = accuracy_score(Y_test,pred_rf2)
print("Accuracy score for RF model with best parameters:", accuracy_rf2)

In [None]:
resultsRF = train_evaluate_model2(Y_test)
resultsRF.index = ['Random Forest - Method 2']
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows instead
results = pd.concat([results, resultsRF])
resultsRF.style.background_gradient(cmap = sns.color_palette("blend:darkblue,deepskyblue", as_cmap=True))

In [None]:
# Labels and predictions must describe the same samples; fail loudly instead
# of silently truncating one of them.
assert len(Y_test) == len(pred_rf2), "Y_test and pred_rf2 differ in length"

# Compute the confusion matrix
cm = confusion_matrix(Y_test, pred_rf2)

# Create a ConfusionMatrixDisplay object
disp = ConfusionMatrixDisplay(confusion_matrix=cm)

# Draw on an explicit 6x6 inch axes. Calling plt.figure() first would leave an
# empty figure behind, because disp.plot() otherwise creates its own.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(cmap=plt.cm.Blues, ax=ax)

# Switch the cell gridlines off explicitly (grid(which='major') without a
# visibility argument only toggles the current state).
ax.grid(False)

# Show the plot
plt.show()


In [None]:
# Headline metrics for the tuned random forest predictions
accuracy, f1, precision, recall = (
    scorer(Y_test, pred_rf2)
    for scorer in (accuracy_score, f1_score, precision_score, recall_score)
)

# Long-format table: one row per metric
metrics_df = pd.DataFrame({
    'Metric': ['Accuracy', 'F1-Score', 'Precision', 'Recall'],
    'Value': [accuracy, f1, precision, recall],
})

# Bar chart of the four metrics
plt.figure(figsize=(8, 5))
plt.bar(metrics_df['Metric'], metrics_df['Value'], color='darkblue')
plt.xlabel('Metric')
plt.ylabel('Value')
plt.title('Evaluation Metrics for Random Forest')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
%matplotlib inline
fig, ax = plt.subplots(figsize=(5, 4))
fpr, tpr, thresholds = roc_curve(Y_test, pred_rf2)

# Calculate the AUC
auc = roc_auc_score(Y_test, pred_rf2)

# Plot the ROC curve
plt.plot(fpr, tpr, label='(AUC = %0.2f)' % auc)

# Print the AUC
print('AUC:', auc)

# Show the plot
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
print("")
print('False Positive Rates:', fpr)
print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
print('True Positive Rates:', tpr)

# Naive Bayes:

In [None]:
GNB=GaussianNB()
GNB.fit(X_train, Y_train)

In [None]:
pred_GNB = GNB.predict(X_test)
pred_GNB

In [None]:
def train_evaluate_modelX3(Y_test, y_pred=None):
    """Score the default Gaussian Naive Bayes predictions against ``Y_test``.

    Nothing is trained here; the function only evaluates existing predictions.

    Args:
        Y_test: True binary class labels.
        y_pred: Predicted labels to evaluate. When omitted, the notebook-level
            ``pred_GNB`` array is used.

    Returns:
        A one-row DataFrame with the columns accuracy, f1_score, precision,
        recall, balanced_accuracy and auc.
    """
    predictions = pred_GNB if y_pred is None else y_pred

    #compute metrics for evaluation
    accuracy = accuracy_score(Y_test, predictions)
    f1 = f1_score(Y_test, predictions)
    precision = precision_score(Y_test, predictions)
    recall = recall_score(Y_test, predictions)
    balanced_accuracy = balanced_accuracy_score(Y_test, predictions)
    # AUC is computed from hard labels rather than scores, so for a binary
    # target it takes the same value as balanced accuracy.
    auc = roc_auc_score(Y_test, predictions)

    #create a dataframe to visualize the results
    eval_df = pd.DataFrame(
        [[accuracy, f1, precision, recall, balanced_accuracy, auc]],
        columns=['accuracy', 'f1_score', 'precision', 'recall', 'balanced_accuracy', 'auc'],
    )
    return eval_df

In [None]:
print(classification_report(Y_test, pred_GNB))

In [None]:
# Calculate the accuracy score for the default Gaussian Naive Bayes model
accuracy_GNB = accuracy_score(Y_test, pred_GNB)
print("Accuracy score for default Gaussian Naive Bayes model:", accuracy_GNB)

In [None]:
R3 = train_evaluate_modelX3(Y_test)
R3.index = ['Gaussian Naive Bayes - Method 3']
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows instead
R1 = pd.concat([R1, R3])
R3.style.background_gradient(cmap = sns.color_palette("blend:darkblue,deepskyblue", as_cmap=True))

# Hyperparameter tuning for Naive Bayes

In [None]:
from sklearn.model_selection import GridSearchCV
# defining parameter range
param_grid = {'var_smoothing': np.logspace(0, -9, num=100)}

gridGNB = GridSearchCV(GaussianNB(), param_grid, refit = True, verbose = 5)
  
# fitting the model for grid search
gridGNB.fit(X_train, Y_train)


In [None]:
print(gridGNB.best_params_)

In [None]:
# Refit Gaussian Naive Bayes with the hyperparameters selected by the grid
# search (gridGNB) instead of a hard-coded value that may not match the
# current search result.
GNB2 = GaussianNB(**gridGNB.best_params_)
GNB2.fit(X_train, Y_train)
pred_GNB2 = GNB2.predict(X_test)
print(classification_report(Y_test, pred_GNB2))

In [None]:
accuracy_GNB2 = accuracy_score(Y_test,pred_GNB2)
print("Accuracy score for nb model with best parameters:", accuracy_GNB2)

In [None]:
# Labels and predictions must describe the same samples; fail loudly instead
# of silently truncating one of them.
assert len(Y_test) == len(pred_GNB2), "Y_test and pred_GNB2 differ in length"

# Compute the confusion matrix
cm = confusion_matrix(Y_test, pred_GNB2)

# Create a ConfusionMatrixDisplay object
disp = ConfusionMatrixDisplay(confusion_matrix=cm)

# Draw on an explicit 6x6 inch axes. Calling plt.figure() first would leave an
# empty figure behind, because disp.plot() otherwise creates its own.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(cmap=plt.cm.Blues, ax=ax)

# Switch the cell gridlines off explicitly (grid(which='major') without a
# visibility argument only toggles the current state).
ax.grid(False)

# Show the plot
plt.show()


In [None]:
def train_evaluate_model3(Y_test, y_pred=None):
    """Score the tuned Gaussian Naive Bayes predictions against ``Y_test``.

    Nothing is trained here; the function only evaluates existing predictions.

    Args:
        Y_test: True binary class labels.
        y_pred: Predicted labels to evaluate. When omitted, the notebook-level
            ``pred_GNB2`` array is used.

    Returns:
        A one-row DataFrame with the columns accuracy, f1_score, precision,
        recall, balanced_accuracy and auc.
    """
    predictions = pred_GNB2 if y_pred is None else y_pred

    #compute metrics for evaluation
    accuracy = accuracy_score(Y_test, predictions)
    f1 = f1_score(Y_test, predictions)
    precision = precision_score(Y_test, predictions)
    recall = recall_score(Y_test, predictions)
    balanced_accuracy = balanced_accuracy_score(Y_test, predictions)
    # AUC is computed from hard labels rather than scores, so for a binary
    # target it takes the same value as balanced accuracy.
    auc = roc_auc_score(Y_test, predictions)

    #create a dataframe to visualize the results
    eval_df = pd.DataFrame(
        [[accuracy, f1, precision, recall, balanced_accuracy, auc]],
        columns=['accuracy', 'f1_score', 'precision', 'recall', 'balanced_accuracy', 'auc'],
    )
    return eval_df

In [None]:
resultsGNB = train_evaluate_model3(Y_test)
resultsGNB.index = ['Gaussian Naive Bayes - Method 3']
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows instead
results = pd.concat([results, resultsGNB])
resultsGNB.style.background_gradient(cmap = sns.color_palette("blend:darkblue,deepskyblue", as_cmap=True))

In [None]:
# Headline metrics for the tuned Naive Bayes predictions
predictions = pred_GNB2
accuracy, f1, precision, recall = (
    scorer(Y_test, predictions)
    for scorer in (accuracy_score, f1_score, precision_score, recall_score)
)

# Long-format table: one row per metric
metrics_df = pd.DataFrame({
    'Metric': ['Accuracy', 'F1-Score', 'Precision', 'Recall'],
    'Value': [accuracy, f1, precision, recall],
})

# Bar chart of the four metrics
plt.figure(figsize=(8, 5))
plt.bar(metrics_df['Metric'], metrics_df['Value'], color='darkblue')
plt.xlabel('Metric')
plt.ylabel('Value')
plt.title('Evaluation Metrics for Naive Bayes')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
%matplotlib inline
predictions = pred_GNB2
fig, ax = plt.subplots(figsize=(5, 4))
fpr, tpr, thresholds = roc_curve(Y_test, predictions)

# Calculate the AUC
auc = roc_auc_score(Y_test, predictions)

# Plot the ROC curve
plt.plot(fpr, tpr, label='(AUC = %0.2f)' % auc)

# Print the AUC
print('AUC:', auc)

# Show the plot
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
print("")
print('False Positive Rates:', fpr)
print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
print('True Positive Rates:', tpr)

# SVM:


In [None]:
svc = svm.SVC()

In [None]:
svc.fit(X_train, Y_train)

In [None]:
pred_svc = svc.predict(X_test)
pred_svc

In [None]:
print(classification_report(Y_test, pred_svc))

In [None]:
# Calculate the accuracy score for the default SVC model
accuracy_svc = accuracy_score(Y_test, pred_svc)
print("Accuracy score for default SVC model:", accuracy_svc)

In [None]:
def train_evaluate_model4(Y_test, y_pred=None):
    """Score the default SVC predictions against ``Y_test``.

    Nothing is trained here; the function only evaluates existing predictions.

    Args:
        Y_test: True binary class labels.
        y_pred: Predicted labels to evaluate. When omitted, the notebook-level
            ``pred_svc`` array is used.

    Returns:
        A one-row DataFrame with the columns accuracy, f1_score, precision,
        recall, balanced_accuracy and auc.
    """
    predictions = pred_svc if y_pred is None else y_pred

    #compute metrics for evaluation
    accuracy = accuracy_score(Y_test, predictions)
    f1 = f1_score(Y_test, predictions)
    precision = precision_score(Y_test, predictions)
    recall = recall_score(Y_test, predictions)
    balanced_accuracy = balanced_accuracy_score(Y_test, predictions)
    # AUC is computed from hard labels rather than scores, so for a binary
    # target it takes the same value as balanced accuracy.
    auc = roc_auc_score(Y_test, predictions)

    #create a dataframe to visualize the results
    eval_df = pd.DataFrame(
        [[accuracy, f1, precision, recall, balanced_accuracy, auc]],
        columns=['accuracy', 'f1_score', 'precision', 'recall', 'balanced_accuracy', 'auc'],
    )
    return eval_df

In [None]:
resultssvc = train_evaluate_model4(Y_test)
resultssvc.index = ['Support Vector Machine - Method 4']
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows instead
results = pd.concat([results, resultssvc])
resultssvc.style.background_gradient(cmap = sns.color_palette("blend:darkblue,deepskyblue", as_cmap=True))

In [None]:
# Labels and predictions must describe the same samples; fail loudly instead
# of silently truncating one of them.
assert len(Y_test) == len(pred_svc), "Y_test and pred_svc differ in length"

# Compute the confusion matrix
cm = confusion_matrix(Y_test, pred_svc)

# Create a ConfusionMatrixDisplay object
disp = ConfusionMatrixDisplay(confusion_matrix=cm)

# Draw on an explicit 6x6 inch axes. Calling plt.figure() first would leave an
# empty figure behind, because disp.plot() otherwise creates its own.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(cmap=plt.cm.Blues, ax=ax)

# Switch the cell gridlines off explicitly (grid(which='major') without a
# visibility argument only toggles the current state).
ax.grid(False)

# Show the plot
plt.show()


In [None]:
# Headline metrics for the default SVC predictions
predictions = pred_svc
accuracy, f1, precision, recall = (
    scorer(Y_test, predictions)
    for scorer in (accuracy_score, f1_score, precision_score, recall_score)
)

# Long-format table: one row per metric
metrics_df = pd.DataFrame({
    'Metric': ['Accuracy', 'F1-Score', 'Precision', 'Recall'],
    'Value': [accuracy, f1, precision, recall],
})

# Bar chart of the four metrics
plt.figure(figsize=(8, 5))
plt.bar(metrics_df['Metric'], metrics_df['Value'])
plt.xlabel('Metric')
plt.ylabel('Value')
plt.title('Evaluation Metrics for SVM')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
%matplotlib inline
predictions = pred_svc
fig, ax = plt.subplots(figsize=(5, 4))
fpr, tpr, thresholds = roc_curve(Y_test, predictions)

# Calculate the AUC
auc = roc_auc_score(Y_test, predictions)

# Plot the ROC curve
plt.plot(fpr, tpr, label='(AUC = %0.2f)' % auc)

# Print the AUC
print('AUC:', auc)

# Show the plot
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
print("")
print('False Positive Rates:', fpr)
print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
print('True Positive Rates:', tpr)

# Tuning for svm

In [None]:
# defining parameter range
param_grid = [
    {'C': [1, 10,], 'kernel': ['linear']},
    {'gamma': [0.001, 0.0001]},
]

 
gridsvc = GridSearchCV(svm.SVC(), param_grid, refit = True, verbose = 3)
  
# fitting the model for grid search
gridsvc.fit(X_train, Y_train)

In [None]:
print(gridsvc.best_params_)

In [None]:
# Refit the SVC with the hyperparameters selected by the grid search (gridsvc)
# instead of a hard-coded value that may not match the current search result.
svc2 = svm.SVC(**gridsvc.best_params_)
svc2.fit(X_train, Y_train)
pred_svc2 = svc2.predict(X_test)
print(classification_report(Y_test, pred_svc2))

In [None]:
# Calculate the accuracy score for the tuned SVC model (svc2)
accuracy_svc2 = accuracy_score(Y_test, pred_svc2)
print("Accuracy score for SVC model with best parameters:", accuracy_svc2)

In [None]:
from sklearn.model_selection import KFold

# 5-fold cross-validation for the SVM section.
# NOTE: X_train/X_test/Y_train/Y_test and `predictions` are overwritten on
# every iteration, so after this cell they hold the LAST fold.
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # seeded for reproducible folds

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    # KFold yields positions, so the label Series is indexed positionally
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
    # this section evaluates the SVM, so the fold model has to be an SVC
    # (a KNN classifier was being fitted here by mistake)
    svc2 = svm.SVC(gamma=0.001)
    svc2.fit(X_train, Y_train)
    predictions = svc2.predict(X_test)
    # classification_report expects (y_true, y_pred)
    print(classification_report(Y_test, predictions))

In [None]:
# Accuracy of the SVC on the fold currently held in Y_test / predictions
accuracy_svc3 = accuracy_score(Y_test, predictions)
print("Accuracy score for SVC model with best parameters:", accuracy_svc3)

# Logistic Regression:

In [None]:
log= LogisticRegression()

In [None]:
log.fit(X_train,Y_train)

In [None]:
pred_log = log.predict(X_test)
pred_log

In [None]:
def train_evaluate_modelX5(Y_test, y_pred=None):
    """Score the default logistic regression predictions against ``Y_test``.

    Nothing is trained here; the function only evaluates existing predictions.

    Args:
        Y_test: True binary class labels.
        y_pred: Predicted labels to evaluate. When omitted, the notebook-level
            ``pred_log`` array is used.

    Returns:
        A one-row DataFrame with the columns accuracy, f1_score, precision,
        recall, balanced_accuracy and auc.
    """
    predictions = pred_log if y_pred is None else y_pred

    #compute metrics for evaluation
    accuracy = accuracy_score(Y_test, predictions)
    f1 = f1_score(Y_test, predictions)
    precision = precision_score(Y_test, predictions)
    recall = recall_score(Y_test, predictions)
    balanced_accuracy = balanced_accuracy_score(Y_test, predictions)
    # AUC is computed from hard labels rather than scores, so for a binary
    # target it takes the same value as balanced accuracy.
    auc = roc_auc_score(Y_test, predictions)

    #create a dataframe to visualize the results
    eval_df = pd.DataFrame(
        [[accuracy, f1, precision, recall, balanced_accuracy, auc]],
        columns=['accuracy', 'f1_score', 'precision', 'recall', 'balanced_accuracy', 'auc'],
    )
    return eval_df

In [None]:
print(classification_report(Y_test, pred_log))

In [None]:
# Calculate the accuracy score for the default Logistic regresssion model
accuracy_log = accuracy_score(Y_test, pred_log)
print("Accuracy score for default Logistic regresssion model:", accuracy_log)

In [None]:
R5 = train_evaluate_modelX5(Y_test)
# this row holds the logistic regression scores, so label it accordingly
R5.index = ['Logistic Regression - Method 5']
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows instead
R1 = pd.concat([R1, R5])
R5.style.background_gradient(cmap = sns.color_palette("blend:darkblue,deepskyblue", as_cmap=True))

# Tuning for Logistic regression using GridSearch:

In [None]:
# defining parameter range
param_grid = {'C': [0.01, 0.1, 1, 10, 100],
               'solver': ['lbfgs', 'sag', 'newton-cg']}

 
gridlog = GridSearchCV(LogisticRegression(), param_grid, refit = True, verbose = 3)
  
# fitting the model for grid search
gridlog.fit(X_train, Y_train)

In [None]:
print(gridlog.best_params_)

In [None]:
# Refit logistic regression with the hyperparameters selected by the grid
# search (gridlog) instead of hard-coded values that may not match the
# current search result.
log2 = LogisticRegression(**gridlog.best_params_)
log2.fit(X_train, Y_train)
pred_log2 = log2.predict(X_test)
print(classification_report(Y_test, pred_log2))

# Cross-validation using scikit

In [None]:
# 5-fold cross-validation of logistic regression.
# NOTE: X_train/X_test/Y_train/Y_test and `predictions` are overwritten on
# every iteration, so after this cell they hold the LAST fold.
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # seeded for reproducible folds

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    # KFold yields positions, so the label Series is indexed positionally
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
    # NOTE(review): C=1 here, while the tuned-model cell uses a different C;
    # confirm which value is intended.
    log2 = LogisticRegression(C= 1,
    solver= 'lbfgs',)
    log2.fit(X_train, Y_train)
    # predict with the model fitted in this fold (the KNN model was being
    # used here by mistake)
    predictions = log2.predict(X_test)
    # classification_report expects (y_true, y_pred)
    print(classification_report(Y_test, predictions))

In [None]:
accuracy_log3 = accuracy_score(Y_test, predictions)
print("Accuracy score for Logistic regresssion model with best parameters:", accuracy_log3)

In [None]:
def train_evaluate_model5(Y_test, y_pred=None):
    """Score a set of predictions against ``Y_test``.

    Nothing is trained here; the function only evaluates existing predictions.

    Args:
        Y_test: True binary class labels.
        y_pred: Predicted labels to evaluate. When omitted, the notebook-level
            ``predictions`` variable is read, so the result then depends on
            which cell assigned it last.

    Returns:
        A one-row DataFrame with the columns accuracy, f1_score, precision,
        recall, balanced_accuracy and auc.
    """
    scored = predictions if y_pred is None else y_pred

    #compute metrics for evaluation
    accuracy = accuracy_score(Y_test, scored)
    f1 = f1_score(Y_test, scored)
    precision = precision_score(Y_test, scored)
    recall = recall_score(Y_test, scored)
    balanced_accuracy = balanced_accuracy_score(Y_test, scored)
    # AUC is computed from hard labels rather than scores, so for a binary
    # target it takes the same value as balanced accuracy.
    auc = roc_auc_score(Y_test, scored)

    #create a dataframe to visualize the results
    eval_df = pd.DataFrame(
        [[accuracy, f1, precision, recall, balanced_accuracy, auc]],
        columns=['accuracy', 'f1_score', 'precision', 'recall', 'balanced_accuracy', 'auc'],
    )
    return eval_df

In [None]:
resultslog = train_evaluate_model5(Y_test)
resultslog.index = ['Logistic regresssion - Method 5']
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows instead
results = pd.concat([results, resultslog])
resultslog.style.background_gradient(cmap = sns.color_palette("blend:darkblue,deepskyblue", as_cmap=True))

In [None]:
# Labels and predictions must describe the same samples; fail loudly instead
# of silently truncating one of them.
assert len(Y_test) == len(predictions), "Y_test and predictions differ in length"

# Compute the confusion matrix
cm = confusion_matrix(Y_test, predictions)

# Create a ConfusionMatrixDisplay object
disp = ConfusionMatrixDisplay(confusion_matrix=cm)

# Draw on an explicit 6x6 inch axes. Calling plt.figure() first would leave an
# empty figure behind, because disp.plot() otherwise creates its own.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(cmap=plt.cm.Blues, ax=ax)

# Switch the cell gridlines off explicitly (grid(which='major') without a
# visibility argument only toggles the current state).
ax.grid(False)

# Show the plot
plt.show()


In [None]:
# Headline metrics for the current `predictions`
accuracy, f1, precision, recall = (
    scorer(Y_test, predictions)
    for scorer in (accuracy_score, f1_score, precision_score, recall_score)
)

# Long-format table: one row per metric
metrics_df = pd.DataFrame({
    'Metric': ['Accuracy', 'F1-Score', 'Precision', 'Recall'],
    'Value': [accuracy, f1, precision, recall],
})

# Bar chart of the four metrics
plt.figure(figsize=(8, 5))
plt.bar(metrics_df['Metric'], metrics_df['Value'])
plt.xlabel('Metric')
plt.ylabel('Value')
plt.title('Evaluation Metrics for Logistic regresssion')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
%matplotlib inline
fig, ax = plt.subplots(figsize=(5, 4))
fpr, tpr, thresholds = roc_curve(Y_test, predictions)

# Calculate the AUC
auc = roc_auc_score(Y_test, predictions)

# Plot the ROC curve
plt.plot(fpr, tpr, label='(AUC = %0.2f)' % auc)

# Print the AUC
print('AUC:', auc)

# Show the plot
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
print("")
print('False Positive Rates:', fpr)
print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
print('True Positive Rates:', tpr)

# Predictive system

In [None]:
X =diabetes_data.drop(columns='Diabetes',axis=1)
Y = diabetes_data['Diabetes']

In [None]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size= 0.4)

In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# 5-fold cross-validation of an L1-regularised logistic regression on the
# X / Y frames built above. Each fold prints its classification report and
# accuracy; the mean and spread over all folds are printed at the end.

# define the number of folds for k-fold cross-validation
n_folds = 5

# shuffled folds with a fixed seed so the partition is reproducible
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# accuracy of every fold, kept so the scores are not lost when the loop moves on
fold_accuracies = []

for train_index, test_index in kf.split(X):
    # kf.split yields positional ROW indices, which .iloc accepts directly
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

    # a fresh model per fold so no fitted state carries over between folds
    log2 = LogisticRegression(C=1, penalty='l1', solver='liblinear')
    log2.fit(X_train, Y_train)

    # predictions on the rows held out for this fold
    Y_pred = log2.predict(X_test)

    print("\nClassification report for logistic regression model in this fold:")
    print(classification_report(Y_test, Y_pred))

    accuracy_log2 = accuracy_score(Y_test, Y_pred)
    fold_accuracies.append(accuracy_log2)
    print("Accuracy score for logistic regression model in this fold:", accuracy_log2)

# Aggregate result of the cross-validation
print("\nMean accuracy across %d folds: %.4f (std %.4f)"
      % (n_folds, np.mean(fold_accuracies), np.std(fold_accuracies)))

# After the loop, log2, X_train/X_test, Y_train/Y_test and Y_pred still refer to
# the LAST fold; the cells below rely on that.

In [None]:
# NOTE(review): X_train_a is not defined anywhere in this section — presumably it
# comes from an earlier cell; confirm it is still the intended frame (the
# surrounding cells work with X_train).
X_train_a.shape

# Accuracy Score

In [None]:
  # accuracy score on the training data
X_train_prediction = log2.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)  

In [None]:
print('Accuracy score of the training data : ', training_data_accuracy)

In [None]:
# accuracy score on the test data

test_data_accuracy = accuracy_score(predictions, Y_test)

In [None]:
print('Accuracy score of the test data : ', test_data_accuracy)

# Making a Predictive System

In [None]:
# One hand-entered record to classify with log2.
# NOTE(review): presumably the values follow the column order of the feature
# frame the model was fitted on — confirm against X.columns.
input_data = (6, 1, 1, 37, 0, 0, 0, 1, 4, 0, 0, 0, 0)

# Turn the tuple into a 1-D array ...
input_data_as_numpy_array = np.array(input_data)

# ... and add a leading axis so it is a single row of shape (1, n_values)
input_data_reshaped = input_data_as_numpy_array[np.newaxis, :]

predictions = log2.predict(input_data_reshaped)
print(predictions)

# Class 0 is reported as "not diabetic", anything else as "diabetic"
verdict = 'The person is not diabetic' if predictions[0] == 0 else 'The person is diabetic'
print(verdict)

# Model Comparison after tuning:

In [None]:
# Shade the results table with a dark-blue -> sky-blue gradient
blue_gradient = sns.color_palette("blend:darkblue,deepskyblue", as_cmap=True)
results.style.background_gradient(cmap=blue_gradient)

# ML Comparison graph

In [None]:
# Bar chart of the accuracy (in percent) of each classical ML model.
# NOTE(review): the scores are hard-coded; presumably they were copied from the
# results table above — confirm they are still current.
labels = ["KNN", "RF","GNB","SVM","LG"]
accuracy_values = [95.01,95.15,84.94,98.86,98.99]

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(labels, accuracy_values, color='darkblue')

# Write each score in the vertical middle of its bar
for bar_position, bar_height in enumerate(accuracy_values):
    ax.text(
        bar_position,
        bar_height / 2,
        str(bar_height),
        ha='center',
        color='gold',
        fontsize=20,
        fontweight='bold',
    )

ax.set_xlabel("Algorithm")
ax.set_ylabel("Accuracy")
ax.set_title("Comparison of ML models")
plt.xticks(rotation=45)
plt.show()

# Deep learning techniques:

# Convolutional Neural Networks (CNNs):

In [None]:
import tensorflow as tf 
# Re-apply the same TensorFlow seed (3) that the notebook's first import cell sets,
# so the deep-learning section starts from a known seed when run on its own.
tf.random.set_seed(3)
from tensorflow import keras

In [None]:
diabetes_data_2=pd.read_csv("Diabetes_dataset.csv") #loading the dataset

In [None]:
#Data splitting
X =diabetes_data_2.drop(columns='Diabetes',axis=1)
Y = diabetes_data_2['Diabetes']

In [None]:
# Reshape the data into a 3D tensor for CNN input
X = np.array(X).reshape(X.shape[0], X.shape[1], 1)

In [None]:
# Split the data into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

In [None]:
# Normalize the features
X_train = X_train.astype("float32")
X_test = X_test.astype("float32")

# Scale BOTH splits by the maximum of the TRAINING split. Dividing the test set
# by its own maximum puts it on a different scale from the data the network is
# trained on, and uses test-set statistics for preprocessing.
# NOTE(review): this is one global maximum over all features; per-feature scaling
# may be more appropriate — confirm.
train_max = X_train.max()
X_train = X_train / train_max
X_test = X_test / train_max

In [None]:
# Define the CNN model
model = keras.Sequential()
#model.add(BatchNormalization())
#model.add(BatchNormalization())
model.add(keras.layers.Conv1D(filters=32, kernel_size=3, activation="relu", input_shape=(X_train.shape[1], 1)))
model.add(keras.layers.MaxPooling1D(pool_size=2))
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(64, activation="relu"))
model.add(keras.layers.Dense(1, activation="sigmoid"))

In [None]:
# Compile the model
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Train the model
model.fit(X_train, Y_train, epochs=30, batch_size=32)

In [None]:
# Evaluate the CNN on the held-out TEST split. Scoring on the training data
# reports how well the network memorised it, not how well it generalises, and
# `accuracy` feeds the model-comparison charts at the end of the notebook.
# model.evaluate returns [loss, accuracy] for the metrics compiled above.
accuracy = model.evaluate(X_test, Y_test, verbose=0)[1]
print('Accuracy: %.2f' % (accuracy*100))

# Recurrent Neural Networks (RNNs) Using LSTM Method

In [None]:
#Data splitting
X =diabetes_data_2.drop(columns='Diabetes',axis=1)
Y = diabetes_data_2['Diabetes']

In [None]:
# Split the data into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

In [None]:
# Define the RNN model
model = keras.Sequential()
#model.add(keras.layers.Embedding(input_dim=1000, output_dim=64))
model.add(keras.layers.LSTM(units=64, return_sequences=True, input_shape=(X_train.shape[1], 1)))
model.add(keras.layers.LSTM(units=32))
model.add(keras.layers.Dense(128, activation='relu'),)
model.add(keras.layers.Dense(1, activation="sigmoid"))

In [None]:
# Compile the model
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Train the model
model.fit(X_train, Y_train, epochs=10, batch_size=32)

In [None]:
# Evaluate the RNN on the held-out TEST split (training accuracy does not
# measure generalisation). model.evaluate returns [loss, accuracy].
_, accuracy2 = model.evaluate(X_test, Y_test, verbose=0)
# Print this model's own score; the original printed `accuracy`, i.e. the CNN's.
# Note: accuracy2 is assigned again by the MLP evaluation cell further down.
print('Accuracy: %.2f' % (accuracy2*100))

# Multilayer Perceptron (MLP)

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error


# Train a small multilayer perceptron to classify the 'Diabetes' label and
# report its error on the held-out test split.
diabetes_data_2 = pd.read_csv("Diabetes_dataset.csv")

X = diabetes_data_2.drop(columns='Diabetes')
Y = diabetes_data_2['Diabetes']

# Split FIRST (80/20, fixed seed) so that no test row influences the scaler.
# The original standardised the full dataset before splitting and then
# standardised the training split a second time.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Standardise with statistics learned from the training split only, and apply
# the same transform to the test split
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train_raw)
X_test_std = scaler.transform(X_test_raw)

# Later cells evaluate the model through the names X_train / X_test, so keep
# those names bound to the standardised arrays the network actually sees.
X_train, X_test = X_train_std, X_test_std

# Define the MLP model: two ReLU hidden layers and one sigmoid output unit,
# which yields a probability for the positive class
model = Sequential()
model.add(Dense(18, input_dim=X_train_std.shape[1], activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy is the matching loss for a sigmoid output on 0/1 labels
# (the original paired a linear output with mean squared error)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train and validate on identically scaled data; the original validated on
# X_test while training on X_train_std
history = model.fit(X_train_std, y_train, validation_data=(X_test_std, y_test), epochs=30, batch_size=32)

# Predicted positive-class probabilities for the test split, shape (n_rows, 1)
y_pred = model.predict(X_test_std)

# Error of the predicted probabilities against the 0/1 labels
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("Mean Squared Error: ", mse)
print("Mean Absolute Error: ", mae)

In [None]:
# Evaluate the MLP on the test split, using the features transformed by the same
# scaler as the training data (X_test_std). model.evaluate returns [loss, accuracy].
_, accuracy2 = model.evaluate(X_test_std, y_test, verbose=0)
# Print this model's own score; the original printed `accuracy`, i.e. the CNN's.
print('Accuracy: %.2f' % (accuracy2*100))

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
# ROC curve and AUC of the MLP on the test split.
fig, ax = plt.subplots(figsize=(5, 4))

# Score the test rows after the SAME scaling the network was trained with
# (X_test_std); the single output column is used as the ranking score.
y_proba = model.predict(X_test_std)[:, 0]

# Calculate the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

# Calculate the AUC
auc = roc_auc_score(y_test, y_proba)

# Plot the ROC curve on the axes created above
ax.plot(fpr, tpr, label='(AUC = %0.2f)' % auc)

# Print the AUC
print('AUC:', auc)

# Label and show the plot
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
plt.show()

# Raw curve coordinates, separated by a divider line
print("")
print('False Positive Rates:', fpr)
print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
print('True Positive Rates:', tpr)

# Bayesian Optimized Long Short-Term Memory Recurrent Neural Network

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [None]:
diabetes_data_2=pd.read_csv("Diabetes_dataset.csv") #loading the dataset

In [None]:
#Data splitting
X =diabetes_data_2.drop(columns='Diabetes',axis=1)
Y = diabetes_data_2['Diabetes']

In [None]:
S= StandardScaler() 
S.fit(X)



In [None]:
Stdz_data=S.transform(X)


In [None]:
from sklearn.decomposition import PCA
# Project the standardised features onto their first 10 principal components
pca = PCA(n_components=10).fit(Stdz_data)
x_pca = pca.transform(Stdz_data)

In [None]:
X= x_pca
y= diabetes_data['Diabetes'] 

In [None]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
print(X.shape, X_train.shape, X_test.shape)

In [None]:
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], 1)))
model.add(Dropout(0.2))
model.add(LSTM(units=50, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=50))
model.add(Dropout(0.2))
model.add(Dense(units=1))
model.compile(optimizer='adam', loss='mean_squared_error',metrics=['accuracy'])

In [None]:
model.fit(X_train, y_train, epochs=30, batch_size=32)

In [None]:
# Evaluate the LSTM on the held-out TEST split (training accuracy does not
# measure generalisation). model.evaluate returns [loss, accuracy].
_, accuracy3 = model.evaluate(X_test, y_test, verbose=0)
# Print this model's own score; the original printed `accuracy`, i.e. the CNN's.
print('Accuracy: %.2f' % (accuracy3*100))

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Model output for every row of the test split
predictions = model.predict(X_test)

# Absolute and squared error of those outputs against the true labels
mae, mse = (
    error_metric(y_test, predictions)
    for error_metric in (mean_absolute_error, mean_squared_error)
)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
# ROC curve and AUC of the current model on the test split.
# The single output column is used as the ranking score.
y_proba = model.predict(X_test)[:, 0]

fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)

fig, ax = plt.subplots(figsize=(5, 4))
ax.plot(fpr, tpr, label='(AUC = %0.2f)' % auc)

# The numeric AUC is printed ahead of the rendered figure
print('AUC:', auc)

ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
plt.show()

# Raw curve coordinates, separated by a divider line
print("")
print('False Positive Rates:', fpr)
print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
print('True Positive Rates:', tpr)

# Model Comparison 

In [None]:
# Bar chart of the accuracy (as a fraction, rounded to 2 decimals) stored in
# accuracy, accuracy2 and accuracy3 by the evaluation cells above
labels = ["CNN", "MLP","RNN"]
accuracy_values = [accuracy, accuracy2, accuracy3]
r_accuracy_values = [round(score, 2) for score in accuracy_values]

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(labels, r_accuracy_values)

# Write each rounded score in the vertical middle of its bar
for bar_position, bar_height in enumerate(r_accuracy_values):
    ax.text(
        bar_position,
        bar_height / 2,
        str(bar_height),
        ha='center',
        color='red',
        fontsize=20,
        fontweight='bold',
    )

ax.set_xlabel("Algorithm")
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy of Different Algorithms")
plt.xticks(rotation=45)
plt.show()

In [None]:

# Combined bar chart: hard-coded ML accuracies (percent) next to the
# deep-learning accuracies, which are converted from fractions to percent
labels = ["KNN", "RF","GNB","SVM","LG","CNN", "MLP","RNN"]
accuracy_values = [95.01,95.15,84.94,98.86,98.99,accuracy*100, accuracy2*100, accuracy3*100]
r_accuracy_values = [round(score, 2) for score in accuracy_values]

# Dark blue for every bar except the last three (the deep-learning models),
# which are drawn in deep sky blue
colors = ['darkblue'] * (len(labels) - 3) + ['deepskyblue'] * 3

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(labels, r_accuracy_values, color=colors)

# Write each rounded score in the vertical middle of its bar
for bar_position, bar_height in enumerate(r_accuracy_values):
    ax.text(
        bar_position,
        bar_height / 2,
        str(bar_height),
        ha='center',
        color='azure',
        fontsize=20,
        fontweight='bold',
    )

ax.set_xlabel("Algorithm")
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy comparison between ML and Deep learning")
plt.xticks(rotation=45)
plt.show()