<a href="https://colab.research.google.com/github/MahdiFaourr/MahdiFaourr/blob/main/diabetes_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install opendatasets library
!pip install opendatasets

In [None]:
# Import necessary functions and libraries
import opendatasets as od
import pandas as pd
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import cross_validate
import numpy as np
from sklearn.model_selection import GridSearchCV
import gradio as gr

In [None]:
# Download the data from kaggle
od.download("https://www.kaggle.com/datasets/kandij/diabetes-dataset")

In [None]:
# Read the data in a pandas dataframe
data=pd.read_csv("/content/diabetes-dataset/diabetes2.csv")

In [None]:
# Display a subpart of the data
data.head()

In [None]:
# Data shape
data.shape

In [None]:
# Search for nulls
data.isnull().sum()

In [None]:
# Data info
data.info()

In [None]:
# Create two sub datas each one contains a class from the Outcome column
data_positive=data[data['Outcome']==0]
data_negative=data[data['Outcome']==1]

In [None]:
# Plot a histogram for each column in data_positive
for column in data_positive.columns:
  if column!='Outcome':
    plt.figure()  # Create a new figure for each histogram
    data_positive[column].hist()
    plt.title(f'Histogram of {column}')
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show


In [None]:
# Plot a histogram for each column in data_negative
for column in data_negative.columns:
  if column!='Outcome':
    plt.figure()  # Create a new figure for each histogram
    data_negative[column].hist()
    plt.title(f'Histogram of {column}')
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show

In [None]:
# Normalize the data
data_max=data.max()
data=data.divide(data_max)

In [None]:
# Check classes distribution in Outcome column
data['Outcome'].value_counts()

In [None]:
# Over sample the minor class
x=data.drop(['Outcome'],axis=1).values
y=data['Outcome'].values
over=RandomOverSampler(sampling_strategy=0.9)
x_new,y_new=over.fit_resample(x,y)
# Split the data into training and testing parts
x_train,x_test,y_train,y_test=train_test_split(x_new,y_new,test_size=0.2,random_state=42)

In [None]:
# Initialize some classical models
Lr=LogisticRegression()
sv=SVC()
tree=DecisionTreeClassifier()
forest=RandomForestClassifier()
adab=AdaBoostClassifier()


In [None]:
# Iterate over the models and perform cross validation for each one
models=[Lr,sv,tree,forest,adab]
for model in models:
  result=cross_validate(model,x_train,y_train,cv=3,scoring=['accuracy','precision','recall'])
  print(f'{model}')
  print("Accuracy %:", np.mean(result['test_accuracy'])*100)
  print("Precision %:", np.mean(result['test_accuracy'])*100)
  print("Recall %:", np.mean(result['test_accuracy'])*100)
  print("----------------")

LogisticRegression()
Accuracy %: 76.04909070160697
Precision %: 76.04909070160697
Recall %: 76.04909070160697
----------------
SVC()
Accuracy %: 76.70629610033924
Precision %: 76.70629610033924
Recall %: 76.70629610033924
----------------
DecisionTreeClassifier()
Accuracy %: 77.3655763385308
Precision %: 77.3655763385308
Recall %: 77.3655763385308
----------------
RandomForestClassifier()
Accuracy %: 80.7854304773168
Precision %: 80.7854304773168
Recall %: 80.7854304773168
----------------
AdaBoostClassifier()
Accuracy %: 77.2327866131358
Precision %: 77.2327866131358
Recall %: 77.2327866131358
----------------


In [None]:
# Define the grid of parameters to search
param_grid = {
    'n_estimators': [25, 50, 100, 150, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 15, 20, 30, 35],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 7, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4, 8]  # Minimum number of samples required to be at a leaf node
}

# Perform grid search with cross-validation for the model with best scores
grid_search = GridSearchCV(estimator=forest, param_grid=param_grid, cv=5)
grid_search.fit(x_train, y_train)

# Print the best parameters found
print("Best parameters found:")
print(grid_search.best_params_)

# Print the best mean cross-validated score found
print("Best mean cross-validated score found:")
print(grid_search.best_score_)


In [None]:
# Train the fine tuned model
forest=RandomForestClassifier(max_depth=None,min_samples_leaf=1,min_samples_split=5,n_estimators=50)
forest.fit(x_train,y_train)

In [None]:
# Test the model
y_hat=forest.predict(x_test)
from sklearn.metrics import accuracy_score,precision_score,recall_score
print("accuracy% :",accuracy_score(y_hat,y_test)*100)
print("precision% :",precision_score(y_hat,y_test)*100)
print("recall% :",recall_score(y_hat,y_test)*100)

accuracy% : 82.63157894736842
precision% : 88.17204301075269
recall% : 78.84615384615384


In [None]:
# Install gradio
!pip install gradio==3.14.0

In [None]:
# Build a function for predictions
def predict_output(features):
    # Convert the list of features to a numpy array
    features_array = np.array(features).reshape(1, -1)
    # Use the pre-trained model to predict
    prediction = forest.predict(features_array)[0]
    return prediction
# Test the function
predict_output(data.iloc[1,:8].values)

In [None]:
# Define the inputs as a list of Number objects
inputs = [gr.inputs.Number(default=0, label=f"Feature {i+1}") for i in range(8)]

# Create the Gradio interface with the predict_output function and the inputs
gr.Interface(fn=predict_output, inputs=inputs, outputs="number", title="Predict Output").launch()