<a href="https://colab.research.google.com/github/MahdiFaourr/MahdiFaourr/blob/main/Obesity_Class_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Interact with Hugging face API and curl the data

In [None]:
!curl -X GET \
     "https://datasets-server.huggingface.co/rows?dataset=aiml2021%2Fobesity&config=default&split=train&offset=0&length=100"

In [None]:
!curl -X GET \
     "https://datasets-server.huggingface.co/splits?dataset=aiml2021%2Fobesity"

In [None]:
!curl -X GET \
     "https://huggingface.co/api/datasets/aiml2021/obesity/parquet/default/train"

In [None]:
# Import necessary libraries and functions
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras.metrics import Precision,F1Score
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib
from sklearn.metrics import precision_score, accuracy_score,f1_score


In [1]:
# Read the data in pandas
data=pd.read_parquet("/content/0000.parquet")

In [None]:
# Display a part from the data
data.head()

In [None]:
# Data-shape
data.shape

In [None]:
# Get some info using info() method
data.info()

In [None]:
# Find the unique values with their count for the classes in NObeyesdad column
data['NObeyesdad'].value_counts()

In [7]:
# Define and display the data of categorical columns
data_categorical=data[['Gender','family_history_with_overweight','FAVC','CAEC','SMOKE','SCC','CALC','MTRANS']]
print(data_categorical.head())

In [9]:
# Define and display the data of numerical columns
data_numerical=data[['Age','Height','Weight','FCVC','NCP','CH2O','FAF','TUE']]
print(data_numerical.head())

In [None]:
# 'NObeyesdad' is the specific column we want to crosstab with
target_column = 'NObeyesdad'

# Get a list of all categorical columns excluding the target column
columns=data_categorical.columns

# Set up the figure and axes for subplots
fig, axs = plt.subplots(len(columns), 1, figsize=(10, 5 * len(columns)))

# Iterate over all categorical columns and create crosstab plots
for i, column in enumerate(columns):
    # Create a crosstab
    crosstab_df = pd.crosstab(data[target_column], data_categorical[column])

    # Plot the crosstab as a bar plot
    crosstab_df.plot(kind='bar', ax=axs[i], stacked=True)

    # Set plot titles and labels
    axs[i].set_title(f'{column} vs {target_column}')
    axs[i].set_xlabel(target_column)
    axs[i].set_ylabel(column)

# Adjust layout to prevent overlap of axis labels
plt.tight_layout()

# Show the plot
plt.show()


In [None]:
# 'NObeyesdad' is the specific column we want to analyze
target_column = 'NObeyesdad'

# Get a list of all numerical columns excluding the target column
numerical_columns = data_numerical.columns

# Iterate over all numerical columns and create individual plots
for column in numerical_columns:
    # Create a histogram for each category in 'NObeyesdad'
    for category in data[target_column].unique():
        fig, axs = plt.subplots(figsize=(8, 5))
        axs.hist(data[data[target_column] == category][column], alpha=0.5, label=category)

    # Set plot titles and labels
        axs.set_title(f'{column} vs {target_column}')
        axs.set_xlabel(column)
        axs.set_ylabel('Frequency')
        axs.legend()

    # Show the plot for each numerical column
        plt.show()


In [12]:
# Encode all the categorical columns except the target column
encoder=LabelEncoder()
for column in data_categorical.columns:
  data[column]=encoder.fit_transform(data_categorical[column])


In [15]:
# Encode the target column
data[target_column]=encoder.fit_transform(data[target_column])

In [16]:
# Identify the columns to normalize
columns_to_normalize = [col for col in data.columns if col != target_column]

# Calculate the maximum value excluding the specified column
max_value_exclude_column = data[columns_to_normalize].max()

# Normalize the selected columns
data[columns_to_normalize] = data[columns_to_normalize].divide(max_value_exclude_column)


In [17]:
# Set up the features and the labels in an arrays
x=data.iloc[:,:-1].values
y=data['NObeyesdad'].values

In [18]:
# One hot encoding of y
y=to_categorical(y)

In [19]:
# Split the data into training and testing parts
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.1,random_state=15)

# Define and compile your deep neural network model
def create_model():
   model=Sequential()
   model.add(Dense(103,activation='relu',input_dim=x.shape[1]))
   model.add(Dense(25,activation='relu'))
   model.add(Dense(7,activation='softmax'))
   model.compile(optimizer='adam',loss="categorical_crossentropy",metrics=['acc',Precision(),F1Score()])
   return model

In [None]:
# Train the model
model=create_model()
history=model.fit(x_train,y_train,batch_size=32,epochs=60,verbose=1)

In [39]:
# Evaluate on testing data
model.evaluate(x_test,y_test)



[0.31372544169425964,
 0.8962264060974121,
 0.8962264060974121,
 array([0.9166666 , 0.82758623, 0.9411765 , 0.984127  , 1.        ,
        0.78378373, 0.87500006], dtype=float32)]

In [None]:
# Save the model
model.save('model_obesity_NN.h5')

In [23]:
# Set up the features and the labels in an arrays again
x=data.iloc[:,:-1].values
y=data['NObeyesdad'].values

In [24]:
# Initialize the classical models
kneighbors = KNeighborsClassifier()
LR = LogisticRegression()
Adab = AdaBoostClassifier()
svc = SVC(kernel='rbf')
Forest = RandomForestClassifier()

models = [ Adab,svc, Forest,LR,kneighbors]
# Iterate over the models
for model_name in models:
    # Perform cross-validation
    cv_results = cross_validate(model_name, x, y, scoring=['precision_macro', 'accuracy', 'f1_macro'], cv=5)

    # Extract and print the evaluation metrics
    precision = np.mean(cv_results['test_precision_macro'])
    accuracy = np.mean(cv_results['test_accuracy'])
    f1 = np.mean(cv_results['test_f1_macro'])
    #recall=np.mean(cv_results['test_recall'])
    print(f"Model: {model_name}")
    print(f"Precision: {precision}")
    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")
    #print(f"Recall Score: {recall}")
    print("-----")


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model: AdaBoostClassifier()
Precision: 0.2569205958853007
Accuracy: 0.29985210581156935
F1 Score: 0.20146565036523256
-----
Model: SVC()
Precision: 0.7616778728434797
Accuracy: 0.745238815502
F1 Score: 0.7373964839506232
-----
Model: RandomForestClassifier()
Precision: 0.9563919674821886
Accuracy: 0.9356727504957817
F1 Score: 0.9358335860302149
-----


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Model: LogisticRegression()
Precision: 0.6601613121041032
Accuracy: 0.6632572574591331
F1 Score: 0.6418390354718732
-----
Model: KNeighborsClassifier()
Precision: 0.7538691645913295
Accuracy: 0.7547085252036346
F1 Score: 0.7397142844565833
-----


In [25]:
# Define the parameter grid to search
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # Add more hyperparameters as needed
}

# Create GridSearchCV for the model with best performance
grid_search = GridSearchCV(Forest, param_grid, cv=3, scoring='accuracy')

# Fit the model to the training data
grid_search.fit(x,y)

# Get the best parameters and the best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_


print("Best Parameters:", best_params)
print("Best Model Accuracy:", accuracy)


Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Model Accuracy: 0.7547085252036346


In [26]:
# Define the tuned model
Tuned_Forest_Model=RandomForestClassifier(max_depth=20,min_samples_split=2,n_estimators=100,min_samples_leaf=1)

In [None]:
# Train the tuned model
Tuned_Forest_Model.fit(x_train,y_train)

In [None]:
# Save the model to a file using joblib
joblib.dump(Tuned_Forest_Model, 'obesity_model_Forest.joblib')

In [28]:
# Test the model on testing data
y_pred=Tuned_Forest_Model.predict(x_test)
accuracy=accuracy_score(y_test,y_pred)
precision=precision_score(y_test,y_pred,average='macro')
f1score=f1_score(y_test,y_pred,average='macro')

print("Accuracy: {:.2%}".format(accuracy))
print("Precision: {:.2%}".format(precision))
print("F1 Score: {:.2%}".format(f1score))

Accuracy: 90.09%
Precision: 98.47%
F1 Score: 94.51%


In [30]:
# Define a class predictor function and blend the predictions of the deep NN model and the Forest model
def weight_class_function(input, model, Tuned_Forest_Model):

    input = np.expand_dims(input, axis=0)  # Add a batch dimension

    # Make predictions using the deep learning model
    pred1 = model.predict(input)

    # Make predictions using the RandomForest model
    pred2 = Tuned_Forest_Model.predict(input)

    # Blend the predictions (simple averaging in this case)
    blended_pred = (np.argmax(pred1, axis=1) + pred2) / 2

    return np.argmax(blended_pred,axis=1)



In [33]:
# Display the classes of some instances
print(data.iloc[100,-1])
print(data.iloc[1050,-1])
print(data.iloc[243,-1])
print(data.iloc[2001,-1])

1
6
6
4


In [36]:
# Test the  weight_class_function on the choosen random samples
evaluating_examples=[data.iloc[100,:-1].values,data.iloc[1050,:-1].values,data.iloc[243,:-1].values,data.iloc[2001,:-1].values]
for example in evaluating_examples:
 input=example
 print(weight_class_function(input,model,Tuned_Forest_Model))

[1]
[6]
[6]
[4]
