In [13]:
import joblib
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

### Load Data

In [5]:
# Read the obesity dataset and print a quick overview (head, shape, null counts).
file_path = "dataset/obesity_data.csv"
try:
    data = pd.read_csv(file_path)
except FileNotFoundError as err:
    # Raise instead of calling exit(): inside a notebook exit() ends the
    # session rather than reporting a failure, so the cause is easy to miss.
    # A raised error stops "Run All" at this cell and shows the traceback.
    raise FileNotFoundError(f"Error: The file {file_path} was not found.") from err

print("Data loaded successfully.")
print("First 5 rows of the dataset:")
print(data.head())
print("\nShape of the dataset:", data.shape)
print("\nMissing values in the dataset:")
print(data.isnull().sum())

Data loaded successfully.
First 5 rows of the dataset:
   Age  Gender      Height     Weight        BMI  PhysicalActivityLevel  \
0   56    Male  173.575262  71.982051  23.891783                      4   
1   69    Male  164.127306  89.959256  33.395209                      2   
2   46  Female  168.072202  72.930629  25.817737                      4   
3   32    Male  168.459633  84.886912  29.912247                      3   
4   60    Male  183.568568  69.038945  20.487903                      3   

  ObesityCategory  
0   Normal weight  
1           Obese  
2      Overweight  
3      Overweight  
4   Normal weight  

Shape of the dataset: (1000, 7)

Missing values in the dataset:
Age                      0
Gender                   0
Height                   0
Weight                   0
BMI                      0
PhysicalActivityLevel    0
ObesityCategory          0
dtype: int64


Define features (X) and target (y)

In [6]:
# Target is the ObesityCategory column; every other column is a feature.
y = data.loc[:, "ObesityCategory"]
X = data.drop(columns=["ObesityCategory"])

### Pre-process data

Identify categorical and numerical features

In [16]:
# Column groups used to route features to the matching preprocessing step.
numerical_features = [
    "Age",
    "Height",
    "Weight",
    "BMI",
    "PhysicalActivityLevel",
]
categorical_features = ["Gender"]

Create preprocessing pipelines for numerical and categorical features

In [17]:
# Numeric columns are standardised.
numerical_pipeline = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical columns are one-hot encoded; handle_unknown="ignore" keeps
# transform from raising on a category that was not seen during fit.
categorical_pipeline = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

Create a column transformer to apply different transformations to different columns

In [18]:
# Apply each sub-pipeline to its own column group; any column not listed in
# either group is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical", numerical_pipeline, numerical_features),
        ("categorical", categorical_pipeline, categorical_features),
    ],
    remainder="passthrough",
)

Encode the target variable

In [19]:
# Map the string class labels to integer codes; the fitted encoder is kept so
# predictions can be mapped back to label names later.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)
print("\nTarget variable classes:", label_encoder.classes_)


Target variable classes: ['Normal weight' 'Obese' 'Overweight' 'Underweight']


### Train Model

Split data into training and testing sets

In [20]:
# 80/20 split, stratified on the encoded target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

print(f"\nTraining set shape: X_train={X_train.shape}, y_train={y_train.shape}")
print(f"Test set shape: X_test={X_test.shape}, y_test={y_test.shape}")


Training set shape: X_train=(800, 6), y_train=(800,)
Test set shape: X_test=(200, 6), y_test=(200,)


Create the full pipeline: preprocessor + model

In [21]:
# Full pipeline: shared preprocessor followed by the classifier.
#
# The target has four classes, and liblinear only solves binary problems.
# Recent scikit-learn releases deprecate passing a multiclass target straight
# to LogisticRegression(solver='liblinear'), so the one-vs-rest scheme is made
# explicit with OneVsRestClassifier. The step name 'classifier' is unchanged,
# and the wrapped estimator still exposes predict / predict_proba.
# NOTE(review): fitted coefficients are expected to match the previous
# implicit one-vs-rest fit; confirm test accuracy after re-running.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', OneVsRestClassifier(
        LogisticRegression(solver='liblinear', random_state=42, max_iter=1000)
    ))
])

Train the model

In [22]:
# Fit the preprocessing steps and the classifier on the training split only.
print("\nTraining the model...")
model_pipeline.fit(X=X_train, y=y_train)
print("Model training complete.")


Training the model...
Model training complete.


### Evaluate Model

In [23]:
# Score the held-out split: overall accuracy plus per-class metrics.
print("\nEvaluating the model...")
y_pred_test = model_pipeline.predict(X_test)

test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_test)
print(f"\nTest Accuracy: {test_accuracy:.4f}")

# target_names puts the original label strings on the report rows.
report = classification_report(
    y_test,
    y_pred_test,
    target_names=label_encoder.classes_,
)
print("\nClassification Report on Test Set:")
print(report)


Evaluating the model...

Test Accuracy: 0.8950

Classification Report on Test Set:
               precision    recall  f1-score   support

Normal weight       0.79      0.99      0.88        74
        Obese       1.00      0.97      0.99        38
   Overweight       0.96      0.75      0.84        59
  Underweight       1.00      0.86      0.93        29

     accuracy                           0.90       200
    macro avg       0.94      0.89      0.91       200
 weighted avg       0.91      0.90      0.89       200



### Save Model, Preprocessor, and Label Encoder

In [24]:
# Persist the three artifacts needed for inference: classifier, fitted
# preprocessor, and the target label encoder.
model_filename = "obesity_model.pkl"
preprocessor_filename = "obesity_preprocessor.pkl"
label_encoder_filename = "obesity_target_encoder.pkl"

# Take both steps from the trained pipeline itself. The free `preprocessor`
# variable is only fitted when the pipeline fits that same object in place;
# reading the step from `model_pipeline` guarantees the saved transformer is
# the one the classifier was trained with.
fitted_classifier = model_pipeline.named_steps['classifier']
fitted_preprocessor = model_pipeline.named_steps['preprocessor']

print(f"\nSaving the model to {model_filename}...")
joblib.dump(fitted_classifier, model_filename)
print("Model saved.")

print(f"\nSaving the preprocessor to {preprocessor_filename}...")
joblib.dump(fitted_preprocessor, preprocessor_filename)
print("Preprocessor saved.")

print(f"\nSaving the label encoder to {label_encoder_filename}...")
joblib.dump(label_encoder, label_encoder_filename)
print("Label encoder saved.")


Saving the model to obesity_model.pkl...
Model saved.

Saving the preprocessor to obesity_preprocessor.pkl...
Preprocessor saved.

Saving the label encoder to obesity_target_encoder.pkl...
Label encoder saved.


### Test Saved Model and Preprocessor

In [25]:
def _predict_from_artifacts(raw_features, fitted_preprocessor, classifier, target_encoder):
    """Score raw feature rows with the reloaded artifacts.

    Args:
        raw_features: DataFrame of raw feature rows (same columns as X).
        fitted_preprocessor: Fitted transformer exposing ``transform``.
        classifier: Fitted model exposing ``predict`` and ``predict_proba``.
        target_encoder: Fitted encoder exposing ``inverse_transform``.

    Returns:
        Tuple ``(processed, encoded, labels, probabilities)``: the transformed
        features, the integer class predictions, the decoded label strings and
        the per-class probabilities.
    """
    processed = fitted_preprocessor.transform(raw_features)
    encoded = classifier.predict(processed)
    probabilities = classifier.predict_proba(processed)
    labels = target_encoder.inverse_transform(encoded)
    return processed, encoded, labels, probabilities


print("\n--- Testing Saved Model and Preprocessor ---")
# joblib.load unpickles arbitrary objects: only load files this notebook
# (or another trusted source) produced.
loaded_model = joblib.load(model_filename)
loaded_preprocessor = joblib.load(preprocessor_filename)
loaded_label_encoder = joblib.load(label_encoder_filename)
print("Model, preprocessor, and label encoder loaded successfully.")

# Round-trip check on a known row from the training data.
sample_raw_data = X.iloc[[0]].copy()

print("\nSample Raw Data (first row of X):")
print(sample_raw_data)

(
    sample_data_processed,
    prediction_encoded,
    prediction_label,
    prediction_proba,
) = _predict_from_artifacts(
    sample_raw_data, loaded_preprocessor, loaded_model, loaded_label_encoder
)

print("\nSample Data after Preprocessing:")
print(sample_data_processed)

print(f"\nEncoded Prediction for Sample Data: {prediction_encoded[0]}")
print(f"Predicted Obesity Category for Sample Data: {prediction_label[0]}")
print(f"Prediction Probabilities: {prediction_proba[0]}")
print("Corresponding classes:", loaded_label_encoder.classes_)

print("\n--- Example of creating a new sample from scratch ---")
new_sample_dict = {
    'Age': [30],
    'Gender': ['Male'],
    'Height': [175],
    'Weight': [70],
    'BMI': [22.86],
    'PhysicalActivityLevel': [3]
}

# columns=X.columns keeps the column order identical to the training frame.
new_sample_df = pd.DataFrame(new_sample_dict, columns=X.columns)

print("\nNew Sample Raw Data (DataFrame):")
print(new_sample_df)

(
    new_sample_processed,
    new_prediction_encoded,
    new_prediction_label,
    new_prediction_proba,
) = _predict_from_artifacts(
    new_sample_df, loaded_preprocessor, loaded_model, loaded_label_encoder
)

print("\nNew Sample Data after Preprocessing:")
print(new_sample_processed)

print(f"\nPredicted Obesity Category for New Sample: {new_prediction_label[0]}")
print(f"Prediction Probabilities for New Sample: {new_prediction_proba[0]}")

print("\nPipeline script execution finished.")


--- Testing Saved Model and Preprocessor ---
Model, preprocessor, and label encoder loaded successfully.

Sample Raw Data (first row of X):
   Age Gender      Height     Weight        BMI  PhysicalActivityLevel
0   56   Male  173.575262  71.982051  23.891783                      4

Sample Data after Preprocessing:
[[ 0.36363016  0.32334546  0.0421252  -0.16013316  1.29584717  0.
   1.        ]]

Encoded Prediction for Sample Data: 0
Predicted Obesity Category for Sample Data: Normal weight
Prediction Probabilities: [0.62356259 0.00117434 0.36997011 0.00529296]
Corresponding classes: ['Normal weight' 'Obese' 'Overweight' 'Underweight']

--- Example of creating a new sample from scratch ---

New Sample Raw Data (DataFrame):
   Age Gender  Height  Weight    BMI  PhysicalActivityLevel
0   30   Male     175      70  22.86                      3

New Sample Data after Preprocessing:
[[-1.07511957  0.46069188 -0.08502205 -0.32606943  0.39360371  0.
   1.        ]]

Predicted Obesity Category