In [1]:
#-----------------------------------------
# Title:  Naive Bayes Multi-Class Prediction of Obesity Risk Dataset
# Subtitle: DDS-8555, Assignment 5
# Author: Madgene Moise
# Date: Sunday, June 15, 2025
#-----------------------------------------

In [2]:
import pandas as pd

# Load the training and testing datasets

train_df = pd.read_csv("/kaggle/input/playground-series-s4e2/train.csv") 
test_df = pd.read_csv("/kaggle/input/playground-series-s4e2/test.csv")

# Display basic information about the datasets
train_df.info(), train_df.head(), test_df.info(), test_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20758 non-null  int64  
 1   Gender                          20758 non-null  object 
 2   Age                             20758 non-null  float64
 3   Height                          20758 non-null  float64
 4   Weight                          20758 non-null  float64
 5   family_history_with_overweight  20758 non-null  object 
 6   FAVC                            20758 non-null  object 
 7   FCVC                            20758 non-null  float64
 8   NCP                             20758 non-null  float64
 9   CAEC                            20758 non-null  object 
 10  SMOKE                           20758 non-null  object 
 11  CH2O                            20758 non-null  float64
 12  SCC                             

(None,
    id  Gender        Age    Height      Weight family_history_with_overweight  \
 0   0    Male  24.443011  1.699998   81.669950                            yes   
 1   1  Female  18.000000  1.560000   57.000000                            yes   
 2   2  Female  18.000000  1.711460   50.165754                            yes   
 3   3  Female  20.952737  1.710730  131.274851                            yes   
 4   4    Male  31.641081  1.914186   93.798055                            yes   
 
   FAVC      FCVC       NCP        CAEC SMOKE      CH2O SCC       FAF  \
 0  yes  2.000000  2.983297   Sometimes    no  2.763573  no  0.000000   
 1  yes  2.000000  3.000000  Frequently    no  2.000000  no  1.000000   
 2  yes  1.880534  1.411685   Sometimes    no  1.910378  no  0.866045   
 3  yes  3.000000  3.000000   Sometimes    no  1.674061  no  1.467863   
 4  yes  2.679664  1.971472   Sometimes    no  1.979848  no  1.967973   
 
         TUE       CALC                 MTRANS           NO

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Encode categorical variables
# Combine train and test for consistent encoding
combined_df = pd.concat([train_df.drop('NObeyesdad', axis=1), test_df])
categorical_cols = combined_df.select_dtypes(include=['object']).columns

encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    combined_df[col] = le.fit_transform(combined_df[col])
    encoders[col] = le

# Separate back train and test
X_train = combined_df.iloc[:len(train_df), :]
X_test = combined_df.iloc[len(train_df):, :]

# Encode target variable
y_train = LabelEncoder().fit_transform(train_df['NObeyesdad'])

In [4]:
# Fit Naive Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)

# Predict
y_train_pred = nb.predict(X_train)
y_test_pred = nb.predict(X_test)

# Evaluate on train
train_accuracy = accuracy_score(y_train, y_train_pred)
conf_matrix = confusion_matrix(y_train, y_train_pred)
report = classification_report(y_train, y_train_pred)

train_accuracy, conf_matrix, report

(0.7013199730224492,
 array([[2071,  394,   19,    1,    1,   31,    6],
        [ 775, 1743,   89,    0,   52,  293,  130],
        [   3,   16, 1924,  535,   47,   91,  294],
        [   0,    0,  224, 3001,    0,    3,   20],
        [   1,    0,    0,    4, 4038,    3,    0],
        [  52,  448,  582,    0,   55,  839,  451],
        [   4,  190, 1134,   83,   13,  156,  942]]),
 '              precision    recall  f1-score   support\n\n           0       0.71      0.82      0.76      2523\n           1       0.62      0.57      0.59      3082\n           2       0.48      0.66      0.56      2910\n           3       0.83      0.92      0.87      3248\n           4       0.96      1.00      0.98      4046\n           5       0.59      0.35      0.44      2427\n           6       0.51      0.37      0.43      2522\n\n    accuracy                           0.70     20758\n   macro avg       0.67      0.67      0.66     20758\nweighted avg       0.70      0.70      0.69     20758\n')

Gaussian Naive Bayes was fitted to the training data to classify the variable NObeyesdad. 

Accuracy: 70.13% 

Classification Report (Marco Average):
* Precision: 0.67
* Recall: 0.67
* F1-score: 0.66

High Performance Observed for 'Normal Weight' and 'Obesity Type I'"
* Classes 3 and 4 (in the matrix above) show very strong performance with high recall and precision.
* 'Normal Weight' (class 4) is nearly perfectly classified.

Mixed Performance Observed for Overlapping Categories: 
* Some confusion occurs between adjacent weight categories like 'Obesity Types I/II/III' and 'Overweight Levels'.
* For example, many true "Obesity Type II' are misclassified as 'Obesity Type I'.

Naive Assumption:
* The model assumes feature independence. In reality, features like height, weight, and family history are correlated. This can limit accuracy.

**Assumptions and Diagnostics**
* Feature Independence: Naive Bayes assumes all predictors are conditionally independent given the class label. This is unlikely to be true for anthropometric and lifestyle data.
* Gaussian Assumption: The numeric predictors are assumed to follow a normal distribution per class. This should ideally be verified via histograms or Q-Q plots for each class.
* Residuals: For classification, residuals are not like regression residuals. Misclassifications and the confusion matrix effectively serve as residual diagnostics. Here, misclassification is more frequent for intermediate categories.

The Naive Bayes model performs reasonably well for clear-cut classes but struggles with categories that have overlapping predictor ranges. This behavior highlights the limits of the model's independence assumption.

In [5]:
# Decode numerical predictions back to original labels
# First, get target encoder from train_df
target_encoder = LabelEncoder()
target_encoder.fit(train_df['NObeyesdad'])
y_test_labels = target_encoder.inverse_transform(y_test_pred)

# Create submission DataFrame
submission_df = test_df[['id']].copy()
submission_df['NObeyesdad_Predicted'] = y_test_labels