In [None]:
#-----------------------------------------
# Title:  Bagged Model Multi-Class Prediction of Obesity Risk Dataset
# Subtitle: DDS-8555, Assignment 5
# Author: Madgene Moise
# Date: Sunday, June 22, 2025
#-----------------------------------------

In [1]:
import pandas as pd

# Load the training and testing datasets

train_df = pd.read_csv("/kaggle/input/playground-series-s4e2/train.csv") 
test_df = pd.read_csv("/kaggle/input/playground-series-s4e2/test.csv")

# Display basic information about the datasets
train_df.info(), train_df.head(), test_df.info(), test_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20758 non-null  int64  
 1   Gender                          20758 non-null  object 
 2   Age                             20758 non-null  float64
 3   Height                          20758 non-null  float64
 4   Weight                          20758 non-null  float64
 5   family_history_with_overweight  20758 non-null  object 
 6   FAVC                            20758 non-null  object 
 7   FCVC                            20758 non-null  float64
 8   NCP                             20758 non-null  float64
 9   CAEC                            20758 non-null  object 
 10  SMOKE                           20758 non-null  object 
 11  CH2O                            20758 non-null  float64
 12  SCC                             

(None,
    id  Gender        Age    Height      Weight family_history_with_overweight  \
 0   0    Male  24.443011  1.699998   81.669950                            yes   
 1   1  Female  18.000000  1.560000   57.000000                            yes   
 2   2  Female  18.000000  1.711460   50.165754                            yes   
 3   3  Female  20.952737  1.710730  131.274851                            yes   
 4   4    Male  31.641081  1.914186   93.798055                            yes   
 
   FAVC      FCVC       NCP        CAEC SMOKE      CH2O SCC       FAF  \
 0  yes  2.000000  2.983297   Sometimes    no  2.763573  no  0.000000   
 1  yes  2.000000  3.000000  Frequently    no  2.000000  no  1.000000   
 2  yes  1.880534  1.411685   Sometimes    no  1.910378  no  0.866045   
 3  yes  3.000000  3.000000   Sometimes    no  1.674061  no  1.467863   
 4  yes  2.679664  1.971472   Sometimes    no  1.979848  no  1.967973   
 
         TUE       CALC                 MTRANS           NO

In [4]:
# Preprocessing: encode categorical variables and prepare data for bagging
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Copy datasets
X_train = train_df.drop(columns=['NObeyesdad', 'id'])
y_train = train_df['NObeyesdad']
X_test = test_df.drop(columns=['id'])

# Combine to encode categoricals
combined = pd.concat([X_train, X_test])
cat_cols = combined.select_dtypes(include='object').columns

# Encode
le_dict = {}
for col in cat_cols:
    le = LabelEncoder()
    combined[col] = le.fit_transform(combined[col])
    le_dict[col] = le

# Split back
X_train_enc = combined.iloc[:len(X_train)]
X_test_enc = combined.iloc[len(X_train):]

Preprocessing:
* All categorical predictors were encoded using LabelEncoder.
* The same encoding was applied to the test set to ensure consistency.
* The target variable was also encoded.

In [5]:
# Encode target
y_le = LabelEncoder()
y_train_enc = y_le.fit_transform(y_train)

# Build bagged decision tree
bagged_clf = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=50,
    random_state=42
)

bagged_clf.fit(X_train_enc, y_train_enc)

# Predict on training for evaluation
y_pred_train = bagged_clf.predict(X_train_enc)

# Predict on test
y_pred_test = bagged_clf.predict(X_test_enc)

# Decode predictions for test
y_pred_test_labels = y_le.inverse_transform(y_pred_test)

# Evaluation on training
report = classification_report(y_train_enc, y_pred_train, target_names=y_le.classes_)
conf_matrix = confusion_matrix(y_train_enc, y_pred_train)
train_accuracy = accuracy_score(y_train_enc, y_pred_train)

report, conf_matrix, train_accuracy, y_pred_test_labels[:5]



('                     precision    recall  f1-score   support\n\nInsufficient_Weight       1.00      1.00      1.00      2523\n      Normal_Weight       1.00      1.00      1.00      3082\n     Obesity_Type_I       1.00      1.00      1.00      2910\n    Obesity_Type_II       1.00      1.00      1.00      3248\n   Obesity_Type_III       1.00      1.00      1.00      4046\n Overweight_Level_I       1.00      1.00      1.00      2427\nOverweight_Level_II       1.00      1.00      1.00      2522\n\n           accuracy                           1.00     20758\n          macro avg       1.00      1.00      1.00     20758\n       weighted avg       1.00      1.00      1.00     20758\n',
 array([[2522,    1,    0,    0,    0,    0,    0],
        [   2, 3079,    0,    0,    0,    1,    0],
        [   0,    0, 2907,    2,    0,    1,    0],
        [   0,    0,    1, 3247,    0,    0,    0],
        [   0,    0,    0,    0, 4046,    0,    0],
        [   0,    3,    0,    0,    0, 2424,    0

Model Built:
* I trained a bagged decision tree classifier (BaggingClassifer with 50 decision trees) using the training set (train.csv).
* The target variable is NObeyesdad, a categorical indicator of obesity status.

Model Performance (Training Set):
* Accuracy: about 99.93%
* Confusion Matrix: Very few misclassifications; almost perfect diagonal matrix.
* Classification Report:
| Class                 | Precision | Recall | F1-score |
| --------------------- | --------- | ------ | -------- |
| Insufficient\_Weight  | 1.00      | 1.00   | 1.00     |
| Normal\_Weight        | 1.00      | 1.00   | 1.00     |
| Obesity\_Type\_I      | 1.00      | 1.00   | 1.00     |
| Obesity\_Type\_II     | 1.00      | 1.00   | 1.00     |
| Obesity\_Type\_III    | 1.00      | 1.00   | 1.00     |
| Overweight\_Level\_I  | 1.00      | 1.00   | 1.00     |
| Overweight\_Level\_II | 1.00      | 1.00   | 1.00     |


Assumptions Checked:
* Bagging reduces variance by combining multiple decision trees, each trained on bootstrapped samples. It does not assume linearity or normality. This model is well-suited for complex relationships.
* The near-perfect training accuracy suggests some overfitting. However, bagging typically generalizes well. It would be ideal to validate on a hold-out or cross-validation, but since the test set is used for predictions, I focused on that.

In [7]:
# Prepare submission DataFrame
submission = pd.DataFrame({
    'id': test_df['id'],
    'NObeyesdad': y_pred_test_labels
})

submission.head()

Unnamed: 0,id,NObeyesdad
0,20758,Obesity_Type_II
1,20759,Overweight_Level_I
2,20760,Obesity_Type_III
3,20761,Obesity_Type_I
4,20762,Obesity_Type_III
