In [30]:
import pandas as pd

# Step 1: Load dataset
df = pd.read_csv('ObesityDataSet_raw_and_data_sinthetic.csv')

# Preview columns and data
print(df.columns)
print(df.head())


Index(['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',
       'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE',
       'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')
   Gender  Age  Height  Weight family_history_with_overweight FAVC  FCVC  NCP  \
0  Female   21    1.62    64.0                            yes   no   2.0  3.0   
1  Female   21    1.52    56.0                            yes   no   3.0  3.0   
2    Male   23    1.80    77.0                            yes   no   2.0  3.0   
3    Male   27    1.80    87.0                             no   no   3.0  3.0   
4    Male   22    1.78    89.8                             no   no   2.0  1.0   

        CAEC SMOKE  CH2O  SCC  FAF  TUE        CALC                 MTRANS  \
0  Sometimes    no   2.0   no  0.0  1.0          no  Public_Transportation   
1  Sometimes   yes   3.0  yes  3.0  0.0   Sometimes  Public_Transportation   
2  Sometimes    no   2.0   no  2.0  1.0  Frequently  Public_Tra

In [31]:
# Step 2: Drop irrelevant columns
df = df.drop(columns=['SMOKE', 'CALC'])


In [32]:
# Strip whitespace from all object (categorical) columns
df = df.apply(lambda col: col.str.strip() if col.dtype == "object" else col)

In [33]:
# Preview unique values of all categorical (object) columns
for col in df.select_dtypes(include="object").columns:
    print(f"{col} → {df[col].unique()}")

Gender → ['Female' 'Male']
family_history_with_overweight → ['yes' 'no']
FAVC → ['no' 'yes']
CAEC → ['Sometimes' 'Frequently' 'Always' 'no']
SCC → ['no' 'yes']
MTRANS → ['Public_Transportation' 'Walking' 'Automobile' 'Motorbike' 'Bike']
NObeyesdad → ['Normal_Weight' 'Overweight_Level_I' 'Overweight_Level_II'
 'Obesity_Type_I' 'Insufficient_Weight' 'Obesity_Type_II'
 'Obesity_Type_III']


In [34]:
# Optional: filter children only if 'Age' is available
if 'Age' in df.columns:
    df = df[df['Age'] <= 18]

In [35]:
# Step 3: Encode categorical columns

from sklearn.preprocessing import LabelEncoder

df_encoded = df.copy()
label_encoders = {}
for col in df_encoded.columns:
    if df_encoded[col].dtype == 'object':
        le = LabelEncoder()
        df_encoded[col] = le.fit_transform(df_encoded[col])
        label_encoders[col] = le

In [36]:
# Preview columns and data after encoded (to check all are numerical values)
print(df_encoded.columns)
print(df_encoded.head())

Index(['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',
       'FAVC', 'FCVC', 'NCP', 'CAEC', 'CH2O', 'SCC', 'FAF', 'TUE', 'MTRANS',
       'NObeyesdad'],
      dtype='object')
     Gender  Age  Height  Weight  family_history_with_overweight  FAVC  FCVC  \
58        0   17    1.65    67.0                               1     1   3.0   
112       0   18    1.56    51.0                               1     1   2.0   
115       0   17    1.75    57.0                               1     1   3.0   
116       0   15    1.65    86.0                               1     1   3.0   
117       0   17    1.70    85.0                               1     0   2.0   

     NCP  CAEC  CH2O  SCC  FAF  TUE  MTRANS  NObeyesdad  
58   1.0     2   2.0    0  1.0  1.0       2           1  
112  4.0     1   2.0    0  1.0  0.0       1           1  
115  3.0     1   2.0    0  0.0  1.0       1           1  
116  3.0     2   1.0    0  3.0  2.0       2           2  
117  3.0     1   2.0    0  1.0

In [37]:
# Step 4: Prepare features and target
X = df_encoded.drop('NObeyesdad', axis=1)
y = df_encoded['NObeyesdad']

In [38]:
# Step 5: Split data

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [39]:
# Step 6: Train model

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [40]:
# Step 7: Evaluate

from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8852459016393442
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.94      0.88        16
           1       0.82      0.82      0.82        17
           2       1.00      0.93      0.96        14
           3       1.00      1.00      1.00         2
           4       1.00      0.67      0.80         6
           5       0.86      1.00      0.92         6

    accuracy                           0.89        61
   macro avg       0.92      0.89      0.90        61
weighted avg       0.89      0.89      0.88        61



In [41]:
original_labels = label_encoders['NObeyesdad'].inverse_transform([0,1,2,3,4,5])
print(original_labels)


['Insufficient_Weight' 'Normal_Weight' 'Obesity_Type_I' 'Obesity_Type_III'
 'Overweight_Level_I' 'Overweight_Level_II']


In [42]:
# Step 8: Export model and encoders
import joblib

joblib.dump(model, 'obesity_model.pkl')
joblib.dump(label_encoders, 'encoders.pkl')

['encoders.pkl']