# Multi-Class Prediction of Obesity Risk - XGBoost+Optuna

This notebook uses XGBoost as the base model and Optuna for hyperparameter optimization.

In [3]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import cross_val_score

from xgboost import XGBClassifier

# import cupy as cp
# import optuna

## Data loading and analysis


In [5]:
# Load the competition files (indexed by their `id` column) and the additional
# `ObesityDataSet.csv` file, which has no `id` column and gets a default RangeIndex.
train = pd.read_csv("train.csv", index_col="id")
test = pd.read_csv("test.csv", index_col="id")
obesity = pd.read_csv("ObesityDataSet.csv")

# Both frames must share the same schema, otherwise concat would silently pad
# the missing columns with NaN instead of failing.
assert set(obesity.columns) == set(train.columns), "train.csv and ObesityDataSet.csv columns differ"

# `ignore_index=True` avoids overlapping row labels: the `id` values of train.csv
# and the 0..n-1 RangeIndex of the extra data would otherwise collide, making
# label-based lookups on `train` ambiguous. Duplicates are detected on column
# values only, so the set of rows kept is unchanged.
train = pd.concat([train, obesity], axis=0, ignore_index=True)
train = train.drop_duplicates(ignore_index=True)

display(train.shape, train.head(), train.describe(include=[np.number]).T, train.describe(include=[object]).T, train.isna().sum())

(22845, 17)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,22845.0,23.888513,5.755338,14.0,20.0,22.815416,26.0,61.0
Height,22845.0,1.700467,0.087865,1.45,1.631856,1.7,1.763029,1.98
Weight,22845.0,87.793761,26.363367,39.0,66.0,84.0,111.531208,173.0
FCVC,22845.0,2.443675,0.533392,1.0,2.0,2.393837,3.0,3.0
NCP,22845.0,2.755837,0.711185,1.0,3.0,3.0,3.0,4.0
CH2O,22845.0,2.027165,0.608479,1.0,1.755907,2.0,2.531984,3.0
FAF,22845.0,0.984585,0.839728,0.0,0.01586,1.0,1.600431,3.0
TUE,22845.0,0.620984,0.602802,0.0,0.0,0.58284,1.0,2.0


Unnamed: 0,count,unique,top,freq
Gender,22845,2,Female,11457
family_history_with_overweight,22845,2,yes,18736
FAVC,22845,2,yes,20826
CAEC,22845,4,Sometimes,19290
SMOKE,22845,2,no,22556
SCC,22845,2,no,22062
CALC,22845,4,Sometimes,16446
MTRANS,22845,5,Public_Transportation,18245
NObeyesdad,22845,7,Obesity_Type_III,4370


Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

## Data Preprocessing

In [6]:
# Feature preprocessing: one-hot encode the object (string) columns and
# standardize the numeric ones.
preprocess = ColumnTransformer([
    # handle_unknown="ignore" encodes categories that were not seen during fit
    # as all-zero rows, so transforming the test set cannot fail on new values.
    ("onehot", OneHotEncoder(handle_unknown="ignore"), make_column_selector(dtype_include=object)),
    ("scale", StandardScaler(), make_column_selector(dtype_include=np.number)),
])

# Split features from the target column.
X_train, y_train = train.drop("NObeyesdad", axis=1), train["NObeyesdad"]

# Fit on the training rows only: fitting on train + test would let test-set
# statistics (scaler mean/std, category lists) leak into the training features.
X_train = preprocess.fit_transform(X_train)
X_test = preprocess.transform(test)

# XGBoost needs integer class ids; the encoder is kept so that predictions can
# be mapped back to the original label strings.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)

## Hyperparameter optimization

In [7]:
# The search is expensive (1500 trials, each a 5-fold cross-validation) and
# requires `optuna`, `cupy` and a CUDA device, so it is opt-in. With the flag
# left at False the notebook runs top to bottom without those packages.
RUN_OPTUNA_SEARCH = False


def objective(trial):
    """Return the mean 5-fold CV accuracy of an XGBClassifier for one Optuna trial.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean accuracy over the 5 cross-validation folds (to be maximized).
    """
    import cupy as cp  # lazy import: only needed when the search is actually run

    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 5000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
        'gamma': trial.suggest_float('gamma', 1e-3, 1, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 1, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        "min_child_weight": trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1, log=True),
    }

    model = XGBClassifier(random_state=42, device='cuda', **params)
    # The features are moved to the GPU to match device='cuda'.
    score = cross_val_score(model, cp.array(X_train), y_train, scoring='accuracy', cv=5, n_jobs=-1).mean()
    return score


if RUN_OPTUNA_SEARCH:
    import optuna  # lazy import: optional dependency, only needed for the search

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=1500)

    print('Best trial:', study.best_trial.params)
    print('Best score:', study.best_value)

NameError: name 'optuna' is not defined

## Model training and evaluation

In [8]:
# Hard-coded hyperparameters, used in place of `study.best_trial.params` so that
# this cell does not depend on a hyperparameter search having been run.
params = {
    'n_estimators': 1312,
    'learning_rate': 0.018279520260162645,
    'gamma': 0.0024196354156454324,
    'reg_alpha': 0.9025931173755949,
    'reg_lambda': 0.06835667255875388,
    'max_depth': 5,
    'min_child_weight': 5,
    'subsample': 0.883274050086088,
    'colsample_bytree': 0.6579828557036317,
}
xgb = XGBClassifier(**params, random_state=42)

# Mean accuracy over 5 cross-validation folds, evaluated on all available cores.
score = cross_val_score(
    xgb,
    X_train,
    y_train,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
).mean()
print("Accuracy: ", score)

# Final model: refit on the complete training set.
xgb.fit(X_train, y_train)

Accuracy:  0.9147297001532063


## Model prediction

In [10]:
# Predict integer class ids for the test set and map them back to label strings.
y_pred = label_encoder.inverse_transform(xgb.predict(X_test))

# Two-column submission table (`id`, `NObeyesdad`), one row per test sample.
submission = (
    pd.Series(y_pred, index=test.index, name="NObeyesdad")
    .rename_axis("id")
    .reset_index()
)
submission.to_csv("vatsalsubmission.csv", index=False)