In [1]:
import os
import pandas as pd
import numpy as np
import optuna
from optuna.integration import OptunaSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, classification_report
    )
from sklearn.feature_selection import mutual_info_classif
import matplotlib.pyplot as plt
import seaborn as sns
import shap


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data_path = '../../data/preprocessed/preprocessed_data_20250720_131842'
X_train_baseline_path = os.path.join(data_path, 'X_train_baseline.csv')
X_test_baseline_path = os.path.join(data_path, 'X_test_baseline.csv')
y_train_baseline_path = os.path.join(data_path, 'y_train_baseline.csv')
y_test_baseline_path = os.path.join(data_path, 'y_test_baseline.csv')


X_train_scaled_path = os.path.join(data_path, 'X_train_scaled.csv')
X_test_scaled_path = os.path.join(data_path, 'X_test_scaled.csv')
y_train_scaled_path = os.path.join(data_path, 'y_train_scaled.csv')
y_test_scaled_path = os.path.join(data_path, 'y_test_scaled.csv')


In [3]:
X_train_baseline = pd.read_csv(X_train_baseline_path)
X_test_baseline = pd.read_csv(X_test_baseline_path)
y_train_baseline = pd.read_csv(y_train_baseline_path)
y_test_baseline = pd.read_csv(y_test_baseline_path)

In [4]:
X_train_scaled = pd.read_csv(X_train_scaled_path)
X_test_scaled = pd.read_csv(X_test_scaled_path)
y_train_scaled = pd.read_csv(y_train_scaled_path)
y_test_scaled = pd.read_csv(y_test_scaled_path)

In [5]:
display(X_train_baseline.head(3))
display(y_train_baseline.head(3))

Unnamed: 0,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,...,TUE,CALC,Gender_Male,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking,Age_Category_Young Adult,Age_Category_Adult,Age_Category_Senior
0,1.76,79.0,1,1,2.0,3.0,2,0,3.0,0,...,2.0,2,True,False,False,True,False,True,False,False
1,1.75,70.0,0,0,2.0,3.0,1,0,3.0,0,...,1.0,0,True,False,False,True,False,True,False,False
2,1.7,55.3,1,1,3.0,3.0,1,0,2.0,0,...,0.0,1,True,False,False,True,False,True,False,False


Unnamed: 0,NObeyesdad
0,5
1,1
2,1


In [6]:
display(X_train_scaled.head(3))
display(y_train_scaled.head(3))

Unnamed: 0,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,...,TUE,CALC,Gender_Male,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking,Age_Category_Young Adult,Age_Category_Adult,Age_Category_Senior
0,3.319501,2.315363,1,1,3.0,3.666667,3.666667,0,5.0,0,...,5.0,3.666667,True,False,False,True,False,True,False,False
1,3.243115,2.019406,0,0,3.0,3.666667,2.333333,0,5.0,0,...,3.0,1.0,True,False,False,True,False,True,False,False
2,2.861183,1.536011,1,1,5.0,3.666667,2.333333,0,3.0,0,...,1.0,2.333333,True,False,False,True,False,True,False,False


Unnamed: 0,NObeyesdad
0,5
1,1
2,1


## **Baseline**

In [7]:
# Baseline
baseline_dt = DecisionTreeClassifier()
baseline_dt.fit(X_train_baseline, y_train_baseline)
preds_baseline = baseline_dt.predict(X_test_baseline)

In [8]:
print('{:>8}: {:.3f}'.format('Accuracy', accuracy_score(y_test_baseline, preds_baseline)))
print('{:>8}: {:.3f}'.format('F1 score', f1_score(y_test_baseline, preds_baseline, average='macro')))
print('{:>8}: {:.3f}'.format('ROC score', roc_auc_score(
    y_test_baseline, baseline_dt.predict_proba(X_test_baseline), multi_class='ovr', average='macro'
    )
    ))
print('{:>8}: {}'.format('Report', classification_report(y_test_baseline, preds_baseline)))

Accuracy: 0.908
F1 score: 0.907
ROC score: 0.945
  Report:               precision    recall  f1-score   support

           0       1.00      0.83      0.91        54
           1       0.76      0.88      0.82        58
           2       0.90      0.91      0.91        70
           3       0.97      0.95      0.96        60
           4       1.00      0.98      0.99        65
           5       0.86      0.86      0.86        58
           6       0.90      0.91      0.91        58

    accuracy                           0.91       423
   macro avg       0.91      0.91      0.91       423
weighted avg       0.91      0.91      0.91       423



## **Random Forest w/optuna**

In [9]:
def opt_rf(X, y):
    """Objetive function for Optuna optimization.

    :param trial: Optuna trial object.
    :return: Score of the model based on cross-validation.
    """
    
    estimator = RandomForestClassifier(random_state=42)
    params = {
        'criterion': optuna.distributions.CategoricalDistribution(['gini', 'entropy']),
        'n_estimators': optuna.distributions.IntDistribution(10, 100),
        'max_depth': optuna.distributions.IntDistribution(2, 32),
        }

    optuna_search = OptunaSearchCV(estimator, params, cv=5, scoring='accuracy')
    
    return  optuna_search.fit(X, y)

In [10]:
_opt_rf = opt_rf(X_train_scaled, y_train_scaled)
best_score_rf, best_params_rf = _opt_rf.best_score_, _opt_rf.best_params_
best_rf = _opt_rf.best_estimator_

  optuna_search = OptunaSearchCV(estimator, params, cv=5, scoring='accuracy')
[I 2025-07-23 22:29:56,051] A new study created in memory with name: no-name-d0d5a703-bd09-4cfa-93ec-b90e003bd431
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
[I 2025-07-23 22:29:56,869] Trial 0 finished with value: 0.9378031008024159 and parameters: {'criterion': 'gini', 'n_estimators': 59, 'max_depth': 17}. Best is trial 0 with value: 0.9378031008024159.
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
[I 2025-07-23 22:29:57,444] Trial 1 finished with value: 0.8264200305515075 and parameters: {'criterion': 'entropy', 'n_estimato

In [11]:
print(best_score_rf)
print(best_params_rf)

0.9472775797587485
{'criterion': 'entropy', 'n_estimators': 86, 'max_depth': 31}


In [12]:
# reoptimize model
preds_rf = best_rf.predict(X_test_scaled)
print('{:>8}: {:.3f}'.format('Accuracy', accuracy_score(y_test_scaled, preds_rf)))
print('{:>8}: {:.3f}'.format('F1 score', f1_score(y_test_scaled, preds_rf, average='macro')))
print('{:>8}: {:.3f}'.format('ROC score', roc_auc_score(y_test_scaled, best_rf.predict_proba(X_test_scaled), multi_class='ovr', average='macro')))
print('{:>8}: {}'.format('Report', classification_report(y_test_scaled, preds_rf)))


Accuracy: 0.943
F1 score: 0.943
ROC score: 0.996
  Report:               precision    recall  f1-score   support

           0       1.00      0.94      0.97        54
           1       0.81      0.97      0.88        58
           2       0.94      0.96      0.95        70
           3       0.98      0.98      0.98        60
           4       1.00      0.98      0.99        65
           5       0.94      0.86      0.90        58
           6       0.95      0.90      0.92        58

    accuracy                           0.94       423
   macro avg       0.95      0.94      0.94       423
weighted avg       0.95      0.94      0.94       423



## **Feature Selection**

In [13]:
# Feature seleciton
feat_sel = mutual_info_classif(X_train_scaled, y_train_scaled, random_state=42)
feat_sel_df = pd.DataFrame({
    'feature': X_train_scaled.columns,
    'mutual_info': feat_sel
}).sort_values(by='mutual_info', ascending=False)

display(feat_sel_df)

  y = column_or_1d(y, warn=True)


Unnamed: 0,feature,mutual_info
1,Weight,1.248537
0,Height,0.429016
4,FCVC,0.383194
10,FAF,0.282351
8,CH2O,0.274618
11,TUE,0.257995
5,NCP,0.238799
13,Gender_Male,0.219307
2,family_history_with_overweight,0.158614
6,CAEC,0.154062


In [14]:
selected_features = feat_sel_df.loc[feat_sel_df['mutual_info'] > 0.1, 'feature'].tolist()
print(selected_features)

['Weight', 'Height', 'FCVC', 'FAF', 'CH2O', 'TUE', 'NCP', 'Gender_Male', 'family_history_with_overweight', 'CAEC']


## **Random Forest w/CV**

In [15]:
X_train_scaled_red = X_train_scaled[selected_features]
X_test_scaled_red = X_test_scaled[selected_features]

In [16]:
_opt_rf = opt_rf(X_train_scaled_red, y_train_scaled)
best_score_rf, best_params_rf = _opt_rf.best_score_, _opt_rf.best_params_
best_rf = _opt_rf.best_estimator_

  optuna_search = OptunaSearchCV(estimator, params, cv=5, scoring='accuracy')
[I 2025-07-23 22:32:57,405] A new study created in memory with name: no-name-05679113-5fe4-42ea-bec4-74abb32d7c78
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
[I 2025-07-23 22:32:58,111] Trial 0 finished with value: 0.943125032921883 and parameters: {'criterion': 'gini', 'n_estimators': 47, 'max_depth': 14}. Best is trial 0 with value: 0.943125032921883.
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
[I 2025-07-23 22:32:58,615] Trial 1 finished with value: 0.9443014415395151 and parameters: {'criterion': 'entropy', 'n_estimators

In [19]:
best_score_rf

0.9472670447562024

In [17]:
best_rf

In [18]:
# reoptimize model
preds_rf = best_rf.predict(X_test_scaled_red)
preds_rf_train = best_rf.predict(X_train_scaled_red)
print('{:>8}: {:.3f}'.format('Train Accuracy', accuracy_score(y_train_scaled, preds_rf_train)))
print('{:>8}: {:.3f}'.format('Accuracy', accuracy_score(y_test_scaled, preds_rf)))
print('{:>8}: {:.3f}'.format('F1 score', f1_score(y_test_scaled, preds_rf, average='macro')))
print('{:>8}: {:.3f}'.format('ROC score', roc_auc_score(y_test_scaled, best_rf.predict_proba(X_test_scaled_red), multi_class='ovr', average='macro')))
print('{:>8}: {}'.format('Report', classification_report(y_test_scaled, preds_rf)))

Train Accuracy: 1.000
Accuracy: 0.953
F1 score: 0.952
ROC score: 0.997
  Report:               precision    recall  f1-score   support

           0       1.00      0.91      0.95        54
           1       0.81      0.97      0.88        58
           2       0.97      0.97      0.97        70
           3       0.98      1.00      0.99        60
           4       1.00      0.98      0.99        65
           5       0.96      0.88      0.92        58
           6       0.96      0.95      0.96        58

    accuracy                           0.95       423
   macro avg       0.96      0.95      0.95       423
weighted avg       0.96      0.95      0.95       423



## **SHAP for feature importance**

In [None]:
_X_test_scaled_red = X_test_scaled_red.replace({True: 1, False: 0}).values

In [None]:
explainer = shap.TreeExplainer(
    best_rf,
    _X_test_scaled_red,
    feature_names = selected_features
    )

shap_values = explainer(_X_test_scaled_red, check_additivity=False)


In [None]:
shap.plots.beeswarm(shap_values)