# XGBOOST

In [1]:
!pip install xgboost==1.7.6



In [22]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix


In [3]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

## Importando a base de dados

In [None]:
#df_all = pd.read_parquet("../../data/data_processed/participants/Participants_all.parquet")

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
df_all = pd.read_parquet("/content/drive/MyDrive/capture24/Participants_all.parquet")

In [6]:
df_all.head()

Unnamed: 0,pid,window_start,window_end,n_samples,duration_seconds,sex,age_group,label:Walmsley2020,label:Walmsley2020_enc,label:WillettsSpecific2018,label:WillettsSpecific2018_enc,label:WillettsMET2018,label:WillettsMET2018_enc,x_mean,x_std,x_min,x_max,y_mean,y_std,y_min,y_max,z_mean,z_std,z_min,z_max,energy_x,energy_y,energy_z,energy_total,magnitude_mean,corr_xy,corr_xz,corr_yz,fft_dom_freq,fft_peak_power,hour_sin,hour_cos
0,P001,2016-11-13 02:18:00,2016-11-13 02:18:05,500,4.99,1,2,sleep,3,sleep,5,sleep,5,-0.468161,0.004565,-0.482334,-0.46669,-0.537512,0.006892,-0.548902,-0.533341,0.657518,0.00396,0.643077,0.673867,0.219195,0.288966,0.432345,0.940507,0.969787,-0.14848,-0.077644,0.275487,0.0,0.0,0.566406,0.824126
1,P001,2016-11-13 02:18:05,2016-11-13 02:18:10,500,4.99,1,2,sleep,3,sleep,5,sleep,5,-0.470069,0.006437,-0.482334,-0.46669,-0.537045,0.006771,-0.548902,-0.51778,0.657702,0.003627,0.643077,0.673867,0.221007,0.288463,0.432585,0.942055,0.970582,-0.108382,-0.028882,0.137541,0.0,0.0,0.566706,0.82392
2,P001,2016-11-13 02:18:10,2016-11-13 02:18:15,500,4.99,1,2,sleep,3,sleep,5,sleep,5,-0.469694,0.006162,-0.482334,-0.46669,-0.537947,0.007104,-0.548902,-0.533341,0.657764,0.003369,0.643077,0.673867,0.22065,0.289438,0.432665,0.942753,0.970944,-0.260468,-0.079268,0.204062,0.0,0.0,0.567005,0.823714
3,P001,2016-11-13 02:18:15,2016-11-13 02:18:20,500,4.99,1,2,sleep,3,sleep,5,sleep,5,-0.469287,0.005821,-0.482334,-0.46669,-0.537512,0.006962,-0.548902,-0.51778,0.657733,0.003567,0.643077,0.673867,0.220264,0.288967,0.432626,0.941857,0.970483,-0.243211,-0.092415,0.223157,0.0,0.0,0.567305,0.823508
4,P001,2016-11-13 02:18:20,2016-11-13 02:18:25,500,4.99,1,2,sleep,3,sleep,5,sleep,5,-0.47082,0.006896,-0.482334,-0.46669,-0.535333,0.005291,-0.548902,-0.51778,0.658226,0.002743,0.643077,0.673867,0.221719,0.286609,0.433269,0.941597,0.97035,-0.225457,-0.07925,0.230302,0.0,0.0,0.567604,0.823302


In [7]:
#features baseadas em aceleração
features_acc = [
    'x_mean', 'x_std','x_min', 'x_max',
    'y_mean', 'y_std', 'y_min', 'y_max',
    'z_mean','z_std', 'z_min', 'z_max',
    'energy_x', 'energy_y', 'energy_z','energy_total',
    'magnitude_mean', 'corr_xy', 'corr_xz', 'corr_yz',
    'fft_dom_freq', 'fft_peak_power'
]

#features de contexto, não baseadas em aceleração.
features_cont = ['sex', 'age_group', 'hour_sin', 'hour_cos']

#colunas que contem nas e precisam ser zeradas
cols_corr = ["corr_xy", "corr_xz", "corr_yz"]
df_all[cols_corr] = df_all[cols_corr].fillna(0)

#Participante para teste e target
PID_TEST = "P043"
TARGET = "label:WillettsSpecific2018"
TARGET_enc = "label:WillettsSpecific2018_enc"
n_class = df_all[TARGET].nunique()

TREE_METHOD = "gpu_hist" #gpu_hist se estiver na GPU ou hist se não estiver

In [8]:
label2enc = (
    df_all[[TARGET,TARGET_enc]]
    .drop_duplicates()
    .set_index(TARGET)[TARGET_enc]
    .sort_index()
    .to_dict()
)

In [9]:
enc2label = {
    v: k for k, v in label2enc.items()
}

In [10]:
df_all_train = df_all[df_all["pid"] != PID_TEST].reset_index(drop=True)
df_all_test  = df_all[df_all["pid"] == PID_TEST].reset_index(drop=True)

y_train = df_all_train[TARGET_enc].astype(int)
y_test  = df_all_test[TARGET_enc].astype(int)

X_train_v1 = df_all_train[features_acc].astype("float32")
X_test_v1  = df_all_test[features_acc].astype("float32")

# Subamostra para tuning

In [11]:
N_TUNE = 200_000  # ajuste se quiser maior/menor

frac = min(1.0, N_TUNE / len(X_train_v1))

X_tune, _, y_tune, _ = train_test_split(
    X_train_v1,
    y_train,
    train_size=frac,
    stratify=y_train,
    random_state=42
)

len(X_tune), len(y_tune)

(200000, 200000)

In [12]:
X_tune_train, X_tune_val, y_tune_train, y_tune_val = train_test_split(
    X_tune,
    y_tune,
    test_size=0.2,
    stratify=y_tune,
    random_state=42
)

## Pesos de classe → sample_weight

In [13]:
classes = np.unique(y_train)
class_weights_arr = compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=y_train
)
class_weight_dict = dict(zip(classes, class_weights_arr))
class_weight_dict

{np.int64(0): np.float64(10.400329564179783),
 np.int64(1): np.float64(1.4395573627376186),
 np.int64(2): np.float64(9.137606709600119),
 np.int64(3): np.float64(2.400716140709845),
 np.int64(4): np.float64(0.2787789710323946),
 np.int64(5): np.float64(0.2707976778042112),
 np.int64(6): np.float64(18.48842424242424),
 np.int64(7): np.float64(3.1197443327083687),
 np.int64(8): np.float64(2.6506893355731913),
 np.int64(9): np.float64(1.5350167728950017)}

In [14]:
# vetor de pesos por amostra no tuning
sample_weight_tune = y_tune_train.map(class_weight_dict).values
sample_weight_val  = y_tune_val.map(class_weight_dict).values

## TUNING NA SUBAMOSTRA

In [15]:
param_grid = [
    {"n_estimators": 300, "max_depth": 5, "learning_rate": 0.1,  "subsample": 0.8, "colsample_bytree": 0.8},
    {"n_estimators": 500, "max_depth": 5, "learning_rate": 0.1,  "subsample": 0.8, "colsample_bytree": 0.8},
    {"n_estimators": 300, "max_depth": 7, "learning_rate": 0.05, "subsample": 0.9, "colsample_bytree": 0.9},
    {"n_estimators": 500, "max_depth": 7, "learning_rate": 0.05, "subsample": 0.9, "colsample_bytree": 0.9},
]

In [16]:
best_params = None
best_f1 = -np.inf

for params in param_grid:
    print("\nTestando params:", params)

    model_xgb = XGBClassifier(
        objective="multi:softprob",
        num_class=n_class,
        tree_method=TREE_METHOD,
        eval_metric="mlogloss",
        early_stopping_rounds=20,
        **params,
        n_jobs=-1,
    )

    model_xgb.fit(
        X_tune_train,
        y_tune_train,
        sample_weight=sample_weight_tune,
        eval_set=[(X_tune_val, y_tune_val)],
        sample_weight_eval_set=[sample_weight_val],
        verbose=False,
    )

    # Previsão na validação
    y_val_pred = model_xgb.predict(X_tune_val)
    macro_f1_val = f1_score(y_tune_val, y_val_pred, average="macro")

    print(f"Macro F1 (val): {macro_f1_val:.4f}")

    if macro_f1_val > best_f1:
        best_f1 = macro_f1_val
        best_params = params

print("\nMelhores params:", best_params)
print("Melhor Macro F1 (val):", best_f1)


Testando params: {'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.1, 'subsample': 0.8, 'colsample_bytree': 0.8}
Macro F1 (val): 0.4489

Testando params: {'n_estimators': 500, 'max_depth': 5, 'learning_rate': 0.1, 'subsample': 0.8, 'colsample_bytree': 0.8}
Macro F1 (val): 0.4511

Testando params: {'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.05, 'subsample': 0.9, 'colsample_bytree': 0.9}
Macro F1 (val): 0.4632

Testando params: {'n_estimators': 500, 'max_depth': 7, 'learning_rate': 0.05, 'subsample': 0.9, 'colsample_bytree': 0.9}
Macro F1 (val): 0.4666

Melhores params: {'n_estimators': 500, 'max_depth': 7, 'learning_rate': 0.05, 'subsample': 0.9, 'colsample_bytree': 0.9}
Melhor Macro F1 (val): 0.46658134980387056


## Treinar o XGBOOST

In [17]:
sample_weight_full = y_train.map(class_weight_dict).values

In [18]:
X_train_full, X_val_full, y_train_full, y_val_full = train_test_split(
    X_train_v1,
    y_train,
    test_size=0.1,
    stratify=y_train,
    random_state=42
)

sw_train_full = y_train_full.map(class_weight_dict).values
sw_val_full   = y_val_full.map(class_weight_dict).values

In [19]:
model_xgb_final = XGBClassifier(
    objective="multi:softprob",
    num_class=n_class,
    tree_method=TREE_METHOD,
    eval_metric="mlogloss",
    **best_params,
    n_jobs=-1,
    early_stopping_rounds=30
)

model_xgb_final.fit(
    X_train_full,
    y_train_full,
    sample_weight=sw_train_full,
    eval_set=[(X_val_full, y_val_full)],
    sample_weight_eval_set=[sw_val_full],
    verbose=True
)

[0]	validation_0-mlogloss:2.23875
[1]	validation_0-mlogloss:2.18419
[2]	validation_0-mlogloss:2.13661
[3]	validation_0-mlogloss:2.09417
[4]	validation_0-mlogloss:2.05574
[5]	validation_0-mlogloss:2.02146
[6]	validation_0-mlogloss:1.98940
[7]	validation_0-mlogloss:1.95993
[8]	validation_0-mlogloss:1.93224
[9]	validation_0-mlogloss:1.90654
[10]	validation_0-mlogloss:1.88272
[11]	validation_0-mlogloss:1.86053
[12]	validation_0-mlogloss:1.83957
[13]	validation_0-mlogloss:1.82010
[14]	validation_0-mlogloss:1.80147
[15]	validation_0-mlogloss:1.78396
[16]	validation_0-mlogloss:1.76723
[17]	validation_0-mlogloss:1.75124
[18]	validation_0-mlogloss:1.73633
[19]	validation_0-mlogloss:1.72226
[20]	validation_0-mlogloss:1.70828
[21]	validation_0-mlogloss:1.69525
[22]	validation_0-mlogloss:1.68289
[23]	validation_0-mlogloss:1.67096
[24]	validation_0-mlogloss:1.65952
[25]	validation_0-mlogloss:1.64849
[26]	validation_0-mlogloss:1.63806
[27]	validation_0-mlogloss:1.62801
[28]	validation_0-mlogloss:1.6

AttributeError: 'super' object has no attribute '__sklearn_tags__'

AttributeError: 'super' object has no attribute '__sklearn_tags__'

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.9, early_stopping_rounds=30,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, gamma=None, gpu_id=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.05, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=7,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=500, n_jobs=-1,
              num_class=10, num_parallel_tree=None, objective='multi:softprob', ...)

## Avaliação em P043

In [20]:
y_pred_test = model_xgb_final.predict(X_test_v1)
test_acc_xgb = accuracy_score(y_test, y_pred_test)
macro_f1_xgb = f1_score(y_test, y_pred_test, average="macro")

print("XGBoost Test accuracy:", test_acc_xgb)
print("XGBoost Macro F1:", macro_f1_xgb)

XGBoost Test accuracy: 0.5469424869288475
XGBoost Macro F1: 0.3623907747850887


In [23]:
f1_per_class_xgb = f1_score(y_test, y_pred_test, average=None)
f1_named_xgb = {enc2label[i]: f1_per_class_xgb[i] for i in range(len(f1_per_class_xgb))}

print("F1 por classe (XGBoost):")
for label, v in f1_named_xgb.items():
    print(f"{label:20s} {v:.4f}")

cm_xgb = confusion_matrix(y_test, y_pred_test)
labels_order = [enc2label[i] for i in range(10)]
cm_xgb_df = pd.DataFrame(cm_xgb, index=labels_order, columns=labels_order)
cm_xgb_df

F1 por classe (XGBoost):
bicycling            0.6205
household-chores     0.5145
manual-work          0.0078
mixed-activity       0.1097
sitting              0.5598
sleep                0.9102
sports               0.1522
standing             0.2157
vehicle              0.3123
walking              0.2212


Unnamed: 0,bicycling,household-chores,manual-work,mixed-activity,sitting,sleep,sports,standing,vehicle,walking
bicycling,188,68,25,1,2,3,7,16,23,1
household-chores,16,1079,174,39,105,8,30,91,145,71
manual-work,0,3,2,6,0,0,5,2,1,5
mixed-activity,2,252,37,80,152,6,19,96,227,113
sitting,40,279,36,66,1423,100,22,420,418,26
sleep,2,30,19,9,404,3669,30,46,42,9
sports,7,108,31,49,11,0,65,115,186,61
standing,7,195,52,146,114,15,10,234,263,56
vehicle,1,47,1,3,36,1,0,8,329,0
walking,9,375,110,76,7,0,33,50,47,149
