# Classificacao dos estagios do sono com XGBoost

Este notebook organiza o pipeline de treino, validacao e teste para o modelo aplicado aos dados de estagios do sono.

In [1]:
import random
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report, confusion_matrix, f1_score, log_loss
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

## Configuracao dos dados

In [2]:
BASE_PATH = Path().resolve()
if not (BASE_PATH / "datalake").exists():
    BASE_PATH = BASE_PATH.parents[2]
DATASETS_PATH = BASE_PATH / "datalake" / "data-for-model"
TRAINING_DATA_FILE = DATASETS_PATH / "train" / "train_sleep_cassette.parquet"
VALIDATION_DATA_FILE = DATASETS_PATH / "val" / "val_sleep_cassette.parquet"
TEST_DATA_FILE = DATASETS_PATH / "test" / "test_sleep_cassette.parquet"
STAGES = ["W", "N1", "N2", "N3", "REM"]
STAGE2ID = {stage: idx for idx, stage in enumerate(STAGES)}
df_train = pd.read_parquet(TRAINING_DATA_FILE, engine="fastparquet")
df_val = pd.read_parquet(VALIDATION_DATA_FILE, engine="fastparquet")
df_test = pd.read_parquet(TEST_DATA_FILE, engine="fastparquet")

## Preparacao das tabelas

In [3]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)

SEED = 42
set_seed(SEED)

SEX_MAP = {"F": 0.0, "M": 1.0}
frames = [df_train, df_val, df_test]
for frame in frames:
    frame["sex"] = frame["sex"].map(SEX_MAP).fillna(0.5).astype(np.float32)
    frame["stage_id"] = frame["stage"].map(STAGE2ID).astype(np.int64)

IDENTIFIERS = ["subject_id", "night_id", "epoch_idx", "stage", "stage_id"]
FEATURES = [column for column in df_train.columns if column not in IDENTIFIERS]
FEATURES.sort()

scaler = StandardScaler()
x_train = scaler.fit_transform(df_train[FEATURES]).astype(np.float32)
x_val = scaler.transform(df_val[FEATURES]).astype(np.float32)
x_test = scaler.transform(df_test[FEATURES]).astype(np.float32)

y_train = df_train["stage_id"].to_numpy(dtype=np.int64)
y_val = df_val["stage_id"].to_numpy(dtype=np.int64)
y_test = df_test["stage_id"].to_numpy(dtype=np.int64)

class_distribution = df_train["stage_id"].value_counts().sort_index()
class_weights = (len(df_train) / (len(STAGES) * class_distribution)).astype(np.float64)
weight_lookup = {idx: float(value) for idx, value in class_weights.items()}
train_weights = np.array([weight_lookup[label] for label in y_train], dtype=np.float64)
val_weights = np.array([weight_lookup[label] for label in y_val], dtype=np.float64)
test_weights = np.array([weight_lookup[label] for label in y_test], dtype=np.float64)

## Distribuicao das classes no treino

In [4]:
class_summary = pd.DataFrame({
    "stage": STAGES,
    "samples": [int(class_distribution.get(idx, 0)) for idx in range(len(STAGES))]
})
class_summary["proportion"] = class_summary["samples"] / class_summary["samples"].sum()
class_summary

Unnamed: 0,stage,samples,proportion
0,W,34935,0.309837
1,N1,13882,0.123119
2,N2,40344,0.357809
3,N3,8532,0.07567
4,REM,15060,0.133566


## Treinamento

In [5]:
train_weights_array = train_weights
val_weights_array = val_weights
model = XGBClassifier(
    objective="multi:softprob",
    num_class=len(STAGES),
    learning_rate=0.045,
    max_depth=8,
    n_estimators=1600,
    subsample=0.85,
    colsample_bytree=0.7,
    min_child_weight=3,
    gamma=0.1,
    reg_lambda=1.2,
    reg_alpha=0.05,
    tree_method="hist",
    random_state=SEED,
    n_jobs=-1,
    eval_metric="mlogloss")
model.fit(
    x_train,
    y_train,
    sample_weight=train_weights_array,
    eval_set=[(x_train, y_train), (x_val, y_val)],
    verbose=50
)
history = model.evals_result()
val_logloss = history["validation_1"]["mlogloss"]
best_iteration = int(np.argmin(val_logloss))
print(f"melhor iteracao valida: {best_iteration + 1} de {len(val_logloss)}")

[0]	validation_0-mlogloss:1.54457	validation_1-mlogloss:1.55252


[50]	validation_0-mlogloss:0.54480	validation_1-mlogloss:0.74415


[100]	validation_0-mlogloss:0.37734	validation_1-mlogloss:0.66304


[150]	validation_0-mlogloss:0.31467	validation_1-mlogloss:0.65086


[200]	validation_0-mlogloss:0.27401	validation_1-mlogloss:0.64612


[250]	validation_0-mlogloss:0.23988	validation_1-mlogloss:0.64401


[300]	validation_0-mlogloss:0.21243	validation_1-mlogloss:0.64360


[350]	validation_0-mlogloss:0.18874	validation_1-mlogloss:0.64450


[400]	validation_0-mlogloss:0.16781	validation_1-mlogloss:0.64531


[450]	validation_0-mlogloss:0.15008	validation_1-mlogloss:0.64692


[500]	validation_0-mlogloss:0.13522	validation_1-mlogloss:0.64904


[550]	validation_0-mlogloss:0.12227	validation_1-mlogloss:0.65125


[600]	validation_0-mlogloss:0.11003	validation_1-mlogloss:0.65295


[650]	validation_0-mlogloss:0.09983	validation_1-mlogloss:0.65631


[700]	validation_0-mlogloss:0.09058	validation_1-mlogloss:0.66118


[750]	validation_0-mlogloss:0.08235	validation_1-mlogloss:0.66545


[800]	validation_0-mlogloss:0.07500	validation_1-mlogloss:0.66950


[850]	validation_0-mlogloss:0.06842	validation_1-mlogloss:0.67286


[900]	validation_0-mlogloss:0.06243	validation_1-mlogloss:0.67710


[950]	validation_0-mlogloss:0.05734	validation_1-mlogloss:0.68125


[1000]	validation_0-mlogloss:0.05255	validation_1-mlogloss:0.68535


[1050]	validation_0-mlogloss:0.04831	validation_1-mlogloss:0.69104


[1100]	validation_0-mlogloss:0.04433	validation_1-mlogloss:0.69620


[1150]	validation_0-mlogloss:0.04087	validation_1-mlogloss:0.70017


[1200]	validation_0-mlogloss:0.03767	validation_1-mlogloss:0.70479


[1250]	validation_0-mlogloss:0.03499	validation_1-mlogloss:0.71019


[1300]	validation_0-mlogloss:0.03252	validation_1-mlogloss:0.71452


[1350]	validation_0-mlogloss:0.03022	validation_1-mlogloss:0.71927


[1400]	validation_0-mlogloss:0.02823	validation_1-mlogloss:0.72324


[1450]	validation_0-mlogloss:0.02655	validation_1-mlogloss:0.72776


[1500]	validation_0-mlogloss:0.02513	validation_1-mlogloss:0.73150


[1550]	validation_0-mlogloss:0.02394	validation_1-mlogloss:0.73440


[1599]	validation_0-mlogloss:0.02304	validation_1-mlogloss:0.73674


melhor iteracao valida: 287 de 1600


## Historico de treino

In [6]:
history_df = pd.DataFrame({
    "iteration": np.arange(1, len(history["validation_0"]["mlogloss"]) + 1),
    "train_logloss": history["validation_0"]["mlogloss"],
    "val_logloss": history["validation_1"]["mlogloss"],
    "train_error": history["validation_0"].get("merror", [np.nan] * len(history["validation_0"]["mlogloss"])),
    "val_error": history["validation_1"].get("merror", [np.nan] * len(history["validation_1"]["mlogloss"]))
})
history_df

Unnamed: 0,iteration,train_logloss,val_logloss,train_error,val_error
0,1,1.544570,1.552517,,
1,2,1.485018,1.501703,,
2,3,1.431268,1.454975,,
3,4,1.381056,1.411105,,
4,5,1.334321,1.371404,,
...,...,...,...,...,...
1595,1596,0.023084,0.736642,,
1596,1597,0.023077,0.736669,,
1597,1598,0.023060,0.736681,,
1598,1599,0.023054,0.736717,,


## Avaliacao no conjunto de teste

In [7]:
iteration_range = (0, best_iteration + 1)
val_predictions = model.predict(x_val, iteration_range=iteration_range)
test_predictions = model.predict(x_test, iteration_range=iteration_range)
test_probabilities = model.predict_proba(x_test, iteration_range=iteration_range)
test_loss = log_loss(y_test, test_probabilities)
test_accuracy = accuracy_score(y_test, test_predictions)
test_balanced_acc = balanced_accuracy_score(y_test, test_predictions)
test_macro_f1 = f1_score(y_test, test_predictions, average="macro")
summary = pd.DataFrame({
    "metric": ["loss", "accuracy", "balanced_accuracy", "macro_f1"],
    "value": [test_loss, test_accuracy, test_balanced_acc, test_macro_f1]
})
print(summary.to_string(index=False))
print()
report = classification_report(y_test, test_predictions, target_names=STAGES, output_dict=True)
report_df = pd.DataFrame(report).transpose()
report_table = report_df.loc[STAGES + ["macro avg", "weighted avg"], ["precision", "recall", "f1-score", "support"]]
print(report_table.round(3).to_string())
print()
confusion = confusion_matrix(y_test, test_predictions)
confusion_df = pd.DataFrame(confusion, index=STAGES, columns=STAGES)
print(confusion_df)

           metric    value
             loss 0.588359
         accuracy 0.777412
balanced_accuracy 0.724430
         macro_f1 0.712344

              precision  recall  f1-score  support
W                 0.903   0.880     0.891  11429.0
N1                0.365   0.435     0.397   3425.0
N2                0.848   0.806     0.826  13722.0
N3                0.712   0.808     0.757   1983.0
REM               0.687   0.694     0.690   5319.0
macro avg         0.703   0.724     0.712  35878.0
weighted avg      0.788   0.777     0.782  35878.0

         W    N1     N2    N3   REM
W    10054   847    144     7   377
N1     560  1491    746     9   619
N2     143  1214  11056   626   683
N3       4    13    364  1602     0
REM    370   523    731     6  3689


## Analise adicional

In [8]:
importances = pd.Series(model.feature_importances_, index=FEATURES)
top_importances = importances.sort_values(ascending=False).head(20)
top_importances.to_frame(name="importance")

Unnamed: 0,importance
EEG_Fpz_Cz_beta_relpow_256,0.139073
EEG_Pz_Oz_aperiodic_slope_256,0.067852
EEG_Pz_Oz_beta_relpow_256_roll_mean_5,0.045061
EEG_Pz_Oz_beta_relpow_256_roll_mean_10,0.038024
EEG_Fpz_Cz_theta_relpow_256_roll_mean_5,0.036534
EOG_rms_roll_mean_5,0.036114
EOG_sef95_256,0.027828
EMG_submental_median_1hz_roll_mean_5,0.026634
EEG_Pz_Oz_beta_relpow_256,0.02459
EEG_Pz_Oz_delta_relpow_256,0.023041
