# Classificação dos estágios do sono com LightGBM

Este notebook organiza o pipeline de treino, validação e teste do modelo LightGBM aplicado aos dados de estágios do sono.

In [1]:
import random
from pathlib import Path

import numpy as np
import pandas as pd
import lightgbm as lgb
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report, confusion_matrix, f1_score, log_loss
from sklearn.preprocessing import StandardScaler

## Configuração dos dados

In [2]:
# Resolve the project root: use the working directory when it contains the
# "datalake" folder, otherwise fall back to the directory three levels up.
BASE_PATH = Path().resolve()
if not BASE_PATH.joinpath("datalake").exists():
    BASE_PATH = BASE_PATH.parents[2]

# Parquet files for the train / validation / test splits.
DATASETS_PATH = BASE_PATH.joinpath("datalake", "data-for-model")
TRAINING_DATA_FILE = DATASETS_PATH.joinpath("train", "train_sleep_cassette.parquet")
VALIDATION_DATA_FILE = DATASETS_PATH.joinpath("val", "val_sleep_cassette.parquet")
TEST_DATA_FILE = DATASETS_PATH.joinpath("test", "test_sleep_cassette.parquet")

# Stage labels; each label's position in the list is its integer class id.
STAGES = ["W", "N1", "N2", "N3", "REM"]
STAGE2ID = dict(zip(STAGES, range(len(STAGES))))

# Load the three splits in train, validation, test order.
df_train, df_val, df_test = (
    pd.read_parquet(split_file, engine="fastparquet")
    for split_file in (TRAINING_DATA_FILE, VALIDATION_DATA_FILE, TEST_DATA_FILE)
)

## Preparação das tabelas

In [3]:
def set_seed(seed):
    """Seed Python's `random` module and NumPy's global RNG with `seed`."""
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

SEED = 42
set_seed(SEED)

# Numeric encoding for sex; values absent from SEX_MAP (including missing ones) become 0.5.
SEX_MAP = {"F": 0.0, "M": 1.0}
frames = [df_train, df_val, df_test]
for frame in frames:
    # The frames are updated in place, so only encode while the column is still
    # non-numeric. Re-running this cell would otherwise map the already-encoded
    # floats through SEX_MAP, match no key, and fill the whole column with 0.5.
    # NOTE(review): assumes the raw column holds the string codes used as SEX_MAP keys — confirm.
    if not pd.api.types.is_numeric_dtype(frame["sex"]):
        frame["sex"] = frame["sex"].map(SEX_MAP).fillna(0.5).astype(np.float32)
    # Integer class id for each stage label (re-derived from "stage", so safe to repeat).
    frame["stage_id"] = frame["stage"].map(STAGE2ID).astype(np.int64)

# Every column that is not an identifier or a label is a model input, in sorted order.
IDENTIFIERS = ["subject_id", "night_id", "epoch_idx", "stage", "stage_id"]
FEATURES = sorted(column for column in df_train.columns if column not in IDENTIFIERS)

# Standardise features with statistics fitted on the training split only.
scaler = StandardScaler()
x_train = scaler.fit_transform(df_train[FEATURES]).astype(np.float32)
x_val = scaler.transform(df_val[FEATURES]).astype(np.float32)
x_test = scaler.transform(df_test[FEATURES]).astype(np.float32)

y_train = df_train["stage_id"].to_numpy(dtype=np.int64)
y_val = df_val["stage_id"].to_numpy(dtype=np.int64)
y_test = df_test["stage_id"].to_numpy(dtype=np.int64)

# Inverse-frequency class weights from the training split:
# n_samples / (n_classes * class_count), then expanded to one weight per sample.
class_distribution = df_train["stage_id"].value_counts().sort_index()
class_weights = (len(df_train) / (len(STAGES) * class_distribution)).astype(np.float64)
weight_lookup = {idx: float(value) for idx, value in class_weights.items()}
train_weights = np.array([weight_lookup[label] for label in y_train], dtype=np.float64)
val_weights = np.array([weight_lookup[label] for label in y_val], dtype=np.float64)
test_weights = np.array([weight_lookup[label] for label in y_test], dtype=np.float64)

## Distribuição das classes no treino

In [4]:
# Training-split sample count per stage, in STAGES order (0 when a stage is absent).
samples_per_stage = [
    int(class_distribution.get(stage_id, 0)) for stage_id, _ in enumerate(STAGES)
]
class_summary = pd.DataFrame({"stage": STAGES, "samples": samples_per_stage}).assign(
    # Share of the training samples that belongs to each stage.
    proportion=lambda table: table["samples"] / table["samples"].sum()
)
class_summary

Unnamed: 0,stage,samples,proportion
0,W,34935,0.309837
1,N1,13882,0.123119
2,N2,40344,0.357809
3,N3,8532,0.07567
4,REM,15060,0.133566


## Treinamento

In [5]:
# Per-sample class weights used both for fitting and for the evaluation metrics.
train_weights_array = train_weights
val_weights_array = val_weights

# Multiclass LightGBM classifier; the tree budget is an upper bound because
# training is cut short by the early-stopping callback passed to fit().
model = LGBMClassifier(
    objective="multiclass",
    num_class=len(STAGES),
    n_estimators=2200,
    learning_rate=0.045,
    num_leaves=104,
    max_depth=-1,
    min_child_samples=60,
    subsample=0.85,
    subsample_freq=1,
    colsample_bytree=0.7,
    reg_lambda=0.9,
    reg_alpha=0.02,
    random_state=SEED,
    n_jobs=-1)
model.fit(
    x_train,
    y_train,
    sample_weight=train_weights_array,
    eval_set=[(x_train, y_train), (x_val, y_val)],
    eval_sample_weight=[train_weights_array, val_weights_array],
    eval_metric=["multi_logloss", "multi_error"],
    eval_names=["treino", "validacao"],
    callbacks=[early_stopping(stopping_rounds=150), log_evaluation(period=25)]
)
# Fall back to the number of fitted trees whenever no best iteration is recorded.
# `or` covers both None and 0, so the fallback also applies if the attribute is 0.
# NOTE(review): recent LightGBM releases appear to report 0 rather than None when
# early stopping is not configured — confirm for the installed version.
best_iteration = model.best_iteration_ or model.n_estimators_
# Metric curves per evaluation set, keyed by the names given in eval_names.
history = model.evals_result_

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.038097 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14206
[LightGBM] [Info] Number of data points in the train set: 112753, number of used features: 59
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438


Training until validation scores don't improve for 150 rounds


[25]	treino's multi_logloss: 0.599177	treino's multi_error: 0.141129	validacao's multi_logloss: 0.800536	validacao's multi_error: 0.259514


[50]	treino's multi_logloss: 0.390096	treino's multi_error: 0.114391	validacao's multi_logloss: 0.695251	validacao's multi_error: 0.25709


[75]	treino's multi_logloss: 0.302143	treino's multi_error: 0.0932015	validacao's multi_logloss: 0.680949	validacao's multi_error: 0.255268


[100]	treino's multi_logloss: 0.249784	treino's multi_error: 0.0749293	validacao's multi_logloss: 0.685161	validacao's multi_error: 0.255401


[125]	treino's multi_logloss: 0.213439	treino's multi_error: 0.0609763	validacao's multi_logloss: 0.694143	validacao's multi_error: 0.255148


[150]	treino's multi_logloss: 0.185476	treino's multi_error: 0.0490326	validacao's multi_logloss: 0.703847	validacao's multi_error: 0.254687


[175]	treino's multi_logloss: 0.163213	treino's multi_error: 0.0398192	validacao's multi_logloss: 0.716106	validacao's multi_error: 0.255802


[200]	treino's multi_logloss: 0.144608	treino's multi_error: 0.0323148	validacao's multi_logloss: 0.726234	validacao's multi_error: 0.25605


Early stopping, best iteration is:
[74]	treino's multi_logloss: 0.304561	treino's multi_error: 0.0935577	validacao's multi_logloss: 0.680854	validacao's multi_error: 0.255676


## Histórico de treino

In [6]:
# Per-iteration metric curves recorded for the two evaluation sets.
train_curves = history["treino"]
val_curves = history["validacao"]
iterations = len(train_curves["multi_logloss"])
nan_curve = [np.nan] * iterations  # placeholder when multi_error was not recorded

history_columns = {
    "iteration": np.arange(1, iterations + 1),
    "train_logloss": train_curves["multi_logloss"],
    "val_logloss": val_curves["multi_logloss"],
    "train_error": train_curves.get("multi_error", nan_curve),
    "val_error": val_curves.get("multi_error", nan_curve),
}
history_df = pd.DataFrame(history_columns)
history_df

Unnamed: 0,iteration,train_logloss,val_logloss,train_error,val_error
0,1,1.504859,1.522432,0.206981,0.314419
1,2,1.414307,1.444211,0.188396,0.285980
2,3,1.333748,1.375025,0.178288,0.275402
3,4,1.262581,1.316039,0.172792,0.272511
4,5,1.198693,1.264842,0.169627,0.274180
...,...,...,...,...,...
219,220,0.132065,0.735298,0.027264,0.257398
220,221,0.131457,0.736157,0.027021,0.257618
221,222,0.130872,0.736185,0.026776,0.257377
222,223,0.130322,0.736663,0.026540,0.257518


## Avaliação no conjunto de teste

In [7]:
# Predictions restricted to the trees up to the selected best iteration.
val_predictions = model.predict(x_val, num_iteration=best_iteration)
test_predictions = model.predict(x_test, num_iteration=best_iteration)
test_probabilities = model.predict_proba(x_test, num_iteration=best_iteration)

# Pin the label sets explicitly instead of letting scikit-learn infer them from
# the data: the report and confusion matrix then keep one row/column per stage
# (matching STAGES) even when a stage is missing from the test labels and predictions.
stage_ids = list(range(len(STAGES)))

# The probability columns follow model.classes_, so that is the label order for the loss.
test_loss = log_loss(y_test, test_probabilities, labels=model.classes_)
test_accuracy = accuracy_score(y_test, test_predictions)
test_balanced_acc = balanced_accuracy_score(y_test, test_predictions)
test_macro_f1 = f1_score(y_test, test_predictions, average="macro")
summary = pd.DataFrame({
    "metric": ["loss", "accuracy", "balanced_accuracy", "macro_f1"],
    "value": [test_loss, test_accuracy, test_balanced_acc, test_macro_f1]
})
print(summary.to_string(index=False))
print()

# Per-stage precision / recall / F1 plus the macro and weighted averages.
report = classification_report(
    y_test,
    test_predictions,
    labels=stage_ids,
    target_names=STAGES,
    output_dict=True,
)
report_df = pd.DataFrame(report).transpose()
report_table = report_df.loc[STAGES + ["macro avg", "weighted avg"], ["precision", "recall", "f1-score", "support"]]
print(report_table.round(3).to_string())
print()

# Rows are true stages, columns are predicted stages.
confusion = confusion_matrix(y_test, test_predictions, labels=stage_ids)
confusion_df = pd.DataFrame(confusion, index=STAGES, columns=STAGES)
print(confusion_df)







           metric    value
             loss 0.612711
         accuracy 0.766375
balanced_accuracy 0.725628
         macro_f1 0.705784

              precision  recall  f1-score  support
W                 0.911   0.859     0.884  11429.0
N1                0.346   0.469     0.398   3425.0
N2                0.860   0.782     0.819  13722.0
N3                0.687   0.818     0.747   1983.0
REM               0.663   0.701     0.681   5319.0
macro avg         0.693   0.726     0.706  35878.0
weighted avg      0.788   0.766     0.775  35878.0

        W    N1     N2    N3   REM
W    9818  1052    125     8   426
N1    469  1605    624    14   713
N2    114  1421  10724   707   756
N3      3    16    341  1623     0
REM   377   541    663    12  3726


## Análise adicional: importância das features

In [8]:
# Importance score reported by the fitted model for each feature, labelled by feature name.
importances = pd.Series(data=model.feature_importances_, index=FEATURES)
# Rank from most to least important and keep the first 20 entries.
ranked_importances = importances.sort_values(ascending=False)
top_importances = ranked_importances.iloc[:20]
top_importances.to_frame("importance")

Unnamed: 0,importance
age,2202
tso_min,2072
EMG_submental_p90_1hz_roll_max_10,1257
EOG_theta_relpow_256_roll_mean_5,1157
EEG_Fpz_Cz_aperiodic_slope_256_roll_mean_15,1127
EEG_Fpz_Cz_theta_relpow_256_roll_mean_5,1000
EMG_submental_median_1hz,889
EEG_Pz_Oz_delta_relpow_256_roll_mean_15,852
EMG_submental_median_1hz_roll_mean_5,833
EEG_Pz_Oz_aperiodic_slope_256,820
