# Classificacao dos estagios do sono com MLP

Este notebook organiza o pipeline de treino, validacao e teste para um perceptron multicamadas aplicado aos dados de estagios do sono.

In [1]:
import random
import os
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report, confusion_matrix, f1_score, log_loss
from sklearn.preprocessing import StandardScaler
from torch import nn
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler

## Configuracao dos dados

In [2]:
BASE_PATH = Path().resolve()
if not (BASE_PATH / "datalake").exists():
    BASE_PATH = BASE_PATH.parents[2]
DATASETS_PATH = BASE_PATH / "datalake" / "data-for-model"
TRAINING_DATA_FILE = DATASETS_PATH / "train" / "train_sleep_cassette.parquet"
VALIDATION_DATA_FILE = DATASETS_PATH / "val" / "val_sleep_cassette.parquet"
TEST_DATA_FILE = DATASETS_PATH / "test" / "test_sleep_cassette.parquet"
STAGES = ["W", "N1", "N2", "N3", "REM"]
STAGE2ID = {stage: idx for idx, stage in enumerate(STAGES)}
df_train = pd.read_parquet(TRAINING_DATA_FILE, engine="fastparquet")
df_val = pd.read_parquet(VALIDATION_DATA_FILE, engine="fastparquet")
df_test = pd.read_parquet(TEST_DATA_FILE, engine="fastparquet")

## Preparacao das tabelas

In [3]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

SEED = 42
set_seed(SEED)

SEX_MAP = {"F": 0.0, "M": 1.0}
frames = [df_train, df_val, df_test]
for frame in frames:
    frame["sex"] = frame["sex"].map(SEX_MAP).fillna(0.5).astype(np.float32)
    frame["stage_id"] = frame["stage"].map(STAGE2ID).astype(np.int64)

IDENTIFIERS = ["subject_id", "night_id", "epoch_idx", "stage", "stage_id"]
FEATURES = [column for column in df_train.columns if column not in IDENTIFIERS]
FEATURES.sort()

scaler = StandardScaler()
x_train = scaler.fit_transform(df_train[FEATURES]).astype(np.float32)
x_val = scaler.transform(df_val[FEATURES]).astype(np.float32)
x_test = scaler.transform(df_test[FEATURES]).astype(np.float32)

y_train = df_train["stage_id"].to_numpy(dtype=np.int64)
y_val = df_val["stage_id"].to_numpy(dtype=np.int64)
y_test = df_test["stage_id"].to_numpy(dtype=np.int64)

class_distribution = df_train["stage_id"].value_counts().sort_index()
base_weights = (len(df_train) / (len(STAGES) * class_distribution)).astype(np.float64)
adjusted_weights = base_weights ** 1.15
weight_lookup = {idx: float(adjusted_weights.loc[idx]) for idx in class_distribution.index}
train_weights = np.array([weight_lookup[label] for label in y_train], dtype=np.float64)
val_weights = np.array([weight_lookup.get(label, 1.0) for label in y_val], dtype=np.float64)
test_weights = np.array([weight_lookup.get(label, 1.0) for label in y_test], dtype=np.float64)

## Distribuicao das classes no treino

In [4]:
class_summary = pd.DataFrame({
    "stage": STAGES,
    "samples": [int(class_distribution.get(idx, 0)) for idx in range(len(STAGES))]
})
class_summary["proportion"] = class_summary["samples"] / class_summary["samples"].sum()
class_summary

Unnamed: 0,stage,samples,proportion
0,W,34935,0.309837
1,N1,13882,0.123119
2,N2,40344,0.357809
3,N3,8532,0.07567
4,REM,15060,0.133566


## Datasets e carregadores

In [5]:
class SleepDataset(Dataset):
    def __init__(self, features, labels):
        self.features = torch.from_numpy(features)
        self.labels = torch.from_numpy(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        return self.features[index], self.labels[index]

train_dataset = SleepDataset(x_train, y_train)
val_dataset = SleepDataset(x_val, y_val)
test_dataset = SleepDataset(x_test, y_test)

batch_size = 512
num_workers = min(8, os.cpu_count() // 2) if os.cpu_count() else 2
pin_memory = torch.cuda.is_available()
persistent = num_workers > 0
loader_args = {
    "batch_size": batch_size,
    "num_workers": num_workers,
    "pin_memory": pin_memory,
    "persistent_workers": persistent
}
if num_workers > 0:
    loader_args["prefetch_factor"] = 2
train_loader = DataLoader(train_dataset, shuffle=True, **loader_args)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_args)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_args)

## Modelo MLP

In [6]:
class ResidualBlock(nn.Module):
    def __init__(self, dim, expansion, dropout):
        super().__init__()
        hidden_dim = int(dim * expansion)
        self.norm = nn.LayerNorm(dim)
        self.fc1 = nn.Linear(dim, hidden_dim)
        self.activation = nn.GELU()
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_dim, dim)

    def forward(self, inputs):
        residual = inputs
        outputs = self.norm(inputs)
        outputs = self.fc1(outputs)
        outputs = self.activation(outputs)
        outputs = self.dropout(outputs)
        outputs = self.fc2(outputs)
        outputs = self.dropout(outputs)
        return outputs + residual

class SleepMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, depth, expansion, dropout, num_classes):
        super().__init__()
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout)
        ]
        self.stem = nn.Sequential(*layers)
        self.blocks = nn.ModuleList([ResidualBlock(hidden_dim, expansion, dropout) for _ in range(depth)])
        self.head = nn.Sequential(
            nn.LayerNorm(hidden_dim),
            nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, inputs):
        outputs = self.stem(inputs)
        for block in self.blocks:
            outputs = block(outputs)
        return self.head(outputs)

class FocalLoss(nn.Module):
    def __init__(self, gamma=1.3, weight=None):
        super().__init__()
        self.gamma = gamma
        self.register_buffer("class_weight", weight if weight is not None else None)

    def forward(self, logits, targets):
        ce = nn.functional.cross_entropy(logits, targets, weight=self.class_weight, reduction="none")
        probabilities = nn.functional.softmax(logits, dim=-1)
        pt = probabilities.gather(1, targets.unsqueeze(1)).squeeze(1)
        loss = ((1.0 - pt) ** self.gamma) * ce
        return loss.mean()

## Treinamento

In [7]:
loss_weights = torch.tensor([weight_lookup[idx] for idx in range(len(STAGES))], dtype=torch.float32)
loss_weights = loss_weights / loss_weights.mean()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SleepMLP(len(FEATURES), 384, 4, 1.4, 0.2, len(STAGES)).to(device)
criterion = FocalLoss(gamma=1.15, weight=loss_weights.to(device))
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=5e-4)
epochs = 20
steps_per_epoch = len(train_loader)
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=1.5e-3, epochs=epochs, steps_per_epoch=steps_per_epoch, pct_start=0.35, div_factor=10.0, final_div_factor=30.0)
scaler = torch.cuda.amp.GradScaler(enabled=device.type == "cuda")

def run_epoch(model, loader, criterion, device, optimizer=None, scaler=None, scheduler=None, grad_clip=None):
    is_train = optimizer is not None
    model.train() if is_train else model.eval()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    all_targets = []
    all_predictions = []
    all_logits = []
    autocast_enabled = scaler is not None and scaler.is_enabled()
    with torch.set_grad_enabled(is_train):
        for features, targets in loader:
            features = features.to(device)
            targets = targets.to(device)
            if is_train:
                optimizer.zero_grad()
            with torch.cuda.amp.autocast(enabled=autocast_enabled):
                logits = model(features)
                loss = criterion(logits, targets)
            if is_train:
                if scaler is not None and scaler.is_enabled():
                    scaler.scale(loss).backward()
                    scaler.unscale_(optimizer)
                    if grad_clip is not None:
                        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    loss.backward()
                    if grad_clip is not None:
                        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
                    optimizer.step()
                if scheduler is not None:
                    scheduler.step()
            total_loss += loss.item() * features.size(0)
            total_samples += features.size(0)
            predictions = logits.detach().argmax(dim=1)
            total_correct += (predictions == targets).sum().item()
            all_targets.append(targets.detach().cpu())
            all_predictions.append(predictions.cpu())
            all_logits.append(logits.detach().cpu())
    epoch_loss = total_loss / max(total_samples, 1)
    epoch_acc = total_correct / max(total_samples, 1)
    targets_array = torch.cat(all_targets).numpy()
    predictions_array = torch.cat(all_predictions).numpy()
    logits_array = torch.cat(all_logits).numpy()
    return epoch_loss, epoch_acc, targets_array, predictions_array, logits_array

history = []
best_state = None
best_metric = -np.inf
patience = 4
wait = 0
for epoch in range(1, epochs + 1):
    train_loss, train_acc, train_targets, train_preds, train_logits = run_epoch(model, train_loader, criterion, device, optimizer, scaler, scheduler, grad_clip=1.0)
    train_balanced_acc = balanced_accuracy_score(train_targets, train_preds)
    train_macro_f1 = f1_score(train_targets, train_preds, average="macro")
    val_loss, val_acc, val_targets, val_preds, val_logits = run_epoch(model, val_loader, criterion, device)
    val_balanced_acc = balanced_accuracy_score(val_targets, val_preds)
    val_macro_f1 = f1_score(val_targets, val_preds, average="macro")
    history.append({
        "epoch": epoch,
        "train_loss": train_loss,
        "train_acc": train_acc,
        "train_balanced_acc": train_balanced_acc,
        "train_macro_f1": train_macro_f1,
        "val_loss": val_loss,
        "val_acc": val_acc,
        "val_balanced_acc": val_balanced_acc,
        "val_macro_f1": val_macro_f1
    })
    print(f"epoch {epoch:02d} | train loss {train_loss:.4f} acc {train_acc:.4f} bal_acc {train_balanced_acc:.4f} f1 {train_macro_f1:.4f} | val loss {val_loss:.4f} acc {val_acc:.4f} bal_acc {val_balanced_acc:.4f} f1 {val_macro_f1:.4f}")
    if val_macro_f1 > best_metric:
        best_metric = val_macro_f1
        best_state = {key: value.cpu() for key, value in model.state_dict().items()}
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            break

if best_state is not None:
    model.load_state_dict(best_state)
model.to(device)

  scaler = torch.cuda.amp.GradScaler(enabled=device.type == "cuda")


  with torch.cuda.amp.autocast(enabled=autocast_enabled):


  with torch.cuda.amp.autocast(enabled=autocast_enabled):


epoch 01 | train loss 0.2506 acc 0.7332 bal_acc 0.7602 f1 0.7076 | val loss 0.3192 acc 0.6910 bal_acc 0.7166 f1 0.6431


  with torch.cuda.amp.autocast(enabled=autocast_enabled):


  with torch.cuda.amp.autocast(enabled=autocast_enabled):


epoch 02 | train loss 0.2015 acc 0.7701 bal_acc 0.7977 f1 0.7465 | val loss 0.3043 acc 0.7101 bal_acc 0.7329 f1 0.6579


  with torch.cuda.amp.autocast(enabled=autocast_enabled):


  with torch.cuda.amp.autocast(enabled=autocast_enabled):


epoch 03 | train loss 0.1856 acc 0.7841 bal_acc 0.8115 f1 0.7609 | val loss 0.3279 acc 0.7503 bal_acc 0.7190 f1 0.6793


  with torch.cuda.amp.autocast(enabled=autocast_enabled):


  with torch.cuda.amp.autocast(enabled=autocast_enabled):


epoch 04 | train loss 0.1752 acc 0.7911 bal_acc 0.8188 f1 0.7687 | val loss 0.3338 acc 0.7207 bal_acc 0.7215 f1 0.6677


  with torch.cuda.amp.autocast(enabled=autocast_enabled):


  with torch.cuda.amp.autocast(enabled=autocast_enabled):


epoch 05 | train loss 0.1683 acc 0.7951 bal_acc 0.8231 f1 0.7729 | val loss 0.3419 acc 0.6919 bal_acc 0.7242 f1 0.6566


  with torch.cuda.amp.autocast(enabled=autocast_enabled):


  with torch.cuda.amp.autocast(enabled=autocast_enabled):


epoch 06 | train loss 0.1643 acc 0.7978 bal_acc 0.8261 f1 0.7758 | val loss 0.3623 acc 0.6902 bal_acc 0.7047 f1 0.6398


  with torch.cuda.amp.autocast(enabled=autocast_enabled):


  with torch.cuda.amp.autocast(enabled=autocast_enabled):


epoch 07 | train loss 0.1549 acc 0.8062 bal_acc 0.8345 f1 0.7847 | val loss 0.3482 acc 0.6962 bal_acc 0.7266 f1 0.6468


SleepMLP(
  (stem): Sequential(
    (0): Linear(in_features=59, out_features=384, bias=True)
    (1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
    (2): GELU(approximate='none')
    (3): Dropout(p=0.2, inplace=False)
  )
  (blocks): ModuleList(
    (0-3): 4 x ResidualBlock(
      (norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      (fc1): Linear(in_features=384, out_features=537, bias=True)
      (activation): GELU(approximate='none')
      (dropout): Dropout(p=0.2, inplace=False)
      (fc2): Linear(in_features=537, out_features=384, bias=True)
    )
  )
  (head): Sequential(
    (0): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
    (1): Linear(in_features=384, out_features=5, bias=True)
  )
)

## Historico de treino

In [8]:
history_df = pd.DataFrame(history)
history_df

Unnamed: 0,epoch,train_loss,train_acc,train_balanced_acc,train_macro_f1,val_loss,val_acc,val_balanced_acc,val_macro_f1
0,1,0.250568,0.733222,0.760248,0.707642,0.319157,0.691017,0.716554,0.643065
1,2,0.20153,0.77009,0.797681,0.74646,0.304324,0.71013,0.73288,0.657914
2,3,0.185633,0.784103,0.811519,0.760885,0.327886,0.750313,0.71899,0.679277
3,4,0.175196,0.79111,0.81879,0.768702,0.333819,0.720653,0.721538,0.667697
4,5,0.1683,0.795136,0.823143,0.772944,0.341932,0.691867,0.724167,0.656607
5,6,0.164329,0.797841,0.826071,0.77578,0.362266,0.690239,0.704701,0.639781
6,7,0.15486,0.806205,0.834469,0.784663,0.348228,0.696208,0.726612,0.646822


## Avaliacao no conjunto de teste

In [9]:
test_loss, test_acc, test_targets, test_preds, test_logits = run_epoch(model, test_loader, criterion, device)
test_probabilities = torch.softmax(torch.from_numpy(test_logits), dim=1).numpy()
test_logloss = log_loss(test_targets, test_probabilities)
test_balanced_acc = balanced_accuracy_score(test_targets, test_preds)
test_macro_f1 = f1_score(test_targets, test_preds, average="macro")
test_accuracy = accuracy_score(test_targets, test_preds)
summary = pd.DataFrame({
    "metric": ["loss", "accuracy", "balanced_accuracy", "macro_f1"],
    "value": [test_logloss, test_accuracy, test_balanced_acc, test_macro_f1]
})
print(summary.to_string(index=False))
print()
report = classification_report(test_targets, test_preds, target_names=STAGES, output_dict=True)
report_df = pd.DataFrame(report).transpose()
report_table = report_df.loc[STAGES + ["macro avg", "weighted avg"], ["precision", "recall", "f1-score", "support"]]
print(report_table.round(3).to_string())
print()
confusion = confusion_matrix(test_targets, test_preds)
confusion_df = pd.DataFrame(confusion, index=STAGES, columns=STAGES)
print(confusion_df)

  with torch.cuda.amp.autocast(enabled=autocast_enabled):


           metric    value
             loss 0.694315
         accuracy 0.731841
balanced_accuracy 0.729671
         macro_f1 0.678694

              precision  recall  f1-score  support
W                 0.910   0.886     0.898  11429.0
N1                0.307   0.545     0.393   3425.0
N2                0.891   0.634     0.741  13722.0
N3                0.544   0.857     0.666   1983.0
REM               0.669   0.727     0.697   5319.0
macro avg         0.664   0.730     0.679  35878.0
weighted avg      0.789   0.732     0.747  35878.0

         W    N1    N2    N3   REM
W    10128   880    75    19   327
N1     591  1866   316    35   617
N2      71  2646  8699  1336   970
N3       0    54   230  1699     0
REM    340   635   447    32  3865
