In [6]:
import os, math, json, random
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict, List, Tuple

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score, f1_score, accuracy_score

In [7]:
BASE_PATH = Path().resolve().parents[2]
DATASETS_PATH = BASE_PATH / "datalake" / "data-for-model"
TRAINING_DATA_FILE = DATASETS_PATH / "train" / "train_sleep_cassette.parquet" 
VALIDATION_DATA_FILE = DATASETS_PATH / "val" / "val_sleep_cassette.parquet" 
TEST_DATA_FILE = DATASETS_PATH / "test" / "test_sleep_cassette.parquet" 

STAGES = ["W", "N1", "N2", "N3", "REM"]
STAGE2ID = {s:i for i, s in enumerate(STAGES)}

In [8]:
df_train = pd.read_parquet(TRAINING_DATA_FILE, engine="fastparquet")
df_val = pd.read_parquet(VALIDATION_DATA_FILE, engine="fastparquet")
df_test = pd.read_parquet(TEST_DATA_FILE, engine="fastparquet")

In [9]:
import os
import math
import random
import json
from pathlib import Path
from typing import List, Tuple, Dict

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, classification_report, confusion_matrix
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

# =========================
# 0) Configs e utilidades
# =========================

CONFIG = {
    "train_path": f"{TRAINING_DATA_FILE}",  # mude para .csv se for o caso
    "val_path": f"{VALIDATION_DATA_FILE}",
    "test_path": f"{TEST_DATA_FILE}",
    # Colunas
    "target_col": "stage",
    "id_cols": ["subject_id", "night_id"],  # não entram como features
    "drop_cols": ["age", "sex"],            # você disse que não quer usar
    # Hiperparâmetros
    "batch_size": 512,
    "epochs": 40,
    "patience": 6,                  # early stopping (macro-F1)
    "lr": 2e-3,
    "weight_decay": 1e-4,
    "hidden_sizes": [256, 256, 128],
    "dropout": 0.2,
    "use_sampler": True,            # WeightedRandomSampler no treino
    "use_class_weight": True,       # pesos na loss
    "num_workers": 2,
    "seed": 42,
    "save_dir": "artifacts",
}

CLASSES = ["W", "N1", "N2", "N3", "REM"]
CLASS_TO_IDX = {c: i for i, c in enumerate(CLASSES)}

def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

def load_table(path: str) -> pd.DataFrame:
    ext = Path(path).suffix.lower()
    if ext == ".parquet":
        return pd.read_parquet(path)
    elif ext == ".csv":
        return pd.read_csv(path)
    else:
        raise ValueError(f"Unsupported file extension: {ext}")

# ==================================================
# 1) Dataset e normalização (fit no treino apenas)
# ==================================================

class TabularDataset(Dataset):
    def __init__(self, df: pd.DataFrame, feature_cols: List[str], target_col: str):
        self.X = df[feature_cols].to_numpy(dtype=np.float32)
        self.y = df[target_col].map(CLASS_TO_IDX).to_numpy(dtype=np.int64)
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

def prepare_data(cfg: Dict) -> Tuple[TabularDataset, TabularDataset, TabularDataset, List[str], StandardScaler, np.ndarray]:
    # Carrega
    train_df = load_table(cfg["train_path"])
    val_df   = load_table(cfg["val_path"])
    test_df  = load_table(cfg["test_path"])

    # Sanity: remove duplicadas de nomes que você listou (ocorreu no prompt)
    train_df = train_df.loc[:, ~train_df.columns.duplicated()]
    val_df   = val_df.loc[:, ~val_df.columns.duplicated()]
    test_df  = test_df.loc[:, ~test_df.columns.duplicated()]

    # Remove colunas que não vão nas features
    cols_to_drop = set(cfg["drop_cols"] + [cfg["target_col"]] + cfg["id_cols"])
    all_cols = [c for c in train_df.columns if c not in cols_to_drop]

    # Garante que as colunas de features existem nas três partições
    feature_cols = [c for c in all_cols if c in val_df.columns and c in test_df.columns]

    # Checagem de nulos (você disse que não há, mas assert explícito ajuda)
    assert not train_df[feature_cols + [cfg["target_col"]]].isnull().any().any()
    assert not val_df[feature_cols + [cfg["target_col"]]].isnull().any().any()
    assert not test_df[feature_cols + [cfg["target_col"]]].isnull().any().any()

    # Escalador: fit no treino, transform nos demais
    scaler = StandardScaler()
    train_df[feature_cols] = scaler.fit_transform(train_df[feature_cols].to_numpy(dtype=np.float32))
    val_df[feature_cols]   = scaler.transform(val_df[feature_cols].to_numpy(dtype=np.float32))
    test_df[feature_cols]  = scaler.transform(test_df[feature_cols].to_numpy(dtype=np.float32))

    # Datasets
    ds_train = TabularDataset(train_df, feature_cols, cfg["target_col"])
    ds_val   = TabularDataset(val_df, feature_cols, cfg["target_col"])
    ds_test  = TabularDataset(test_df, feature_cols, cfg["target_col"])

    # Pesos por classe (frequência do treino)
    counts = np.bincount(ds_train.y, minlength=len(CLASSES)).astype(np.float64)
    class_weights = (counts.sum() / (counts + 1e-8))  # inverso da frequência
    class_weights = class_weights / class_weights.mean()  # normaliza

    return ds_train, ds_val, ds_test, feature_cols, scaler, class_weights

# ==================================
# 2) MLP para tabular (BN + SiLU)
# ==================================

class Block(nn.Module):
    def __init__(self, in_f, out_f, p_drop=0.2):
        super().__init__()
        self.lin = nn.Linear(in_f, out_f)
        self.bn  = nn.BatchNorm1d(out_f)
        self.act = nn.SiLU()
        self.do  = nn.Dropout(p_drop)
    def forward(self, x):
        x = self.lin(x)
        x = self.bn(x)
        x = self.act(x)
        x = self.do(x)
        return x

class TabMLP(nn.Module):
    def __init__(self, in_features: int, hidden: List[int], num_classes: int, p_drop: float):
        super().__init__()
        layers = []
        prev = in_features
        for h in hidden:
            layers.append(Block(prev, h, p_drop))
            prev = h
        self.backbone = nn.Sequential(*layers)
        self.head = nn.Linear(prev, num_classes)
    def forward(self, x):
        x = self.backbone(x)
        return self.head(x)

# ======================================
# 3) Treino, validação, inferência
# ======================================

def make_loader(ds: Dataset, cfg: Dict, class_weights: np.ndarray = None, train: bool = False):
    if train and cfg["use_sampler"]:
        # probabilidade ~ inverso da frequência
        sample_weights = class_weights[ds.y]
        sampler = WeightedRandomSampler(
            weights=torch.as_tensor(sample_weights, dtype=torch.double),
            num_samples=len(ds),
            replacement=True
        )
        return DataLoader(ds, batch_size=cfg["batch_size"], sampler=sampler,
                          num_workers=cfg["num_workers"], pin_memory=True)
    else:
        shuffle = train and not cfg["use_sampler"]
        return DataLoader(ds, batch_size=cfg["batch_size"], shuffle=shuffle,
                          num_workers=cfg["num_workers"], pin_memory=True)

def evaluate(model, loader, device):
    model.eval()
    ys, yps = [], []
    with torch.no_grad():
        for xb, yb in loader:
            xb = xb.to(device, non_blocking=True)
            yb = yb.to(device, non_blocking=True)
            logits = model(xb)
            y_hat = logits.argmax(1)
            ys.append(yb.cpu().numpy())
            yps.append(y_hat.cpu().numpy())
    y_true = np.concatenate(ys)
    y_pred = np.concatenate(yps)
    macro = f1_score(y_true, y_pred, average="macro")
    return macro, y_true, y_pred

def train(cfg: Dict):
    os.makedirs(cfg["save_dir"], exist_ok=True)
    set_seed(cfg["seed"])

    ds_train, ds_val, ds_test, feature_cols, scaler, class_weights = prepare_data(cfg)
    input_dim = len(feature_cols)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = TabMLP(
        in_features=input_dim,
        hidden=cfg["hidden_sizes"],
        num_classes=len(CLASSES),
        p_drop=cfg["dropout"]
    ).to(device)

    # Loss com pesos por classe
    if cfg["use_class_weight"]:
        weight_t = torch.tensor(class_weights, dtype=torch.float32, device=device)
        criterion = nn.CrossEntropyLoss(weight=weight_t)
    else:
        criterion = nn.CrossEntropyLoss()

    optimizer = torch.optim.AdamW(model.parameters(), lr=cfg["lr"], weight_decay=cfg["weight_decay"])

    # LRs com OneCycle
    steps_per_epoch = math.ceil(len(ds_train) / cfg["batch_size"])
    total_steps = steps_per_epoch * cfg["epochs"]
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer, max_lr=cfg["lr"], total_steps=total_steps, pct_start=0.15, div_factor=10.0, final_div_factor=1e3
    )

    train_loader = make_loader(ds_train, cfg, class_weights, train=True)
    val_loader   = make_loader(ds_val, cfg, class_weights, train=False)

    scaler_amp = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())
    best_f1, best_epoch, wait = -1.0, -1, 0
    best_path = Path(cfg["save_dir"]) / "best_mlp.pt"

    for epoch in range(cfg["epochs"]):
        model.train()
        running_loss = 0.0
        for xb, yb in train_loader:
            xb = xb.to(device, non_blocking=True)
            yb = yb.to(device, non_blocking=True)

            optimizer.zero_grad(set_to_none=True)
            with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
                logits = model(xb)
                loss = criterion(logits, yb)
            scaler_amp.scale(loss).backward()
            scaler_amp.step(optimizer)
            scaler_amp.update()
            scheduler.step()

            running_loss += loss.item() * xb.size(0)

        train_loss = running_loss / len(ds_train)
        val_f1, _, _ = evaluate(model, val_loader, device)

        print(f"[{epoch+1}/{cfg['epochs']}] train_loss={train_loss:.4f} val_macroF1={val_f1:.4f}")

        # early stopping por macro-F1
        if val_f1 > best_f1 + 1e-5:
            best_f1, best_epoch, wait = val_f1, epoch, 0
            torch.save({"model": model.state_dict(),
                        "feature_cols": feature_cols,
                        "scaler_mean": scaler.mean_.tolist(),
                        "scaler_scale": scaler.scale_.tolist()}, best_path)
        else:
            wait += 1
            if wait >= cfg["patience"]:
                print(f"Early stopping at epoch {epoch+1}. Best @ {best_epoch+1} (macro-F1={best_f1:.4f})")
                break

    # Carrega melhor e avalia no teste
    ckpt = torch.load(best_path, map_location=device)
    model.load_state_dict(ckpt["model"])
    # reconstroi scaler (para export)
    saved_scaler = StandardScaler()
    saved_scaler.mean_ = np.array(ckpt["scaler_mean"])
    saved_scaler.scale_ = np.array(ckpt["scaler_scale"])
    saved_scaler.var_ = saved_scaler.scale_ ** 2

    test_loader = make_loader(ds_test, CONFIG, train=False)
    test_f1, y_true, y_pred = evaluate(model, test_loader, device)
    print(f"TEST macro-F1 = {test_f1:.4f}")
    print(classification_report(y_true, y_pred, target_names=CLASSES, digits=4))
    print("Confusion matrix:")
    print(confusion_matrix(y_true, y_pred))

    # Salva relatório
    with open(Path(cfg["save_dir"]) / "test_report.json", "w") as f:
        json.dump({
            "macro_f1": float(test_f1),
            "report": classification_report(y_true, y_pred, target_names=CLASSES, digits=4, output_dict=True),
            "confusion_matrix": confusion_matrix(y_true, y_pred).tolist()
        }, f, indent=2)

if __name__ == "__main__":
    train(CONFIG)


  scaler_amp = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())
  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):


[1/40] train_loss=0.6255 val_macroF1=0.5775


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):


[2/40] train_loss=0.4357 val_macroF1=0.6082


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):


[3/40] train_loss=0.3972 val_macroF1=0.6070


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):


[4/40] train_loss=0.3706 val_macroF1=0.6213


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):


[5/40] train_loss=0.3612 val_macroF1=0.6098


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):


[6/40] train_loss=0.3520 val_macroF1=0.6438


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):


[7/40] train_loss=0.3367 val_macroF1=0.6139


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):


[8/40] train_loss=0.3284 val_macroF1=0.6305


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):


[9/40] train_loss=0.3143 val_macroF1=0.6283


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):


[10/40] train_loss=0.3118 val_macroF1=0.6244


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):


[11/40] train_loss=0.3057 val_macroF1=0.6263


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):


[12/40] train_loss=0.2980 val_macroF1=0.6172
Early stopping at epoch 12. Best @ 6 (macro-F1=0.6438)




TEST macro-F1 = 0.6711
              precision    recall  f1-score   support

           W     0.9529    0.8166    0.8795     11429
          N1     0.2946    0.6301    0.4015      3425
          N2     0.8949    0.6043    0.7214     13722
          N3     0.4826    0.8926    0.6264      1983
         REM     0.6951    0.7612    0.7267      5319

    accuracy                         0.7136     35878
   macro avg     0.6640    0.7410    0.6711     35878
weighted avg     0.8037    0.7136    0.7368     35878

Confusion matrix:
[[9333 1668   68   19  341]
 [ 284 2158  341   42  600]
 [  41 2749 8292 1806  834]
 [   0   47  165 1770    1]
 [ 136  703  400   31 4049]]
