# Classificação dos estágios do sono com Naive Bayes

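Este notebook organiza o pipeline de treino, validação e teste de um classificador Gaussian Naive Bayes aplicado aos dados de estágios do sono (W, N1, N2, N3 e REM). O hiperparâmetro `var_smoothing` é escolhido pelo macro-F1 no conjunto de validação, e o modelo selecionado é avaliado no conjunto de teste.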

In [1]:
import random
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report, confusion_matrix, f1_score, log_loss
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

## Configuração dos dados

In [2]:
# Locate the project root: use the current working directory when it holds the
# "datalake" folder, otherwise the nearest ancestor directory that does.
# Searching the ancestors (instead of assuming a fixed depth) keeps the notebook
# runnable from any folder inside the project and fails with a clear message
# when the data folder cannot be found at all.
BASE_PATH = Path().resolve()
if not (BASE_PATH / "datalake").exists():
    _start_dir = BASE_PATH
    BASE_PATH = next(
        (folder for folder in _start_dir.parents if (folder / "datalake").exists()),
        None,
    )
    if BASE_PATH is None:
        raise FileNotFoundError(
            f"No 'datalake' directory found in {_start_dir} or any of its parent directories."
        )

# Parquet files for each split live under datalake/data-for-model/<split>/.
DATASETS_PATH = BASE_PATH / "datalake" / "data-for-model"
TRAINING_DATA_FILE = DATASETS_PATH / "train" / "train_sleep_cassette.parquet"
VALIDATION_DATA_FILE = DATASETS_PATH / "val" / "val_sleep_cassette.parquet"
TEST_DATA_FILE = DATASETS_PATH / "test" / "test_sleep_cassette.parquet"

# Stage labels; the position of each label in this list is its integer class id.
STAGES = ["W", "N1", "N2", "N3", "REM"]
STAGE2ID = {stage: idx for idx, stage in enumerate(STAGES)}

df_train = pd.read_parquet(TRAINING_DATA_FILE, engine="fastparquet")
df_val = pd.read_parquet(VALIDATION_DATA_FILE, engine="fastparquet")
df_test = pd.read_parquet(TEST_DATA_FILE, engine="fastparquet")

## Preparação das tabelas

In [3]:
def set_seed(seed):
    """Seed the global random number generators used in this notebook.

    Applies the same seed to Python's ``random`` module and to NumPy's legacy
    global generator.

    Args:
        seed: Value passed unchanged to ``random.seed`` and ``np.random.seed``.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

SEED = 42
set_seed(SEED)

# Numeric encoding for the "sex" column; values missing from this map become 0.5.
SEX_MAP = {"F": 0.0, "M": 1.0}
frames = [df_train, df_val, df_test]
for frame in frames:
    sex_column = frame["sex"]
    # Encode only while the column still holds the raw labels. The frames are
    # modified in place, so re-running this cell would otherwise push the
    # already-encoded floats through SEX_MAP, match nothing, and collapse every
    # row to the 0.5 fallback.
    if not pd.api.types.is_numeric_dtype(sex_column):
        sex_column = sex_column.map(SEX_MAP).fillna(0.5)
    frame["sex"] = sex_column.astype(np.float32)
    # Integer class id of each epoch, following the order of STAGES.
    frame["stage_id"] = frame["stage"].map(STAGE2ID).astype(np.int64)

# Every column that is not an identifier or a label is used as a model feature,
# in alphabetical order so the feature layout is deterministic.
IDENTIFIERS = ["subject_id", "night_id", "epoch_idx", "stage", "stage_id"]
FEATURES = sorted(column for column in df_train.columns if column not in IDENTIFIERS)

# Standardisation statistics are learned from the training split only and then
# applied unchanged to all three splits.
scaler = StandardScaler().fit(df_train[FEATURES])
x_train, x_val, x_test = (
    scaler.transform(split[FEATURES]).astype(np.float32)
    for split in (df_train, df_val, df_test)
)

# Integer class ids of each split as plain NumPy arrays.
y_train, y_val, y_test = (
    split["stage_id"].to_numpy(dtype=np.int64)
    for split in (df_train, df_val, df_test)
)

# Number of training epochs per class id, ordered by class id.
class_distribution = df_train["stage_id"].value_counts().sort_index()

# Inverse-frequency weights: n_samples / (n_classes * class_count).
class_weights = (len(df_train) / (len(STAGES) * class_distribution)).astype(np.float64)
weight_lookup = {stage_id: float(weight) for stage_id, weight in class_weights.items()}


def _weights_for(labels):
    """Return the training-derived class weight of every label as a float64 array."""
    return np.fromiter(
        (weight_lookup[label] for label in labels),
        dtype=np.float64,
        count=len(labels),
    )


train_weights = _weights_for(y_train)
val_weights = _weights_for(y_val)
test_weights = _weights_for(y_test)

## Distribuição das classes no treino

In [4]:
# Training-set sample count per stage (0 for a stage that never occurs),
# together with each stage's share of the total.
sample_counts = [
    int(class_distribution.get(stage_id, 0)) for stage_id, _ in enumerate(STAGES)
]
class_summary = pd.DataFrame({"stage": STAGES, "samples": sample_counts})
class_summary["proportion"] = class_summary["samples"].div(class_summary["samples"].sum())
class_summary

Unnamed: 0,stage,samples,proportion
0,W,34935,0.309837
1,N1,13882,0.123119
2,N2,40344,0.357809
3,N3,8532,0.07567
4,REM,15060,0.133566


## Treinamento

In [5]:
# Grid search over var_smoothing: 13 log-spaced values from 1e-9 to 1e-3.
smoothing_grid = np.logspace(-9, -3, 13)
history_records = []
best_model, best_score = None, -np.inf
for smoothing in smoothing_grid:
    # Fit on the class-weighted training set, score on the validation set.
    candidate = GaussianNB(var_smoothing=smoothing)
    candidate.fit(x_train, y_train, sample_weight=train_weights)
    val_predictions = candidate.predict(x_val)
    score = f1_score(y_val, val_predictions, average="macro")
    history_records.append(dict(var_smoothing=smoothing, val_macro_f1=score))
    # Only a strictly better score replaces the incumbent, so the first
    # candidate wins ties.
    if not score > best_score:
        continue
    best_model, best_score = candidate, score

# One row per candidate, best validation macro-F1 first.
history_df = pd.DataFrame(history_records)
history_df = history_df.sort_values("val_macro_f1", ascending=False)
history_df = history_df.reset_index(drop=True)
model = best_model

## Histórico de treino

In [6]:
# Display the grid-search results: one row per var_smoothing candidate,
# sorted by validation macro-F1 in descending order.
history_df

Unnamed: 0,var_smoothing,val_macro_f1
0,0.0001,0.595972
1,3.162278e-06,0.595958
2,3.162278e-05,0.595958
3,1e-05,0.595958
4,0.0003162278,0.595953
5,1e-07,0.595939
6,3.162278e-08,0.595939
7,1e-08,0.595939
8,3.162278e-09,0.595939
9,1e-09,0.595939


## Avaliação no conjunto de teste

In [7]:
# Predictions of the selected model on the validation and test splits.
val_predictions = model.predict(x_val)
test_predictions = model.predict(x_test)
test_probabilities = model.predict_proba(x_test)

# Pass the label sets explicitly instead of letting scikit-learn infer them from
# the data: if a stage were absent from the test split, the inferred labels
# would no longer line up with the probability columns or with STAGES.
stage_ids = list(range(len(STAGES)))

# The columns of predict_proba follow model.classes_, so log_loss uses the same order.
test_loss = log_loss(y_test, test_probabilities, labels=model.classes_)
test_accuracy = accuracy_score(y_test, test_predictions)
test_balanced_acc = balanced_accuracy_score(y_test, test_predictions)
test_macro_f1 = f1_score(y_test, test_predictions, average="macro")
summary = pd.DataFrame({
    "metric": ["loss", "accuracy", "balanced_accuracy", "macro_f1"],
    "value": [test_loss, test_accuracy, test_balanced_acc, test_macro_f1]
})
print(summary.to_string(index=False))
print()

# Per-stage precision / recall / F1 plus the macro and weighted averages.
report = classification_report(
    y_test,
    test_predictions,
    labels=stage_ids,
    target_names=STAGES,
    output_dict=True,
)
report_df = pd.DataFrame(report).transpose()
report_table = report_df.loc[STAGES + ["macro avg", "weighted avg"], ["precision", "recall", "f1-score", "support"]]
print(report_table.round(3).to_string())
print()

# Confusion matrix with true stages as rows and predicted stages as columns.
confusion = confusion_matrix(y_test, test_predictions, labels=stage_ids)
confusion_df = pd.DataFrame(confusion, index=STAGES, columns=STAGES)
print(confusion_df)

           metric    value
             loss 5.524137
         accuracy 0.649451
balanced_accuracy 0.639431
         macro_f1 0.581154

              precision  recall  f1-score  support
W                 0.925   0.706     0.801  11429.0
N1                0.237   0.350     0.282   3425.0
N2                0.776   0.664     0.716  13722.0
N3                0.387   0.880     0.537   1983.0
REM               0.544   0.597     0.569   5319.0
macro avg         0.574   0.639     0.581  35878.0
weighted avg      0.716   0.649     0.670  35878.0

        W    N1    N2    N3   REM
W    8073  2076   298    13   969
N1    426  1199   929    52   819
N2     77  1057  9109  2614   865
N3      1     3   230  1745     4
REM   150   734  1173    87  3175
