# Classificacao dos estagios do sono com Random Forest

Este notebook organiza o pipeline de treino, validacao e teste para o modelo aplicado aos dados de estagios do sono.

In [1]:
import random
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report, confusion_matrix, f1_score, log_loss
from sklearn.preprocessing import StandardScaler

## Configuracao dos dados

In [2]:
BASE_PATH = Path().resolve()
if not (BASE_PATH / "datalake").exists():
    BASE_PATH = BASE_PATH.parents[2]
DATASETS_PATH = BASE_PATH / "datalake" / "data-for-model"
TRAINING_DATA_FILE = DATASETS_PATH / "train" / "train_sleep_cassette.parquet"
VALIDATION_DATA_FILE = DATASETS_PATH / "val" / "val_sleep_cassette.parquet"
TEST_DATA_FILE = DATASETS_PATH / "test" / "test_sleep_cassette.parquet"
STAGES = ["W", "N1", "N2", "N3", "REM"]
STAGE2ID = {stage: idx for idx, stage in enumerate(STAGES)}
df_train = pd.read_parquet(TRAINING_DATA_FILE, engine="fastparquet")
df_val = pd.read_parquet(VALIDATION_DATA_FILE, engine="fastparquet")
df_test = pd.read_parquet(TEST_DATA_FILE, engine="fastparquet")

## Preparacao das tabelas

In [3]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)

SEED = 42
set_seed(SEED)

SEX_MAP = {"F": 0.0, "M": 1.0}
frames = [df_train, df_val, df_test]
for frame in frames:
    frame["sex"] = frame["sex"].map(SEX_MAP).fillna(0.5).astype(np.float32)
    frame["stage_id"] = frame["stage"].map(STAGE2ID).astype(np.int64)

IDENTIFIERS = ["subject_id", "night_id", "epoch_idx", "stage", "stage_id"]
FEATURES = [column for column in df_train.columns if column not in IDENTIFIERS]
FEATURES.sort()

scaler = StandardScaler()
x_train = scaler.fit_transform(df_train[FEATURES]).astype(np.float32)
x_val = scaler.transform(df_val[FEATURES]).astype(np.float32)
x_test = scaler.transform(df_test[FEATURES]).astype(np.float32)

y_train = df_train["stage_id"].to_numpy(dtype=np.int64)
y_val = df_val["stage_id"].to_numpy(dtype=np.int64)
y_test = df_test["stage_id"].to_numpy(dtype=np.int64)

class_distribution = df_train["stage_id"].value_counts().sort_index()
class_weights = (len(df_train) / (len(STAGES) * class_distribution)).astype(np.float64)
weight_lookup = {idx: float(value) for idx, value in class_weights.items()}
train_weights = np.array([weight_lookup[label] for label in y_train], dtype=np.float64)
val_weights = np.array([weight_lookup[label] for label in y_val], dtype=np.float64)
test_weights = np.array([weight_lookup[label] for label in y_test], dtype=np.float64)

## Distribuicao das classes no treino

In [4]:
class_summary = pd.DataFrame({
    "stage": STAGES,
    "samples": [int(class_distribution.get(idx, 0)) for idx in range(len(STAGES))]
})
class_summary["proportion"] = class_summary["samples"] / class_summary["samples"].sum()
class_summary

Unnamed: 0,stage,samples,proportion
0,W,34935,0.309837
1,N1,13882,0.123119
2,N2,40344,0.357809
3,N3,8532,0.07567
4,REM,15060,0.133566


## Treinamento

In [5]:
os.environ.setdefault("OMP_NUM_THREADS", "1")
search_space = [
    {"n_estimators": 240, "max_depth": 22, "min_samples_split": 6, "min_samples_leaf": 2, "max_features": "sqrt", "max_samples": 0.8},
    {"n_estimators": 300, "max_depth": 26, "min_samples_split": 5, "min_samples_leaf": 2, "max_features": "sqrt", "max_samples": 0.85},
    {"n_estimators": 200, "max_depth": 20, "min_samples_split": 8, "min_samples_leaf": 3, "max_features": "sqrt", "max_samples": 0.75}
]
history_records = []
best_score = -np.inf
best_model = None
for idx, candidate_params in enumerate(search_space, start=1):
    print(f"treinando combinacao {idx}/{len(search_space)}: {candidate_params}")
    model = RandomForestClassifier(
        n_estimators=candidate_params["n_estimators"],
        max_depth=candidate_params["max_depth"],
        min_samples_split=candidate_params["min_samples_split"],
        min_samples_leaf=candidate_params["min_samples_leaf"],
        max_features=candidate_params["max_features"],
        max_samples=candidate_params["max_samples"],
        class_weight="balanced_subsample",
        bootstrap=True,
        n_jobs=8,
        random_state=SEED
    )
    model.fit(x_train, y_train, sample_weight=train_weights)
    val_predictions = model.predict(x_val)
    score = f1_score(y_val, val_predictions, average="macro")
    record = dict(candidate_params)
    record["val_macro_f1"] = score
    history_records.append(record)
    if score > best_score:
        best_score = score
        best_model = model
history_df = pd.DataFrame(history_records).sort_values("val_macro_f1", ascending=False).reset_index(drop=True)
model = best_model

treinando combinacao 1/3: {'n_estimators': 240, 'max_depth': 22, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_samples': 0.8}


treinando combinacao 2/3: {'n_estimators': 300, 'max_depth': 26, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_samples': 0.85}


treinando combinacao 3/3: {'n_estimators': 200, 'max_depth': 20, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'max_samples': 0.75}


## Historico de treino

In [6]:
history_df

Unnamed: 0,n_estimators,max_depth,min_samples_split,min_samples_leaf,max_features,max_samples,val_macro_f1
0,300,26,5,2,sqrt,0.85,0.693351
1,240,22,6,2,sqrt,0.8,0.686414
2,200,20,8,3,sqrt,0.75,0.677568


## Avaliacao no conjunto de teste

In [7]:
val_predictions = model.predict(x_val)
test_predictions = model.predict(x_test)
test_probabilities = model.predict_proba(x_test)
test_loss = log_loss(y_test, test_probabilities)
test_accuracy = accuracy_score(y_test, test_predictions)
test_balanced_acc = balanced_accuracy_score(y_test, test_predictions)
test_macro_f1 = f1_score(y_test, test_predictions, average="macro")
summary = pd.DataFrame({
    "metric": ["loss", "accuracy", "balanced_accuracy", "macro_f1"],
    "value": [test_loss, test_accuracy, test_balanced_acc, test_macro_f1]
})
print(summary.to_string(index=False))
print()
report = classification_report(y_test, test_predictions, target_names=STAGES, output_dict=True)
report_df = pd.DataFrame(report).transpose()
report_table = report_df.loc[STAGES + ["macro avg", "weighted avg"], ["precision", "recall", "f1-score", "support"]]
print(report_table.round(3).to_string())
print()
confusion = confusion_matrix(y_test, test_predictions)
confusion_df = pd.DataFrame(confusion, index=STAGES, columns=STAGES)
print(confusion_df)

           metric    value
             loss 0.602668
         accuracy 0.777747
balanced_accuracy 0.691420
         macro_f1 0.700113

              precision  recall  f1-score  support
W                 0.882   0.901     0.891  11429.0
N1                0.369   0.362     0.365   3425.0
N2                0.814   0.840     0.826  13722.0
N3                0.782   0.707     0.742   1983.0
REM               0.704   0.648     0.675   5319.0
macro avg         0.710   0.691     0.700  35878.0
weighted avg      0.775   0.778     0.776  35878.0

         W    N1     N2    N3   REM
W    10295   620    228     5   281
N1     730  1240    870     4   581
N2     184  1055  11520   377   586
N3       4     4    574  1401     0
REM    455   442    969     5  3448


## Analise adicional

In [8]:
importances = pd.Series(model.feature_importances_, index=FEATURES)
top_importances = importances.sort_values(ascending=False).head(20)
top_importances.to_frame(name="importance")

Unnamed: 0,importance
EEG_Fpz_Cz_beta_relpow_256,0.056839
EEG_Pz_Oz_beta_relpow_256_roll_mean_5,0.056712
EEG_Pz_Oz_beta_relpow_256_roll_mean_10,0.053626
EEG_Pz_Oz_beta_relpow_256,0.048923
EEG_Pz_Oz_beta_relpow_256_roll_std_10,0.043065
EEG_Pz_Oz_aperiodic_slope_256,0.031465
EMG_submental_median_1hz,0.026149
EEG_Fpz_Cz_aperiodic_slope_256,0.025837
EEG_Fpz_Cz_aperiodic_slope_256_roll_mean_15,0.025354
EMG_submental_median_1hz_roll_mean_5,0.025246
