Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from typing import Dict, List

# Machine Learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV

# Balanceamento
from imblearn.over_sampling import SMOTE


In [2]:
def add_rolling_features(df, subject_col="subject_id", window=5):
    df_sorted = df.sort_values([subject_col, "epoch"]).copy()
    for col in df.columns:
        if col not in [subject_col, "epoch", "y", "stage"]:
            df_sorted[f"{col}_roll_mean_{window}"] = (
                df_sorted.groupby(subject_col)[col]
                .transform(lambda x: x.rolling(window, min_periods=1).mean())
            )
            df_sorted[f"{col}_roll_max_{window}"] = (
                df_sorted.groupby(subject_col)[col]
                .transform(lambda x: x.rolling(window, min_periods=1).max())
            )
    return df_sorted

def train_random_forest(X_train, y_train, X_test, y_test, top_n=20):
    model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))

    # Importância das features
    feat_importances = pd.Series(model.feature_importances_, index=X_train.columns)
    feat_sorted = feat_importances.sort_values(ascending=False)[:top_n]
    return model, feat_sorted

def gridsearch_random_forest(X, y, param_grid=None, cv=3):
    if param_grid is None:
        param_grid = {
            "n_estimators": [100, 200],
            "max_depth": [10, 20, None],
            "min_samples_split": [2, 5, 10]
        }
    rf = RandomForestClassifier(random_state=42, n_jobs=-1)
    grid = GridSearchCV(rf, param_grid, cv=cv, scoring="f1_weighted", n_jobs=-1, verbose=2)
    grid.fit(X, y)
    print("Melhores parâmetros:", grid.best_params_)
    print("Melhor score (f1_weighted):", grid.best_score_)
    return grid.best_estimator_

def add_rolling_features_sleep(df, subject_col="subject_id", epoch_col="epoch_idx", target_col="stage", window=5):
    df_sorted = df.sort_values(by=[subject_col, epoch_col]).copy()
    
    # Seleciona apenas colunas numéricas (float/int) para rolling
    feature_cols = df_sorted.select_dtypes(include=["float64", "int64"]).columns.tolist()
    feature_cols = [col for col in feature_cols if col not in [epoch_col]]
    
    for col in feature_cols:
        df_sorted[f"{col}_roll_mean"] = df_sorted.groupby(subject_col)[col].transform(
            lambda x: x.rolling(window, min_periods=1).mean()
        )
        df_sorted[f"{col}_roll_std"] = df_sorted.groupby(subject_col)[col].transform(
            lambda x: x.rolling(window, min_periods=1).std()
        )
        df_sorted[f"{col}_roll_max"] = df_sorted.groupby(subject_col)[col].transform(
            lambda x: x.rolling(window, min_periods=1).max()
        )
        
    return df_sorted


Carregamento e pré-processamento

In [3]:
SEED = 42
np.random.seed(SEED)

BASE_DIR = Path.cwd().parents[2]
DATA_DIR = BASE_DIR / "datalake" / "data-for-model"

cassette_file = DATA_DIR / "sleep-cassette.parquet"
df_cassette = pd.read_parquet(cassette_file, engine="fastparquet")

print("Shape:", df_cassette.shape)
display(df_cassette.head())

print(df_cassette.info())
print(df_cassette["stage"].value_counts(normalize=True))
print(df_cassette.describe())

ImportError: Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.

Análise Exploratória

In [4]:
print("Shape:", df_cassette.shape)
print(df_cassette.info())

sns.countplot(x="stage", data=df_cassette)
plt.title("Distribuição das classes")
plt.show()

corr = df_cassette.corr(numeric_only=True)
plt.figure(figsize=(12,6))
sns.heatmap(corr, cmap="coolwarm", center=0)
plt.title("Correlação entre features")
plt.show()

#Outputs de Resultados
def plot_results(model, X_test, y_test, top_n=20):
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))

    cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
    disp.plot(cmap="Blues", xticks_rotation=45)
    plt.title("Matriz de Confusão")
    plt.show()

    # Importância das features
    feat_importances = pd.Series(model.feature_importances_, index=X_test.columns)
    feat_sorted = feat_importances.sort_values(ascending=False)[:top_n]
    print("\nTop Features:")
    print(feat_sorted)
    return feat_sorted

NameError: name 'df_cassette' is not defined

Separação de variáveis

In [11]:
X = df_cassette.drop(columns=["stage", "subject_id", "night_id"], errors="ignore")
y = df_cassette["stage"]

# Converte colunas categóricas (se houver)
if "sex" in X.columns:
    X = pd.get_dummies(X, columns=["sex"], drop_first=True)

# Split treino/teste (20% teste, estratificado)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

print("Treino:", X_train.shape, "Teste:", X_test.shape)


Treino: (28092, 28) Teste: (7024, 28)


Experimentos

Treino simples sem balanceamento
Treino com rolling 10 epochs
Treino com rolling 20 epochs
Treino com rolling 20 epochs + SMOTE

In [14]:
# ======================
# Comparação Rolling 10 vs 20
# ======================

results = {}

for window in [10, 20]:
    print(f"\n====== Rolling Window: {window} epochs (sem SMOTE) ======")
    
    # Criar features com rolling
    df_roll = add_rolling_features_sleep(
        df_cassette, subject_col="subject_id", epoch_col="epoch_idx", window=window
    )

    # Preparar X e y
    X_tmp = df_roll.drop(columns=["stage", "subject_id", "sex", "age"], errors="ignore")
    X_tmp = pd.get_dummies(X_tmp, drop_first=True)
    y_tmp = df_roll["stage"]

    # Tratar NaN
    X_tmp = X_tmp.fillna(0)

    # Split
    X_train, X_test, y_train, y_test = train_test_split(
        X_tmp, y_tmp, test_size=0.2, stratify=y_tmp, random_state=42
    )

    # Treinar modelo
    model, feat_sorted = train_random_forest(X_train, y_train, X_test, y_test)

    # Avaliação
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    results[window] = report

    print(classification_report(y_test, y_pred))
    print("\nTop Features:")
    print(feat_sorted.head(15))

# Resumo comparativo
print("\n===== Resumo Comparativo (sem SMOTE) =====")
for window in [10, 20]:
    print(f"Rolling {window} epochs -> Accuracy: {results[window]['accuracy']:.4f} | "
          f"Macro F1: {results[window]['macro avg']['f1-score']:.4f} | "
          f"Weighted F1: {results[window]['weighted avg']['f1-score']:.4f}")



              precision    recall  f1-score   support

          N1       0.76      0.45      0.57       606
          N2       0.82      0.94      0.88      3393
          N3       0.91      0.80      0.85      1033
         REM       0.88      0.80      0.84      1394
           W       0.82      0.78      0.80       598

    accuracy                           0.84      7024
   macro avg       0.84      0.76      0.79      7024
weighted avg       0.84      0.84      0.83      7024

              precision    recall  f1-score   support

          N1       0.76      0.45      0.57       606
          N2       0.82      0.94      0.88      3393
          N3       0.91      0.80      0.85      1033
         REM       0.88      0.80      0.84      1394
           W       0.82      0.78      0.80       598

    accuracy                           0.84      7024
   macro avg       0.84      0.76      0.79      7024
weighted avg       0.84      0.84      0.83      7024


Top Features:
EEG_Fp

In [15]:
# ======================
# Rolling 20 + SMOTE
# ======================

print("\n====== Rolling Window: 20 epochs (com SMOTE) ======")

# Criar features com rolling
df_roll = add_rolling_features_sleep(
    df_cassette, subject_col="subject_id", epoch_col="epoch_idx", window=20
)

# Preparar X e y
X_tmp = df_roll.drop(columns=["stage", "subject_id", "sex", "age"], errors="ignore")
X_tmp = pd.get_dummies(X_tmp, drop_first=True)
y_tmp = df_roll["stage"]

# Tratar NaN
X_tmp = X_tmp.fillna(0)

# SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_tmp, y_tmp)

print("Distribuição após SMOTE:\n", pd.Series(y_res).value_counts(), "\n")

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.2, stratify=y_res, random_state=42
)

# Treinar modelo
model, feat_sorted = train_random_forest(X_train, y_train, X_test, y_test)

# Avaliação
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

print("\nTop Features:")
print(feat_sorted.head(15))


Distribuição após SMOTE:
 stage
W      16961
N1     16961
N2     16961
N3     16961
REM    16961
Name: count, dtype: int64 

              precision    recall  f1-score   support

          N1       0.92      0.94      0.93      3392
          N2       0.91      0.89      0.90      3392
          N3       0.96      0.97      0.96      3392
         REM       0.96      0.97      0.96      3393
           W       0.97      0.95      0.96      3392

    accuracy                           0.94     16961
   macro avg       0.94      0.94      0.94     16961
weighted avg       0.94      0.94      0.94     16961

              precision    recall  f1-score   support

          N1       0.92      0.94      0.93      3392
          N2       0.91      0.89      0.90      3392
          N3       0.96      0.97      0.96      3392
         REM       0.96      0.97      0.96      3393
           W       0.97      0.95      0.96      3392

    accuracy                           0.94     16961
   ma

Testes com train / val / test

In [6]:
# Datasets já separados
BASE_PATH = Path().resolve().parents[2]
DATASETS_PATH = BASE_PATH / "datalake" / "data-for-model"
TRAINING_DATA_FILE = DATASETS_PATH / "train" / "train_sleep_cassette.parquet" 
VALIDATION_DATA_FILE = DATASETS_PATH / "val" / "val_sleep_cassette.parquet" 
TEST_DATA_FILE = DATASETS_PATH / "test" / "test_sleep_cassette.parquet" 

df_train = pd.read_parquet(TRAINING_DATA_FILE, engine="fastparquet")
df_val = pd.read_parquet(VALIDATION_DATA_FILE, engine="fastparquet")
df_test = pd.read_parquet(TEST_DATA_FILE, engine="fastparquet")

In [7]:
# ======== Preparar dados ========
def prepare_X_y(df):
    X = df.drop(columns=["stage", "subject_id", "sex", "age"], errors="ignore")
    X = pd.get_dummies(X, drop_first=True)
    X = X.fillna(0)
    y = df["stage"]
    return X, y

X_train, y_train = prepare_X_y(df_train)
X_val, y_val = prepare_X_y(df_val)
X_test, y_test = prepare_X_y(df_test)

# Alinhar dummies
X_train, X_val = X_train.align(X_val, join="left", axis=1, fill_value=0)
X_train, X_test = X_train.align(X_test, join="left", axis=1, fill_value=0)

# ======== Remover NaNs do target ========
mask = y_train.notna()
X_train = X_train[mask]
y_train = y_train[mask]

mask_val = y_val.notna()
X_val = X_val[mask_val]
y_val = y_val[mask_val]

mask_test = y_test.notna()
X_test = X_test[mask_test]
y_test = y_test[mask_test]

In [27]:
# ======== SMOTE apenas no treino ========
smote = SMOTE(random_state=1542)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Distribuição após SMOTE (train):\n", pd.Series(y_train_res).value_counts(), "\n")

# ======== Treinar modelo COM SMOTE ========
def train_random_forest(X_train, y_train, X_test, y_test, top_n=20):
    model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    
    feat_importances = pd.Series(model.feature_importances_, index=X_train.columns)
    feat_sorted = feat_importances.sort_values(ascending=False)
    
    return model, feat_sorted

model, feat_sorted = train_random_forest(X_train_res, y_train_res, X_val, y_val)

# ======== Avaliação ========
print("\n===== Avaliação no Validation =====")
y_val_pred = model.predict(X_val)
print(classification_report(y_val, y_val_pred))

print("\n===== Avaliação no Test =====")
y_test_pred = model.predict(X_test)
print(classification_report(y_test, y_test_pred))

print("\nTop Features:")
print(feat_sorted.head(15))


Distribuição após SMOTE (train):
 stage
W      42211
N1     42211
N2     42211
N3     42211
REM    42211
Name: count, dtype: int64 

              precision    recall  f1-score   support

          N1       0.42      0.46      0.44      4527
          N2       0.78      0.83      0.80     14715
          N3       0.80      0.61      0.69      3549
         REM       0.74      0.84      0.79      5257
           W       0.87      0.75      0.80      8167

    accuracy                           0.74     36215
   macro avg       0.72      0.69      0.70     36215
weighted avg       0.75      0.74      0.74     36215


===== Avaliação no Validation =====
              precision    recall  f1-score   support

          N1       0.42      0.46      0.44      4527
          N2       0.78      0.83      0.80     14715
          N3       0.80      0.61      0.69      3549
         REM       0.74      0.84      0.79      5257
           W       0.87      0.75      0.80      8167

    accuracy   

In [8]:
# ======== Treinar modelo SEM SMOTE ========
def train_random_forest(X_train, y_train, X_test, y_test, top_n=20):
    model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    
    feat_importances = pd.Series(model.feature_importances_, index=X_train.columns)
    feat_sorted = feat_importances.sort_values(ascending=False)
    
    return model, feat_sorted

model, feat_sorted = train_random_forest(X_train, y_train, X_val, y_val)

# ======== Avaliação ========
print("\n===== Avaliação no Validation =====")
y_val_pred = model.predict(X_val)
print(classification_report(y_val, y_val_pred))

print("\n===== Avaliação no Test =====")
y_test_pred = model.predict(X_test)
print(classification_report(y_test, y_test_pred))

print("\nTop Features:")
print(feat_sorted.head(15))


              precision    recall  f1-score   support

          N1       0.44      0.31      0.36      4215
          N2       0.80      0.84      0.82     15066
          N3       0.69      0.65      0.67      2524
         REM       0.73      0.74      0.74      5456
           W       0.88      0.92      0.90     15120

    accuracy                           0.79     42381
   macro avg       0.71      0.69      0.70     42381
weighted avg       0.78      0.79      0.78     42381


===== Avaliação no Validation =====
              precision    recall  f1-score   support

          N1       0.44      0.31      0.36      4215
          N2       0.80      0.84      0.82     15066
          N3       0.69      0.65      0.67      2524
         REM       0.73      0.74      0.74      5456
           W       0.88      0.92      0.90     15120

    accuracy                           0.79     42381
   macro avg       0.71      0.69      0.70     42381
weighted avg       0.78      0.79      0

In [9]:
# ======== Salvar modelo de forma compacta ========
import joblib

FINAL_MODELS_PATH = BASE_PATH / "final-models" 
MODEL_FILENAME = FINAL_MODELS_PATH / "random-forest-model.pkl"

explainer_package = {
    "model": model,
    "features": X_train.columns.tolist(),
    "target": y_train.name if hasattr(y_train, "name") else "target"
}

# Salvar com compressão em XZ
joblib.dump(explainer_package, MODEL_FILENAME, compress=("xz", 3))

print(f"Modelo salvo em: {MODEL_FILENAME}")

Modelo salvo em: C:\Python\FIAP\sleep-stages-classification\final-models\random-forest-model.pkl
