In [None]:
# ==== 1. Imports ====
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb
import os 


In [None]:
# Uncomment to install the dependency into the kernel's environment if it is missing:
# %pip install xgboost

In [None]:
from pathlib import Path
import pandas as pd

# raiz do projeto = 3 níveis acima do notebook atual
base_dir = Path().resolve().parents[2]

file_path = base_dir / "datalake" / "data-for-model" / "sleep-cassette.parquet"

print("Lendo:", file_path)
df = pd.read_parquet(file_path, engine="fastparquet")
print("Shape:", df.shape)
display(df.head)


In [None]:

# ==== 3. Separar features e target ====
X = df.drop(columns=["stage", "subject_id", "night_id","sex"])  # remove identificadores
y = df["stage"]

display(X.head)



In [None]:
# ==== 4. Encoder no target ====
le = LabelEncoder()
y_enc = le.fit_transform(y)

# ==== 5. Treino / teste ====
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.2, random_state=42, stratify=y_enc
)

# ==== 6. Escalar features ====
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# ==== 7. Modelo XGBoost ====
model = xgb.XGBClassifier(
    objective="multi:softmax",
    num_class=len(le.classes_),  # quantos estágios de sono
    eval_metric="mlogloss",
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)



In [None]:
# ==== 8. Avaliação ====
y_pred = model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# ==== 9. Importância das features ====
import matplotlib.pyplot as plt
xgb.plot_importance(model, max_num_features=20, importance_type="gain")
plt.show()

In [None]:
import numpy as np

# Rank the features by the model's importance scores, highest first.
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

print("Top 20 features mais importantes:")
top_positions = indices[:20]
# Importances are positional, so X.columns supplies the matching names.
for feature_name, score in zip(X.columns[top_positions], importances[top_positions]):
    print(f"{feature_name} -> {score:.4f}")
