In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score, balanced_accuracy_score, ConfusionMatrixDisplay, recall_score, precision_score
from sklearn.model_selection import train_test_split
import seaborn as sns
import os

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_row', None)

In [3]:
!pwd

/home/renatagrassi/code/LianaBernat/wearable-project/notebooks


In [4]:
path = '../data/data_processed/participants'
file = 'Participants_all.parquet'

In [5]:
df = pd.read_parquet(os.path.join(path, file))

In [6]:
df.head()

Unnamed: 0,pid,window_start,window_end,n_samples,duration_seconds,sex,age_group,label:Walmsley2020,label:Walmsley2020_enc,label:WillettsSpecific2018,label:WillettsSpecific2018_enc,label:WillettsMET2018,label:WillettsMET2018_enc,x_mean,x_std,x_min,x_max,y_mean,y_std,y_min,y_max,z_mean,z_std,z_min,z_max,energy_x,energy_y,energy_z,energy_total,magnitude_mean,corr_xy,corr_xz,corr_yz,fft_dom_freq,fft_peak_power,hour_sin,hour_cos
0,P001,2016-11-13 02:18:00,2016-11-13 02:18:05,500,4.99,1,2,sleep,3,sleep,5,sleep,5,-0.468161,0.004565,-0.482334,-0.46669,-0.537512,0.006892,-0.548902,-0.533341,0.657518,0.00396,0.643077,0.673867,0.219195,0.288966,0.432345,0.940507,0.969787,-0.14848,-0.077644,0.275487,0.4,5e-06,0.566406,0.824126
1,P001,2016-11-13 02:18:05,2016-11-13 02:18:10,500,4.99,1,2,sleep,3,sleep,5,sleep,5,-0.470069,0.006437,-0.482334,-0.46669,-0.537045,0.006771,-0.548902,-0.51778,0.657702,0.003627,0.643077,0.673867,0.221007,0.288463,0.432585,0.942055,0.970582,-0.108382,-0.028882,0.137541,3.6,8e-06,0.566706,0.82392
2,P001,2016-11-13 02:18:10,2016-11-13 02:18:15,500,4.99,1,2,sleep,3,sleep,5,sleep,5,-0.469694,0.006162,-0.482334,-0.46669,-0.537947,0.007104,-0.548902,-0.533341,0.657764,0.003369,0.643077,0.673867,0.22065,0.289438,0.432665,0.942753,0.970944,-0.260468,-0.079268,0.204062,0.4,2e-06,0.567005,0.823714
3,P001,2016-11-13 02:18:15,2016-11-13 02:18:20,500,4.99,1,2,sleep,3,sleep,5,sleep,5,-0.469287,0.005821,-0.482334,-0.46669,-0.537512,0.006962,-0.548902,-0.51778,0.657733,0.003567,0.643077,0.673867,0.220264,0.288967,0.432626,0.941857,0.970483,-0.243211,-0.092415,0.223157,0.4,1e-06,0.567305,0.823508
4,P001,2016-11-13 02:18:20,2016-11-13 02:18:25,500,4.99,1,2,sleep,3,sleep,5,sleep,5,-0.47082,0.006896,-0.482334,-0.46669,-0.535333,0.005291,-0.548902,-0.51778,0.658226,0.002743,0.643077,0.673867,0.221719,0.286609,0.433269,0.941597,0.97035,-0.225457,-0.07925,0.230302,8.8,2e-06,0.567604,0.823302


In [11]:
df.groupby("pid").size().sort_values()

pid
P094     4375
P009     4533
P112     6171
P081     7006
P118     7662
P095     7711
P142     7831
P018     8005
P088     8490
P122     8579
P143     8670
P106     8812
P136     8880
P034     9436
P139     9479
P075     9485
P020     9541
P077     9696
P044     9711
P048     9727
P071     9836
P052     9840
P134     9848
P102     9909
P028     9914
P060    10069
P120    10087
P144    10110
P039    10177
P117    10179
P138    10248
P141    10261
P045    10396
P123    10627
P149    10649
P036    10773
P069    10808
P063    10818
P127    10838
P079    10849
P135    10866
P015    11011
P031    11019
P137    11092
P023    11131
P140    11158
P074    11230
P004    11232
P002    11273
P046    11353
P083    11416
P115    11469
P051    11475
P059    11516
P082    11567
P105    11614
P132    11620
P041    11654
P016    11674
P107    11713
P057    11755
P029    11776
P003    11840
P087    11851
P061    11877
P108    11880
P026    11957
P011    12078
P067    12157
P092    12223
P119    12227
P0

In [10]:
df['pid'].value_counts().get('P001', 0)

np.int64(37381)

In [6]:
df_test = df[df["pid"] == "P001"]
df_train = df[df["pid"] != "P001"]

In [7]:
cols_to_drop = [
    "label:WillettsSpecific2018_enc",
    "label:WillettsMET2018",
    "label:WillettsMET2018_enc",
    "label:WillettsSpecific2018",
    "pid",
    "window_start",
    "window_end",
    "n_samples",
    "duration_seconds",
    "sex","age_group",
    "label:Walmsley2020_enc",
    "label:Walmsley2020",
    "magnitude_mean"

]

In [None]:
#################### substituir nan por 0 nas correlacoes!

In [9]:
X_train = df_train.drop(columns=cols_to_drop)
y_train = df_train["label:Walmsley2020_enc"]

In [8]:
X_test = df_test.drop(columns=cols_to_drop)
y_test = df_test["label:Walmsley2020_enc"]

In [10]:
"""Faz o imputer com medianas para o SMOTE"""
imputer = SimpleImputer(strategy="median")
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.fit_transform(X_test)

### Comparando antes e depois do imputer

In [None]:
print("NaNs no X_train antes:")
print(X_train.isna().sum())

print("NaNs no X_test antes:")
print(X_test.isna().sum())

In [None]:
print("Antes (train):")
print(X_train.describe())

print("Depois (train):")
print(X_train_imp.describe())

In [None]:
"""Smote vai criar novos dados sintéticos para as classes minoritárias a fim de diminuir o desbalancento.
 k_neighbors=5 e sampling strategy=0.5 é para evitar que ele repita padrões"""
smote = SMOTE(random_state=42, k_neighbors=5, sampling_strategy=0.5)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_imputed, y_train)

### Comparando antes e depois do SMOTE

In [None]:
# Verificação de dados duplicados 

df_res = pd.DataFrame(X_train_resampled)
duplicados = df_res.duplicated().sum()

print("Duplicados:", duplicados)

In [None]:
print("Distribuição antes:", y_train.value_counts().to_dict())
print("Distribuição depois:", pd.Series(y_train_res).value_counts().to_dict())

In [None]:
print("X_train antes:", X_train.shape)
print("X_train depois do imputer:", X_train_imp.shape)
print("X_train depois do SMOTE:", X_train_resampled.shape)

### Rodando o modelo

In [None]:
"""class_weight="balanced" => ajudaa o modelo a não ignorar a classe minoritária e corrige um pouco o desbalanceamento;
    oob_score=True => out of bag: 'mini conjunto de validação' que usa os dados que a aleatoriedade do random_forest nao usou para treinar o modelo;
    bootstrap=True => possibilita que o oob seja executado, pois cada amostra é treinada com uma amostra com reposição (assim o X_train nao fica o mesmo)"""
rforest = RandomForestClassifier(
        n_estimators=150,
        max_depth=15,
        class_weight="balanced",
        bootstrap=True,
        oob_score=True,
        random_state=42,
        n_jobs=-1
)

In [1]:
%%time
rforest.fit(X_train_resampled, y_train_resampled)

NameError: name 'rforest' is not defined

### Checando o modelo

In [21]:
y_pred = rforest.predict(X_test_imputed)
y_pred

array([3, 3, 3, ..., 3, 3, 3], shape=(37381,))

In [22]:
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.77      0.78     10856
           1       0.05      0.71      0.09        84
           2       0.86      0.82      0.84     12509
           3       0.99      0.97      0.98     13932

    accuracy                           0.86     37381
   macro avg       0.67      0.82      0.67     37381
weighted avg       0.89      0.86      0.87     37381

[[ 8324  1189  1343     0]
 [   15    60     9     0]
 [ 2111    47 10255    96]
 [  106     4   306 13516]]


In [None]:
prec, rec, f1, sup = precision_recall_fscore_support(y_test, y_pred, average=None)
print("Per-class P/R/F1:", list(zip(prec, rec, f1, sup)))

In [16]:
### APAGAR OOB DEPOIS. fICA AQUI SO DE CURIOSIDADE. ELE NAO É UM BOM MEIO PARA VALIDACAO EXTERNA
print("OOB Accuracy:", rforest.oob_score_)

OOB Accuracy: 0.852690922235511


In [24]:
classes = sorted(list(set(y_test)))  #classes presentes no y_test (no caso, P001)

#  calcula métricas só para essas classes
precision = precision_score(y_test, y_pred, average=None, labels=classes)
recall    = recall_score(y_test, y_pred, average=None, labels=classes)
f1        = f1_score(y_test, y_pred, average=None, labels=classes)

#  traduzindo as classes
class_names = {
    0: "Light",
    1: "Moderate/Vigorous",
    2: "Sedentary",
    3: "Sleep"
}

#  dataframe final de métricas por classe
df_metrics_test = pd.DataFrame({
    "Classe": classes,
    "Categoria": [class_names[c] for c in classes],
    "Precision": precision,
    "Recall": recall,
    "F1": f1
})

df_metrics_test

NameError: name 'y' is not defined