"""
CAPTURE-24: janelamento 5s (tempo-based) e extração de features. Primeiro teste em notebook.

Principais escolhas implementadas:
- Janela de 5 segundos sem sobreposição (tempo-based: [t0, t0+5s))
- Descartar linhas com NA em time,x,y,z
- Para annotation: descartar janela se >= 50% das observações na janela tiverem annotation NA
- Janela válida se n_samples >= min_samples (default 250)
- Calcula estatísticas por eixo, energia, magnitude média, correlações, FFT (Welch na magnitude)
- Gera 3 colunas de rótulos simplificados (strings) e colunas de label-encoded correspondentes
- Inclui campos de auditoria: pid, window_start, window_end, n_samples, duration_seconds
- Salva resultado em arquivo parquet por participante ou combinado
"""

In [1]:
import os
from glob import glob
import numpy as np
import pandas as pd
from datetime import timedelta
from scipy import signal
from tqdm import tqdm
import psutil
import time


pd.set_option('display.max_columns', None)
pd.set_option('display.max_row', None)

In [None]:
SAMPLING_RATE = 100  # Hz
WINDOW_SECONDS = 5
WINDOW_SIZE_SAMPLES = int(SAMPLING_RATE * WINDOW_SECONDS)  # 500
MIN_SAMPLES = 250  # mínimo aceitável por janela
ANNOTATION_NA_THRESHOLD = 0.5  # descartar janela se >= 50% das annotation forem NA
FFT_NFFT = WINDOW_SIZE_SAMPLES  # usar zero-padding até 500 quando necessário
LABEL_COLUMNS_TO_KEEP = ['label:Walmsley2020', 'label:WillettsSpecific2018', 'label:WillettsMET2018']

In [3]:
# Map para sexo e age
SEX_MAP = {'F': 1, 'M': 0}
AGE_MAP = {'18-29': 0, '30-37': 1, '38-52': 2, '53+': 3}

In [None]:
# Diretórios de entrada/saída (AJUSTAR ESSA PARTE)
PARTICIPANT_GLOB = "data/data_raw/capture24/capture24/P*.csv.gz"
METADATA_PATH = "data/data_raw/capture24/capture24/metadata.csv"
ANNOT_DICT_PATH = "data/data_raw/capture24/capture24/annotation-label-dictionary.csv"
OUT_DIR = "data/data_processed/participants"
os.makedirs(OUT_DIR, exist_ok=True)

In [5]:
# ---------- Funções utilitárias ----------
def safe_parse_time(df, time_col='time'):
    """Assegura que coluna time seja datetime e ordena por time."""
    if not np.issubdtype(df[time_col].dtype, np.datetime64):
        df[time_col] = pd.to_datetime(df[time_col], errors='coerce')
    df = df.sort_values(time_col).reset_index(drop=True)
    return df

In [6]:
def compute_window_features_chunked(window_df):
    """
    Extrai features de aceleração + FFT completa para uma janela.
    NÃO inclui metadata, labels ou hora cíclica.
    """

    x = window_df['x'].to_numpy()
    y = window_df['y'].to_numpy()
    z = window_df['z'].to_numpy()

    feats = {}

    # -----------------------------
    # Estatísticas por eixo
    # -----------------------------
    for axis, arr in [('x', x), ('y', y), ('z', z)]:
        feats[f"{axis}_mean"] = np.mean(arr)
        feats[f"{axis}_std"]  = np.std(arr)
        feats[f"{axis}_min"]  = np.min(arr)
        feats[f"{axis}_max"]  = np.max(arr)

    # -----------------------------
    # Energia
    # -----------------------------
    feats['energy_x'] = np.mean(x**2)
    feats['energy_y'] = np.mean(y**2)
    feats['energy_z'] = np.mean(z**2)
    feats['energy_total'] = np.mean(x**2 + y**2 + z**2)

    # -----------------------------
    # Magnitude
    # -----------------------------
    mag = np.sqrt(x**2 + y**2 + z**2)
    feats['magnitude_mean'] = np.mean(mag)

    # -----------------------------
    # Correlações
    # -----------------------------
    def safe_corr(a, b):
        if len(a) < 2:
            return np.nan
        if np.std(a) == 0 or np.std(b) == 0:
            return np.nan
        return float(np.corrcoef(a, b)[0, 1])

    feats['corr_xy'] = safe_corr(x, y)
    feats['corr_xz'] = safe_corr(x, z)
    feats['corr_yz'] = safe_corr(y, z)

    # -----------------------------
    # FFT completa
    # -----------------------------
    mag_dt = signal.detrend(mag)

    freqs, psd = signal.welch(
        mag_dt,
        fs=100,
        nperseg=256,
        nfft=500
    )

    if np.all(np.isnan(psd)):
        feats['fft_dom_freq'] = np.nan
        feats['fft_peak_power'] = np.nan
    else:
        idx = np.argmax(psd)
        feats['fft_dom_freq'] = freqs[idx]
        feats['fft_peak_power'] = psd[idx]

    return feats

In [7]:
def build_annotation_maps(annotation_map_df, label_columns):
    """
    Cria dois dicionários globais:
      - mapping_dicts[col]: mapa {annotation_original → label_simplificado}
      - enc_maps[col]: mapa {label_simplificado → inteiro}

    Deve ser chamado UMA ÚNICA VEZ antes do processamento de janelas.
    """

    mapping_dicts = {}
    enc_maps = {}

    for col in label_columns:
        # Mapeamento original → simplificado
        mapping = dict(zip(annotation_map_df['annotation'].astype(str),
                           annotation_map_df[col].astype(str)))
        mapping_dicts[col] = mapping

        # Encoding fixo baseado no conjunto completo de rótulos simplificados
        unique_labels = sorted(annotation_map_df[col].dropna().unique().tolist())
        enc_maps[col] = {lab: i for i, lab in enumerate(unique_labels)}

    return mapping_dicts, enc_maps

In [8]:
def map_annotations_and_encode(window_df, mapping_dicts, enc_maps, label_columns):
    """
    Faz o mapeamento da coluna 'annotation' original para os rótulos simplificados,
    realiza majority vote e aplica encoding fixo.

    - Assume que a janela já passou pelos filtros (>= 250 amostras e < 50% NA).
    - Se houver empate → 'ambiguous' com encoding -1.
    """

    ann = window_df['annotation'].astype(str)
    result = {}

    for col in label_columns:

        mapped = ann.map(mapping_dicts[col])

        # Majority vote (len(counts) nunca é 0 após filtros)
        counts = mapped.value_counts()
        top_count = counts.iloc[0]
        top_labels = counts[counts == top_count].index.tolist()

        if len(top_labels) > 1:
            major = "ambiguous"
        else:
            major = top_labels[0]

        result[col] = major

        # Encoding fixo (consistente para todas as janelas)
        if major == "ambiguous":
            result[col + "_enc"] = -1
        else:
            result[col + "_enc"] = enc_maps[col][major]

    return result

In [9]:
def choose_chunksize():
    """Escolhe chunksize automaticamente baseado na memória disponível."""
    free_gb = psutil.virtual_memory().available / 1e9

    if free_gb >= 8:
        return 1_200_000
    elif free_gb >= 4:
        return 750_000
    else:
        return 500_000


def process_participant_file_chunked(
    path,
    metadata_df,
    mapping_dicts,
    enc_maps,
    label_columns,
    out_dir=OUT_DIR,
):
    """
    Processa UM participante usando chunks.
    Salva incrementalmente (append) em parquet.
    """
    filename = os.path.basename(path)     # "P001.csv.gz"
    pid = os.path.splitext(filename)[0]   # "P001.csv"
    pid = os.path.splitext(pid)[0]        # "P001"

    print(f"\n=== Processando {pid} ===")

    # Obter metadata
    meta = metadata_df.loc[metadata_df["pid"] == pid].iloc[0]
    sex_code = SEX_MAP.get(meta["sex"], np.nan)
    age_code = AGE_MAP.get(meta["age"], np.nan)

    # Caminho de saída

    # Diretório específico do participante
    out_dir_pid = os.path.join(out_dir, pid)
    os.makedirs(out_dir_pid, exist_ok=True)


    # Escolher chunksize pelo estado atual da memória
    chunksize = choose_chunksize()
    print(f"Usando chunksize: {chunksize:,}")

    reader = pd.read_csv(
        path,
        chunksize=chunksize,
        usecols=['time', 'x', 'y', 'z', 'annotation'],
        dtype={'annotation': 'string'}
    )

    total_windows = 0
    total_valid = 0

    for chunk in tqdm(reader, desc=f"{pid} — chunks"):

        # Garantir tipos corretos
        chunk['time'] = pd.to_datetime(chunk['time'], errors='coerce')
        chunk = chunk.dropna(subset=['time', 'x', 'y', 'z']).reset_index(drop=True)
        if len(chunk) == 0:
            continue

        # Tempo relativo dentro do chunk
        t0 = chunk['time'].iloc[0]
        chunk['t_sec'] = (chunk['time'] - t0).dt.total_seconds()

        # Construir janelas
        t_end = chunk['t_sec'].iloc[-1]
        window_starts = np.arange(0, t_end, WINDOW_SECONDS)

        rows_out = []

        for ws in window_starts:
            we = ws + WINDOW_SECONDS
            mask = (chunk['t_sec'] >= ws) & (chunk['t_sec'] < we)
            wdf = chunk.loc[mask]

            total_windows += 1

            if len(wdf) < MIN_SAMPLES:
                continue
            if wdf['annotation'].isna().mean() >= ANNOTATION_NA_THRESHOLD:
                continue

            total_valid += 1

            # Extrair features (PARTE 1)
            feats = compute_window_features_chunked(wdf)

            # Labels: mapeamento + majority vote
            ann = wdf["annotation"].astype(str)

            for col in label_columns:
                mapped = ann.map(mapping_dicts[col])
                counts = mapped.value_counts()
                top = counts.max()
                winners = counts[counts == top].index.tolist()

                if len(winners) > 1:
                    major = "ambiguous"
                    enc = -1
                else:
                    major = winners[0]
                    enc = enc_maps[col][major]

                feats[col] = major
                feats[col + "_enc"] = enc

            # Hora cíclica
            ws_datetime = t0 + timedelta(seconds=float(ws))
            frac_hour = (
                ws_datetime.hour
                + ws_datetime.minute / 60
                + ws_datetime.second / 3600
                + ws_datetime.microsecond / (3600 * 1e6)
            )
            frac_day = frac_hour / 24

            feats["hour_sin"] = np.sin(2 * np.pi * frac_day)
            feats["hour_cos"] = np.cos(2 * np.pi * frac_day)

            # Metadata
            feats["pid"] = pid
            feats["sex"] = sex_code
            feats["age_group"] = age_code

            # Auditoria
            feats["window_start"] = ws_datetime
            feats["window_end"] = ws_datetime + timedelta(seconds=WINDOW_SECONDS)
            feats["n_samples"] = len(wdf)
            feats["duration_seconds"] = (
                (wdf["time"].iloc[-1] - wdf["time"].iloc[0]).total_seconds()
                if len(wdf) >= 2 else 0
            )

            rows_out.append(feats)

        # Salvamento seguro por chunk (SEM append)
        if rows_out:
            df_out = pd.DataFrame(rows_out)

            # gerar nome incremental chunk_000.parquet, chunk_001.parquet...
            existing = [f for f in os.listdir(out_dir_pid) if f.endswith(".parquet")]
            chunk_id = len(existing)
            chunk_path = os.path.join(out_dir_pid, f"chunk_{chunk_id:03d}.parquet")

            #reorganizando as colunas
            audit_cols = ["pid", "window_start", "window_end","n_samples",
                          "duration_seconds", "sex", "age_group"]

            label_cols = ["label:Walmsley2020", "label:Walmsley2020_enc",
                          "label:WillettsSpecific2018", "label:WillettsSpecific2018_enc",
                          "label:WillettsMET2018", "label:WillettsMET2018_enc",]

            feature_cols = [c for c in df_out.columns
                if c not in audit_cols + label_cols]

            df_out = df_out[audit_cols + label_cols + feature_cols]

            df_out.to_parquet(chunk_path, index=False)

    return out_dir_pid

In [10]:
metadata_df = pd.read_csv(METADATA_PATH)
annotation_map_df = pd.read_csv(ANNOT_DICT_PATH)

In [11]:
mapping_dicts, enc_maps = build_annotation_maps(annotation_map_df, LABEL_COLUMNS_TO_KEEP)

In [12]:
files = sorted(glob(PARTICIPANT_GLOB))


In [13]:
PARTICIPANT_GLOB

'data/data_raw/capture24/capture24/P*.csv.gz'

In [14]:
files[0]

'data/data_raw/capture24/capture24/P001.csv.gz'

In [None]:
f= files[0]
df_part = process_participant_file_chunked(
    path=f,
    metadata_df=metadata_df,
    mapping_dicts=mapping_dicts,
    enc_maps=enc_maps,
    label_columns=LABEL_COLUMNS_TO_KEEP,
    out_dir=OUT_DIR
)



=== Processando P001 ===
Usando chunksize: 750,000


P001 — chunks: 14it [02:48, 12.06s/it]


In [15]:
import os
out_dir_pid = "data/data_processed/participants/P001"
print(len([f for f in os.listdir(out_dir_pid) if f.endswith(".parquet")]), "arquivos .parquet (chunks) encontrados")
print(sorted(os.listdir(out_dir_pid))[:10])  # mostra os primeiros nomes

14 arquivos .parquet (chunks) encontrados
['chunk_000.parquet', 'chunk_001.parquet', 'chunk_002.parquet', 'chunk_003.parquet', 'chunk_004.parquet', 'chunk_005.parquet', 'chunk_006.parquet', 'chunk_007.parquet', 'chunk_008.parquet', 'chunk_009.parquet']


In [2]:
df = pd.read_parquet("data/data_processed/participants/P001/chunk_000.parquet")
df.head()

Unnamed: 0,pid,window_start,window_end,n_samples,duration_seconds,sex,age_group,label:Walmsley2020,label:Walmsley2020_enc,label:WillettsSpecific2018,label:WillettsSpecific2018_enc,label:WillettsMET2018,label:WillettsMET2018_enc,x_mean,x_std,x_min,x_max,y_mean,y_std,y_min,y_max,z_mean,z_std,z_min,z_max,energy_x,energy_y,energy_z,energy_total,magnitude_mean,corr_xy,corr_xz,corr_yz,fft_dom_freq,fft_peak_power,hour_sin,hour_cos
0,P001,2016-11-13 02:18:00,2016-11-13 02:18:05,500,4.99,1,2,sleep,3,sleep,5,sleep,5,-0.468161,0.004565,-0.482334,-0.46669,-0.537512,0.006892,-0.548902,-0.533341,0.657518,0.00396,0.643077,0.673867,0.219195,0.288966,0.432345,0.940507,0.969787,-0.14848,-0.077644,0.275487,0.4,5e-06,0.566406,0.824126
1,P001,2016-11-13 02:18:05,2016-11-13 02:18:10,500,4.99,1,2,sleep,3,sleep,5,sleep,5,-0.470069,0.006437,-0.482334,-0.46669,-0.537045,0.006771,-0.548902,-0.51778,0.657702,0.003627,0.643077,0.673867,0.221007,0.288463,0.432585,0.942055,0.970582,-0.108382,-0.028882,0.137541,3.6,8e-06,0.566706,0.82392
2,P001,2016-11-13 02:18:10,2016-11-13 02:18:15,500,4.99,1,2,sleep,3,sleep,5,sleep,5,-0.469694,0.006162,-0.482334,-0.46669,-0.537947,0.007104,-0.548902,-0.533341,0.657764,0.003369,0.643077,0.673867,0.22065,0.289438,0.432665,0.942753,0.970944,-0.260468,-0.079268,0.204062,0.4,2e-06,0.567005,0.823714
3,P001,2016-11-13 02:18:15,2016-11-13 02:18:20,500,4.99,1,2,sleep,3,sleep,5,sleep,5,-0.469287,0.005821,-0.482334,-0.46669,-0.537512,0.006962,-0.548902,-0.51778,0.657733,0.003567,0.643077,0.673867,0.220264,0.288967,0.432626,0.941857,0.970483,-0.243211,-0.092415,0.223157,0.4,1e-06,0.567305,0.823508
4,P001,2016-11-13 02:18:20,2016-11-13 02:18:25,500,4.99,1,2,sleep,3,sleep,5,sleep,5,-0.47082,0.006896,-0.482334,-0.46669,-0.535333,0.005291,-0.548902,-0.51778,0.658226,0.002743,0.643077,0.673867,0.221719,0.286609,0.433269,0.941597,0.97035,-0.225457,-0.07925,0.230302,8.8,2e-06,0.567604,0.823302


In [4]:
df.shape

(1500, 37)

In [5]:
df.columns

Index(['pid', 'window_start', 'window_end', 'n_samples', 'duration_seconds',
       'sex', 'age_group', 'label:Walmsley2020', 'label:Walmsley2020_enc',
       'label:WillettsSpecific2018', 'label:WillettsSpecific2018_enc',
       'label:WillettsMET2018', 'label:WillettsMET2018_enc', 'x_mean', 'x_std',
       'x_min', 'x_max', 'y_mean', 'y_std', 'y_min', 'y_max', 'z_mean',
       'z_std', 'z_min', 'z_max', 'energy_x', 'energy_y', 'energy_z',
       'energy_total', 'magnitude_mean', 'corr_xy', 'corr_xz', 'corr_yz',
       'fft_dom_freq', 'fft_peak_power', 'hour_sin', 'hour_cos'],
      dtype='object')

In [7]:
df = pd.read_parquet("data/data_processed/participants/P001/chunk_011.parquet")
df.head()

Unnamed: 0,pid,window_start,window_end,n_samples,duration_seconds,sex,age_group,label:Walmsley2020,label:Walmsley2020_enc,label:WillettsSpecific2018,label:WillettsSpecific2018_enc,label:WillettsMET2018,label:WillettsMET2018_enc,x_mean,x_std,x_min,x_max,y_mean,y_std,y_min,y_max,z_mean,z_std,z_min,z_max,energy_x,energy_y,energy_z,energy_total,magnitude_mean,corr_xy,corr_xz,corr_yz,fft_dom_freq,fft_peak_power,hour_sin,hour_cos
0,P001,2016-11-14 02:38:00,2016-11-14 02:38:05,500,4.99,1,2,sleep,3,sleep,5,sleep,5,-0.977192,0.007578,-0.983116,-0.967408,0.024293,0.005817,0.011339,0.026919,0.180042,0.003228,0.165224,0.18082,0.954962,0.000624,0.032425,0.988011,0.993959,-0.103857,-0.045651,0.030106,5.4,8e-06,0.636078,0.771625
1,P001,2016-11-14 02:38:05,2016-11-14 02:38:10,500,4.99,1,2,sleep,3,sleep,5,sleep,5,-0.977202,0.007575,-0.983116,-0.967408,0.02442,0.005705,0.011339,0.026919,0.180105,0.003015,0.165224,0.18082,0.95498,0.000629,0.032447,0.988056,0.993982,-0.190548,-0.054464,0.049911,6.2,7e-06,0.636359,0.771393
2,P001,2016-11-14 02:38:10,2016-11-14 02:38:15,500,4.99,1,2,sleep,3,sleep,5,sleep,5,-0.976248,0.007763,-0.983116,-0.967408,0.024704,0.005432,0.011339,0.026919,0.180054,0.003018,0.165224,0.18082,0.95312,0.00064,0.032429,0.986188,0.99304,-0.114458,-0.044656,-0.025016,5.2,8e-06,0.636639,0.771162
3,P001,2016-11-14 02:38:15,2016-11-14 02:38:20,500,4.99,1,2,sleep,3,sleep,5,sleep,5,-0.977869,0.007377,-0.983116,-0.967408,0.024547,0.005586,0.011339,0.026919,0.180009,0.003299,0.165224,0.196215,0.956283,0.000634,0.032414,0.989331,0.994625,-0.184093,0.065115,-0.032377,5.2,1e-05,0.63692,0.77093
4,P001,2016-11-14 02:38:20,2016-11-14 02:38:25,500,4.99,1,2,sleep,3,sleep,5,sleep,5,-0.977222,0.007571,-0.983116,-0.967408,0.024417,0.005704,0.011339,0.026919,0.179984,0.003502,0.165224,0.196215,0.955021,0.000629,0.032407,0.988056,0.993982,-0.237423,-0.02848,-0.070048,5.4,9e-06,0.6372,0.770699


In [8]:
df.shape


(480, 37)

In [9]:
df = pd.read_parquet("data/data_processed/participants/P001/chunk_013.parquet")
df.head()

Unnamed: 0,pid,window_start,window_end,n_samples,duration_seconds,sex,age_group,label:Walmsley2020,label:Walmsley2020_enc,label:WillettsSpecific2018,label:WillettsSpecific2018_enc,label:WillettsMET2018,label:WillettsMET2018_enc,x_mean,x_std,x_min,x_max,y_mean,y_std,y_min,y_max,z_mean,z_std,z_min,z_max,energy_x,energy_y,energy_z,energy_total,magnitude_mean,corr_xy,corr_xz,corr_yz,fft_dom_freq,fft_peak_power,hour_sin,hour_cos
0,P001,2016-11-14 05:23:00,2016-11-14 05:23:05,500,4.99,1,2,sleep,3,sleep,5,sleep,5,-0.29868,0.006862,-0.310256,-0.294612,0.18064,0.00502,0.166915,0.182476,-0.941242,0.004406,-0.942627,-0.927232,0.089257,0.032656,0.885955,1.007868,1.003915,-0.146141,-0.036645,-0.014947,45.6,2e-06,0.986996,0.160743
1,P001,2016-11-14 05:23:05,2016-11-14 05:23:10,500,4.99,1,2,sleep,3,sleep,5,sleep,5,-0.299837,0.007378,-0.310256,-0.294612,0.180329,0.005367,0.166915,0.182476,-0.941704,0.003656,-0.942627,-0.927232,0.089957,0.032547,0.886819,1.009323,1.004642,-0.172695,-0.017499,0.027836,15.8,1e-06,0.987055,0.160384
2,P001,2016-11-14 05:23:10,2016-11-14 05:23:15,500,4.99,1,2,sleep,3,sleep,5,sleep,5,-0.299118,0.007084,-0.310256,-0.294612,0.180204,0.005495,0.166915,0.182476,-0.941334,0.00427,-0.942627,-0.927232,0.089522,0.032504,0.886128,1.008153,1.004058,-0.137888,-0.046238,0.043534,5.8,2e-06,0.987113,0.160025
3,P001,2016-11-14 05:23:15,2016-11-14 05:23:20,500,4.99,1,2,sleep,3,sleep,5,sleep,5,-0.298398,0.0067,-0.310256,-0.294612,0.179986,0.005705,0.166915,0.182476,-0.941642,0.003768,-0.942627,-0.927232,0.089086,0.032428,0.886704,1.008218,1.004091,-0.182912,-0.004884,-0.064194,4.4,2e-06,0.987171,0.159666
4,P001,2016-11-14 05:23:20,2016-11-14 05:23:25,500,4.99,1,2,sleep,3,sleep,5,sleep,5,-0.297115,0.005735,-0.310256,-0.294612,0.17815,0.006972,0.166915,0.182476,-0.941396,0.004177,-0.942627,-0.927232,0.08831,0.031786,0.886243,1.00634,1.003155,-0.161223,-0.092502,-0.01448,48.8,1e-06,0.987229,0.159307


In [11]:
df.shape


(540, 37)

In [16]:
def process_all_participants(
    pattern=PARTICIPANT_GLOB,
    metadata_df=None,
    mapping_dicts=None,
    enc_maps=None,
    label_columns=LABEL_COLUMNS_TO_KEEP,
    out_dir=OUT_DIR
):
    """
    Processa TODOS os participantes usando o pipeline chunked.

    - pattern: glob com caminho dos arquivos PXXX.csv.gz
    - metadata_df: dataframe carregado previamente
    - mapping_dicts, enc_maps: gerados por build_annotation_maps
    - label_columns: lista das colunas simplificadas que serão usadas
    - out_dir: diretório base para saída

    Retorna lista dos diretórios gerados (1 por participante).
    """

    files = sorted(glob(pattern))
    if len(files) == 0:
        print("Nenhum arquivo encontrado com o padrão:", pattern)
        return []

    print(f"=== Iniciando processamento de {len(files)} participantes ===\n")

    dirs_out = []
    t0 = time.time()

    for f in files:
        try:
            out_pid = process_participant_file_chunked(
                path=f,
                metadata_df=metadata_df,
                mapping_dicts=mapping_dicts,
                enc_maps=enc_maps,
                label_columns=label_columns,
                out_dir=out_dir
            )
            dirs_out.append(out_pid)

        except Exception as e:
            print(f"\nERRO ao processar {f}:\n{e}\n")
            continue

    t1 = time.time()
    print(f"\n=== Finalizado processamento de todos os participantes ===")
    print(f"Tempo total: {(t1 - t0)/60:.2f} minutos\n")

    return dirs_out

In [17]:
dirs = process_all_participants(
    pattern=PARTICIPANT_GLOB,
    metadata_df=metadata_df,
    mapping_dicts=mapping_dicts,
    enc_maps=enc_maps,
    label_columns=LABEL_COLUMNS_TO_KEEP,
    out_dir=OUT_DIR
)

=== Iniciando processamento de 151 participantes ===


=== Processando P001 ===
Usando chunksize: 750,000


P001 — chunks: 14it [02:57, 12.69s/it]



=== Processando P002 ===
Usando chunksize: 750,000


P002 — chunks: 12it [02:15, 11.27s/it]



=== Processando P003 ===
Usando chunksize: 750,000


P003 — chunks: 13it [02:32, 11.76s/it]



=== Processando P004 ===
Usando chunksize: 750,000


P004 — chunks: 11it [02:12, 12.04s/it]



=== Processando P005 ===
Usando chunksize: 750,000


P005 — chunks: 14it [02:47, 11.95s/it]



=== Processando P006 ===
Usando chunksize: 750,000


P006 — chunks: 16it [03:09, 11.87s/it]



=== Processando P007 ===
Usando chunksize: 750,000


P007 — chunks: 14it [02:36, 11.17s/it]



=== Processando P008 ===
Usando chunksize: 750,000


P008 — chunks: 13it [02:20, 10.78s/it]



=== Processando P009 ===
Usando chunksize: 750,000


P009 — chunks: 10it [00:54,  5.45s/it]



=== Processando P010 ===
Usando chunksize: 750,000


P010 — chunks: 13it [01:29,  6.88s/it]



=== Processando P011 ===
Usando chunksize: 750,000


P011 — chunks: 13it [01:31,  7.06s/it]



=== Processando P012 ===
Usando chunksize: 750,000


P012 — chunks: 13it [01:34,  7.29s/it]



=== Processando P013 ===
Usando chunksize: 750,000


P013 — chunks: 12it [01:26,  7.20s/it]



=== Processando P014 ===
Usando chunksize: 750,000


P014 — chunks: 14it [01:40,  7.16s/it]



=== Processando P015 ===
Usando chunksize: 750,000


P015 — chunks: 13it [01:25,  6.58s/it]



=== Processando P016 ===
Usando chunksize: 750,000


P016 — chunks: 12it [01:23,  6.92s/it]



=== Processando P017 ===
Usando chunksize: 750,000


P017 — chunks: 13it [01:31,  7.05s/it]



=== Processando P018 ===
Usando chunksize: 750,000


P018 — chunks: 12it [01:06,  5.54s/it]



=== Processando P019 ===
Usando chunksize: 750,000


P019 — chunks: 13it [01:34,  7.26s/it]



=== Processando P020 ===
Usando chunksize: 750,000


P020 — chunks: 11it [01:06,  6.05s/it]



=== Processando P021 ===
Usando chunksize: 750,000


P021 — chunks: 13it [01:37,  7.51s/it]



=== Processando P022 ===
Usando chunksize: 750,000


P022 — chunks: 15it [01:46,  7.13s/it]



=== Processando P023 ===
Usando chunksize: 750,000


P023 — chunks: 14it [01:20,  5.72s/it]



=== Processando P024 ===
Usando chunksize: 750,000


P024 — chunks: 14it [01:43,  7.40s/it]



=== Processando P025 ===
Usando chunksize: 750,000


P025 — chunks: 13it [01:37,  7.50s/it]



=== Processando P026 ===
Usando chunksize: 750,000


P026 — chunks: 12it [01:16,  6.40s/it]



=== Processando P027 ===
Usando chunksize: 750,000


P027 — chunks: 13it [01:21,  6.30s/it]



=== Processando P028 ===
Usando chunksize: 750,000


P028 — chunks: 13it [01:14,  5.77s/it]



=== Processando P029 ===
Usando chunksize: 750,000


P029 — chunks: 14it [01:21,  5.85s/it]



=== Processando P030 ===
Usando chunksize: 750,000


P030 — chunks: 13it [02:24, 11.15s/it]



=== Processando P031 ===
Usando chunksize: 750,000


P031 — chunks: 13it [01:32,  7.11s/it]



=== Processando P032 ===
Usando chunksize: 750,000


P032 — chunks: 13it [02:58, 13.73s/it]



=== Processando P033 ===
Usando chunksize: 750,000


P033 — chunks: 13it [03:20, 15.46s/it]



=== Processando P034 ===
Usando chunksize: 750,000


P034 — chunks: 14it [02:02,  8.79s/it]



=== Processando P035 ===
Usando chunksize: 750,000


P035 — chunks: 13it [14:53, 68.72s/it]



=== Processando P036 ===
Usando chunksize: 750,000


P036 — chunks: 12it [01:17,  6.44s/it]



=== Processando P037 ===
Usando chunksize: 750,000


P037 — chunks: 14it [01:26,  6.14s/it]



=== Processando P038 ===
Usando chunksize: 750,000


P038 — chunks: 13it [01:35,  7.34s/it]



=== Processando P039 ===
Usando chunksize: 750,000


P039 — chunks: 13it [01:22,  6.32s/it]



=== Processando P040 ===
Usando chunksize: 750,000


P040 — chunks: 12it [01:25,  7.09s/it]



=== Processando P041 ===
Usando chunksize: 750,000


P041 — chunks: 13it [01:31,  7.07s/it]



=== Processando P042 ===
Usando chunksize: 750,000


P042 — chunks: 13it [01:33,  7.18s/it]



=== Processando P043 ===
Usando chunksize: 750,000


P043 — chunks: 13it [02:06,  9.71s/it]



=== Processando P044 ===
Usando chunksize: 750,000


P044 — chunks: 13it [01:48,  8.36s/it]



=== Processando P045 ===
Usando chunksize: 750,000


P045 — chunks: 12it [01:28,  7.38s/it]



=== Processando P046 ===
Usando chunksize: 750,000


P046 — chunks: 13it [01:40,  7.74s/it]



=== Processando P047 ===
Usando chunksize: 750,000


P047 — chunks: 11it [01:34,  8.59s/it]



=== Processando P048 ===
Usando chunksize: 750,000


P048 — chunks: 13it [01:36,  7.41s/it]



=== Processando P049 ===
Usando chunksize: 750,000


P049 — chunks: 13it [01:53,  8.73s/it]



=== Processando P050 ===
Usando chunksize: 750,000


P050 — chunks: 12it [01:42,  8.54s/it]



=== Processando P051 ===
Usando chunksize: 750,000


P051 — chunks: 12it [01:46,  8.86s/it]



=== Processando P052 ===
Usando chunksize: 750,000


P052 — chunks: 12it [01:34,  7.89s/it]



=== Processando P053 ===
Usando chunksize: 750,000


P053 — chunks: 13it [02:01,  9.32s/it]



=== Processando P054 ===
Usando chunksize: 750,000


P054 — chunks: 15it [02:25,  9.71s/it]



=== Processando P055 ===
Usando chunksize: 750,000


P055 — chunks: 13it [01:40,  7.75s/it]



=== Processando P056 ===
Usando chunksize: 750,000


P056 — chunks: 13it [01:50,  8.52s/it]



=== Processando P057 ===
Usando chunksize: 750,000


P057 — chunks: 13it [01:43,  7.98s/it]



=== Processando P058 ===
Usando chunksize: 750,000


P058 — chunks: 13it [01:46,  8.21s/it]



=== Processando P059 ===
Usando chunksize: 750,000


P059 — chunks: 13it [01:40,  7.73s/it]



=== Processando P060 ===
Usando chunksize: 750,000


P060 — chunks: 13it [01:39,  7.62s/it]



=== Processando P061 ===
Usando chunksize: 750,000


P061 — chunks: 13it [01:41,  7.82s/it]



=== Processando P062 ===
Usando chunksize: 750,000


P062 — chunks: 12it [01:46,  8.86s/it]



=== Processando P063 ===
Usando chunksize: 750,000


P063 — chunks: 13it [01:30,  6.96s/it]



=== Processando P064 ===
Usando chunksize: 750,000


P064 — chunks: 13it [01:48,  8.33s/it]



=== Processando P065 ===
Usando chunksize: 750,000


P065 — chunks: 12it [01:41,  8.44s/it]



=== Processando P066 ===
Usando chunksize: 750,000


P066 — chunks: 14it [02:01,  8.65s/it]



=== Processando P067 ===
Usando chunksize: 750,000


P067 — chunks: 13it [01:43,  7.94s/it]



=== Processando P068 ===
Usando chunksize: 750,000


P068 — chunks: 12it [01:48,  9.06s/it]



=== Processando P069 ===
Usando chunksize: 750,000


P069 — chunks: 13it [01:40,  7.72s/it]



=== Processando P070 ===
Usando chunksize: 750,000


P070 — chunks: 13it [01:53,  8.75s/it]



=== Processando P071 ===
Usando chunksize: 750,000


P071 — chunks: 14it [01:36,  6.90s/it]



=== Processando P072 ===
Usando chunksize: 750,000


  chunk['time'] = pd.to_datetime(chunk['time'], errors='coerce')
P072 — chunks: 13it [01:50,  8.48s/it]



=== Processando P073 ===
Usando chunksize: 750,000


P073 — chunks: 14it [01:58,  8.45s/it]



=== Processando P074 ===
Usando chunksize: 750,000


P074 — chunks: 15it [01:49,  7.30s/it]



=== Processando P075 ===
Usando chunksize: 750,000


P075 — chunks: 13it [01:32,  7.12s/it]



=== Processando P076 ===
Usando chunksize: 750,000


P076 — chunks: 14it [01:57,  8.40s/it]



=== Processando P077 ===
Usando chunksize: 750,000


P077 — chunks: 11it [01:26,  7.88s/it]



=== Processando P078 ===
Usando chunksize: 750,000


P078 — chunks: 13it [01:59,  9.22s/it]



=== Processando P079 ===
Usando chunksize: 750,000


P079 — chunks: 12it [01:36,  8.02s/it]



=== Processando P080 ===
Usando chunksize: 750,000


P080 — chunks: 15it [02:06,  8.40s/it]



=== Processando P081 ===
Usando chunksize: 750,000


P081 — chunks: 7it [01:00,  8.61s/it]



=== Processando P082 ===
Usando chunksize: 750,000


P082 — chunks: 12it [01:35,  7.95s/it]



=== Processando P083 ===
Usando chunksize: 750,000


P083 — chunks: 13it [01:37,  7.50s/it]



=== Processando P084 ===
Usando chunksize: 750,000


P084 — chunks: 14it [01:53,  8.14s/it]



=== Processando P085 ===
Usando chunksize: 750,000


P085 — chunks: 13it [01:45,  8.11s/it]



=== Processando P086 ===
Usando chunksize: 750,000


P086 — chunks: 13it [01:51,  8.55s/it]



=== Processando P087 ===
Usando chunksize: 750,000


P087 — chunks: 11it [01:35,  8.72s/it]



=== Processando P088 ===
Usando chunksize: 750,000


P088 — chunks: 14it [01:30,  6.46s/it]



=== Processando P089 ===
Usando chunksize: 750,000


P089 — chunks: 12it [01:43,  8.59s/it]



=== Processando P090 ===
Usando chunksize: 750,000


P090 — chunks: 13it [01:49,  8.42s/it]



=== Processando P091 ===
Usando chunksize: 750,000


P091 — chunks: 13it [01:51,  8.55s/it]



=== Processando P092 ===
Usando chunksize: 750,000


P092 — chunks: 13it [01:42,  7.90s/it]



=== Processando P093 ===
Usando chunksize: 750,000


P093 — chunks: 13it [01:56,  8.97s/it]



=== Processando P094 ===
Usando chunksize: 750,000


P094 — chunks: 8it [00:54,  6.80s/it]



=== Processando P095 ===
Usando chunksize: 750,000


P095 — chunks: 13it [01:27,  6.76s/it]



=== Processando P096 ===
Usando chunksize: 750,000


P096 — chunks: 12it [01:45,  8.83s/it]



=== Processando P097 ===
Usando chunksize: 750,000


P097 — chunks: 14it [02:01,  8.67s/it]



=== Processando P098 ===
Usando chunksize: 750,000


P098 — chunks: 15it [02:06,  8.44s/it]



=== Processando P099 ===
Usando chunksize: 750,000


P099 — chunks: 13it [01:42,  7.91s/it]



=== Processando P100 ===
Usando chunksize: 750,000


P100 — chunks: 14it [02:03,  8.80s/it]



=== Processando P101 ===
Usando chunksize: 750,000


P101 — chunks: 13it [01:53,  8.71s/it]



=== Processando P102 ===
Usando chunksize: 750,000


P102 — chunks: 13it [01:31,  7.04s/it]



=== Processando P103 ===
Usando chunksize: 750,000


P103 — chunks: 12it [01:48,  9.05s/it]



=== Processando P104 ===
Usando chunksize: 750,000


P104 — chunks: 13it [01:46,  8.18s/it]



=== Processando P105 ===
Usando chunksize: 750,000


P105 — chunks: 13it [01:38,  7.57s/it]



=== Processando P106 ===
Usando chunksize: 750,000


P106 — chunks: 13it [01:30,  6.93s/it]



=== Processando P107 ===
Usando chunksize: 750,000


P107 — chunks: 12it [01:43,  8.64s/it]



=== Processando P108 ===
Usando chunksize: 750,000


P108 — chunks: 14it [01:48,  7.76s/it]



=== Processando P109 ===
Usando chunksize: 750,000


P109 — chunks: 12it [01:38,  8.21s/it]



=== Processando P110 ===
Usando chunksize: 750,000


P110 — chunks: 13it [01:51,  8.56s/it]



=== Processando P111 ===
Usando chunksize: 750,000


P111 — chunks: 14it [02:05,  8.97s/it]



=== Processando P112 ===
Usando chunksize: 750,000


P112 — chunks: 13it [01:13,  5.68s/it]



=== Processando P113 ===
Usando chunksize: 750,000


P113 — chunks: 14it [01:54,  8.17s/it]



=== Processando P114 ===
Usando chunksize: 750,000


P114 — chunks: 13it [01:51,  8.56s/it]



=== Processando P115 ===
Usando chunksize: 750,000


P115 — chunks: 13it [01:42,  7.90s/it]



=== Processando P116 ===
Usando chunksize: 750,000


P116 — chunks: 13it [01:41,  7.81s/it]



=== Processando P117 ===
Usando chunksize: 750,000


P117 — chunks: 13it [01:39,  7.66s/it]



=== Processando P118 ===
Usando chunksize: 750,000


P118 — chunks: 13it [01:17,  5.99s/it]



=== Processando P119 ===
Usando chunksize: 750,000


P119 — chunks: 13it [01:43,  7.95s/it]



=== Processando P120 ===
Usando chunksize: 750,000


P120 — chunks: 13it [01:33,  7.21s/it]



=== Processando P121 ===
Usando chunksize: 750,000


P121 — chunks: 14it [02:02,  8.78s/it]



=== Processando P122 ===
Usando chunksize: 750,000


P122 — chunks: 11it [01:21,  7.40s/it]



=== Processando P123 ===
Usando chunksize: 750,000


P123 — chunks: 11it [01:27,  7.93s/it]



=== Processando P124 ===
Usando chunksize: 750,000


P124 — chunks: 11it [01:37,  8.90s/it]



=== Processando P125 ===
Usando chunksize: 750,000


P125 — chunks: 14it [02:03,  8.80s/it]



=== Processando P126 ===
Usando chunksize: 750,000


P126 — chunks: 12it [01:43,  8.60s/it]



=== Processando P127 ===
Usando chunksize: 750,000


P127 — chunks: 10it [01:25,  8.52s/it]



=== Processando P128 ===
Usando chunksize: 750,000


P128 — chunks: 13it [02:01,  9.33s/it]



=== Processando P129 ===
Usando chunksize: 750,000


P129 — chunks: 13it [01:41,  7.77s/it]



=== Processando P130 ===
Usando chunksize: 750,000


P130 — chunks: 13it [01:52,  8.64s/it]



=== Processando P131 ===
Usando chunksize: 750,000


P131 — chunks: 14it [01:51,  7.98s/it]



=== Processando P132 ===
Usando chunksize: 750,000


P132 — chunks: 13it [01:41,  7.84s/it]



=== Processando P133 ===
Usando chunksize: 750,000


P133 — chunks: 15it [02:12,  8.82s/it]



=== Processando P134 ===
Usando chunksize: 750,000


P134 — chunks: 13it [01:33,  7.22s/it]



=== Processando P135 ===
Usando chunksize: 750,000


P135 — chunks: 13it [01:38,  7.60s/it]



=== Processando P136 ===
Usando chunksize: 750,000


P136 — chunks: 9it [01:15,  8.43s/it]



=== Processando P137 ===
Usando chunksize: 750,000


P137 — chunks: 13it [01:32,  7.09s/it]



=== Processando P138 ===
Usando chunksize: 750,000


P138 — chunks: 13it [01:33,  7.16s/it]



=== Processando P139 ===
Usando chunksize: 750,000


P139 — chunks: 11it [01:27,  7.92s/it]



=== Processando P140 ===
Usando chunksize: 750,000


P140 — chunks: 11it [01:31,  8.35s/it]



=== Processando P141 ===
Usando chunksize: 750,000


P141 — chunks: 14it [01:36,  6.88s/it]



=== Processando P142 ===
Usando chunksize: 750,000


P142 — chunks: 8it [01:02,  7.85s/it]



=== Processando P143 ===
Usando chunksize: 750,000


P143 — chunks: 13it [01:27,  6.69s/it]



=== Processando P144 ===
Usando chunksize: 750,000


P144 — chunks: 12it [01:37,  8.13s/it]



=== Processando P145 ===
Usando chunksize: 750,000


P145 — chunks: 14it [01:49,  7.81s/it]



=== Processando P146 ===
Usando chunksize: 750,000


P146 — chunks: 13it [01:59,  9.21s/it]



=== Processando P147 ===
Usando chunksize: 750,000


P147 — chunks: 13it [01:50,  8.53s/it]



=== Processando P148 ===
Usando chunksize: 750,000


P148 — chunks: 13it [01:40,  7.75s/it]



=== Processando P149 ===
Usando chunksize: 750,000


P149 — chunks: 13it [01:32,  7.15s/it]



=== Processando P150 ===
Usando chunksize: 750,000


P150 — chunks: 13it [01:55,  8.92s/it]



=== Processando P151 ===
Usando chunksize: 750,000


P151 — chunks: 15it [01:51,  7.41s/it]


=== Finalizado processamento de todos os participantes ===
Tempo total: 275.70 minutos






In [25]:
df = pd.read_parquet("data/data_processed/participants/P075/chunk_005.parquet").head()


In [22]:
OUT_DIR

'data/data_processed/participants'

In [26]:
file_sanity_check = 'data/data_processed/participants/amostra_P075_chunk_005.csv'
df.to_csv(file_sanity_check)

In [27]:
df = pd.read_parquet("data/data_processed/participants/P148/chunk_011.parquet").head()
file_sanity_check = 'data/data_processed/participants/amostra_P148_chunk_011.csv'
df.to_csv(file_sanity_check)