"""
CAPTURE-24: janelamento 5s (tempo-based) e extração de features. Primeiro teste em notebook.

Principais escolhas implementadas:
- Janela de 5 segundos sem sobreposição (tempo-based: [t0, t0+5s))
- Descartar linhas com NA em time,x,y,z
- Para annotation: descartar janela se >= 50% das observações na janela tiverem annotation NA
- Janela válida se n_samples >= min_samples (default 250)
- Calcula estatísticas por eixo, energia, magnitude média, correlações e pico espectral da magnitude (FFT direta com zero-padding por padrão; Welch quando USE_WELCH=True)
- Gera 3 colunas de rótulos simplificados (strings) e colunas de label-encoded correspondentes
- Inclui campos de auditoria: pid, window_start, window_end, n_samples, duration_seconds
- Salva resultado em arquivo parquet por participante ou combinado
"""

In [2]:
import os
from glob import glob
import numpy as np
import pandas as pd
from datetime import timedelta
from scipy import signal

In [4]:
# Windowing and spectral-analysis configuration shared by the functions below.
SAMPLING_RATE = 100  # Hz
WINDOW_SECONDS = 5
WINDOW_SIZE_SAMPLES = int(SAMPLING_RATE * WINDOW_SECONDS)  # 500
MIN_SAMPLES = 250  # minimum number of samples for a window to be kept
ANNOTATION_NA_THRESHOLD = 0.5  # drop a window when >= 50% of its annotations are NA
FFT_NFFT = WINDOW_SIZE_SAMPLES  # FFT length; shorter windows are zero-padded up to 500
USE_WELCH = False  # True: Welch PSD of the magnitude; False (current value): one zero-padded FFT
# Columns of the annotation dictionary that are turned into simplified labels.
LABEL_COLUMNS_TO_KEEP = ['label:Walmsley2020', 'label:WillettsSpecific2018', 'label:WillettsMET2018']

In [3]:
# Integer encodings for the 'sex' and 'age' metadata values.
# process_participant_file looks values up with .get(..., np.nan), so any
# value not listed here ends up as NaN in the output.
SEX_MAP = {'F': 1, 'M': 0}
AGE_MAP = {'18-29': 0, '30-37': 1, '38-52': 2, '53+': 3}

In [None]:
# Input/output locations (adjust to your environment).
# NOTE(review): the recorded outputs further down show these values without
# the "../" prefix, and later cells hard-code "data/data_raw/..." paths;
# confirm which working directory the notebook is meant to run from.
PARTICIPANT_GLOB = "../data/data_raw/capture24/capture24/P*.csv.gz"
METADATA_PATH = "../data/data_raw/capture24/capture24/metadata.csv"
ANNOT_DICT_PATH = "../data/data_raw/capture24/capture24/annotation-label-dictionary.csv"
OUT_DIR = "../data/data_processed"
# Create the output directory up front; no error if it already exists.
os.makedirs(OUT_DIR, exist_ok=True)

In [5]:
# ---------- Funções utilitárias ----------
def safe_parse_time(df, time_col='time'):
    """Return a copy of ``df`` with ``time_col`` as datetime, sorted by it.

    Args:
        df: DataFrame containing ``time_col``.
        time_col: Name of the timestamp column.

    Returns:
        A new DataFrame ordered by ``time_col`` with a fresh 0..n-1 index.
        Values that cannot be parsed become ``NaT`` (``errors='coerce'``);
        callers that need clean timestamps must drop them afterwards.

    The input frame is never modified.
    """
    # is_datetime64_any_dtype also recognises tz-aware columns, which
    # np.issubdtype cannot interpret.
    if not pd.api.types.is_datetime64_any_dtype(df[time_col]):
        # assign() builds a new frame, so the caller's column stays untouched.
        df = df.assign(**{time_col: pd.to_datetime(df[time_col], errors='coerce')})
    return df.sort_values(time_col).reset_index(drop=True)

In [6]:
def compute_window_features(window_df, *, sampling_rate=None, nfft=None, use_welch=None):
    """Extract statistical and spectral features from one valid window.

    No metadata (pid, sex, age_group) is added here.

    Args:
        window_df: DataFrame with numeric columns 'x', 'y' and 'z'. It must
            hold at least one row; the caller is expected to have applied
            the minimum-sample filter already.
        sampling_rate: Sampling frequency in Hz. Defaults to SAMPLING_RATE.
        nfft: FFT length. Defaults to FFT_NFFT. Shorter windows are
            zero-padded; longer windows raise the FFT length to the window
            length so no sample is discarded.
        use_welch: If true, estimate the spectrum with Welch's method;
            otherwise use a single FFT. Defaults to USE_WELCH.

    Returns:
        dict mapping feature name to value: per-axis mean/std/min/max,
        per-axis and total energy, mean magnitude, pairwise axis
        correlations, and the dominant frequency / peak power of the
        detrended magnitude signal.
    """
    # Fall back to the module-level configuration only when not overridden.
    fs = SAMPLING_RATE if sampling_rate is None else sampling_rate
    nfft = FFT_NFFT if nfft is None else nfft
    use_welch = USE_WELCH if use_welch is None else use_welch

    x = window_df['x'].to_numpy()
    y = window_df['y'].to_numpy()
    z = window_df['z'].to_numpy()
    n = len(window_df)

    feats = {}

    # Per-axis statistics (population standard deviation, ddof=0)
    for axis, arr in (('x', x), ('y', y), ('z', z)):
        feats[f'{axis}_mean'] = np.mean(arr)
        feats[f'{axis}_std'] = np.std(arr, ddof=0)
        feats[f'{axis}_min'] = np.min(arr)
        feats[f'{axis}_max'] = np.max(arr)

    # Energy (mean squared value)
    squared_norm = x**2 + y**2 + z**2
    feats['energy_x'] = np.mean(x**2)
    feats['energy_y'] = np.mean(y**2)
    feats['energy_z'] = np.mean(z**2)
    feats['energy_total'] = np.mean(squared_norm)

    # Mean magnitude of the acceleration vector
    mag = np.sqrt(squared_norm)
    feats['magnitude_mean'] = np.mean(mag)

    # Pairwise correlations; undefined (NaN) when either axis is constant
    def safe_corr(a, b):
        if len(a) < 2 or np.std(a) == 0 or np.std(b) == 0:
            return np.nan
        return float(np.corrcoef(a, b)[0, 1])

    feats['corr_xy'] = safe_corr(x, y)
    feats['corr_xz'] = safe_corr(x, z)
    feats['corr_yz'] = safe_corr(y, z)

    # Spectral peak of the linearly detrended magnitude
    mag_detrended = signal.detrend(mag)

    if use_welch:
        nperseg = min(256, n)
        freqs, psd = signal.welch(
            mag_detrended,
            fs=fs,
            nperseg=nperseg,
            # welch requires nfft >= nperseg
            nfft=max(nfft, nperseg)
        )
        if np.all(np.isnan(psd)):
            feats['fft_dom_freq'] = np.nan
            feats['fft_peak_power'] = np.nan
        else:
            idx = np.argmax(psd)
            feats['fft_dom_freq'] = freqs[idx]
            feats['fft_peak_power'] = psd[idx]
    else:
        # rfft truncates the input when n > nfft; widen the transform instead
        # so the division by n below matches the samples actually used.
        n_fft = max(nfft, n)
        spectrum = np.fft.rfft(mag_detrended, n=n_fft)
        psd = (np.abs(spectrum)**2) / n
        freqs = np.fft.rfftfreq(n_fft, 1 / fs)
        idx = np.argmax(psd)
        feats['fft_dom_freq'] = freqs[idx]
        feats['fft_peak_power'] = psd[idx]

    return feats

In [7]:
def build_annotation_maps(annotation_map_df, label_columns):
    """Build the annotation-to-label and label-to-integer lookup tables.

    Intended to be called once, before any window is processed, so that every
    window shares the same encoding.

    Args:
        annotation_map_df: DataFrame with an 'annotation' column plus one
            column per entry of ``label_columns``.
        label_columns: Names of the simplified-label columns to use.

    Returns:
        Tuple ``(mapping_dicts, enc_maps)`` where
          - ``mapping_dicts[col]`` maps original annotation -> simplified label
          - ``enc_maps[col]`` maps simplified label -> integer code, assigned
            in sorted label order.
        Every label that appears in ``mapping_dicts[col]`` has a code in
        ``enc_maps[col]``.
    """
    mapping_dicts = {}
    enc_maps = {}

    annotations = annotation_map_df['annotation']

    for col in label_columns:
        # Skip rows with a missing annotation or label: casting them to str
        # would create a literal 'nan' entry that has no integer code.
        usable = annotations.notna() & annotation_map_df[col].notna()
        keys = annotations[usable].astype(str)
        labels = annotation_map_df.loc[usable, col].astype(str)

        # Original annotation -> simplified label
        mapping_dicts[col] = dict(zip(keys, labels))

        # Fixed encoding over the full set of simplified labels
        enc_maps[col] = {lab: i for i, lab in enumerate(sorted(labels.unique().tolist()))}

    return mapping_dicts, enc_maps

In [8]:
def map_annotations_and_encode(window_df, mapping_dicts, enc_maps, label_columns):
    """Assign one simplified label per label column to a window.

    The window's 'annotation' values are mapped to simplified labels, the most
    frequent label wins (majority vote) and the fixed integer encoding is
    applied.

    Args:
        window_df: DataFrame with an 'annotation' column.
        mapping_dicts: ``{col: {annotation: simplified label}}``.
        enc_maps: ``{col: {simplified label: int}}``.
        label_columns: Label columns to produce.

    Returns:
        dict with, for each ``col``, the keys ``col`` (label) and
        ``col + "_enc"`` (integer code):
          - a single winner     -> its label and its code
          - a tie for first     -> 'ambiguous' and -1
          - nothing could be mapped -> None and -1
        A winning label that has no entry in ``enc_maps[col]`` is also
        encoded as -1.
    """
    # Missing annotations cannot vote; drop them before the string cast so
    # they do not turn into placeholder strings.
    annotations = window_df['annotation'].dropna().astype(str)
    result = {}

    for col in label_columns:
        # Annotations absent from the dictionary map to NaN, which
        # value_counts ignores.
        counts = annotations.map(mapping_dicts[col]).value_counts()

        if counts.empty:
            # No annotation in this window is known to the dictionary.
            major = None
        else:
            # value_counts sorts descending, so the first entry is the maximum.
            leaders = counts.index[counts == counts.iloc[0]]
            major = leaders[0] if len(leaders) == 1 else "ambiguous"

        result[col] = major

        # Fixed encoding (consistent across all windows)
        if major is None or major == "ambiguous":
            result[col + "_enc"] = -1
        else:
            result[col + "_enc"] = enc_maps[col].get(major, -1)

    return result

In [9]:
def process_participant_file(
    path,
    metadata_df,
    mapping_dicts,
    enc_maps,
    label_columns,
    out_dir,
    save_per_participant=True
):
    """Turn one participant file (Pxxx.csv.gz) into a table of window features.

    Steps: read the raw data, parse timestamps and drop rows without
    time/x/y/z, cut the recording into consecutive non-overlapping windows of
    WINDOW_SECONDS starting at the first sample, discard windows with fewer
    than MIN_SAMPLES rows or with a share of missing annotations at or above
    ANNOTATION_NA_THRESHOLD, compute signal features, attach pid / sex /
    age_group / labels / cyclic hour-of-day / audit fields, and optionally
    write the result to parquet.

    Args:
        path: Path to the participant CSV; the pid is the file name without
            its two extensions.
        metadata_df: DataFrame with 'pid', 'sex' and 'age' columns.
        mapping_dicts: Annotation -> label maps (see build_annotation_maps).
        enc_maps: Label -> integer maps (see build_annotation_maps).
        label_columns: Label columns to produce.
        out_dir: Directory for ``{pid}_windows.parquet``.
        save_per_participant: Write the parquet file when true.

    Returns:
        DataFrame with one row per kept window, or None when no window
        survives the filters.

    Raises:
        ValueError: A required column is missing from the file.
        IndexError: The pid has no row in ``metadata_df``.
    """
    # Derive the pid from the file name (strip .gz, then .csv)
    filename = os.path.basename(path)
    pid = os.path.splitext(os.path.splitext(filename)[0])[0]

    print(f"Processing {pid}...")

    # 1) Read the compressed file
    df = pd.read_csv(path, dtype={'annotation': 'string'})

    # Column check: report only the columns that are actually absent
    required_cols = {'time', 'x', 'y', 'z', 'annotation'}
    missing_cols = required_cols.difference(df.columns)
    if missing_cols:
        raise ValueError(f"{pid} está faltando colunas obrigatórias: {sorted(missing_cols)}")

    # 2) Cleaning. Parse first, then drop: unparseable timestamps only become
    # NaT during parsing, and a trailing NaT would make the window loop below
    # exit immediately.
    df = safe_parse_time(df, 'time')
    df = df.dropna(subset=['time', 'x', 'y', 'z']).reset_index(drop=True)

    # Metadata (constant for the whole participant, so looked up once)
    meta_rows = metadata_df.loc[metadata_df['pid'] == pid]
    if meta_rows.empty:
        raise IndexError(f"{pid} não encontrado em metadata_df")
    meta = meta_rows.iloc[0]
    sex = SEX_MAP.get(meta['sex'], np.nan)
    age_group = AGE_MAP.get(meta['age'], np.nan)

    # 3) Windows
    results = []

    if not df.empty:
        times = df['time']
        t_end = times.iloc[-1]
        window_delta = pd.Timedelta(seconds=WINDOW_SECONDS)

        # Rows are sorted by time, so each window [start, end) is a contiguous
        # row range. Locating its end with a binary search avoids scanning the
        # whole recording once per window.
        window_end = times.iloc[0]
        row_stop = 0

        while window_end < t_end:
            window_start, row_start = window_end, row_stop
            window_end = window_start + window_delta
            row_stop = int(times.searchsorted(window_end, side='left'))

            n = row_stop - row_start
            if n < MIN_SAMPLES:
                continue

            window_df = df.iloc[row_start:row_stop].reset_index(drop=True)

            if window_df['annotation'].isna().mean() >= ANNOTATION_NA_THRESHOLD:
                continue

            # 4) Signal features
            feats = compute_window_features(window_df)

            # 5) Participant metadata
            feats['pid'] = pid
            feats['sex'] = sex
            feats['age_group'] = age_group

            # 6) Simplified labels
            feats.update(
                map_annotations_and_encode(window_df, mapping_dicts, enc_maps, label_columns)
            )

            # 7) Cyclic hour of day, taken from the window start
            frac_hour = (
                window_start.hour
                + window_start.minute / 60
                + window_start.second / 3600
                + window_start.microsecond / 3.6e9
            )
            frac_day = frac_hour / 24
            feats['hour_sin'] = np.sin(2 * np.pi * frac_day)
            feats['hour_cos'] = np.cos(2 * np.pi * frac_day)

            # Audit fields
            feats['window_start'] = window_start
            feats['window_end'] = window_end
            feats['n_samples'] = n
            feats['duration_seconds'] = (
                window_df['time'].iloc[-1] - window_df['time'].iloc[0]
            ).total_seconds()

            results.append(feats)

    # 8) Save
    if not results:
        print(f"Sem janelas válidas para {pid}.")
        return None

    df_out = pd.DataFrame(results)

    # Identification/audit columns first, then everything else in insertion order
    cols_base = [
        'pid', 'window_start', 'window_end', 'n_samples', 'duration_seconds',
        'sex', 'age_group'
    ]
    other_cols = [c for c in df_out.columns if c not in cols_base]
    df_out = df_out[cols_base + other_cols]

    if save_per_participant:
        out_path = os.path.join(out_dir, f"{pid}_windows.parquet")
        df_out.to_parquet(out_path, index=False)
        print(f"Saved {out_path} ({len(df_out)} janelas).")

    return df_out

In [10]:
METADATA_PATH

'data/data_raw/capture24/capture24/metadata.csv'

In [11]:
# Load the participant metadata table and the annotation -> label dictionary.
metadata_df = pd.read_csv(METADATA_PATH)
annotation_map_df = pd.read_csv(ANNOT_DICT_PATH)

In [12]:
# Quick preview of both tables. The recorded output below this cell shows
# only the annotation dictionary, i.e. the result of the last expression.
metadata_df.head()
annotation_map_df.head()

Unnamed: 0,annotation,label:WillettsSpecific2018,label:WillettsMET2018,label:DohertySpecific2018,label:Willetts2018,label:Doherty2018,label:Walmsley2020
0,7030 sleeping;MET 0.95,sleep,sleep,sleep,sleep,sleep,sleep
1,occupation;office and administrative support;1...,sitting,sitstand+lowactivity,sedentary-screen,sit-stand,sedentary,sedentary
2,home activity;household chores;preparing meals...,household-chores,sitstand+activity,tasks-moderate,mixed,moderate,light
3,occupation;office and administrative support;1...,sitting,sitstand+lowactivity,sedentary-screen,sit-stand,sedentary,sedentary
4,home activity;miscellaneous;sitting;9060 sitti...,sitting,sitstand+lowactivity,sedentary-non-screen,sit-stand,sedentary,sedentary


In [13]:
# Build the label maps once, up front, so every window and participant shares
# the same label -> integer encoding.
mapping_dicts, enc_maps = build_annotation_maps(annotation_map_df, LABEL_COLUMNS_TO_KEEP)

In [14]:
enc_maps

{'label:Walmsley2020': {'light': 0,
  'moderate-vigorous': 1,
  'sedentary': 2,
  'sleep': 3},
 'label:WillettsSpecific2018': {'bicycling': 0,
  'household-chores': 1,
  'manual-work': 2,
  'mixed-activity': 3,
  'sitting': 4,
  'sleep': 5,
  'sports': 6,
  'standing': 7,
  'vehicle': 8,
  'walking': 9},
 'label:WillettsMET2018': {'bicycling': 0,
  'gym': 1,
  'sitstand+activity': 2,
  'sitstand+lowactivity': 3,
  'sitting': 4,
  'sleep': 5,
  'sports': 6,
  'standing': 7,
  'vehicle': 8,
  'walking': 9,
  'walking+activity': 10}}

In [15]:
print(os.listdir("data/data_raw/capture24"))

['capture24']


In [16]:
print(os.listdir("data/data_raw/capture24/capture24"))

['P070.csv.gz', 'P105.csv.gz', 'P027.csv.gz', 'P088.csv.gz', 'P018.csv.gz', 'P129.csv.gz', 'P025.csv.gz', 'P033.csv.gz', 'P106.csv.gz', 'P049.csv.gz', 'P121.csv.gz', 'P056.csv.gz', 'P061.csv.gz', 'P086.csv.gz', 'P122.csv.gz', 'P034.csv.gz', 'P127.csv.gz', 'P052.csv.gz', 'P017.csv.gz', 'P102.csv.gz', 'P075.csv.gz', 'P066.csv.gz', 'metadata.csv', 'P055.csv.gz', 'P048.csv.gz', 'P125.csv.gz', 'P077.csv.gz', 'P060.csv.gz', 'P130.csv.gz', 'P071.csv.gz', 'P103.csv.gz', 'P005.csv.gz', 'P141.csv.gz', 'P043.csv.gz', 'P079.csv.gz', 'P051.csv.gz', 'P112.csv.gz', 'P089.csv.gz', 'P001.csv.gz', 'P069.csv.gz', 'P104.csv.gz', 'P143.csv.gz', 'P020.csv.gz', 'P031.csv.gz', 'P068.csv.gz', 'P139.csv.gz', 'P083.csv.gz', 'P016.csv.gz', 'P101.csv.gz', 'P148.csv.gz', 'P032.csv.gz', 'P036.csv.gz', 'P107.csv.gz', 'P137.csv.gz', 'P006.csv.gz', 'P073.csv.gz', 'P003.csv.gz', 'P064.csv.gz', 'P076.csv.gz', 'P149.csv.gz', 'P131.csv.gz', 'P096.csv.gz', 'P138.csv.gz', 'P117.csv.gz', 'P038.csv.gz', 'P084.csv.gz', 'P110.cs

In [17]:
# Collect the participant files in sorted (deterministic) order.
files = sorted(glob(PARTICIPANT_GLOB))


In [18]:
PARTICIPANT_GLOB

'data/data_raw/capture24/capture24/P*.csv.gz'

In [19]:
files[0]

'data/data_raw/capture24/capture24/P001.csv.gz'

In [None]:
# Trial run: process only the first participant file and write its parquet.
f= files[0]
df_part = process_participant_file(
    path=f,
    metadata_df=metadata_df,
    mapping_dicts=mapping_dicts,
    enc_maps=enc_maps,
    label_columns=LABEL_COLUMNS_TO_KEEP,
    out_dir=OUT_DIR,
    save_per_participant=True
)

Processing P001...


In [3]:
# Read only the timestamp column of P001 to check how many rows the recording has.
df = pd.read_csv('data/data_raw/capture24/capture24/P001.csv.gz', usecols=['time'])
df.shape

(10020001, 1)

In [5]:
USE_WELCH

False

Timedelta('1 days 03:50:00')

Memória livre: 6.39 GB
chunksize usado: 750,000

Chunk lido com sucesso!
Linhas no chunk: 750,000
Tempo de leitura: 0.79 segundos
