In [2]:
import pandas as pd

# Read only the timestamp column of the P001 file and parse it as datetimes.
df = pd.read_csv(
    'data/data_raw/capture24/capture24/P001.csv.gz',
    usecols=['time'],
)
df['time'] = pd.to_datetime(df['time'])

# Span between the first and the last row of the file
# (row order is taken as-is; it is not sorted or checked here).
timestamps = df['time']
timestamps.iloc[-1] - timestamps.iloc[0]

Timedelta('1 days 03:50:00')

In [3]:
import pandas as pd
import psutil
import time

path = "data/data_raw/capture24/capture24/P001.csv.gz"

# Query the memory currently available to this process (decimal gigabytes).
mem = psutil.virtual_memory()
free_gb = mem.available / 1e9
print(f"Memória livre: {free_gb:.2f} GB")

# Pick the rows-per-chunk tier from the available memory:
#   below 4 GB -> 500k rows, 4 to below 8 GB -> 750k rows, 8 GB or more -> 1.2M rows.
if free_gb < 4:
    chunksize = 500_000
elif free_gb < 8:
    chunksize = 750_000
else:
    chunksize = 1_200_000

print(f"chunksize usado: {chunksize:,}")

# Time how long it takes to open the file and pull the first chunk.
t0 = time.time()

# NOTE(review): `reader` keeps the file handle open after this cell; close it
# explicitly once no later cell needs to keep iterating over it.
reader = pd.read_csv(
    path,
    usecols=['time', 'x', 'y', 'z', 'annotation'],
    dtype={'annotation': 'string'},
    chunksize=chunksize,
    low_memory=False,
)
chunk = next(reader)

t1 = time.time()

print(f"\nChunk lido com sucesso!")
print(f"Linhas no chunk: {len(chunk):,}")
print(f"Tempo de leitura: {t1 - t0:.2f} segundos")

Memória livre: 6.15 GB
chunksize usado: 750,000

Chunk lido com sucesso!
Linhas no chunk: 750,000
Tempo de leitura: 0.80 segundos


In [4]:
import pandas as pd
import numpy as np
from datetime import timedelta
import time

WINDOW_SECONDS = 5             # window length, in seconds of `t_sec`
MIN_SAMPLES = 250              # windows with fewer rows are skipped
ANNOTATION_NA_THRESHOLD = 0.5  # windows are skipped once this share of annotations is missing

# Work on a copy, parse timestamps (unparseable values become NaT) and drop
# every row that lacks a timestamp or any accelerometer axis.
chunk = chunk.copy()
chunk['time'] = pd.to_datetime(chunk['time'], errors='coerce')
chunk = chunk.dropna(subset=['time', 'x', 'y', 'z']).reset_index(drop=True)

# Elapsed seconds relative to the first row of the chunk.
t0 = chunk['time'].iloc[0]
chunk['t_sec'] = (chunk['time'] - t0).dt.total_seconds()

# Elapsed time of the last row; window starts are generated strictly below it.
t_end_chunk = chunk['t_sec'].iloc[-1]

# Start offsets (in seconds) of the back-to-back windows inside the chunk.
window_starts = np.arange(0, t_end_chunk, WINDOW_SECONDS)

print(f"Total de janelas teóricas no chunk: {len(window_starts)}")

valid_windows = 0

# From here on `t0` holds the wall-clock start of the windowing loop.
t0 = time.time()

# NOTE(review): every iteration compares the whole `t_sec` column against the
# window bounds, so the loop cost grows with (number of windows x rows).
for ws in window_starts:
    we = ws + WINDOW_SECONDS
    mask = chunk['t_sec'].ge(ws) & chunk['t_sec'].lt(we)
    wdf = chunk.loc[mask]

    n = len(wdf)
    # A window counts only when it has enough rows AND less than the
    # threshold share of missing annotations.
    if n >= MIN_SAMPLES and wdf['annotation'].isna().mean() < ANNOTATION_NA_THRESHOLD:
        valid_windows += 1

t1 = time.time()

print(f"Janelas válidas no chunk: {valid_windows}")
print(f"Tempo do janelamento dentro do chunk: {t1 - t0:.2f} segundos")

Total de janelas teóricas no chunk: 1500
Janelas válidas no chunk: 1500
Tempo do janelamento dentro do chunk: 2.51 segundos


In [5]:
import numpy as np
from scipy import signal
from datetime import timedelta
import time

# Length of each analysis window, in seconds of the chunk-relative `t_sec` axis.
WINDOW_SECONDS = 5
# Windows holding fewer rows than this are skipped by the loop below.
MIN_SAMPLES = 250
# Windows are skipped when the fraction of missing annotations reaches this value.
ANNOTATION_NA_THRESHOLD = 0.5

def compute_features_fft_fast(window_df, fs=100, nperseg=128, nfft=256):
    """Compute per-window statistics plus a coarse Welch spectrum peak.

    Parameters
    ----------
    window_df : pandas.DataFrame
        One window of samples; must provide numeric 'x', 'y' and 'z' columns.
    fs : float, default 100
        Sampling frequency handed to ``scipy.signal.welch``. 100 is the value
        that used to be hard-coded; presumably the recording rate in Hz --
        confirm against the dataset.
    nperseg : int, default 128
        Welch segment length. It is clamped to the number of rows in the
        window so short windows do not trigger scipy's fallback warning.
    nfft : int, default 256
        FFT length (zero-padded) used for each Welch segment.

    Returns
    -------
    dict
        ``{axis}_mean/_std/_min/_max`` for each of x, y, z; ``energy_x``,
        ``energy_y``, ``energy_z``, ``energy_total`` (mean of squares);
        ``mag_mean`` (mean Euclidean norm); ``corr_xy``, ``corr_xz``,
        ``corr_yz`` (NaN when undefined); ``fft_dom_freq`` and
        ``fft_peak_power`` (location and height of the PSD maximum of the
        detrended magnitude, NaN when the PSD is entirely NaN).
    """
    x = window_df['x'].to_numpy()
    y = window_df['y'].to_numpy()
    z = window_df['z'].to_numpy()

    feats = {}

    # ---- Statistics ----
    for axis, arr in [('x', x), ('y', y), ('z', z)]:
        feats[f'{axis}_mean'] = np.mean(arr)
        feats[f'{axis}_std'] = np.std(arr)
        feats[f'{axis}_min'] = np.min(arr)
        feats[f'{axis}_max'] = np.max(arr)

    # ---- Energy ----
    # Square each axis once; the sum is reused for the magnitude below.
    x_sq = x**2
    y_sq = y**2
    z_sq = z**2
    sq_sum = x_sq + y_sq + z_sq
    feats['energy_x'] = np.mean(x_sq)
    feats['energy_y'] = np.mean(y_sq)
    feats['energy_z'] = np.mean(z_sq)
    feats['energy_total'] = np.mean(sq_sum)

    # ---- Magnitude ----
    mag = np.sqrt(sq_sum)
    feats['mag_mean'] = np.mean(mag)

    # ---- Correlations ----
    def safe_corr(a, b):
        # Pearson correlation is undefined for fewer than two points or for a
        # constant series; report NaN instead of letting corrcoef divide by 0.
        if len(a) < 2:
            return np.nan
        if np.std(a) == 0 or np.std(b) == 0:
            return np.nan
        return float(np.corrcoef(a, b)[0, 1])

    feats['corr_xy'] = safe_corr(x, y)
    feats['corr_xz'] = safe_corr(x, z)
    feats['corr_yz'] = safe_corr(y, z)

    # ---- Fast FFT (Welch) ----
    mag_dt = signal.detrend(mag)
    # Never ask for a segment longer than the window itself.
    seg_len = min(nperseg, len(mag_dt))
    freqs, psd = signal.welch(
        mag_dt,
        fs=fs,
        nperseg=seg_len,
        nfft=nfft
    )

    # Same guard as compute_features_fft_precise: argmax on an all-NaN PSD
    # would silently report index 0 as the dominant frequency.
    if np.all(np.isnan(psd)):
        feats['fft_dom_freq'] = np.nan
        feats['fft_peak_power'] = np.nan
    else:
        idx = np.argmax(psd)
        feats['fft_dom_freq'] = freqs[idx]
        feats['fft_peak_power'] = psd[idx]

    return feats


# ------------------------------
#   RUN THE TEST ON THE CHUNK
# ------------------------------

# Start the overall timer BEFORE the preparation steps so that the
# "total (including preparation)" figure printed below really covers them.
t0_global = time.time()

chunk2 = chunk.copy()
chunk2['time'] = pd.to_datetime(chunk2['time'], errors='coerce')
chunk2 = chunk2.dropna(subset=['time', 'x', 'y', 'z']).reset_index(drop=True)

# Elapsed seconds relative to the first row of the chunk.
chunk_start = chunk2['time'].iloc[0]
chunk2['t_sec'] = (chunk2['time'] - chunk_start).dt.total_seconds()

t_end = chunk2['t_sec'].iloc[-1]
window_starts = np.arange(0, t_end, WINDOW_SECONDS)

all_feats = []
valid_windows = 0

# Wall-clock start of the windowing + feature-extraction loop only.
t0 = time.time()

# NOTE(review): every iteration compares the whole `t_sec` column against the
# window bounds, so the loop cost grows with (number of windows x rows).
for ws in window_starts:
    we = ws + WINDOW_SECONDS
    mask = (chunk2['t_sec'] >= ws) & (chunk2['t_sec'] < we)
    wdf = chunk2.loc[mask]

    # Skip windows that are too sparse or mostly unlabelled.
    if len(wdf) < MIN_SAMPLES:
        continue
    if wdf['annotation'].isna().mean() >= ANNOTATION_NA_THRESHOLD:
        continue

    feats = compute_features_fft_fast(wdf)
    valid_windows += 1
    all_feats.append(feats)

t1 = time.time()

print(f"Total de janelas no chunk: {len(window_starts)}")
print(f"Janelas válidas: {valid_windows}")
print(f"Tempo com FFT rápida: {t1 - t0:.2f} segundos")
print(f"Tempo total (incluindo preparação): {t1 - t0_global:.2f} segundos")
print(f"Tamanho da lista de features: {len(all_feats)}")

Total de janelas no chunk: 1500
Janelas válidas: 1500
Tempo com FFT rápida: 4.02 segundos
Tempo total (incluindo preparação): 4.02 segundos
Tamanho da lista de features: 1500


In [6]:
import numpy as np
from scipy import signal
from datetime import timedelta
import time

# Length of each analysis window, in seconds of the chunk-relative `t_sec` axis.
WINDOW_SECONDS = 5
# Windows holding fewer rows than this are skipped by the loop below.
MIN_SAMPLES = 250
# Windows are skipped when the fraction of missing annotations reaches this value.
ANNOTATION_NA_THRESHOLD = 0.5

def compute_features_fft_precise(window_df, fs=100, nperseg=256, nfft=500):
    """Compute per-window statistics plus a finer Welch spectrum peak.

    Parameters
    ----------
    window_df : pandas.DataFrame
        One window of samples; must provide numeric 'x', 'y' and 'z' columns.
    fs : float, default 100
        Sampling frequency handed to ``scipy.signal.welch``. 100 is the value
        that used to be hard-coded; presumably the recording rate in Hz --
        confirm against the dataset.
    nperseg : int, default 256
        Welch segment length. It is clamped to the number of rows in the
        window, so a window shorter than 256 rows is processed as a single
        full-length segment without triggering scipy's fallback warning.
    nfft : int, default 500
        FFT length used for each Welch segment (segments are zero-padded up
        to this size).

    Returns
    -------
    dict
        ``{axis}_mean/_std/_min/_max`` for each of x, y, z; ``energy_x``,
        ``energy_y``, ``energy_z``, ``energy_total`` (mean of squares);
        ``mag_mean`` (mean Euclidean norm); ``corr_xy``, ``corr_xz``,
        ``corr_yz`` (NaN when undefined); ``fft_dom_freq`` and
        ``fft_peak_power`` (location and height of the PSD maximum of the
        detrended magnitude, NaN when the PSD is entirely NaN).
    """
    x = window_df['x'].to_numpy()
    y = window_df['y'].to_numpy()
    z = window_df['z'].to_numpy()

    feats = {}

    # ---- Statistics ----
    for axis, arr in [('x', x), ('y', y), ('z', z)]:
        feats[f'{axis}_mean'] = np.mean(arr)
        feats[f'{axis}_std'] = np.std(arr)
        feats[f'{axis}_min'] = np.min(arr)
        feats[f'{axis}_max'] = np.max(arr)

    # ---- Energy ----
    # Square each axis once; the sum is reused for the magnitude below.
    x_sq = x**2
    y_sq = y**2
    z_sq = z**2
    sq_sum = x_sq + y_sq + z_sq
    feats['energy_x'] = np.mean(x_sq)
    feats['energy_y'] = np.mean(y_sq)
    feats['energy_z'] = np.mean(z_sq)
    feats['energy_total'] = np.mean(sq_sum)

    # ---- Magnitude ----
    mag = np.sqrt(sq_sum)
    feats['mag_mean'] = np.mean(mag)

    # ---- Correlations ----
    def safe_corr(a, b):
        # Pearson correlation is undefined for fewer than two points or for a
        # constant series; report NaN instead of letting corrcoef divide by 0.
        if len(a) < 2:
            return np.nan
        if np.std(a) == 0 or np.std(b) == 0:
            return np.nan
        return float(np.corrcoef(a, b)[0, 1])

    feats['corr_xy'] = safe_corr(x, y)
    feats['corr_xz'] = safe_corr(x, z)
    feats['corr_yz'] = safe_corr(y, z)

    # ---- Full FFT ----
    mag_dt = signal.detrend(mag)

    # Never ask for a segment longer than the window itself.
    seg_len = min(nperseg, len(mag_dt))
    freqs, psd = signal.welch(
        mag_dt,
        fs=fs,
        nperseg=seg_len,
        nfft=nfft   # zero-padding up to nfft points
    )

    # argmax on an all-NaN PSD would silently report index 0; emit NaN instead.
    if np.all(np.isnan(psd)):
        feats['fft_dom_freq'] = np.nan
        feats['fft_peak_power'] = np.nan
    else:
        idx = np.argmax(psd)
        feats['fft_dom_freq'] = freqs[idx]
        feats['fft_peak_power'] = psd[idx]

    return feats


# ------------------------------
#   RUN THE TEST ON THE CHUNK WITH THE FULL FFT
# ------------------------------

# Start the overall timer BEFORE the preparation steps so that the
# "total (including preparation)" figure printed below really covers them.
t0_global = time.time()

chunk3 = chunk.copy()
chunk3['time'] = pd.to_datetime(chunk3['time'], errors='coerce')
chunk3 = chunk3.dropna(subset=['time', 'x', 'y', 'z']).reset_index(drop=True)

# Elapsed seconds relative to the first row of the chunk.
chunk_start = chunk3['time'].iloc[0]
chunk3['t_sec'] = (chunk3['time'] - chunk_start).dt.total_seconds()

t_end = chunk3['t_sec'].iloc[-1]
window_starts = np.arange(0, t_end, WINDOW_SECONDS)

all_feats_precise = []
valid_windows = 0

# Wall-clock start of the windowing + feature-extraction loop only.
t0 = time.time()

# NOTE(review): every iteration compares the whole `t_sec` column against the
# window bounds, so the loop cost grows with (number of windows x rows).
for ws in window_starts:
    we = ws + WINDOW_SECONDS
    mask = (chunk3['t_sec'] >= ws) & (chunk3['t_sec'] < we)
    wdf = chunk3.loc[mask]

    # Skip windows that are too sparse or mostly unlabelled.
    if len(wdf) < MIN_SAMPLES:
        continue
    if wdf['annotation'].isna().mean() >= ANNOTATION_NA_THRESHOLD:
        continue

    feats = compute_features_fft_precise(wdf)
    valid_windows += 1
    all_feats_precise.append(feats)

t1 = time.time()

print(f"Total de janelas no chunk: {len(window_starts)}")
print(f"Janelas válidas: {valid_windows}")
print(f"Tempo com FFT COMPLETA: {t1 - t0:.2f} segundos")
print(f"Tempo total (incluindo preparação): {t1 - t0_global:.2f} segundos")
print(f"Tamanho da lista de features: {len(all_feats_precise)}")


Total de janelas no chunk: 1500
Janelas válidas: 1500
Tempo com FFT COMPLETA: 3.98 segundos
Tempo total (incluindo preparação): 3.98 segundos
Tamanho da lista de features: 1500
