# PAMAP2: data loading and cleaning


In [1]:
import os
import glob
import pandas as pd
import numpy as np
from pathlib import Path


In [2]:
# Column names from PAMAP2 README (54 columns)
columns = [
    "timestamp", "activity_id", "heart_rate",
    # hand IMU 17 cols
    "hand_temp", "hand_acc16_x", "hand_acc16_y", "hand_acc16_z",
    "hand_acc6_x", "hand_acc6_y", "hand_acc6_z",
    "hand_gyro_x", "hand_gyro_y", "hand_gyro_z",
    "hand_mag_x", "hand_mag_y", "hand_mag_z",
    "hand_ori_w", "hand_ori_x", "hand_ori_y", "hand_ori_z",
    # chest IMU 17 cols
    "chest_temp", "chest_acc16_x", "chest_acc16_y", "chest_acc16_z",
    "chest_acc6_x", "chest_acc6_y", "chest_acc6_z",
    "chest_gyro_x", "chest_gyro_y", "chest_gyro_z",
    "chest_mag_x", "chest_mag_y", "chest_mag_z",
    "chest_ori_w", "chest_ori_x", "chest_ori_y", "chest_ori_z",
    # ankle IMU 17 cols
    "ankle_temp", "ankle_acc16_x", "ankle_acc16_y", "ankle_acc16_z",
    "ankle_acc6_x", "ankle_acc6_y", "ankle_acc6_z",
    "ankle_gyro_x", "ankle_gyro_y", "ankle_gyro_z",
    "ankle_mag_x", "ankle_mag_y", "ankle_mag_z",
    "ankle_ori_w", "ankle_ori_x", "ankle_ori_y", "ankle_ori_z"
]

activity_map = {
    1: "lying", 2: "sitting", 3: "standing", 4: "walking", 5: "running",
    6: "cycling", 7: "nordic_walking", 9: "watching_tv", 10: "computer_work",
    11: "car_driving", 12: "ascending_stairs", 13: "descending_stairs",
    16: "vacuum_cleaning", 17: "ironing", 18: "folding_laundry",
    19: "house_cleaning", 20: "playing_soccer", 24: "rope_jumping"
}


In [3]:
def read_pamap_file(path, columns):
    # Use regex whitespace separator to avoid problems with variable spacing
    try:
        df = pd.read_csv(path, sep=r'\s+', header=None, names=columns, engine='python')
    except Exception as e:
        # fallback: try delim_whitespace
        df = pd.read_csv(path, delim_whitespace=True, header=None, names=columns, engine='python')
    return df


In [4]:
def timestamp_to_datetime(ts_series):
    # ts_series may be seconds, milliseconds, or seconds since start.
    maxv = ts_series.abs().max()
    if pd.isna(maxv):
        return pd.to_datetime(ts_series, unit='s', errors='coerce')
    # epoch milliseconds if value large
    if maxv > 1e12:
        return pd.to_datetime(ts_series, unit='ms', errors='coerce')
    # epoch seconds
    if maxv > 1e9:
        return pd.to_datetime(ts_series, unit='s', errors='coerce')
    # likely relative seconds; convert to timedelta from a fake epoch
    return pd.to_datetime(ts_series, unit='s', errors='coerce')


In [5]:
sample_file = sorted(glob.glob("data/raw/Protocol/*.dat"))[0]
print("Sample file:", sample_file)
df_sample = read_pamap_file(sample_file, columns)
print("shape:", df_sample.shape)
display(df_sample.head())
print("timestamp max:", df_sample['timestamp'].max())


IndexError: list index out of range