# PAMAP2 Feature Engineering

### Imports

In [7]:
import pandas as pd
import numpy as np
from pathlib import Path
from scipy.stats import kurtosis, skew
from scipy.signal import welch


In [8]:
data_path = r"D:\Projects\SmartFit-SmartDiet\data\processed\pamap2_clean.csv"
df = pd.read_csv(data_path)
df['datetime'] = pd.to_datetime(df['timestamp'], unit='s', errors='coerce')
print(df.shape)
df.head()



(348788, 57)


Unnamed: 0,timestamp,activity_id,heart_rate,hand_acc_16g_x,hand_acc_16g_y,hand_acc_16g_z,hand_acc_6g_x,hand_acc_6g_y,hand_acc_6g_z,hand_gyro_x,...,extra_col_48,extra_col_49,extra_col_50,extra_col_51,extra_col_52,extra_col_53,extra_col_54,subject_id,session_type,datetime
0,8.38,0,104.0,30.0,2.37223,8.60074,3.51048,2.43954,8.76165,3.35465,...,-61.1888,-38.9599,-58.1438,1.0,0.0,0.0,0.0,101,protocol,1970-01-01 00:00:08.380
1,8.48,0,104.0,30.0,2.29745,8.9045,3.46984,2.39736,8.94335,3.53551,...,-61.5302,-38.724,-58.386,1.0,0.0,0.0,0.0,101,protocol,1970-01-01 00:00:08.480
2,8.59,0,104.0,30.0,2.40867,9.16819,3.35516,2.48704,9.03315,3.35401,...,-61.0729,-39.5091,-58.6457,1.0,0.0,0.0,0.0,101,protocol,1970-01-01 00:00:08.590
3,8.7,0,104.0,30.0,2.18114,8.86676,3.39125,2.34987,8.7473,3.46048,...,-61.4205,-39.0475,-59.2568,1.0,0.0,0.0,0.0,101,protocol,1970-01-01 00:00:08.700
4,8.81,0,104.0,30.0,2.40681,8.71326,3.39515,2.3971,8.86788,3.58097,...,-60.848,-38.8713,-57.4029,1.0,0.0,0.0,0.0,101,protocol,1970-01-01 00:00:08.810


In [9]:
# PAMAP2 raw data has 100Hz sampling frequency (approx)
# Let's pick 5-second windows = 500 samples per window
WINDOW_SIZE = 5  # seconds
SAMPLE_FREQ = 100  # Hz
SAMPLES_PER_WINDOW = WINDOW_SIZE * SAMPLE_FREQ


In [10]:
# Sort data properly
df.sort_values(by=['subject_id', 'session_type', 'datetime'], inplace=True)

# Reset index for clean iteration
df.reset_index(drop=True, inplace=True)


In [11]:
def extract_features(window_df):
    features = {}
    numeric_cols = window_df.select_dtypes(include=np.number).columns.drop(['activity_id', 'subject_id'])
    
    for col in numeric_cols:
        data = window_df[col].values
        
        # Basic stats
        features[f'{col}_mean'] = np.mean(data)
        features[f'{col}_std'] = np.std(data)
        features[f'{col}_min'] = np.min(data)
        features[f'{col}_max'] = np.max(data)
        features[f'{col}_median'] = np.median(data)
        features[f'{col}_kurtosis'] = kurtosis(data)
        features[f'{col}_skew'] = skew(data)
        
        # Frequency domain: signal energy (sum of squares)
        features[f'{col}_energy'] = np.sum(data**2) / len(data)
        
        # Optional: dominant frequency using Welch PSD estimate
        try:
            freqs, psd = welch(data, fs=SAMPLE_FREQ)
            features[f'{col}_dom_freq'] = freqs[np.argmax(psd)]
        except:
            features[f'{col}_dom_freq'] = np.nan

    # Activity label mode (most frequent activity_id in window)
    features['activity_id'] = window_df['activity_id'].mode()[0]
    features['subject_id'] = window_df['subject_id'].iloc[0]
    features['session_type'] = window_df['session_type'].iloc[0]
    
    # Start and end timestamp of window
    features['window_start'] = window_df['datetime'].iloc[0]
    features['window_end'] = window_df['datetime'].iloc[-1]
    
    return features


In [12]:
windowed_features = []

# We’ll group by subject and session type to avoid mixing sessions
for (subj, session), group in df.groupby(['subject_id', 'session_type']):
    group = group.reset_index(drop=True)
    n_windows = len(group) // SAMPLES_PER_WINDOW
    
    for w in range(n_windows):
        start_idx = w * SAMPLES_PER_WINDOW
        end_idx = start_idx + SAMPLES_PER_WINDOW
        window_df = group.iloc[start_idx:end_idx]
        if len(window_df) == SAMPLES_PER_WINDOW:
            feat = extract_features(window_df)
            windowed_features.append(feat)

features_df = pd.DataFrame(windowed_features)
print("Feature matrix shape:", features_df.shape)
features_df.head()


  features[f'{col}_kurtosis'] = kurtosis(data)
  features[f'{col}_skew'] = skew(data)
  features[f'{col}_kurtosis'] = kurtosis(data)
  features[f'{col}_skew'] = skew(data)
  features[f'{col}_kurtosis'] = kurtosis(data)
  features[f'{col}_skew'] = skew(data)
  features[f'{col}_kurtosis'] = kurtosis(data)
  features[f'{col}_skew'] = skew(data)
  features[f'{col}_kurtosis'] = kurtosis(data)
  features[f'{col}_skew'] = skew(data)
  features[f'{col}_kurtosis'] = kurtosis(data)
  features[f'{col}_skew'] = skew(data)
  features[f'{col}_kurtosis'] = kurtosis(data)
  features[f'{col}_skew'] = skew(data)
  features[f'{col}_kurtosis'] = kurtosis(data)
  features[f'{col}_skew'] = skew(data)
  features[f'{col}_kurtosis'] = kurtosis(data)
  features[f'{col}_skew'] = skew(data)
  features[f'{col}_kurtosis'] = kurtosis(data)
  features[f'{col}_skew'] = skew(data)
  features[f'{col}_kurtosis'] = kurtosis(data)
  features[f'{col}_skew'] = skew(data)
  features[f'{col}_kurtosis'] = kurtosis(data)
  featu

Feature matrix shape: (690, 482)


Unnamed: 0,timestamp_mean,timestamp_std,timestamp_min,timestamp_max,timestamp_median,timestamp_kurtosis,timestamp_skew,timestamp_energy,timestamp_dom_freq,heart_rate_mean,...,extra_col_54_median,extra_col_54_kurtosis,extra_col_54_skew,extra_col_54_energy,extra_col_54_dom_freq,activity_id,subject_id,session_type,window_start,window_end
0,37.74314,16.0473,10.03,65.39,37.825,-1.204632,-0.009388,1682.060455,0.390625,112.142,...,0.0,,,0.0,0.0,0,101,optional,1970-01-01 00:00:10.030,1970-01-01 00:01:05.390
1,93.06666,15.847266,65.5,120.42,93.025,-1.200776,-0.001697,8912.53905,0.390625,113.582,...,0.0,,,0.0,0.0,0,101,optional,1970-01-01 00:01:05.500,1970-01-01 00:02:00.420
2,148.145,16.099049,120.53,175.88,147.935,-1.217297,0.009092,22206.120412,0.390625,114.332,...,0.0,,,0.0,0.0,0,101,optional,1970-01-01 00:02:00.530,1970-01-01 00:02:55.880
3,203.30376,15.789988,175.99,230.59,203.305,-1.199326,-0.000588,41581.742563,0.390625,121.434,...,0.0,,,0.0,0.0,11,101,optional,1970-01-01 00:02:55.990,1970-01-01 00:03:50.590
4,257.9931,15.786906,230.7,285.28,257.995,-1.200005,-1e-05,66809.666057,0.390625,110.098,...,0.0,,,0.0,0.0,11,101,optional,1970-01-01 00:03:50.700,1970-01-01 00:04:45.280


In [13]:
activity_map = {
    1: "lying", 2: "sitting", 3: "standing", 4: "walking", 5: "running",
    6: "cycling", 7: "nordic_walking", 9: "watching_tv", 10: "computer_work",
    11: "car_driving", 12: "ascending_stairs", 13: "descending_stairs",
    16: "vacuum_cleaning", 17: "ironing", 18: "folding_laundry",
    19: "house_cleaning", 20: "playing_soccer", 24: "rope_jumping"
}

features_df['activity'] = features_df['activity_id'].map(activity_map)
features_df.drop(columns=['activity_id'], inplace=True)


In [15]:
out_path = Path("D:\Projects\SmartFit-SmartDiet\data\processed\pamap2_features.csv")
features_df.to_csv(out_path, index=False)
print("Features saved to", out_path)


  out_path = Path("D:\Projects\SmartFit-SmartDiet\data\processed\pamap2_features.csv")


Features saved to D:\Projects\SmartFit-SmartDiet\data\processed\pamap2_features.csv


In [16]:
features_df

Unnamed: 0,timestamp_mean,timestamp_std,timestamp_min,timestamp_max,timestamp_median,timestamp_kurtosis,timestamp_skew,timestamp_energy,timestamp_dom_freq,heart_rate_mean,...,extra_col_54_median,extra_col_54_kurtosis,extra_col_54_skew,extra_col_54_energy,extra_col_54_dom_freq,subject_id,session_type,window_start,window_end,activity
0,37.74314,16.047300,10.03,65.39,37.825,-1.204632,-0.009388,1.682060e+03,0.390625,112.142,...,0.000000,,,0.000000,0.000000,101,optional,1970-01-01 00:00:10.030,1970-01-01 00:01:05.390,
1,93.06666,15.847266,65.50,120.42,93.025,-1.200776,-0.001697,8.912539e+03,0.390625,113.582,...,0.000000,,,0.000000,0.000000,101,optional,1970-01-01 00:01:05.500,1970-01-01 00:02:00.420,
2,148.14500,16.099049,120.53,175.88,147.935,-1.217297,0.009092,2.220612e+04,0.390625,114.332,...,0.000000,,,0.000000,0.000000,101,optional,1970-01-01 00:02:00.530,1970-01-01 00:02:55.880,
3,203.30376,15.789988,175.99,230.59,203.305,-1.199326,-0.000588,4.158174e+04,0.390625,121.434,...,0.000000,,,0.000000,0.000000,101,optional,1970-01-01 00:02:55.990,1970-01-01 00:03:50.590,car_driving
4,257.99310,15.786906,230.70,285.28,257.995,-1.200005,-0.000010,6.680967e+04,0.390625,110.098,...,0.000000,,,0.000000,0.000000,101,optional,1970-01-01 00:03:50.700,1970-01-01 00:04:45.280,car_driving
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,1737.89440,16.203672,1710.02,1765.89,1737.745,-1.205799,0.007561,3.020540e+06,0.390625,170.900,...,-0.203589,-1.268708,0.150928,0.214828,0.390625,109,optional,1970-01-01 00:28:30.020,1970-01-01 00:29:25.890,playing_soccer
686,1794.17280,16.225508,1766.00,1822.14,1794.385,-1.207079,-0.016929,3.219319e+06,0.390625,181.948,...,-0.289263,-0.999795,0.459302,0.250275,0.390625,109,optional,1970-01-01 00:29:26.000,1970-01-01 00:30:22.140,playing_soccer
687,1849.94284,16.108846,1822.25,1877.82,1849.975,-1.207711,0.004146,3.422548e+06,0.390625,179.286,...,-0.194828,-1.130432,0.398761,0.172880,1.171875,109,optional,1970-01-01 00:30:22.250,1970-01-01 00:31:17.820,playing_soccer
688,1905.38410,15.928474,1877.92,1932.94,1905.325,-1.209388,-0.000485,3.630742e+06,0.390625,180.358,...,-0.222285,-0.728815,0.510623,0.173224,1.171875,109,optional,1970-01-01 00:31:17.920,1970-01-01 00:32:12.940,playing_soccer
