# PAMAP2 Feature Engineering

### Imports

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from scipy.stats import kurtosis, skew
from scipy.signal import welch


### Load cleaned dataset

In [None]:
# Location of the cleaned PAMAP2 table produced by the preprocessing step.
data_path = r"D:\Projects\SmartFit-SmartDiet\data\processed\pamap2_clean.csv"

# Read the CSV and derive a `datetime` column from `timestamp`, which is
# interpreted as seconds; values that cannot be converted become NaT
# (errors='coerce') instead of raising.
df = pd.read_csv(data_path).assign(
    datetime=lambda frame: pd.to_datetime(
        frame['timestamp'], unit='s', errors='coerce'
    )
)
print(df.shape)
df.head()

### Define windowing parameters

In [None]:
# Windowing configuration.
# The PAMAP2 recordings are sampled at roughly 100 Hz, so a 5-second window
# holds 5 * 100 = 500 consecutive samples.
SAMPLE_FREQ = 100  # Hz
WINDOW_SIZE = 5  # seconds
SAMPLES_PER_WINDOW = SAMPLE_FREQ * WINDOW_SIZE


### Prepare dataframe for windowing

In [None]:
# Order rows by subject, then session, then time, and renumber the index
# 0..n-1 in the same call so later positional slicing follows the sorted order.
df.sort_values(
    by=['subject_id', 'session_type', 'datetime'],
    inplace=True,
    ignore_index=True,
)


### Feature extraction function (per window)

In [None]:
def extract_features(window_df, sample_freq=None):
    """Summarise one window of samples as a flat dictionary of features.

    For every numeric column of ``window_df`` except ``activity_id`` and
    ``subject_id``, the following entries are produced (keys are
    ``f"{column}_{name}"``): ``mean``, ``std``, ``min``, ``max``, ``median``,
    ``kurtosis``, ``skew``, ``energy`` (mean of squared samples) and
    ``dom_freq`` (frequency of the largest Welch PSD bin).

    Parameters
    ----------
    window_df : pandas.DataFrame
        Rows of a single window. Must contain ``activity_id``,
        ``subject_id``, ``session_type`` and ``datetime`` columns.
    sample_freq : float, optional
        Sampling rate in Hz passed to ``scipy.signal.welch``. When omitted,
        the notebook-level ``SAMPLE_FREQ`` constant is used.

    Returns
    -------
    dict
        The per-column features plus ``activity_id`` (most frequent label in
        the window), ``subject_id`` and ``session_type`` (taken from the first
        row), and ``window_start`` / ``window_end`` (first and last
        ``datetime`` values).
    """
    if sample_freq is None:
        # Keep the original behaviour: fall back to the global sampling rate.
        sample_freq = SAMPLE_FREQ

    features = {}

    # errors='ignore' keeps this working when the id columns are stored as
    # strings and therefore never appear among the numeric columns.
    numeric_cols = (
        window_df.select_dtypes(include=np.number)
        .columns.drop(['activity_id', 'subject_id'], errors='ignore')
    )

    for col in numeric_cols:
        data = window_df[col].values

        # Basic distribution statistics
        features[f'{col}_mean'] = np.mean(data)
        features[f'{col}_std'] = np.std(data)
        features[f'{col}_min'] = np.min(data)
        features[f'{col}_max'] = np.max(data)
        features[f'{col}_median'] = np.median(data)
        features[f'{col}_kurtosis'] = kurtosis(data)
        features[f'{col}_skew'] = skew(data)

        # Time-domain signal energy: mean of the squared samples
        features[f'{col}_energy'] = np.sum(data**2) / len(data)

        # Dominant frequency from the Welch PSD estimate. This stays
        # best-effort (NaN on failure), but only swallows the errors a bad
        # window can produce, e.g. argmax over an empty PSD, rather than
        # everything including KeyboardInterrupt.
        try:
            freqs, psd = welch(data, fs=sample_freq)
            features[f'{col}_dom_freq'] = freqs[np.argmax(psd)]
        except (ValueError, TypeError):
            features[f'{col}_dom_freq'] = np.nan

    # Window label: the most frequent activity_id among the window's rows
    features['activity_id'] = window_df['activity_id'].mode()[0]
    features['subject_id'] = window_df['subject_id'].iloc[0]
    features['session_type'] = window_df['session_type'].iloc[0]

    # First and last timestamp covered by the window
    features['window_start'] = window_df['datetime'].iloc[0]
    features['window_end'] = window_df['datetime'].iloc[-1]

    return features


### Apply windowing and extract features

In [None]:
# Cut each (subject, session) recording into consecutive, non-overlapping
# windows of SAMPLES_PER_WINDOW rows and extract one feature dict per window.
# Grouping first guarantees a window never mixes two sessions.
windowed_features = []

for _, session_df in df.groupby(['subject_id', 'session_type']):
    session_df = session_df.reset_index(drop=True)

    # Only whole windows are used; a trailing partial window is discarded.
    usable_rows = (len(session_df) // SAMPLES_PER_WINDOW) * SAMPLES_PER_WINDOW

    windowed_features.extend(
        extract_features(session_df.iloc[lo:lo + SAMPLES_PER_WINDOW])
        for lo in range(0, usable_rows, SAMPLES_PER_WINDOW)
    )

features_df = pd.DataFrame(windowed_features)
print("Feature matrix shape:", features_df.shape)
features_df.head()


### Map activity_id to activity name

In [None]:
# Lookup table from numeric activity_id to a readable activity name.
activity_map = {
    1: "lying",
    2: "sitting",
    3: "standing",
    4: "walking",
    5: "running",
    6: "cycling",
    7: "nordic_walking",
    9: "watching_tv",
    10: "computer_work",
    11: "car_driving",
    12: "ascending_stairs",
    13: "descending_stairs",
    16: "vacuum_cleaning",
    17: "ironing",
    18: "folding_laundry",
    19: "house_cleaning",
    20: "playing_soccer",
    24: "rope_jumping",
}

# Replace the numeric id column with the named `activity` column: pop() removes
# `activity_id` from the frame and hands it to map(). Ids that are not keys of
# activity_map become NaN.
features_df['activity'] = features_df.pop('activity_id').map(activity_map)


### Save feature dataframe

In [None]:
# Write the feature table next to the cleaned dataset, without the index column.
# The Windows path is a raw string so that "\P", "\S", "\d" and "\p" are not
# parsed as (invalid) escape sequences; the resulting path is unchanged.
out_path = Path(r"D:\Projects\SmartFit-SmartDiet\data\processed\pamap2_features.csv")
features_df.to_csv(out_path, index=False)
print("Features saved to", out_path)
