In [1]:
import pandas as pd
from pathlib import Path

# Read the raw train and test CSV files into DataFrames
train_df, test_df = (
    pd.read_csv("../data/raw/train.csv"),
    pd.read_csv("../data/raw/test.csv"),
)

# Report (rows, columns) of each split as a quick sanity check
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

Train shape: (7352, 563)
Test shape: (2947, 563)


In [2]:
# Pull out the target ("Activity") and the "subject" column first, then keep
# every remaining column as the feature matrix.
y, subjects = train_df["Activity"], train_df["subject"]
X = train_df.drop(["Activity", "subject"], axis=1)

print("Feature matrix:", X.shape)
print("Target:", y.shape)

Feature matrix: (7352, 561)
Target: (7352,)


In [3]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the activity labels and get their integer codes
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Lookup table from class name to integer code; shown as the cell output
label_map = {
    name: code
    for name, code in zip(le.classes_, le.transform(le.classes_))
}
label_map

{'LAYING': np.int64(0),
 'SITTING': np.int64(1),
 'STANDING': np.int64(2),
 'WALKING': np.int64(3),
 'WALKING_DOWNSTAIRS': np.int64(4),
 'WALKING_UPSTAIRS': np.int64(5)}

In [4]:
# Feature Scaling (Mandatory for HAR)
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features, then apply it to those same rows.
# The fitted `scaler` object is kept so it can be reused later.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [5]:
# Group feature names by prefix: columns starting with "t" are treated as
# time-domain features, columns starting with "f" as frequency-domain ones.
time_cols = [name for name in X.columns if name.startswith("t")]
freq_cols = [name for name in X.columns if name.startswith("f")]

# Column subsets for each group
X_time, X_freq = X.loc[:, time_cols], X.loc[:, freq_cols]

print("Time features:", X_time.shape)
print("Frequency features:", X_freq.shape)

Time features: (7352, 265)
Frequency features: (7352, 289)


In [6]:
# Per-row summary statistics computed across all feature columns of X.
# Each reduction yields one value per row; the four named Series are then
# placed side by side as the columns of X_stats.
X_stats = pd.concat(
    [
        X.mean(axis=1).rename("row_mean"),
        X.std(axis=1).rename("row_std"),
        X.max(axis=1).rename("row_max"),
        X.min(axis=1).rename("row_min"),
    ],
    axis=1,
)

In [7]:
# Columns whose name contains the literal text "energy()"
energy_cols = [name for name in X.columns if "energy()" in name]

# Per-row mean and standard deviation over just those energy columns
X_energy = (
    X[energy_cols]
    .mean(axis=1)
    .to_frame("energy_mean")
    .assign(energy_std=X[energy_cols].std(axis=1))
)

In [8]:
# Columns whose name starts with "angle"; they are carried over unscaled
angle_cols = [name for name in X.columns if name.startswith("angle")]
X_angles = X.loc[:, angle_cols]

In [9]:
# Assemble the final engineered matrix: scaled features first, then the row
# statistics, the energy summaries, and the raw angle columns.
# The derived frames are re-indexed from 0 so that they line up row-for-row
# with the freshly built scaled frame.
# NOTE: the raw angle columns keep their original labels, so those labels
# occur twice in the result (once scaled, once unscaled).
X_engineered = pd.concat(
    [pd.DataFrame(X_scaled, columns=X.columns)]
    + [part.reset_index(drop=True) for part in (X_stats, X_energy, X_angles)],
    axis="columns",
)

print("Final engineered shape:", X_engineered.shape)

Final engineered shape: (7352, 574)


In [13]:
import numpy as np
import pandas as pd

# Persist the engineered feature matrix and the encoded labels as CSV files,
# both written without the row index.
X_engineered.to_csv("../data/X_engineered_train.csv", index=False)
pd.Series(y_encoded).to_csv("../data/y_train.csv", index=False)

print("Saved engineered features:", X_engineered.shape)

Saved engineered features: (7352, 574)


In [10]:
import joblib

# Serialize the scaler object to disk so it can be loaded again later
joblib.dump(value=scaler, filename="../model/scaler.pkl")
print("✅ Scaler saved")

✅ Scaler saved


In [11]:
def engineer_features(X, scaler):
    """Build the engineered feature matrix for a frame of raw features.

    The result is the column-wise concatenation of four pieces, in this
    order:

    1. every column of ``X`` after ``scaler.transform``;
    2. per-row statistics over all columns of ``X``
       (``row_mean``, ``row_std``, ``row_max``, ``row_min``);
    3. per-row mean and standard deviation over the columns whose name
       contains ``"energy()"`` (``energy_mean``, ``energy_std``);
    4. the unscaled columns of ``X`` whose name starts with ``"angle"``.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw feature frame. Its index may be arbitrary (for example a subset
        produced by a train/validation split).
    scaler : object
        Already-fitted transformer exposing ``transform(X)`` and returning
        one row per row of ``X`` and one column per column of ``X``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``X``, carrying ``X``'s index.

    Notes
    -----
    The angle columns appear twice under the same label (scaled in piece 1,
    raw in piece 4), so the returned frame has duplicate column labels.
    """
    # Re-attach X's own index as well as its column labels. Without the
    # index, the scaled frame would get a fresh 0..n-1 index while the other
    # pieces keep X's index, and ``pd.concat(axis=1)`` would align on the
    # mismatched labels and return extra rows filled with NaN.
    X_scaled = pd.DataFrame(
        scaler.transform(X),
        columns=X.columns,
        index=X.index,
    )

    # Row-wise summary statistics over every raw feature column
    X_stats = pd.DataFrame({
        "row_mean": X.mean(axis=1),
        "row_std": X.std(axis=1),
        "row_max": X.max(axis=1),
        "row_min": X.min(axis=1)
    })

    # Row-wise mean/std restricted to the "energy()" columns
    energy_cols = [c for c in X.columns if "energy()" in c]
    X_energy = pd.DataFrame({
        "energy_mean": X[energy_cols].mean(axis=1),
        "energy_std": X[energy_cols].std(axis=1)
    })

    # Raw (unscaled) angle columns
    angle_cols = [c for c in X.columns if c.startswith("angle")]
    X_angles = X[angle_cols]

    # All four pieces now share X's index, so this is a plain side-by-side join
    return pd.concat(
        [X_scaled, X_stats, X_energy, X_angles],
        axis=1
    )