# PAMAP2: data loading and cleaning


### Imports

In [None]:
import os
import glob
import numpy as np
import pandas as pd


### Paths

In [None]:
# Root folder of the raw data; edit this to match your machine
base_path = r"D:\Projects\SmartFit-SmartDiet\data\raw"

# Two subfolders are expected directly under the root
protocol_path, optional_path = (
    os.path.join(base_path, subfolder) for subfolder in ("Protocol", "Optional")
)

print("Protocol path:", protocol_path)
print("Optional path:", optional_path)


### Find all .dat files

In [None]:
# Match every .dat file at any depth below a folder ("**" needs recursive=True)
dat_pattern = os.path.join("**", "*.dat")

protocol_files = glob.glob(os.path.join(protocol_path, dat_pattern), recursive=True)
protocol_files.sort()
optional_files = glob.glob(os.path.join(optional_path, dat_pattern), recursive=True)
optional_files.sort()

print(f"Protocol files: {len(protocol_files)}")
print(f"Optional files: {len(optional_files)}")

# Show the first match from each folder as a sanity check (skipped when empty)
if len(protocol_files) > 0:
    print("Sample Protocol file:", protocol_files[0])
if len(optional_files) > 0:
    print("Sample Optional file:", optional_files[0])


### Define column names

In [None]:
# Column layout of a PAMAP2 .dat file (54 values per row), per the dataset readme:
#   1       timestamp (s)
#   2       activity ID
#   3       heart rate (bpm)
#   4-20    hand IMU
#   21-37   chest IMU
#   38-54   ankle IMU
# Each 17-value IMU block is ordered as:
#   temperature, 3D acceleration (+-16g), 3D acceleration (+-6g),
#   3D gyroscope, 3D magnetometer, 4 orientation values
#   (the readme marks orientation as invalid in this data collection).
# Temperature therefore comes FIRST in each block, not last; getting this
# order wrong shifts every sensor name onto the wrong data column.
imu_fields = (
    ["temp"]
    + [
        f"{sensor}_{axis}"
        for sensor in ("acc_16g", "acc_6g", "gyro", "mag")
        for axis in ("x", "y", "z")
    ]
    + [f"orient_{i}" for i in range(1, 5)]
)

columns = ["timestamp", "activity_id", "heart_rate"] + [
    f"{location}_{field}"
    for location in ("hand", "chest", "ankle")
    for field in imu_fields
]


### Function to read a file

In [None]:
def read_pamap_file(filepath, columns):
    """Load one space-separated, header-less data file into a DataFrame.

    Args:
        filepath: Path of the file to read.
        columns: Names to assign to the parsed columns.

    Returns:
        The parsed DataFrame with every column that is entirely NaN removed.
    """
    parsed = pd.read_csv(
        filepath,
        sep=" ",
        header=None,
        names=columns,
        comment=None,
    )
    # Columns with no values at all carry no information for this file
    return parsed.dropna(axis=1, how="all")


### Load all Protocol files

In [None]:
# Parse every Protocol file and stack the results into one frame
protocol_dfs = [read_pamap_file(path, columns) for path in protocol_files]

if protocol_dfs:
    df_protocol = pd.concat(protocol_dfs, ignore_index=True)
else:
    # No files were found: fall back to an empty frame
    df_protocol = pd.DataFrame()
print("Protocol rows:", len(df_protocol))


### Load all Optional files

In [None]:
# Parse every Optional file and stack the results into one frame
optional_dfs = [read_pamap_file(path, columns) for path in optional_files]

if optional_dfs:
    df_optional = pd.concat(optional_dfs, ignore_index=True)
else:
    # No files were found: fall back to an empty frame
    df_optional = pd.DataFrame()
print("Optional rows:", len(df_optional))


### Combine data

In [None]:
# Stack the Protocol and Optional frames into one frame with a fresh 0..n-1 index
all_parts = [df_protocol, df_optional]
df_combined = pd.concat(all_parts, ignore_index=True)

n_rows, n_cols = df_combined.shape
print("Combined rows:", n_rows, "columns:", n_cols)


### Quick Clean

In [None]:
# Discard rows that have no activity ID, then renumber the remaining rows from 0
df_combined = df_combined.dropna(subset=["activity_id"]).reset_index(drop=True)

print("After cleaning:", df_combined.shape)
# Last expression of the cell: the notebook displays the first rows
df_combined.head()


### Drop rows with any remaining missing values and store the result in df_clean

In [None]:
# Keep only rows in which every column has a value, then renumber from 0.
# NOTE(review): if heart_rate is populated far less often than the IMU columns
# (presumably the case for PAMAP2 - confirm against the dataset readme), this
# blanket dropna() discards most rows; consider filling heart_rate first.
complete_rows = df_combined.dropna()
df_clean = complete_rows.reset_index(drop=True)

### Save cleaned dataset

In [None]:
# Save the cleaned dataset in a "processed" folder that sits next to the raw
# data folder (base_path). A literal such as "..SmartFit-SmartDiet/..." has no
# separator after "..", so it would create a folder with that exact name under
# the current working directory instead of pointing at the project tree.
# normpath strips any trailing separator so dirname really returns the parent.
processed_dir = os.path.join(os.path.dirname(os.path.normpath(base_path)), "processed")
output_path = os.path.join(processed_dir, "pamap2_clean.csv")

# Create the target folder on first run; a no-op when it already exists
os.makedirs(processed_dir, exist_ok=True)
df_clean.to_csv(output_path, index=False)

print(f"Cleaned dataset saved to {output_path}")
print(df_clean.shape)