In [1]:
import os
import pandas as pd
import numpy as np
from scipy.signal import butter, filtfilt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tqdm import tqdm

In [2]:
# Base directory that contains the sub1 ... sub10 folders.
# A raw string is used so that every backslash in the Windows path is taken
# literally (the previous literal contained the invalid escape sequence "\I").
# Set the IMU_DATASET_DIR environment variable to point at another location
# without editing this cell.
BASE_DIR = os.environ.get(
    "IMU_DATASET_DIR",
    r"C:\Users\lankh\Downloads\SFU-IMU Dataset\IMU Dataset",
)

# Subject folder names: sub1 to sub10
subjects = [f'sub{i}' for i in range(1, 11)]

# Activity category folder names expected inside each subject folder
categories = ['ADLs', 'Falls', 'Near_Falls']

# Sensor channels kept from each recording (waist accelerometer + gyroscope)
selected_columns = [
    'waist Acceleration X (m/s^2)',
    'waist Acceleration Y (m/s^2)',
    'waist Acceleration Z (m/s^2)',
    'waist Angular Velocity X (rad/s)',
    'waist Angular Velocity Y (rad/s)',
    'waist Angular Velocity Z (rad/s)'
]

# Sampling frequency of the IMU data (Hz).
# NOTE(review): this value is assumed, not read from the files; the low-pass
# filter's cutoff is normalised against it, so verify it against the dataset
# documentation before trusting the filtered output.
SAMPLING_FREQUENCY = 50  # Adjust if different

# Low-pass (Butterworth) filter configuration
CUTOFF_FREQUENCY = 5  # Hz
FILTER_ORDER = 4

# Output path for the preprocessed data (written next to the subject folders)
OUTPUT_FILE = os.path.join(BASE_DIR, 'preprocessed_imu_data.csv')

In [3]:
# One DataFrame per successfully read recording; concatenated once after the loop
data_frames = []

# Walk BASE_DIR/<subject>/<category>/*.xlsx
print("Loading and filtering data...")
for subject in tqdm(subjects, desc='Subjects'):
    subject_path = os.path.join(BASE_DIR, subject)

    for category in categories:
        category_path = os.path.join(subject_path, category)

        # A missing category folder is reported and skipped, not treated as fatal
        if not os.path.isdir(category_path):
            print(f"Warning: Directory {category_path} does not exist. Skipping...")
            continue

        # os.listdir() returns entries in arbitrary order; sorting makes the row
        # order of the combined frame reproducible from run to run.
        for file_name in sorted(os.listdir(category_path)):
            # Keep only .xlsx workbooks and ignore "~$..." lock files, which
            # Excel creates next to a workbook while it is open and which
            # cannot be parsed as workbooks.
            if not file_name.endswith('.xlsx') or file_name.startswith('~$'):
                continue

            file_path = os.path.join(category_path, file_name)

            try:
                # Read the Excel file
                df = pd.read_excel(file_path)

                # Keep only the configured sensor channels (copy so the label
                # columns below are added to an independent frame)
                df_selected = df[selected_columns].copy()

                # Label every row with the folder names it was loaded from
                df_selected['Activity'] = category
                df_selected['Subject'] = subject

                data_frames.append(df_selected)

            except Exception as e:
                # Best effort: an unreadable file or one lacking an expected
                # column is reported and the remaining files are still loaded.
                print(f"Error reading {file_path}: {e}")

# Concatenate all recordings into a single DataFrame with a fresh 0..n-1 index
if data_frames:
    data = pd.concat(data_frames, ignore_index=True)
    print(f"Total data points before preprocessing: {len(data)}")
else:
    raise ValueError("No data was loaded. Please check the dataset path and folder structure.")

Loading and filtering data...


Subjects: 100%|████████████████████████████████████████████████████████████████████████| 10/10 [10:00<00:00, 60.03s/it]

Total data points before preprocessing: 1190369





In [6]:
def low_pass_filter(data, cutoff, fs, order=4, columns=None):
    """
    Apply a low-pass Butterworth filter to columns of a DataFrame.

    The filter coefficients are designed with ``butter`` and applied with
    ``filtfilt`` (one forward and one backward pass over each column).
    The input frame is not modified; a filtered copy is returned.

    Parameters:
    - data: pandas DataFrame containing the data to filter
    - cutoff: cutoff frequency in Hz; must satisfy 0 < cutoff < fs / 2
    - fs: sampling frequency in Hz
    - order: order of the Butterworth filter
    - columns: names of the columns to filter; when None, the notebook-level
      ``selected_columns`` list is used

    Returns:
    - New pandas DataFrame in which the requested columns are filtered and
      every other column is copied unchanged

    Raises:
    - ValueError: if cutoff is not strictly between 0 and the Nyquist
      frequency (fs / 2)
    """
    nyquist = 0.5 * fs
    if not 0 < cutoff < nyquist:
        raise ValueError(
            f"cutoff must be between 0 and the Nyquist frequency ({nyquist} Hz), got {cutoff}"
        )

    # butter() expects the cutoff as a fraction of the Nyquist frequency
    normal_cutoff = cutoff / nyquist
    b, a = butter(order, normal_cutoff, btype='low', analog=False)

    if columns is None:
        columns = selected_columns

    # Work on a copy so the caller's frame keeps its unfiltered values
    filtered = data.copy()
    for column in columns:
        # filtfilt returns a plain array, which is assigned positionally
        filtered[column] = filtfilt(b, a, data[column].to_numpy(dtype=float))

    return filtered

In [7]:
print("Starting preprocessing...")

# Step 1 - missing values.
# Every row that contains at least one NaN is removed in place. (Forward-filling
# would be the alternative strategy; it is not used here.)
data.dropna(inplace=True)
print(f"Data points after handling missing values: {len(data)}")

# Step 2 - suppress high-frequency noise with the Butterworth low-pass filter.
# NOTE(review): the filter runs over each whole column of `data`; if `data`
# holds several recordings stacked end to end, samples next to a join are
# smoothed together with the neighbouring recording - confirm this is intended.
# NOTE: `data` is overwritten below, so running this cell a second time would
# filter values that are already filtered.
data = low_pass_filter(
    data,
    cutoff=CUTOFF_FREQUENCY,
    fs=SAMPLING_FREQUENCY,
    order=FILTER_ORDER,
)

# Step 3 - standardise the sensor channels; the fitted scaler stays available
# under the name `scaler`.
scaler = StandardScaler().fit(data[selected_columns])
data[selected_columns] = scaler.transform(data[selected_columns])

# Step 4 - encode the activity names as integers in a new 'Activity_Label'
# column; the fitted encoder stays available under the name `label_encoder`.
label_encoder = LabelEncoder().fit(data['Activity'])
data['Activity_Label'] = label_encoder.transform(data['Activity'])

# ---------------------------------------------
# Persist the result
# ---------------------------------------------

# Write the preprocessed frame to CSV without the index column
data.to_csv(OUTPUT_FILE, index=False)
print(f"Preprocessed data saved to {OUTPUT_FILE}")

Starting preprocessing...
Data points after handling missing values: 1190369
Preprocessed data saved to C:\Users\lankh\Downloads\SFU-IMU Dataset\IMU Dataset\preprocessed_imu_data.csv
