In [None]:
import numpy as np
import pandas as pd
import glob
import os

In [None]:


# Directory that holds the raw sensor logs ("" resolves to the working directory)
RAW_DATA_PATH = ""

# Discover every accelerometer / gyroscope log, ordered by file name
acc_files = glob.glob(os.path.join(RAW_DATA_PATH, "acc_exp*.txt"))
acc_files.sort()
gyro_files = glob.glob(os.path.join(RAW_DATA_PATH, "gyro_exp*.txt"))
gyro_files.sort()

# Function to load sensor data (accelerometer or gyroscope)
def load_sensor_data(file_list):
    """Load whitespace-delimited sensor logs and stack them row-wise.

    Parameters
    ----------
    file_list : iterable of str
        Paths of the text files to read, in the order they should be stacked.

    Returns
    -------
    numpy.ndarray
        2-D array holding the rows of every file, one file after another.

    Raises
    ------
    ValueError
        If ``file_list`` is empty (for example because the glob pattern
        matched nothing), instead of the opaque error ``np.vstack`` would give.
    """
    file_list = list(file_list)
    if not file_list:
        raise ValueError(
            "load_sensor_data received no files; check RAW_DATA_PATH and the glob pattern"
        )
    # ndmin=2 keeps a single-row file as shape (1, n_columns) so stacking is uniform.
    return np.vstack([np.loadtxt(path, ndmin=2) for path in file_list])

# Read all acc_expXX_userYY.txt and gyro_expXX_userYY.txt files, extract the
# X, Y, Z values from both accelerometer and gyroscope, and merge them into a
# single dataset. (Kept as a comment: a bare string at the end of the cell
# would be echoed as the cell's output.)

# Load all sensor readings
acc_data = load_sensor_data(acc_files)  # Shape (N, 3) → X, Y, Z acceleration
gyro_data = load_sensor_data(gyro_files)  # Shape (N, 3) → X, Y, Z angular velocity

# The two streams are paired row by row, so their lengths must agree.
if acc_data.shape[0] != gyro_data.shape[0]:
    raise ValueError(
        f"Accelerometer and gyroscope row counts differ: "
        f"{acc_data.shape[0]} vs {gyro_data.shape[0]}"
    )

# Combine accelerometer and gyroscope data (6 features per timestamp)
X_raw = np.hstack((acc_data, gyro_data))  # Shape (N, 6)

print(f"Raw Sensor Data Shape: {X_raw.shape}")  # Expect (Total_samples, 6)

Raw Sensor Data Shape: (1122772, 6)


' Read all acc_expXX_userYY.txt and gyro_expXX_userYY.txt files.\nExtract X, Y, Z values from both accelerometer and gyroscope.\nMerge them into a single dataset.'

In [None]:
# Peek at the first row of the combined array: X_raw was built by hstacking
# acc_data and gyro_data, so accelerometer columns come first, gyroscope after.
X_raw[0]

array([ 0.91805559, -0.1125    ,  0.50972225, -0.05497787, -0.06963864,
       -0.03084869])

Each row in labels.txt contains:<br>
[experiment_id, user_id, activity_id, start_sample, end_sample]

Example:

1 1 5 0 128<br>
1 1 2 128 256<br>
1 1 3 256 384<br>

This means:

Activity 5 starts at sample 0 and ends at 128.<br>
Activity 2 starts at sample 128 and ends at 256.<br>
Activity 3 starts at sample 256 and ends at 384.<br>
We will:<br>

- Read labels.txt.<br>
- Assign activity labels to corresponding samples.

In [None]:
# Load labels file
labels_path = os.path.join(RAW_DATA_PATH, "labels.txt")
labels_df = pd.read_csv(labels_path, sep=" ", header=None, names=["experiment", "user", "activity", "start", "end"])


def _experiment_offsets(sensor_files):
    """Return ({experiment_id: first_row_in_stacked_array}, total_rows).

    The sensor files are stacked in ``sensor_files`` order, so an experiment's
    first row in the stacked array is the number of rows in the files before it.
    The experiment id is parsed from names such as ``acc_exp01_user01.txt``.
    """
    offsets = {}
    total_rows = 0
    for path in sensor_files:
        exp_token = os.path.basename(path).split("_")[1]  # e.g. "exp01"
        exp_id = int(exp_token[len("exp"):])
        with open(path) as handle:
            n_rows = sum(1 for line in handle if line.strip())
        offsets[exp_id] = total_rows
        total_rows += n_rows
    return offsets, total_rows


# start/end in labels.txt are sample indices within one experiment's recording,
# whereas X_raw is every experiment stacked end to end. Shift each interval by
# its experiment's offset; otherwise all labels land in the first few thousand
# rows and overwrite one another.
exp_offsets, n_rows_total = _experiment_offsets(acc_files)
if n_rows_total != X_raw.shape[0]:
    raise ValueError(
        f"Row count from sensor files ({n_rows_total}) does not match X_raw ({X_raw.shape[0]})"
    )

# Create an empty label array (0 = no activity annotated for that sample)
y_raw = np.zeros((X_raw.shape[0],))  # One label per timestamp

# Assign labels to sensor readings
for row in labels_df.itertuples(index=False):
    exp_id = int(row.experiment)
    if exp_id not in exp_offsets:
        raise KeyError(f"labels.txt references experiment {exp_id}, which has no sensor file")
    base = exp_offsets[exp_id]
    # NOTE(review): the slice keeps the original half-open [start, end) reading;
    # if `end` is inclusive in labels.txt this should be `end + 1` — confirm.
    y_raw[base + int(row.start):base + int(row.end)] = row.activity  # Assign activity ID

print(f"Labels Shape: {y_raw.shape}")  # Expect (Total_samples,)


Labels Shape: (1122772,)


In [None]:
# Spot-check the activity id stored for sample index 1400.
y_raw[1400]

5.0

In [None]:
# Collapse each one-hot row of y_seq to its class index, then tally per class.
unique, counts = np.unique(np.argmax(y_seq, axis=1), return_counts=True)
print(
    "📊 Label Distribution in Validation Data:",
    dict(zip(unique, counts)),
)

📊 Label Distribution in Validation Data: {0: 2751, 1: 4686, 2: 4742, 3: 3155, 4: 2785, 5: 2606, 6: 216, 7: 128, 8: 291, 9: 291, 10: 249, 11: 146, 12: 1100676}


#It seems there is a high class imbalance: the transitional actions such as lie_to_sit, stand_to_sit, etc. have very low counts, while lie_to_stand has an extremely high count. Let's balance this and see if it helps with the overfitting.

In [None]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn->imblearn)
  Downloading sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m238.4/238.4 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.13.0 imblearn-0.0 sklearn-compat-0.1.3


In [None]:
# Over-sample the rare classes with SMOTE.
from imblearn.over_sampling import SMOTE

# SMOTE works on integer class ids, so collapse the one-hot labels first
y_train_flat = np.argmax(y_seq, axis=1)

# Report the class distribution before resampling
unique, counts = np.unique(y_train_flat, return_counts=True)
class_counts = dict(zip(unique, counts))
print("Original Class Counts:", class_counts)

# SMOTE also needs 2-D input: flatten each sequence window into a single row
X_train_flat = X_seq.reshape(X_seq.shape[0], -1)

# Bring every rare class (ids 6 through 11) up to 3000 samples; classes not
# listed are left untouched. The original notes name them LAYING, STAND_TO_SIT,
# SIT_TO_STAND, SIT_TO_LIE, LIE_TO_SIT, STAND_TO_LIE — TODO confirm that mapping
# against the encoding used to build y_seq.
desired_samples = {class_id: 3000 for class_id in range(6, 12)}
smote = SMOTE(random_state=42, sampling_strategy=desired_samples)

# Resample, then restore the (samples, TIME_STEPS, FEATURES) layout and one-hot labels
X_resampled_flat, y_resampled_flat = smote.fit_resample(X_train_flat, y_train_flat)
X_resampled = X_resampled_flat.reshape(-1, TIME_STEPS, FEATURES)
y_resampled = to_categorical(y_resampled_flat, num_classes=y_seq.shape[1])

# Report the class distribution after resampling
new_counts = dict(zip(*np.unique(np.argmax(y_resampled, axis=1), return_counts=True)))
print("New Class Counts:", new_counts)

Original Class Counts: {0: 2751, 1: 4686, 2: 4742, 3: 3155, 4: 2785, 5: 2606, 6: 216, 7: 128, 8: 291, 9: 291, 10: 249, 11: 146, 12: 1100676}
New Class Counts: {0: 2751, 1: 4686, 2: 4742, 3: 3155, 4: 2785, 5: 2606, 6: 3000, 7: 3000, 8: 3000, 9: 3000, 10: 3000, 11: 3000, 12: 1100676}


#Now the low-count classes have been balanced. Let's balance the high-count class 12.

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Shrink only the dominant class (id 12) to 4000 samples; the sampling strategy
# lists no other class, so the rest keep all of their samples.
undersample = RandomUnderSampler(random_state=42, sampling_strategy={12: 4000})

# Under-sample the SMOTE output (still in flattened 2-D form)
X_under_flat, y_under_flat = undersample.fit_resample(X_resampled_flat, y_resampled_flat)

# Restore the sequence layout and the one-hot labels
X_under = X_under_flat.reshape(-1, TIME_STEPS, FEATURES)
y_under = to_categorical(y_under_flat, num_classes=y_seq.shape[1])

print(f"Reduced Training Data Shape: {X_under.shape}, {y_under.shape}")
new_counts = dict(zip(*np.unique(np.argmax(y_under, axis=1), return_counts=True)))
print("New Class Counts:", new_counts)

Reduced Training Data Shape: (42725, 50, 6), (42725, 13)
New Class Counts: {0: 2751, 1: 4686, 2: 4742, 3: 3155, 4: 2785, 5: 2606, 6: 3000, 7: 3000, 8: 3000, 9: 3000, 10: 3000, 11: 3000, 12: 4000}


In [None]:
# Persist the balanced dataset so the resampling does not have to be repeated.
# The arrays are stored in their flattened form: after loading, the features
# must be reshaped back to sequence format and the labels one-hot encoded again.
np.savez(
    "Xraw_yraw_balanced.npz",
    array1=X_under_flat,
    array2=y_under_flat,
)

In [None]:
# Reload the balanced dataset. np.load returns an NpzFile that keeps the archive
# open, so read both arrays inside a `with` block to close it deterministically.
with np.load("Xraw_yraw_balanced.npz") as data:
    X_under_flat = data["array1"]  # flattened feature rows
    y_under_flat = data["array2"]  # integer class ids

TIME_STEPS = 50  # 1 second of data (50Hz sampling rate)
FEATURES = 6 # 6 features = acc + gyro

# Restore the sequence layout and the one-hot labels expected downstream
X_under = X_under_flat.reshape(-1, TIME_STEPS, FEATURES)
y_under = to_categorical(y_under_flat, num_classes=13)

print(f"Reduced Training Data Shape: {X_under.shape}, {y_under.shape}")
new_counts = dict(zip(*np.unique(y_under.argmax(axis=1), return_counts=True)))
print("New Class Counts:", new_counts)

Reduced Training Data Shape: (42725, 50, 6), (42725, 13)
New Class Counts: {0: 2751, 1: 4686, 2: 4742, 3: 3155, 4: 2785, 5: 2606, 6: 3000, 7: 3000, 8: 3000, 9: 3000, 10: 3000, 11: 3000, 12: 4000}
