# Data Exploration of Physical Therapy Dataset

## Imports + Data

In [1]:
import os
import math
import kagglehub
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

import seglearn as seg
import tensorflow as tf
import sklearn

from functools import reduce

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download latest version
path = kagglehub.dataset_download("rabieelkharoua/physical-therapy-exercises-dataset")

print("Path to dataset files:", path)

Path to dataset files: /Users/hammy/.cache/kagglehub/datasets/rabieelkharoua/physical-therapy-exercises-dataset/versions/1


Collecting metadata for template times and subject/exercise/sensor information to use for later

## Data Loading

In [3]:
times = []
data = []

for subject in os.listdir(path):
    subject_path = os.path.join(path, subject)
    if not os.path.isdir(subject_path):
        continue  # skip non-folder items
    
    for exercise in os.listdir(subject_path):
        exercise_path = os.path.join(subject_path, exercise)
        if not os.path.isdir(exercise_path):
            continue

        # CASE A: .txt files directly inside exercise folder
        for f in os.listdir(exercise_path):
            file_path = os.path.join(exercise_path, f)
            if os.path.isfile(file_path) and f.endswith(".txt"):
                times.append({
                    "subject": subject,
                    "exercise": exercise,
                    "file_name": f,
                    "path": file_path
                })

        # CASE B: nested sensor folders inside exercise folder
        for sensor in os.listdir(exercise_path):
            sensor_path = os.path.join(exercise_path, sensor)
            if os.path.isdir(sensor_path):
                for f in os.listdir(sensor_path):
                    file_path = os.path.join(sensor_path, f)
                    if f.endswith(".txt"):
                        data.append({
                            "subject": subject,
                            "exercise": exercise,
                            "sensor": sensor,
                            "file_name": f,
                            "path": file_path
                        })


template_times = pd.DataFrame(times)
df = pd.DataFrame(data)
df = df.sort_values(by=['subject', 'exercise', 'sensor']).reset_index().drop("index", axis=1)
df

Unnamed: 0,subject,exercise,sensor,file_name,path
0,s1,e1,u1,template_session.txt,/Users/hammy/.cache/kagglehub/datasets/rabieel...
1,s1,e1,u1,test.txt,/Users/hammy/.cache/kagglehub/datasets/rabieel...
2,s1,e1,u2,template_session.txt,/Users/hammy/.cache/kagglehub/datasets/rabieel...
3,s1,e1,u2,test.txt,/Users/hammy/.cache/kagglehub/datasets/rabieel...
4,s1,e1,u3,template_session.txt,/Users/hammy/.cache/kagglehub/datasets/rabieel...
...,...,...,...,...,...
395,s5,e8,u3,test.txt,/Users/hammy/.cache/kagglehub/datasets/rabieel...
396,s5,e8,u4,template_session.txt,/Users/hammy/.cache/kagglehub/datasets/rabieel...
397,s5,e8,u4,test.txt,/Users/hammy/.cache/kagglehub/datasets/rabieel...
398,s5,e8,u5,template_session.txt,/Users/hammy/.cache/kagglehub/datasets/rabieel...


In [4]:
times_sorted = template_times.sort_values(by=['subject', 'exercise']).reset_index().drop('index', axis=1)
times_sorted.head()

Unnamed: 0,subject,exercise,file_name,path
0,s1,e1,template_times.txt,/Users/hammy/.cache/kagglehub/datasets/rabieel...
1,s1,e2,template_times.txt,/Users/hammy/.cache/kagglehub/datasets/rabieel...
2,s1,e3,template_times.txt,/Users/hammy/.cache/kagglehub/datasets/rabieel...
3,s1,e4,template_times.txt,/Users/hammy/.cache/kagglehub/datasets/rabieel...
4,s1,e5,template_times.txt,/Users/hammy/.cache/kagglehub/datasets/rabieel...


In [5]:
templates = df[df['file_name'] == 'template_session.txt'].reset_index()
templates

Unnamed: 0,index,subject,exercise,sensor,file_name,path
0,0,s1,e1,u1,template_session.txt,/Users/hammy/.cache/kagglehub/datasets/rabieel...
1,2,s1,e1,u2,template_session.txt,/Users/hammy/.cache/kagglehub/datasets/rabieel...
2,4,s1,e1,u3,template_session.txt,/Users/hammy/.cache/kagglehub/datasets/rabieel...
3,6,s1,e1,u4,template_session.txt,/Users/hammy/.cache/kagglehub/datasets/rabieel...
4,8,s1,e1,u5,template_session.txt,/Users/hammy/.cache/kagglehub/datasets/rabieel...
...,...,...,...,...,...,...
195,390,s5,e8,u1,template_session.txt,/Users/hammy/.cache/kagglehub/datasets/rabieel...
196,392,s5,e8,u2,template_session.txt,/Users/hammy/.cache/kagglehub/datasets/rabieel...
197,394,s5,e8,u3,template_session.txt,/Users/hammy/.cache/kagglehub/datasets/rabieel...
198,396,s5,e8,u4,template_session.txt,/Users/hammy/.cache/kagglehub/datasets/rabieel...


In [6]:
def combine_sensors(sensor_files, subject, exercise):
    files = sensor_files[
        (sensor_files['subject'] == subject) & 
        (sensor_files['exercise'] == exercise)
    ]['path'].tolist()
    dfs = [pd.read_csv(f, sep=";", index_col='time index') for f in files]
    for i, df in enumerate(dfs):
        df = df.add_suffix(f"_u{i+1}")  # rename columns like acc_x_s1, gyr_y_s3, etc.
        df = df.rename(columns={f"time_s{i+1}": "time index"})  # keep common time
        dfs[i] = df
    merged_df = reduce(lambda left, right: pd.merge(left, right, on="time index", how="outer"), dfs)
    merged_df['subject'] = subject
    merged_df['exercise'] = exercise
    return merged_df

    
    

In [7]:
def collect_template_times(template_times, subject, exercise):
    files = template_times[
        (template_times['subject'] == subject) & 
        (template_times['exercise'] == exercise)
    ]['path'].tolist()
    dfs = [pd.read_csv(f, sep=";") for f in files]
    return dfs[0]
        

You can use this pathname to see what each data point would look like

In [8]:
exercise1 = combine_sensors(templates, 's1', 'e1')
exercise1.head()
sample_time = pd.read_csv(times_sorted.iloc[0]['path'], sep=";")
exercise1.shape, sample_time.shape

((2187, 47), (3, 3))

In [9]:
subject1_data = []
for exercise in templates['exercise'].unique():
    combined_df = combine_sensors(templates, 's1', exercise)
    subject1_data.append(combined_df)
    
subject1_data[0].shape, len(subject1_data)

((2187, 47), 8)

In [10]:
subject1_times = []

for exercise in templates['exercise'].unique():
    times = collect_template_times(times_sorted, 's1', exercise)
    subject1_times.append(times)

subject1_times[0].shape

(3, 3)

In [11]:
all_data = []
for subject in templates['subject'].unique():
    for exercise in templates['exercise'].unique():
        combined_df = combine_sensors(templates, subject, exercise)
        all_data.append(combined_df)
all_data[25]

Unnamed: 0_level_0,acc_x_u1,acc_y_u1,acc_z_u1,gyr_x_u1,gyr_y_u1,gyr_z_u1,mag_x_u1,mag_y_u1,mag_z_u1,acc_x_u2,...,acc_y_u5,acc_z_u5,gyr_x_u5,gyr_y_u5,gyr_z_u5,mag_x_u5,mag_y_u5,mag_z_u5,subject,exercise
time index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-9.761873,0.892263,0.655809,0.000887,-0.006745,-0.013439,0.707813,0.307313,0.069317,-9.673734,...,-8.760168,0.743201,0.008860,0.008893,-0.005861,-0.037055,0.779400,-0.073291,s4,e2
2,-9.784284,0.922200,0.670455,0.000808,-0.003025,-0.007075,0.706766,0.306490,0.069706,-9.688588,...,-8.794193,0.743868,0.010659,0.003447,-0.005872,-0.036212,0.780707,-0.074447,s4,e2
3,-9.791800,0.907294,0.670288,-0.008396,-0.005488,0.005598,0.707055,0.308107,0.070228,-9.681160,...,-8.752614,0.705799,-0.008208,0.005291,0.000285,-0.037218,0.779678,-0.073954,s4,e2
4,-9.739362,0.922092,0.641326,-0.011787,0.007672,-0.006991,0.708114,0.308121,0.069855,-9.666269,...,-8.745565,0.713677,-0.018919,-0.019329,0.002001,-0.036166,0.780962,-0.073548,s4,e2
5,-9.761841,0.907178,0.663322,-0.008245,0.011386,-0.001538,0.708188,0.307306,0.070688,-9.681124,...,-8.810951,0.651055,-0.008875,0.007757,-0.009422,-0.037160,0.780860,-0.072823,s4,e2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1525,-9.769870,0.779255,0.746552,-0.012025,-0.005570,0.000200,0.702694,0.319586,0.055606,-9.651124,...,-8.772396,0.880728,0.002787,-0.005730,-0.010312,-0.040267,0.779677,-0.085916,s4,e2
1526,-9.754900,0.771713,0.746785,-0.017390,-0.001168,-0.002476,0.703696,0.319336,0.054421,-9.651075,...,-8.758101,0.915341,0.005202,-0.012686,0.002053,-0.040185,0.779815,-0.086820,s4,e2
1527,-9.777373,0.771819,0.746405,-0.011920,-0.003026,-0.007015,0.702267,0.318252,0.053307,-9.665963,...,-8.784875,0.920846,-0.021407,-0.000387,-0.005901,-0.040582,0.777840,-0.087042,s4,e2
1528,-9.784826,0.786744,0.761264,-0.001166,-0.002874,-0.001611,0.703931,0.317726,0.053644,-9.665860,...,-8.787084,0.883762,-0.012579,0.005276,-0.001471,-0.041042,0.778777,-0.085931,s4,e2


Here's an example of what each template_session / test would look like

In [12]:
# Suppose sensor_df has a 'time' column
# Initialize label column
exercise1['execution_type'] = 0  # 0 = no exercise


 # For each execution range, label those rows
def assign_execution_type(df, sample_time):
    for _, row in sample_time.iterrows():
        start, end, etype = row['start'], row['end'], row['execution type']
        mask = (df.index >= start) & (df.index <= end)
        df.loc[mask, 'execution_type'] = etype



assign_execution_type(exercise1, sample_time)
exercise1.head()

Unnamed: 0_level_0,acc_x_u1,acc_y_u1,acc_z_u1,gyr_x_u1,gyr_y_u1,gyr_z_u1,mag_x_u1,mag_y_u1,mag_z_u1,acc_x_u2,...,acc_z_u5,gyr_x_u5,gyr_y_u5,gyr_z_u5,mag_x_u5,mag_y_u5,mag_z_u5,subject,exercise,execution_type
time index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-9.665799,-1.677241,0.615063,-0.014956,0.004388,0.010589,0.587318,0.455106,-0.094949,-9.567934,...,-4.317889,0.00461,-0.005113,0.008308,0.015362,0.734462,0.204774,s1,e1,0
2,-9.665806,-1.684737,0.622513,0.000607,-0.003094,-0.007589,0.587428,0.455621,-0.093364,-9.59025,...,-4.325564,0.000332,-0.001732,-0.000527,0.014407,0.73553,0.205962,s1,e1,0
3,-9.62841,-1.699724,0.585751,0.006007,0.000557,-0.004879,0.588389,0.454722,-0.094907,-9.590147,...,-4.317407,0.011205,0.012671,-0.00931,0.014569,0.736845,0.204158,s1,e1,0
4,-9.628372,-1.684836,0.600733,-0.003085,-0.000281,-0.00034,0.588673,0.455759,-0.092664,-9.560378,...,-4.288569,-0.005662,-0.002119,-0.012891,0.014377,0.737884,0.205386,s1,e1,0
5,-9.643291,-1.639893,0.585661,-0.002932,-0.006807,-0.013043,0.589193,0.453927,-0.093143,-9.545363,...,-4.300906,-0.003275,0.007345,0.000377,0.013961,0.735951,0.204659,s1,e1,0


## Data Preprocessing

### Windowing

In [13]:
def segment_series_seq2seq(
    series: np.ndarray,
    intervals: list | None,
    window_size: int = 100,
    stride: int = 50,
    num_classes: int = None,
    pad_label: int = 0
):
    """
    Converts a single time series with interval exercise labels into
    (X, y) segments for sequence-to-sequence learning.

    Parameters
    ----------
    series : np.ndarray
        Shape (timesteps, features)
    intervals : list of tuples
        Each tuple = (start_idx, end_idx, label_id)
    window_size : int
        Number of timesteps per segment
    stride : int
        Step size between consecutive windows
    num_classes : int or None
        Number of unique exercise labels. Used for one-hot encoding if desired.
    pad_label : int
        Label value to use for timesteps outside any interval (e.g. rest/none)

    Returns
    -------
    X : np.ndarray
        Shape (num_segments, window_size, num_features)
    y : np.ndarray
        Shape (num_segments, window_size)
        or (num_segments, window_size, num_classes) if num_classes is set
    """

    n_timesteps, n_features = series.shape

    # 1. Create timestep-level label array
    y_full = np.full(n_timesteps, pad_label, dtype=int)
    for start, end, label in intervals:
        start = max(0, start)
        end = min(n_timesteps, end)
        y_full[start:end] = label

    # 2. Slide a window
    X_segments, y_segments = [], []
    for start in range(0, n_timesteps - window_size + 1, stride):
        X_win = series[start:start + window_size, :]
        y_win = y_full[start:start + window_size]
        X_segments.append(X_win)
        y_segments.append(y_win)

    X = np.stack(X_segments)
    y = np.stack(y_segments)

    # 3. Optional one-hot encoding
    if num_classes is not None:
        one_hot = np.eye(num_classes)[y]
        y = one_hot.astype(np.float32)

    return X.astype(np.float32), y


In [14]:
X_seg, y_seg = segment_series_seq2seq(
    exercise1.drop(['subject', 'exercise'], axis=1).to_numpy(),
    intervals=[
        (int(row['start']), int(row['end']), int(row['execution type']))
        for _, row in sample_time.iterrows()
    ],
    window_size=50,
    stride=25,
    num_classes=4,  # assuming 4 classes: 0 (none), 1, 2, 3
    pad_label=0
)

X_seg.shape, y_seg.shape

((86, 50, 46), (86, 50, 4))

In [15]:
X_seg = []
y_seg = []

for i in range(len(subject1_data)):
    X_i, y_i = segment_series_seq2seq(
        subject1_data[i].drop(['subject', 'exercise'], axis=1).to_numpy(),
        intervals=[
            (int(row['start']), int(row['end']), int(row['execution type']))
            for _, row in subject1_times[i].iterrows()
        ],
        window_size=50,
        stride=25,
        num_classes=4,  # assuming 4 classes: 0 (none), 1, 2, 3
        pad_label=0
    )
    X_seg.append(X_i)
    y_seg.append(y_i)

X_seg = np.concatenate(X_seg, axis=0)
y_seg = np.concatenate(y_seg, axis=0)
X_seg.shape, y_seg.shape

((596, 50, 45), (596, 50, 4))

In [16]:
# Encode exercise names to numeric labels
label_encoder = LabelEncoder()
all_exercises = [df['exercise'].iloc[0] for df in subject1_data]
label_encoder.fit(all_exercises)

X_seg = []
y_seg = []

for i in range(len(subject1_data)):
    exercise_label = label_encoder.transform([subject1_data[i]['exercise'].iloc[0]])[0]

    X_i, y_i = segment_series_seq2seq(
        subject1_data[i].drop(['subject', 'exercise'], axis=1).to_numpy(),
        intervals=[
            (0, len(subject1_data[i]), exercise_label)
        ],  # the whole time series is one label
        window_size=50,
        stride=25,
        num_classes=len(label_encoder.classes_),
        pad_label=0
    )

    X_seg.append(X_i)
    y_seg.append(y_i)

# Combine all segmented windows
X_seg = np.concatenate(X_seg, axis=0)
y_seg = np.concatenate(y_seg, axis=0)

X_seg.shape, y_seg.shape


((596, 50, 45), (596, 50, 8))

In [17]:
# Encode exercise names to numeric labels
label_encoder = LabelEncoder()
all_exercises = [df['exercise'].iloc[0] for df in all_data]
label_encoder.fit(all_exercises)

X_seg = []
y_seg = []

for i in range(len(all_data)):
    exercise_label = label_encoder.transform([all_data[i]['exercise'].iloc[0]])[0]

    X_i, y_i = segment_series_seq2seq(
        all_data[i].drop(['subject', 'exercise'], axis=1).to_numpy(),
        intervals=[
            (0, len(all_data[i]), exercise_label)
        ],  # the whole time series is one label
        window_size=50,
        stride=25,
        num_classes=len(label_encoder.classes_),
        pad_label=0
    )

    X_seg.append(X_i)
    y_seg.append(y_i)

# Combine all segmented windows
X_seg = np.concatenate(X_seg, axis=0)
y_seg = np.concatenate(y_seg, axis=0)

X_seg.shape, y_seg.shape

((2155, 50, 45), (2155, 50, 8))

## Sample Training

In [18]:
y_int = np.argmax(y_seg, axis=-1)  # shape: (num_segments, window_size)

y_seg_majority = np.array([np.bincount(window).argmax() for window in y_int])
# shape: (num_segments,)
y_seg_majority.shape

(2155,)

In [19]:
y_seg_majority

array([0, 0, 0, ..., 7, 7, 7], shape=(2155,))

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X_seg, y_seg_majority, test_size=0.2, random_state=42, stratify=y_seg_majority
)

In [21]:
X_train.shape, y_train.shape, X_val.shape, y_val.shape

((1724, 50, 45), (1724,), (431, 50, 45), (431,))

In [22]:
from sklearn.preprocessing import StandardScaler
import numpy as np

num_features = X_train.shape[2]
scalers = []

for i in range(num_features):
    scaler = StandardScaler()
    # Flatten samples x timesteps into one long vector for this feature
    X_train[:, :, i] = scaler.fit_transform(X_train[:, :, i].reshape(-1, 1)).reshape(X_train.shape[0], X_train.shape[1])
    X_val[:, :, i] = scaler.transform(X_val[:, :, i].reshape(-1, 1)).reshape(X_val.shape[0], X_val.shape[1])
    scalers.append(scaler)


In [23]:
from tensorflow.keras import layers, models

num_classes = len(np.unique(y_seg_majority))
window_size = X_train.shape[1]
num_features = X_train.shape[2]

model = models.Sequential([
    layers.Input(shape=(window_size, num_features)),
    layers.Bidirectional(layers.LSTM(128)),
    layers.Dropout(0.3),
    layers.Dense(num_classes, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

In [24]:
model.fit(
    X_seg,
    y_seg_majority,  # Convert one-hot to class indices
    epochs=10,
    validation_split=0.2
)

Epoch 1/10
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 41ms/step - accuracy: 0.9211 - loss: 0.2937 - val_accuracy: 0.9861 - val_loss: 0.0657
Epoch 2/10
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 42ms/step - accuracy: 0.9965 - loss: 0.0175 - val_accuracy: 0.9838 - val_loss: 0.0701
Epoch 3/10
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 46ms/step - accuracy: 0.9965 - loss: 0.0133 - val_accuracy: 0.9745 - val_loss: 0.0909
Epoch 4/10
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 45ms/step - accuracy: 0.9919 - loss: 0.0279 - val_accuracy: 0.9861 - val_loss: 0.0563
Epoch 5/10
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 46ms/step - accuracy: 1.0000 - loss: 0.0026 - val_accuracy: 0.9907 - val_loss: 0.0459
Epoch 6/10
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 47ms/step - accuracy: 1.0000 - loss: 0.0012 - val_accuracy: 0.9884 - val_loss: 0.0400
Epoch 7/10
[1m54/54[0m [32m━━━━

<keras.src.callbacks.history.History at 0x17c01b7a0>