# LSTM heartrateclass prediction

Inspiration taken from: https://github.com/rikluost/athlete_hr_predict

In [16]:
# load libraries
import os, glob 
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# location of the fit files
fit_path = "../fit_file_csv"
fit_test_path = "../fit_file_test_csv"
graph_path = "../graphs"
os.chdir(fit_path)


In [8]:
# add calculated altitude difference column, and 5sec moving average column. Remove geographical coordinates for privacy.
fit_files = glob.glob("*.csv")
for file in fit_files:
    df = pd.read_csv(fit_path+'/'+file, index_col='timestamp')
    df['alt_difference'] = df['enhanced_altitude'] - df['enhanced_altitude'].shift(1)
    df['rolling_ave_alt'] = df['alt_difference'].rolling(window=5).mean()
    df = df.bfill()
    df = df.drop(['position_lat','position_long'], axis=1, errors='ignore')
    df.to_csv(fit_path+'/'+file)


In [19]:
# Load the entire CSV file
data = pd.read_csv('../with_ranges_features.csv')

# Determine the number of classes and create a global label encoder
label_encoder = LabelEncoder()
label_encoder.fit(data['HeartRateClass'])
num_classes = len(label_encoder.classes_)

# Group by 'RunID'
grouped = data.groupby('RunID')

def preprocess(df):
    # Extract features and label
    features = df[['Latitude', 'Longitude', 'Elevation', 'Distance', 'HeartRate', 'Cadence', 'Speed']]
    label = df['HeartRateClass']
    
    # Encode labels as integers and then convert to categorical
    label = label_encoder.transform(label)
    label = to_categorical(label, num_classes=num_classes)
    
    return features, label

def split_data(df):
    features, label = preprocess(df)
    # Split into training and validation sets
    x_train, x_val, y_train, y_val = train_test_split(features, label, test_size=0.2, random_state=42)
    return x_train, x_val, y_train, y_val

In [20]:
def build_model(num_classes):
    model = keras.Sequential([
        keras.layers.Input(shape=(7,)),  # 7 features
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dense(num_classes, activation='softmax')  # Multi-class classification
    ])
    return model

In [None]:
# Iterate over each group
for run_id, group in grouped:
    print(f"Processing RunID: {run_id}")
    
    df = group.copy()

    # Split the data into training and validation sets
    x_train, x_val, y_train, y_val = split_data(df)

    # Define callbacks
    es_callback = keras.callbacks.EarlyStopping(monitor="val_loss", min_delta=0, patience=5, verbose=1)
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=f'./logs/run_{run_id}', histogram_freq=1)
    modelckpt_callback = keras.callbacks.ModelCheckpoint(
        monitor="val_loss",
        filepath=f'model_checkpoint_{run_id}.weights.h5',
        verbose=1,
        save_weights_only=True,
        save_best_only=True,
    )

    # Build and compile the model
    model = build_model(num_classes)
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), metrics=["accuracy"], loss="categorical_crossentropy")

    # Train the model
    model.fit(
        x_train, y_train,
        validation_data=(x_val, y_val),
        epochs=50,
        callbacks=[es_callback, tensorboard_callback, modelckpt_callback]
    )

Processing RunID: 741590575
Epoch 1/50
[1m33/60[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m0s[0m 2ms/step - accuracy: 0.3354 - loss: 41.5474      
Epoch 1: val_loss improved from inf to 10.08746, saving model to model_checkpoint_741590575.weights.h5
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.3672 - loss: 31.4918 - val_accuracy: 0.4614 - val_loss: 10.0875
Epoch 2/50
[1m35/60[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m0s[0m 1ms/step - accuracy: 0.4875 - loss: 6.7425 
Epoch 2: val_loss improved from 10.08746 to 2.03569, saving model to model_checkpoint_741590575.weights.h5
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5195 - loss: 6.3255 - val_accuracy: 0.6806 - val_loss: 2.0357
Epoch 3/50
[1m39/60[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m0s[0m 1ms/step - accuracy: 0.6601 - loss: 4.2249 
Epoch 3: val_loss did not improve from 2.03569
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

Epoch 5/50
[1m 1/28[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 27ms/step - accuracy: 0.4688 - loss: 3.4568
Epoch 5: val_loss improved from 2.52669 to 2.32191, saving model to model_checkpoint_749703915.weights.h5
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4549 - loss: 2.8931 - val_accuracy: 0.5874 - val_loss: 2.3219
Epoch 6/50
[1m 1/28[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 26ms/step - accuracy: 0.4375 - loss: 2.8777
Epoch 6: val_loss improved from 2.32191 to 1.75156, saving model to model_checkpoint_749703915.weights.h5
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5113 - loss: 2.6538 - val_accuracy: 0.5695 - val_loss: 1.7516
Epoch 7/50
[1m 1/28[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 22ms/step - accuracy: 0.5625 - loss: 1.3863
Epoch 7: val_loss did not improve from 1.75156
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4506 - loss: 4.1196 - val_

[1m100/123[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 2ms/step - accuracy: 0.7070 - loss: 10.9603
Epoch 5: val_loss did not improve from 4.17862
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7122 - loss: 10.6445 - val_accuracy: 0.7452 - val_loss: 28.6591
Epoch 6/50
[1m101/123[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 2ms/step - accuracy: 0.7413 - loss: 15.6466
Epoch 6: val_loss did not improve from 4.17862
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7423 - loss: 15.2669 - val_accuracy: 0.5749 - val_loss: 6.9426
Epoch 6: early stopping
Processing RunID: 760382662
Epoch 1/50
[1m36/55[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m0s[0m 1ms/step - accuracy: 0.6950 - loss: 28.8383       
Epoch 1: val_loss improved from inf to 4.29728, saving model to model_checkpoint_760382662.weights.h5
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.718

In [11]:
# load entire CSV file
data = pd.read_csv('../with_ranges_features.csv')

# ensure that run sessions are grouped
grouped = data.groupby('RunID')

In [12]:
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002230E7B9640>

In [13]:
def preprocess(df):
    # Extract features and label
    features = df[['Latitude', 'Longitude', 'Elevation', 'Distance', 'HeartRate', 'Cadence', 'Speed']]
    label = df['HeartRateClass']
    
    # Encode labels as integers and then convert to categorical
    label_encoder = LabelEncoder()
    label = label_encoder.fit_transform(label)
    label = to_categorical(label)
    
    return features, label

def split_data(df):
    features, label = preprocess(df)
    # Split into training and validation sets
    x_train, x_val, y_train, y_val = train_test_split(features, label, test_size=0.2, random_state=42)
    return x_train, x_val, y_train, y_val

In [14]:
def build_model(num_classes):
    model = keras.Sequential([
        keras.layers.Input(shape=(7,)),  # 7 features
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dense(num_classes, activation='softmax')  # Multi-class classification
    ])
    return model

In [17]:
# Determine the number of classes
num_classes = len(data['HeartRateClass'].unique())

# Iterate over each group
for run_id, group in grouped:
    print(f"Processing RunID: {run_id}")
    
    df = group.copy()

    # Split the data into training and validation sets
    x_train, x_val, y_train, y_val = split_data(df)

    # Define callbacks
    es_callback = keras.callbacks.EarlyStopping(monitor="val_loss", min_delta=0, patience=5, verbose=1)
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=f'./logs/run_{run_id}', histogram_freq=1)
    modelckpt_callback = keras.callbacks.ModelCheckpoint(
        monitor="val_loss",
        filepath=f'model_checkpoint_{run_id}.weights.h5',
        verbose=1,
        save_weights_only=True,
        save_best_only=True,
    )

    # Build and compile the model
    model = build_model(num_classes)
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), metrics=["accuracy"], loss="categorical_crossentropy")

    # Train the model
    model.fit(
        x_train, y_train,
        validation_data=(x_val, y_val),
        epochs=50,
        callbacks=[es_callback, tensorboard_callback, modelckpt_callback]
    )

Processing RunID: 741590575
Epoch 1/50


ValueError: Arguments `target` and `output` must have the same shape. Received: target.shape=(None, 5), output.shape=(None, 9)

In [None]:
# set parameters, file names

#select the predictors for the model:
model_features =  ["heart_rate", "enhanced_speed","rolling_ave_alt","cadence"] #  cadence, altitude, distance, heart_rate, enhanced_speed, rolling_ave_alt
batch_size = 250 # training batch size for the LSTM
epochs = 5 # maximum number of epochs - autostop will work on per file basis
learning_rate = 0.001
decay_rate = 0.001
n_X = 120 # number of timesteps for training
n_y = 22 # number of timesteps in future for prediction
step = 1 # step size of predictors for model training

sequence_length = int(n_X/step)
n_fit_files_test_set = 10 # number of files for validation dataset (only 1 validation file supported at the moment)

# select the training files and the validation files 
train_files = glob.glob(fit_path+"/*.csv")[0:-n_fit_files_test_set]
valid_files = glob.glob(fit_path+"/*.csv")[-n_fit_files_test_set:]
test_files = glob.glob(fit_test_path+"/*.csv")


In [None]:
# calculate the data normalisation parameters from all training data

def normalize(data):
    data_mean = data.mean(axis=0)
    data_std = data.std(axis=0)
    #return (data - data_mean) / data_std, data_mean, data_std
    return data_mean, data_std

li = []

for file in train_files:
    df = pd.read_csv(file, index_col='timestamp')[model_features]
    li.append(df)

df = pd.concat(li, axis=0, ignore_index=True)

df_mean, df_std = normalize(df)

def denormalize_hr(data):
    return data*df_std[0]+df_mean[0] 

## Create validation dataset

In [None]:
# validation dataset - train each file at the time
n=0
for file in valid_files:
    df = pd.read_csv(file, index_col='timestamp')[model_features]
    df = (df - df_mean) / df_std
    start = n_X + n_y
    end = n_X + len(df.index)
    
    x = df[model_features].values
    y = df.iloc[start:end][["heart_rate"]]
    
    dataset_val = keras.preprocessing.timeseries_dataset_from_array(
        x,
        y,
        sequence_length=sequence_length,
        sampling_rate=step,
        batch_size=batch_size,
    )
    if n==0 : dataset_val_old = dataset_val
    if n>0 : dataset_val_old = dataset_val.concatenate(dataset_val_old)
    
    n=n+1

dataset_val = dataset_val_old

# calculate stats for a naive model

In [None]:
# Calculate statistics for the naive model
# make dataframe for the naive model
d_naive = pd.DataFrame(columns=['measured', 'predicted'])
d_naive['measured']=denormalize_hr(x[n_y:,0])
d_naive['predicted']=denormalize_hr(x[:-n_y,0])

# calculate some stats
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import scipy

y_test, pred_test = d_naive['measured'].values, d_naive['predicted'].values

MSE_test=round(mean_squared_error(y_test, pred_test, squared=True),3)
MAE_test=round(mean_absolute_error(y_test, pred_test),3)

test_sdev = np.std(pred_test-y_test)*1.96
test_mean = np.mean(pred_test-y_test)


def mean_confidence_interval(data, confidence=0.95):
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
    return m, m-h, m+h, h

mean_s, ci95_l, ci95_h, mean_uncertainty = mean_confidence_interval(data=(pred_test-y_test))

print('Naive model\nMAE = '+ str(MAE_test)+", MSE = "+str(MSE_test))
print ('Mean and 95% prediction interval = {} +/- {}'.format(test_mean,test_sdev))
print('Uncertainty of mean = '+ str(mean_uncertainty))

## Build the model

In [None]:
#Load the TensorBoard notebook extension
%load_ext tensorboard


# get the shapes of X & y for a batch
for batch in dataset_val.take(1):
    inputs, targets = batch

# the model architecture
inputs = keras.layers.Input(shape=(inputs.shape[1], inputs.shape[2]))
outputs = keras.layers.LSTM(4, return_sequences=False)(inputs)
outputs = keras.layers.Dense(1)(outputs)

model = keras.Model(inputs=inputs, outputs=outputs)

# learning rate
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.01,
    decay_steps=50000,
    decay_rate=0.001)

path_checkpoint = "model_checkpoint.weights.h5"
es_callback = keras.callbacks.EarlyStopping(monitor="val_mae", min_delta=0, patience=5, verbose=1)

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='./logs/', histogram_freq=1)


modelckpt_callback = keras.callbacks.ModelCheckpoint(
    monitor="val_mae",
    filepath=path_checkpoint,
    verbose=1,
    save_weights_only=True,
    save_best_only=True,
)

model.compile(optimizer=keras.optimizers.Adam(learning_rate=lr_schedule), metrics=["mae"], loss="mae")

model.summary()


## Load the training data, train the model

Each file is processed separately for creating the training dataset, as every file is disconnected from previous file and the moving window cannot be extendented over.

In [None]:
# training dataset

n=0
for file in train_files:
    df = pd.read_csv(file, index_col='timestamp')[model_features]
    df = (df - df_mean) / df_std
    print(file)
    start = n_X + n_y
    end = n_X + len(df.index)
    
    x = df[model_features].values
    y = df.iloc[start:end][["heart_rate"]].values
    
    dataset_train = keras.preprocessing.timeseries_dataset_from_array(
        x,
        y,
        sequence_length=sequence_length,
        sampling_rate=step,
        batch_size=batch_size
    )
    
    if n==0 : dataset_train_old = dataset_train
    if n>0 : dataset_train_old = dataset_train.concatenate(dataset_train_old)

    n=n+1

dataset_train=dataset_train_old

len(dataset_train)


In [None]:
# train the model, with tensorboard visualisations
history = model.fit(
    dataset_train,
    epochs=epochs,
    validation_data=dataset_val,
    callbacks=[es_callback, modelckpt_callback, tensorboard_callback],
    verbose=1
)


In [None]:
%tensorboard --logdir logs/fit

In [None]:
def visualize_loss(history, title):
    loss = history.history["loss"]
    val_loss = history.history["val_loss"]
    epochs = range(len(loss))
    plt.figure()
    plt.plot(epochs, loss, "b", label="Training loss")
    plt.plot(epochs, val_loss, "r", label="Validation loss")
    plt.title(title)
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend()

visualize_loss(history, "Training and Validation Loss")
plt.savefig(graph_path+'/HR_his_t'+str(n_y)+".png")


## Check the model predictions visually

In [None]:
n=0
for file in test_files:
    df = pd.read_csv(file, index_col='timestamp')[model_features]
    df = (df - df_mean) / df_std
    print(file)
    start = n_X + n_y
    end = n_X + len(df.index)
    
    x = df[model_features].values
    y = df.iloc[start:end][["heart_rate"]].values
    
    dataset_test = keras.preprocessing.timeseries_dataset_from_array(
        x,
        y,
        sequence_length=sequence_length,
        sampling_rate=step,
        batch_size=10
    )
    
    if n>0:
        dataset_test_old = dataset_train_old.concatenate(dataset_test)
    
    dataset_test_old = dataset_test
    
    n=n+1
dataset_test = dataset_test_old

In [None]:
def show_plot(plot_data, delta, title):
    labels = ["History", "True Future", "Model Prediction"]
    marker = [".-", "rx", "go"]
    time_steps = list(range(-(plot_data[0].shape[0]), 0))

    if delta:
        future = delta
    else:
        future = 0
    plt.title(title)
    for i, val in enumerate(plot_data):
        if i:
            plt.plot(future, plot_data[i], marker[i], markersize=10, label=labels[i])
        else:
            plt.plot(time_steps, plot_data[i].flatten(), marker[i], label=labels[i])
    plt.legend()
    plt.xlim([time_steps[0], (future + 5) * 2])
    plt.ylim(100,170)
    plt.xlabel("Time-Step")
    plt.show()
    return


for x, y in dataset_test.take(5):
    show_plot(
        [denormalize_hr(x[0][:, 0].numpy()), denormalize_hr(y[0]), denormalize_hr( model.predict(x)[0])],
        n_y,
        "Single Step Prediction",
    )