In [None]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor


# Read the set-level workout log. Timestamps that cannot be parsed become NaT,
# and any row missing a field needed below is discarded.
wd = (
    pd.read_csv("workout_data.csv")
      .assign(start_time=lambda raw: pd.to_datetime(raw["start_time"], errors="coerce"))
      .dropna(subset=["start_time", "exercise_title", "weight_lbs", "reps"])
)

# The lifts that get modeled; titles must match the export exactly.
LIFTS = ["Front Squat", "Bench Press (Barbell)", "Shoulder Press (Dumbbell)", "Clean", "Lat Pulldown (Cable)"]

# Keep only sets of the tracked lifts and attach per-set volume (reps x weight).
is_tracked = wd["exercise_title"].isin(LIFTS)
wd = wd.loc[is_tracked].assign(volume=lambda sets: sets["reps"] * sets["weight_lbs"])

# Collapse sets into one row per (workout start, lift).
per_session = wd.groupby(["start_time", "exercise_title"], as_index=False)
workout_targets = per_session.agg(
    total_volume=pd.NamedAgg(column="volume", aggfunc="sum"),
    max_weight=pd.NamedAgg(column="weight_lbs", aggfunc="max"),
    top_set_reps=pd.NamedAgg(column="reps", aggfunc="max"),
    total_reps=pd.NamedAgg(column="reps", aggfunc="sum"),
)

# Calendar date of the workout, used later as a join key.
workout_targets["workout_date"] = workout_targets["start_time"].dt.date
workout_targets.head()

Unnamed: 0,start_time,exercise_title,total_volume,max_weight,top_set_reps,total_reps,workout_date
0,2024-03-01 15:00:00,Lat Pulldown (Cable),5400.0,160.0,10,36,2024-03-01
1,2024-03-04 12:45:00,Lat Pulldown (Cable),4312.0,143.0,8,32,2024-03-04
2,2024-03-05 12:17:00,Bench Press (Barbell),4545.0,225.0,6,29,2024-03-05
3,2024-03-07 12:30:00,Bench Press (Barbell),4895.0,205.0,6,31,2024-03-07
4,2024-03-07 12:30:00,Shoulder Press (Dumbbell),2400.0,70.0,8,37,2024-03-07


In [70]:
# Sleep export -> one row of averaged sleep metrics per calendar date.
sleep = pd.read_csv("sleeps.csv")
sleep.columns = [name.strip() for name in sleep.columns]  # headers may carry stray whitespace

# The date of each record is taken from the start of its cycle.
cycle_start = pd.to_datetime(sleep["Cycle start time"], errors="coerce")
sleep["Cycle start time"] = cycle_start
sleep["sleep_date"] = cycle_start.dt.date

# feature name -> source column; every feature is the mean over that date's records
sleep_metric_sources = {
    "sleep_eff": "Sleep efficiency %",          # share of time in bed spent asleep
    "sleep_dur": "Asleep duration (min)",       # minutes asleep
    "awake_min": "Awake duration (min)",        # minutes awake
    "light_min": "Light sleep duration (min)",  # minutes of light sleep
    "deep_min": "Deep (SWS) duration (min)",    # minutes of deep (slow-wave) sleep
    "rem_min": "REM duration (min)",            # minutes of REM sleep
}

sleep_feats = sleep.groupby("sleep_date", as_index=False).agg(
    **{feature: (source, "mean") for feature, source in sleep_metric_sources.items()}
)

sleep_feats.head()

Unnamed: 0,sleep_date,sleep_eff,sleep_dur,awake_min,light_min,deep_min,rem_min
0,2024-02-07,87.0,426.0,65.0,227.0,98.0,101.0
1,2024-02-08,87.0,424.0,60.0,205.0,119.0,100.0
2,2024-02-09,89.0,453.0,50.0,247.0,98.0,108.0
3,2024-02-10,90.0,411.0,46.0,195.0,93.0,123.0
4,2024-02-11,86.0,411.0,65.0,288.0,53.0,70.0


In [71]:
# Attach the previous calendar day's sleep features to every workout row.
# The join key is the workout date moved back one day, converted to a plain
# date so it has the same type as sleep_feats["sleep_date"].
prior_day = pd.to_datetime(workout_targets["workout_date"]) - pd.Timedelta(days=1)
workout_targets["sleep_date"] = prior_day.dt.date

# Left join: workouts without a matching sleep record keep NaN sleep features.
model_df = pd.merge(workout_targets, sleep_feats, how="left", on="sleep_date")
model_df.head()

Unnamed: 0,start_time,exercise_title,total_volume,max_weight,top_set_reps,total_reps,workout_date,sleep_date,sleep_eff,sleep_dur,awake_min,light_min,deep_min,rem_min
0,2024-03-01 15:00:00,Lat Pulldown (Cable),5400.0,160.0,10,36,2024-03-01,2024-02-29,85.0,418.0,74.0,227.0,92.0,99.0
1,2024-03-04 12:45:00,Lat Pulldown (Cable),4312.0,143.0,8,32,2024-03-04,2024-03-03,89.0,464.0,55.0,289.0,87.0,88.0
2,2024-03-05 12:17:00,Bench Press (Barbell),4545.0,225.0,6,29,2024-03-05,2024-03-04,91.0,453.0,40.0,220.0,81.0,152.0
3,2024-03-07 12:30:00,Bench Press (Barbell),4895.0,205.0,6,31,2024-03-07,2024-03-06,89.0,476.0,57.0,248.0,106.0,122.0
4,2024-03-07 12:30:00,Shoulder Press (Dumbbell),2400.0,70.0,8,37,2024-03-07,2024-03-06,89.0,476.0,57.0,248.0,106.0,122.0


In [72]:
# Load physiological cycle data (daily resting heart rate, heart rate variability, etc.)
phys = pd.read_csv("physiological_cycles.csv")
phys.columns = phys.columns.str.strip()  # same header clean-up as the sleep export
# Unparseable timestamps become NaT instead of aborting the cell (matches the other loaders).
phys["Cycle start time"] = pd.to_datetime(phys["Cycle start time"], errors="coerce")
phys["phys_date"] = phys["Cycle start time"].dt.date  # calendar date the cycle started on

# One row per date; every metric is the mean over the cycles that started on that date.
phys_feats = (
    phys.groupby("phys_date", as_index=False)
        .agg(
            recovery=("Recovery score %", "mean"), # recovery score percentage based on HRV, RHR, RR, sleep performance and quality, skin temp, SpO2
            rhr=("Resting heart rate (bpm)", "mean"), # resting heart rate in beats per minute
            hrv=("Heart rate variability (ms)", "mean"), # heart rate variability in milliseconds
            rr=("Respiratory rate (rpm)", "mean"), # respiratory rate in respirations per minute
            spo2=("Blood oxygen %", "mean"), # blood oxygen saturation percentage
            skin_temp=("Skin temp (celsius)", "mean"), # skin temperature in celsius
            day_strain=("Day Strain", "mean"), # day strain score based on cardiovascular strain
            sleep_need=("Sleep need (min)", "mean"), # sleep need in minutes
            sleep_debt=("Sleep debt (min)", "mean"), # sleep debt in minutes
            sleep_consistency=("Sleep consistency %", "mean"), # sleep consistency percentage based on regularity of sleep schedule
            sleep_performance=("Sleep performance %", "mean"), # sleep performance percentage based on sleep quality
            in_bed=("In bed duration (min)", "mean"), # in bed duration in minutes
        )
)

# NOTE(review): sleep features are joined on workout_date - 1 day, while these
# features are joined on workout_date itself. In the saved data the first row
# pairs 418 min asleep with 227 min in bed, which suggests the two joins pick up
# different cycles -- confirm which alignment is intended.

# Merge physiological features into the modeling frame. Columns left over from a
# previous run of this cell are dropped first, so re-running does not create
# duplicated "_x"/"_y" columns in the frame or in the saved CSV.
stale_cols = [col for col in phys_feats.columns if col in model_df.columns]
model_df = (
    model_df.drop(columns=stale_cols)
            .merge(phys_feats, left_on="workout_date", right_on="phys_date", how="left")
)

# save final modeling dataframe to csv
model_df.to_csv("Health_lifting.csv", index=False)

# Preview last so the frame is actually rendered as the cell output.
model_df.head()


In [79]:
# Reload the saved modeling table. CSV stores timestamps as text, so start_time
# is parsed back to datetime (unparseable values become NaT).
df = pd.read_csv("Health_lifting.csv").assign(
    start_time=lambda saved: pd.to_datetime(saved["start_time"], errors="coerce")
)
df.head()

Unnamed: 0,start_time,exercise_title,total_volume,max_weight,top_set_reps,total_reps,workout_date,sleep_date,sleep_eff,sleep_dur,...,hrv,rr,spo2,skin_temp,day_strain,sleep_need,sleep_debt,sleep_consistency,sleep_performance,in_bed
0,2024-03-01 15:00:00,Lat Pulldown (Cable),5400.0,160.0,10,36,2024-03-01,2024-02-29,85.0,418.0,...,94.0,12.5,95.86,33.67,15.6,671.0,127.0,85.0,31.0,227.0
1,2024-03-04 12:45:00,Lat Pulldown (Cable),4312.0,143.0,8,32,2024-03-04,2024-03-03,89.0,464.0,...,124.0,12.7,92.39,33.6,14.5,594.0,58.0,73.0,76.0,493.0
2,2024-03-05 12:17:00,Bench Press (Barbell),4545.0,225.0,6,29,2024-03-05,2024-03-04,91.0,453.0,...,133.0,12.9,97.0,33.44,18.7,606.0,91.0,77.0,63.0,444.0
3,2024-03-07 12:30:00,Bench Press (Barbell),4895.0,205.0,6,31,2024-03-07,2024-03-06,89.0,476.0,...,117.0,12.8,94.83,32.93,15.6,641.0,116.0,91.0,66.0,502.0
4,2024-03-07 12:30:00,Shoulder Press (Dumbbell),2400.0,70.0,8,37,2024-03-07,2024-03-06,89.0,476.0,...,117.0,12.8,94.83,32.93,15.6,641.0,116.0,91.0,66.0,502.0


In [81]:
# Lag features: use prior performance on a lift to predict future performance.
# shift() moves values by row position within each group, so "previous" and
# "next" are only meaningful when rows are in chronological order. Sort
# explicitly (stable, so ties keep their current order) rather than relying on
# the row order of the CSV.
df = df.sort_values("start_time", kind="stable").reset_index(drop=True)

g = df.groupby("exercise_title", group_keys=False) # group by exercise title

df["prev_max_weight"] = g["max_weight"].shift(1) # previous max weight for the same exercise
df["prev_volume"] = g["total_volume"].shift(1) # previous total volume for the same exercise
df["prev_reps"] = g["total_reps"].shift(1) # previous total reps for the same exercise

df["prev_time"] = g["start_time"].shift(1) # previous session time for the same exercise
# gap to the previous session of the same exercise, in (fractional) days
df["days_since_last"] = (df["start_time"] - df["prev_time"]).dt.total_seconds() / 86400

# target: max weight the next time the same lift is performed
df["next_max_weight"] = g["max_weight"].shift(-1)

# The first session of each lift has no lag and the last has no target; drop both.
model_df = df.dropna(subset=["prev_max_weight", "next_max_weight"]).copy()
model_df.head()

Unnamed: 0,start_time,exercise_title,total_volume,max_weight,top_set_reps,total_reps,workout_date,sleep_date,sleep_eff,sleep_dur,...,sleep_debt,sleep_consistency,sleep_performance,in_bed,prev_max_weight,prev_volume,prev_reps,prev_time,days_since_last,next_max_weight
1,2024-03-04 12:45:00,Lat Pulldown (Cable),4312.0,143.0,8,32,2024-03-04,2024-03-03,89.0,464.0,...,58.0,73.0,76.0,493.0,160.0,5400.0,36.0,2024-03-01 15:00:00,2.90625,143.0
3,2024-03-07 12:30:00,Bench Press (Barbell),4895.0,205.0,6,31,2024-03-07,2024-03-06,89.0,476.0,...,116.0,91.0,66.0,502.0,225.0,4545.0,29.0,2024-03-05 12:17:00,2.009028,195.0
5,2024-03-08 13:27:00,Lat Pulldown (Cable),5170.0,143.0,10,40,2024-03-08,2024-03-07,86.0,423.0,...,108.0,92.0,72.0,512.0,143.0,4312.0,32.0,2024-03-04 12:45:00,4.029167,160.0
6,2024-03-09 12:52:00,Bench Press (Barbell),3030.0,195.0,6,22,2024-03-09,2024-03-08,89.0,457.0,...,111.0,86.0,13.0,102.0,205.0,4895.0,31.0,2024-03-07 12:30:00,2.015278,225.0
7,2024-03-10 20:02:00,Lat Pulldown (Cable),3860.0,160.0,8,28,2024-03-10,2024-03-09,74.0,137.0,...,84.0,55.5,80.5,472.5,143.0,5170.0,40.0,2024-03-08 13:27:00,2.274306,180.0


In [82]:
# Candidate health columns: sleep-derived first, then physiological-cycle-derived.
sleep_cols = ["sleep_eff", "sleep_dur", "awake_min", "light_min", "deep_min", "rem_min"]
phys_cols = [
    "recovery", "rhr", "hrv", "rr", "spo2", "skin_temp", "day_strain",
    "sleep_need", "sleep_debt", "sleep_consistency", "sleep_performance", "in_bed",
]
health_candidates = sleep_cols + phys_cols

# Keep only the candidates that exist in the modeling frame.
available = set(model_df.columns)
health_feats = [name for name in health_candidates if name in available]

# Calendar features taken from the session start time.
session_start = model_df["start_time"]
model_df["dow"] = session_start.dt.dayofweek
model_df["hour"] = session_start.dt.hour

# Final feature list: lag features, then calendar features, then health features.
# NOTE(review): the current session's own max_weight / total_volume are not in
# this list although the target is the NEXT session -- confirm this is intended.
lag_feats = ["prev_max_weight", "prev_volume", "prev_reps", "days_since_last"]
calendar_feats = ["dow", "hour"]
FEATURES = lag_feats + calendar_feats + health_feats

In [83]:
# Fitted pipelines, keyed by lift name.
models = {}

# Train one model per lift, using only that lift's sessions.
for lift in LIFTS:
    is_lift = model_df["exercise_title"].eq(lift)
    sub = model_df.loc[is_lift].sort_values("start_time")  # chronological order for the time-series CV
    n = sub.shape[0]  # usable sessions for this lift
    print("\n", lift, "sessions for modeling:", n)

    if n >= 12:
        # Features and target.
        X, y = sub[FEATURES], sub["next_max_weight"]

        # Median imputation followed by a random forest. Wrapping both in a
        # Pipeline applies the same preprocessing inside every CV fold and in
        # the final fit.
        pipe = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("model", RandomForestRegressor(n_estimators=500, random_state=0)),
        ])

        # Expanding-window CV: each fold is tested on sessions that come after
        # its training sessions. Roughly 10 sessions per split, clamped to 2..5.
        n_splits = max(2, min(5, n // 10))
        tscv = TimeSeriesSplit(n_splits=n_splits)

        # The scorer returns negated MAE per fold; flip the sign of the average.
        fold_scores = cross_val_score(pipe, X, y, cv=tscv, scoring="neg_mean_absolute_error")
        mae = -fold_scores.mean()
        print(f"  MAE (lbs): {mae:.2f}")  # average error in pounds

        # Refit on all sessions of this lift and keep the fitted pipeline.
        pipe.fit(X, y)
        models[lift] = pipe
    else:
        # Fewer than 12 sessions: skip this lift.
        print("  -> not enough data yet to model well")


 Front Squat sessions for modeling: 49
  MAE (lbs): 25.75

 Bench Press (Barbell) sessions for modeling: 154
  MAE (lbs): 18.00

 Shoulder Press (Dumbbell) sessions for modeling: 70
  MAE (lbs): 6.16

 Clean sessions for modeling: 57
  MAE (lbs): 21.27

 Lat Pulldown (Cable) sessions for modeling: 119
  MAE (lbs): 16.21


In [84]:
from datetime import timedelta

def make_schedule_from_weeks(last_time, weeks=8, sessions_per_week=2):
    """Build evenly spaced future session times that follow ``last_time``.

    Parameters
    ----------
    last_time : datetime-like
        Time of the most recent session; must support ``+ timedelta``.
    weeks : int or float
        Length of the forecast horizon in weeks.
    sessions_per_week : int or float
        Planned sessions per week; sessions are spaced ``7 / sessions_per_week``
        days apart.

    Returns
    -------
    list
        ``int(weeks * sessions_per_week)`` timestamps, the first one spacing
        after ``last_time``. Empty when that count is zero or negative.
    """
    # total future sessions
    n_sessions = int(weeks * sessions_per_week)
    if n_sessions <= 0:
        # Nothing to schedule. Returning here also avoids a ZeroDivisionError
        # when sessions_per_week is 0.
        return []

    # spacing in days
    every_days = 7 / sessions_per_week
    return [last_time + timedelta(days=every_days * i) for i in range(1, n_sessions + 1)]

def forecast_all_lifts(df, LIFTS, models, FEATURES, weeks=8, sessions_per_week=2):
    """Forecast a trajectory for every lift that has a model and enough history.

    For each name in ``LIFTS`` that is a key of ``models`` and has at least two
    rows in ``df``, a future schedule is built starting from the lift's latest
    ``start_time`` and passed to ``forecast_lift_trajectory`` (defined outside
    this function). Lifts that do not qualify are reported with a printed
    message and skipped.

    Returns the per-lift results concatenated into one DataFrame, or an empty
    DataFrame with columns ``exercise_title``, ``start_time`` and
    ``pred_max_weight`` when no lift was forecast.
    """
    trajectories = []

    for lift_name in LIFTS:
        if lift_name in models:
            history = df.loc[df["exercise_title"] == lift_name].sort_values("start_time")
            if history.shape[0] >= 2:
                # Schedule future sessions after the most recent recorded one.
                latest = pd.to_datetime(history["start_time"].iloc[-1])
                upcoming = make_schedule_from_weeks(
                    latest, weeks=weeks, sessions_per_week=sessions_per_week
                )
                trajectories.append(
                    forecast_lift_trajectory(df, lift_name, models[lift_name], FEATURES, upcoming)
                )
            else:
                print(f"Skipping {lift_name}: not enough history")
        else:
            print(f"Skipping {lift_name}: no trained model")

    if trajectories:
        return pd.concat(trajectories, ignore_index=True)
    return pd.DataFrame(columns=["exercise_title", "start_time", "pred_max_weight"])

# Forecast every modeled lift: 8 weeks ahead at 2 sessions per week.
forecast_df = forecast_all_lifts(
    df=df,
    LIFTS=LIFTS,
    models=models,
    FEATURES=FEATURES,
    weeks=8,
    sessions_per_week=2,
)
forecast_df.head()

Unnamed: 0,lift,pred_next_max_weight_lbs
1,Bench Press (Barbell),231.85
3,Clean,191.94536
0,Front Squat,231.23574
4,Lat Pulldown (Cable),186.488
2,Shoulder Press (Dumbbell),79.20368
