# Modelling

In this notebook, I will do some final data engineering before using PyCaret to compare model performance on the dataset.

In [1]:
# CHANGING WORKING DIRECTORY
# The data paths used further down are relative, so the kernel has to run
# from the directory that contains the `data` folder (one level above the
# directory this notebook starts in).
import os

# Only step up while `data` is not visible from the current directory, so that
# re-running this cell without restarting the kernel does not keep climbing
# one level per run and break every relative path below.
# NOTE(review): assumes the notebook's own folder has no `data` directory — confirm.
if not os.path.isdir("data"):
    os.chdir("..")

In [2]:
# Imports
import polars as pl


## 1. Preprocessing
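In this section, I will encode the categorical gender variable as a binary variable, with 0 for males and 1 for females. I will then add to each row the values of the HRR, `MMP_max` and TRIMP readings from 1, 2 and 4 weeks earlier, together with the change since each of those readings. Finally, I will min-max scale every feature within each athlete.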

In [3]:
# Loading dataset
# The path is relative to the working directory set in the first cell.
# Forward slashes are used because they resolve on Windows, macOS and Linux,
# whereas a backslash-separated path is only understood on Windows.
df = pl.read_csv("data/processed/final_df.csv")

In [4]:
# Binary encoding of gender: rows equal to "F" become 1, every other
# non-null value becomes 0. The result replaces the original `gender` column.
is_female = pl.col("gender").eq("F")
df = df.with_columns(gender=is_female.cast(pl.Int8))

In [5]:
# Inspect the frame after encoding; `gender` is now an Int8 column.
df

athlete_id,gender,week_no,50_HRR(30),75_HRR(30),MMP_max,TRIMP
str,i8,i64,f64,f64,f64,f64
"""01843f3a-8883-4a76-a60f-223c92…",1,-1,,,,29.393435
"""01843f3a-8883-4a76-a60f-223c92…",1,1,,,,372.633562
"""01843f3a-8883-4a76-a60f-223c92…",1,2,,,,99.6454
"""01843f3a-8883-4a76-a60f-223c92…",1,3,,,,217.230806
"""01843f3a-8883-4a76-a60f-223c92…",1,4,,,,267.000822
…,…,…,…,…,…,…
"""fe4bcccf-399f-4cd4-b385-e3b053…",0,64,,,,364.959667
"""fe4bcccf-399f-4cd4-b385-e3b053…",0,65,,,223.554167,458.038017
"""fe4bcccf-399f-4cd4-b385-e3b053…",0,66,,,240.033333,395.231501
"""fe4bcccf-399f-4cd4-b385-e3b053…",0,67,,,224.525,295.831958


In [6]:
# --- Lag and difference features ---
# For every value column this cell adds, per lookback period:
#   <col>_lag_<lag>   the reading of the same athlete `lag` weeks earlier
#   <col>_diff_<lag>  the current reading minus that earlier reading
# Rows with no reading exactly `lag` weeks earlier get null in both columns.

# --- 1. Define Parameters ---
# The lookback periods in weeks
lags = [1, 2, 4]
# The columns to calculate lags and differences for
value_cols = ["50_HRR(30)", "75_HRR(30)", "MMP_max", "TRIMP"]
# A row is identified by athlete and week. `gender` is intentionally not a
# join key: it adds nothing to the match, and because null keys are not
# matched by default, a missing gender would blank every lag for that row.
key_cols = ["athlete_id", "week_no"]

# --- 2. Start from the un-augmented frame ---
# This cell reassigns `df` at the end, so on a re-run `df` already carries the
# generated columns. Strip them first so that running the cell twice gives the
# same result as running it once (no `_right` duplicates from the joins).
generated_cols = {
    f"{col}_{kind}_{lag}"
    for col in value_cols
    for kind in ("lag", "diff")
    for lag in lags
}
df_base = df.select([name for name in df.columns if name not in generated_cols])

# --- 3. Main Logic: Loop, Join, and Calculate ---
# NOTE(review): the lag is calendar arithmetic on `week_no`, not "previous
# row". The preview above shows week_no going from -1 straight to 1, so week 1
# would look for a week 0 that may not exist — confirm this is intended.
df_with_diffs = df_base
for lag in lags:
    # Shift each reading forward by `lag` weeks so that it lines up with the
    # row it is a lagged value for, and rename to avoid name collisions.
    df_lagged = (
        df_base.select(key_cols + value_cols)
        .with_columns(pl.col("week_no") + lag)
        .rename({col: f"{col}_lag_{lag}" for col in value_cols})
    )

    # Left join keeps every original row, matched or not.
    df_with_diffs = df_with_diffs.join(df_lagged, on=key_cols, how="left")

# A left join only adds rows when the right side has duplicate keys; fail
# loudly instead of silently training on duplicated rows.
assert df_with_diffs.height == df_base.height, (
    "duplicate (athlete_id, week_no) rows inflated the lag join"
)

# Now, calculate the differences using the new lagged columns
diff_expressions = [
    (pl.col(col) - pl.col(f"{col}_lag_{lag}")).alias(f"{col}_diff_{lag}")
    for col in value_cols
    for lag in lags
]

df = df_with_diffs.with_columns(diff_expressions)

In [7]:
# Min-Max Scaling (per athlete)
# Each feature is rescaled with the minimum and maximum of that feature
# within the same athlete and stored as `<col>_scaled`; the unscaled columns
# are kept alongside.
# NOTE(review): the min/max span all of an athlete's rows, including weeks
# later than the row being scaled — confirm that is acceptable for the model.

# 1. Columns to scale: the raw readings, then their lagged copies, then the
#    differences (same order as the columns were created above).
cols_to_scale = (
    value_cols
    + [f"{col}_lag_{lag}" for col in value_cols for lag in lags]
    + [f"{col}_diff_{lag}" for col in value_cols for lag in lags]
)


def min_max_scale_per_athlete(col_name):
    """Build the expression that min-max scales `col_name` within each athlete.

    Args:
        col_name: Name of the numeric column to scale.

    Returns:
        A polars expression aliased `<col_name>_scaled` that is
        - null where the input value is null,
        - 0.0 where the athlete's maximum equals the minimum,
        - (value - min) / (max - min) otherwise.
    """
    value = pl.col(col_name)
    min_val = value.min().over("athlete_id")
    span = value.max().over("athlete_id") - min_val

    return (
        # Keep missing readings missing. Without this branch a null reading of
        # an athlete whose other readings are all equal would be turned into
        # 0.0 by the zero-range branch below.
        pl.when(value.is_null())
        .then(pl.lit(None, dtype=pl.Float64))
        # Zero range: avoid dividing by zero and place the value at 0.
        .when(span == 0)
        .then(pl.lit(0.0))
        .otherwise((value - min_val) / span)
        .alias(f"{col_name}_scaled")
    )


# 2. Create the scaling expressions
scaling_exprs = [min_max_scale_per_athlete(col_name) for col_name in cols_to_scale]

# 3. Apply the expressions to the DataFrame in one go
df_scaled = df.with_columns(scaling_exprs)

In [9]:
# List the column names to check that the lag, diff and `_scaled` columns are present.
df_scaled.columns

['athlete_id',
 'gender',
 'week_no',
 '50_HRR(30)',
 '75_HRR(30)',
 'MMP_max',
 'TRIMP',
 '50_HRR(30)_lag_1',
 '75_HRR(30)_lag_1',
 'MMP_max_lag_1',
 'TRIMP_lag_1',
 '50_HRR(30)_lag_2',
 '75_HRR(30)_lag_2',
 'MMP_max_lag_2',
 'TRIMP_lag_2',
 '50_HRR(30)_lag_4',
 '75_HRR(30)_lag_4',
 'MMP_max_lag_4',
 'TRIMP_lag_4',
 '50_HRR(30)_diff_1',
 '50_HRR(30)_diff_2',
 '50_HRR(30)_diff_4',
 '75_HRR(30)_diff_1',
 '75_HRR(30)_diff_2',
 '75_HRR(30)_diff_4',
 'MMP_max_diff_1',
 'MMP_max_diff_2',
 'MMP_max_diff_4',
 'TRIMP_diff_1',
 'TRIMP_diff_2',
 'TRIMP_diff_4',
 '50_HRR(30)_scaled',
 '75_HRR(30)_scaled',
 'MMP_max_scaled',
 'TRIMP_scaled',
 '50_HRR(30)_lag_1_scaled',
 '50_HRR(30)_lag_2_scaled',
 '50_HRR(30)_lag_4_scaled',
 '75_HRR(30)_lag_1_scaled',
 '75_HRR(30)_lag_2_scaled',
 '75_HRR(30)_lag_4_scaled',
 'MMP_max_lag_1_scaled',
 'MMP_max_lag_2_scaled',
 'MMP_max_lag_4_scaled',
 'TRIMP_lag_1_scaled',
 'TRIMP_lag_2_scaled',
 'TRIMP_lag_4_scaled',
 '50_HRR(30)_diff_1_scaled',
 '50_HRR(30)_diff_2_s

In [10]:
# Persist the scaled feature table for the modelling step.
# Forward slashes keep the path valid on Windows, macOS and Linux; with
# backslashes the file would only land in `data/processed` on Windows.
df_scaled.write_csv("data/processed/final_df_scaled.csv")