In [3]:
import sys
import pickle
import time
import torch
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from pathlib import Path
from plotly.subplots import make_subplots
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# ---- Setup Paths ----
def add_src_to_path():
    """Locate the project's ``src`` directory and put it first on ``sys.path``.

    Walks from the current working directory up through its parents and stops
    at the first directory that contains a ``src`` sub-directory.

    Returns:
        pathlib.Path: ``<that directory>/data``. The path is returned as-is,
        without checking that it exists.

    Raises:
        RuntimeError: if neither the working directory nor any of its parents
        contains a ``src`` directory.
    """
    start = Path.cwd().resolve()
    # Search up the tree for 'src'
    for parent in [start, *start.parents]:
        src_dir = parent / "src"
        # is_dir() rather than exists(): a plain file called "src" is not importable
        if not src_dir.is_dir():
            continue
        src_entry = str(src_dir)
        # Drop earlier copies first so re-running this cell keeps exactly one
        # entry (still at position 0) instead of stacking duplicates.
        sys.path[:] = [entry for entry in sys.path if entry != src_entry]
        sys.path.insert(0, src_entry)
        print(f"‚úÖ Added to sys.path: {parent / 'src'}")
        return parent / "data"
    raise RuntimeError("Could not locate 'src' directory.")


DATA_ROOT = add_src_to_path()

# Import Project Utils
# These imports run only after add_src_to_path() so that a fresh kernel
# (Restart & Run All) can resolve them. This assumes the `project` package
# lives under src/ - TODO confirm; if it is installed, the order is harmless.
from project.utils.dataset import derive_features  # noqa: E402
from project.utils.modeling import predict_model  # noqa: E402

# Configuration - Now including all sports
SPORTS = ["biking", "running", "walking"]
# Noise scales to evaluate; scale 0 is loaded from the raw test file by load_and_recalc
SCALES = [0, 2, 5, 10]

print(f"üìÇ Data Root: {DATA_ROOT}")
print(f"üèÉ Sports to process: {', '.join(SPORTS)}")

‚úÖ Added to sys.path: /Users/jonasgundlach/Academic/P3/Data_Preparation/data-preparation-2026-group-project/src
üìÇ Data Root: /Users/jonasgundlach/Academic/P3/Data_Preparation/data-preparation-2026-group-project/data
üèÉ Sports to process: biking, running, walking


In [4]:
# %%
def _read_parquet_as_frame(path):
    """Read a parquet file into a DataFrame whose cells are plain Python objects."""
    # Local import, as in the original code, so pyarrow is only needed when loading.
    import pyarrow.parquet as pq

    table = pq.read_table(path)
    # to_pylist() yields Python values per cell (presumably so sequence columns
    # arrive as lists rather than numpy arrays - confirm against derive_features).
    return pd.DataFrame({c: table[c].to_pylist() for c in table.column_names})


def load_and_recalc(sport, scale, variant):
    """
    Load one dataset, run ``derive_features`` on it and cache the result.

    The result is written next to the source file with a ``_recalculated``
    suffix; when that file already exists it is loaded and returned directly,
    skipping ``derive_features``.

    Args:
        sport: sub-directory of ``DATA_ROOT`` and part of the file names.
        scale: noise scale; for non-"Cleaned" variants scale 0 selects the raw
            test file, any other value the ``erroneous_scale_*`` file.
        variant: "Cleaned" selects the ``cleaned_scale_*`` file (looked up in
            the sport directory, then in its ``cleaned`` sub-directory); any
            other value selects the raw/erroneous file.

    Returns:
        pandas.DataFrame, or ``None`` when neither a cached nor a source file
        exists for this combination.
    """
    sport_dir = DATA_ROOT / sport

    # 1. Determine Original File Path
    if variant == "Cleaned":
        filename = f"cleaned_scale_{scale}_{sport}.parquet"
        path = sport_dir / filename
        if not path.exists():
            path = sport_dir / "cleaned" / filename
    else:
        filename = (
            f"{sport}_test_raw.parquet"
            if scale == 0
            else f"erroneous_scale_{scale}_{sport}_data.parquet"
        )
        path = sport_dir / filename

    # 2. Determine Recalculated File Path
    recalc_path = path.parent / (path.stem + "_recalculated.parquet")

    # 3. IF RECALCULATED FILE EXISTS: Load it and skip math
    if recalc_path.exists():
        df = _read_parquet_as_frame(recalc_path)
        print(f"  üöÄ {sport.capitalize()} [{variant} Scale {scale}]: Loaded from cache")
        return df

    # 4. IF NOT: Load original and do the heavy lifting
    if not path.exists():
        # Say so explicitly: the caller silently drops None results, which
        # otherwise makes missing inputs invisible in the cell output.
        print(f"  [skip] {sport.capitalize()} [{variant} Scale {scale}]: source file not found ({path})")
        return None

    df = _read_parquet_as_frame(path)

    # Swap for physics (if uncleaned and scale > 0): the noisy coordinates
    # replace the clean ones before features are derived.
    if variant == "Uncleaned" and scale > 0:
        if "erroneous_latitude" in df.columns:
            df["latitude"] = df["erroneous_latitude"]
            df["longitude"] = df["erroneous_longitude"]

    # Run derive_features (The slow part)
    df = derive_features(df)

    # SAVE for next time. Write to a temporary name and rename afterwards so
    # an interrupted write can never be mistaken for a valid cache file.
    tmp_path = recalc_path.with_name(recalc_path.name + ".tmp")
    df.to_parquet(tmp_path, index=False)
    tmp_path.replace(recalc_path)

    print(f"  ‚úÖ {sport.capitalize()} [{variant} Scale {scale}]: Recalculated & Saved")
    return df


# Nested Dictionary: datasets[sport][scale][variant]
# Nested lookup filled below: datasets[sport][scale][variant] -> DataFrame.
# Combinations for which load_and_recalc returns None get no entry.
datasets = {}

print("üîÑ Processing Physics (Loading Cached or Recalculating)...")

for sport in SPORTS:
    per_scale = datasets[sport] = {}
    for scale in SCALES:
        per_variant = per_scale[scale] = {}
        for variant in ("Uncleaned", "Cleaned"):
            df = load_and_recalc(sport, scale, variant)
            if df is None:
                continue
            per_variant[variant] = df

print(
    "\n‚ú® Success: All sport datasets are ready (using _recalculated files where available)."
)

üîÑ Processing Physics (Loading Cached or Recalculating)...
  ‚úÖ Biking [Uncleaned Scale 0]: Recalculated & Saved
  ‚úÖ Biking [Uncleaned Scale 2]: Recalculated & Saved
  ‚úÖ Running [Uncleaned Scale 0]: Recalculated & Saved
  ‚úÖ Running [Uncleaned Scale 2]: Recalculated & Saved
  ‚úÖ Walking [Uncleaned Scale 0]: Recalculated & Saved
  ‚úÖ Walking [Uncleaned Scale 2]: Recalculated & Saved

‚ú® Success: All sport datasets are ready (using _recalculated files where available).


In [5]:
# %%
import pickle
import time
from sklearn.metrics import r2_score

# ---- Configuration & Helper Functions ----
# Column configuration handed to predict_model via its dataset_args parameter.
DATASET_ARGS = dict(
    # Per-user standardized sequence features (see apply_scaling_optimized)
    numerical_columns=[
        "time_elapsed_standardized",
        "altitude_standardized",
        "derived_speed_standardized",
        "derived_distance_standardized",
    ],
    # Index-encoded static attributes
    categorical_columns=["userId_idx", "sport_idx", "gender_idx"],
    heartrate_input_column="heart_rate_standardized",
    heartrate_output_column="heart_rate",
    workout_id_column="id",
    use_heartrate_input=True,
)

def apply_scaling_optimized(df, scaler, id_col="userId"):
    """Add per-user standardized copies of the raw sequence columns.

    For each of ``time_elapsed``, ``altitude``, ``derived_speed``,
    ``derived_distance`` and ``heart_rate`` a ``<column>_standardized`` column
    is added whose cells are ``(sequence - mean) / std`` as Python lists.

    Statistics are read from ``scaler.user_stats`` and may be stored either as
    ``user_stats[column][user_id]`` or as ``user_stats[user_id][column]``; an
    entry is either a dict with ``"mean"``/``"std"`` keys or a ``(mean, std)``
    pair. Users without statistics fall back to mean 0 and std 1, and a std
    that is not above 1e-12 is treated as 1.

    Args:
        df: frame holding ``id_col`` and the five raw columns.
        scaler: object exposing a dict-like ``user_stats`` attribute.
        id_col: column holding the user identifier.

    Returns:
        A copy of ``df`` with the standardized columns added; ``df`` itself
        receives no new columns.
    """
    fallback = (0.0, 1.0)
    stats = scaler.user_stats
    scaled = df.copy()

    for column in ("time_elapsed", "altitude", "derived_speed", "derived_distance", "heart_rate"):
        column_first = stats.get(column, {})

        def mean_and_std(user_id):
            # The user-first lookup is evaluated unconditionally and only used
            # when the column-first layout has no entry for this user.
            user_first = stats.get(user_id, {}).get(column, fallback)
            entry = column_first.get(user_id, user_first)
            if isinstance(entry, dict):
                return (entry.get("mean", 0.0), entry.get("std", 1.0))
            return entry

        per_row_stats = scaled[id_col].map(mean_and_std)

        standardized = []
        for sequence, (mean, std) in zip(scaled[column], per_row_stats):
            centered = np.asarray(sequence, dtype=float) - mean
            divisor = std if std > 1e-12 else 1.0  # guard against (near-)zero std
            standardized.append((centered / divisor).tolist())

        scaled[f"{column}_standardized"] = standardized

    return scaled

# ---- Smart Execution Logic ----
# Inference results are cached as a pickle so the expensive model run happens
# once. Delete the file to force a fresh run.
# NOTE(security): pickle.load executes code embedded in the file - only load
# caches, scalers and encoders that were produced by this project.
save_path = DATA_ROOT / "inference_results_cache.pkl"

if save_path.exists():
    print(f"\nüîÑ Found cached results at {save_path.name}. Loading...")
    with open(save_path, "rb") as f:
        results_log = pickle.load(f)
    print(f"‚úÖ Loaded {len(results_log)} entries from cache. Skipping inference.")

else:
    results_log = []
    failed_sports = []  # sports whose model/scaler/encoder could not be loaded
    print("\nüöÄ Cache not found. Starting Memory-Safe Inference on M4 Pro...")
    start_time_global = time.time()

    for sport in SPORTS:
        t0 = time.time()
        sport_dir = DATA_ROOT / sport
        model_path = sport_dir / f"{sport}_fitrec_model.pt"

        try:
            from project.utils.modeling.model import FitRecModel

            # weights_only=True refuses to unpickle arbitrary classes, so the
            # model class has to be allow-listed before loading.
            torch.serialization.add_safe_globals([FitRecModel])
            model = torch.load(model_path, map_location="cpu", weights_only=True)
            model.eval()

            with open(sport_dir / f"{sport}_user_standard_scaler.pkl", "rb") as f:
                scaler = pickle.load(f)
            with open(sport_dir / f"{sport}_static_ordinal_encoder.pkl", "rb") as f:
                encoder = pickle.load(f)
        except Exception as e:
            # Best effort: report, remember the failure and go on with the next sport.
            print(f"  ‚ùå Error loading {sport}: {e}")
            failed_sports.append(sport)
            continue

        for scale in SCALES:
            for variant in ["Uncleaned", "Cleaned"]:
                # Skip combinations that were not loaded into `datasets`.
                if sport not in datasets or scale not in datasets[sport] or variant not in datasets[sport][scale]:
                    continue

                df_curr = datasets[sport][scale][variant].copy()
                df_curr = apply_scaling_optimized(df_curr, scaler)
                df_curr = encoder.transform(df_curr)

                preds = predict_model(model, df_curr, dataset_args=DATASET_ARGS, n_workers=6)
                df_curr["predicted_heart_rate"] = list(preds)

                # Flatten the per-workout sequences into one vector each.
                y_true = np.concatenate([np.array(x, dtype=float) for x in df_curr["heart_rate"]])
                y_pred = np.concatenate([np.array(x, dtype=float) for x in df_curr["predicted_heart_rate"]])

                mae = np.mean(np.abs(y_true - y_pred))
                # "Accuracy" is 100 * (1 - MAPE), floored at 0; the 1e-9 term
                # only guards the division against zero heart-rate samples.
                mape = np.mean(np.abs((y_true - y_pred) / (y_true + 1e-9)))
                accuracy_pct = max(0, (1 - mape) * 100)

                results_log.append({
                    "Sport": sport, "Scale": scale, "Variant": variant,
                    "MAE": mae, "Accuracy": accuracy_pct, "R2": r2_score(y_true, y_pred),
                    "DataFrame": df_curr[["id", "heart_rate", "predicted_heart_rate"]].copy()
                })

        print(f"  ‚è±Ô∏è {sport.upper()} total time: {time.time() - t0:.1f}s")

    print(f"\n‚ú® Global Inference Finished in {time.time() - start_time_global:.1f}s")

    # Save results to cache. An empty log is never written: its mere existence
    # would make every later run take the "load from cache" branch above and
    # skip inference for good.
    if results_log:
        with open(save_path, "wb") as f:
            pickle.dump(results_log, f)
        print(f"‚úÖ Results log saved to {save_path}")
        if failed_sports:
            print(
                f"  Warning: cache is partial - no results for: {', '.join(failed_sports)}. "
                f"Delete {save_path.name} to retry them."
            )
    else:
        print("  Warning: inference produced no results - cache file not written.")


üîÑ Found cached results at inference_results_cache.pkl. Loading...
‚úÖ Loaded 24 entries from cache. Skipping inference.


In [6]:
import pickle

# Save to the root of your data folder
save_path = DATA_ROOT / "inference_results_cache.pkl"

# Never persist an empty log: the inference cell loads this file whenever it
# exists and then skips inference, so an empty cache would disable it.
if results_log:
    with open(save_path, "wb") as f:
        pickle.dump(results_log, f)
    print(f"‚úÖ Results log saved to {save_path}")
else:
    print(f"Results log is empty - leaving {save_path} untouched")

‚úÖ Results log saved to /Users/jonasgundlach/Academic/P3/Data_Preparation/data-preparation-2026-group-project/data/inference_results_cache.pkl


In [None]:
# %%
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
import os

# Directory that receives the exported figures. It is created up front because
# the plotting loop in this cell writes one image per sport into it.
output_dir = "plots"
os.makedirs(output_dir, exist_ok=True)

# Tabular view of results_log; the "Sport", "Scale", "Variant" and "MAE"
# columns are read by the plotting code in this cell.
df_res = pd.DataFrame(results_log)

def add_piecewise_fits_mae(fig, df_variant, name, color):
    """Add MAE markers plus two piecewise linear trend lines to ``fig``.

    The rows of ``df_variant`` are ordered by "Scale" and drawn as markers.
    A separate straight line is then fitted to the points with Scale <= 2
    (drawn over 0..2) and to the points with Scale >= 2 (drawn over 2..10);
    a segment is only drawn when it contains at least two points.

    Args:
        fig: figure object exposing ``add_trace``.
        df_variant: frame with "Scale" and "MAE" columns; nothing is added
            when it is empty.
        name: legend entry and legend group shared by all traces added here.
        color: colour used for markers and lines.
    """
    if df_variant.empty:
        return

    ordered = df_variant.sort_values("Scale")
    scales = ordered["Scale"].values.reshape(-1, 1)  # 2-D, as the regressor expects
    errors = ordered["MAE"].values
    scales_flat = scales.flatten()

    # Measured points
    fig.add_trace(go.Scatter(
        x=scales_flat,
        y=errors,
        mode="markers",
        name=name,
        marker={"color": color, "size": 10, "line": {"width": 1, "color": "white"}},
        legendgroup=name,
    ))

    # (points belonging to the segment, x grid the fitted line is drawn on)
    segments = (
        (scales_flat <= 2, np.linspace(0, 2, 20)),    # low noise: scale 0 to 2
        (scales_flat >= 2, np.linspace(2, 10, 50)),   # high noise: scale 2 to 10
    )
    for members, grid in segments:
        if np.sum(members) < 2:
            continue  # a line needs at least two points
        grid_2d = grid.reshape(-1, 1)
        fitted = LinearRegression().fit(scales[members], errors[members])
        fig.add_trace(go.Scatter(
            x=grid_2d.flatten(),
            y=fitted.predict(grid_2d),  # MAE is left uncapped
            mode="lines",
            line={"color": color, "width": 2.5, "dash": "dash"},
            opacity=0.8,
            showlegend=False,
            legendgroup=name,
            hoverinfo="skip",
        ))

# Loop to create 3 separate figures
# One MAE figure per sport: baseline, cleaned vs. noisy trends, then export.
for sport in SPORTS:
    fig = go.Figure()
    sport_data = df_res[df_res["Sport"] == sport]

    # Blue reference line at the MAE of the unperturbed data (scale 0, uncleaned)
    is_baseline = (sport_data["Scale"] == 0) & (sport_data["Variant"] == "Uncleaned")
    base_data = sport_data[is_baseline]
    if not base_data.empty:
        baseline_y = base_data["MAE"].values[0]

        fig.add_shape(
            type="line",
            x0=0, x1=10,
            y0=baseline_y, y1=baseline_y,
            line={"color": "royalblue", "width": 2, "dash": "dot"},
        )
        fig.add_annotation(
            x=9.8,
            y=baseline_y,
            text=f"Base: {baseline_y:.2f} BPM",
            showarrow=False,
            font={"color": "royalblue", "size": 10},
            bgcolor="white",
            yshift=12,  # label sits above the line
        )

    # Cleaned first, then noisy - this fixes the legend order
    for trace_variant, trace_label, trace_color in (
        ("Cleaned", "Cleaned", "#2ca02c"),
        ("Uncleaned", "Noisy", "#d62728"),
    ):
        add_piecewise_fits_mae(
            fig,
            sport_data[sport_data["Variant"] == trace_variant],
            trace_label,
            trace_color,
        )

    # Layout
    fig.update_layout(
        title={"text": f"<b>{sport.upper()}</b>: MAE Growth (Error Sensitivity)", "x": 0.5},
        template="plotly_white",
        width=800,
        height=500,
        margin={"t": 120, "b": 60, "l": 60, "r": 40},
        legend={"orientation": "h", "yanchor": "bottom", "y": 1.09, "xanchor": "center", "x": 0.5},
        xaxis={"title": "Noise Scale (Std Dev)", "gridcolor": "#f0f0f0", "range": [-0.2, 10.2]},
        yaxis={"title": "Mean Absolute Error (BPM)", "gridcolor": "#f0f0f0"},
    )

    # Anchor the y axis at zero and leave the upper bound automatic
    fig.update_yaxes(range=[0, None])

    fig.show()

    # Export the figure next to the notebook
    filename = f"{output_dir}/{sport}_mae_growth.png"
    fig.write_image(filename)
    print(f"‚úÖ Saved PNG: {filename}")

‚úÖ Saved SVG: plots_svg/biking_mae_growth.svg


‚úÖ Saved SVG: plots_svg/running_mae_growth.svg


‚úÖ Saved SVG: plots_svg/walking_mae_growth.svg


In [11]:
# %%
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression

# Tabular view of results_log; the "Sport", "Scale", "Variant" and "Accuracy"
# columns are read by the plotting code in this cell.
df_res = pd.DataFrame(results_log)

def add_piecewise_fits(fig, df_variant, name, color):
    """Add accuracy markers plus two piecewise linear trend lines to ``fig``.

    The rows of ``df_variant`` are ordered by "Scale" and drawn as markers.
    A separate straight line is then fitted to the points with Scale <= 2
    (drawn over 0..2) and to the points with Scale >= 2 (drawn over 2..10);
    fitted values are capped at 100 and a segment is only drawn when it
    contains at least two points.

    Args:
        fig: figure object exposing ``add_trace``.
        df_variant: frame with "Scale" and "Accuracy" columns; nothing is
            added when it is empty.
        name: legend entry and legend group shared by all traces added here.
        color: colour used for markers and lines.
    """
    if df_variant.empty:
        return

    ordered = df_variant.sort_values("Scale")
    scales = ordered["Scale"].values.reshape(-1, 1)  # 2-D, as the regressor expects
    accuracy = ordered["Accuracy"].values
    scales_flat = scales.flatten()

    # Measured points (solid markers)
    fig.add_trace(go.Scatter(
        x=scales_flat,
        y=accuracy,
        mode="markers",
        name=name,
        marker={"color": color, "size": 10, "line": {"width": 1, "color": "white"}},
        legendgroup=name,
    ))

    # (points belonging to the segment, x grid the fitted line is drawn on)
    segments = (
        (scales_flat <= 2, np.linspace(0, 2, 20)),    # low noise: scale 0 to 2
        (scales_flat >= 2, np.linspace(2, 10, 50)),   # high noise: scale 2 to 10
    )
    for members, grid in segments:
        if np.sum(members) < 2:
            continue  # a line needs at least two points
        grid_2d = grid.reshape(-1, 1)
        fitted = LinearRegression().fit(scales[members], accuracy[members])
        capped = np.minimum(fitted.predict(grid_2d), 100.0)  # accuracy cannot exceed 100%
        fig.add_trace(go.Scatter(
            x=grid_2d.flatten(),
            y=capped,
            mode="lines",
            line={"color": color, "width": 2.5, "dash": "dash"},
            opacity=0.8,
            showlegend=False,
            legendgroup=name,
            hoverinfo="skip",
        ))

# Loop to create 3 separate figures
# One accuracy figure per sport: baseline plus cleaned vs. noisy trends.
for sport in SPORTS:
    fig = go.Figure()
    sport_data = df_res[df_res["Sport"] == sport]

    # Blue dotted reference line at the accuracy of the unperturbed data
    # (scale 0, uncleaned); dotted to set it apart from the dashed fits.
    is_baseline = (sport_data["Scale"] == 0) & (sport_data["Variant"] == "Uncleaned")
    base_data = sport_data[is_baseline]
    if not base_data.empty:
        baseline_y = base_data["Accuracy"].values[0]
        fig.add_shape(
            type="line",
            x0=0, x1=10,
            y0=baseline_y, y1=baseline_y,
            line={"color": "royalblue", "width": 2, "dash": "dot"},
        )
        fig.add_annotation(
            x=9.8,
            y=baseline_y,
            text=f"Base: {baseline_y:.1f}%",
            showarrow=False,
            font={"color": "royalblue", "size": 10},
            bgcolor="white",
            yshift=-12,  # label sits below the line
        )

    # Cleaned first, then noisy - this fixes the legend order
    for trace_variant, trace_label, trace_color in (
        ("Cleaned", "Cleaned", "#2ca02c"),
        ("Uncleaned", "Noisy", "#d62728"),
    ):
        add_piecewise_fits(
            fig,
            sport_data[sport_data["Variant"] == trace_variant],
            trace_label,
            trace_color,
        )

    # Layout; the y axis is capped slightly above 100% with an automatic lower bound
    fig.update_layout(
        title={"text": f"<b>{sport.upper()}</b>: Accuracy Retention", "x": 0.5},
        template="plotly_white",
        width=800,
        height=500,
        margin={"t": 120, "b": 60, "l": 60, "r": 40},
        legend={"orientation": "h", "yanchor": "bottom", "y": 1.09, "xanchor": "center", "x": 0.5},
        xaxis={"title": "Noise Scale (Std Dev)", "gridcolor": "#f0f0f0", "range": [-0.2, 10.2]},
        yaxis={"title": "Accuracy (%)", "gridcolor": "#f0f0f0", "range": [None, 100.5]},
    )

    fig.show()