In [1]:
%pip install kagglehub

Note: you may need to restart the kernel to use updated packages.


In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mexwell/long-distance-running-dataset")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /Users/joelsng/.cache/kagglehub/datasets/mexwell/long-distance-running-dataset/versions/1


In [3]:
!pip -q install pandas pyarrow

In [4]:
import os, sys, math, json
from pathlib import Path
import pandas as pd


In [5]:
root = Path(path)
assert root.exists(), f"Path does not exist: {root}"
print("=== DATASET ROOT ===")
print(root.resolve())
print()

=== DATASET ROOT ===
/Users/joelsng/.cache/kagglehub/datasets/mexwell/long-distance-running-dataset/versions/1



In [6]:
def sizeof_mb(p: Path) -> str:
    try:
        return f"{p.stat().st_size/1e6:.1f} MB"
    except Exception:
        return "-"
print("=== FILE TREE (top-level and 1 subdir level) ===")
for p in sorted([root] + list(root.glob("*")) + list(root.glob("*/*"))):
    rel = p.relative_to(root)
    kind = "DIR " if p.is_dir() else "FILE"
    print(f"{kind:4} {rel}  {'' if p.is_dir() else sizeof_mb(p)}")
print()

=== FILE TREE (top-level and 1 subdir level) ===
DIR  .  
FILE covid-containment-and-health-index.csv  2.2 MB
FILE covid-stringency-index.csv  2.2 MB
FILE policy_response_indexes.csv  0.0 MB
FILE run_ww_2019_d.csv  956.5 MB
FILE run_ww_2019_m.csv  38.5 MB
FILE run_ww_2019_q.csv  11.8 MB
FILE run_ww_2019_w.csv  148.6 MB
FILE run_ww_2020_d.csv  958.0 MB
FILE run_ww_2020_m.csv  37.9 MB
FILE run_ww_2020_q.csv  11.7 MB
FILE run_ww_2020_w.csv  147.5 MB
FILE stay-at-home-covid.csv  1.9 MB
FILE workplace-closures-covid.csv  1.9 MB



In [7]:
data_files = []
for ext in (".csv", ".parquet", ".jsonl", ".json"):
    data_files.extend(sorted(root.rglob(f"*{ext}")))
if not data_files:
    print("No CSV/Parquet/JSON files found. Check the structure above.")
else:
    print("=== DATA FILES FOUND ===")
    for f in data_files:
        print(f"- {f.relative_to(root)} ({sizeof_mb(f)})")
print()

=== DATA FILES FOUND ===
- covid-containment-and-health-index.csv (2.2 MB)
- covid-stringency-index.csv (2.2 MB)
- policy_response_indexes.csv (0.0 MB)
- run_ww_2019_d.csv (956.5 MB)
- run_ww_2019_m.csv (38.5 MB)
- run_ww_2019_q.csv (11.8 MB)
- run_ww_2019_w.csv (148.6 MB)
- run_ww_2020_d.csv (958.0 MB)
- run_ww_2020_m.csv (37.9 MB)
- run_ww_2020_q.csv (11.7 MB)
- run_ww_2020_w.csv (147.5 MB)
- stay-at-home-covid.csv (1.9 MB)
- workplace-closures-covid.csv (1.9 MB)



In [8]:
# 3) Helpers to read small samples safely
def read_sample(fp: Path, n=5):
    suf = fp.suffix.lower()
    if suf == ".csv":
        return pd.read_csv(fp, nrows=n)
    if suf == ".parquet":
        return pd.read_parquet(fp, engine="pyarrow")
    if suf in (".jsonl", ".json"):
        # Try to read as JSON lines first; fall back to standard JSON array
        try:
            return pd.read_json(fp, lines=True)
        except ValueError:
            return pd.read_json(fp)
    raise ValueError(f"Unsupported format: {fp}")

In [9]:
def quick_stats(df: pd.DataFrame):
    info = {}
    info["rows"] = len(df)
    info["cols"] = df.columns.tolist()
    info["dtypes"] = {c: str(t) for c, t in df.dtypes.items()}
    # Missingness (%)
    na_pct = (df.isna().mean() * 100.0).round(1)
    info["na_pct"] = na_pct[na_pct > 0].to_dict()
    # Heuristic key columns if present
    for candidate in ["athlete_id", "runner_id", "user_id", "id"]:
        if candidate in df.columns:
            info["unique_" + candidate] = int(df[candidate].nunique())
    # Date range if a date-like column exists
    for dcand in ["date", "start_time", "timestamp", "startDate", "activity_date"]:
        if dcand in df.columns:
            try:
                dt = pd.to_datetime(df[dcand], errors="coerce")
                info["date_range_"+dcand] = [str(dt.min()), str(dt.max())]
                break
            except Exception:
                pass
    return info


In [10]:
print("=== TABLE SUMMARIES (sample + stats) ===")
for f in data_files:
    print("\n------------------------------------------------------------")
    print(f"FILE: {f.relative_to(root)}  |  SIZE: {sizeof_mb(f)}")
    try:
        # For parquet we only need a peek; avoid loading entire large file
        if f.suffix.lower() == ".parquet":
            df = read_sample(f)
            # If huge, take head after read
            sample = df.head(5)
        else:
            df = read_sample(f, n=5000)  # small slice gives better stats than 5 rows
            sample = df.head(5)

        print(f"Shape (sampled or full if small): {df.shape}")
        print("\nColumns:", list(df.columns))
        print("\nDtypes:", {c: str(t) for c, t in df.dtypes.items()})

        print("\nHead(5):")
        display(sample) if "display" in globals() else print(sample.to_string(index=False))

        stats = quick_stats(df)
        print("\nQuick stats:")
        for k, v in stats.items():
            print(f"- {k}: {v}")

        # Common value counts (if columns exist)
        for col in ["sport", "activity_type", "surface", "terrain"]:
            if col in df.columns:
                vc = df[col].value_counts().head(10)
                print(f"\nTop {col} values:")
                print(vc.to_string())

    except Exception as e:
        print(f"Could not summarize {f.name}: {e}")

=== TABLE SUMMARIES (sample + stats) ===

------------------------------------------------------------
FILE: covid-containment-and-health-index.csv  |  SIZE: 2.2 MB
Shape (sampled or full if small): (5000, 4)

Columns: ['Entity', 'Code', 'Date', 'containment_index']

Dtypes: {'Entity': 'object', 'Code': 'object', 'Date': 'object', 'containment_index': 'float64'}

Head(5):
     Entity Code       Date  containment_index
Afghanistan  AFG 2020-01-01                0.0
Afghanistan  AFG 2020-01-02                0.0
Afghanistan  AFG 2020-01-03                0.0
Afghanistan  AFG 2020-01-04                0.0
Afghanistan  AFG 2020-01-05                0.0

Quick stats:
- rows: 5000
- cols: ['Entity', 'Code', 'Date', 'containment_index']
- dtypes: {'Entity': 'object', 'Code': 'object', 'Date': 'object', 'containment_index': 'float64'}
- na_pct: {}

------------------------------------------------------------
FILE: covid-stringency-index.csv  |  SIZE: 2.2 MB
Shape (sampled or full if small): (5

# Building Weekly Features

In [11]:
import pandas as pd
from pathlib import Path

root = Path(path)  # from kagglehub
usecols = ["datetime","athlete","distance","duration","gender","age_group","country"]

In [12]:
# Read daily 2019 + 2020 in chunks and aggregate to week-level
def weekly_from_daily(csv_path):
    weekly_parts = []
    for chunk in pd.read_csv(csv_path, usecols=usecols, chunksize=1_000_000):
        chunk["datetime"] = pd.to_datetime(chunk["datetime"], errors="coerce")
        chunk = chunk.dropna(subset=["datetime"])
        chunk["week"] = chunk["datetime"].dt.isocalendar().week.astype(int)
        chunk["year"] = chunk["datetime"].dt.year.astype(int)

        # per (athlete, year, week)
        g = chunk.groupby(["athlete","year","week"])
        weekly = g.agg(
            weekly_km=("distance","sum"),
            days_run=("distance", lambda s: (s > 0).sum()),
            long_run_km=("distance","max"),
            total_duration_min=("duration","sum"),
        ).reset_index()

        # carry 1st seen demographics in the chunk
        demo = chunk.sort_values("datetime").groupby(["athlete"]).agg(
            gender=("gender","first"),
            age_group=("age_group","first"),
            country=("country","first"),
        ).reset_index()
        weekly = weekly.merge(demo, on="athlete", how="left")
        weekly_parts.append(weekly)

    weekly_all = pd.concat(weekly_parts, ignore_index=True)
    # collapse across chunks
    g2 = weekly_all.groupby(["athlete","year","week","gender","age_group","country"], as_index=False).agg(
        weekly_km=("weekly_km","sum"),
        days_run=("days_run","sum"),
        long_run_km=("long_run_km","max"),
        total_duration_min=("total_duration_min","sum"),
    )
    return g2

weekly_2019 = weekly_from_daily(root / "run_ww_2019_d.csv")
weekly_2020 = weekly_from_daily(root / "run_ww_2020_d.csv")
weekly = pd.concat([weekly_2019, weekly_2020], ignore_index=True)

# Basic sanity filters
weekly = weekly[(weekly["weekly_km"] > 0) & (weekly["days_run"] > 0)]
weekly["pace_min_per_km"] = (weekly["total_duration_min"] / weekly["weekly_km"]).round(2)
weekly.head()

Unnamed: 0,athlete,year,week,gender,age_group,country,weekly_km,days_run,long_run_km,total_duration_min,pace_min_per_km
22,0,2019,23,F,18 - 34,United States,9.46,2,4.83,63.433333,6.71
23,0,2019,24,F,18 - 34,United States,4.92,1,4.92,30.316667,6.16
24,0,2019,25,F,18 - 34,United States,27.73,4,11.29,169.233333,6.1
25,0,2019,26,F,18 - 34,United States,20.02,3,7.34,125.116667,6.25
26,0,2019,27,F,18 - 34,United States,7.0,1,7.0,45.0,6.43


# Sample realistic contexts for SFT prompts

In [13]:
import numpy as np

def wk_to_context(row):
    # Coerce realistic bands
    temp_c = np.random.choice([24,26,28,30,32,34], p=[.15,.2,.25,.2,.15,.05])
    if temp_c >= 30:
        humidity = np.random.choice([70,80,90], p=[.5,.3,.2])
    else:
        humidity = np.random.choice([40,55,70], p=[.3,.4,.3])
    stomach = np.random.choice(["none","sensitive"])
    goal = np.random.choice(["complete comfortably","negative split","PR attempt (~1:45)"])

    return (
        f"Runner: {int(row.days_run)} days/week; long run {row.long_run_km:.0f} km; "
        f"weekly volume {row.weekly_km:.0f} km; terrain flat; "
        f"weather {temp_c}°C, humidity {humidity}%; stomach {stomach}; "
        f"goal: {goal}."
    )

# Sample 200 weekly rows to build candidate prompts
cand = weekly.sample(200, random_state=7).copy()
cand["context"] = cand.apply(wk_to_context, axis=1)

cand[["athlete","year","week","weekly_km","days_run","long_run_km","pace_min_per_km","context"]].head()


Unnamed: 0,athlete,year,week,weekly_km,days_run,long_run_km,pace_min_per_km,context
3391869,29673,2020,19,5.42,1,5.42,5.72,Runner: 1 days/week; long run 5 km; weekly vol...
3330166,28470,2020,8,119.809,6,32.369,4.38,Runner: 6 days/week; long run 32 km; weekly vo...
2886528,19828,2020,33,36.1,4,12.41,5.9,Runner: 4 days/week; long run 12 km; weekly vo...
271958,5489,2019,51,4.34,1,4.34,6.75,Runner: 1 days/week; long run 4 km; weekly vol...
1864567,37358,2019,4,146.96,5,37.05,5.2,Runner: 5 days/week; long run 37 km; weekly vo...


# Turn samples into instruction/input/output JSONL

In [14]:
import json
from datetime import date
from pathlib import Path

tasks = [
    "Give a 5-week half-marathon taper.",
    "Create a 1-week plan balancing recovery and one speed session.",
    "Provide a race-day hydration and fueling plan.",
    "Design a heat-acclimation mini-plan for race week.",
]

out_path = Path("/Users/joelsng/Documents/GitHub/RunBuddy/model/corpus/coach_tron_candidates_from_logs.jsonl")
out_path.parent.mkdir(exist_ok=True)

with open(out_path, "w", encoding="utf-8") as f:
    for _, r in cand.iterrows():
        item = {
            "instruction": np.random.choice(tasks),
            "input": r["context"] + f" (observed week: {int(r.year)}-W{int(r.week)})",
            "output": "",  # <-- leave blank or draft with your model, then human-edit
            "meta": {
                "source": "kaggle_mexwell_running_logs",
                "created": date.today().isoformat(),
                "weekly_km": float(r.weekly_km),
                "days_run": int(r.days_run),
                "long_run_km": float(r.long_run_km),
                "pace_min_per_km": float(r.pace_min_per_km),
                "country": str(r.country),
                "age_group": str(r.age_group),
                "gender": str(r.gender),
            }
        }
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print("Wrote:", out_path)


Wrote: /Users/joelsng/Documents/GitHub/RunBuddy/model/corpus/coach_tron_candidates_from_logs.jsonl


# Populating output field for each jsonl

In [3]:
%pip install -U huggingface_hub hf-transfer
# Terminal once:
# %huggingface-cli login

Collecting huggingface_hub
  Using cached huggingface_hub-0.35.3-py3-none-any.whl.metadata (14 kB)
Collecting hf-transfer
  Using cached hf_transfer-0.1.9-cp38-abi3-macosx_11_0_arm64.whl.metadata (1.7 kB)
Collecting filelock (from huggingface_hub)
  Using cached filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting fsspec>=2023.5.0 (from huggingface_hub)
  Using cached fsspec-2025.9.0-py3-none-any.whl.metadata (10 kB)
Collecting packaging>=20.9 (from huggingface_hub)
  Using cached packaging-25.0-py3-none-any.whl.metadata (3.3 kB)
Collecting pyyaml>=5.1 (from huggingface_hub)
  Downloading pyyaml-6.0.3-cp310-cp310-macosx_11_0_arm64.whl.metadata (2.4 kB)
Collecting requests (from huggingface_hub)
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting tqdm>=4.42.1 (from huggingface_hub)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting typing-extensions>=3.7.4.3 (from huggingface_hub)
  Using cached typing_extensions-4.15.0-py3-none-any

In [8]:
import os
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"   # rust downloader, much faster

: 

In [5]:
%pip install -U "protobuf>=4.25.3" "sentencepiece>=0.1.99" \
  "transformers>=4.43.0" "tokenizers>=0.15.2" \
  "huggingface_hub>=0.22.0" "accelerate>=0.30.0" safetensors

Collecting protobuf>=4.25.3
  Downloading protobuf-6.33.0-cp39-abi3-macosx_10_9_universal2.whl.metadata (593 bytes)
Collecting sentencepiece>=0.1.99
  Downloading sentencepiece-0.2.1-cp310-cp310-macosx_11_0_arm64.whl.metadata (10 kB)
Collecting transformers>=4.43.0
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting tokenizers>=0.15.2
  Using cached tokenizers-0.22.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting accelerate>=0.30.0
  Using cached accelerate-1.10.1-py3-none-any.whl.metadata (19 kB)
Collecting safetensors
  Using cached safetensors-0.6.2-cp38-abi3-macosx_11_0_arm64.whl.metadata (4.1 kB)
Collecting numpy>=1.17 (from transformers>=4.43.0)
  Downloading numpy-2.2.6-cp310-cp310-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting regex!=2019.12.17 (from transformers>=4.43.0)
  Downloading regex-2025.9.18-cp310-cp310-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting psutil (from accelerate>=0.30.0)
  Using cached psutil-7.1.0-cp36-abi3-ma

In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# TEACHER_CKPT = "mistralai/Mixtral-8x7B-Instruct-v0.1"  # very powerful, does not work
TEACHER_CKPT = "mistralai/Mistral-7B-Instruct-v0.3"

teacher_tok = AutoTokenizer.from_pretrained(TEACHER_CKPT, use_fast=True)
teacher_mod = AutoModelForCausalLM.from_pretrained(
    TEACHER_CKPT,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    device_map="auto"
)

if teacher_tok.pad_token is None:
    teacher_tok.pad_token = teacher_tok.eos_token

def teacher_generate(prompt: str) -> str:
    enc = teacher_tok(prompt, return_tensors="pt")
    enc = {k: v.to(teacher_mod.device) for k, v in enc.items()}
    teacher_mod.eval()
    with torch.inference_mode():
        out = teacher_mod.generate(
            **enc,
            max_new_tokens=480,
            do_sample=True, temperature=0.7, top_p=0.9,
            repetition_penalty=1.07,
            eos_token_id=None,
            pad_token_id=teacher_tok.pad_token_id,
            use_cache=True,
        )
    new = out[0, enc["input_ids"].shape[1]:]
    return teacher_tok.decode(new, skip_special_tokens=True).strip()


OSError: Could not find a suitable TLS CA certificate bundle, invalid path: /Users/joelsng/Documents/GitHub/RunBuddy/.venv/lib/python3.13/site-packages/certifi/cacert.pem

In [6]:
import json, re, time, random
from pathlib import Path

SYSTEM = ("You are a marathon coach. Output exactly the requested section(s). "
          "Use clear bullets, distances, intensity cues, and a short safety note. "
          "Do not give medical advice.")
PLAN_ONLY_INSTRUCTIONS = True  # set False if you want Plan/Hydration/Safety in one

def build_prompt(instruction, ctx):
    section = "Only the training schedule (no hydration/safety)." if PLAN_ONLY_INSTRUCTIONS else \
              "Include Plan, Hydration, and Safety sections."
    return (
        f"<|start|>system<|message|>{SYSTEM}<|end|>"
        f"<|start|>user<|message|>Task: {instruction}\n\nContext:\n{ctx}\n\n"
        f"Write a structured answer. {section}<|end|>"
        "<|start|>assistant<|channel|>final<|message|>"
    )

def looks_ok(text: str) -> bool:
    if PLAN_ONLY_INSTRUCTIONS:
        # must have some bullets or days and distances; forbid obviously unsafe claims
        if len(text.strip()) < 120: return False
        if re.search(r"\b(always|guarantee|ignore pain)\b", text, re.I): return False
        # basic sanity: mention km or min at least a few times
        if len(re.findall(r"\b(\d+(\.\d+)?)\s*(km|minutes?|min)\b", text, re.I)) < 3:
            return False
        return True
    else:
        # require sections
        return all(h in text for h in ["Plan", "Hydration", "Safety"])

def draft_one(instruction, ctx, n_best=3, sleep_s=0.0):
    prompt = build_prompt(instruction, ctx)
    best = []
    for _ in range(n_best):
        txt = teacher_generate(prompt).strip()
        if looks_ok(txt):
            best.append(txt)
        if sleep_s: time.sleep(sleep_s)
    # choose the “best” by length (proxy) or implement a small scorer
    if not best: return ""
    return max(best, key=len)[:2000]  # clip runaway outputs

# ----------- Run over your candidate set -----------
src = Path("/content/drive/My Drive/Colab Notebooks/IndividualAssignment/Documents/coach_tron_candidates_from_logs.jsonl")   # contexts you built
dst = Path("/content/drive/My Drive/Colab Notebooks/IndividualAssignment/Documents/coach_tron_candidates_with_drafts.jsonl")
dst.parent.mkdir(exist_ok=True)

count_in = count_out = 0
with src.open() as fin, dst.open("w", encoding="utf-8") as fout:
    for line in fin:
        ex = json.loads(line)
        count_in += 1
        if ex.get("output"):    # skip already filled
            fout.write(line); count_out += 1; continue
        d = draft_one(ex["instruction"], ex["input"], n_best=4, sleep_s=0.0)
        if d:
            ex["output"] = d
            ex.setdefault("meta", {})["draft_source"] = "teacher_v1"
            fout.write(json.dumps(ex, ensure_ascii=False) + "\n")
            count_out += 1

print(f"Drafted {count_out}/{count_in}. Now HUMAN-EDIT and mark meta.approved=true.")


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/My Drive/Colab Notebooks/IndividualAssignment/Documents'