In [22]:
%pip install kagglehub

Collecting kagglehub
  Using cached kagglehub-0.3.13-py3-none-any.whl.metadata (38 kB)
Using cached kagglehub-0.3.13-py3-none-any.whl (68 kB)
Installing collected packages: kagglehub
Successfully installed kagglehub-0.3.13
Note: you may need to restart the kernel to use updated packages.


In [23]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mexwell/long-distance-running-dataset")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /Users/joelsng/.cache/kagglehub/datasets/mexwell/long-distance-running-dataset/versions/1


In [24]:
!pip -q install pandas pyarrow

In [25]:
import os, sys, math, json
from pathlib import Path
import pandas as pd


In [26]:
root = Path(path)
assert root.exists(), f"Path does not exist: {root}"
print("=== DATASET ROOT ===")
print(root.resolve())
print()

=== DATASET ROOT ===
/Users/joelsng/.cache/kagglehub/datasets/mexwell/long-distance-running-dataset/versions/1



In [27]:
def sizeof_mb(p: Path) -> str:
    try:
        return f"{p.stat().st_size/1e6:.1f} MB"
    except Exception:
        return "-"
print("=== FILE TREE (top-level and 1 subdir level) ===")
for p in sorted([root] + list(root.glob("*")) + list(root.glob("*/*"))):
    rel = p.relative_to(root)
    kind = "DIR " if p.is_dir() else "FILE"
    print(f"{kind:4} {rel}  {'' if p.is_dir() else sizeof_mb(p)}")
print()

=== FILE TREE (top-level and 1 subdir level) ===
DIR  .  
FILE covid-containment-and-health-index.csv  2.2 MB
FILE covid-stringency-index.csv  2.2 MB
FILE policy_response_indexes.csv  0.0 MB
FILE run_ww_2019_d.csv  956.5 MB
FILE run_ww_2019_m.csv  38.5 MB
FILE run_ww_2019_q.csv  11.8 MB
FILE run_ww_2019_w.csv  148.6 MB
FILE run_ww_2020_d.csv  958.0 MB
FILE run_ww_2020_m.csv  37.9 MB
FILE run_ww_2020_q.csv  11.7 MB
FILE run_ww_2020_w.csv  147.5 MB
FILE stay-at-home-covid.csv  1.9 MB
FILE workplace-closures-covid.csv  1.9 MB



In [28]:
data_files = []
for ext in (".csv", ".parquet", ".jsonl", ".json"):
    data_files.extend(sorted(root.rglob(f"*{ext}")))
if not data_files:
    print("No CSV/Parquet/JSON files found. Check the structure above.")
else:
    print("=== DATA FILES FOUND ===")
    for f in data_files:
        print(f"- {f.relative_to(root)} ({sizeof_mb(f)})")
print()

=== DATA FILES FOUND ===
- covid-containment-and-health-index.csv (2.2 MB)
- covid-stringency-index.csv (2.2 MB)
- policy_response_indexes.csv (0.0 MB)
- run_ww_2019_d.csv (956.5 MB)
- run_ww_2019_m.csv (38.5 MB)
- run_ww_2019_q.csv (11.8 MB)
- run_ww_2019_w.csv (148.6 MB)
- run_ww_2020_d.csv (958.0 MB)
- run_ww_2020_m.csv (37.9 MB)
- run_ww_2020_q.csv (11.7 MB)
- run_ww_2020_w.csv (147.5 MB)
- stay-at-home-covid.csv (1.9 MB)
- workplace-closures-covid.csv (1.9 MB)



In [29]:
# 3) Helpers to read small samples safely
def read_sample(fp: Path, n=5):
    suf = fp.suffix.lower()
    if suf == ".csv":
        return pd.read_csv(fp, nrows=n)
    if suf == ".parquet":
        return pd.read_parquet(fp, engine="pyarrow")
    if suf in (".jsonl", ".json"):
        # Try to read as JSON lines first; fall back to standard JSON array
        try:
            return pd.read_json(fp, lines=True)
        except ValueError:
            return pd.read_json(fp)
    raise ValueError(f"Unsupported format: {fp}")

In [30]:
def quick_stats(df: pd.DataFrame):
    info = {}
    info["rows"] = len(df)
    info["cols"] = df.columns.tolist()
    info["dtypes"] = {c: str(t) for c, t in df.dtypes.items()}
    # Missingness (%)
    na_pct = (df.isna().mean() * 100.0).round(1)
    info["na_pct"] = na_pct[na_pct > 0].to_dict()
    # Heuristic key columns if present
    for candidate in ["athlete_id", "runner_id", "user_id", "id"]:
        if candidate in df.columns:
            info["unique_" + candidate] = int(df[candidate].nunique())
    # Date range if a date-like column exists
    for dcand in ["date", "start_time", "timestamp", "startDate", "activity_date"]:
        if dcand in df.columns:
            try:
                dt = pd.to_datetime(df[dcand], errors="coerce")
                info["date_range_"+dcand] = [str(dt.min()), str(dt.max())]
                break
            except Exception:
                pass
    return info


In [31]:
print("=== TABLE SUMMARIES (sample + stats) ===")
for f in data_files:
    print("\n------------------------------------------------------------")
    print(f"FILE: {f.relative_to(root)}  |  SIZE: {sizeof_mb(f)}")
    try:
        # For parquet we only need a peek; avoid loading entire large file
        if f.suffix.lower() == ".parquet":
            df = read_sample(f)
            # If huge, take head after read
            sample = df.head(5)
        else:
            df = read_sample(f, n=5000)  # small slice gives better stats than 5 rows
            sample = df.head(5)

        print(f"Shape (sampled or full if small): {df.shape}")
        print("\nColumns:", list(df.columns))
        print("\nDtypes:", {c: str(t) for c, t in df.dtypes.items()})

        print("\nHead(5):")
        display(sample) if "display" in globals() else print(sample.to_string(index=False))

        stats = quick_stats(df)
        print("\nQuick stats:")
        for k, v in stats.items():
            print(f"- {k}: {v}")

        # Common value counts (if columns exist)
        for col in ["sport", "activity_type", "surface", "terrain"]:
            if col in df.columns:
                vc = df[col].value_counts().head(10)
                print(f"\nTop {col} values:")
                print(vc.to_string())

    except Exception as e:
        print(f"Could not summarize {f.name}: {e}")

=== TABLE SUMMARIES (sample + stats) ===

------------------------------------------------------------
FILE: covid-containment-and-health-index.csv  |  SIZE: 2.2 MB
Shape (sampled or full if small): (5000, 4)

Columns: ['Entity', 'Code', 'Date', 'containment_index']

Dtypes: {'Entity': 'object', 'Code': 'object', 'Date': 'object', 'containment_index': 'float64'}

Head(5):
     Entity Code       Date  containment_index
Afghanistan  AFG 2020-01-01                0.0
Afghanistan  AFG 2020-01-02                0.0
Afghanistan  AFG 2020-01-03                0.0
Afghanistan  AFG 2020-01-04                0.0
Afghanistan  AFG 2020-01-05                0.0

Quick stats:
- rows: 5000
- cols: ['Entity', 'Code', 'Date', 'containment_index']
- dtypes: {'Entity': 'object', 'Code': 'object', 'Date': 'object', 'containment_index': 'float64'}
- na_pct: {}

------------------------------------------------------------
FILE: covid-stringency-index.csv  |  SIZE: 2.2 MB
Shape (sampled or full if small): (5

# Building Weekly Features

In [32]:
import pandas as pd
from pathlib import Path

root = Path(path)  # from kagglehub
usecols = ["datetime","athlete","distance","duration","gender","age_group","country"]

In [34]:
# Read daily 2019 + 2020 in chunks and aggregate to week-level
def weekly_from_daily(csv_path):
    weekly_parts = []
    for chunk in pd.read_csv(csv_path, usecols=usecols, chunksize=1_000_000):
        chunk["datetime"] = pd.to_datetime(chunk["datetime"], errors="coerce")
        chunk = chunk.dropna(subset=["datetime"])
        chunk["week"] = chunk["datetime"].dt.isocalendar().week.astype(int)
        chunk["year"] = chunk["datetime"].dt.year.astype(int)

        # per (athlete, year, week)
        g = chunk.groupby(["athlete","year","week"])
        weekly = g.agg(
            weekly_km=("distance","sum"),
            days_run=("distance", lambda s: (s > 0).sum()),
            long_run_km=("distance","max"),
            total_duration_min=("duration","sum"),
        ).reset_index()

        # carry 1st seen demographics in the chunk
        demo = chunk.sort_values("datetime").groupby(["athlete"]).agg(
            gender=("gender","first"),
            age_group=("age_group","first"),
            country=("country","first"),
        ).reset_index()
        weekly = weekly.merge(demo, on="athlete", how="left")
        weekly_parts.append(weekly)

    weekly_all = pd.concat(weekly_parts, ignore_index=True)
    # collapse across chunks
    g2 = weekly_all.groupby(["athlete","year","week","gender","age_group","country"], as_index=False).agg(
        weekly_km=("weekly_km","sum"),
        days_run=("days_run","sum"),
        long_run_km=("long_run_km","max"),
        total_duration_min=("total_duration_min","sum"),
    )
    return g2

weekly_2019 = weekly_from_daily(root / "run_ww_2019_d.csv")
weekly_2020 = weekly_from_daily(root / "run_ww_2020_d.csv")
weekly = pd.concat([weekly_2019, weekly_2020], ignore_index=True)

# Basic sanity filters
weekly = weekly[(weekly["weekly_km"] > 0) & (weekly["days_run"] > 0)]
weekly["pace_min_per_km"] = (weekly["total_duration_min"] / weekly["weekly_km"]).round(2)
weekly.head()

Unnamed: 0,athlete,year,week,gender,age_group,country,weekly_km,days_run,long_run_km,total_duration_min,pace_min_per_km
22,0,2019,23,F,18 - 34,United States,9.46,2,4.83,63.433333,6.71
23,0,2019,24,F,18 - 34,United States,4.92,1,4.92,30.316667,6.16
24,0,2019,25,F,18 - 34,United States,27.73,4,11.29,169.233333,6.1
25,0,2019,26,F,18 - 34,United States,20.02,3,7.34,125.116667,6.25
26,0,2019,27,F,18 - 34,United States,7.0,1,7.0,45.0,6.43


# Sample realistic contexts for SFT prompts

In [37]:
import numpy as np

def wk_to_context(row):
    # Coerce realistic bands
    temp_c = np.random.choice([24,26,28,30,32,34], p=[.15,.2,.25,.2,.15,.05])
    if temp_c >= 30:
        humidity = np.random.choice([70,80,90], p=[.5,.3,.2])
    else:
        humidity = np.random.choice([40,55,70], p=[.3,.4,.3])
    stomach = np.random.choice(["none","sensitive"])
    goal = np.random.choice(["complete comfortably","negative split","PR attempt (~1:45)"])

    return (
        f"Runner: {int(row.days_run)} days/week; long run {row.long_run_km:.0f} km; "
        f"weekly volume {row.weekly_km:.0f} km; terrain flat; "
        f"weather {temp_c}°C, humidity {humidity}%; stomach {stomach}; "
        f"goal: {goal}."
    )

# Sample 200 weekly rows to build candidate prompts
cand = weekly.sample(200, random_state=7).copy()
cand["context"] = cand.apply(wk_to_context, axis=1)

cand[["athlete","year","week","weekly_km","days_run","long_run_km","pace_min_per_km","context"]].head()


Unnamed: 0,athlete,year,week,weekly_km,days_run,long_run_km,pace_min_per_km,context
3391869,29673,2020,19,5.42,1,5.42,5.72,Runner: 1 days/week; long run 5 km; weekly vol...
3330166,28470,2020,8,119.809,6,32.369,4.38,Runner: 6 days/week; long run 32 km; weekly vo...
2886528,19828,2020,33,36.1,4,12.41,5.9,Runner: 4 days/week; long run 12 km; weekly vo...
271958,5489,2019,51,4.34,1,4.34,6.75,Runner: 1 days/week; long run 4 km; weekly vol...
1864567,37358,2019,4,146.96,5,37.05,5.2,Runner: 5 days/week; long run 37 km; weekly vo...


# Turn samples into instruction/input/output JSONL

In [38]:
import json
from datetime import date
from pathlib import Path

tasks = [
    "Give a 5-week half-marathon taper.",
    "Create a 1-week plan balancing recovery and one speed session.",
    "Provide a race-day hydration and fueling plan.",
    "Design a heat-acclimation mini-plan for race week.",
]

out_path = Path("/Users/joelsng/Documents/GitHub/RunBuddy/model/corpus/coach_tron_candidates_from_logs.jsonl")
out_path.parent.mkdir(exist_ok=True)

with open(out_path, "w", encoding="utf-8") as f:
    for _, r in cand.iterrows():
        item = {
            "instruction": np.random.choice(tasks),
            "input": r["context"] + f" (observed week: {int(r.year)}-W{int(r.week)})",
            "output": "",  # <-- leave blank or draft with your model, then human-edit
            "meta": {
                "source": "kaggle_mexwell_running_logs",
                "created": date.today().isoformat(),
                "weekly_km": float(r.weekly_km),
                "days_run": int(r.days_run),
                "long_run_km": float(r.long_run_km),
                "pace_min_per_km": float(r.pace_min_per_km),
                "country": str(r.country),
                "age_group": str(r.age_group),
                "gender": str(r.gender),
            }
        }
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print("Wrote:", out_path)


Wrote: /Users/joelsng/Documents/GitHub/RunBuddy/model/corpus/coach_tron_candidates_from_logs.jsonl


# Populating output field for each jsonl

In [39]:
import json, re, time, os, subprocess
from pathlib import Path
import requests

In [40]:
SRC = Path("/Users/joelsng/Documents/GitHub/RunBuddy/model/corpus/coach_tron_candidates_from_logs.jsonl")
DST = Path("/Users/joelsng/Documents/GitHub/RunBuddy/model/corpus/coach_tron_candidates_with_drafts.jsonl")
CHECKPOINT = Path("/Users/joelsng/Documents/GitHub/RunBuddy/model/corpus/coach_tron_checkpoint.txt")

BATCH_SIZE = 20            # adjust per VRAM
SLEEP_BETWEEN = 5          # seconds between samples
SLEEP_BETWEEN_BATCHES = 20 # seconds between batches
OLLAMA_MODEL = "mistral:instruct"

In [41]:
OLLAMA_URL   = os.environ.get("OLLAMA_URL", "http://localhost:11434/api/chat")
TEACHER_MODEL = os.environ.get("TEACHER_MODEL", "mistral:instruct")
TIMEOUT_S     = 300
N_BEST        = 3           # try 3 variants and keep the best one
SLEEP_BETWEEN = 0.0         # seconds between tries

# Global coach style
SYSTEM = (
    "You are a careful marathon coach. Use clear bullets, day-by-day schedules, "
    "and practical pacing cues. Use ranges (not absolutes). "
    "Avoid medical claims. Keep outputs concise and actionable."
)

In [42]:
# ---------------- Helpers ----------------
def restart_ollama():
    """Kill and restart Ollama cleanly to flush Metal memory."""
    print("\n🔄 Restarting Ollama to clear VRAM ...")
    subprocess.run(["pkill", "ollama"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    time.sleep(3)
    subprocess.Popen(["ollama", "serve"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    time.sleep(10)  # give it time to reload

def read_checkpoint():
    if CHECKPOINT.exists():
        return int(CHECKPOINT.read_text().strip())
    return 0

def write_checkpoint(idx):
    CHECKPOINT.write_text(str(idx))

def safe_write(f, ex):
    f.write(json.dumps(ex, ensure_ascii=False) + "\n")


In [43]:
# Routert by instruction
def route_instruction(instruction: str):
    ins = instruction.lower()
    if "taper" in ins:
        # Plan-only (no hydration/safety sections here; you can do separate channels later)
        return {
            "section": "plan",
            "directions": (
                "Write ONLY the training schedule. Include days, distances (km), and intensity cues. "
                "Keep it realistic for the provided weekly volume and long run. "
                "Do not include hydration or safety sections here."
            ),
            "min_tokens": 120,
            "must_contain": [r"\b(km|kilometer|minute|min)\b"],
            "forbid": [r"\b(cure|guarantee|ignore pain)\b"]
        }
    if "hydration" in ins or "fuel" in ins:
        # Hydration + fueling (race-day style)
        return {
            "section": "hydration",
            "directions": (
                "Provide a race-day hydration and fueling plan with practical ranges. "
                "Structure with short headings (Pre, During, Post). "
                "Include sodium mg/h ranges, fluid mL/h, gel timing. "
                "Add one-line safety note at the end. Keep concise."
            ),
            "min_tokens": 100,
            "must_contain": [r"\b(Pre|During|Post)\b"],
            "forbid": [r"\b(cure|guarantee|always run through pain)\b"]
        }
    # Fallback: generic structured answer
    return {
        "section": "plan",
        "directions": (
            "Write a concise, structured answer with practical steps. "
            "Use bullets and short sentences."
        ),
        "min_tokens": 80,
        "must_contain": [],
        "forbid": []
    }

In [44]:
# Ollama call function
def ollama_chat(system: str, user: str) -> str:
    payload = {
        "model": TEACHER_MODEL,
        "stream": False,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        "options": {
            "temperature": 0.7,
            "top_p": 0.9,
            "num_predict": 480,
            "repeat_penalty": 1.07,
        }
    }
    r = requests.post(OLLAMA_URL, json=payload, timeout=TIMEOUT_S)
    r.raise_for_status()
    return r.json()["message"]["content"].strip()

In [45]:
# Drafting + filtering
def looks_ok(text: str, rules: dict) -> bool:
    if text is None: 
        return False
    t = text.strip()
    if len(t) < rules["min_tokens"]:
        return False
    for pat in rules["must_contain"]:
        if re.search(pat, t, flags=re.I) is None:
            return False
    for pat in rules["forbid"]:
        if re.search(pat, t, flags=re.I):
            return False
    # Anti-runaway junk
    if "```" in t and t.count("```") % 2 == 1:
        return False
    return True


In [46]:
def draft_best(instruction: str, ctx: str, rules: dict) -> str:
    # Build the user prompt
    user_prompt = f"Task: {instruction}\n\nContext:\n{ctx}\n\n{rules['directions']}"
    best = []
    for _ in range(N_BEST):
        try:
            txt = ollama_chat(SYSTEM, user_prompt)
            if looks_ok(txt, rules):
                best.append(txt)
        except Exception as e:
            print("Ollama error:", e)
        if SLEEP_BETWEEN:
            time.sleep(SLEEP_BETWEEN)
    if not best:
        return ""
    # Choose longest as proxy for completeness (you can add smarter scoring)
    return max(best, key=len)[:2000]

In [51]:
start_idx = read_checkpoint()
print(f"▶️ Resuming from index {start_idx}")

lines = [json.loads(l) for l in SRC.open() if l.strip()]
total = len(lines)
print(f"Total samples: {total}")

with DST.open("a", encoding="utf-8") as fout:
    for i in range(start_idx, total, BATCH_SIZE):
        batch = lines[i:i + BATCH_SIZE]
        print(f"\n=== Processing batch {i // BATCH_SIZE + 1} "
              f"({i}-{i+len(batch)-1}) ===")

        for j, ex in enumerate(batch, start=i):
            if ex.get("output") and ex["output"].strip():
                safe_write(fout, ex)
                continue

            try:
                rules = route_instruction(ex["instruction"])
                draft = draft_best(ex["instruction"], ex["input"], rules)
            except Exception as e:
                print(f"❌ Error on line {j}: {e}")
                ex.setdefault("meta", {})["draft_status"] = "error"
                safe_write(fout, ex)
                continue

            if not draft:
                ex.setdefault("meta", {})["draft_status"] = "failed"
                safe_write(fout, ex)
                continue
            
            ex["output"] = draft
            meta = ex.setdefault("meta", {})
            meta.update({
                "draft_source": f"ollama:{OLLAMA_MODEL}",
                "draft_status": "ok",
                "approved": False,
            })

            safe_write(fout, ex)
            print(f"✅ Example {j+1}/{total} done")
            time.sleep(SLEEP_BETWEEN)

        # save checkpoint after each batch
        write_checkpoint(i + len(batch))
        print(f"💾 Saved checkpoint at {i + len(batch)} / {total}")

        # restart Ollama to flush memory
        restart_ollama()
        time.sleep(SLEEP_BETWEEN_BATCHES)

print("🎯 All batches processed successfully!")

▶️ Resuming from index 1
Total samples: 200

=== Processing batch 1 (1-20) ===
✅ Example 2/200 done
✅ Example 3/200 done
✅ Example 4/200 done
✅ Example 5/200 done
✅ Example 6/200 done
✅ Example 7/200 done
✅ Example 8/200 done
✅ Example 9/200 done
✅ Example 10/200 done
✅ Example 11/200 done
✅ Example 12/200 done
✅ Example 13/200 done
✅ Example 14/200 done
✅ Example 15/200 done
✅ Example 16/200 done
✅ Example 17/200 done
✅ Example 18/200 done
✅ Example 19/200 done
✅ Example 20/200 done
✅ Example 21/200 done
💾 Saved checkpoint at 21 / 200

🔄 Restarting Ollama to clear VRAM ...

=== Processing batch 2 (21-40) ===
✅ Example 22/200 done
✅ Example 23/200 done


KeyboardInterrupt: 