In [None]:
# Prints a fixed confirmation message. NOTE(review): this cell comes before the
# install/path setup cells below — presumably a quick "runtime is alive" check; confirm.
print('Setup complete.')

# Lab 08 — JSONL Dashboard 

Goal: build a small diagnostics dashboard from JSONL logs and practice logging hygiene.



## 1) Setup (Colab)

Install only what the lab needs. Input data goes in `/content/data`; outputs are written to `/content/out`.


In [None]:
%%bash
# Quietly install (or upgrade to the latest release of) the packages listed below.
pip install --quiet --upgrade pandas matplotlib nbformat

In [None]:
from pathlib import Path
# Root folder, with one sub-folder for inputs and one for generated outputs.
BASE = Path('/content')
DATA, OUT = BASE / 'data', BASE / 'out'

# Create both folders up front; an already-existing folder is not an error.
DATA.mkdir(exist_ok=True, parents=True)
OUT.mkdir(exist_ok=True, parents=True)

# Echo the resolved locations.
print('DATA=', DATA)
print('OUT=', OUT)

## 2) Task 1 — Load & Peek (code)

Load `.jsonl` files and quick-check structure.


In [None]:
from pathlib import Path
import json, pandas as pd

# Folder that holds the input logs (the same path the setup cell builds as BASE/'data').
DATA = Path('/content/data')
# File names to load; each one is resolved relative to DATA. Empty until you list your files.
JSONL_FILES = []  # e.g., ['sample1.jsonl', 'sample2.jsonl']

def read_jsonl(path: Path, quarantine=None):
    """Load a JSON Lines file into a list of parsed values.

    No schema is enforced: every non-blank line is parsed with
    ``json.loads`` and kept as-is. Blank lines are skipped. A line that
    cannot be parsed is skipped as well, so one bad record does not abort
    the whole load.

    Args:
        path: File to read; it is opened as UTF-8 text.
        quarantine: Optional list. When provided, the stripped text of each
            line that failed to parse is appended to it, so malformed input
            can be reviewed or written out later instead of vanishing.

    Returns:
        The parsed values, in file order.
    """
    rows = []
    with path.open('r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                # Keep only the parse inside the try so unrelated errors
                # are never mistaken for malformed input.
                record = json.loads(line)
            except (ValueError, RecursionError):
                # json.JSONDecodeError is a ValueError; RecursionError covers
                # pathologically nested input. Both mean "bad line".
                if quarantine is not None:
                    quarantine.append(line)
            else:
                rows.append(record)
    return rows

def load_many_jsonl(files):
    """Read several JSONL files from ``DATA`` into one DataFrame.

    Each name in ``files`` is looked up under ``DATA``. A file that does not
    exist is reported with a ``[skip]`` line and ignored. The rows of all
    files that were found are stacked in the order given; when no rows were
    collected an empty DataFrame is returned.
    """
    combined = []
    for fname in files:
        src = DATA / fname
        if src.exists():
            combined += read_jsonl(src)
        else:
            print(f"[skip] {src}")
    if not combined:
        return pd.DataFrame()
    return pd.DataFrame(combined)

# Read every listed file into the single DataFrame used by the rest of the notebook.
df = load_many_jsonl(JSONL_FILES)

# Optional AskSage filter: when a 'model' column exists, rows for any other
# model are dropped.
if 'model' in df.columns and not df.empty:
    df = df.loc[df['model'].isin(['gpt-5', 'gpt-5-mini'])].copy()

# Quick structural peek: shape, column names, first five rows.
print('rows =', df.shape[0], 'cols =', df.shape[1])
print('cols  =', df.columns.tolist())
display(df.head())

# TODOs:
# - TODO: add an assertion only after you find the schema
# - TODO: list any rows missing key fields and decide whether to drop or fix them
# - TODO: persist a small "quarantine" .jsonl of malformed lines


## 3) Task 2 — Metrics (code)

Compute latency stats, token counts, and cost.


In [None]:
import pandas as pd
import numpy as np

if df.empty:
    print('No data. Load files in Task 1.')
else:
    def p95(s):
        """Return the 95th percentile of ``s`` as a float (NaN when ``s`` has no elements)."""
        if not len(s):
            return float('nan')
        return float(s.quantile(0.95))

    # Coerce the metric columns that are present to numbers; values that
    # cannot be parsed become NaN.
    for col in ('latency_ms', 'input_tokens', 'output_tokens', 'cost_usd'):
        if col not in df.columns:
            continue
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # With no usable cost value at all, fall back to a zero-cost column.
    if not ('cost_usd' in df.columns and df['cost_usd'].notna().any()):
        df['cost_usd'] = 0.0
        # TODO: compute cost from AskSage pricing (gpt-5, gpt-5-mini)

    # Overall latency distribution (count, mean, std, min, median, p95, max).
    if 'latency_ms' in df.columns:
        lat_stats = df['latency_ms'].describe(percentiles=[0.5, 0.95])
        print('Latency:')
        display(lat_stats)

    # Per-model breakdown: call count, median and p95 latency, total cost.
    if all(name in df.columns for name in ('model', 'latency_ms', 'cost_usd')):
        by_model = df.groupby('model').agg(
            n=('model', 'size'),
            lat_med=('latency_ms', 'median'),
            lat_p95=('latency_ms', p95),
            cost_sum=('cost_usd', 'sum'),
        )
        print('By model:')
        display(by_model)

    # Average cost per row, scaled to 1,000 interactions.
    if len(df) > 0 and 'cost_usd' in df.columns:
        total_cost = float(df['cost_usd'].sum())
        per_1k = total_cost / max(1, len(df)) * 1000.0
        print('Cost per 1K interactions:', round(per_1k, 6))

    # TODO: add error_rate if error column exists


## 4) Task 3 — Tiny Dashboard (code)

Make 1 table + 1 chart. Save PNG to `/content/out`.


In [None]:
import pandas as pd, matplotlib.pyplot as plt
from pathlib import Path
# Output folder for the dashboard PNG; created here so this cell can run on its own.
OUT = Path('/content/out')
OUT.mkdir(exist_ok=True, parents=True)

# Guard: with no rows loaded there is nothing to summarize or plot.
if df.empty:
    print('No data.')
else:
    # TODO: build compact summary table

    # TODO: make 1 chart and save PNG
    # Example (left commented out; uncomment and adapt):
    # if 'latency_ms' in df.columns:
    #     plt.figure()
    #     df['latency_ms'].dropna().plot(kind='hist', bins=30, title='Latency (ms)')
    #     plt.xlabel('latency_ms'); plt.ylabel('count')
    #     plt.tight_layout()
    #     p = OUT/'latency_hist.png'
    #     plt.savefig(p); plt.show(); print('Saved:', p)

    # Placeholder so the `else` branch stays valid Python until the TODOs are filled in.
    pass


## 5) Task 4 — Ongoing Hygiene TODOs

Long-term TODOs to add into your codebase.


In [None]:
# TODOs — Logging & Diagnostics (AskSage: gpt-5, gpt-5-mini)

# Logging
# - TODO: add correlation_id per request
# - TODO: log p95 latency per model/hour
# - TODO: record tokens consistently
# - TODO: capture error_code + retries
# - TODO: enforce UTC ISO 8601 timestamps

# Cost
# - TODO: compute cost_usd per call using AskSage pricing
# - TODO: weekly rollup per model

# Dashboards
# - TODO: daily PNG export
# - TODO: alerts for high p95 latency / error_rate

# Validation
# - TODO: schema checks; quarantine bad rows
# - TODO: dedupe by (correlation_id, timestamp)

# Docs
# - TODO: README for fields + logging process


## Deliverables

- One PNG dashboard image in `/content/out/`  
- Notebook with completed code cells for Tasks 1–3


## Exit Ticket

In a final markdown cell, answer:
1) One key metric you computed  
2) One dashboard insight  
3) First hygiene TODO you'll implement
