In [1]:
import json
from datetime import timedelta
from html import unescape
from html.parser import HTMLParser

import requests

from utils.datetime import duration_to_minutes
from utils.datetime import parse_iso

In [2]:
# Pretalx JSON export of the schedule (listed on the Pretalx schedule page).
JSON_URL = (
    "https://cfp.pydata.org/pydata-amsterdam-2025/schedule/export/schedule.json"
)

# Destination files for the three JSON outputs produced further down.
out_speakers_description = "data/pydata_amsterdam_2025_speakers_description.json"
out_descriptions = "data/pydata_amsterdam_2025_descriptions.json"
out_schedule = "data/pydata_amsterdam_2025_schedule.json"

In [3]:
class _Stripper(HTMLParser):
    """HTML parser that collects text content and discards all markup.

    Character references are decoded by the parser itself
    (``convert_charrefs=True``). Block-level tags contribute a single space
    so that the text on either side of them is not glued together.
    """

    # Tags that visually separate text when rendered. Inline tags such as
    # <b> or <a> are deliberately absent so "<b>wor</b>ld" stays one word.
    _BLOCK_TAGS = frozenset({
        "address", "article", "aside", "blockquote", "br", "dd", "div", "dl",
        "dt", "figcaption", "figure", "footer", "h1", "h2", "h3", "h4", "h5",
        "h6", "header", "hr", "li", "main", "nav", "ol", "p", "pre",
        "section", "table", "td", "th", "tr", "ul",
    })

    def __init__(self):
        super().__init__(convert_charrefs=True)
        self._buf = []

    def handle_starttag(self, tag, attrs):
        """Emit a separator when a block-level element opens."""
        if tag in self._BLOCK_TAGS:
            self._buf.append(" ")

    def handle_endtag(self, tag):
        """Emit a separator when a block-level element closes."""
        if tag in self._BLOCK_TAGS:
            self._buf.append(" ")

    def handle_data(self, d):
        """Collect a run of text found between tags."""
        self._buf.append(d)

    def get_text(self):
        """Return everything collected so far as one string."""
        return "".join(self._buf)


def strip_html(html: str) -> str:
    """Return the text content of ``html`` with all tags removed.

    Args:
        html: An HTML fragment or plain text. Falsy values (``None``, ``""``)
            yield an empty string.

    Returns:
        The extracted text with leading and trailing whitespace stripped.
        Block-level boundaries appear as spaces; internal whitespace is not
        otherwise collapsed.
    """
    if not html:
        return ""
    p = _Stripper()
    p.feed(html)
    # feed() may hold back trailing text (for example when it ends with a
    # bare "&", as in "R&D") in case more input follows; close() flushes it.
    p.close()
    return p.get_text().strip()

def normalize_ws(s: str) -> str:
    """Collapse every run of whitespace in ``s`` to a single space.

    Leading and trailing whitespace is removed. Falsy input (``None`` or an
    empty string) produces an empty string.
    """
    if not s:
        return ""
    words = s.split()
    return " ".join(words)

In [4]:
# Download the schedule export. Fail loudly on an HTTP error instead of
# trying to parse an error page as if it were the schedule.
response = requests.get(JSON_URL, timeout=30)
response.raise_for_status()
data = response.json()

In [5]:
# Display the raw per-day listing from the export.
conference_raw = data["schedule"]["conference"]
conference_raw["days"]

[{'index': 1,
  'date': '2025-09-24',
  'day_start': '2025-09-24T04:00:00+02:00',
  'day_end': '2025-09-25T03:59:00+02:00',
  'rooms': {'Katherine Johnson @ TNW City': [{'guid': 'dffc7442-d29d-594b-8704-c6d2d7a20702',
     'code': 'BTDWCM',
     'id': 77812,
     'logo': None,
     'date': '2025-09-24T09:00:00+02:00',
     'start': '09:00',
     'duration': '01:30',
     'room': 'Katherine Johnson @ TNW City',
     'slug': 'pydata-amsterdam-2025-77812-next-level-retrieval-in-rag-techniques-and-tools-for-enhanced-performance',
     'url': 'https://cfp.pydata.org/pydata-amsterdam-2025/talk/BTDWCM/',
     'title': 'Next-Level Retrieval in RAG: Techniques and Tools for Enhanced Performance',
     'subtitle': '',
     'track': None,
     'type': 'Tutorial',
     'language': 'en',
     'abstract': 'Retrieval-Augmented Generation (RAG) systems rely heavily on the quality of the retrieval process to generate accurate and contextually relevant outputs. In this 90-minute tutorial, we explore pra

In [6]:
# Walk down the export one level at a time: root -> conference -> days.
export_root = data["schedule"]
schedule = export_root["conference"]
days = schedule["days"]

In [7]:
# Flatten the nested schedule (day -> room -> event) into three structures:
#   rows        : one compact record per session (no bulky text)
#   desc_map    : event id -> title, speakers, type, abstract, description text
#   speaker_map : speaker name -> plain-text biography
rows = []
desc_map = dict()
speaker_map = dict()

for day in days:
    day_date = day.get("date")  # e.g. "2025-09-24"
    for room_name, events in (day.get("rooms") or {}).items():
        for e in events:
            # Core fields (safe defaults)
            event_id = e.get("id")
            title = normalize_ws(e.get("title", ""))
            talk_type = e.get("type", "")

            start_iso = e.get("date")
            start_dt = parse_iso(start_iso)
            # `or` rather than a .get() default, so an explicit null duration
            # also falls back to "00:00".
            duration = e.get("duration") or "00:00"
            dur_min = duration_to_minutes(duration)
            end_dt = (start_dt + timedelta(minutes=dur_min)) if start_dt else None

            # Resolve each person's display name once (public_name, falling
            # back to name) so the schedule rows, the description records and
            # the speaker_map keys all agree. Persons without any name are
            # skipped. `or []` also covers an explicit null "persons" value.
            persons = e.get("persons") or []
            named_persons = []
            for p in persons:
                name = normalize_ws(p.get("public_name") or p.get("name") or "")
                if name:
                    named_persons.append((name, p))
            speakers_list = [name for name, _person in named_persons]
            speakers = ", ".join(speakers_list)

            # Compact schedule record (no bulky text)
            rows.append({
                "day_date": day_date or (start_dt.date().isoformat() if start_dt else ""),
                "start": start_dt.isoformat() if start_dt else "",
                "end": end_dt.isoformat() if end_dt else "",
                "duration_min": dur_min,
                "room": room_name,
                "title": title,
                "type": talk_type,
                "speakers": speakers,
                "event_id": event_id,
            })

            # Description record, keyed by event id
            # NOTE(review): strip_html() already decodes character references,
            # so the unescape() here decodes twice; kept as-is because the
            # feed's encoding is not visible from this notebook — confirm.
            abstract = normalize_ws(e.get("abstract", ""))
            description_html = (e.get("description") or "").strip()
            description_text = normalize_ws(strip_html(unescape(description_html)))

            assert event_id not in desc_map, f"duplicate event id: {event_id!r}"
            desc_map[event_id] = {
                "title": title,
                "speakers": speakers_list,
                "type": talk_type,
                "abstract": abstract,
                "description_text": description_text,
            }

            # --- Speakers mapping (name -> description) -------------------------------
            for name, p in named_persons:
                # Biography can be plain text, markdown, or HTML; normalize to plain text
                bio_raw = p.get("biography") or ""
                bio_text = normalize_ws(strip_html(unescape(bio_raw)))

                # A speaker can appear on several events; never let an empty
                # biography replace one that was already collected.
                if bio_text or name not in speaker_map:
                    speaker_map[name] = bio_text

# Sort for readability
rows.sort(key=lambda r: (r["day_date"], r["start"], r["room"]))

# ---------- write outputs ----------
# NOTE(review): `fieldnames` is not used anywhere in this cell and lists
# columns (track, url, guid) that `rows` does not contain — possibly a
# leftover from a CSV export. Kept in case another cell reads it.
fieldnames = [
    "day_date", "start", "end", "duration_min", "room",
    "title", "type", "track", "speakers", "url", "event_id", "guid"
]
# ensure_ascii=False on every dump so non-ASCII titles and names are written
# as readable UTF-8 in all three files, not only in the speakers file.
with open(out_schedule, "w", encoding="utf-8") as f:
    json.dump(rows, f, ensure_ascii=False, indent=2)

with open(out_descriptions, "w", encoding="utf-8") as f:
    json.dump(desc_map, f, ensure_ascii=False, indent=2)

# Sort keys for readability
speaker_map_sorted = dict(sorted(speaker_map.items(), key=lambda kv: kv[0].lower()))
with open(out_speakers_description, "w", encoding="utf-8") as f:
    json.dump(speaker_map_sorted, f, ensure_ascii=False, indent=2)

print(f"Wrote {len(rows)} sessions to {out_schedule}")
print(f"Wrote {len(desc_map)} description records to {out_descriptions}")
print(f"Wrote {len(speaker_map)} speaker bios to {out_speakers_description}")

Wrote 61 sessions to data/pydata_amsterdam_2025_schedule.json
Wrote 61 description records to data/pydata_amsterdam_2025_descriptions.json
Wrote 82 speaker bios to data/pydata_amsterdam_2025_speakers_description.json
