In [1]:
import json
import os
import random
import re
import time

import pandas as pd
import psycopg2
import psycopg2.extras
import requests

In [2]:
# =====================
# CONFIG
# =====================
# Source CSV. Set SCHOOLS_CSV_PATH to run against a copy stored elsewhere;
# the default keeps the original location.
CSV_PATH = os.environ.get(
    "SCHOOLS_CSV_PATH",
    "C:/Users/Krist/Documents/Work/Data Science/projects/Air_Quality/data/raw/schools/edubasealldata20260109_edited.csv",
)

# libpq connection string. No password is stored in the notebook: either put a
# complete DSN in the PG_DSN environment variable, or keep this passwordless
# default and let libpq pick the password up from PGPASSWORD or its password
# file (~/.pgpass, or %APPDATA%\postgresql\pgpass.conf on Windows).
PG_DSN = os.environ.get(
    "PG_DSN",
    "dbname=airquality user=postgres host=localhost port=5432",
)

# Only reachable when PG_DSN is explicitly set to an empty string.
if not PG_DSN:
    raise RuntimeError("Set PG_DSN in the environment to a libpq connection string.")

# postcodes.io bulk lookup endpoint (POST a JSON body of postcodes).
POSTCODES_IO_BULK = "https://api.postcodes.io/postcodes"

# One UK postcode: outward code, optional whitespace, inward code.
# Case-insensitive; group 1 is the whole postcode.
UK_POSTCODE_RE = re.compile(
    r"\b([A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2})\b", re.IGNORECASE
)

In [3]:
# =====================
# HELPERS
# =====================
def pg_conn():
    """Open a new psycopg2 connection using the module-level ``PG_DSN``.

    Returns:
        The connection object produced by ``psycopg2.connect``. The caller
        owns it and is responsible for closing it.
    """
    connection = psycopg2.connect(PG_DSN)
    return connection

def normalize_postcode(pc):
    """Return ``pc`` as an upper-case "OUTWARD INWARD" UK postcode, or None.

    Args:
        pc: Raw postcode value, typically a string or a missing marker
            (None / NaN / pd.NA) coming from a DataFrame column.

    Returns:
        The postcode with all whitespace removed and a single space inserted
        before the final three characters, or None when the value is missing,
        shorter than five characters, or does not match ``UK_POSTCODE_RE``
        in its entirety.
    """
    # Test for None / NaN / pd.NA before any truthiness check: bool(pd.NA) raises.
    if pc is None or pd.isna(pc):
        return None

    # str.split() with no argument splits on every kind of whitespace, so tabs
    # and non-breaking spaces are removed as well as ordinary spaces.
    compact = "".join(str(pc).upper().split())
    if len(compact) < 5:
        return None

    candidate = f"{compact[:-3]} {compact[-3:]}"
    # fullmatch, not match: match() only anchors the start, so a string with a
    # valid-looking prefix and trailing junk (e.g. "M11AA 1AA") used to pass.
    return candidate if UK_POSTCODE_RE.fullmatch(candidate) else None

In [4]:
# =====================
# MAIN LOAD
# =====================
def _sql_value(value):
    """Return None for pandas/NumPy missing markers (None, NaN, pd.NA), else ``value`` unchanged."""
    return None if pd.isna(value) else value


def main():
    """Load open schools from ``CSV_PATH``, geocode their postcodes and upsert them into ``dim_schools``.

    Steps:
        1. Read the CSV and keep rows whose "EstablishmentStatus (name)" is "Open".
        2. Derive school_id, school_name, postcode, address, town and local_authority.
        3. Geocode the unique postcodes with ``bulk_geocode_postcodes`` (disk-cached).
        4. Upsert one row per school, keyed on ``school_id``, in a single transaction.

    ``bulk_geocode_postcodes`` must be defined before this function is called.
    """
    df = pd.read_csv(CSV_PATH, low_memory=False)
    print("Rows loaded:", len(df))

    # ---- Filter to OPEN schools ----
    # NOTE(review): the previous comment said "in England", but only the status
    # is filtered here -- confirm the input CSV is already England-only.
    # .copy() gives an independent frame so the column assignments below do not
    # write into a slice of the original (SettingWithCopyWarning).
    df = df[df["EstablishmentStatus (name)"] == "Open"].copy()
    print("Open schools:", len(df))

    # ---- Build clean fields ----
    df["school_id"] = df["URN"].astype(str)
    df["school_name"] = df["EstablishmentName"]

    df["postcode"] = df["Postcode"].apply(normalize_postcode)

    # "Street, Locality", with the separator stripped when either part is missing.
    df["address"] = (
        df["Street"].fillna("") + ", " +
        df["Locality"].fillna("")
    ).str.strip(", ")

    df["town"] = df["Town"]
    df["local_authority"] = df["LA (name)"]

    # ---- Geocode ----
    postcodes = sorted(df["postcode"].dropna().unique())
    print("Unique postcodes:", len(postcodes))

    geo = bulk_geocode_postcodes(
        postcodes=postcodes,
        cache_path="postcode_cache.json",
        chunk_size=50
    )

    # ---- Prepare rows ----
    columns = ["school_id", "school_name", "postcode", "address", "town", "local_authority"]
    rows = []
    for record in df[columns].itertuples(index=False, name=None):
        # NaN in a text column would otherwise be sent to PostgreSQL as a float
        # NaN instead of NULL.
        school_id, school_name, postcode, address, town, local_authority = map(_sql_value, record)

        g = geo.get(postcode)
        lat = g.get("lat") if g else None
        lon = g.get("lon") if g else None
        # Compare against None rather than truthiness: 0.0 is a valid coordinate.
        has_point = lat is not None and lon is not None
        geom = f"SRID=4326;POINT({lon} {lat})" if has_point else None

        rows.append((
            school_id,
            school_name,
            postcode,
            address,
            town,
            local_authority,
            lat,
            lon,
            geom
        ))

    # ---- Upsert ----
    sql = """
    INSERT INTO dim_schools
      (school_id, school_name, postcode, address, town,
       local_authority, lat, lon, geom)
    VALUES %s
    ON CONFLICT (school_id) DO UPDATE SET
      school_name=EXCLUDED.school_name,
      postcode=EXCLUDED.postcode,
      address=EXCLUDED.address,
      town=EXCLUDED.town,
      local_authority=EXCLUDED.local_authority,
      lat=EXCLUDED.lat,
      lon=EXCLUDED.lon,
      geom=EXCLUDED.geom,
      loaded_at=now();
    """

    # The last placeholder is wrapped so the EWKT text (or NULL) becomes a geometry.
    template = "(%s,%s,%s,%s,%s,%s,%s,%s,ST_GeomFromEWKT(%s))"

    conn = pg_conn()
    try:
        # `with conn` commits on success and rolls back on error, but it does
        # NOT close the connection -- hence the explicit close() below.
        with conn:
            with conn.cursor() as cur:
                psycopg2.extras.execute_values(
                    cur, sql, rows, template=template, page_size=2000
                )
    finally:
        conn.close()

    print("dim_schools loaded.")

In [5]:
# The endpoint URL (POSTCODES_IO_BULK) is defined once in the CONFIG cell; the
# duplicate assignment that used to sit here was removed so the two cannot drift.

def _load_geocode_cache(cache_path: str) -> dict:
    """Read the JSON cache at ``cache_path``.

    Returns {} when the file is missing, empty, unreadable, not valid JSON,
    or does not contain a JSON object.
    """
    if not os.path.exists(cache_path):
        return {}
    try:
        with open(cache_path, "r", encoding="utf-8") as f:
            content = f.read().strip()
        loaded = json.loads(content) if content else {}
    except (ValueError, OSError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return {}
    return loaded if isinstance(loaded, dict) else {}


def _save_geocode_cache(cache: dict, cache_path: str) -> None:
    """Write ``cache`` to ``cache_path`` via a temp file and an atomic rename.

    An interrupted run can therefore never leave a half-written cache behind.
    """
    tmp_path = f"{cache_path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(cache, f)
    os.replace(tmp_path, cache_path)


def _post_postcode_chunk(session, chunk: list[str], max_retries: int) -> dict:
    """POST one chunk to postcodes.io, retrying transient failures with exponential backoff.

    Raises:
        RuntimeError: when every attempt hit a retryable status or connection error.
        requests.exceptions.HTTPError: for any other non-2xx status (not retried).
    """
    last_err = None
    for attempt in range(1, max_retries + 1):
        try:
            r = session.post(POSTCODES_IO_BULK, json={"postcodes": chunk}, timeout=60)

            # anything other than rate limiting / transient server errors is final
            if r.status_code not in (429, 500, 502, 503, 504):
                r.raise_for_status()
                return r.json()

            last_err = (r.status_code, r.text[:200])

        except (requests.exceptions.ConnectionError,
                requests.exceptions.ChunkedEncodingError,
                requests.exceptions.Timeout) as e:
            last_err = str(e)

        # 1s, 2s, 4s, ... capped at 60s, plus jitter; no sleep after the final attempt.
        if attempt < max_retries:
            time.sleep(min(60, (2 ** (attempt - 1)) + random.random()))

    raise RuntimeError(f"postcodes.io failed after retries. last_error={last_err}")


def bulk_geocode_postcodes(
    postcodes: list[str],
    cache_path: str = "postcode_cache.json",
    chunk_size: int = 50,          # smaller than 100 to reduce resets
    base_sleep_s: float = 0.20,    # polite pacing
    max_retries: int = 6
) -> dict[str, dict]:
    """
    Bulk geocode postcodes via postcodes.io with:
    - chunking (default 50, capped at 100 per request)
    - retries + exponential backoff on transient failures
    - on-disk cache to resume runs

    Args:
        postcodes: Postcodes to look up; falsy entries and duplicates are skipped.
        cache_path: JSON file used to persist results between runs.
        chunk_size: Postcodes per request; must be >= 1.
        base_sleep_s: Pause between consecutive requests, in seconds.
        max_retries: Attempts per chunk before giving up.

    Returns:
        The full cache: postcode -> {"lat", "lon", "admin_district", "region"}.
        It can hold entries from earlier runs beyond ``postcodes``; postcodes
        the API returned no result for are absent.

    Raises:
        ValueError: if ``chunk_size`` is less than 1.
        RuntimeError: if a chunk still fails after ``max_retries`` attempts.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")
    # the bulk endpoint takes at most 100 postcodes per request
    chunk_size = min(chunk_size, 100)

    cache = _load_geocode_cache(cache_path)

    # only request those not already cached (dict.fromkeys de-duplicates, keeping order)
    todo = list(dict.fromkeys(pc for pc in postcodes if pc and pc not in cache))
    print(f"Geocode total={len(postcodes)} | cached={len(cache)} | remaining={len(todo)}")

    if not todo:
        # nothing to fetch: no session, no network, no cache rewrite
        return cache

    with requests.Session() as session:
        session.headers.update({"Content-Type": "application/json"})

        for i in range(0, len(todo), chunk_size):
            chunk = todo[i:i + chunk_size]
            payload = _post_postcode_chunk(session, chunk, max_retries)
            results = payload.get("result", []) or []

            for item in results:
                q = item.get("query")
                res = item.get("result")
                if not q or not res:
                    continue

                # store minimal fields needed for your project
                cache[q] = {
                    "lat": res.get("latitude"),
                    "lon": res.get("longitude"),
                    "admin_district": res.get("admin_district"),
                    "region": res.get("region"),
                }

            # save cache every chunk so you can resume safely
            _save_geocode_cache(cache, cache_path)

            done = i + len(chunk)
            if (i // chunk_size + 1) % 10 == 0:
                print(f"Progress: {done}/{len(todo)} postcodes requested...")

            # pace requests, but don't wait after the final chunk
            if done < len(todo):
                time.sleep(base_sleep_s)

    return cache

In [6]:
# =====================
# Run the full load only when this code executes as the top-level module
# (__name__ == "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

Rows loaded: 52201
Open schools: 27165
Unique postcodes: 25478
Geocode total=25478 | cached=749 | remaining=24729
Progress: 500/24729 remaining...
Progress: 1000/24729 remaining...
Progress: 1500/24729 remaining...
Progress: 2000/24729 remaining...
Progress: 2500/24729 remaining...
Progress: 3000/24729 remaining...
Progress: 3500/24729 remaining...
Progress: 4000/24729 remaining...
Progress: 4500/24729 remaining...
Progress: 5000/24729 remaining...
Progress: 5500/24729 remaining...
Progress: 6000/24729 remaining...
Progress: 6500/24729 remaining...
Progress: 7000/24729 remaining...
Progress: 7500/24729 remaining...
Progress: 8000/24729 remaining...
Progress: 8500/24729 remaining...
Progress: 9000/24729 remaining...
Progress: 9500/24729 remaining...
Progress: 10000/24729 remaining...
Progress: 10500/24729 remaining...
Progress: 11000/24729 remaining...
Progress: 11500/24729 remaining...
Progress: 12000/24729 remaining...
Progress: 12500/24729 remaining...
Progress: 13000/24729 remaining