In [5]:
import os
import re

import pandas as pd
import psycopg2

# ======================
# PATHS (EDIT IF NEEDED)
# ======================
# Each setting can be overridden through an environment variable so the script
# runs on another machine without editing this file; the literals below are
# only the fallback defaults.
EDUBASE_PATH = os.environ.get(
    "AIRQ_EDUBASE_PATH",
    r"C:/Users/Krist/Documents/Work/Data Science/projects/Air_Quality/data/raw/schools/edubasealldata20260109.csv",
)
OUT_PATH = os.environ.get(
    "AIRQ_TABLEAU_OUT_PATH",
    r"C:/Users/Krist/Documents/Work/Data Science/projects/Air_Quality/data/csv/tableau_school_exposure.csv",
)

# NOTE(review): the fallback DSN embeds a database password in source. Prefer
# setting AIRQ_PG_DSN and removing (and rotating) the inline credential.
PG_DSN = os.environ.get(
    "AIRQ_PG_DSN",
    "dbname=airquality user=postgres password=Milian112! host=localhost port=5432",
)

# ======================
# HELPERS
# ======================
def norm_postcode(pc):
    """Return a postcode as an upper-case join key with all whitespace removed.

    Missing input (``None`` or anything ``pd.isna`` reports as NA) yields
    ``None``; every other value is converted with ``str`` first.
    """
    is_missing = pc is None or pd.isna(pc)
    if is_missing:
        return None

    upper_cased = str(pc).strip().upper()
    # Remove interior whitespace too, so "SW1A 1AA" and "SW1A1AA" share a key.
    return re.sub(r"\s+", "", upper_cased)

def norm_name(name):
    """Return a school name as a lower-case join key of ASCII letters and digits.

    Missing input (``None`` or anything ``pd.isna`` reports as NA) yields
    ``None``. Otherwise each run of characters outside ``a-z0-9`` becomes a
    single space and surrounding spaces are trimmed.
    """
    if name is None or pd.isna(name):
        return None

    lowered = str(name).strip().lower()
    # Punctuation and any other non-alphanumeric characters act as separators.
    separated = re.sub(r"[^a-z0-9]+", " ", lowered)
    collapsed = re.sub(r"\s+", " ", separated)
    return collapsed.strip()

# ======================
# LOAD DATA
# ======================
def fetch_mart_school_air_exposure():
    """Load the ``mart_school_air_exposure`` table from PostgreSQL.

    Opens a connection using the module-level ``PG_DSN``, runs a single
    SELECT over the columns listed in the query, and closes the connection
    before returning.

    Returns:
        pandas.DataFrame: the query result, one column per selected field.
    """
    sql = """
    SELECT
        school_id,
        school_name,
        postcode,
        town,
        local_authority,
        lat,
        lon,
        pm25_exposure,
        no2_exposure,
        combined_exposure_index,
        confidence_score
    FROM mart_school_air_exposure;
    """
    # psycopg2's ``with connection:`` block only commits or rolls back the
    # transaction; it does not close the connection. Close it explicitly so
    # each call does not leave a session open on the server.
    conn = psycopg2.connect(PG_DSN)
    try:
        return pd.read_sql(sql, conn)
    finally:
        conn.close()

def load_edubase_pupils():
    """Read pupil counts from the Edubase CSV, keyed for joining.

    Reads three columns from the cp1252-encoded file at ``EDUBASE_PATH``,
    derives normalised postcode and name keys, and collapses rows that share
    both keys by keeping the largest pupil count.

    Returns:
        pandas.DataFrame: columns ``pc_key``, ``name_key`` and
        ``numberofpupils``, with one row per distinct key pair.
    """
    # Source column -> output column. Adjust the source names here IF needed.
    column_map = {
        "EstablishmentName": "school_name",
        "Postcode": "postcode",
        "NumberOfPupils": "numberofpupils",
    }

    raw = pd.read_csv(
        EDUBASE_PATH,
        usecols=list(column_map),
        low_memory=False,
        encoding="cp1252",
    )
    keyed = raw.rename(columns=column_map)
    keyed["pc_key"] = keyed["postcode"].apply(norm_postcode)
    keyed["name_key"] = keyed["school_name"].apply(norm_name)

    # Rows missing either key cannot be joined, so drop them before
    # collapsing duplicate key pairs to their maximum pupil count.
    joinable = keyed.dropna(subset=["pc_key", "name_key"])
    by_key = joinable.groupby(["pc_key", "name_key"], as_index=False)
    return by_key["numberofpupils"].max()

# ======================
# MAIN PIPELINE
# ======================
def _flag_pupil_counts(counts):
    """Return a Series of reliability labels aligned with ``counts``.

    Labels are applied in order, so later rules override earlier ones:
    "OK" by default, "Missing" for NA, "Very small roll" below 20 and
    "Implausibly small" below 5.
    """
    flags = pd.Series("OK", index=counts.index)
    flags.loc[counts.isna()] = "Missing"
    flags.loc[counts < 20] = "Very small roll"
    flags.loc[counts < 5] = "Implausibly small"
    return flags


def main():
    """Join school exposure data with Edubase pupil counts and export a CSV.

    Loads the exposure mart and the Edubase pupil counts, left-joins them on
    normalised postcode + school name, nulls out missing or non-positive
    pupil counts, adds a ``pupil_count_flag`` column, and writes the result
    to ``OUT_PATH`` as UTF-8 CSV. Progress is reported with ``print``.
    """
    print("Loading mart_school_air_exposure...")
    schools = fetch_mart_school_air_exposure()
    print(f"Schools loaded: {len(schools)}")

    schools["pc_key"] = schools["postcode"].apply(norm_postcode)
    schools["name_key"] = schools["school_name"].apply(norm_name)

    print("Loading Edubase pupil counts...")
    pupils = load_edubase_pupils()
    print(f"Edubase keyed rows: {len(pupils)}")

    print("Merging exposure + pupils...")
    # Left join keeps every school from the mart, matched or not.
    out = schools.merge(
        pupils,
        how="left",
        on=["pc_key", "name_key"]
    )

    # Treat missing OR zero pupils as NULL (not 0).
    has_positive_count = out["numberofpupils"].notna() & (out["numberofpupils"] > 0)
    out["numberofpupils"] = out["numberofpupils"].where(has_positive_count, None)

    # Flag potentially unreliable pupil counts.
    out["pupil_count_flag"] = _flag_pupil_counts(out["numberofpupils"])

    # Computed after the nulling above, so this is the share of schools that
    # ended up with a usable (positive) pupil count.
    match_rate = out["numberofpupils"].notna().mean() * 100
    print(f"Pupil match rate: {match_rate:.2f}%")

    # Drop internal keys before export
    out = out.drop(columns=["pc_key", "name_key"])

    # Optional: move pupils near the front (second column)
    cols = out.columns.tolist()
    cols.insert(1, cols.pop(cols.index("numberofpupils")))
    out = out[cols]

    # Create the target directory if it is missing, so the export does not
    # fail after all the loading and merging has already been done.
    out_dir = os.path.dirname(OUT_PATH)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Export
    out.to_csv(OUT_PATH, index=False, encoding="utf-8")
    print("Export written to:")
    print(OUT_PATH)

# ======================
# RUN
# ======================
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

Loading mart_school_air_exposure...
Schools loaded: 26601


  return pd.read_sql(sql, conn)


Loading Edubase pupil counts...
Edubase keyed rows: 44777
Merging exposure + pupils...
Pupil match rate: 90.41%
Export written to:
C:/Users/Krist/Documents/Work/Data Science/projects/Air_Quality/data/csv/tableau_school_exposure.csv
