In [1]:
import os
import json
import pandas as pd
import psycopg2
import psycopg2.extras
from datetime import date

# Absolute paths to the 2022 pollution CSV exports that main() loads.
PM25_PATH = "C:/Users/Krist/Documents/Work/Data Science/projects/Air_Quality/data/raw/pollution/PM2_5_2022.csv"
NO2_PATH  = "C:/Users/Krist/Documents/Work/Data Science/projects/Air_Quality/data/raw/pollution/Nitrogen_Dioxide_2022.csv"

# OpenAQ v3 API root. Not referenced by the code visible in this cell.
BASE_URL = "https://api.openaq.org/v3"

# libpq connection string used by pg_conn().
# SECURITY: credentials should not live in source control. Set the PG_DSN
# environment variable to override the fallback below; the literal is kept
# only so existing local runs keep working. TODO: rotate this password and
# remove the fallback.
PG_DSN = os.environ.get(
    "PG_DSN",
    "dbname=airquality user=postgres password=Milian112! host=localhost port=5432",
)

# Rows dated before this (naive) timestamp are dropped by standardise().
START_DATE = pd.Timestamp("2022-09-01")

def pg_conn():
    """Open and return a new psycopg2 connection built from ``PG_DSN``."""
    connection = psycopg2.connect(PG_DSN)
    return connection

def guess_col(df, candidates):
    """Return the label of the column in *df* that best matches *candidates*.

    Matching is case-insensitive and happens in two passes:

    1. Exact match, tried in *candidates* order (first candidate wins).
    2. Substring match, tried in column order: the first column whose
       lower-cased label contains any candidate is returned.

    Column labels are converted with ``str()`` before comparison, so frames
    with non-string labels (e.g. integer headers) no longer raise
    ``AttributeError``. The original label object is returned unchanged.

    Returns ``None`` when nothing matches.
    """
    wanted = [str(cand).lower() for cand in candidates]

    # Lower-cased label -> original label. If two labels differ only by
    # case, the later column wins.
    by_lower = {str(col).lower(): col for col in df.columns}
    for key in wanted:
        if key in by_lower:
            return by_lower[key]

    # Fallback: substring match. Deliberately loose, so short candidates
    # such as "lat" or "name" can match unrelated columns.
    for col in df.columns:
        lowered = str(col).lower()
        if any(key in lowered for key in wanted):
            return col
    return None

def standardise(df: pd.DataFrame, pollutant: str) -> pd.DataFrame:
    """
    Attempt to standardise a raw pollution CSV frame to the columns:
      pollutant, site_id, site_name, latitude, longitude, date_utc, value,
      units, raw

    Source columns are located heuristically with guess_col(), so the CSV may
    use different column names. Columns that cannot be found are filled with
    None, except date and value, which are required.

    Rows whose date cannot be parsed, or whose date is before START_DATE,
    are dropped. ``raw`` holds each source row as a dict.

    Raises:
        RuntimeError: if no date column or no value column can be detected.
    """
    # Common column possibilities
    c_site_id   = guess_col(df, ["site_id", "siteid", "station_id", "stationid", "location_id", "locationid", "Site"])
    c_site_name = guess_col(df, ["site_name", "sitename", "station_name", "location", "name"])
    c_lat       = guess_col(df, ["latitude", "lat"])
    c_lon       = guess_col(df, ["longitude", "lon", "lng", "long"])
    c_date      = guess_col(df, ["date_utc", "date", "day", "datetime", "period"])
    c_value     = guess_col(df, ["value", "mean", "avg", "concentration"])
    c_units     = guess_col(df, ["units", "unit"])

    if c_date is None or c_value is None:
        raise RuntimeError(
            f"Could not detect required columns in {pollutant} CSV. "
            f"Need at least a date and value column. Found columns: {list(df.columns)[:40]}"
        )

    # Start from df's index. Assigning a scalar to a completely empty
    # DataFrame creates a zero-length column, which is then re-indexed to
    # all-NaN by the first Series assignment -- that silently wiped
    # "pollutant" (and any other scalar-filled column) in the old code.
    out = pd.DataFrame(index=df.index)
    out["pollutant"] = pollutant
    out["site_id"] = df[c_site_id].astype(str) if c_site_id is not None else None
    out["site_name"] = df[c_site_name].astype(str) if c_site_name is not None else None
    out["latitude"] = pd.to_numeric(df[c_lat], errors="coerce") if c_lat is not None else None
    out["longitude"] = pd.to_numeric(df[c_lon], errors="coerce") if c_lon is not None else None

    # Parse date; unparseable values become NaT and are filtered out below.
    d = pd.to_datetime(df[c_date], errors="coerce", utc=True)
    out["date_utc"] = d.dt.date

    out["value"] = pd.to_numeric(df[c_value], errors="coerce")
    out["units"] = df[c_units].astype(str) if c_units is not None else None

    # Keep a raw copy (optional but handy)
    out["raw"] = df.to_dict(orient="records")

    # Filter date
    out = out[out["date_utc"].notna()]
    # .copy() so the site_id assignment below writes to an independent frame
    # rather than a slice of the unfiltered one.
    out = out[pd.to_datetime(out["date_utc"]) >= START_DATE].copy()

    # If site_id is missing, create a deterministic ID from lat/lon + name
    if out["site_id"].isna().all():
        # latitude/longitude are object-dtype None columns when the CSV has
        # no coordinates; coerce to numeric so fillna/round are well defined.
        lat = pd.to_numeric(out["latitude"], errors="coerce").fillna(0).round(5)
        lon = pd.to_numeric(out["longitude"], errors="coerce").fillna(0).round(5)
        out["site_id"] = (
            out["site_name"].fillna("unknown").astype(str)
            + "_"
            + lat.astype(str)
            + "_"
            + lon.astype(str)
        )

    return out

def truncate_tables(conn):
    """Empty the staging and fact tables on *conn*, then commit."""
    statements = (
        "TRUNCATE TABLE stg_uk_air_quality_daily;",
        "TRUNCATE TABLE fact_uk_air_quality_daily;",
    )
    with conn.cursor() as cursor:
        for statement in statements:
            cursor.execute(statement)
    conn.commit()

def insert_stage(conn, df: pd.DataFrame):
    """Bulk-insert the rows of *df* into stg_uk_air_quality_daily and commit.

    *df* must have the columns pollutant, site_id, site_name, latitude,
    longitude, date_utc, value, units and raw (a dict per row).

    Missing values (NaN/NaT/NA) are converted to None so they are stored as
    SQL NULL rather than the float 'NaN', and so the serialised ``raw``
    payload contains ``null`` instead of the bare ``NaN`` token, which is
    not valid JSON.
    NOTE(review): assumes ``raw`` is a json/jsonb column -- confirm against
    the table definition.
    """
    scalar_fields = (
        "pollutant",
        "site_id",
        "site_name",
        "latitude",
        "longitude",
        "date_utc",
        "value",
        "units",
    )

    def _null_if_missing(value):
        # is_scalar guard: pd.isna on a list/array returns an array, which
        # has no single truth value.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return None
        return value

    rows = []
    for r in df.itertuples(index=False):
        raw = r.raw
        if isinstance(raw, dict):
            raw = {key: _null_if_missing(val) for key, val in raw.items()}
        rows.append(
            tuple(_null_if_missing(getattr(r, name)) for name in scalar_fields)
            + (json.dumps(raw),)
        )

    sql = """
    INSERT INTO stg_uk_air_quality_daily
      (pollutant, site_id, site_name, latitude, longitude, date_utc, value, units, raw)
    VALUES %s;
    """

    with conn.cursor() as cur:
        psycopg2.extras.execute_values(cur, sql, rows, page_size=5000)
    conn.commit()

def build_fact(conn):
    """
    Populate fact_uk_air_quality_daily from the staging table and commit.

    Staging rows with a non-null site_id and date_utc are grouped by
    (pollutant, site_id, date_utc). Every other column takes the MAX over the
    group, which gives a simple deterministic pick when duplicates exist, and
    geom is a point in SRID 4326 built from the chosen longitude/latitude
    (NULL if either is missing). A key already present in the fact table is
    overwritten and its loaded_at is refreshed.
    """
    upsert_sql = """
            INSERT INTO fact_uk_air_quality_daily
              (pollutant, site_id, date_utc, value, units, latitude, longitude, site_name, geom)
            SELECT
              pollutant,
              site_id,
              date_utc,
              MAX(value) AS value,
              MAX(units) AS units,
              MAX(latitude) AS latitude,
              MAX(longitude) AS longitude,
              MAX(site_name) AS site_name,
              CASE
                WHEN MAX(latitude) IS NOT NULL AND MAX(longitude) IS NOT NULL
                THEN ST_SetSRID(ST_MakePoint(MAX(longitude), MAX(latitude)), 4326)
                ELSE NULL
              END AS geom
            FROM stg_uk_air_quality_daily
            WHERE site_id IS NOT NULL
              AND date_utc IS NOT NULL
            GROUP BY pollutant, site_id, date_utc
            ON CONFLICT (pollutant, site_id, date_utc) DO UPDATE SET
              value=EXCLUDED.value,
              units=EXCLUDED.units,
              latitude=EXCLUDED.latitude,
              longitude=EXCLUDED.longitude,
              site_name=EXCLUDED.site_name,
              geom=EXCLUDED.geom,
              loaded_at=now();
        """
    with conn.cursor() as cursor:
        cursor.execute(upsert_sql)
    conn.commit()

def main():
    """Load the PM2.5 and NO2 CSVs into Postgres.

    Steps: read both CSVs, standardise them, truncate the staging and fact
    tables, insert both frames into staging, then rebuild the fact table.

    NOTE(review): the recorded run of this cell failed in read_csv with
    "ParserError: ... Expected 1 fields in line 4, saw 292". That suggests
    the files start with non-tabular lines before the real header; read_csv
    probably needs ``skiprows``/``header`` set to match -- confirm against
    the actual files.
    """
    pm25_raw = pd.read_csv(PM25_PATH, low_memory=False)
    no2_raw  = pd.read_csv(NO2_PATH, low_memory=False)

    pm25 = standardise(pm25_raw, "pm25")
    no2  = standardise(no2_raw, "no2")

    print("PM2.5 rows after filtering:", len(pm25))
    print("NO2 rows after filtering:", len(no2))

    conn = pg_conn()
    try:
        # `with conn` only scopes a transaction (commit on success, rollback
        # on error); it does not close a psycopg2 connection, hence the
        # explicit close() in `finally`.
        with conn:
            # If you truly want a clean reload each time:
            truncate_tables(conn)

            insert_stage(conn, pm25)
            insert_stage(conn, no2)

            build_fact(conn)
    finally:
        conn.close()

    print("Done loading fact_uk_air_quality_daily")

# Entry point: run the full load when this file is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()

ParserError: Error tokenizing data. C error: Expected 1 fields in line 4, saw 292
