In [1]:
import pandas as pd
import json
from pathlib import Path

# ---------------------------------------------------------------------------
# Background
#   JMP handwashing indicators are not reported for the UK in household
#   surveys. UK SILC 2018 does report access to a 'bathing facility' (99.7%),
#   and this project adopts that figure as the UK hygiene proxy.
# ---------------------------------------------------------------------------

# Output locations (CSV for the data row, JSON for its metadata)
OUT = "../data/processed/hygiene_uk.csv"
META_OUT = "../data/processed/hygiene_uk_meta.json"

# Create the target folder up front; this is a no-op when it already exists.
Path("../data/processed").mkdir(exist_ok=True, parents=True)

# Tunable parameters -- edit here if the proxy or survey year changes
COUNTRY = "United Kingdom"
YEAR = 2018
INDICATOR = "bathing_facility"
VALUE = 99.7  # percent of households with bathing facility

# Single-row tidy table: one column per field, one observation
hyg_df = pd.DataFrame(
    {
        "country": [COUNTRY],
        "year": [YEAR],
        "indicator": [INDICATOR],
        "value": [VALUE],
    }
)

# Display the result
hyg_df

Unnamed: 0,country,year,indicator,value
0,United Kingdom,2018,bathing_facility,99.7


In [2]:
# 1) Ensure correct types
#    errors="coerce" converts anything unparseable into a missing value instead
#    of raising, so the checks below must reject missing values explicitly.
hyg_df["year"]  = pd.to_numeric(hyg_df["year"], errors="coerce").astype("Int64")
hyg_df["value"] = pd.to_numeric(hyg_df["value"], errors="coerce")

# 2) Basic sanity checks
assert hyg_df.shape[0] == 1, "Expected exactly one UK hygiene row."
# Catch a coerced-away year here, before anything is written to disk.
assert hyg_df["year"].notna().all(), "Year is missing or not numeric."
# Report a non-numeric value as such rather than as an out-of-range percentage.
assert hyg_df["value"].notna().all(), "Value is missing or not numeric."
assert 0.0 <= float(hyg_df.loc[0, "value"]) <= 100.0, "Value must be a percentage between 0 and 100."
assert str(hyg_df.loc[0, "indicator"]) == "bathing_facility", "Indicator should be 'bathing_facility'."

# 3) Final tidy order
hyg_df = hyg_df[["country", "year", "indicator", "value"]]

# 4) Preview
hyg_df

Unnamed: 0,country,year,indicator,value
0,United Kingdom,2018,bathing_facility,99.7


In [3]:
# 1) Save the processed dataset for DB/API ingestion
hyg_df.to_csv(OUT, index=False)

# 2) Assemble concise metadata (to mirror CPI/PIP meta files)
#    The value and year are read back from the DataFrame and converted to
#    built-in float/int before being serialised.
meta = {
    "source_summary": "UK SILC 2018 via WHO/UNICEF JMP (household bathing facility access)",
    "country": COUNTRY,
    "indicator": INDICATOR,
    "units": "percent of households with bathing facility",
    "value": float(hyg_df.loc[0, "value"]),
    "year": int(hyg_df.loc[0, "year"]),
    "notes": "UK lacks JMP handwashing indicators; bathing facility is used as a hygiene proxy."
}

# 3) Write metadata file
#    The encoding is stated explicitly so the file does not depend on the
#    platform's default text encoding.
with open(META_OUT, "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)

# 4) Return summary
OUT, META_OUT, hyg_df, meta

('../data/processed/hygiene_uk.csv',
 '../data/processed/hygiene_uk_meta.json',
           country  year         indicator  value
 0  United Kingdom  2018  bathing_facility   99.7,
 {'source_summary': 'UK SILC 2018 via WHO/UNICEF JMP (household bathing facility access)',
  'country': 'United Kingdom',
  'indicator': 'bathing_facility',
  'units': 'percent of households with bathing facility',
  'value': 99.7,
  'year': 2018,
  'notes': 'UK lacks JMP handwashing indicators; bathing facility is used as a hygiene proxy.'})

In [4]:
# 1) Reload to ensure schema and types are as expected
check = pd.read_csv(OUT)
assert list(check.columns) == ["country", "year", "indicator", "value"], "Unexpected columns after save."
assert check.shape[0] == 1, "Expected a single row after save."
# A missing year is written as an empty field and read back as float NaN,
# so an integer dtype here also confirms that the year is present.
assert pd.api.types.is_integer_dtype(check["year"]), "Year should reload as an integer column."
assert pd.api.types.is_numeric_dtype(check["value"]), "Value should reload as a numeric column."
assert check["value"].notna().all(), "Value should not be missing after reload."

# 2) Show the reloaded DataFrame
check

Unnamed: 0,country,year,indicator,value
0,United Kingdom,2018,bathing_facility,99.7
