# Setup

In [12]:
import pandas as pd
from pathlib import Path
import numpy as np
import geopandas as gpd
from scipy.stats import norm


## Create the Accessibility Averages

In [13]:
def aggregate_routing_results(
    parquet_dir: str | Path,
    group_col: str = 'lsoa21cd',
    value_col: str = 'duration_minutes',
    pattern: str = "*.parquet"
) -> pd.DataFrame:
    """
    Aggregate routing results from multiple parquet files into a single DataFrame.
    
    Parameters
    ----------
    parquet_dir : str or Path
        Directory containing parquet files
    group_col : str
        Column to group by (default: 'lsoa21cd')
    value_col : str
        Column to compute mean of (default: 'duration_minutes')
    pattern : str
        Glob pattern for parquet files (default: '*.parquet')
    
    Returns
    -------
    pd.DataFrame
        DataFrame with group_col as index and one column per input file
    """
    parquet_dir = Path(parquet_dir)
    parquet_files = sorted(parquet_dir.glob(pattern))
    
    if not parquet_files:
        raise FileNotFoundError(f"No parquet files found in {parquet_dir} matching '{pattern}'")
    
    aggregated_dfs = []
    
    for path in parquet_files:
        col_name = path.stem  # filename without extension
        df = pd.read_parquet(path)
        
        agg_df = (
            df.groupby(group_col, as_index=False)[value_col]
            .mean()
            .rename(columns={value_col: col_name})
        )
        aggregated_dfs.append(agg_df)
    
    # Merge all dataframes on group_col
    result = aggregated_dfs[0]
    for df in aggregated_dfs[1:]:
        result = result.merge(df, on=group_col, how='outer')
    
    return result

In [3]:
# Build the accessibility table: mean 'duration_minutes' per 'lsoa21cd',
# with one column per parquet file found in data/routing_results/
Accessibility = aggregate_routing_results(
    "data/routing_results/",
    group_col='lsoa21cd',
    value_col='duration_minutes',
    pattern="*.parquet",
)

# Air Quality

In [4]:
def _load_pollutant(path, source_col, label):
    """Read one air-quality parquet file, keeping the area code and one measure renamed to ``label``."""
    frame = pd.read_parquet(path)
    return frame[['LSOA_DZ_SDZ_21_22', source_col]].rename(columns={source_col: label})


# NO2
# NOTE(review): the measure is read from a column named 'nox2024_weighted_mean'
# but labelled NO2 -- confirm this is the intended source column.
no2_path = Path("data/airquality/no2.parquet")
no2 = _load_pollutant(no2_path, 'nox2024_weighted_mean', 'NO2')

# PM10
pm10_path = Path("data/airquality/pm10.parquet")
pm10 = _load_pollutant(pm10_path, 'pm102024g_weighted_mean', 'PM10')

# SO2
so2_path = Path("data/airquality/so2.parquet")
so2 = _load_pollutant(so2_path, 'so22024_weighted_mean', 'SO2')

In [5]:
# Join the three pollutant tables on the area code; inner joins keep only
# areas that are present in all three inputs
air_quality = pd.merge(
    pd.merge(no2, pm10, on="LSOA_DZ_SDZ_21_22", how="inner"),
    so2,
    on="LSOA_DZ_SDZ_21_22",
    how="inner",
)

# Green Space

In [14]:
# Passive greenspace data: keep the area code and the NDVI_median measure
# (renamed to 'greenspace'), restricted to areas that also appear in the
# air-quality table (removes the NI data), with a fresh 0..n-1 index
greenspace_path = Path("data/green_blue/greenspace.parquet")
greenspace = (
    gpd.read_parquet(greenspace_path)[['LSOA_DZ_SDZ_21_22', 'NDVI_median']]
    .rename(columns={'NDVI_median': 'greenspace'})
    .loc[lambda frame: frame['LSOA_DZ_SDZ_21_22'].isin(air_quality['LSOA_DZ_SDZ_21_22'])]
    .reset_index(drop=True)
)

# Gather All the Input Measures

In [22]:
# Give the air-quality and greenspace tables the same key name as Accessibility
lsoa_key = {'LSOA_DZ_SDZ_21_22': 'lsoa21cd'}
aq = air_quality.rename(columns=lsoa_key)
gs = greenspace.rename(columns=lsoa_key)

# Left joins keep every Accessibility row; an area with no air-quality or
# greenspace match gets NaN in those columns
AHAH_input = (
    Accessibility
    .merge(aq, on='lsoa21cd', how='left')
    .merge(gs, on='lsoa21cd', how='left')
)
AHAH_input.columns


Index(['lsoa21cd', 'GP', 'bluespace', 'dentist', 'fast_food', 'gambling',
       'greenspace_active', 'hospital', 'leisure', 'pharmacy', 'pub_bar',
       'tobacco', 'NO2', 'PM10', 'SO2', 'greenspace'],
      dtype='object')

# Create Index

In [23]:


def exp_trans(x, n):
    """
    Exponential transformation for domain scores.

    Scales ``x`` by ``n`` and maps it through ``-23 * ln(1 - r * (1 - e^(-100/23)))``,
    so ``x = 0`` gives 0 and ``x = n`` gives 100, with values growing faster
    towards the top end. Works element-wise on arrays and Series.
    """
    scaled = x / n
    spread = 1 - np.exp(-100 / 23)
    return -23 * np.log(1 - scaled * spread)


def exp_default(x, n):
    """
    Probit transformation for individual indicators.

    Converts ``x`` to the fraction ``(x - 0.5) / n`` and returns the standard
    normal quantile (inverse CDF) of that fraction. Works element-wise on
    arrays and Series.
    """
    fraction = (x - 0.5) / n
    return norm.ppf(fraction)


def calculate_ahah(df):
    """
    Calculate the AHAH index from raw accessibility and environment measures.

    Steps: rank every indicator, probit-transform the ranks, average the
    transformed indicators within each domain, rank the domain scores,
    exponentially transform the domain ranks and average them into the
    composite ``ahah`` score.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the indicator columns GP, dentist, pharmacy, hospital,
        leisure, greenspace, greenspace_active, bluespace, NO2, PM10, SO2,
        fast_food, gambling, pub_bar and tobacco, with no missing values.
        Any other column (e.g. lsoa21cd) is carried through unchanged.
        The input frame is not modified.

    Returns
    -------
    pd.DataFrame
        Copy of the input with added ``<indicator>_rnk`` / ``<indicator>_pct``
        columns, ``domain_{h,g,e,r}`` scores with their ``_rnk`` / ``_pct``,
        and ``ahah``, ``ahah_rnk``, ``ahah_pct``. Every column whose name ends
        in ``_exp`` is dropped before returning.

    Raises
    ------
    KeyError
        If any required indicator column is absent.
    ValueError
        If any indicator column contains missing values. Ranks are stored as
        plain integers, which cannot represent a missing rank.
    """
    # Indicator groups by domain
    health = ["GP", "dentist", "pharmacy", "hospital", "leisure"]
    green_blue = ["greenspace", "greenspace_active", "bluespace"]
    pollutants = ["NO2", "PM10", "SO2"]
    retail = ["fast_food", "gambling", "pub_bar", "tobacco"]
    all_indicators = health + green_blue + pollutants + retail

    # --- INPUT VALIDATION ---
    # Fail early with a message naming the offending columns instead of an
    # opaque integer-cast error from deep inside the ranking step.
    missing = [col for col in all_indicators if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required indicator columns: {missing}")

    null_counts = df[all_indicators].isna().sum()
    incomplete = {col: int(count) for col, count in null_counts.items() if count > 0}
    if incomplete:
        raise ValueError(
            "Indicator columns contain missing values, so integer ranks cannot "
            f"be computed (column: number of missing rows): {incomplete}"
        )

    idx = df.copy()
    n = len(idx)

    # --- RANKING ---
    # True  = rank ascending  (lower value is better: distance to services,
    #         to active greenspace, to bluespace; pollutant concentration)
    # False = rank descending (higher value is better: NDVI greenspace;
    #         distance from retail hazards)
    # Insertion order fixes the order of the *_rnk columns in the output.
    rank_ascending = {
        **{col: True for col in health},
        "greenspace_active": True,
        "greenspace": False,
        "bluespace": True,
        **{col: True for col in pollutants},
        **{col: False for col in retail},
    }
    for col, ascending in rank_ascending.items():
        idx[f"{col}_rnk"] = idx[col].rank(method="min", ascending=ascending).astype(int)

    # --- PROBIT TRANSFORMATION (normalizes ranked indicators) ---
    for col in all_indicators:
        idx[f"{col}_exp"] = exp_default(idx[f"{col}_rnk"], n)

    # --- PERCENTILES (for each indicator) ---
    # Truncated share of the largest rank, so values fall in 0..100; this is a
    # different scheme from the qcut-based 1..100 percentiles used further down.
    for col in all_indicators:
        idx[f"{col}_pct"] = (idx[f"{col}_rnk"] / idx[f"{col}_rnk"].max() * 100).astype(int)

    # --- DOMAIN SCORES (mean of transformed indicators per domain) ---
    domains = {
        "h": health,       # Health
        "g": green_blue,   # Green/Blue
        "e": pollutants,   # Environment
        "r": retail,       # Retail hazards
    }
    for key, members in domains.items():
        idx[f"domain_{key}"] = idx[[f"{col}_exp" for col in members]].mean(axis=1)

    # --- DOMAIN RANKS AND PERCENTILES ---
    for key in domains:
        idx[f"domain_{key}_rnk"] = idx[f"domain_{key}"].rank(method="min").astype(int)
        idx[f"domain_{key}_pct"] = pd.qcut(idx[f"domain_{key}_rnk"], 100, labels=False) + 1

    # --- FINAL AHAH SCORE (exponential transform of domain ranks, then average) ---
    for key in domains:
        idx[f"{key}_exp"] = exp_trans(idx[f"domain_{key}_rnk"], n)

    # Composite AHAH score (equal domain weights)
    idx["ahah"] = idx[[f"{key}_exp" for key in domains]].mean(axis=1)
    idx["ahah_rnk"] = idx["ahah"].rank(method="min").astype(int)
    idx["ahah_pct"] = pd.qcut(idx["ahah_rnk"], 100, labels=False) + 1

    # Remove intermediate transformed columns (anything ending in "_exp")
    exp_cols = [c for c in idx.columns if c.endswith("_exp")]
    idx = idx.drop(columns=exp_cols)

    return idx

# Calculate AHAH index using the prepared input data
AHAH_V5 = calculate_ahah(AHAH_input)
# Bare expression: shows the frame only when it is the last line of a notebook cell
AHAH_V5

# Write the full result table to CSV in the working directory, without the row index
AHAH_V5.to_csv('./AHAH_V5.csv', index=False)


In [24]:
import geopandas as gpd

# Read the boundary table with plain pandas; the code below decodes its
# 'geometry' column from WKB
boundary_path = Path("data/boundary/LSOA_DZ_SDZ_21_22.parquet")
boundary = pd.read_parquet(boundary_path)

# Attach the AHAH results to the boundaries; the inner join keeps only areas
# present in both tables
AHAH_V5_geo = pd.merge(
    boundary,
    AHAH_V5,
    left_on='LSOA_DZ_SDZ_21_22',
    right_on='lsoa21cd',
    how='inner',
)
AHAH_V5_geo

# Decode the WKB geometry, wrap everything in a GeoDataFrame and export as GeoParquet
# NOTE(review): no CRS is passed here, so the exported file presumably carries
# no coordinate reference system -- confirm and set crs= if one is known.
boundary_geometry = gpd.GeoSeries.from_wkb(AHAH_V5_geo['geometry'])
gdf = gpd.GeoDataFrame(AHAH_V5_geo, geometry=boundary_geometry)
gdf.to_parquet("AHAH_V5_geo.parquet")