In [None]:


import os
from pathlib import Path

import numpy as np
import pandas as pd
import os

# Data root: honour a HABNETIC_DATA value that is already set in the
# environment and only fall back to the local default when it is missing.
# (An unconditional assignment would clobber the user's configuration and
# make the guard below unreachable.)
os.environ.setdefault("HABNETIC_DATA", r"C:\Users\C.Price\Habnetic\data")


# Reproducibility
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

DATA_ROOT = os.environ.get("HABNETIC_DATA")
# Treat an empty value like a missing one: Path("") would silently resolve
# against the current working directory.
if not DATA_ROOT:
    raise KeyError("HABNETIC_DATA not set. In PowerShell: $env:HABNETIC_DATA='C:\\Users\\C.Price\\Habnetic\\data'")

path = Path(DATA_ROOT) / "processed" / "RTM" / "priors" / "building_water_proximity.parquet"
print("Reading:", path)

df = pd.read_parquet(path)

# Use a manageable sample first
N = 2_000   # start small
# Cap the request at the table size: DataFrame.sample raises ValueError when
# n exceeds the number of rows (sampling without replacement).
df_s = df.sample(n=min(N, len(df)), random_state=RANDOM_SEED).copy()

df_s.head(), df_s.shape



In [None]:
# Safer transforms: log-scale the raw proximity features, then z-score them.
eps = 1e-9  # additive floor so log() stays finite when a density is exactly 0

# Distance (meters). +1 avoids log(0) and keeps interpretation sane
# (a distance of 0 maps to log(1) = 0).
df_s["x_d"] = np.log(df_s["dist_to_water_m"].clip(lower=0) + 1.0)

# Densities. Add eps to avoid log(0); negative inputs are clipped to 0 first.
radii_m = [250, 500, 1000]
for r in radii_m:
    col = f"water_len_density_{r}m"
    df_s[f"x_{r}"] = np.log(df_s[col].clip(lower=0) + eps)

# Optional: standardize predictors so beta priors behave nicely.
# The column list is derived from the radii so it cannot drift out of sync
# with the loop above.
X_cols = ["x_d"] + [f"x_{r}" for r in radii_m]
X = df_s[X_cols].to_numpy()

# Fail fast on missing / non-finite inputs: mean() and std() propagate NaN,
# so a single bad value would silently turn a whole standardized column NaN.
finite_by_col = np.isfinite(X).all(axis=0)
if not finite_by_col.all():
    bad_cols = [c for c, ok in zip(X_cols, finite_by_col) if not ok]
    raise ValueError(
        f"Non-finite values in predictors {bad_cols}; "
        "clean or impute the raw columns before standardizing."
    )

X_mean = X.mean(axis=0)
X_std = X.std(axis=0)
X_std[X_std == 0] = 1.0  # constant column: avoid dividing by zero

Xz = (X - X_mean) / X_std

# Overwrite the log features with their standardized versions; X_mean and
# X_std are kept around for export / interpretation.
df_s[X_cols] = Xz
df_s[X_cols].describe()

In [None]:
import pymc as pm
import arviz as az

# Named coordinates: one entry per sampled row ("obs") and one per predictor
# column ("feature"), so model variables below can declare their dims by name.
coords = {"obs": np.arange(len(df_s)), "feature": X_cols}

# Design matrix, read from df_s in X_cols order.
X_data = df_s[X_cols].to_numpy()

with pm.Model(coords=coords) as model:
    X_pm = pm.MutableData("X", X_data, dims=("obs", "feature"))
    
    # Intercept and per-feature slopes, each with a Normal(0, 1) prior.
    alpha = pm.Normal("alpha", mu=0.0, sigma=1.0)
    beta  = pm.Normal("beta",  mu=0.0, sigma=1.0, dims=("feature",))
    
    # Linear predictor alpha + X @ beta, registered under the name "mu" with
    # one value per observation.
    mu = pm.Deterministic("mu", alpha + pm.math.dot(X_pm, beta), dims=("obs",))
    
    # Identification / anchoring: fixed observation noise
    # This makes it a proper probabilistic regression-like scoring model.
    # NOTE(review): `observed` is a vector of zeros, not measured data, so the
    # likelihood term scores mu against 0 for every row -- confirm this is the
    # intended anchoring.
    sigma = 1.0
    y = pm.Normal("E", mu=mu, sigma=sigma, observed=np.zeros(len(df_s)), dims=("obs",))
    
    # 2 chains x 500 draws after 500 tuning steps, seeded with RANDOM_SEED.
    idata = pm.sample(
        draws=500,
        tune=500,
        chains=2,
        cores=2,
        target_accept=0.9,
        random_seed=RANDOM_SEED
    )


# Summary table restricted to the intercept and slopes.
az.summary(idata, var_names=["alpha", "beta"])

In [None]:
# Trace plots for the intercept and slopes; the trailing semicolon keeps the
# returned axes array out of the cell output.
traced_vars = ["alpha", "beta"]
az.plot_trace(idata, var_names=traced_vars);

# Summary of the slopes, relabelled with the predictor column names
# (X_cols is the same list used for the "feature" coordinate).
beta_post = az.summary(idata, var_names=["beta"]).set_axis(X_cols, axis=0)
beta_post


In [None]:
# Posterior draws of the linear predictor; dims are (chain, draw, obs).
mu_post = idata.posterior["mu"]

# Collapse the two sampling dimensions to get one mean / spread per observation.
sample_dims = ("chain", "draw")
mu_mean = mu_post.mean(dim=sample_dims).to_numpy()
mu_sd = mu_post.std(dim=sample_dims).to_numpy()

# Identify rows by the "fid" column when present, otherwise by the frame index.
if "fid" in df_s.columns:
    row_ids = df_s["fid"].to_numpy()
else:
    row_ids = df_s.index.to_numpy()

out = pd.DataFrame({"fid": row_ids, "E_hat": mu_mean, "E_sd": mu_sd})

# Persist (the output directory is relative to the current working directory).
# NOTE(review): the file name says "sample20k" while N = 2_000 rows are
# sampled upstream -- confirm which one is intended before relying on the name.
out_dir = Path("outputs", "rtm")
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / "water_exposure_latent_v0_sample20k.parquet"
out.to_parquet(out_path, index=False)

out_path, out.describe()
