# Weighted PCA configurations → SumReal labels → Evaluation suite
This notebook demonstrates how to:
- Run multiple weighted PCA configurations with different hyper-parameters,
- Summarize selected PCs into SumReal and derive Quality labels,
- Evaluate each labeling using five tests (suite) and produce two plots,
- Store all results and figures per configuration in artifacts.

In [62]:
# Setup and imports
import os, sys, json, importlib
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

PROJECT_DIR = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(PROJECT_DIR, 'data', 'processed')
ART_DIR = Path(PROJECT_DIR) / 'artifacts' / 'weighted_pca_runs'
SRC_DIR = os.path.join(PROJECT_DIR, 'src')
ART_DIR.mkdir(parents=True, exist_ok=True)
if SRC_DIR not in sys.path:
    sys.path.insert(0, SRC_DIR)

import ecoindex
ecoindex = importlib.reload(ecoindex)
from ecoindex import prepare_feature_matrix, build_groups_from_labels, build_groups_from_quantiles, run_assessment_suite, plot_permanova_null_distribution, plot_directional_null_distribution
from ecoindex.pca_assessment import pca_chemical_assessment, select_pcs_by_weighted_loadings, compute_pollution_scores_with_labels
from ecoindex.chemical_weights import build_weights_for_columns

## Load master data
We use the processed `master_pollution_rank.csv` with MultiIndex (block, subblock, var).

In [63]:
csv_path = os.path.join(DATA_DIR, 'master_pollution_rank.csv')
master = pd.read_csv(csv_path, header=[0,1,2], index_col=0)
master.index.name = 'StationID'
if isinstance(master.columns, pd.MultiIndex) and master.columns.nlevels == 3:
    master.columns = pd.MultiIndex.from_tuples(master.columns, names=['block','subblock','var'])
print('master shape:', master.shape)
master.head(2)

master shape: (104, 106)


block,chemical,chemical,chemical,chemical,chemical,chemical,chemical,chemical,chemical,chemical,chemical,chemical,chemical,chemical,pollution,pollution,pollution,pollution,pollution,pollution,pollution
subblock,raw,raw,raw,raw,raw,raw,raw,raw,raw,raw,...,logz,logz,logz,sumreal_by_logz_chemical,sumreal_by_logz_chemical,sumreal_by_logz_chemical,sumreal_by_logz_chemical,2006_rank,2006_rank,2006_rank
var,1234TCB,1245TCB,Al,As,Bi,Ca,Cd,Co,Cr,Cu,...,ppDDD,ppDDE,total_PCB,SumReal,Quality,SumReal_Shuffled,Quality_Shuffled,Rank,rank,Quality
StationID,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
A10,0.835583,0.775732,3041,1.939,18.45,28170,0.295,2.723,8.766,17.64,...,-0.190979,-0.3432,-0.397416,5.989604,medium,-4.07984,reference,54.0,14.0,reference
A23,0.639983,0.697265,4483,2.512,17.03,42110,0.3986,4.009,10.85,17.28,...,0.07759,0.137696,0.36541,6.911119,degraded,7.844743,degraded,79.0,38.0,medium


## Helper: filter selected PCs by cumulative explained variance
Keeps the order from selection and stops when threshold is met.

In [64]:
def filter_selected_pcs_by_explained_variance(selected_pcs, explained_var_ratio_by_pcs, threshold=0.7):
    cum = 0.0
    out = []
    for pc in selected_pcs:
        v = float(explained_var_ratio_by_pcs.get(pc, 0.0))
        cum += v
        out.append(pc)
        if cum >= threshold:
            break
    return out, cum

## Define PCA configurations
Each config tunes: variable type weights, PCA standardize, high-weight threshold, loading threshold, top-K dominant variables per PC, min cumulative explained variance for selected PCs, and label quantiles.

In [65]:
# Determine feature subblock
FEATURE_BLOCK = 'chemical'
FEATURE_SUBBLOCK = 'logz' if ('chemical','logz') in set((c[0],c[1]) for c in master.columns) else 'raw'
chem_cols = list(master[(FEATURE_BLOCK, FEATURE_SUBBLOCK)].columns)

# Example type weight overrides (override only a subset of types)
WEIGHTS_SET = {
    'default': {},
    'metals_focus': {
        'Trace Metal (pollutant)': 3.0,
        'Trace Metal_Sb (pollutant)': 3.0,
        'Trace Metal_Bi (pollutant)': 3.0,
        'Earth element (nontoxic)': 1.5,
    },
    'organics_focus': {
        'Hydrocarbon pollutant': 3.0,
        'organochlorine pesticide': 3.0,
        'Sum of all PCBs': 3.0,
    },
    'balanced': {
        'Trace Metal (pollutant)': 2.0,
        'Trace Metal_Sb (pollutant)': 2.0,
        'Trace Metal_Bi (pollutant)': 2.0,
        'Earth element (nontoxic)': 2.0,
        'Hydrocarbon pollutant': 2.0,
        'organochlorine pesticide': 2.0,
        'Sum of all PCBs': 2.0,
    }
}

CONFIGS = [
    {
        'name': 'std_flase_metals_focus',
        'type_weights': WEIGHTS_SET['metals_focus'],
        'standardize_pca': False,
        'high_weight_threshold': 1,
        'loading_threshold': 0.15,
        'top_k': 15,
        'prefer_positive': True,
        'min_cum_var': 0.70,
        'quantiles': (0.2, 0.8),
    },
    {
        'name': 'std_false_organics_focus',
        'type_weights': WEIGHTS_SET['organics_focus'],
        'standardize_pca': False,
        'high_weight_threshold': 1.0,
        'loading_threshold': 0.18,
        'top_k': 12,
        'prefer_positive': True,
        'min_cum_var': 0.70,
        'quantiles': (0.20, 0.80),
    },
    {
        'name': 'std_true_balanced_weights',
        'type_weights': WEIGHTS_SET['balanced'],
        'standardize_pca': False,
        'high_weight_threshold': 1.0,
        'loading_threshold': 0.2,
        'top_k': 10,
        'prefer_positive': True,
        'min_cum_var': 0.70,
        'quantiles': (0.20, 0.80),
    },
    {
        'name': 'std_true_default_weights',
        'type_weights': WEIGHTS_SET['default'],
        'standardize_pca': False,
        'high_weight_threshold': 1.0,
        'loading_threshold': 0.2,
        'top_k': 10,
        'prefer_positive': True,
        'min_cum_var': 0.70,
        'quantiles': (0.20, 0.80),
    },
]

  chem_cols = list(master[(FEATURE_BLOCK, FEATURE_SUBBLOCK)].columns)


## Evaluate each configuration
For each config: run weighted PCA, select/filter PCs, compute SumReal + labels, then run the 5-test suite and create both plots. Save JSON and PNGs per config.

In [66]:
all_summaries = {}
# Base feature matrix for tests (constant across configs for comparability)
X_test, idx_test = prepare_feature_matrix(master, block=FEATURE_BLOCK, subblock=FEATURE_SUBBLOCK, variables=None, standardize=True)

for cfg in CONFIGS:
    name = cfg['name']
    print(f"\n=== Running config: {name} ===")

    # Build variable weights for current columns (type overrides)
    weight_map = build_weights_for_columns(chem_cols, type_weights=cfg['type_weights'])

    # Weighted PCA
    result = pca_chemical_assessment(
        master,
        chemical_block=FEATURE_BLOCK,
        subblock=FEATURE_SUBBLOCK,
        n_components=None,
        standardize=cfg['standardize_pca'],
        apply_weights=True,
        custom_weights=weight_map,
    )

    # Select PCs dominated by high-weight variables and optionally prefer sign
    sel = select_pcs_by_weighted_loadings(
        result,
        high_weight_threshold=cfg['high_weight_threshold'],
        loading_threshold=cfg['loading_threshold'],
        top_k_variables_per_pc=cfg['top_k'],
        prefer_positive=cfg['prefer_positive'],
    )
    selected_pcs = sel.get('selected_pcs', [])

    # Filter selected PCs by cumulative explained variance threshold
    ev_ratio = {f"PC{i+1}": float(r) for i, r in enumerate(result.explained_variance_ratio)}
    filtered_pcs, cum_var = filter_selected_pcs_by_explained_variance(
        selected_pcs,
        explained_var_ratio_by_pcs=ev_ratio,
        threshold=cfg['min_cum_var']
    )
    print(f"Selected PCs: {selected_pcs} -> filtered: {filtered_pcs} (cum≈{cum_var:.2f})")

    if not filtered_pcs:
        print("Skipping: no PCs passed filtering.")
        all_summaries[name] = {"error": "no PCs selected"}
        continue

    # Compute SumReal and Quality labels
    tidy = compute_pollution_scores_with_labels(result, filtered_pcs, quantiles=cfg['quantiles'])

    # Align labels to the base X_test index
    labels_sr = tidy.set_index('StationID')['Quality'].reindex(idx_test)
    mask = labels_sr.notna().to_numpy()
    X_cfg = X_test[mask]
    labels_cfg = labels_sr[mask].astype(str).to_numpy()

    # Validate group counts
    if len(np.unique(labels_cfg)) < 2 or X_cfg.shape[0] < 5:
        print("Skipping: not enough labeled samples after alignment.")
        all_summaries[name] = {"error": "insufficient labeled samples"}
        continue

    # Run assessment suite
    suite = run_assessment_suite(
        X_cfg,
        labels_cfg,
        reference_label='reference',
        degraded_label='degraded',
        var_names=None,
        permutations=499,
        seed=42,
        standardize_directional=True,
    )
    all_summaries[name] = suite

    # Prepare output dir and save artifacts
    out_dir = ART_DIR / name
    out_dir.mkdir(parents=True, exist_ok=True)

    # Save summary JSON
    with open(out_dir / 'summary.json', 'w') as f:
        json.dump(suite, f, indent=2)

    # Save SumReal + Quality table
    tidy.to_csv(out_dir / 'sumreal_quality.csv', index=False)

    # Plots: PERMANOVA null
    fig1, ax1, info1 = plot_permanova_null_distribution(
        X_cfg, labels_cfg, permutations=499, seed=42, title=f"{name}: PERMANOVA null"
    )
    fig1.savefig(out_dir / 'permanova_null.png', dpi=150, bbox_inches='tight')
    plt.close(fig1)

    # Plots: Directional null (average)
    fig2, ax2, info2 = plot_directional_null_distribution(
        X_cfg, labels_cfg,
        reference_label='reference', degraded_label='degraded',
        mode='average', permutations=499, standardize=True, seed=42,
        title=f"{name}: Directional null (avg)"
    )
    fig2.savefig(out_dir / 'directional_null_avg.png', dpi=150, bbox_inches='tight')
    plt.close(fig2)

print("Done. Summaries saved to", ART_DIR)


=== Running config: std_flase_metals_focus ===
Applied variable weights - >=3.0: 13 vars, 2.0-<3.0: 0 vars, =1.0: 11 vars
Selected PCs: ['PC1', 'PC2', 'PC3', 'PC4', 'PC6', 'PC7', 'PC8', 'PC10', 'PC11', 'PC14', 'PC17', 'PC18', 'PC19', 'PC20', 'PC21', 'PC24', 'PC25', 'PC27'] -> filtered: ['PC1', 'PC2', 'PC3'] (cum≈0.72)


  sub = master[(block, subblock)]



=== Running config: std_false_organics_focus ===
Applied variable weights - >=3.0: 10 vars, 2.0-<3.0: 0 vars, =1.0: 20 vars
Selected PCs: ['PC1', 'PC2', 'PC3', 'PC7', 'PC9', 'PC11', 'PC12', 'PC15', 'PC17', 'PC18', 'PC19', 'PC21', 'PC22', 'PC23', 'PC24', 'PC26', 'PC28', 'PC30'] -> filtered: ['PC1', 'PC2', 'PC3', 'PC7', 'PC9', 'PC11'] (cum≈0.71)

=== Running config: std_true_balanced_weights ===
Applied variable weights - >=3.0: 0 vars, 2.0-<3.0: 29 vars, =1.0: 1 vars
Selected PCs: ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC8', 'PC10', 'PC12', 'PC13', 'PC16', 'PC17', 'PC18', 'PC19', 'PC20', 'PC21', 'PC22', 'PC23', 'PC24', 'PC25', 'PC26', 'PC28', 'PC30'] -> filtered: ['PC1', 'PC2', 'PC3', 'PC4'] (cum≈0.72)

=== Running config: std_true_balanced_weights ===
Applied variable weights - >=3.0: 0 vars, 2.0-<3.0: 29 vars, =1.0: 1 vars
Selected PCs: ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC8', 'PC10', 'PC12', 'PC13', 'PC16', 'PC17', 'PC18', 'PC19', 'PC20', 'PC21', 'PC22', 'PC23', 'PC24', 'PC25', 'PC2

In [67]:
# Aggregate summaries across configs
from pprint import pprint

# Show a compact table of key metrics
rows = []
for name, res in all_summaries.items():
    if 'error' in res:
        rows.append({"config": name, "status": res['error']})
        continue
    rows.append({
        "config": name,
        "perm_p": res["perm_direct"]["p"],
        "perm_R2": res["perm_direct"]["R2"],
        "tail_p": res["perm_null_tail"]["p_tail"],
        "dir_avg_p": res["dir_avg"]["p"],
        "dir_min_p": res["dir_min"]["p"],
        "loss": res["loss"],
    })

summary_df = pd.DataFrame(rows).sort_values(by=["loss"], ascending=False)
summary_df

# Write combined JSON for convenience
with open(ART_DIR / 'all_summaries.json', 'w') as f:
    json.dump(all_summaries, f, indent=2)

### Notes
- Per-configuration results (JSON, CSV, PNG) are saved under `artifacts/weighted_pca_runs/<config-name>/`.
- The table above ranks by the loss metric (higher is better = stronger separation with lower p).
- You can tweak `CONFIGS` to explore more weighting schemes and thresholds.