In [1]:
import io
import copy
import random
import time
import contextlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

from UQpy.distributions import Uniform, Normal, JointIndependent
from UQpy.sampling import ThetaCriterionPCE
import SAPCE

In [2]:
# Pomocne funkce SAPCE
def pce_to_list(pce_multi):
    """Split a multi-output PCE into a list of single-output PCE objects.

    Every returned element is an independent deep copy of ``pce_multi`` whose
    ``coefficients`` attribute holds exactly one column of the original
    coefficient array, reshaped to a column vector. A 1-D coefficient array
    is treated as a single output.

    Parameters
    ----------
    pce_multi : object
        Any object exposing a ``coefficients`` attribute convertible to a
        NumPy array.

    Returns
    -------
    list
        One copied object per coefficient column, in column order.
    """
    coeffs = np.asarray(pce_multi.coefficients)
    if coeffs.ndim == 1:
        coeffs = coeffs.reshape(-1, 1)

    def _single_output(col):
        # The copy is taken from the untouched input so outputs never share state.
        clone = copy.deepcopy(pce_multi)
        clone.coefficients = coeffs[:, col].reshape(-1, 1)
        return clone

    return [_single_output(col) for col in range(coeffs.shape[1])]


def predict_pce_list(pce_list, X):
    """Evaluate every PCE in ``pce_list`` on ``X`` and stack the results column-wise.

    For each object the first available attribute among ``predict``,
    ``evaluate``, ``__call__`` and ``run`` is called with ``X``. One-dimensional
    results are turned into column vectors before horizontal stacking.

    Raises
    ------
    RuntimeError
        If an object exposes none of the supported method names.
    """
    method_names = ("predict", "evaluate", "__call__", "run")

    columns = []
    for model in pce_list:
        chosen = next((name for name in method_names if hasattr(model, name)), None)
        if chosen is None:
            raise RuntimeError("No usable prediction method found on PCE object.")

        prediction = np.asarray(getattr(model, chosen)(X))
        columns.append(prediction.reshape(-1, 1) if prediction.ndim == 1 else prediction)

    return np.hstack(columns)


def uqpy_validation_error_exact(y_true, y_pred):
    """Per-output normalized validation error.

    For every output column the value

        eps = ((n - 1) / n) * sum((y - y_pred)^2) / sum((y - mean(y))^2)

    is computed, where ``n`` is the number of samples (rows).

    Parameters
    ----------
    y_true, y_pred : array-like
        Reference values and predictions. 1-D inputs are treated as a single
        output column. After that promotion both must have the same shape.

    Returns
    -------
    numpy.ndarray
        One error per output. Entries are NaN where the error is undefined
        (non-finite sums, zero variance of ``y_true``, or no samples at all).

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same shape.
    """
    y = np.asarray(y_true, dtype=float)
    y_val = np.asarray(y_pred, dtype=float)

    if y.ndim == 1:
        y = y.reshape(-1, 1)
    if y_val.ndim == 1:
        y_val = y_val.reshape(-1, 1)

    # Without this check e.g. (n, k) vs (n, 1) would broadcast silently and
    # yield a plausible-looking but meaningless error.
    if y.shape != y_val.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y.shape} and {y_val.shape}."
        )

    n_samples = y.shape[0]
    if n_samples == 0:
        # No samples: the error is undefined for every output.
        return np.full(y.shape[1:], np.nan, dtype=float)

    mu = (1.0 / n_samples) * np.sum(y, axis=0)

    num = np.sum((y - y_val) ** 2, axis=0)
    den = np.sum((y - mu) ** 2, axis=0)

    # Only fill entries where the ratio is well defined; the rest stay NaN.
    eps = np.full_like(num, np.nan, dtype=float)
    mask = np.isfinite(num) & np.isfinite(den) & (den > 0)
    eps[mask] = ((n_samples - 1) / n_samples) * (num[mask] / den[mask])

    return eps


def build_sapce_pce_multi(joint, X_train, Y_train, n_inputs, deg, cr, max_cond=1e8, silent=True):
    """Fit a SAPCE model and return its multi-output PCE, retrying with simpler settings.

    The first attempt uses the requested ``deg`` and ``cr``. If that attempt
    raises or yields no usable PCE, lower degrees (5 when ``deg >= 7``, 3 when
    ``deg >= 5``, finally 1) and smaller pruning values (``cr / 10`` and
    ``0.0``) are tried in turn. Duplicate combinations are attempted only once.

    Parameters
    ----------
    joint
        Joint input distribution, passed to SAPCE as ``pdf``.
    X_train, Y_train
        Experimental design inputs and outputs.
    n_inputs : int
        Number of input variables, passed as ``num_inputs``.
    deg : int
        Requested maximum partial degree.
    cr : float
        Requested value for ``construct_pruned_pce(cr=...)``.
    max_cond : float
        Passed as ``max_condition_number`` to ``construct_adaptive_basis``.
    silent : bool
        If true, stdout/stderr produced while SAPCE builds the model are
        discarded. Fallback warnings from this function are still emitted.

    Returns
    -------
    object
        The ``pce`` attribute of the first SAPCE model that has a
        ``coefficients`` attribute.

    Warns
    -----
    RuntimeWarning
        When the returned model was built with a fallback ``(deg, cr)`` rather
        than the requested one, so callers do not mislabel the result.

    Raises
    ------
    RuntimeError
        If every combination fails; chained to the last underlying exception.
    """
    import warnings  # local import: the only place in the file that needs it

    degree_ladder = [deg]
    if deg >= 7:
        degree_ladder.append(5)
    if deg >= 5:
        degree_ladder.append(3)
    degree_ladder.append(1)
    # dict.fromkeys removes repeats (e.g. deg=1 -> [1, 1]) while keeping order.
    degree_ladder = list(dict.fromkeys(degree_ladder))
    cr_ladder = list(dict.fromkeys([cr, cr / 10.0, 0.0]))

    attempts = [(d, c) for d in degree_ladder for c in cr_ladder]

    last_err = None
    for attempt_no, (d, c) in enumerate(attempts):
        try:
            sapce = SAPCE.SensitivityAdaptivePCE(
                pdf=joint,
                exp_design_in=X_train,
                exp_design_out=Y_train,
                max_partial_degree=d,
                num_inputs=n_inputs
            )

            # ExitStack lets one code path serve both the silent and verbose case.
            with contextlib.ExitStack() as stack:
                if silent:
                    stack.enter_context(contextlib.redirect_stdout(io.StringIO()))
                    stack.enter_context(contextlib.redirect_stderr(io.StringIO()))
                sapce.construct_adaptive_basis(max_condition_number=max_cond)
                sapce.construct_pruned_pce(cr=c)

            pce = getattr(sapce, "pce", None)
            usable = pce is not None and hasattr(pce, "coefficients")
        except Exception as e:
            # Deliberate best effort: remember the failure and try the next setting.
            last_err = e
            continue

        if not usable:
            continue

        if attempt_no > 0:
            # Kept outside the try block so a warnings-as-errors filter is not
            # mistaken for a failed fit.
            warnings.warn(
                f"SAPCE fell back from deg={deg}, cr={cr} to deg={d}, cr={c}.",
                RuntimeWarning,
                stacklevel=2,
            )
        return pce

    raise RuntimeError(f"SAPCE failed to build PCE. Last error: {repr(last_err)}") from last_err


# PDF REPORT -Z Githubu
def make_pdf_report(results_df, out_pdf="sapce_theta_vs_random_report.pdf"):
    """Write a multi-page PDF comparing the Theta and Random strategies.

    The first page is a title page listing the settings found in
    ``results_df``. Then, for every ``pocet_start`` value and every metric
    (RMSE and the mean UQPy validation error), one page is written with one
    panel per degree showing the seed-averaged metric against ``add_k`` for
    both strategies. A metric with no finite values is skipped.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Must contain the columns ``pocet_start``, ``seed``, ``deg``,
        ``add_k``, ``rmse_theta``, ``rmse_random``, ``uqpy_mean_theta`` and
        ``uqpy_mean_random``.
    out_pdf : str
        Path of the PDF file to write.
    """
    # (page title, Theta column, Random column, prefer a logarithmic y axis)
    metrics = [
        ("RMSE", "rmse_theta", "rmse_random", True),
        ("UQPy val err (mean)", "uqpy_mean_theta", "uqpy_mean_random", True),
    ]

    start_list = sorted(results_df["pocet_start"].unique().tolist())
    deg_list = sorted(results_df["deg"].unique().tolist())
    addk_list = sorted(results_df["add_k"].unique().tolist())

    with PdfPages(out_pdf) as pdf:
        # Title page.
        fig = plt.figure(figsize=(11.7, 8.3))
        ax = fig.add_axes([0, 0, 1, 1])
        ax.axis("off")
        ax.text(0.05, 0.92, "SAPCE: Theta vs Random (simple report)", fontsize=20, va="top")
        ax.text(
            0.05, 0.78,
            "\n".join([
                f"pocet_start: {start_list}",
                f"deg_list: {deg_list}",
                f"add_k: {addk_list}",
                f"seeds: {sorted(results_df['seed'].unique().tolist())}",
                "",
                "Cary = prumer pres seeds.",
                "Y osa je log (kdyz to jde)."
            ]),
            fontsize=11, va="top"
        )
        pdf.savefig(fig)
        plt.close(fig)

        # One page per (pocet_start, metric).
        for pocet_start in start_list:
            sub = results_df[results_df["pocet_start"] == pocet_start].copy()

            for metric_name, col_t, col_r, logy in metrics:
                # Average over seeds for every (deg, add_k) pair.
                g = sub.groupby(["deg", "add_k"], as_index=False)[[col_t, col_r]].mean()
                finite_vals = np.r_[g[col_t].values, g[col_r].values]
                finite_vals = finite_vals[np.isfinite(finite_vals)]
                if finite_vals.size == 0:
                    continue

                # A log axis needs at least one positive value. The finite
                # values are kept separately so the linear fallback still has
                # data to derive its limits from.
                positive_vals = finite_vals[finite_vals > 0]
                use_log = logy and positive_vals.size > 0

                if use_log:
                    ymin = float(np.min(positive_vals) * 0.8)
                    ymax = float(np.max(positive_vals) * 1.2)
                else:
                    ymin = float(np.min(finite_vals))
                    ymax = float(np.max(finite_vals))
                    pad = 0.05 * (ymax - ymin) if ymax > ymin else 1.0
                    ymin -= pad
                    ymax += pad

                ncols = 2
                nrows = int(np.ceil(len(deg_list) / ncols))
                fig, axes = plt.subplots(nrows, ncols, figsize=(11.7, 8.3), sharex=True, sharey=True)
                axes = np.atleast_1d(axes).ravel()
                fig.suptitle(f"{metric_name} vs add_k | pocet_start={pocet_start}", fontsize=14, y=0.98)

                for i, deg in enumerate(deg_list):
                    ax = axes[i]
                    d = g[g["deg"] == deg].sort_values("add_k")
                    if d.shape[0] == 0:
                        ax.axis("off")
                        continue

                    ax.plot(d["add_k"], d[col_t], label="Theta (mean)")
                    ax.plot(d["add_k"], d[col_r], label="Random (mean)")

                    ax.set_title(f"deg={deg}")
                    ax.set_xlabel("add_k")
                    ax.set_ylabel(metric_name)
                    if use_log:
                        ax.set_yscale("log")
                    ax.set_ylim(ymin, ymax)
                    ax.grid(True, which="both", alpha=0.3)
                    ax.legend(fontsize=8)

                # Hide grid cells beyond the last degree panel.
                for unused_ax in axes[len(deg_list):]:
                    unused_ax.axis("off")

                pdf.savefig(fig)
                plt.close(fig)

In [None]:
# Main

t0_all = time.time()

# Load the data: semicolon-separated CSV with decimal commas, read as text first.
data_raw = pd.read_csv("Oakwood_NVM.csv", sep=";", header=0, dtype=str)
# Column-wise string replacement instead of DataFrame.applymap, which is
# deprecated since pandas 2.1. Missing cells stay missing.
data = data_raw.apply(lambda col: col.str.replace(",", ".", regex=False))
# Anything that still cannot be parsed as a number becomes NaN.
data = data.apply(pd.to_numeric, errors="coerce")

names = data.columns.tolist()
arr = data.to_numpy()

# Inputs X are the first 12 columns, outputs Y the remaining ones;
# only the first 113 outputs are used below.
X_raw = arr[:, :12].astype(float)
Y_all = arr[:, 12:].astype(float)
Y_N = Y_all[:, :113].astype(float)


X = X_raw.copy()
# Log-transform Ec (first input column) so that it can be modelled with a
# Normal marginal. The log-space parameters are derived from the mean and
# standard deviation given here.
_ec_mean = 13.0
_ec_std = 1.0
_ec_sigma_ln = np.sqrt(np.log(1.0 + (_ec_std / _ec_mean) ** 2))
_ec_mu_ln = np.log(_ec_mean) - 0.5 * _ec_sigma_ln**2
if np.any(X[:, 0] <= 0):
    raise ValueError("Ec musi byt kladne pro log transform.")
X[:, 0] = np.log(X[:, 0])

n_samples = X.shape[0]
n_inputs = X.shape[1]
n_outputs = Y_N.shape[1]

print(f"Oakwood_NVM.csv: X={X.shape}, Y_N={Y_N.shape}")

# Marginal distributions, listed in the same order as the columns of X.
dist_Ec = Normal(_ec_mu_ln, _ec_sigma_ln)
dist_Relax = Uniform(loc=30, scale=40)
# Soil 1
dist_S1_Erel = Uniform(loc=2, scale=1)
dist_S1_E50 = Normal(65, 5)
dist_S1_c = Normal(30, 5)
dist_S1_theta = Normal(30, 1)
dist_S1_ko = Uniform(loc=0.6, scale=0.5)
# Soil 2
dist_S2_Erel = Uniform(loc=2, scale=1)
dist_S2_E50 = Normal(130, 10)
dist_S2_c = Normal(5, 1)
dist_S2_theta = Normal(42, 1)
dist_S2_ko = Uniform(loc=0.45, scale=0.2)
marginals = [
    dist_Ec, dist_Relax,
    dist_S1_Erel, dist_S1_E50, dist_S1_c, dist_S1_theta, dist_S1_ko,
    dist_S2_Erel, dist_S2_E50, dist_S2_c, dist_S2_theta, dist_S2_ko
]
joint = JointIndependent(marginals=marginals)

# Experiment settings
deg_list = [3, 5, 7]           # requested maximum partial degrees
cr = 0.0005                    # value passed to SAPCE's construct_pruned_pce

pocet_start_list = [30, 70]    # sizes of the initial training design
K_total = 30                   # number of points added on top of the initial design
step_eval = 5                  # errors are evaluated after every step_eval added points
theta_n_pick = 40              # upper bound on the outputs used during Theta selection
n_test_target = 100            # desired test-set size

max_cond_selection = 1e7       # max_cond for models built during Theta selection
max_cond_eval = 1e7            # max_cond for models built for error evaluation
seed_list = [0, 1, 2, 3]

# Echo the configuration so that it is recorded with the cell output.
print(
    "\n Setting of my expediment",
    f"deg_list={deg_list}",
    f"pocet_start_list={pocet_start_list}",
    f"K_total={K_total}, step_eval={step_eval}, cr={cr}",
    f"max_cond_selection={max_cond_selection}, max_cond_eval={max_cond_eval}",
    f"seed_list={seed_list}",
    sep="\n",
)

rows = []

# Output columns used during Theta selection: all outputs when there are at
# most theta_n_pick of them, otherwise an evenly spaced subset. This depends
# only on the configuration, so it is computed once instead of per degree.
if theta_n_pick >= n_outputs:
    theta_out_idx = np.arange(n_outputs, dtype=int)
else:
    theta_out_idx = np.unique(np.round(np.linspace(0, n_outputs - 1, theta_n_pick)).astype(int))

for pocet_start in pocet_start_list:
    for seed in seed_list:
        # Fixed split per (pocet_start, seed): start set first, then the test
        # set, and the remaining samples are the candidates.
        rng = np.random.RandomState(seed)
        all_idx = np.arange(n_samples)

        start_idx = rng.choice(all_idx, size=pocet_start, replace=False)
        remaining = np.setdiff1d(all_idx, start_idx, assume_unique=False)

        n_test = int(min(n_test_target, remaining.shape[0]))
        test_idx = rng.choice(remaining, size=n_test, replace=False)

        cand_idx = np.setdiff1d(remaining, test_idx, assume_unique=False)

        if cand_idx.shape[0] < K_total:
            raise ValueError("Malo kandidatu pro K_total.")

        # Points added by the Random strategy, in the order they are added.
        rand_body_idx = rng.choice(cand_idx, size=K_total, replace=False)

        # The test set is the same for every degree of this split.
        test_x = X[test_idx]
        test_y = Y_N[test_idx]

        print(f"\nFixni split pocet_start={pocet_start}, seed={seed}")
        print(f"start_idx: n={len(start_idx)} min={int(np.min(start_idx))} max={int(np.max(start_idx))}")
        print(f"cand_idx : n={len(cand_idx)} min={int(np.min(cand_idx))} max={int(np.max(cand_idx))}")
        print(f"rand_idx : n={len(rand_body_idx)} min={int(np.min(rand_body_idx))} max={int(np.max(rand_body_idx))}")
        print(f"test_idx : n={len(test_idx)} min={int(np.min(test_idx))} max={int(np.max(test_idx))}")

        for deg in deg_list:
            print(f"RUN: pocet_start={pocet_start} | seed={seed} | deg={deg} | cr={cr}")

            # Theta selection (iterative): refit on the current design, pick
            # the best candidate, move it from the candidates to the design.
            random.seed(seed)

            X_theta = X[start_idx].copy()
            Y_theta_sel = Y_N[start_idx][:, theta_out_idx].copy()

            cand_theta = X[cand_idx].copy()
            cand_idx_theta = cand_idx.copy()

            theta_body_idx = []

            t0_theta = time.time()
            for k in range(K_total):
                pce_sel = build_sapce_pce_multi(
                    joint=joint,
                    X_train=X_theta,
                    Y_train=Y_theta_sel,
                    n_inputs=n_inputs,
                    deg=deg,
                    cr=cr,
                    max_cond=max_cond_selection,
                    silent=True
                )

                pce_list_sel = pce_to_list(pce_sel)
                theta = ThetaCriterionPCE(pce_list_sel)

                # Position of the selected point within the current candidate array.
                i = int(theta.run(X_theta, cand_theta))

                # Map that position back to a row index of the full data set.
                new_idx = int(cand_idx_theta[i])
                theta_body_idx.append(new_idx)

                X_theta = np.vstack([X_theta, X[new_idx].reshape(1, -1)])
                Y_theta_sel = np.vstack([Y_theta_sel, Y_N[new_idx, theta_out_idx].reshape(1, -1)])

                # Keep the candidate array and its index map aligned.
                cand_theta = np.delete(cand_theta, i, axis=0)
                cand_idx_theta = np.delete(cand_idx_theta, i, axis=0)

            theta_body_idx = np.array(theta_body_idx, dtype=int)

            # Evaluate both strategies after every step_eval added points.
            for add_k in range(step_eval, K_total + 1, step_eval):
                # Random training set.
                train_rand_idx = np.concatenate([start_idx, rand_body_idx[:add_k]])
                X_rand = X[train_rand_idx]
                Y_rand = Y_N[train_rand_idx]

                # Theta training set.
                train_theta_idx = np.concatenate([start_idx, theta_body_idx[:add_k]])
                X_th = X[train_theta_idx]
                Y_th = Y_N[train_theta_idx]

                # Fit + predict Random.
                pce_rand = build_sapce_pce_multi(
                    joint=joint,
                    X_train=X_rand,
                    Y_train=Y_rand,
                    n_inputs=n_inputs,
                    deg=deg,
                    cr=cr,
                    max_cond=max_cond_eval,
                    silent=True
                )
                pred_rand = predict_pce_list(pce_to_list(pce_rand), test_x)

                # Fit + predict Theta.
                pce_theta = build_sapce_pce_multi(
                    joint=joint,
                    X_train=X_th,
                    Y_train=Y_th,
                    n_inputs=n_inputs,
                    deg=deg,
                    cr=cr,
                    max_cond=max_cond_eval,
                    silent=True
                )
                pred_theta = predict_pce_list(pce_to_list(pce_theta), test_x)

                # RMSE over all test samples and outputs.
                rmse_rand = float(np.sqrt(np.mean((pred_rand - test_y) ** 2)))
                rmse_theta = float(np.sqrt(np.mean((pred_theta - test_y) ** 2)))

                # Normalized validation error, averaged over outputs (NaNs ignored).
                uq_rand = uqpy_validation_error_exact(test_y, pred_rand)
                uq_theta = uqpy_validation_error_exact(test_y, pred_theta)
                uq_mean_rand = float(np.nanmean(uq_rand))
                uq_mean_theta = float(np.nanmean(uq_theta))

                rows.append({
                    "pocet_start": pocet_start,
                    "seed": seed,
                    "deg": deg,
                    "add_k": add_k,
                    "rmse_random": rmse_rand,
                    "rmse_theta": rmse_theta,
                    "uqpy_mean_random": uq_mean_rand,
                    "uqpy_mean_theta": uq_mean_theta,
                })

print(f"\n Time spend calculating: {time.time() - t0_all:.2f} s")

# Collect all rows into one table with a stable ordering, then write the CSV.
results = (
    pd.DataFrame(rows)
    .sort_values(["pocet_start", "seed", "deg", "add_k"])
    .reset_index(drop=True)
)
out_csv = "results_simple_theta_vs_random.csv"
results.to_csv(out_csv, index=False)
print(f"Saved: {out_csv}")

# Render the PDF report from the same table.
out_pdf = "sapce_theta_vs_random_report.pdf"
make_pdf_report(results, out_pdf=out_pdf)
print(f"Saved: {out_pdf}")


Oakwood_NVM.csv: X=(1000, 12), Y_N=(1000, 113)

EXPERIMENT
deg_list=[3, 5, 7]
pocet_start_list=[30, 70]
K_total=30, step_eval=5, cr=0.0005
max_cond_selection=10000000.0, max_cond_eval=10000000.0
seed_list=[0, 1, 2, 3]

Fixni split pocet_start=30, seed=0
start_idx: n=30 min=1 max=993
cand_idx : n=870 min=0 max=999
rand_idx : n=30 min=125 max=997
test_idx : n=100 min=7 max=994
RUN: pocet_start=30 | seed=0 | deg=3 | cr=0.0005
RUN: pocet_start=30 | seed=0 | deg=5 | cr=0.0005
RUN: pocet_start=30 | seed=0 | deg=7 | cr=0.0005

Fixni split pocet_start=30, seed=1
start_idx: n=30 min=6 max=971
cand_idx : n=870 min=0 max=999
rand_idx : n=30 min=30 max=997
test_idx : n=100 min=4 max=988
RUN: pocet_start=30 | seed=1 | deg=3 | cr=0.0005
RUN: pocet_start=30 | seed=1 | deg=5 | cr=0.0005
RUN: pocet_start=30 | seed=1 | deg=7 | cr=0.0005

Fixni split pocet_start=30, seed=2
start_idx: n=30 min=37 max=954
cand_idx : n=870 min=0 max=999
rand_idx : n=30 min=22 max=938
test_idx : n=100 min=4 max=993
RUN: poce