In [45]:
import io
import copy
import time
import contextlib
import numpy as np
import pandas as pd

from UQpy.distributions import Uniform, Normal, JointIndependent
from UQpy.sampling import ThetaCriterionPCE
import SAPCE

from openpyxl import load_workbook
from openpyxl.styles import Font, Alignment
from openpyxl.utils import get_column_letter

In [47]:
def pce_to_list(pce_multi):
    """Split a multi-output PCE into a list of single-output PCE objects.

    Parameters
    ----------
    pce_multi : object
        Model exposing a ``coefficients`` attribute. A 1-D coefficient
        vector is treated as a single output column.

    Returns
    -------
    list
        One deep copy of ``pce_multi`` per coefficient column; each copy has
        its ``coefficients`` replaced by that column reshaped to ``(-1, 1)``.
    """
    coeff_matrix = np.asarray(pce_multi.coefficients)
    if coeff_matrix.ndim == 1:
        coeff_matrix = coeff_matrix.reshape(-1, 1)

    def _single_output_clone(column):
        # Clone the whole model, then swap in the coefficients of one output.
        clone = copy.deepcopy(pce_multi)
        clone.coefficients = coeff_matrix[:, column].reshape(-1, 1)
        return clone

    return [_single_output_clone(column) for column in range(coeff_matrix.shape[1])]

def predict_pce_list(pce_list, X):
    """Evaluate every model in ``pce_list`` at ``X`` and stack the results column-wise.

    Nothing is fitted here: the already-built polynomials are only evaluated
    at the supplied points.

    Parameters
    ----------
    pce_list : iterable
        Models to evaluate. For each model the first available attribute among
        ``predict``, ``evaluate``, ``__call__`` and ``run`` is called with ``X``.
    X : array-like
        Points passed unchanged to the chosen method.

    Returns
    -------
    numpy.ndarray
        Horizontal stack of the per-model results; 1-D results are turned
        into single columns first.

    Raises
    ------
    RuntimeError
        If a model has none of the supported evaluation methods.
    """
    method_names = ("predict", "evaluate", "__call__", "run")  # tried in this order
    columns = []
    for model in pce_list:
        chosen = next((name for name in method_names if hasattr(model, name)), None)
        if chosen is None:
            raise RuntimeError("Nenasel jsem")
        values = np.asarray(getattr(model, chosen)(X))
        columns.append(values.reshape(-1, 1) if values.ndim == 1 else values)
    return np.hstack(columns)


def uqpy_validation_error_exact(y_true, y_pred):
    """Per-output normalised validation error.

    For each output column the value is

        ((n - 1) / n) * sum((y - y_pred)**2) / sum((y - mean(y))**2)

    rounded to 7 decimals. Judging by the function name, this is intended to
    reproduce the validation error reported by UQpy's PCE class.

    Parameters
    ----------
    y_true, y_pred : array-like
        Reference and predicted values, 1-D (single output) or 2-D with one
        column per output. Both are converted to float arrays.

    Returns
    -------
    numpy.ndarray
        1-D array with one value per output. The entry is NaN where the error
        is undefined: constant reference output (zero denominator), non-finite
        sums, or no samples at all.
    """
    y = np.asarray(y_true, dtype=float)
    y_val = np.asarray(y_pred, dtype=float)
    if y.ndim == 1:
        y = y.reshape(-1, 1)
    if y_val.ndim == 1:
        y_val = y_val.reshape(-1, 1)

    n_samples = y.shape[0]
    if n_samples == 0:
        # Without samples the mean and the (n-1)/n factor are undefined; report
        # NaN per output instead of failing with ZeroDivisionError (this happens
        # e.g. when every row belongs to the training set).
        return np.full(y.shape[1], np.nan, dtype=float)

    mu = (1.0 / n_samples) * np.sum(y, axis=0)
    num = np.sum((y - y_val) ** 2, axis=0)  # sum of squared prediction errors
    den = np.sum((y - mu) ** 2, axis=0)  # sum of squared deviations from the mean

    # Guard against division by zero for constant outputs.
    eps = np.full_like(num, np.nan, dtype=float)
    mask = np.isfinite(num) & np.isfinite(den) & (den > 0)
    eps[mask] = ((n_samples - 1) / n_samples) * (num[mask] / den[mask])
    return np.round(eps, 7)  # rounded for the Excel report

    
# Export to Excel (taken from GitHub)
def export_excel(X, Y_true, Y_pred, x_names, y_names, train_idx,
                        out_xlsx="all_points_true_vs_pred_final_sapce.xlsx"):
    """Write inputs, true/predicted outputs and error metrics to a formatted workbook.

    Sheets written: ``Inputs``, ``True``, ``Pred``, ``Errors_Long`` (one row per
    sample/output pair), ``Metrics_Global`` and ``Metrics_PerOutput``. The file is
    then reopened with openpyxl to style the header row, freeze panes at ``B2``,
    add an auto-filter and set column widths.

    Parameters
    ----------
    X : numpy.ndarray
        Input matrix; its first dimension defines the number of rows.
    Y_true, Y_pred : numpy.ndarray
        Reference and predicted output matrices (2-D, one column per output).
    x_names, y_names : list
        Column labels for the inputs and the outputs.
    train_idx : array-like of int
        Row indices used for training; all remaining rows form the
        "non-train" subset for the validation-error metric.
    out_xlsx : str
        Path of the workbook to create.
    """
    n_samples, n_outputs = X.shape[0], Y_true.shape[1]
    row_ids = np.arange(n_samples)

    # Boolean mask of the rows that were NOT used for training.
    is_nontrain = np.ones(n_samples, dtype=bool)
    is_nontrain[train_idx] = False

    def _table_with_row_idx(values, columns):
        # Wide table with a leading "row_idx" column.
        frame = pd.DataFrame(values, columns=columns)
        frame.insert(0, "row_idx", row_ids)
        return frame

    inputs_frame = _table_with_row_idx(X, x_names)
    true_frame = _table_with_row_idx(Y_true, y_names)
    pred_frame = _table_with_row_idx(Y_pred, y_names)

    # Long format: flatten the matrices row by row, one record per (sample, output).
    flat_true = Y_true.reshape(-1)
    flat_pred = Y_pred.reshape(-1)
    flat_err = flat_pred - flat_true
    errors_long_frame = pd.DataFrame({
        "row_idx": np.repeat(row_ids, n_outputs),
        "output": np.tile(np.array(y_names, dtype=object), n_samples),
        "true": flat_true,
        "pred": flat_pred,
        "error": flat_err,
        "abs_error": np.abs(flat_err)
    })

    # Residual-based metrics; the sign of the residual does not matter for any of them.
    residual = Y_pred - Y_true
    squared = residual ** 2
    absolute = np.abs(residual)

    valerr_all = uqpy_validation_error_exact(Y_true, Y_pred)
    valerr_nontrain = uqpy_validation_error_exact(Y_true[is_nontrain], Y_pred[is_nontrain])

    per_output_frame = pd.DataFrame({
        "output": y_names,
        "RMSE": np.sqrt(np.mean(squared, axis=0)),
        "MAE": np.mean(absolute, axis=0),
        "UQPy_valerr_all": valerr_all,
        "UQPy_valerr_nontrain": valerr_nontrain
    })

    # Global summary across all outputs.
    global_rows = [
        ("RMSE (all)", float(np.sqrt(np.mean(squared)))),
        ("MAE (all)", float(np.mean(absolute))),
        ("Median AE (all)", float(np.median(absolute))),
        ("UQPy val err mean (all)", float(np.nanmean(valerr_all))),
        ("UQPy val err median (all)", float(np.nanmedian(valerr_all))),
        ("UQPy val err mean (non-train)", float(np.nanmean(valerr_nontrain))),
        ("UQPy val err median (non-train)", float(np.nanmedian(valerr_nontrain))),
    ]
    global_frame = pd.DataFrame({
        "metric": [label for label, _ in global_rows],
        "value": [value for _, value in global_rows]
    })

    # Write every table to its own sheet.
    sheets = [
        ("Inputs", inputs_frame),
        ("True", true_frame),
        ("Pred", pred_frame),
        ("Errors_Long", errors_long_frame),
        ("Metrics_Global", global_frame),
        ("Metrics_PerOutput", per_output_frame),
    ]
    with pd.ExcelWriter(out_xlsx, engine="openpyxl") as writer:
        for sheet_name, frame in sheets:
            frame.to_excel(writer, sheet_name=sheet_name, index=False)

    # Reopen the saved workbook and apply the cosmetic formatting.
    workbook = load_workbook(out_xlsx)
    bold = Font(bold=True)
    centered = Alignment(horizontal="center", vertical="center", wrap_text=True)
    for sheet in workbook.worksheets:
        sheet.freeze_panes = "B2"
        for header_cell in sheet[1]:
            header_cell.font = bold
            header_cell.alignment = centered
        n_rows, n_cols = sheet.max_row, sheet.max_column
        if n_rows >= 1 and n_cols >= 1:
            sheet.auto_filter.ref = f"A1:{get_column_letter(n_cols)}{n_rows}"
        # Column widths: narrow first column, wider data columns.
        for col_number in range(1, n_cols + 1):
            sheet.column_dimensions[get_column_letter(col_number)].width = 10 if col_number == 1 else 16
    workbook.save(out_xlsx)

In [49]:
# Main
# ---- Configuration ----------------------------------------------------------
seed = 0  # seed of the random generator (fixed, so the start design is reproducible)
initial_points = 30  # number of randomly chosen starting training points
theta_add_points = 20  # number of points added one by one by the Theta criterion
deg = 3  # max_partial_degree passed to SAPCE
cr = 0.0001  # `cr` argument of SAPCE.construct_pruned_pce
max_cond = 1e8  # max_condition_number passed to SAPCE.construct_adaptive_basis
n_input_cols = 12  # leading CSV columns holding the inputs X
n_output_cols = 113  # number of output columns (N) kept right after the inputs

# Legacy RandomState is kept on purpose: switching generators would change the
# selected start points. With a fixed seed the draw is identical on every run.
rng = np.random.RandomState(seed)


# ---- Load data --------------------------------------------------------------
print("Loading dataset")
data_raw = pd.read_csv("Oakwood_NVM.csv", sep=";", header=0, dtype=str)
# Convert decimal commas to dots column by column and parse as numbers; anything
# unparsable becomes NaN. (Replaces the deprecated DataFrame.applymap.)
data = data_raw.apply(
    lambda col: pd.to_numeric(col.str.replace(",", ".", regex=False), errors="coerce")
)

names = data.columns.tolist()
arr = data.to_numpy()  # plain numpy array for fast slicing

X = arr[:, :n_input_cols].astype(float)  # inputs
Y_all = arr[:, n_input_cols:].astype(float)  # every column after the inputs
output_cols = names[n_input_cols:]
Y_N = Y_all[:, :n_output_cols].astype(float)
y_names = output_cols[:n_output_cols]

# `selected_info` used to be printed here without being defined anywhere in the
# notebook (leftover kernel state), which fails on a fresh kernel. Define it here.
selected_info = f"Selected N as the first {Y_N.shape[1]} output columns"

print(f"Loaded Oakwood_NVM.csv: X={X.shape}, Y_N={Y_N.shape}")
print(selected_info)

# Values that could not be parsed were silently coerced to NaN above; surface that.
n_missing = int(np.isnan(X).sum() + np.isnan(Y_N).sum())
if n_missing:
    print(f"WARNING: {n_missing} missing/non-numeric values were coerced to NaN")

x_names = names[:n_input_cols]  # input names for the export

# Dimensions
n_samples = X.shape[0]
n_inputs = X.shape[1]
n_outputs = Y_N.shape[1]


# ---- Initial design ---------------------------------------------------------
all_idx = np.arange(n_samples)  # indices of every row in the dataset
start_idx = rng.choice(all_idx, size=initial_points, replace=False)  # random starting training rows
cand_idx = np.setdiff1d(all_idx, start_idx, assume_unique=False)  # remaining rows are candidates

print(f"start_idx: n={len(start_idx)} min={int(np.min(start_idx))} max={int(np.max(start_idx))}")
print(f"cand_idx : n={len(cand_idx)} min={int(np.min(cand_idx))} max={int(np.max(cand_idx))}")

# ---- Input distributions ----------------------------------------------------
dist_Ec = Normal(13, 1)
dist_Relax = Uniform(loc=30, scale=10)
# Soil1 parameters
dist_S1_Erel = Uniform(loc=2, scale=1)
dist_S1_E50 = Normal(65, 5)
dist_S1_c = Normal(30, 5)
dist_S1_theta = Normal(30, 1)
dist_S1_ko = Uniform(loc=0.6, scale=0.5)
# Soil2 parameters
dist_S2_Erel = Uniform(loc=2, scale=1)
dist_S2_E50 = Normal(130, 10)
dist_S2_c = Normal(5, 1)
dist_S2_theta = Normal(42, 1)
dist_S2_ko = Uniform(loc=0.45, scale=0.2)

marginals = [
    dist_Ec, dist_Relax,
    dist_S1_Erel, dist_S1_E50, dist_S1_c, dist_S1_theta, dist_S1_ko,
    dist_S2_Erel, dist_S2_E50, dist_S2_c, dist_S2_theta, dist_S2_ko
]
assert len(marginals) == n_inputs, "one marginal distribution is required per input column"
joint = JointIndependent(marginals=marginals)


def _fit_sapce_quiet(x_design, y_design):
    """Build and prune a SAPCE model on the given design, silencing SAPCE's console output.

    Uses the module-level settings ``joint``, ``deg``, ``n_inputs``, ``max_cond`` and ``cr``.
    """
    model = SAPCE.SensitivityAdaptivePCE(
        pdf=joint,
        exp_design_in=x_design,
        exp_design_out=y_design,
        max_partial_degree=deg,
        num_inputs=n_inputs
    )
    with contextlib.redirect_stdout(io.StringIO()), contextlib.redirect_stderr(io.StringIO()):
        model.construct_adaptive_basis(max_condition_number=max_cond)
        model.construct_pruned_pce(cr=cr)
    return model


# ---- Theta-criterion point selection -----------------------------------------
theta_out_idx = np.arange(n_outputs, dtype=int)  # Theta sees all outputs

# Training matrices start with the initial design and grow by one row per step.
X_theta = X[start_idx].copy()
Y_theta_sel = Y_N[start_idx][:, theta_out_idx].copy()

theta_body_idx = []  # dataset indices picked by Theta
theta_positions = []  # position of each pick inside the candidate array (diagnostics only)

print("Theta selection")
print(f"deg={deg}, cr={cr}, max_cond={max_cond}, add_points={theta_add_points}")

t0 = time.perf_counter()  # monotonic clock for the elapsed-time report

for k in range(theta_add_points):
    sapce_sel = _fit_sapce_quiet(X_theta, Y_theta_sel)

    pce_sel = sapce_sel.pce  # PCE model held by the SAPCE object
    pce_list_sel = pce_to_list(pce_sel)  # one PCE per output for the Theta criterion

    theta = ThetaCriterionPCE(pce_list_sel)
    cand_theta = X[cand_idx]
    i = int(theta.run(X_theta, cand_theta))  # position of the chosen candidate
    new_idx = int(cand_idx[i])
    theta_body_idx.append(new_idx)
    theta_positions.append(i)

    # Move the chosen point from the candidates into the training design.
    X_theta = np.vstack([X_theta, X[new_idx].reshape(1, -1)])
    Y_theta_sel = np.vstack([Y_theta_sel, Y_N[new_idx, theta_out_idx].reshape(1, -1)])
    cand_idx = np.delete(cand_idx, i)

    # Progress report every 5 steps and at the end.
    if (k + 1) % 5 == 0 or (k + 1) == theta_add_points:
        print(f"Theta step {k+1:02d}/{theta_add_points}: picked idx={new_idx}")

print(f"Theta selection time: {time.perf_counter() - t0:.2f} s")
theta_body_idx = np.array(theta_body_idx, dtype=int)
print(f"Theta picked indices (all): {theta_body_idx.tolist()}")

# NOTE(review): in the recorded run the picks were simply the lowest remaining
# indices (0, 2, 3, ...), i.e. Theta apparently returned position 0 every time.
# That pattern suggests a degenerate criterion (e.g. all-NaN or all-equal values);
# the cause is not visible here, so only warn.
if theta_add_points > 1 and all(pos == 0 for pos in theta_positions):
    print("WARNING: Theta always selected the first candidate; the criterion may be degenerate")

# ---- Final model ------------------------------------------------------------
# Final training set: initial design plus the Theta picks.
train_idx = np.concatenate([start_idx, theta_body_idx])
X_train = X[train_idx]
Y_train = Y_N[train_idx]

print("FINAL SAPCE")
print(f"train points: {len(train_idx)} (start {len(start_idx)} + theta {len(theta_body_idx)})")

sapce_final = _fit_sapce_quiet(X_train, Y_train)
pce_final_multi = sapce_final.pce
pce_final_list = pce_to_list(pce_final_multi)
Y_hat = predict_pce_list(pce_final_list, X)  # predictions for every row of the dataset
assert Y_hat.shape == Y_N.shape, f"prediction shape {Y_hat.shape} != data shape {Y_N.shape}"

# ---- Export -----------------------------------------------------------------
# Collect all columns first and build the frame once; inserting 200+ columns one
# by one fragments the DataFrame and triggers a pandas PerformanceWarning.
csv_columns = {"row_idx": np.arange(n_samples)}
for j, col in enumerate(x_names):
    csv_columns[col] = X[:, j]
for j, y_name in enumerate(y_names):
    col_name = str(y_name)
    csv_columns[f"true_{col_name}"] = Y_N[:, j]
    csv_columns[f"pred_{col_name}"] = Y_hat[:, j]
df_out = pd.DataFrame(csv_columns)

out_csv = "all_points_true_vs_pred_final_sapce.csv"
df_out.to_csv(out_csv, index=False)
print(f"Saved: {out_csv}")

# Excel
out_xlsx = "all_points_true_vs_pred_final_sapce.xlsx"
export_excel(
    X=X,
    Y_true=Y_N,
    Y_pred=Y_hat,
    x_names=x_names,
    y_names=list(y_names),
    train_idx=train_idx,
    out_xlsx=out_xlsx
)
print(f"Saved: {out_xlsx}")

Loading dataset
Loaded Oakwood_NVM.csv: X=(1000, 12), Y_N=(1000, 113)
Selected N as the first 113 output columns (fallback)
start_idx: n=30 min=1 max=993
cand_idx : n=970 min=0 max=999
Theta selection
deg=3, cr=0.0001, max_cond=100000000.0, add_points=20
Theta step 05/20: picked idx=5
Theta step 10/20: picked idx=10
Theta step 15/20: picked idx=16
Theta step 20/20: picked idx=21
Theta selection time: 91.27 s
Theta picked indices (all): [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21]
FINAL SAPCE
train points: 50 (start 30 + theta 20)
Saved: all_points_true_vs_pred_final_sapce.csv
Saved: all_points_true_vs_pred_final_sapce.xlsx
