In [1]:
import numpy as np
import matplotlib as mpl

mpl.use("Cairo")  # for saving SVGs that Affinity Designer can parse
import matplotlib.pyplot as plt
import pathlib as pl
import dill

import candas as can
import gumbi as gmb
from candas.learn import ParameterSet

from tqdm.auto import tqdm

# Directory layout: the figure directory is the parent of the code directory,
# and holds the input data plus two output folders.
code_pth = pl.Path.cwd()  # for running in Jupyter
# NOTE(review): when run from a terminal, `pl.Path(__file__)` is the script
# file itself, so the equivalent of the line above would presumably be:
# code_pth = pl.Path(__file__).parent
fig_pth = code_pth.parent
data_pth, graph_pth, gen_pth = (
    fig_pth / folder for folder in ("data", "graphics", "generated")
)

# Output folders are created on first use; `data_pth` is only read from.
graph_pth.mkdir(exist_ok=True)
gen_pth.mkdir(exist_ok=True)

plt.style.use(str(can.style.breve))

In [2]:
# plt.style.use('style.mplstyle')

%config InlineBackend.figure_format = 'retina'

from utils import savefig

In [3]:
# Load the saved parameter set from the data directory.
# NOTE(review): the .pkl extension suggests a pickle-based format — only load
# files that come from a trusted source.
ps = ParameterSet.load(data_pth / "ADVI_ParameterSets_220528.pkl")


def make_pair(row):
    """Build an order-independent primer-pair label for one row.

    The row's ``FPrimer`` and ``RPrimer`` values are sorted and joined with a
    hyphen, so swapping the two primers yields the same label.
    """
    primers = [row.FPrimer, row.RPrimer]
    primers.sort()
    return "-".join(primers)


# Keep only the rows whose Metric is "mean", make BP numeric, and label each
# row with its order-independent primer pair.
mean_rows = ps.wide.query('Metric == "mean"').astype({"BP": float})
mean_rows = mean_rows.assign(PrimerPair=mean_rows.apply(make_pair, axis=1))

# Average the numeric columns per (Target, PrimerPair, Reporter) group.
# drop_duplicates() runs while the group keys are still in the index, so rows
# are compared on the averaged numeric columns only; the keys come back as
# ordinary columns afterwards.
# NOTE(review): distinct groups with identical averages collapse to a single
# row here — confirm that this is intended.
grouped_means = mean_rows.groupby(["Target", "PrimerPair", "Reporter"]).mean(
    numeric_only=True
)
data = grouped_means.drop_duplicates().reset_index()

# Count the rows of `data` per (PrimerPair, Reporter) combination and rank the
# combinations from most to fewest observations.
pair_counts = (
    data.groupby(["PrimerPair", "Reporter"]).size().reset_index(name="Observations")
)
ranked_pairs = pair_counts.sort_values("Observations", ascending=False).reset_index(
    drop=True
)

# NOTE(review): hand-picked positions in the ranking; they depend on the row
# order produced by the sort above and will shift if the data changes.
selected = ranked_pairs.iloc[[0, 1, 4, 5, 6, 8, 38, 39, 42]]

In [4]:
# DataSet over every row of `data`; its standardizer is reused by the
# cross-validation cells so all splits share the same transformation.
full_spec = {
    "outputs": ["F0_lg", "r", "K", "m"],
    "log_vars": ["BP", "K", "m", "r"],
    "logit_vars": ["GC"],
}
ds_full = gmb.DataSet(data=data, **full_spec)
stdzr = ds_full.stdzr

In [5]:
# Rows for primer pair FP004-RP004 with the HEX reporter form the
# cross-validation pool; every other row is always available for training.
is_xval_row = (data.PrimerPair == 'FP004-RP004') & (data.Reporter == 'HEX')

static_data = data[~is_xval_row]

# Re-index the pool from 0 so positional boolean masks line up with it.
xval_data = data[is_xval_row].reset_index(drop=True)

In [6]:
from scipy.special import comb

In [7]:
J = 10  # number of random splits per training-set size (the inner loops use range(J))
i = 3  # NOTE(review): overwritten by the loop variable `i` in the cross-validation cells; appears to be leftover
I = len(xval_data)  # number of rows in the cross-validation pool
    
def get_train_vec(x, i, n_total=None):
    """
    Get a training vector for the cross-validation

    Parameters
    ----------
    x : int
        Split identifier. Together with the pool size and ``i`` it forms the
        seed ``x * n_total + i``, so a given ``(x, i)`` always yields the same
        split.
    i : int
        Number of rows to flag as training rows.
    n_total : int, optional
        Size of the pool being split. Defaults to the module-level ``I`` so
        existing two-argument calls behave exactly as before.

    Returns
    -------
    numpy.ndarray
        Boolean vector of length ``n_total`` with exactly ``i`` True entries
        in a seeded random order.

    Raises
    ------
    ValueError
        If ``i`` is negative or larger than the pool. Previously this silently
        produced a vector of the wrong length.
    """
    n = I if n_total is None else n_total
    if not 0 <= i <= n:
        raise ValueError(f"i must be between 0 and {n}, got {i}")

    # First i entries True, the rest False; shuffled in place below.
    is_train = np.arange(n) < i
    np.random.RandomState(x * n + i).shuffle(is_train)
    return is_train

In [8]:
import pandas as pd
from numpy.linalg import LinAlgError

# Cross-validation with results stored in Avg_model_xval.csv.
# For every training-set size `i` and repeat `j`, `i` randomly chosen rows of
# `xval_data` are added to `static_data`, a GP over the continuous dims BP and
# GC is fit, and NLPD / RMSE of `r` are recorded for the chosen (train) and
# remaining (test) rows of `xval_data`. The CSV is rewritten after every fit
# so an interrupted run resumes where it stopped.

xval_csv = gen_pth / "Avg_model_xval.csv"

if xval_csv.exists():
    # TrainCode is a string of 0/1 flags. Force it to text: left to type
    # inference, read_csv may parse all-digit values as numbers and drop the
    # leading zeros, which would defeat the duplicate-split check below.
    df = pd.read_csv(xval_csv, dtype={"TrainCode": str})
    print(df[["N_train", "Iteration"]].astype(int).apply(tuple, axis=1).tolist())
else:
    df = pd.DataFrame(
        columns=[
            "N_train",
            "Iteration",
            "Train_NLPD",
            "Train_RMSE",
            "Test_NLPD",
            "Test_RMSE",
            "TrainCode",
        ]
    )

# Bookkeeping kept in sets so the loop does not rescan `df` on every pass.
done = set(
    zip(df["N_train"].astype(int).tolist(), df["Iteration"].astype(int).tolist())
)
seen_codes = set(df["TrainCode"].dropna())

# Every DataSet shares the same variable roles and the global standardizer.
ds_kwargs = dict(
    outputs=["F0_lg", "r", "K", "m"],
    log_vars=["BP", "K", "m", "r"],
    logit_vars=["GC"],
    stdzr=stdzr,
)

for i in tqdm(range(1, I + 1, 5), desc="Training set size", leave=True):
    # Number of distinct ways to choose i training rows out of I.
    n_possible = comb(I, i, exact=True)

    for j in tqdm(range(J), desc="Iteration", leave=False):

        if (i, j) in done:
            continue

        # If every possible split of this size is already recorded, the
        # retry loop below could never find a new one and would spin forever.
        n_used = sum(len(code) == I and code.count("1") == i for code in seen_codes)
        if n_used >= n_possible:
            print(f"No unused split left for N_train={i}; skipping iteration {j}")
            continue

        # Draw a split; re-draw with alternative seeds until it is unseen.
        is_train = get_train_vec(j, i)
        train_code = "".join(is_train.astype(int).astype(str))
        k = 0
        while train_code in seen_codes:
            is_train = get_train_vec(j * 1000 + k * I, i)
            train_code = "".join(is_train.astype(int).astype(str))
            k += 1

        xval_train = xval_data.iloc[is_train]
        train_data = pd.concat([static_data, xval_train])
        test_data = xval_data.iloc[~is_train]

        train_ds = gmb.DataSet(data=train_data, **ds_kwargs)
        test_ds = gmb.DataSet(data=test_data, **ds_kwargs)

        try:
            gp = gmb.GP(train_ds).fit(
                continuous_dims=["BP", "GC"], progressbar=False
            )
            test_preds = gp.predict_points(
                gp.parray(**test_ds.wide[["BP", "GC"]].to_dict(orient="list"))
            ).get("r")
            train_preds = gp.predict_points(
                gp.parray(**xval_train[["BP", "GC"]].to_dict(orient="list"))
            ).get("r")
        except LinAlgError:
            # Best effort: nothing is recorded for a failed fit, so this
            # (i, j) is attempted again on the next run of the cell.
            continue

        test_rmse = np.sqrt(np.mean((test_preds.μ - test_ds.wide["r"]) ** 2))
        train_rmse = np.sqrt(np.mean((train_preds.μ - xval_train["r"]) ** 2))

        test_nlpd = -test_preds.dist.logpdf(test_ds.wide["r"]).mean()
        train_nlpd = -train_preds.dist.logpdf(xval_train["r"]).mean()

        row = {
            "N_train": i,
            "Iteration": j,
            "Train_NLPD": train_nlpd,
            "Train_RMSE": train_rmse,
            "Test_NLPD": test_nlpd,
            "Test_RMSE": test_rmse,
            "TrainCode": train_code,
        }

        # Checkpoint after every fit so completed work survives interruption.
        df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
        df.to_csv(xval_csv, index=False)
        done.add((i, j))
        seen_codes.add(train_code)

[(1, 0), (1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), (1, 9), (6, 0), (6, 1), (6, 2), (6, 3), (6, 4), (6, 5), (6, 6), (6, 7), (6, 8), (6, 9), (11, 0), (11, 1), (11, 2), (11, 3), (11, 4), (11, 5), (11, 6), (11, 7), (11, 8), (11, 9), (16, 0), (16, 1), (16, 2), (16, 3), (16, 4), (16, 5), (16, 6), (16, 7), (16, 8), (16, 9), (21, 0), (21, 1), (21, 2), (21, 3), (21, 4), (21, 5), (21, 6), (21, 7), (21, 8), (21, 9), (26, 0), (26, 1), (26, 2), (26, 3), (26, 4), (26, 5), (26, 6), (26, 7), (26, 8), (26, 9), (31, 0), (31, 1), (31, 2), (31, 3), (31, 4), (31, 5), (31, 6), (31, 7), (31, 8), (31, 9), (36, 0), (36, 1), (36, 2), (36, 3), (36, 4), (36, 5), (36, 6), (36, 7), (36, 8), (36, 9)]


Training set size:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

In [9]:
import pandas as pd
from numpy.linalg import LinAlgError

# Cross-validation with results stored in Ind_model_xval.csv.
# For every training-set size `i` and repeat `j`, a GP over the continuous
# dims BP and GC is fit on `i` randomly chosen rows of `xval_data` alone
# (no `static_data`), and NLPD / RMSE of `r` are recorded for the chosen
# (train) and remaining (test) rows. The CSV is rewritten after every fit so
# an interrupted run resumes where it stopped.

xval_csv = gen_pth / "Ind_model_xval.csv"

if xval_csv.exists():
    # TrainCode is a string of 0/1 flags. Force it to text: left to type
    # inference, read_csv may parse all-digit values as numbers and drop the
    # leading zeros, which would defeat the duplicate-split check below.
    df = pd.read_csv(xval_csv, dtype={"TrainCode": str})
    print(df[["N_train", "Iteration"]].astype(int).apply(tuple, axis=1).tolist())
else:
    df = pd.DataFrame(
        columns=[
            "N_train",
            "Iteration",
            "Train_NLPD",
            "Train_RMSE",
            "Test_NLPD",
            "Test_RMSE",
            "TrainCode",
        ]
    )

# Bookkeeping kept in sets so the loop does not rescan `df` on every pass.
done = set(
    zip(df["N_train"].astype(int).tolist(), df["Iteration"].astype(int).tolist())
)
seen_codes = set(df["TrainCode"].dropna())

# Every DataSet shares the same variable roles and the global standardizer.
ds_kwargs = dict(
    outputs=["F0_lg", "r", "K", "m"],
    log_vars=["BP", "K", "m", "r"],
    logit_vars=["GC"],
    stdzr=stdzr,
)

for i in tqdm(range(1, I + 1, 5), desc="Training set size", leave=True):
    # Number of distinct ways to choose i training rows out of I.
    n_possible = comb(I, i, exact=True)

    for j in tqdm(range(J), desc="Iteration", leave=False):

        if (i, j) in done:
            continue

        # If every possible split of this size is already recorded, the
        # retry loop below could never find a new one and would spin forever.
        n_used = sum(len(code) == I and code.count("1") == i for code in seen_codes)
        if n_used >= n_possible:
            print(f"No unused split left for N_train={i}; skipping iteration {j}")
            continue

        # Draw a split; re-draw with alternative seeds until it is unseen.
        is_train = get_train_vec(j, i)
        train_code = "".join(is_train.astype(int).astype(str))
        k = 0
        while train_code in seen_codes:
            is_train = get_train_vec(j * 1000 + k * I, i)
            train_code = "".join(is_train.astype(int).astype(str))
            k += 1

        train_data = xval_data.iloc[is_train]
        test_data = xval_data.iloc[~is_train]

        train_ds = gmb.DataSet(data=train_data, **ds_kwargs)
        test_ds = gmb.DataSet(data=test_data, **ds_kwargs)

        try:
            if train_ds.wide.shape[0] > 1:
                gp = gmb.GP(train_ds).fit(
                    continuous_dims=["BP", "GC"], progressbar=False
                )
            else:
                # Single training row: the model is assembled step by step
                # instead of calling `fit`.
                # NOTE(review): presumably replicates what `fit` does
                # internally; it relies on private gumbi methods, so verify
                # against the installed gumbi version.
                gp = gmb.GP(train_ds)
                gp.specify_model(continuous_dims=["BP", "GC"])
                gp.filter_dims = {}
                gp.continuous_dims = ["BP", "GC"]
                gp.continuous_levels = gp._parse_levels(gp.continuous_dims, None)
                gp.continuous_coords = gp._parse_coordinates(
                    gp.continuous_dims, gp.continuous_levels, None
                )
                gp.build_model()
                gp.find_MAP(progressbar=False)

            test_preds = gp.predict_points(
                gp.parray(**test_ds.wide[["BP", "GC"]].to_dict(orient="list"))
            ).get("r")
            train_preds = gp.predict_points(
                gp.parray(**train_ds.wide[["BP", "GC"]].to_dict(orient="list"))
            ).get("r")
        except LinAlgError:
            # Best effort: nothing is recorded for a failed fit, so this
            # (i, j) is attempted again on the next run of the cell.
            continue

        test_rmse = np.sqrt(np.mean((test_preds.μ - test_ds.wide["r"]) ** 2))
        train_rmse = np.sqrt(np.mean((train_preds.μ - train_ds.wide["r"]) ** 2))

        test_nlpd = -test_preds.dist.logpdf(test_ds.wide["r"]).mean()
        train_nlpd = -train_preds.dist.logpdf(train_ds.wide["r"]).mean()

        row = {
            "N_train": i,
            "Iteration": j,
            "Train_NLPD": train_nlpd,
            "Train_RMSE": train_rmse,
            "Test_NLPD": test_nlpd,
            "Test_RMSE": test_rmse,
            "TrainCode": train_code,
        }

        # Checkpoint after every fit so completed work survives interruption.
        df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
        df.to_csv(xval_csv, index=False)
        done.add((i, j))
        seen_codes.add(train_code)

[(1, 0), (1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), (1, 9), (6, 0), (6, 1), (6, 2), (6, 3), (6, 4), (6, 5), (6, 6), (6, 7), (6, 8), (6, 9), (11, 0), (11, 1), (11, 2), (11, 3), (11, 4), (11, 5), (11, 6), (11, 7), (11, 8), (11, 9), (16, 0), (16, 1), (16, 2), (16, 3), (16, 4), (16, 5), (16, 6), (16, 7), (16, 8), (16, 9), (21, 0), (21, 1), (21, 2), (21, 3), (21, 4), (21, 5), (21, 6), (21, 7), (21, 8), (21, 9), (26, 0), (26, 1), (26, 2), (26, 3), (26, 4), (26, 5), (26, 6), (26, 7), (26, 8), (26, 9), (31, 0), (31, 1), (31, 2), (31, 3), (31, 4), (31, 5), (31, 6), (31, 7), (31, 8), (31, 9), (36, 0), (36, 1), (36, 2), (36, 3), (36, 4), (36, 5), (36, 6), (36, 7), (36, 8), (36, 9)]


Training set size:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

In [10]:
import pandas as pd
from numpy.linalg import LinAlgError

# Cross-validation with results stored in LMC_model_xval.csv.
# For every training-set size `i` and repeat `j`, `i` randomly chosen rows of
# `xval_data` are added to `static_data`, a GP with continuous dims BP and GC
# and categorical dims PrimerPair and Reporter is fit, and NLPD / RMSE of `r`
# are recorded for the chosen (train) and remaining (test) rows, predicted at
# the FP004-RP004 / HEX category. The CSV is rewritten after every fit so an
# interrupted run resumes where it stopped.

xval_csv = gen_pth / "LMC_model_xval.csv"

if xval_csv.exists():
    # TrainCode is a string of 0/1 flags. Force it to text: left to type
    # inference, read_csv may parse all-digit values as numbers and drop the
    # leading zeros, which would defeat the duplicate-split check below.
    df = pd.read_csv(xval_csv, dtype={"TrainCode": str})
    print(df[["N_train", "Iteration"]].astype(int).apply(tuple, axis=1).tolist())
else:
    df = pd.DataFrame(
        columns=[
            "N_train",
            "Iteration",
            "Train_NLPD",
            "Train_RMSE",
            "Test_NLPD",
            "Test_RMSE",
            "TrainCode",
        ]
    )

# Bookkeeping kept in sets so the loop does not rescan `df` on every pass.
done = set(
    zip(df["N_train"].astype(int).tolist(), df["Iteration"].astype(int).tolist())
)
seen_codes = set(df["TrainCode"].dropna())

# Every DataSet shares the same variable roles and the global standardizer.
ds_kwargs = dict(
    outputs=["F0_lg", "r", "K", "m"],
    log_vars=["BP", "K", "m", "r"],
    logit_vars=["GC"],
    stdzr=stdzr,
)

# Category at which the cross-validation rows are predicted.
xval_category = {"PrimerPair": "FP004-RP004", "Reporter": "HEX"}

for i in tqdm(range(1, I + 1, 5), desc="Training set size", leave=True):
    # Number of distinct ways to choose i training rows out of I.
    n_possible = comb(I, i, exact=True)

    for j in tqdm(range(J), desc="Iteration", leave=False):

        if (i, j) in done:
            continue

        # If every possible split of this size is already recorded, the
        # retry loop below could never find a new one and would spin forever.
        n_used = sum(len(code) == I and code.count("1") == i for code in seen_codes)
        if n_used >= n_possible:
            print(f"No unused split left for N_train={i}; skipping iteration {j}")
            continue

        # Draw a split; re-draw with alternative seeds until it is unseen.
        is_train = get_train_vec(j, i)
        train_code = "".join(is_train.astype(int).astype(str))
        k = 0
        while train_code in seen_codes:
            is_train = get_train_vec(j * 1000 + k * I, i)
            train_code = "".join(is_train.astype(int).astype(str))
            k += 1

        xval_train = xval_data.iloc[is_train]
        train_data = pd.concat([static_data, xval_train])
        test_data = xval_data.iloc[~is_train]

        train_ds = gmb.DataSet(data=train_data, **ds_kwargs)
        test_ds = gmb.DataSet(data=test_data, **ds_kwargs)

        try:
            gp = gmb.GP(train_ds).fit(
                continuous_dims=["BP", "GC"],
                categorical_dims=["PrimerPair", "Reporter"],
                progressbar=False,
            )

            test_pa = gp.parray(**test_ds.wide[["BP", "GC"]].to_dict(orient="list"))
            test_pa = gp.append_categorical_points(test_pa, xval_category)
            test_preds = gp.predict_points(test_pa).get("r")

            train_pa = gp.parray(**xval_train[["BP", "GC"]].to_dict(orient="list"))
            train_pa = gp.append_categorical_points(train_pa, xval_category)
            train_preds = gp.predict_points(train_pa).get("r")
        except LinAlgError:
            # Best effort: nothing is recorded for a failed fit, so this
            # (i, j) is attempted again on the next run of the cell.
            continue

        test_rmse = np.sqrt(np.mean((test_preds.μ - test_ds.wide["r"]) ** 2))
        train_rmse = np.sqrt(np.mean((train_preds.μ - xval_train["r"]) ** 2))

        test_nlpd = -test_preds.dist.logpdf(test_ds.wide["r"]).mean()
        train_nlpd = -train_preds.dist.logpdf(xval_train["r"]).mean()

        row = {
            "N_train": i,
            "Iteration": j,
            "Train_NLPD": train_nlpd,
            "Train_RMSE": train_rmse,
            "Test_NLPD": test_nlpd,
            "Test_RMSE": test_rmse,
            "TrainCode": train_code,
        }

        # Checkpoint after every fit so completed work survives interruption.
        df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
        df.to_csv(xval_csv, index=False)
        done.add((i, j))
        seen_codes.add(train_code)

[(1, 0), (1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), (1, 9), (6, 0), (6, 1), (6, 2), (6, 3), (6, 4), (6, 5), (6, 6), (6, 7), (6, 8), (6, 9), (11, 0), (11, 1), (11, 2), (11, 3), (11, 4), (11, 5), (11, 6), (11, 7), (11, 8), (11, 9), (16, 0), (16, 1), (16, 2), (16, 3), (16, 4), (16, 5), (16, 6), (16, 7), (16, 8), (16, 9), (21, 0), (21, 1), (21, 2), (21, 3), (21, 4), (21, 5), (21, 6), (21, 7), (21, 8), (21, 9), (26, 0), (26, 1), (26, 2), (26, 3), (26, 4), (26, 5), (26, 6), (26, 7), (26, 8), (26, 9), (31, 0), (31, 1), (31, 2), (31, 3), (31, 4), (31, 5), (31, 6), (31, 7), (31, 8), (31, 9), (36, 0), (36, 1), (36, 2), (36, 3), (36, 4), (36, 5), (36, 6)]


Training set size:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
import pandas as pd
from numpy.linalg import LinAlgError

# Cross-validation with results stored in LMC_model2_xval.csv.
# For every training-set size `i` and repeat `j`, `i` randomly chosen rows of
# `xval_data` are added to `static_data`, a GP with continuous dims BP and GC
# and the single categorical dim PrimerPair is fit, and NLPD / RMSE of `r`
# are recorded for the chosen (train) and remaining (test) rows, predicted at
# the FP004-RP004 category. The CSV is rewritten after every fit so an
# interrupted run resumes where it stopped.

xval_csv = gen_pth / "LMC_model2_xval.csv"

if xval_csv.exists():
    # TrainCode is a string of 0/1 flags. Force it to text: left to type
    # inference, read_csv may parse all-digit values as numbers and drop the
    # leading zeros, which would defeat the duplicate-split check below.
    df = pd.read_csv(xval_csv, dtype={"TrainCode": str})
    print(df[["N_train", "Iteration"]].astype(int).apply(tuple, axis=1).tolist())
else:
    df = pd.DataFrame(
        columns=[
            "N_train",
            "Iteration",
            "Train_NLPD",
            "Train_RMSE",
            "Test_NLPD",
            "Test_RMSE",
            "TrainCode",
        ]
    )

# Bookkeeping kept in sets so the loop does not rescan `df` on every pass.
done = set(
    zip(df["N_train"].astype(int).tolist(), df["Iteration"].astype(int).tolist())
)
seen_codes = set(df["TrainCode"].dropna())

# Every DataSet shares the same variable roles and the global standardizer.
ds_kwargs = dict(
    outputs=["F0_lg", "r", "K", "m"],
    log_vars=["BP", "K", "m", "r"],
    logit_vars=["GC"],
    stdzr=stdzr,
)

# Category at which the cross-validation rows are predicted.
xval_category = {"PrimerPair": "FP004-RP004"}

for i in tqdm(range(1, I + 1, 5), desc="Training set size", leave=True):
    # Number of distinct ways to choose i training rows out of I.
    n_possible = comb(I, i, exact=True)

    for j in tqdm(range(J), desc="Iteration", leave=False):

        if (i, j) in done:
            continue

        # If every possible split of this size is already recorded, the
        # retry loop below could never find a new one and would spin forever.
        n_used = sum(len(code) == I and code.count("1") == i for code in seen_codes)
        if n_used >= n_possible:
            print(f"No unused split left for N_train={i}; skipping iteration {j}")
            continue

        # Draw a split; re-draw with alternative seeds until it is unseen.
        is_train = get_train_vec(j, i)
        train_code = "".join(is_train.astype(int).astype(str))
        k = 0
        while train_code in seen_codes:
            is_train = get_train_vec(j * 1000 + k * I, i)
            train_code = "".join(is_train.astype(int).astype(str))
            k += 1

        xval_train = xval_data.iloc[is_train]
        train_data = pd.concat([static_data, xval_train])
        test_data = xval_data.iloc[~is_train]

        train_ds = gmb.DataSet(data=train_data, **ds_kwargs)
        test_ds = gmb.DataSet(data=test_data, **ds_kwargs)

        try:
            gp = gmb.GP(train_ds).fit(
                continuous_dims=["BP", "GC"],
                categorical_dims=["PrimerPair"],
                progressbar=False,
            )

            test_pa = gp.parray(**test_ds.wide[["BP", "GC"]].to_dict(orient="list"))
            test_pa = gp.append_categorical_points(test_pa, xval_category)
            test_preds = gp.predict_points(test_pa).get("r")

            train_pa = gp.parray(**xval_train[["BP", "GC"]].to_dict(orient="list"))
            train_pa = gp.append_categorical_points(train_pa, xval_category)
            train_preds = gp.predict_points(train_pa).get("r")
        except LinAlgError:
            # Best effort: nothing is recorded for a failed fit, so this
            # (i, j) is attempted again on the next run of the cell.
            continue

        test_rmse = np.sqrt(np.mean((test_preds.μ - test_ds.wide["r"]) ** 2))
        train_rmse = np.sqrt(np.mean((train_preds.μ - xval_train["r"]) ** 2))

        test_nlpd = -test_preds.dist.logpdf(test_ds.wide["r"]).mean()
        train_nlpd = -train_preds.dist.logpdf(xval_train["r"]).mean()

        row = {
            "N_train": i,
            "Iteration": j,
            "Train_NLPD": train_nlpd,
            "Train_RMSE": train_rmse,
            "Test_NLPD": test_nlpd,
            "Test_RMSE": test_rmse,
            "TrainCode": train_code,
        }

        # Checkpoint after every fit so completed work survives interruption.
        df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
        df.to_csv(xval_csv, index=False)
        done.add((i, j))
        seen_codes.add(train_code)

Training set size:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# NOTE(review): leftover scratch expression with no effect on the analysis; this cell can be deleted.
1