In [1]:
%load_ext autoreload
%autoreload 2

import arviz as az
import json
import numpy as np
import pandas
import pymc as pm
from matplotlib import pyplot as plt
from pathlib import Path

In [4]:
# Excel workbooks with the validation results, one per combination of
# model, data distribution and noise level (as encoded in the file names).
raw_data_files = [
    Path(workbook)
    for workbook in (
        "./paper raw data/synthetic data sets for validation/Normal model_normal data_noise level 0.6.xlsx",
        "./paper raw data/synthetic data sets for validation/Normal model_normal data_noise level 1.2.xlsx",
        "./paper raw data/synthetic data sets for validation/Normal model_skew normal data_noise level 0.6.xlsx",
        "./paper raw data/synthetic data sets for validation/Skew normal model_skew normal data_noise level 0.6.xlsx",
        "./paper raw data/synthetic data sets for validation/Skew normal model_skew normal data_noise level 1.2.xlsx",
        "./paper raw data/synthetic data sets for validation/Skew normal model_normal data_noise level 0.6.xlsx",
    )
]

# Index labels of the parameters that are evaluated for every workbook.
parameters = [
    "mean",
    "std",
    "area",
    "height",
    "alpha",
    "baseline_intercept",
    "baseline_slope",
]

### Prepare data in df_results

In [5]:
def _summarise_parameter(summary, param):
    """Summarise how well one parameter was recovered in a single workbook.

    Parameters
    ----------
    summary : pandas.DataFrame
        Table read from one workbook. Its index holds the parameter names and
        it provides the columns "mean", "sd" and "true_values".
    param : str
        Index label of the parameter to evaluate.

    Returns
    -------
    dict
        ``"parameter"``: the evaluated label.
        ``"ratio_mean_to_truth"``: ``(np.mean, np.std)`` of ``|mean / true_values|``.
        ``"absolute_difference"``: ``(np.mean, np.std)`` of ``mean - true_values``.
        ``"within_range_of_3_stds"`` / ``"within_range_of_1_std"``: fraction of
        rows whose true value lies inside ``mean ± 3·sd`` / ``mean ± sd``.

    Raises
    ------
    KeyError
        If ``param`` does not occur in the index of ``summary``.
    """
    # A list indexer always returns a DataFrame, even if only one row matches.
    rows = summary.loc[[param], ["mean", "sd", "true_values"]]
    print(set(rows.index))
    estimate, spread, truth = rows["mean"], rows["sd"], rows["true_values"]

    # Work on derived Series instead of adding columns to a slice of `summary`.
    ratio_mean_to_truth = np.abs(estimate / truth)
    absolute_difference = estimate - truth
    # Vectorised interval checks (NaN compares as False, like the chained comparison).
    within_1_std = (estimate - spread <= truth) & (truth <= estimate + spread)
    within_3_stds = (estimate - 3 * spread <= truth) & (truth <= estimate + 3 * spread)

    return {
        "parameter": param,
        "ratio_mean_to_truth": (np.mean(ratio_mean_to_truth), np.std(ratio_mean_to_truth)),
        "absolute_difference": (np.mean(absolute_difference), np.std(absolute_difference)),
        "within_range_of_3_stds": np.count_nonzero(within_3_stds) / len(rows),
        "within_range_of_1_std": np.count_nonzero(within_1_std) / len(rows),
    }


# (data_distribution, model_distribution) of each workbook, in the order of
# `raw_data_files`. Keeping the labels in one table avoids scattered position checks.
distribution_labels = [
    ("normal", "normal"),
    ("normal", "normal"),
    ("skew normal", "normal"),
    ("skew normal", "skew normal"),
    ("skew normal", "skew normal"),
    ("normal", "skew normal"),
]
if len(distribution_labels) != len(raw_data_files):
    raise ValueError(
        f"expected {len(distribution_labels)} raw data files, got {len(raw_data_files)}"
    )

records = []
for path, (data_distribution, model_distribution) in zip(raw_data_files, distribution_labels):
    # load the summary once per workbook (it does not depend on the parameter)
    summary = pandas.read_excel(path, index_col=0)
    # Sampling settings of the workbook; taken from the first row.
    # NOTE(review): assumes these columns are constant within a workbook - confirm.
    run_settings = {
        "noise_level": summary["noise_scale"].iloc[0],
        "tuning samples": summary["tuning_samples"].iloc[0],
        "draws": summary["draws"].iloc[0],
    }
    for param in parameters:
        # the normal distribution does not have the alpha parameter, so skip it there
        if model_distribution == "normal" and param == "alpha":
            print("alpha skip" + param)
            continue
        records.append(
            {
                "path": path,
                **_summarise_parameter(summary, param),
                **run_settings,
                "data_distribution": data_distribution,
                "model_distribution": model_distribution,
            }
        )

# Build the result table in one go instead of concatenating inside the loop.
# The rows get a unique 0..n-1 index (the repeated concat gave every row index 0).
df_results = pandas.DataFrame(records)

{'mean'}
{'std'}
{'area'}
{'height'}
alpha skipalpha
{'baseline_intercept'}
{'baseline_slope'}
{'mean'}
{'std'}
{'area'}
{'height'}
alpha skipalpha
{'baseline_intercept'}
{'baseline_slope'}
{'mean'}
{'std'}
{'area'}
{'height'}
alpha skipalpha
{'baseline_intercept'}
{'baseline_slope'}
{'mean'}
{'std'}
{'area'}
{'height'}
{'alpha'}
{'baseline_intercept'}
{'baseline_slope'}
{'mean'}
{'std'}
{'area'}
{'height'}
{'alpha'}
{'baseline_intercept'}
{'baseline_slope'}
{'mean'}
{'std'}
{'area'}
{'height'}
{'alpha'}
{'baseline_intercept'}
{'baseline_slope'}


In [6]:
# Print every (model, data, noise level) combination contained in df_results.
# drop_duplicates keeps the order of first appearance, so the listing is the
# same on every run; iterating over set() of strings depends on Python's hash
# randomisation and could reorder the lines between kernel restarts.
scenario_columns = ["model_distribution", "data_distribution", "noise_level"]
scenarios = df_results.loc[:, scenario_columns].drop_duplicates()
for model, data, noise in scenarios.itertuples(index=False, name=None):
    print(f"model: {model}, data: {data}, noise level: {noise}")

model: normal, data: normal, noise level: 0.6
model: normal, data: normal, noise level: 1.2
model: normal, data: skew normal, noise level: 0.6
model: skew normal, data: skew normal, noise level: 0.6
model: skew normal, data: skew normal, noise level: 1.2
model: skew normal, data: normal, noise level: 0.6


In [7]:
# Narrow the results step by step: skew normal model -> skew normal data -> noise level 0.6.
dfdf = df_results.loc[df_results["model_distribution"] == "skew normal"]
dfdf2 = dfdf.loc[dfdf["data_distribution"] == "skew normal"]
dfdf3 = dfdf2.loc[dfdf2["noise_level"] == 0.6]

In [8]:
# Collect, per scenario, the mean and the standard deviation of the
# mean-to-truth ratio for the parameters that are kept below:
#   all_data[label] = [[means...], [standard deviations...]]
scenario_columns = ["model_distribution", "data_distribution", "noise_level"]
excluded_parameters = ["alpha", "baseline_intercept", "baseline_slope"]

all_data = {}
# sort=False yields the groups in order of first appearance, which makes the
# insertion order of all_data reproducible (iterating over set() of strings
# depends on Python's hash randomisation).
for (model, data, noise), scenario in df_results.groupby(scenario_columns, sort=False):
    kept = scenario[~scenario.loc[:, "parameter"].isin(excluded_parameters)]
    # each entry is a (mean, std) tuple
    ratios = list(kept.loc[:, "ratio_mean_to_truth"])
    if noise == 1.2:
        label = f"{data} data (higher noise), {model} model"
    else:
        label = f"{data} data, {model} model"
    all_data[label] = [[pair[0] for pair in ratios], [pair[1] for pair in ratios]]
all_data

{'normal data, normal model': [[0.9999001336005343,
   1.0025702609677298,
   1.0017147282600856,
   1.000123572878139],
  [0.002286555631402294,
   0.028900726978078068,
   0.02958680525019264,
   0.022445046960539197]],
 'normal data (higher noise), normal model': [[0.9997316666666668,
   1.0059567381829964,
   1.001356598861276,
   0.9977187067316658],
  [0.004410296979166418,
   0.05488690135089093,
   0.055093378982298734,
   0.04168657187789078]],
 'skew normal data, normal model': [[0.9990176666666667,
   0.7598253910963016,
   0.9869124703934096,
   0.9889579711666672],
  [0.04540922653553522,
   0.1425229338854569,
   0.029251994462966387,
   0.02178598822049324]],
 'skew normal data, skew normal model': [[1.0003276666666665,
   1.0178059537564914,
   0.9995769654521169,
   0.9994046368514812],
  [0.022164664598810824,
   0.08144664654979102,
   0.02553221429137138,
   0.019596288333603468]],
 'skew normal data (higher noise), skew normal model': [[0.9975454545454545,
   1.062

In [9]:
# Order in which the scenarios should appear in the reordered dictionary.
rearrange = [
    'normal data, normal model',
    'normal data (higher noise), normal model',
    'skew normal data, normal model',
    'skew normal data, skew normal model',
    'skew normal data (higher noise), skew normal model',
    'normal data, skew normal model',
]
# Rebuild the dictionary so that its keys follow exactly that order.
reordered_dict = dict((scenario, all_data[scenario]) for scenario in rearrange)
reordered_dict.keys()

dict_keys(['normal data, normal model', 'normal data (higher noise), normal model', 'skew normal data, normal model', 'skew normal data, skew normal model', 'skew normal data (higher noise), skew normal model', 'normal data, skew normal model'])

In [10]:
# save processed data in file

# with open('all_data.txt', 'w') as file:
#     file.write(json.dumps(reordered_dict)) # use `json.loads` to do the reverse

In [11]:
# Load the watermark extension and print a "Last updated: <timestamp>" line for this run.
%load_ext watermark
%watermark -idu

Last updated: 2024-10-13T15:03:43.532805+02:00

