In [1]:
# Path where the new datafiles are.
# Set the NNPDF_BUILDMASTER_PATH environment variable to point at your own checkout;
# when it is unset (or empty) the original default location below is used.
import os
from pathlib import Path
BUILDMASTER_PATH = Path(
    os.environ.get("NNPDF_BUILDMASTER_PATH")
    or "/mount/storage/Academic_Workspace/NNPDF/src/nnpdf/buildmaster"
).expanduser()

from validphys.core import CommonDataSpec
from reportengine.compat import yaml
import pandas as pd
import numpy as np
from collections import defaultdict

In [2]:
if False:
    # Playground for kinematics: one sub-table per top-level key of the YAML file,
    # joined side by side into a single frame.
    kinfile = BUILDMASTER_PATH / "NMCPD/kinematics.yaml"
    kinyaml = yaml.safe_load(kinfile.read_text())
    # Column labels: the YAML keys with every "kin_" occurrence removed
    keys = [key.replace("kin_", "") for key in kinyaml]
    kin_data = [pd.DataFrame.from_records(data, index="index") for data in kinyaml.values()]
    # Columns start as (key, column); swap the two levels and then order the
    # columns by the values found in the row labelled 1.
    combined = pd.concat(kin_data, axis=1, keys=keys)
    kin_df = combined.swaplevel(0, 1, axis=1).sort_values(1, axis="columns")
    
if False:     # Playground for uncertainties
    unfile = BUILDMASTER_PATH / "NMCPD/uncertainties.yaml"
    unyaml = yaml.safe_load(unfile.read_text())
    K_MODE = "Mode"
    K_TREAT = "Treatment"
    sys_dfs = defaultdict(list)

    # Each frame is wrapped in a single-key dict and concatenated along the columns:
    # this is a way of prepending one more level to the column MultiIndex.
    # The final column levels are (Type, Treatment, Mode, <uncertainty name>).
    for key, data in unyaml.items():
        if key == "stat":
            # The statistical error has no mode/treatment: both levels are left empty
            lol = pd.concat({"": pd.DataFrame.from_records(data, index="index")}, names=[K_MODE], axis=1)
            lol = lol.rename(columns={"value": key})
            stat_df = pd.concat({"": lol}, axis=1, names=[K_TREAT])
        else:
            tmp = pd.DataFrame.from_records(data["errors"], index="index")
            tmp = tmp.rename(columns={"value": key})
            mode = data["mode"]
            # Note "CORR" is a substring of "UNCORR": any mode containing "CORR"
            # is labelled CORR here, everything else UNCORR.
            mode_label = "CORR" if "CORR" in mode else "UNCORR"
            tmp_df = pd.concat({mode_label: tmp}, axis=1, names=[K_MODE])
            treatment = "ADD" if "ADD" in mode else "MULT"
            sys_dfs[treatment].append(tmp_df)
    # Every systematic is a separate *column* sharing the same data-point index,
    # so the frames must be joined along axis=1. The default (axis=0) stacks them
    # as rows, duplicating the index and filling the table with NaN as soon as
    # there is more than one systematic per treatment.
    all_sys = {k: pd.concat(i, axis=1) for k, i in sys_dfs.items()}
    sys_df = pd.concat(all_sys, axis=1, names=[K_TREAT])
    unc_df = pd.concat({"stat": stat_df, "sys": sys_df}, axis=1, names=["Type"])
    
if False:
    # Playground for the central values: read them from the YAML file and
    # expose them as a "data" column.
    _data_file = BUILDMASTER_PATH / "NMCPD/data.yaml"
    datayaml = yaml.safe_load(_data_file.read_text(encoding="utf-8"))
    central_records = datayaml["data_central"]
    data_df = pd.DataFrame.from_records(central_records, index="index").rename(columns={"value": "data"})

In [3]:
# This cell does (more or less) what the loader will do when asked for a dataset_input,
# and it will be, more or less, a substitute for commondataparser.parse_commondata.
# Which parts end up inside CommonDataSpec and which inside parse_commondata can be decided a posteriori.

# Write here the dataset_input you want to play with
dataset_input = dict(dataset="NMCPD", variant="shifted")

# Loader: locate the dataset folder and its metadata file, then build the spec
setname, variant = dataset_input["dataset"], dataset_input["variant"]
setdir = BUILDMASTER_PATH / setname
metadatafile = setdir / "metadata.yaml"

if metadatafile.exists():
    cd_spec = CommonDataSpec(setname, variant, metadatafile)
else:
    # Stop before constructing the spec if the metadata file is missing
    raise FileNotFoundError(f"Metadata not found for {setname}")

In [4]:
# Let's see the uncertainties
# Both accessors are called, but only the value of the last expression
# (`get_stat()`) is rendered as the cell output.
kk = cd_spec.uncertainties
kk.get_systematic()
kk.get_stat()

Unnamed: 0_level_0,stat
index,Unnamed: 1_level_1
1,0.0203
2,0.0212
3,0.0205
4,0.0258
5,0.0176
...,...
256,0.0235
257,0.0330
258,0.0373
259,0.0513


In [5]:
# Let's look at the kinematics:
# `kk` is rebound here to the kinematics object.
# All three accessors are called; only the result of the last one,
# `get_kin_cv("x")`, is rendered as the cell output.
kk = cd_spec.kinematics
kk.get_kintable()
kk.get_all_kin_cv()
kk.get_kin_cv("x")

Unnamed: 0_level_0,avg
index,Unnamed: 1_level_1
1,0.0015
2,0.0015
3,0.0015
4,0.0015
5,0.0015
...,...
256,0.6750
257,0.6750
258,0.6750
259,0.6750


In [6]:
# And now print `nsys` (presumably the number of systematic uncertainties — confirm)
print(cd_spec.nsys)

1


In [15]:
# Load the full commondata_table
cd = cd_spec.load()
# Check which container holds the central values (the cell shows the type, not the data)
type(cd.central_values)

pandas.core.series.Series

In [21]:
# Select data with cuts
# NOTE(review): in the recorded output, [32, 67, 89] yields the index labels 33, 68, 90,
# so the cuts look like 0-based positions while the table index is 1-based — confirm.
loaded_cd_with_cuts = cd.with_cuts([32,67,89])
# Selecting with a list of columns keeps the result as a one-column table
loaded_cd_with_cuts.commondata_table[["data"]]

Unnamed: 0_level_0,data
index,Unnamed: 1_level_1
33,0.9924
68,0.9849
90,0.9686


In [9]:
# Central values of the data points kept after applying the cuts
loaded_cd_with_cuts.central_values

index
33    0.9924
68    0.9849
90    0.9686
Name: data, dtype: float64

In [10]:
# Systematics table of the data points kept after applying the cuts
loaded_cd_with_cuts.systematics_table

Treatment,ADD
Mode,CORR
Unnamed: 0_level_2,sys_1
index,Unnamed: 1_level_3
33,0.0084
68,0.0027
90,0.0021


In [11]:
# Now check that all methods work: smoke-test every public attribute of the cut commondata.
# Fetching the attribute is the test for non-callables; callables are additionally
# invoked without arguments.

for method in dir(loaded_cd_with_cuts):
    if method.startswith("_"):
        continue
    print(f"Testing '{method}'... ", end="")
    met = getattr(loaded_cd_with_cuts, method)
    if callable(met):
        try:
            met()
        except TypeError:
            # `with_cuts` is called here without its cuts argument, so a TypeError
            # from it is expected. A TypeError from any other method is a genuine
            # failure and must not be reported as "executed... ok".
            if method != "with_cuts":
                print("FAILED")
                raise
            print("not ", end="")
        print("executed... ", end="")
    print("ok")

Testing 'additive_errors'... ok
Testing 'central_values'... ok
Testing 'commondata_table'... ok
Testing 'data'... ok
Testing 'kinematics'... ok
Testing 'multiplicative_errors'... ok
Testing 'ndata'... ok
Testing 'nkin'... executed... ok
Testing 'nsys'... ok
Testing 'process'... ok
Testing 'setname'... ok
Testing 'stat_errors'... ok
Testing 'systematic_errors'... executed... ok
Testing 'systematics_table'... ok
Testing 'uncertainties'... ok
Testing 'variant'... ok
Testing 'with_cuts'... not executed... ok


In [12]:
# Systematic errors of the cut data (called as a method, unlike the `systematics_table` attribute)
loaded_cd_with_cuts.systematic_errors()

Mode,CORR
Unnamed: 0_level_1,sys_1
index,Unnamed: 1_level_2
33,0.0084
68,0.0027
90,0.0021
