In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

import validphys
from validphys.commondataparser import parse_commondata_folder, parse_commondata
from validphys.loader import _get_nnpdf_profile

## Old-New comparison

The following cells compare the output of the old `parse_commondata` and the new `parse_commondata_folder` (both from `commondataparser.py`).

The output of these two functions should be equal, since they parse the old/new commondata files into the `CommonData` class from `coredata.py`.

Note that for this comparison, my version of "NMCPD" is called "NMCPD_fake" so that the old and new datasets don't clash.

In [2]:
# Dataset used for the old-vs-new comparison
dataset = "NMCPD"

# Old format: the data and systype files are looked up below the `data_path`
# of the default nnpdf profile (this assumes validphys is installed in
# editable mode and that the defaults are being used)
nmcpd_path = Path(_get_nnpdf_profile()["data_path"]).joinpath("commondata", f"DATA_{dataset}.dat")
nmcpd_unc_path = nmcpd_path.parent.joinpath("systypes", f"SYSTYPE_{dataset}_DEFAULT.dat")
old_cd = parse_commondata(nmcpd_path, nmcpd_unc_path, dataset)

# New format: the folder is located relative to the validphys package itself.
# If this trick doesn't find your data, set `folder_data` to wherever it lives.
folder_data = Path(validphys.__file__).parent / "../../../buildmaster"
folder_new_dataset = folder_data.joinpath(f"{dataset}_fake")
new_cd = parse_commondata_folder(folder_new_dataset)


In [3]:
# Table of the CommonData object built by the old `parse_commondata`
old_cd.commondata_table

Unnamed: 0_level_0,process,kin1,kin2,kin3,data,stat,ADD,MULT,ADD,MULT,ADD,MULT,ADD,MULT,ADD,MULT
entry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,DIS_F2R,0.0015,0.16,0.0,0.9815,0.0203,0.000982,0.1,0.000000,0.0,0.010797,1.1,0.000000,0.0,0.000000,0.0
2,DIS_F2R,0.0015,0.25,0.0,1.0030,0.0212,0.001003,0.1,0.000000,0.0,0.013039,1.3,0.001003,0.1,0.001003,0.1
3,DIS_F2R,0.0015,0.35,0.0,0.9675,0.0205,0.001935,0.2,0.000000,0.0,0.010642,1.1,0.000000,0.0,0.000000,0.0
4,DIS_F2R,0.0015,0.45,0.0,1.0330,0.0258,0.001033,0.1,0.000000,0.0,0.019627,1.9,0.000000,0.0,0.000000,0.0
5,DIS_F2R,0.0015,0.60,0.0,0.9912,0.0176,0.000991,0.1,0.000000,0.0,0.011894,1.2,0.000000,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256,DIS_F2R,0.6750,26.49,0.0,0.6717,0.0235,0.000672,0.1,0.002687,0.4,0.000000,0.0,-0.001343,-0.2,0.001343,0.2
257,DIS_F2R,0.6750,35.40,0.0,0.7194,0.0330,0.000719,0.1,0.002158,0.3,0.000000,0.0,-0.001439,-0.2,0.002158,0.3
258,DIS_F2R,0.6750,47.03,0.0,0.6959,0.0373,0.000696,0.1,0.000696,0.1,0.000696,0.1,-0.001392,-0.2,0.002088,0.3
259,DIS_F2R,0.6750,63.53,0.0,0.7020,0.0513,0.000702,0.1,0.000000,0.0,0.000702,0.1,-0.001404,-0.2,0.002106,0.3


In [4]:
# Table of the CommonData object built by the new `parse_commondata_folder`
new_cd.commondata_table

Unnamed: 0_level_0,process,kin1,kin2,kin3,data,stat,ADD
entry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,DIS NC,0.0015,0.16,0.120852,0.9815,0.0203,0.0109
2,DIS NC,0.0015,0.25,0.188831,1.0030,0.0212,0.0134
3,DIS NC,0.0015,0.35,0.264363,0.9675,0.0205,0.0112
4,DIS NC,0.0015,0.45,0.339895,1.0330,0.0258,0.0195
5,DIS NC,0.0015,0.60,0.453194,0.9912,0.0176,0.0121
...,...,...,...,...,...,...,...
256,DIS NC,0.6750,26.49,0.044463,0.6717,0.0235,0.0034
257,DIS NC,0.6750,35.40,0.059419,0.7194,0.0330,0.0033
258,DIS NC,0.6750,47.03,0.078940,0.6959,0.0373,0.0026
259,DIS NC,0.6750,63.53,0.106635,0.7020,0.0513,0.0029


## The new commondata

The following cells basically do what `parse_commondata_folder` does internally (without creating the final `CommonData` object).

The first step is to read the metadata from `<datafolder>/metadata.yaml`. Most of the new logic is concentrated in this step since it uses `validobj` to ensure that all the information inside the `metadata` file is correct (very importantly, it checks the theory information which is also used by the pineappl parser later on!)

After the metadata has been read, the data, uncertainties, and kinematics are read as well into beautiful dataframes. The `new_cd.commondata_table` above is made from a simplified concatenation of said dataframes.

In [5]:
from validphys.utils import parse_yaml_inp
from validphys.commondataparser import CommonMetaData, _parse_data, _parse_uncertainties, _parse_kinematics

# Reuse the folder that was handed to `parse_commondata_folder` in the
# comparison above instead of hard-coding the dataset name a second time, so
# that this walkthrough always inspects the same dataset as the tables above.
commondata_folder = folder_new_dataset
metadata_file = commondata_folder / "metadata.yaml"

# Select which variants you want (empty list: the metadata is used as read)
variants = []

# Read up all the metadata, i.e., use validobj to parse metadata_file into CommonMetaData
metadata = parse_yaml_inp(metadata_file, CommonMetaData)

# Now apply variants, one after the other, each acting on the result of the previous one
for variant in variants:
    metadata = metadata.apply_variant(variant)

# Once we have our final metadata, parse data, uncertainties and kinematics;
# each parser only receives the metadata object
data_df = _parse_data(metadata)
uncertainties_df = _parse_uncertainties(metadata)
kin_df = _parse_kinematics(metadata)

In [6]:
# Dataframe returned by `_parse_data(metadata)`
data_df

Unnamed: 0_level_0,data
entry,Unnamed: 1_level_1
1,0.9815
2,1.0030
3,0.9675
4,1.0330
5,0.9912
...,...
256,0.6717
257,0.7194
258,0.6959
259,0.7020


In [7]:
# Dataframe returned by `_parse_uncertainties(metadata)`
uncertainties_df

name,stat_1,syst_1
treatment,ADD,ADD
type,UNCORR,CORR
entry,Unnamed: 1_level_3,Unnamed: 2_level_3
1,0.0203,0.0109
2,0.0212,0.0134
3,0.0205,0.0112
4,0.0258,0.0195
5,0.0176,0.0121
...,...,...
256,0.0235,0.0034
257,0.0330,0.0033
258,0.0373,0.0026
259,0.0513,0.0029


In [8]:
# Dataframe returned by `_parse_kinematics(metadata)`
kin_df

Unnamed: 0_level_0,x,q2,y
Unnamed: 0_level_1,mid,mid,mid
entry,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,0.0015,0.16,0.120852
2,0.0015,0.25,0.188831
3,0.0015,0.35,0.264363
4,0.0015,0.45,0.339895
5,0.0015,0.60,0.453194
...,...,...,...
256,0.6750,26.49,0.044463
257,0.6750,35.40,0.059419
258,0.6750,47.03,0.078940
259,0.6750,63.53,0.106635


In [9]:
if False:
    # Disabled on purpose: flip the condition to quickly check that nothing
    # _important_ is broken by computing central predictions for one dataset
    from validphys.api import API
    from validphys.convolution import central_predictions

    dname = "ATLAS_SINGLETOP_TCH_DIFF_7TEV_T_RAP_NORM"  # dataset with apfelcomb flags
    dataset_input = {"dataset": dname}

    pdf = API.pdf(pdf="NNPDF40_nnlo_as_01180")
    new_ds = API.dataset(dataset_input=dataset_input, theoryid=400, use_cuts="internal")

    predictions = central_predictions(new_ds, pdf)
    print(predictions)