In [2]:
from pathlib import Path
import pandas as pd
import numpy as np


from reportengine.compat import yaml
import validphys
from validphys.commondataparser import parse_commondata_metadata

# Location of the buildmaster folder. The relative hop below assumes validphys
# is installed in editable (development) mode; adjust it for other layouts.
BUILDMASTER_PATH = Path(validphys.__file__).parent / "../../../buildmaster"
DATASET = "NMCPD"

dataset_path = BUILDMASTER_PATH / DATASET
metadata_file = dataset_path / "metadata.yaml"
# Decode explicitly as UTF-8 (as done for data.yaml further down): the metadata
# holds non-ASCII text such as "GeV²", so the platform default encoding must
# not be relied upon.
metadata = yaml.safe_load(metadata_file.read_text(encoding="utf-8"))
# Number of data points; later cells use it to build the 1-based point index.
ndata = metadata["ndata"]

# Also parse the same file with validphys' own metadata parser and show the result.
metadata_object = parse_commondata_metadata(metadata_file)
print(metadata_object)

CommonMetaData(setname='NMCPD', ndata=260, observable=ordereddict([('description', 'Ratio of deuterium to proton structure function F2'), ('label:"$F_2^d/F_2^p(x', None), ('Q^2)$"', None), ('units', '')]), kinematics=ordereddict([('variables', ordereddict([('q2', ordereddict([('description', 'Parton momentum transfer'), ('label', '$Q^2$'), ('units', 'GeV²')])), ('x', ordereddict([('description', 'Bjorken x'), ('label', '$x$'), ('units', '')])), ('y', ordereddict([('description', 'Inelasticity'), ('units', '')]))])), ('file', 'kinematics.yaml')]), kinematic_coverage=ordereddict([('x', 'x'), ('q2', 'q2')]), data_central=PosixPath('data.yaml'), data_uncertainties=[PosixPath('uncertainties.yaml')], dataset_label='NMC $d/p$', plot_x='x', figure_by=['q2'], theory=TheoryMeta(FK_tables=[['NMCPD_D1', 'NMCPD_D2'], ['NMCPD_P']], operation='RATIO', conversion_factor=1.0, apfelcomb=None), nnpdf_metadata=ordereddict([('nnpdf31_process', 'DIS NC'), ('experiment', 'NMC')]), version=0, version_comment=

In [None]:
# (empty scratch cell)

In [2]:
# Read the kinematics dataframe
kin_file = dataset_path / "kinematics.yaml"
# Explicit UTF-8, matching how data.yaml is read, instead of the platform default.
kinyaml = yaml.safe_load(kin_file.read_text(encoding="utf-8"))

# One stacked Series per entry of "bins", keyed by its 1-based position.
kin_dict = {
    point: pd.DataFrame(bin_entry).stack()
    for point, bin_entry in enumerate(kinyaml["bins"], start=1)
}
# Concatenate the bins side by side, swap the two row levels and transpose so
# that each bin becomes one row labelled by its point number.
kin_df = pd.concat(kin_dict, axis=1, names=["index"]).swaplevel(0, 1).T
kin_df

Unnamed: 0_level_0,x,q2,y
Unnamed: 0_level_1,mid,mid,mid
index,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,0.0015,0.16,0.120852
2,0.0015,0.25,0.188831
3,0.0015,0.35,0.264363
4,0.0015,0.45,0.339895
5,0.0015,0.60,0.453194
...,...,...,...
256,0.6750,26.49,0.044463
257,0.6750,35.40,0.059419
258,0.6750,47.03,0.078940
259,0.6750,63.53,0.106635


In [3]:
# Read the uncertainties dataframe (optionally extended with a variant file)
variant = True  # when True, the columns of "uncertainties_dw.yaml" are appended further down
unc_df = None  # placeholder; assigned once the uncertainty files have been read

def read_uncertainties(unc_file):
    """Load an uncertainties YAML file into a dataframe with one row per data point.

    Parameters
    ----------
    unc_file : pathlib.Path
        File holding a ``definition`` mapping (uncertainty name -> entry with
        ``treatment`` and ``type`` keys) and a ``bins`` sequence.

    Returns
    -------
    pandas.DataFrame
        Columns are a ``(name, treatment, type)`` MultiIndex built from
        ``definition``; rows are labelled ``1..ndata`` under the index name
        ``index``.

    Notes
    -----
    Relies on the notebook-level names ``yaml`` and ``ndata``. The constructor
    raises if the number of rows in ``bins`` does not match ``ndata``.
    Values are attached to the column labels by position.
    NOTE(review): this presumes the keys of every ``bins`` entry appear in the
    same order as ``definition`` -- confirm against the file format.
    """
    # Explicit UTF-8, matching how data.yaml is read, instead of the platform default.
    uncyaml = yaml.safe_load(unc_file.read_text(encoding="utf-8"))

    mindex = pd.MultiIndex.from_tuples(
        [(name, spec["treatment"], spec["type"]) for name, spec in uncyaml["definition"].items()],
        names=["name", "treatment", "type"],
    )
    # The intermediate frame only serves to turn ``bins`` into a 2D array of
    # values; there may be a more direct way than building a dataframe twice.
    bin_values = pd.DataFrame(uncyaml["bins"]).values
    final_df = pd.DataFrame(bin_values, columns=mindex, index=range(1, ndata + 1))
    final_df.index.name = "index"
    return final_df

# Nominal uncertainties of the dataset
unc_df = read_uncertainties(dataset_path / "uncertainties.yaml")

if variant:
    # With a variant: append the columns of the variant file to the nominal ones
    var_df = read_uncertainties(dataset_path / "uncertainties_dw.yaml")
    unc_df = pd.concat([unc_df, var_df], axis=1)

unc_df

name,stat_1,syst_1,nuclear
treatment,ADD,ADD,ADD
type,UNCORR,CORR,NUC_DW
index,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3
1,0.0203,0.0109,0.0203
2,0.0212,0.0134,0.0212
3,0.0205,0.0112,0.0205
4,0.0258,0.0195,0.0258
5,0.0176,0.0121,0.0176
...,...,...,...
256,0.0235,0.0034,0.0235
257,0.0330,0.0033,0.0330
258,0.0373,0.0026,0.0373
259,0.0513,0.0029,0.0513


In [4]:
# Central values, read from data.yaml
data_file = dataset_path / "data.yaml"
datayaml = yaml.safe_load(data_file.read_text(encoding="utf-8"))

# 1-based point index shared with the other dataframes of this notebook
point_index = pd.RangeIndex(1, ndata + 1, name="index")
data_df = pd.DataFrame(datayaml["data_central"], index=point_index, columns=["data"])

data_df

Unnamed: 0_level_0,data
index,Unnamed: 1_level_1
1,0.9815
2,1.0030
3,0.9675
4,1.0330
5,0.9912
...,...
256,0.6717
257,0.7194
258,0.6959
259,0.7020


In [61]:
from dataclasses import dataclass
from pathlib import Path
import typing

from validobj.custom import Parser
from validobj import ValidationError, parse_input


from validphys import convolution


# Scalar parsers
@Parser
def ValidPath(path_str: str) -> Path:
    """Parse a string into a ``pathlib.Path``.

    Raises
    ------
    ValidationError
        If a ``Path`` cannot be built from ``path_str``; the original error is
        chained as the cause.
    """
    try:
        return Path(path_str)
    except Exception as e:
        # ``Exception`` is the builtin base class. The lowercase ``exception``
        # used previously is not a defined name, so any failure here would
        # have surfaced as a NameError instead of a ValidationError.
        raise ValidationError(f"{path_str} is not a valid path") from e
        
@Parser
def ValidOperation(op_str: str) -> str:
    """Upper-case the operation name and check that validphys knows about it.

    Returns the upper-cased name when it is found in ``convolution.OP`` and
    raises ``ValidationError`` otherwise.
    """
    operation = op_str.upper()
    if operation in convolution.OP:
        return operation
    raise ValidationError(f"The operation '{op_str}' is not implemented in validphys")



# Object classes wrappers
@dataclass
class TheoryMeta:
    """Contents of the ``theory`` entry of the dataset metadata.

    Instances are meant to be created from a plain dictionary through
    :meth:`parser`, which delegates to ``parse_input``.
    """

    # FK table names; the NMCPD example shown above uses a list of lists of names.
    FK_tables: list
    # Operation name, checked and upper-cased by ``ValidOperation``.
    operation: ValidOperation
    conversion_factor: float = 1.0
    # Optional mapping; annotated as Optional because it defaults to None.
    apfelcomb: typing.Optional[dict] = None

    @classmethod
    def parser(cls, meta: dict):
        """Build an instance of this class from the dictionary ``meta``."""
        return parse_input(meta, cls)
        
@dataclass
class ReferenceMeta:
    """A reference entry of the dataset metadata: a URL plus optional details.

    Instances are meant to be created from a plain dictionary through
    :meth:`parser`, which delegates to ``parse_input``.
    """

    url: str
    # Both fields default to None, hence the Optional annotations.
    version: typing.Optional[int] = None
    tables: typing.Optional[typing.List[int]] = None

    @classmethod
    def parser(cls, meta: dict):
        """Build an instance of this class from the dictionary ``meta``."""
        return parse_input(meta, cls)

        
@dataclass
class Variant:
    """Keys of the dataset metadata that a variant may provide.

    At the moment the only such key is ``data_uncertainties``, a list of
    paths parsed with ``ValidPath``.
    """
    data_uncertainties: typing.List[ValidPath]
        
        
# Wrap the dataclass constructors-from-dict as validobj parsers so that they
# can be used as field annotations (see ``theory`` and the reference fields of
# ``CommonMetaData``).
ValidTheory = Parser(TheoryMeta.parser)
ValidReference = Parser(ReferenceMeta.parser)


@Parser
def ValidVariants(variant_dict: dict) -> dict:
    """Parse every entry of the variants dictionary into a ``Variant``.

    A variant of a dataset may overwrite a subset of the dataset keys, namely
    those declared in the ``Variant`` dataclass. The returned dictionary has
    the same keys as ``variant_dict``, each mapped to the parsed ``Variant``.
    """
    parsed_variants = {}
    for variant_name, overrides in variant_dict.items():
        parsed_variants[variant_name] = parse_input(overrides, Variant)
    return parsed_variants
    
                
@dataclass
class CommonMetaData:
    """Typed description of a dataset ``metadata.yaml`` file.

    Fields without a default are mandatory. Fields that default to ``None``
    are annotated as ``typing.Optional`` to match that default.
    """

    setname: str
    ndata: int
    observable: dict
    kinematics: dict
    kinematic_coverage: dict
    # File entries are turned into ``pathlib.Path`` objects by ``ValidPath``.
    data_central: ValidPath
    data_uncertainties: typing.List[ValidPath]
    dataset_label: str
    plot_x: str
    figure_by: typing.List[str]
    # Parsed into a ``TheoryMeta`` instance.
    theory: ValidTheory
    nnpdf_metadata: dict
    version: int
    version_comment: str = ""
    # Optional references, each parsed into a ``ReferenceMeta`` when present.
    arXiv: typing.Optional[ValidReference] = None
    iNSPIRE: typing.Optional[ValidReference] = None
    hepdata: typing.Optional[ValidReference] = None
    # Optional mapping of variant name -> ``Variant``.
    variants: typing.Optional[ValidVariants] = None

In [67]:
from validphys.utils import parse_yaml_inp
from ruamel import yaml


# Keep the parsed variants under a name: the following cell indexes `ret`,
# which would otherwise be undefined on a fresh kernel.
ret = parse_yaml_inp(metadata_file, CommonMetaData).variants
ret

In [68]:
# Grab the entry stored under the "shifted" key of `ret` for inspection
aa = ret["shifted"]

In [80]:
# Exploratory check: list the attributes available on the selected entry
dir(aa)

['__annotations__',
 '__class__',
 '__dataclass_fields__',
 '__dataclass_params__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__match_args__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'data_uncertainties']