In [1]:
from inspect import ismethod
from IPython.display import HTML as html_print
import numpy as np

from validphys.api import API
from validphys.commondataparser import parse_commondata_new

## Old-New comparison

The next cells use `parse_commondata` and `pase_commondata_new` to compare the `CommonData` objects returned by parsing common data files in the old and new styles.

At the moment the "new commondatas" are in a hard-coded folder called `new_data` (it is hardcoded in `commondataparser.py` while the old ones are taken from whatever was installed in `share/NNPDF/data`. Eventually the new commondata will also live there.

The comparison checks that the data, uncertainties and kinematics are the same (or as close as they can be). The format of the dataframes is slightly different at the moment, but this is at the moment a desired result.

In [2]:
# Helper functions

def load_old_and_new(old_name, new_name, theoryid=200):
    """Load the old and new commondata. The theory id is needed by the validphys API when using .dataset
    but it is never used. Putting 200 as default since I guess everyone has that theory installed already.
    """
    old_cd = API.dataset(dataset_input={"dataset": old_name}, use_cuts="nocuts", theoryid=theoryid).load_commondata()

    # If this doesn't find your data, change the path inside `commondataparser.py` to wherever your data is
    # the variable is `_folder_data` at the beginning of the file.
    new_cd = parse_commondata_new(new_name)
    return old_cd, new_cd

def print_check(msg, res):
    color = "green" if res else "red"
    display(html_print(f"<text style=color:{color}>equal={res} {msg}</text>"))

def check_this(cds, method, naming=None, unc=False):
    """Check whether the results of `method` for two different commondata agree"""
    if len(cds) != 2:
        raise ValueError(f"Can only check two commondata, not {len(cds)}")
        
    test = None
    check_result = True
    if naming is None:
        naming = method
    
    values = []
    for cd in cds:
        res = getattr(cd, method)
        if ismethod(res): # since sometimes we have methods, sometimes they are attributes
            res = res()
        values.append(res)
        
    check_result = np.allclose(*values)
    
    print_check(f"Testing {naming} ({method})", check_result)
    
    if not check_result:
        nnn = (values[0]+values[1]) + 1e-7
        val = np.max( (values[0]-values[1])/nnn*100)
        if val == val:
            print(f"       Max % diff: {np.abs(val)}")

def check_systematics(cds):
    additive_attr = "additive_errors"
    multipli_attr = "multiplicative_errors"
    
    add = []
    mult = []
    covmats = []
    syserr = []
    
    nsys = cds[0].nsys

    for cd in cds:
        if cd.nsys != nsys:
            print_check(f"The number of systematic uncertainites is different!!", False)
            # it doesn't make sense to continue here
            return
            
        res_add = getattr(cd, additive_attr)
        res_mult = getattr(cd, multipli_attr)
        res_sys = cd.systematic_errors()
        
        add.append(res_add)
        mult.append(res_mult)
        syserr.append(res_sys)
        covmats.append(res_add @ res_add.T)
    
    # Even if the number of systematics is the same, it is not a given that they are
    # distributed in the same way between multiplicative and additive, check that
    all_nmul = [i.shape[-1] for i in mult]    
    if not len(np.unique(all_nmul)) == 1:
        print(f" > The combination of multiplicative and additive is different!")
        check_sys = np.allclose(*syserr)
        print_check(f"Testing the raw table of systematics: systematic_errors()", check_sys)
        return

    check_multiplicative = np.allclose(*mult)
    print_check(f"Testing multiplicative errors ({multipli_attr})", check_multiplicative)
        
    check_additive = np.allclose(*add)
    print_check(f"Testing additive errors ({additive_attr})", check_additive)
    
    # When checking additive uncertainties, check also the covmat obtained from res @ res.T
    # since there might -1 signs in the eigenvectors
    if not check_additive:
        check_again = np.allclose(*covmats)
        print(" > > The resulting covmat instead agrees? ", check_again)
        

def run_checks(old_cd, new_cd):
    """Run through a number of checks to ensure they contain the same information"""
    cds = [old_cd, new_cd]
        
    check_this(cds, "get_cv", naming="central_values")
    check_this(cds, "get_kintable", naming="kinematics")
    check_this(cds, "stat_errors")
    check_systematics(cds)

In [3]:
def test_group(datasets, nofail=False):
    all_cds = []
    for i, (old_name, new_name) in enumerate(datasets.items()):
        print(f"\nChecking {i} {new_name} (old name: {old_name})")
        try:
            old_cd, new_cd = load_old_and_new(old_name, new_name)
            all_cds.append((old_cd, new_cd))
            print(" > Comparing values...")
            run_checks(old_cd, new_cd)
        except FileNotFoundError:
            print(f"Files for {new_name} not found")
            continue
        except Exception as e:
            if nofail:
                print(f"### FAILED for {new_name}\n")
                continue
            raise e
    return all_cds

## FT Drell-yan

These datasets can be found in the `E605` branch [#1679](https://github.com/NNPDF/nnpdf/pull/1679)

In [4]:
datasets = {
    "DYE605":  "E605_DY_38P8GEV_PXSEC",
    "DYE886P": "E866_DY_800GEV_PXSEC",
    "DYE886R": "E866_DY_800GEV_RATIO_PDXSECRATIO",
    "DYE906R": "E906_DY_120GEV_PDXSECRATIO",
}

res_ft = test_group(datasets, nofail=False)


#old_cd, new_cd = load_old_and_new("DYE906R", "E906_DY_120GEV_PDXSECRATIO")
#run_checks(old_cd, new_cd)


Checking 0 E605_DY_38P8GEV_PXSEC (old name: DYE605)
Files for E605_DY_38P8GEV_PXSEC not found

Checking 1 E866_DY_800GEV_PXSEC (old name: DYE886P)
Files for E866_DY_800GEV_PXSEC not found

Checking 2 E866_DY_800GEV_RATIO_PDXSECRATIO (old name: DYE886R)
Files for E866_DY_800GEV_RATIO_PDXSECRATIO not found

Checking 3 E906_DY_120GEV_PDXSECRATIO (old name: DYE906R)
Files for E906_DY_120GEV_PDXSECRATIO not found


## Atlas datasets

These datasets can be found in the `gluon_pdf_ncd` branch

In [23]:
datasets = {
    "ATLASTTBARTOT13TEV": "ATLAS_TTBAR_13TEV_TOT_X-SEC",
    "ATLASTTBARTOT7TEV": "ATLAS_TTBAR_7TEV_TOT_X-SEC",
    "ATLAS_TOPDIFF_DILEPT_8TEV_TTM": "ATLAS_TTBAR_8TEV_2L_DIF_MTTBAR",
    "ATLAS_TOPDIFF_DILEPT_8TEV_TTMNORM": "ATLAS_TTBAR_8TEV_2L_DIF_MTTBAR-NORM",
    "ATLAS_TOPDIFF_DILEPT_8TEV_TTRAP": "ATLAS_TTBAR_8TEV_2L_DIF_YTTBAR",
    "ATLAS_TOPDIFF_DILEPT_8TEV_TTRAPNORM": "ATLAS_TTBAR_8TEV_2L_DIF_YTTBAR-NORM",
    "ATLAS_TTB_DIFF_8TEV_LJ_TTM": "ATLAS_TTBAR_8TEV_LJ_DIF_MTTBAR",
    "ATLAS_TTB_DIFF_8TEV_LJ_TTMNORM":"ATLAS_TTBAR_8TEV_LJ_DIF_MTTBAR-NORM",
    "ATLAS_TTB_DIFF_8TEV_LJ_TPT": "ATLAS_TTBAR_8TEV_LJ_DIF_PTT",
    "ATLAS_TTB_DIFF_8TEV_LJ_TPTNORM": "ATLAS_TTBAR_8TEV_LJ_DIF_PTT-NORM",
    "ATLAS_TTB_DIFF_8TEV_LJ_TRAP": "ATLAS_TTBAR_8TEV_LJ_DIF_YT",
    "ATLAS_TTB_DIFF_8TEV_LJ_TRAPNORM": "ATLAS_TTBAR_8TEV_LJ_DIF_YT-NORM",
    "ATLAS_TTB_DIFF_8TEV_LJ_TTRAP": "ATLAS_TTBAR_8TEV_LJ_DIF_YTTBAR",
    "ATLAS_TTB_DIFF_8TEV_LJ_TTRAPNORM": "ATLAS_TTBAR_8TEV_LJ_DIF_YTTBAR-NORM",
    "ATLASTTBARTOT8TEV": "ATLAS_TTBAR_8TEV_TOT_X-SEC"
}

#res = test_group(datasets, nofail=True)

old_name = list(datasets)[7]
new_name = datasets[old_name]
print(f"old: {old_name}, new: {new_name}")

old_cd, new_cd = load_old_and_new(old_name, new_name)
run_checks(old_cd, new_cd)

old: ATLAS_TTB_DIFF_8TEV_LJ_TTMNORM, new: ATLAS_TTBAR_8TEV_LJ_DIF_MTTBAR-NORM


       Max % diff: 0.5745252872439593


       Max % diff: 0.3120838497055149


In [26]:
new_cd.get_cv()

array([4.24583077e-03, 4.17868779e-03, 2.56983525e-03, 1.33185288e-03,
       5.63952011e-04, 1.42523530e-04, 1.76074933e-05])

In [25]:
old_cd.get_cv()

array([4.29490e-03, 4.20499e-03, 2.55705e-03, 1.31962e-03, 5.54111e-04,
       1.38558e-04, 1.64430e-05])

## CMS ttbar

These datasets can be found in the `more_efficient_metadata_for_new_commondata` branch [#1684](https://github.com/NNPDF/nnpdf/pull/1684)

In [8]:
datasets = {
    "CMSTOPDIFF8TEVTTRAPNORM":  "CMS_TTBAR_8TEV_LJ_DIF_YTTBAR-NORM",
}

res = test_group(datasets, nofail=False)


Checking 0 CMS_TTBAR_8TEV_LJ_DIF_YTTBAR-NORM (old name: CMSTOPDIFF8TEVTTRAPNORM)
 > Comparing values...


       Max % diff: 0.3120838497055149


 > > The resulting covmat instead agrees?  True
