In [1]:
from inspect import ismethod
from IPython.display import HTML as html_print
import numpy as np

from validphys.api import API
from validphys.commondataparser import parse_commondata_new

## Old-New comparison

The next cells use `parse_commondata` and `parse_commondata_new` to compare the `CommonData` objects returned by parsing common data files in the old and new styles.

At the moment the "new commondatas" are in a hard-coded folder called `new_data` (it is hardcoded in `commondataparser.py`), while the old ones are taken from whatever was installed in `share/NNPDF/data`. Eventually the new commondata will also live there.

The comparison checks that the data, uncertainties and kinematics are the same (or as close as they can be). The format of the dataframes is currently slightly different, but for now this is the desired result.

In [2]:
# Helper functions

def load_old_and_new(old_name, new_name, theoryid=200):
    """Return the ``(old, new)`` pair of commondata objects for one dataset.

    The old-style commondata is obtained through the validphys API. ``.dataset``
    requires a ``theoryid`` even though the theory is never used here; 200 is the
    default on the guess that everyone has that theory installed already.

    The new-style commondata is read with ``parse_commondata_new``. If it cannot
    find your data, change the ``_folder_data`` variable at the beginning of
    ``commondataparser.py`` so that it points to wherever your data is.
    """
    old_spec = API.dataset(
        dataset_input={"dataset": old_name},
        use_cuts="nocuts",
        theoryid=theoryid,
    )
    # The tuple is evaluated left to right: old commondata first, then the new one
    return old_spec.load_commondata(), parse_commondata_new(new_name)

def print_check(msg, res):
    """Display ``msg`` together with the outcome ``res`` as coloured HTML.

    The text is green when ``res`` is truthy and red otherwise.
    """
    if res:
        color = "green"
    else:
        color = "red"
    markup = f"<text style=color:{color}>equal={res} {msg}</text>"
    display(html_print(markup))

def check_this(cds, method, naming=None, unc=False):
    """Check whether the results of `method` for two different commondata agree.

    Parameters
    ----------
    cds: sequence
        Exactly two commondata objects to compare.
    method: str
        Name of the attribute or method looked up on each commondata. Bound
        methods are called without arguments, anything else is used as is.
    naming: str, optional
        Label used in the printed report, defaults to ``method``.
    unc: bool
        If True and the direct comparison fails, also compare ``res @ res.T``
        for both objects and print the outcome.

    Raises
    ------
    ValueError
        If ``cds`` does not contain exactly two items.
    """
    if len(cds) != 2:
        raise ValueError(f"Can only check two commondata, not {len(cds)}")

    if naming is None:
        naming = method

    values = []
    for cd in cds:
        res = getattr(cd, method)
        if ismethod(res):  # since sometimes we have methods, sometimes they are attributes
            res = res()
        values.append(res)

    check_result = np.allclose(*values)

    print_check(f"Testing {naming} ({method})", check_result)

    # When checking uncertainties, check also the covmat obtained from res @ res.T
    # since there might be -1 signs in the eigenvectors.
    # The products are only built here, when the direct comparison has failed,
    # instead of eagerly for every call with unc=True.
    if unc and not check_result:
        covmats = [val @ val.T for val in values]
        check_again = np.allclose(*covmats)
        print(" > > The resulting covmat instead agrees? ", check_again)
        

def run_checks(old_cd, new_cd):
    """Compare an old/new commondata pair attribute by attribute.

    Every entry of the table below is forwarded to ``check_this``, in order.
    """
    pair = [old_cd, new_cd]

    # (attribute or method name, extra keyword arguments for check_this)
    checks = (
        ("get_cv", {"naming": "central_values"}),
        ("get_kintable", {"naming": "kinematics"}),
        ("multiplicative_errors", {}),
        ("additive_errors", {"unc": True}),
        ("stat_errors", {}),
    )
    for attr, extra in checks:
        check_this(pair, attr, **extra)

In [3]:
def test_group(datasets, nofail=False):
    all_cds = []
    for old_name, new_name in datasets.items():
        print(f"\nChecking {new_name} (old name: {old_name})")
        try:
            old_cd, new_cd = load_old_and_new(old_name, new_name)
            all_cds.append((old_cd, new_cd))
            print(" > Comparing values...")
            run_checks(old_cd, new_cd)
        except FileNotFoundError:
            print(f"Files for {new_name} not found")
            continue
        except Exception as e:
            if nofail:
                print(f"### FAILED for {new_name}\n")
                continue
            raise e
    return all_cds

## FT Drell-Yan

These datasets can be found in the `E605` branch [#1679](https://github.com/NNPDF/nnpdf/pull/1679)

In [4]:
# Old dataset name -> new commondata name for the FT Drell-Yan sets
datasets = dict(
    [
        ("DYE605", "E605_DY_38P8GEV_PXSEC"),
        ("DYE886P", "E866_DY_800GEV_PXSEC"),
        ("DYE886R", "E866_DY_800GEV_RATIO_PDXSECRATIO"),
        ("DYE906R", "E906_DY_120GEV_PDXSECRATIO"),
    ]
)

# Keep the loaded (old, new) pairs around for further inspection
res_ft = test_group(datasets, nofail=False)


#old_cd, new_cd = load_old_and_new("DYE906R", "E906_DY_120GEV_PDXSECRATIO")
#run_checks(old_cd, new_cd)


Checking E605_DY_38P8GEV_PXSEC (old name: DYE605)
 > Comparing values...


 > > The resulting covmat instead agrees?  False



Checking E866_DY_800GEV_PXSEC (old name: DYE886P)
 > Comparing values...


 > > The resulting covmat instead agrees?  False



Checking E866_DY_800GEV_RATIO_PDXSECRATIO (old name: DYE886R)
 > Comparing values...



Checking E906_DY_120GEV_PDXSECRATIO (old name: DYE906R)
 > Comparing values...


 > > The resulting covmat instead agrees?  True


## CMS ttbar

These datasets can be found in the `more_efficient_metadata_for_new_commondata` branch [#1684](https://github.com/NNPDF/nnpdf/pull/1684)

In [5]:
# Old dataset name -> new commondata name for the CMS ttbar set
datasets = {}
datasets["CMSTOPDIFF8TEVTTRAPNORM"] = "CMS_TTBAR_8TEV_LJ_DIF_YTTBAR-NORM"

# Keep the loaded (old, new) pairs around for further inspection
res = test_group(datasets, nofail=False)


Checking CMS_TTBAR_8TEV_LJ_DIF_YTTBAR-NORM (old name: CMSTOPDIFF8TEVTTRAPNORM)
Files for CMS_TTBAR_8TEV_LJ_DIF_YTTBAR-NORM not found
