**A collection of useful basic functions.**  
Functionality includes (among others):
- reading the raw input csv files and producing more manageable csv files (grouped per histogram type).
- reading csv files into dataframes and performing basic operations (e.g. selecting DCS-bit on data or golden json data).
- some plotting functions.
- preparing data for machine learning, starting from e.g. the dataframes read from a csv file.


In [None]:
### imports
# as this notebook is at the basis, it does not include other notebooks within this project, only external python modules

import os
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

In [None]:
### some functions that point to the data directories (hard-coded for now!)
# example usage of these functions: see function write_skimmed_csv within this same notebook

def get_data_dirs(year='2017',eras=[],dim=1):
    ### generator yielding the data directory of each requested era
    # note: the location of the data is hard-coded below,
    #       so this function may break if the data are moved or reprocessed.
    # input arguments:
    # - year: a string, either '2017' or '2018'
    # - eras: a list containing a selection of era names
    #   (default empty list = all eras of the given year)
    # - dim: either 1 or 2 (for 1D or 2D plots)
    # if a requested directory does not exist, an error message is printed
    # and the generator stops (no further directories are yielded).
    all_eras = {'2017':['B','C','D','E','F'], '2018':['A','B','C','D']}
    if year in all_eras and len(eras)==0: eras = all_eras[year]
    basedir = '/eos/project/c/cmsml4dc/ML_2020/UL'+year+'_Data/'
    suffix = '_'+str(dim)+'D_Complete'
    for era in eras:
        eradir = basedir+'DF'+year+era+suffix
        if os.path.exists(eradir):
            yield eradir
            continue
        print('### ERROR ###: requested directory '+eradir+' does not seem to exist...')
        return

def get_csv_files(inputdir):
    ### generator over the paths of all csv files located directly in inputdir
    # the yielded paths are of the form inputdir/filename.
    # only the entries listed directly under inputdir are considered
    # (no recursive search in subdirectories).
    for name in os.listdir(inputdir):
        if name[-4:]!='.csv': continue
        yield os.path.join(inputdir,name)

def sort_filenames(filelist):
    ### sort filenames in numerical order (e.g. 2 before 10)
    # note that the number is supposed to be in ..._<number>.<extension> format
    # input arguments:
    # - filelist: an iterable of file names or file paths
    # returns:
    # a new list with the same elements, sorted by their number
    # (elements with equal numbers are sorted alphabetically).
    # a ValueError is raised if the number cannot be extracted from a file name.
    def filenumber(path):
        # only look at the file name itself, so that dots or underscores
        # in the directory part (e.g. /afs/cern.ch/...) do not spoil the parsing
        stem = os.path.basename(path).split('.')[0]
        return int(stem[stem.rfind('_')+1:])
    return sorted(filelist, key=lambda path: (filenumber(path),path))

In [None]:
### some functions to load one or more csv files into pandas dataframe (df) and make a subselection of the histograms contained in them

def read_csv(csv_file):
    ### load a csv file into a pandas dataframe, ordered by run and lumisection number
    # - csv_file: path to the csv file to be read
    # the returned dataframe is sorted on the columns 'fromrun' and 'fromlumi'
    # and has a fresh index running from 0.
    ordered = pd.read_csv(csv_file).sort_values(by=['fromrun','fromlumi'])
    return ordered.reset_index(drop=True)

# getter and selector for histogram names 

def get_histnames(df):
    ### get a list of (unique) histogram names present in a df
    # - df: a dataframe (e.g. the return value of read_csv) with a column 'hname'
    # returns:
    # a list of the distinct values in the 'hname' column, in order of first appearance.
    # note: the column is processed as a whole (instead of row by row via the index),
    #       which is much faster on large dataframes
    #       and also works if the index of df contains duplicate labels.
    return list(df['hname'].unique())
    
def select_histnames(df,histnames):
    ### reduce a df to the rows belonging to the requested histogram types
    # - df: a dataframe with a column 'hname'
    # - histnames: a list of histogram names to keep in the df
    # the index of the returned dataframe is reset.
    keep = df['hname'].isin(histnames)
    return df[keep].reset_index(drop=True)

# getter and selector for run numbers

def get_runs(df):
    ### return a list of (unique) run numbers present in a df
    # - df: a dataframe with a column 'fromrun'
    # returns:
    # a list of the distinct values in the 'fromrun' column, in order of first appearance.
    # note: the column is processed as a whole (instead of row by row via the index),
    #       which is much faster on large dataframes
    #       and also works if the index of df contains duplicate labels.
    return list(df['fromrun'].unique())

def select_runs(df,runnbs):
    ### reduce a df to the rows belonging to the requested runs
    # - df: a dataframe with a column 'fromrun'
    # - runnbs: a list of run numbers to keep in the df
    # the index of the returned dataframe is reset.
    keep = df['fromrun'].isin(runnbs)
    return df[keep].reset_index(drop=True)

# getter and selector for lumisection numbers
# note: no check is done on the run number so mostly useful if the input df already contains only a single run

def get_ls(df):
    ### collect the lumisection numbers of all rows in a df
    # note that the numbers are not required to be unique:
    # one value of the 'fromlumi' column is returned per row, in row order.
    return [df.at[row,'fromlumi'] for row in df.index]

def select_ls(df,lsnbs):
    ### keep only a subset of lumisection numbers in a df
    # - df: a dataframe with a column 'fromlumi'
    # - lsnbs: a list of lumisection numbers to keep in the df
    # returns:
    # a new dataframe (with reset index) containing only the rows
    # whose 'fromlumi' value is in lsnbs; the input df is not modified.
    # note: no check is done on the run number.
    keep = df['fromlumi'].isin(lsnbs)
    return df[keep].reset_index(drop=True)

# more advanced getter and selector for multiple runs, each with their own lumisection range
# note: here the datatype used for this kind of selection is a list of tuples.
#       each tuple in the list is of the form (run number,[lumisection numbers]),
#       where [-1] implies all lumisections in that run.
#       In the future it might be best to implement similar functions using a json file format instead of this custom data format,
#       for easier integration in whatever already exists...

def get_runsls(df):
    ### build the list of tuples of format (runnb,[lsnbs]) describing the content of a df
    # one tuple is made per (unique) run number in the df,
    # holding the lumisection numbers of all rows belonging to that run.
    return [(run,get_ls(select_runs(df,[run]))) for run in get_runs(df)]

def select_runsls(df,select):
    ### keep only a subset of runs and ls in df
    # input arguments:
    # - df: a dataframe with columns 'fromrun' and 'fromlumi'
    # - select: a list of tuples of format (runnb,[lsnbs])
    #   (use [lsnbs]=[-1] to keep whole run)
    # returns:
    # a new dataframe containing only the selected rows,
    # sorted by run and lumisection number and with a reset index.
    # an empty selection (or a run with an empty list of lumisections)
    # selects nothing instead of raising an error.
    partialdfs = []
    for el in select:
        runnb = el[0]
        lsnbs = el[1]
        thisdf = df[df['fromrun']==runnb]
        # a leading -1 means 'all lumisections of this run'
        keepall = len(lsnbs)>0 and lsnbs[0]==-1
        if not keepall:
            thisdf = thisdf[thisdf['fromlumi'].isin(lsnbs)]
        partialdfs.append(thisdf)
    if len(partialdfs)==0:
        # pd.concat does not accept an empty list;
        # return an empty dataframe with the same columns as the input
        return df.iloc[0:0].reset_index(drop=True)
    res = pd.concat(partialdfs,ignore_index=True)
    res = res.sort_values(by=['fromrun','fromlumi'])
    return res.reset_index(drop=True)

# getter and selector for sufficient statistics
# same remark about data format holds here as it holds for the above.

def get_highstat(df,entries_to_bins_ratio=100):
    ### build a select object (list of tuples (runnb,[lsnbs])) of the histograms with high statistics
    # a histogram counts as 'high statistics' if its number of entries ('entries' column)
    # divided by its number of bins ('Xbins' column) exceeds entries_to_bins_ratio.
    ratio = df['entries']/df['Xbins']
    highstat = df[ratio>entries_to_bins_ratio]
    return get_runsls(highstat)

def select_highstat(df,entries_to_bins_ratio=100):
    ### keep only the runs and lumisections in df that have histograms with high statistics
    # the selection is the one returned by get_highstat for the same arguments.
    selection = get_highstat(df,entries_to_bins_ratio)
    return select_runsls(df,selection)

# other functions that more or less fit into this block

def read_and_merge_csv(csv_files,histnames=[],runnbs=[]):
    ### load several csv files and combine them into one single df
    # input arguments:
    # - csv_files: a list of paths to files to merge into a df
    # - histnames: a list of the types of histograms to keep (default: all)
    # - runnbs: a list of run numbers to keep (default: all)
    # the selections are applied per file (before merging) to limit the size of the intermediate dataframes.
    # the merged dataframe is sorted by run and lumisection number and has a fresh index.
    parts = []
    for csv_file in csv_files:
        part = read_csv(csv_file)
        if len(histnames)>0: part = select_histnames(part,histnames)
        if len(runnbs)>0: part = part[part['fromrun'].isin(runnbs)]
        parts.append(part)
    merged = pd.concat(parts,ignore_index=True)
    merged = merged.sort_values(by=['fromrun','fromlumi'])
    return merged.reset_index(drop=True)

def get_lcs(list_of_selects):
    ### determine the largest common subset between the select objects in list_of_selects
    # a 'select object' is a list of tuples (runnb,[lsnbs]),
    # as used to select specific runs/lumisections in a dataframe.
    # a run of the first select object is kept only if every other select object
    # shares at least one lumisection with it; the kept lumisections are the common ones.
    # if only one select object is given, it is returned as is.
    # remark: this is probably not the most efficient implementation...
    if len(list_of_selects)==1: return list_of_selects[0]
    reference = list_of_selects[0]
    others = list_of_selects[1:]
    lcs = []
    for runobject in reference:
        runnb = runobject[0]
        common = runobject[1]
        for select in others:
            found = False
            for otherrunobject in select:
                if otherrunobject[0]!=runnb: continue
                overlap = list(set(common) & set(otherrunobject[1]))
                if len(overlap)>0:
                    found = True
                    common = overlap
            # no overlap with this select object: the run is dropped
            if not found: break
        else:
            lcs.append((runnb,common))
    return lcs

def write_skimmed_csv(histnames,year):
    ### read all available data for a given year and write one csv file per histogram type
    # - histnames: list of histogram types to write
    # - year: data-taking year (a string, used for the data directories and in the output file names)
    # each output file (DF<year>_<histname>.csv, in the current working directory)
    # contains one single histogram type for all runs and lumisections of the given year.
    # note: this function can take quite a while to run!
    csvfiles = []
    for datadir in list(get_data_dirs(year)):
        csvfiles.extend(sort_filenames(list(get_csv_files(datadir))))
    # read all requested histograms into one df
    merged = read_and_merge_csv(csvfiles,histnames)
    # write one file per histogram type (spaces in the name are replaced by underscores)
    for histname in histnames:
        outputfile = 'DF'+year+'_'+histname.replace(' ','_')+'.csv'
        select_histnames(merged,[histname]).to_csv(outputfile)

In [None]:
# some functions to find if a lumisection belongs to DCS, GOLDEN or neither

def injson(run,lumi,jsonfile):
    ### find if a run and lumi combination is in a given json file
    # input arguments:
    # - run and lumi: either integers or (equally long) arrays of integers
    # - jsonfile: path to a json file, mapping run numbers (as strings)
    #   to lists of [first lumisection, last lumisection] ranges (both inclusive)
    # returns:
    # a boolean (for scalar input) or a boolean np array (for array input, also of length 1).
    # if the json file does not exist, a message is printed and the error from open() is raised.
    if not os.path.exists(jsonfile):
        print('requested json file '+jsonfile+' does not seem to exist...')
    with open(jsonfile) as f: gdict = json.load(f)
    # a string has a length but represents a single run number, so treat it as a scalar
    scalar_input = isinstance(run,str) or not hasattr(run,'__len__')
    if scalar_input:
        run = [run]; lumi = [lumi]
    # note: the builtin bool is used as dtype, since the alias np.bool was removed from numpy
    res = np.zeros(len(run),dtype=bool)
    for i,(r,l) in enumerate(zip(run,lumi)):
        for lumis in gdict.get(str(r),[]):
            if(l>=lumis[0] and l<=lumis[1]):
                res[i] = True
                break
    # only unpack the result for scalar input, so that array input of length 1
    # still gives an array that can be used as a mask
    if scalar_input: return res[0]
    return res

def isgolden(run,lumi):
    ### check whether run and lumi combination(s) are in one of the golden json files
    # run and lumi are either integers or (equally long) arrays of integers;
    # the result combines (logical OR) the lookups in the 2017 and the 2018 file.

    # old golden jsons (prompt reco):
    #jsonloc2017 = '/eos/project/c/cmsml4dc/ML_2020/Scripts2020/GoldenJSON17.json'
    #jsonloc2018 = 'goldenJSON2018.json' # temporary and manually copied from twiki, removed now.
    # new golden jsons (rereco):
    # - 2017: ultralegacy reprocessing; from: /afs/cern.ch/cms/CAF/CMSCOMM/COMM_DQM/certification/Collisions17/13TeV/Legacy_2017/Cert_294927-306462_13TeV_UL2017_Collisions17_GoldenJSON.txt
    # - 2018: legacy reprocessing; from: /afs/cern.ch/cms/CAF/CMSCOMM/COMM_DQM/certification/Collisions18/13TeV/Legacy_2018/Cert_314472-325175_13TeV_Legacy2018_Collisions18_JSON.txt
    golden2017 = 'utils/json_GOLDEN_2017.txt'
    golden2018 = 'utils/json_GOLDEN_2018.txt'

    in2017 = injson(run,lumi,golden2017)
    in2018 = injson(run,lumi,golden2018)
    return in2017 + in2018

def select_golden(df):
    ### reduce a df to its golden lumisections
    # the index of the returned dataframe is reset.
    golden = isgolden(df['fromrun'].values,df['fromlumi'].values)
    return df[golden].reset_index(drop=True)

def select_notgolden(df):
    ### reduce a df to the lumisections that are not golden
    # the index of the returned dataframe is reset.
    golden = isgolden(df['fromrun'].values,df['fromlumi'].values)
    notgolden = np.invert(golden)
    return df[notgolden].reset_index(drop=True)

def isdcson(run,lumi):
    ### check whether run and lumi combination(s) are in one of the DCS-only json files
    # run and lumi are either integers or arrays of integers;
    # the result combines (logical OR) the lookups in the 2017 and the 2018 file.

    # origin of the json files:
    # - 2017: /afs/cern.ch/cms/CAF/CMSCOMM/COMM_DQM/certification/Collisions17/13TeV/DCSOnly/json_DCSONLY.txt
    # - 2018: /afs/cern.ch/cms/CAF/CMSCOMM/COMM_DQM/certification/Collisions18/13TeV/DCSOnly/json_DCSONLY.txt
    dcs2017 = 'utils/json_DCSONLY_2017.txt'
    dcs2018 = 'utils/json_DCSONLY_2018.txt'
    in2017 = injson(run,lumi,dcs2017)
    in2018 = injson(run,lumi,dcs2018)
    return in2017 + in2018

def select_dcson(df):
    ### reduce a df to the lumisections that have DCS-bit on
    # the index of the returned dataframe is reset.
    dcson = isdcson(df['fromrun'].values,df['fromlumi'].values)
    return df[dcson].reset_index(drop=True)

def select_dcsoff(df):
    ### reduce a df to the lumisections that have DCS-bit off
    # the index of the returned dataframe is reset.
    dcson = isdcson(df['fromrun'].values,df['fromlumi'].values)
    dcsoff = np.invert(dcson)
    return df[dcsoff].reset_index(drop=True)

### deprecated functions ###
# either unused for a long time (so they may need updating)
# or known to be wrong, meaningless or not useful

def isgolden_fast(run,lumi,gdict):
    ### look up a run and lumi combination in an already loaded json dict (faster than reading the file each time)
    # - run and lumi: integers
    # - gdict: dict mapping run numbers (as strings) to lists of [first, last] lumisection ranges
    # returns True if lumi lies within one of the (inclusive) ranges of the run, False otherwise.
    key = str(run)
    if key not in gdict: return False
    return any(lumis[0]<=lumi<=lumis[1] for lumis in gdict[key])

def isbad(run,lumi):
    ### check whether run and lumi combination(s) are in the 2017 'bad' json file
    # NOTE: turns out this json file is not official,
    #       just something that Francesco created based on an autoencoder...
    #       do not use for official tests.
    # no 2018 json file for bad runs available yet!
    badjson2017 = '/eos/project/c/cmsml4dc/ML_2020/Scripts2020/JsonBAD17.json'
    return injson(run,lumi,badjson2017)

def get_bad(df):
    ### collect the run and lumisection numbers of the bad lumisections, without modifying df
    # returns a tuple (list of run numbers, list of lumisection numbers) with one entry per bad row.
    # note: rows are accessed by the labels 0..len(df)-1, so df is expected to have a default index.
    runlist, lslist = [], []
    for row in range(len(df)):
        run = df.at[row,'fromrun']
        lumi = df.at[row,'fromlumi']
        if not isbad(run,lumi): continue
        runlist.append(run)
        lslist.append(lumi)
    return (runlist,lslist)

def get_quality(df):
    ### count the number of 'good' (golden) entries, 'bad' entries and remaining entries in a df
    # warning: this function assumes that df contains only one entry per unique LS!
    # returns a tuple (ngood,nbad,nother), where nother = len(df) - ngood - nbad.
    runs = df['fromrun'].values
    lumis = df['fromlumi'].values
    ngood = np.sum(isgolden(runs,lumis))
    nbad = np.sum(isbad(runs,lumis))
    return (ngood,nbad,len(df)-ngood-nbad)

def select_bad(df):
    ### reduce a df to its bad lumisections
    # the index of the returned dataframe is reset.
    bad = isbad(df['fromrun'].values,df['fromlumi'].values)
    return df[bad].reset_index(drop=True)

def select_golden_and_bad(df):
    ### reduce a df to the lumisections that are either golden or bad
    # the index of the returned dataframe is reset.
    runs = df['fromrun'].values
    lumis = df['fromlumi'].values
    golden = isgolden(runs,lumis)
    bad = isbad(runs,lumis)
    return df[golden + bad].reset_index(drop=True)

In [None]:
# functions to obtain histograms in np array format

def get_hist_values(df):
    ### same as builtin "df['histo'].values" but convert strings to np arrays
    # input arguments:
    # - df: a dataframe with columns 'histo' (json-formatted list of bin values),
    #   'fromrun' and 'fromlumi'
    # returns:
    # a tuple (vals,runs,ls) of np arrays of shape (nhistograms,nbins),
    # (nhistograms) and (nhistograms) respectively.
    # warning: no check is done to assure that all histograms are of the same type!
    #          (the number of bins is taken from the first histogram)
    # note: rows are accessed by position, so df does not need to have a default index;
    #       an empty df gives empty arrays.
    nhists = len(df)
    runs = np.zeros(nhists)
    ls = np.zeros(nhists)
    if nhists==0: return (np.zeros((0,0)),runs,ls)
    vals = None
    rows = zip(df['histo'].values,df['fromrun'].values,df['fromlumi'].values)
    for i,(histo,run,lumi) in enumerate(rows):
        binvalues = json.loads(histo)
        # allocate the output array once the number of bins is known
        if vals is None: vals = np.zeros((nhists,len(binvalues)))
        vals[i,:] = binvalues
        runs[i] = int(run)
        ls[i] = int(lumi)
    return (vals,runs,ls)

In [None]:
# functions for plotting 
      
def plot_hists(histlist,colorlist=[],labellist=[],transparency=1,xlims=(0,-1)):
    ### draw a collection of histograms in one new figure, optionally with colors and/or labels
    # - histlist: a list of 1D arrays containing the histograms (or a 2D array of shape (nhistograms,nbins))
    # - colorlist: a list or array containing colors (in string format), one per histogram
    #   (default: empty, i.e. the default matplotlib colors are used)
    # - labellist: a list or array containing labels for in legend, one per histogram
    #   (default: empty, i.e. no legend is drawn)
    # - transparency: alpha value used for all histograms
    # - xlims: tuple of lower and upper x-axis limit
    #   (if the upper limit is below the lower one, the range (0,nbins) is used)
    dolabel = len(labellist)>0
    if not dolabel: labellist = ['']*len(histlist)
    docolor = len(colorlist)>0
    nbins = len(histlist[0])
    if xlims[1]<xlims[0]: xlims = (0,nbins)
    xax = np.linspace(xlims[0],xlims[1],num=nbins)
    plt.figure()
    for i,row in enumerate(histlist):
        stepargs = {'label':labellist[i],'alpha':transparency}
        if docolor: stepargs['color'] = colorlist[i]
        plt.step(xax,row,**stepargs)
    if dolabel: plt.legend()
    
def plot_hists_multi(histlist,colorlist=[],labellist=[],transparency=1,xlims=(0,-1)):
    ### draw many histograms in one new figure, with colors taken from a color map
    # - histlist: a list of 1D arrays containing the histograms (or a 2D array of shape (nhistograms,nbins))
    # - colorlist: a list or array containing numbers to be mapped to colors, one per histogram
    #   (default: empty, i.e. the default matplotlib colors are used and no color bar is drawn)
    # - labellist: a list or array containing labels for in legend, one per histogram
    #   (default: empty, i.e. no legend is drawn)
    # - transparency: alpha value used for all histograms
    # - xlims: tuple of lower and upper x-axis limit
    #   (if the upper limit is below the lower one, the range (0,nbins) is used)
    dolabel = len(labellist)>0
    if not dolabel: labellist = ['']*len(histlist)
    docolor = len(colorlist)>0
    nbins = len(histlist[0])
    if xlims[1]<xlims[0]: xlims = (0,nbins)
    xax = np.linspace(xlims[0],xlims[1],num=nbins)
    plt.figure()
    cobject = None
    if docolor:
        # map the numbers in colorlist linearly onto the 'jet' color map
        norm = mpl.colors.Normalize(vmin=np.min(colorlist),vmax=np.max(colorlist))
        cobject = mpl.cm.ScalarMappable(norm=norm, cmap=mpl.cm.jet)
        cobject.set_array([]) # ad-hoc bug fix
    for i,row in enumerate(histlist):
        stepargs = {'label':labellist[i],'alpha':transparency}
        if docolor: stepargs['color'] = cobject.to_rgba(colorlist[i])
        plt.step(xax,row,**stepargs)
    if docolor: plt.colorbar(cobject)
    if dolabel: plt.legend()
    
def plot_hists_from_df(df,histtype,nhists):
    ### plot a number of histograms in a dataframe
    # - df is the dataframe from which to plot
    # - histtype is the name of the histogram type (e.g. 'chargeInner_PXLayer_1')
    # - nhists is the number of histograms to plot
    #   (if fewer histograms of this type are available, all of them are plotted)
    dfs = select_histnames(df,[histtype])
    nhists = min(len(dfs),nhists)
    # take exactly the first nhists rows (the end of a slice is exclusive)
    dfs = dfs[0:nhists]
    val = get_hist_values(dfs)[0]
    plot_hists(val)
    
def plot_sets(setlist,ax=None,title='',colorlist=[],labellist=[],transparencylist=[],xlims=(0,-1)):
    ### draw several sets of histograms in one plot to compare their shapes
    # - setlist: a list of 2D numpy arrays containing histograms (one array per set)
    # - ax: a pyplot axis object (if None a new figure is created)
    # - title: a string that will be used as the title for the ax object (default: no title)
    # - colorlist, labellist, transparencylist: lists with one element per set
    #   (default colors are available for up to four sets; default: no legend, no transparency)
    # - xlims: tuple of lower and upper x-axis limit
    #   (if the upper limit is below the lower one, the range (0,nbins) is used)
    dolabel = len(labellist)>0
    if not dolabel: labellist = ['']*len(setlist)
    if len(colorlist)==0:
        colorlist = ['red','blue','green','orange']
        if len(setlist)>4:
            print('ERROR: please specify the colors if you plot more than four sets.')
            return
    if len(transparencylist)==0: transparencylist = [1.]*len(setlist)
    nbins = len(setlist[0][0])
    if xlims[1]<xlims[0]: xlims = (0,nbins)
    xax = np.linspace(xlims[0],xlims[1],num=nbins)
    if ax is None: _,ax = plt.subplots()
    for i,histlist in enumerate(setlist):
        setstyle = {'color':colorlist[i],'alpha':transparencylist[i]}
        # only the first histogram of each set gets the label, to have one legend entry per set
        ax.step(xax,histlist[0],label=labellist[i],**setstyle)
        if len(histlist)<2: continue
        for row in histlist[1:,:]:
            ax.step(xax,row,**setstyle)
    if dolabel: ax.legend(loc='upper right')
    if len(title)>0: ax.set_title(title)

In [None]:
### functions for calculating moments of a histogram

def moment(bins,counts,order):
    ### get n-th raw moment (i.e. the moment about zero, not about the mean) of a histogram
    # input arguments:
    # - bins is a 1D or 2D np array holding the bin centers
    #   (shape (nbins) or (nhistograms,nbins))
    # - counts is a np array containing the bin counts
    #   (shape (nhistograms,nbins), or (nbins) for a single histogram)
    # - order is the order of the moment to calculate
    #   (0 = maximum, 1 = mean value)
    # returns:
    # a 1D np array of shape (nhistograms), in which NaN values
    # (e.g. for histograms with zero total count) are replaced by 0;
    # or None (after printing an error) if the shapes of bins and counts do not match.
    # a single histogram is treated as a set containing one histogram
    if len(counts.shape)==1:
        counts = np.array([counts])
    # use the same bin centers for all histograms if only one set of bin centers is given
    if len(bins.shape)==1:
        bins = np.tile(bins,(len(counts),1))
    if not bins.shape == counts.shape:
        print('### ERROR ###: bins and counts do not have the same shape!')
        return None
    if order==0: # return maximum
        return np.nan_to_num(np.max(counts,axis=1))
    weighted = np.sum(np.multiply(counts,np.power(bins,order)),axis=1,dtype=float)
    total = np.sum(counts,axis=1)
    # division by zero is expected for empty histograms and handled by nan_to_num below
    with np.errstate(divide='ignore',invalid='ignore'):
        ratio = np.divide(weighted,total)
    return np.nan_to_num(ratio)

def histmean(bins,counts):
    ### mean value of histogram(s): the moment calculation for order 1
    # see the function moment for the format of bins and counts.
    mean = moment(bins,counts,1)
    return mean

def histrms(bins,counts):
    ### special case of moment calculation
    # the spread of the histogram(s) is calculated as
    # sqrt( <x^2> - <x>^2 ), using the moments of order 2 and 1.
    # see the function moment for the format of bins and counts.
    meansq = moment(bins,counts,2)
    mean = moment(bins,counts,1)
    variance = meansq-np.power(mean,2)
    # the variance can come out slightly negative because of floating point round-off
    # (e.g. if all entries are in one bin), which would give NaN; clamp it at zero instead
    return np.power(np.maximum(variance,0),0.5)

def histmoments(bins,counts,orders):
    ### calculate the moments of histogram(s) for several orders at once
    # - orders: list of orders for which to calculate the moment
    # see the function moment for the format of bins and counts.
    # the return type is a numpy array of shape (nhistograms,nmoments)
    nhists, norders = len(counts), len(orders)
    result = np.zeros((nhists,norders))
    for col,order in enumerate(orders):
        result[:,col] = moment(bins,counts,order)
    return result

In [None]:
### averaging a collection of histograms (e.g. for template definition)

def averagehists(hists,nout):
    ### split hists into nout consecutive parts of equal size and average the histograms in each part
    # - hists: np array of shape (nhistograms,nbins)
    # - nout: number of averaged histograms to return
    # returns a np array of shape (nout,nbins).
    # note: each part holds int(nhistograms/nout) histograms,
    #       so the last nhistograms % nout histograms are not used.
    avghists = np.zeros((nout,hists.shape[1]))
    partsize = int(len(hists)/nout)
    for part in range(nout):
        first = part*partsize
        avghists[part,:] = hists[first:first+partsize,:].mean(axis=0)
    return avghists

In [None]:
### rebinning of histograms

def rebinhists(hists,factor):
    ### merge every group of 'factor' adjacent bins into one bin, for a set of histograms
    # - hists: a numpy array of shape (nhistograms,nbins)
    # - factor: the rebinning factor, which must be a divisor of nbins
    # returns a numpy array of shape (nhistograms,nbins/factor);
    # if factor is not a divisor of nbins, an error is printed and hists is returned unchanged.
    nhists, nbins = hists.shape
    if nbins%factor!=0:
        print('### ERROR ###: no rebinning performed since no suitable reduction factor was given.')
        return hists
    nnew = int(nbins/factor)
    rebinned = np.zeros((nhists,nnew))
    for newbin in range(nnew):
        first = factor*newbin
        rebinned[:,newbin] = hists[:,first:first+factor].sum(axis=1)
    return rebinned

In [None]:
### normalization

from sklearn.preprocessing import normalize

def normalizehists(hists):
    ### normalize each histogram in hists (i.e. sum of bin contents equals one for each histogram)
    # hists holds one histogram per row; the l1 normalization of sklearn is applied row by row.
    normalized = normalize(hists, norm='l1', axis=1)
    return normalized

In [None]:
def preparedatafromnpy(dataname, rebinningfactor=1, donormalize=True, doplot=False):
    # load histograms from a .npy file and return them after preprocessing
    # args:
    # - dataname: path to the .npy file (a 2D array with one histogram per row)
    # - rebinningfactor: an integer number to downsample the histograms
    # - donormalize: if True, the histograms are normalized
    # - doplot: if True, all (preprocessed) histograms are plotted in one figure

    # read the data and remove the first and last bin of each histogram
    hist = np.load(dataname,allow_pickle=False)[:,1:-1]
    # preprocessing of the data: rebinning and normalizing
    rhist = hist if rebinningfactor==1 else rebinhists(hist,rebinningfactor)
    if donormalize: rhist = normalizehists(rhist)

    if doplot:
        # plot histograms
        plt.figure()
        nbins = len(rhist[0])
        xax = np.linspace(0,nbins,num=nbins)
        for i in range(len(hist)): plt.step(xax,rhist[i,:],color='b')
        plt.title('Histograms in file')

    return rhist

def preparedatafromdf(df, returnrunls=False, onlygolden=False, rebinningfactor=1, donormalize=True, doplot=False):
    # prepare the data contained in a dataframe in the form of a numpy array
    # args:
    # - df: a dataframe containing histograms (of a single type)
    # - returnrunls: whether to return only a histogram array or 1D arrays of run and lumisection numbers as well
    # - onlygolden: if True, only lumisections in the golden json file are kept
    # - rebinningfactor: an integer number to downsample the histograms in the dataframe
    # - donormalize: if True, data are normalized
    # - doplot: if True, some example plots are made showing the histograms
    # returns:
    # the array of preprocessed histograms (shape (nhistograms,nbins)),
    # or a tuple (histograms, run numbers, lumisection numbers) if returnrunls is True.

    if onlygolden:
        df = select_golden(df)

    # preprocessing of the data: rebinning and normalizing
    (hist,runnbs,lsnbs) = get_hist_values(df)
    # remove the first and last bin (presumably the under- and overflow bins)
    hist = hist[:,1:-1]
    if rebinningfactor != 1: rhist = rebinhists(hist,rebinningfactor)
    else: rhist = hist
    if donormalize: rhist = normalizehists(rhist)

    if doplot:
        # plot some (randomly chosen) examples
        nplot = min(10,len(hist))
        randint = np.random.choice(len(hist),size=nplot,replace=False)
        xmax = len(hist[0])
        xax = np.linspace(0,xmax,num=len(hist[0]))
        # the rebinned histograms have fewer bins, so they need their own x-axis
        # (spanning the same range as the original histograms)
        rxax = np.linspace(0,xmax,num=len(rhist[0]))
        plt.figure()
        for i in randint: plt.step(xax,hist[int(i),:],color='r')
        plt.title('Examples of histograms in data')
        plt.figure()
        for i in randint: plt.step(rxax,rhist[int(i),:],color='b')
        plt.title('Same histograms, but rebinned and normalized')

    if returnrunls: return (rhist,runnbs,lsnbs)
    return rhist

def preparedatafromcsv(dataname, returnrunls=False, onlygolden=False, rebinningfactor=1, donormalize=True, doplot=False):
    # read a csv file into a dataframe and prepare its histograms in the form of a numpy array
    # args:
    # - dataname: path to the csv file
    # - returnrunls: whether to return only a histogram array or 1D arrays of run and lumisection numbers as well
    # - onlygolden: if True, only lumisections in the golden json file are kept
    # - rebinningfactor: an integer number to downsample the histograms in the dataframe
    # - donormalize: if True, data are normalized
    # - doplot: if True, some example plots are made showing the histograms

    # all options are passed on unchanged to preparedatafromdf
    options = {
        'returnrunls': returnrunls,
        'onlygolden': onlygolden,
        'rebinningfactor': rebinningfactor,
        'donormalize': donormalize,
        'doplot': doplot,
    }
    return preparedatafromdf(read_csv(dataname), **options)