**Create a HistStruct object**

Use this script or something similar to create and save a HistStruct object that can be used as an input for later steps (training and testing of classifiers).  
The HistStruct should contain all histograms (all lumisections, all types) that will be needed in later steps,
with suitable masks to select training, testing and/or other subsets.  

This script is not fixed: depending on your needs, you may want different definitions of the training set, the good and bad test sets, the application runs, the histogram types, etc.

In [None]:
### imports

# external modules
print('importing external modules...')
print('  import os'); import os
print('  import sys'); import sys
print('  import importlib'); import importlib

# local modules: utils
print('importing utils...')
sys.path.append('../utils')
print('  import csv_utils as csvu'); import csv_utils as csvu
print('  import json_utils as jsonu'); import json_utils as jsonu
print('  import dataframe_utils as dfu'); import dataframe_utils as dfu
print('refreshing utils...')
importlib.reload(csvu)
importlib.reload(jsonu)
importlib.reload(dfu)

# local modules: src
print('importing src...')
sys.path.append('../src')
sys.path.append('../src/classifiers')
print('  import HistStruct'); import HistStruct
print('refreshing src...')
importlib.reload(HistStruct)

print('done')

In [None]:
### define properties

# Run/lumisection selections, per data-taking year, in the format
# { year: { run number (as string): [ [first ls, last ls], ... ] } }.
# In this notebook a range of [-1] is used for runs that are taken as a whole.

# good 'reference' runs, selected by eye;
# to be replaced at some point by the reference runs defined by the DQM/DC team.
goodrunsls = {
    '2017': {
        "297056": [[-1]],
        "297177": [[-1]],
        "301449": [[-1]],
    },
    # 2018: needs to be re-checked, not guaranteed to be fully correct or representative.
    '2018': {
        "315267": [[-1]],
    },
}

# core test set of clearly bad runs, selected by eye.
# note: the order of the runs matters, since a numbered mask is defined per bad run later on.
badrunsls = {
    '2017': {
        "297287": [[-1]],
        "297288": [[-1]],
        "297289": [[-1]],
        "299316": [[-1]],
        "299324": [[-1]],
        "299326": [[-1]],
        # only bad for size_PXDisk_+1 -> maybe do not use for now
        # (unclear what are real anomalies)
        "301086": [[88, 126]],
    },
    # 2018: needs to be re-checked, not guaranteed to be fully correct or representative.
    '2018': {
        # currently disabled: "317479": [[-1]]
        "317480": [[-1]],
        "317481": [[-1]],
        "317482": [[-1]],
        # currently disabled: "319847": [[1, 35]]
    },
}

# data-taking year to process
# (used to pick the input csv files and the predefined bad runs)
year = '2017'

# histogram types to include; one csv file is read per entry
histnames = [
    'chargeInner_PXLayer_2',
    'chargeInner_PXLayer_3',
    'charge_PXDisk_+1',
    'charge_PXDisk_+2',
    'charge_PXDisk_+3',
    'size_PXLayer_1',
    'size_PXLayer_2',
    'size_PXLayer_3',
]

# set whether to train globally or locally
# - 'global': no training or good-run mask is defined here, only the predefined bad runs.
# - 'local': the training set is a small number of runs chosen relative to an application run.
# the result of this step is the three variables runsls_training, runsls_good and runsls_bad
# (each either None or a run/lumisection json dict).
training_mode = 'local'

if training_mode == 'global':
    # train globally on a large set of runs (e.g. entire 2017 data with DCS-bit on and sufficient statistics)
    runsls_training = None # use none to not add a mask for training (can e.g. use DCS-bit on and high statistics masks)
    runsls_good = None # use none to not add a mask for good runs (can e.g. use averages of training set)
    runsls_bad = badrunsls[year] # predefined bad runs

elif training_mode == 'local':
    # train locally on a small set of runs
    # - either on n runs preceding a chosen application run,
    # - or on the run associated as reference to the chosen application run.
    #   (not yet defined properly, use first approach for now.)

    # select application run
    # (the list of available runs is taken from the DCS-on selection of the first histogram type)
    available_runs = dfu.get_runs( dfu.select_dcson( csvu.read_csv('../data/DF'+year+'_'+histnames[0]+'.csv') ) )
    run_application = 306458
    # note: list.index raises a ValueError if the application run is not available
    run_application_index = available_runs.index(run_application)
    # select training set
    usereference = False
    if usereference:
        # NOTE(review): 'rru' was used here without ever being imported in this notebook.
        # it is assumed to be the refruns_utils module in ../utils - confirm.
        import refruns_utils as rru
        run_reference = rru.get_reference_run( run_application, jsonfile='../utils/json_allRunsRefRuns.json' )
        if run_reference<0:
            raise Exception('no valid reference run has been defined for run {}'.format(run_application))
        runsls_training = jsonu.tuplelist_to_jsondict([(run_reference,[-1])])
    else:
        ntraining = 5 # number of training runs (preceding the application run)
        offset = 0 # normal case: offset = 0 (just use ntraining previous runs)
        first_training_index = run_application_index-ntraining-offset
        # a negative start index would silently wrap around to the end of the list
        # and give an empty or wrong training set, so fail explicitly instead.
        if first_training_index<0:
            raise Exception('not enough runs preceding run {}'.format(run_application)
                            +' to select {} training runs with offset {}'.format(ntraining, offset)
                            +' (only {} preceding runs available)'.format(run_application_index))
        training_runs = available_runs[first_training_index:run_application_index-offset]
        runsls_training = jsonu.tuplelist_to_jsondict([(el,[-1]) for el in training_runs])
    # put the usual bad runs as bad set and the application run as good set
    runsls_bad = badrunsls[year]
    runsls_good = jsonu.tuplelist_to_jsondict([(run_application,[-1])])
    # alternative: put the application run as bad set and the training runs as good set
    #runsls_bad = jsonu.tuplelist_to_jsondict([(run_application,[-1])])
    #runsls_good = runsls_training

else:
    # without this, the run/lumisection sets would stay undefined
    # and the failure would only show up later as a NameError.
    raise Exception('unrecognized training mode: {}'.format(training_mode)
                    +' (choose between "global" and "local")')

# print an overview of the run/lumisection selections defined above;
# each selection is printed as a header line followed by an indented line with its contents
# (None means that no mask will be defined for that set).
print('The following masks will be defined:')
print('for the training set: ', '  {}'.format(runsls_training), sep='\n')
print('for the good test set:', '  {}'.format(runsls_good), sep='\n')
print('for the bad test set:', '  {}'.format(runsls_bad), sep='\n')

In [None]:
### read the data based on the configuration defined above

# create the HistStruct object
histstruct = HistStruct.HistStruct()

# in case of local training, we can remove most of the histograms:
# only the runs/lumisections in the training, good or bad sets are kept.
# the combined selection does not depend on the histogram type, so it is built only once.
# if a run occurs in more than one set, its lumisection ranges are combined
# (instead of the last set silently overwriting the earlier ones).
# NOTE(review): this assumes dfu.select_runsls accepts several ranges per run - confirm.
runsls_total = None
if( runsls_training is not None and runsls_good is not None and runsls_bad is not None ):
    runsls_total = {}
    for runsls in (runsls_training, runsls_good, runsls_bad):
        for run, lsranges in runsls.items():
            # collect into a new list, so the original sets are not modified
            merged_lsranges = runsls_total.setdefault(run, [])
            for lsrange in lsranges:
                if lsrange not in merged_lsranges: merged_lsranges.append(lsrange)

# loop over the histogram types to take into account
for histname in histnames:
    print('adding {}...'.format(histname))
    # read the histograms from the csv file
    filename = '../data/DF'+year+'_'+histname+'.csv'
    df = csvu.read_csv( filename )
    # apply the combined run/lumisection selection (if any)
    if runsls_total is not None:
        df = dfu.select_runsls( df, runsls_total )
    # add the histograms to the HistStruct (note the available preprocessing options, e.g. normalizing which is switched on by default)!
    histstruct.add_dataframe( df )
print('added {} lumisections with {} histograms each to the dataframe.'.format(len(histstruct.runnbs),len(histstruct.histnames)))
    
# default masks: DCS-bit on, golden json, and high statistics selection
histstruct.add_dcsonjson_mask('dcson')
histstruct.add_goldenjson_mask('golden')
histstruct.add_highstat_mask('highstat')

# custom masks for the training and test sets
# (a set that is None does not get a mask)
if runsls_training is not None:
    histstruct.add_json_mask('training', runsls_training)
if runsls_good is not None:
    histstruct.add_json_mask('good', runsls_good)

# the bad set gets one overall mask ('bad'), plus one numbered mask per bad run
# ('bad0', 'bad1', ...), since different bad runs have different characteristics.
nbadruns = 0
if runsls_bad is not None:
    histstruct.add_json_mask('bad', runsls_bad)
    nbadruns = len(runsls_bad)
    for i, (badrun, badls) in enumerate(runsls_bad.items()):
        histstruct.add_json_mask('bad{}'.format(i), {badrun: badls})
    
# write the HistStruct to file, so it can be used as input for later steps
histstruct.save('test.zip')

# print a short summary of what was created
print(
    'created a histstruct with the following properties:',
    '- number of histogram types: {}'.format(len(histstruct.histnames)),
    '- number of lumisections: {}'.format(len(histstruct.lsnbs)),
    '- masks: {}'.format(list(histstruct.masks.keys())),
    sep='\n',
)