**Run the testing of histogram classifiers using a configuration file**

See example\_configuration\_for\_autoencoder.py for an example of a valid configuration file.  

Note: this is step 3/3, preceded by run\_fitting.ipynb.

Import the proper configuration file in the second cell of this notebook. The rest should be fully automatic; just run all cells sequentially. (Note: at a later stage, the plan is to convert this notebook to a regular .py script that can simply be run with the configuration file as a command line argument.)

In [None]:
### imports

# external modules
print('importing external modules...')
print('  import os'); import os
print('  import sys'); import sys
print('  import pandas as pd'); import pandas as pd
print('  import numpy as np'); import numpy as np
print('  import matplotlib.pyplot as plt'); import matplotlib.pyplot as plt
print('  import pickle'); import pickle
print('  import importlib'); import importlib

# local modules: utils
print('importing utils...')
sys.path.append('../utils')
print('  import csv_utils as csvu'); import csv_utils as csvu
print('  import json_utils as jsonu'); import json_utils as jsonu
print('  import dataframe_utils as dfu'); import dataframe_utils as dfu
print('  import hist_utils as hu'); import hist_utils as hu
print('  import autoencoder_utils as aeu'); import autoencoder_utils as aeu
print('  import plot_utils as pu'); import plot_utils as pu
print('  import generate_data_utils as gdu'); import generate_data_utils as gdu
print('  import refruns_utils as rru'); import refruns_utils as rru
print('  import mask_utils as mu'); import mask_utils as mu
print('refreshing utils...')
importlib.reload(csvu)
importlib.reload(jsonu)
importlib.reload(dfu)
importlib.reload(hu)
importlib.reload(aeu)
importlib.reload(pu)
importlib.reload(gdu)
importlib.reload(rru)
importlib.reload(mu)

# local modules: src
print('importing src...')
sys.path.append('../src')
sys.path.append('../src/classifiers')
sys.path.append('../src/cloudfitters')
print('  import HistStruct'); import HistStruct
print('  import HistogramClassifier'); import HistogramClassifier
print('  import AutoEncoder'); import AutoEncoder
print('  import SeminormalFitter'); import SeminormalFitter
print('  import GaussianKdeFitter'); import GaussianKdeFitter
print('  import HyperRectangleFitter'); import HyperRectangleFitter
print('refreshing src...')
importlib.reload(HistStruct)
importlib.reload(HistogramClassifier)
importlib.reload(AutoEncoder)
importlib.reload(SeminormalFitter)
importlib.reload(GaussianKdeFitter)
importlib.reload(HyperRectangleFitter)

print('done')

In [None]:
### import the configuration file

print('--- importing configuration file... ---\n')
import example_configuration_for_autoencoder_cfg as conf
# reload so that the module is re-executed when this cell is run again
importlib.reload(conf)
print('\n--- done importing configuration file ---')

# load the histstruct
# note: the file that is read is not HISTSTRUCT_FILE_NAME itself,
#       but that name with its extension replaced by '_fitted.zip'
#       (presumably the output of the preceding fitting step - to be confirmed).
# note: only a missing attribute is treated as a configuration error;
#       any other exception is left to propagate unchanged.
if not hasattr(conf, 'HISTSTRUCT_FILE_NAME'):
    raise Exception('ERROR: parameter HISTSTRUCT_FILE_NAME not found in the config file.'
                   +' Please set this parameter with the path to the input HistStruct')
histstruct = HistStruct.HistStruct.load( os.path.splitext(conf.HISTSTRUCT_FILE_NAME)[0]+'_fitted.zip', verbose=True )

# other initializations:
# count the number of good and bad test sets (one entry of the double list per set)
if not hasattr(conf, 'TEST_GOOD_MASKS'):
    raise Exception('ERROR: variable TEST_GOOD_MASKS not defined in the config file.'
                   +' You must define a double list of mask names for the good test set(s) with this name.')
ngoodsets = len(conf.TEST_GOOD_MASKS)
if not hasattr(conf, 'TEST_BAD_MASKS'):
    raise Exception('ERROR: variable TEST_BAD_MASKS not defined in the config file.'
                   +' You must define a double list of mask names for the bad test set(s) with this name.')
nbadsets = len(conf.TEST_BAD_MASKS)

In [None]:
### read the scores

# global scores of the good test set(s), one array per set.
# if EXTEND_TEST_GOOD is set, the scores are requested via get_extglobalscores
# using the combined name of the masks of each set;
# otherwise they are requested via get_globalscores using the masks directly.
if conf.EXTEND_TEST_GOOD:
    logprob_good_parts = [ histstruct.get_extglobalscores( mu.get_combined_name(conf.TEST_GOOD_MASKS[iset]) )
                           for iset in range(ngoodsets) ]
else:
    logprob_good_parts = [ histstruct.get_globalscores( masknames=conf.TEST_GOOD_MASKS[iset] )
                           for iset in range(ngoodsets) ]

# same for the bad test set(s), steered by EXTEND_TEST_BAD
if conf.EXTEND_TEST_BAD:
    logprob_bad_parts = [ histstruct.get_extglobalscores( mu.get_combined_name(conf.TEST_BAD_MASKS[iset]) )
                          for iset in range(nbadsets) ]
else:
    logprob_bad_parts = [ histstruct.get_globalscores( masknames=conf.TEST_BAD_MASKS[iset] )
                          for iset in range(nbadsets) ]

# merge the per-set arrays into one array for good and one for bad
logprob_good = np.concatenate(logprob_good_parts)
logprob_bad = np.concatenate(logprob_bad_parts)

In [None]:
### make a roc curve based on the test results above
# note: smaller logprob = less probable = more outlier = more anomalous
# so if anomalies are signal and good histograms are background, -logprob is a suitable score definition,
# since everything above a certain threshold will be considered signal and below it background.
# note: optional plotting parameters that are missing from the config file
#       are added to the conf module with a default value (and a warning is printed).

labels_good = np.zeros(len(logprob_good)) # background: label = 0
labels_bad = np.ones(len(logprob_bad)) # signal: label = 1

# good entries come first, bad entries second, in both labels and scores
labels = np.concatenate((labels_good,labels_bad))
scores = np.concatenate((-logprob_good,-logprob_bad))
scores = aeu.clip_scores( scores )

# score distribution
if not hasattr(conf, 'PLOT_SCORE_DIST'):
    print('WARNING: parameter PLOT_SCORE_DIST not found in config file, putting to False.')
    conf.PLOT_SCORE_DIST = False

if conf.PLOT_SCORE_DIST:
    if not hasattr(conf, 'PLOT_SCORE_DIST_OPTIONS'):
        print('WARNING: parameter PLOT_SCORE_DIST_OPTIONS not found in config file, putting to empty dict.')
        conf.PLOT_SCORE_DIST_OPTIONS = {}
    pu.plot_score_dist(scores, labels, **conf.PLOT_SCORE_DIST_OPTIONS)

# roc curve
if not hasattr(conf, 'PLOT_ROC_CURVE'):
    print('WARNING: parameter PLOT_ROC_CURVE not found in config file, putting to False.')
    conf.PLOT_ROC_CURVE = False

if conf.PLOT_ROC_CURVE:
    if not hasattr(conf, 'PLOT_ROC_CURVE_OPTIONS'):
        print('WARNING: parameter PLOT_ROC_CURVE_OPTIONS not found in config file, putting to empty dict.')
        conf.PLOT_ROC_CURVE_OPTIONS = {}
    auc = aeu.get_roc(scores, labels, **conf.PLOT_ROC_CURVE_OPTIONS)

# confusion matrix
if not hasattr(conf, 'PLOT_CONFUSION_MATRIX'):
    print('WARNING: parameter PLOT_CONFUSION_MATRIX not found in config file, putting to False.')
    conf.PLOT_CONFUSION_MATRIX = False

if conf.PLOT_CONFUSION_MATRIX:
    if not hasattr(conf, 'PLOT_CONFUSION_MATRIX_OPTIONS'):
        print('WARNING: parameter PLOT_CONFUSION_MATRIX_OPTIONS not found in config file, putting to empty dict.')
        conf.PLOT_CONFUSION_MATRIX_OPTIONS = {}
    aeu.get_confusion_matrix(scores,labels, **conf.PLOT_CONFUSION_MATRIX_OPTIONS)

In [None]:
### investigate particular lumisections
# steered by INSPECT_MODE:
# - 'ls': plot and print the scores of one lumisection (INSPECT_RUN, INSPECT_LS)
# - 'run': plot and print the scores of all (masked) lumisections of INSPECT_RUN
# - None (or any other false value): skip this cell
# optional parameters that are missing from the config file
# are added to the conf module with a default value (and a warning is printed).

if not hasattr(conf, 'INSPECT_MODE'):
    print('WARNING: parameter INSPECT_MODE not found in config file, setting to None.')
    conf.INSPECT_MODE = None

if conf.INSPECT_MODE:

    # settings shared by both inspection modes
    if not hasattr(conf, 'INSPECT_RECO_MODE'):
        print('WARNING: parameter INSPECT_RECO_MODE not found in config file, setting to None.')
        conf.INSPECT_RECO_MODE = None

    if not hasattr(conf, 'INSPECT_REFERENCE_MASKS'):
        print('WARNING: parameter INSPECT_REFERENCE_MASKS not found in config file, setting to None.')
        conf.INSPECT_REFERENCE_MASKS = None
    if not hasattr(conf, 'INSPECT_REFERENCE_PARTITIONS'):
        print('WARNING: parameter INSPECT_REFERENCE_PARTITIONS not found in config file, setting to -1.')
        conf.INSPECT_REFERENCE_PARTITIONS = -1

    if( conf.INSPECT_REFERENCE_MASKS or conf.INSPECT_REFERENCE_PARTITIONS>0 ):
        # define reference histograms:
        # for each histogram type, hu.averagehists is applied to the histograms
        # selected by INSPECT_REFERENCE_MASKS, with INSPECT_REFERENCE_PARTITIONS as second argument
        refhists = {}
        for histname in histstruct.histnames:
            refhists[histname] = hu.averagehists( histstruct.get_histograms( histname=histname, masknames=conf.INSPECT_REFERENCE_MASKS ), 
                                                  conf.INSPECT_REFERENCE_PARTITIONS )
    else: refhists = None

if conf.INSPECT_MODE=='ls':

    if not hasattr(conf, 'INSPECT_RUN'):
        raise Exception('ERROR: parameter INSPECT_RUN not found in config file, please specify a run number to inspect.')
    if not hasattr(conf, 'INSPECT_LS'):
        raise Exception('ERROR: parameter INSPECT_LS not found in config file, please specify a lumisection number to inspect.')
    if not hasattr(conf, 'INSPECT_PLOT_SCORE'):
        print('WARNING: parameter INSPECT_PLOT_SCORE not found in config file, setting to False.')
        conf.INSPECT_PLOT_SCORE = False
    if conf.INSPECT_PLOT_SCORE:
        # INSPECT_MASKS is only needed for the score distribution plots below
        if not hasattr(conf, 'INSPECT_MASKS'):
            print('WARNING: parameter INSPECT_MASKS not found in config file, setting to None.')
            conf.INSPECT_MASKS = None

    # plot this particular run/ls
    _ = histstruct.plot_ls( conf.INSPECT_RUN, conf.INSPECT_LS, recohist=conf.INSPECT_RECO_MODE, refhists=refhists )
    msepoint = histstruct.get_scores_ls( conf.INSPECT_RUN, conf.INSPECT_LS )
    logprob = histstruct.get_globalscore_ls( conf.INSPECT_RUN, conf.INSPECT_LS )
    print('-------------')
    print('MSE values:')
    for histname in histstruct.histnames: print('{} : {}'.format(histname,msepoint[histname]))
    print('-------------')
    print('logprob: '+str(logprob))
    # plot mse distribution
    if conf.INSPECT_PLOT_SCORE:
        for histname in histstruct.histnames:
            mses = histstruct.get_scores( histname=histname, masknames=conf.INSPECT_MASKS )
            nmses = len(mses)
            # the score of this lumisection is appended nmses/15 times (rounded down) with label 1,
            # presumably so that it is visible next to the nmses entries with label 0
            nrepeat = nmses//15
            labels = np.concatenate((np.zeros(nmses),np.ones(nrepeat)))
            mses = np.concatenate((mses,np.ones(nrepeat)*msepoint[histname]))
            pu.plot_score_dist( mses, labels, nbins=200, normalize=False,
                        siglabel='this lumisection', bcklabel='all (masked) lumisections',
                        title=histname )


elif conf.INSPECT_MODE=='run':

    if not hasattr(conf, 'INSPECT_RUN'):
        raise Exception('ERROR: parameter INSPECT_RUN not found in config file, please specify a run number to inspect.')
    if not hasattr(conf, 'INSPECT_MASKS'):
        print('WARNING: parameter INSPECT_MASKS not found in config file, setting to None.')
        conf.INSPECT_MASKS = None

    # plot given run:
    # select the lumisection numbers (after masking) whose run number equals INSPECT_RUN
    # NOTE(review): the selection assumes get_runnbs and get_lsnbs return numpy arrays - to be confirmed
    runnbs = histstruct.get_runnbs( masknames=conf.INSPECT_MASKS )
    lsnbs = histstruct.get_lsnbs( masknames=conf.INSPECT_MASKS )
    runsel = np.where(runnbs==conf.INSPECT_RUN)
    lsnbs = lsnbs[runsel]
    print('plotting {} lumisections...'.format(len(lsnbs)))
    for lsnb in lsnbs:
        fig,ax = histstruct.plot_ls(conf.INSPECT_RUN, lsnb, recohist=conf.INSPECT_RECO_MODE, refhists=refhists )
        plt.show()
        msepoint = histstruct.get_scores_ls( conf.INSPECT_RUN, lsnb )
        logprob = histstruct.get_globalscore_ls( conf.INSPECT_RUN, lsnb )
        print('-------------')
        print('MSE values:')
        for histname in histstruct.histnames: print('{} : {}'.format(histname,msepoint[histname]))
        print('-------------')
        print('logprob: '+str(logprob))

# only complain about values that were actually set;
# a false value (e.g. the default None) means that no inspection was requested
elif conf.INSPECT_MODE:
    raise Exception('ERROR: value {} for parameter INSPECT_MODE not recognized,'.format(conf.INSPECT_MODE)
                   +' should be either "run" or "ls".')

In [None]:
### investigate how the method performs on the golden/custom test set

# help function
def get_runsls_inrange(logprob,runnbs,lsnbs,logprob_up=None,logprob_down=None):
    # get the lumisections with a log probability within a given range
    # input arguments:
    # - logprob, runnbs and lsnbs are equally long 1D arrays (or array-like objects, e.g. lists)
    # - logprob_up and logprob_down are upper and lower thresholds (both exclusive)
    #     if both are not None, the lumisections with logprob between the boundaries are returned
    #     if logprob_up is None, the lumisections with logprob > logprob_down are returned
    #     if logprob_down is None, the lumisections with logprob < logprob_up are returned
    # returns:
    # a dict with the following keys:
    # - 'indices': 1D array of the positions of the selected lumisections in the input arrays
    # - 'runslsinrange': list of tuples of (run number, ls number) as ints, in the same order
    # raises:
    # a ValueError if logprob_up and logprob_down are both None
    if logprob_up is None and logprob_down is None:
        raise ValueError('ERROR: logprob_up and logprob_down cannot both be None.')
    # convert to arrays so that elementwise comparison and index selection also work for lists
    logprob = np.asarray(logprob)
    runnbs = np.asarray(runnbs)
    lsnbs = np.asarray(lsnbs)
    if logprob_down is None:
        selection = logprob<logprob_up
    elif logprob_up is None:
        selection = logprob>logprob_down
    else:
        selection = (logprob>logprob_down) & (logprob<logprob_up)
    indices = np.nonzero(selection)[0]
    runslsinrange = [(int(runnb),int(lsnb)) for runnb,lsnb in zip(runnbs[indices],lsnbs[indices])]
    return {'indices':indices,'runslsinrange':runslsinrange}


# evaluation on the lumisections selected by EVAL_MASKS:
# count (and optionally plot) the lumisections with a global score
# between EVAL_SCORE_DOWN and EVAL_SCORE_UP.
# optional parameters that are missing from the config file
# are added to the conf module with a default value (and a warning is printed).
if not hasattr(conf, 'DO_EVAL'):
    print('WARNING: parameter DO_EVAL not found in config file, setting to False.')
    conf.DO_EVAL = False

if conf.DO_EVAL:

    if not hasattr(conf, 'EVAL_MASKS'):
        print('WARNING: parameter EVAL_MASKS not found in config file, setting to None.')
        conf.EVAL_MASKS = None
    if not hasattr(conf, 'EVAL_SCORE_UP'):
        print('WARNING: parameter EVAL_SCORE_UP not found in config file, setting to None.')
        conf.EVAL_SCORE_UP = None
    if not hasattr(conf, 'EVAL_SCORE_DOWN'):
        print('WARNING: parameter EVAL_SCORE_DOWN not found in config file, setting to None.')
        conf.EVAL_SCORE_DOWN = None

    # at least one boundary is required, and the range must not be empty
    if( conf.EVAL_SCORE_UP is None and conf.EVAL_SCORE_DOWN is None ):
        raise Exception('ERROR: parameters EVAL_SCORE_UP and EVAL_SCORE_DOWN cannot both be None.'
                         +' Please specify at least one of both.')
    if( conf.EVAL_SCORE_UP is not None and conf.EVAL_SCORE_DOWN is not None 
        and conf.EVAL_SCORE_UP <= conf.EVAL_SCORE_DOWN ):
        raise Exception('ERROR: parameter EVAL_SCORE_UP cannot be less than or equal to EVAL_SCORE_DOWN.'
                         +' Please specify valid values.')

    if not hasattr(conf, 'EVAL_NMAXPLOTS'):
        print('WARNING: parameter EVAL_NMAXPLOTS not found in config file, setting to 0.')
        conf.EVAL_NMAXPLOTS = 0

    # the plot settings below are only needed (and only read) if plots are requested
    if( conf.EVAL_NMAXPLOTS>0 ):

        if not hasattr(conf, 'EVAL_RECO_MODE'):
            print('WARNING: parameter EVAL_RECO_MODE not found in config file, setting to None.')
            conf.EVAL_RECO_MODE = None

        if not hasattr(conf, 'EVAL_REFERENCE_MASKS'):
            print('WARNING: parameter EVAL_REFERENCE_MASKS not found in config file, setting to None.')
            conf.EVAL_REFERENCE_MASKS = None
        if not hasattr(conf, 'EVAL_REFERENCE_PARTITIONS'):
            print('WARNING: parameter EVAL_REFERENCE_PARTITIONS not found in config file, setting to -1.')
            conf.EVAL_REFERENCE_PARTITIONS = -1

        if( conf.EVAL_REFERENCE_MASKS or conf.EVAL_REFERENCE_PARTITIONS>0 ):
            # define reference histograms:
            # for each histogram type, hu.averagehists is applied to the histograms
            # selected by EVAL_REFERENCE_MASKS, with EVAL_REFERENCE_PARTITIONS as second argument
            refhists = {}
            for histname in histstruct.histnames:
                refhists[histname] = hu.averagehists( histstruct.get_histograms( histname=histname, masknames=conf.EVAL_REFERENCE_MASKS ), 
                                                      conf.EVAL_REFERENCE_PARTITIONS )
        else: refhists = None

        if not hasattr(conf, 'EVAL_OUTFILENAME'):
            print('WARNING: parameter EVAL_OUTFILENAME not found in config file, setting to None (will not create an output file).')
            conf.EVAL_OUTFILENAME = None
        if conf.EVAL_OUTFILENAME:
            # force the .pdf extension on the output file name
            conf.EVAL_OUTFILENAME = os.path.splitext(conf.EVAL_OUTFILENAME)[0]+'.pdf'


    # retrieve run numbers, lumisection numbers and scores of the evaluation set
    lsnbs_eval = histstruct.get_lsnbs( masknames=conf.EVAL_MASKS )
    runnbs_eval = histstruct.get_runnbs( masknames=conf.EVAL_MASKS )
    # note: mse_eval_dict is not used further in this cell
    mse_eval_dict = histstruct.get_scores( masknames=conf.EVAL_MASKS )
    logprob_eval = histstruct.get_globalscores( masknames=conf.EVAL_MASKS )

    temp = get_runsls_inrange(logprob_eval, runnbs_eval, lsnbs_eval,
                              logprob_up = conf.EVAL_SCORE_UP, logprob_down = conf.EVAL_SCORE_DOWN)

    runslsinrange = temp['runslsinrange']
    print('{} out of {} LS are within these boundaries'.format(len(runslsinrange),len(logprob_eval)))

    # make plots
    if conf.EVAL_NMAXPLOTS > 0:
        # an output file is only created for a non-empty file name
        # (same condition as used above for forcing the .pdf extension)
        pdf = None
        if conf.EVAL_OUTFILENAME:
            from matplotlib.backends.backend_pdf import PdfPages
            pdf = PdfPages(conf.EVAL_OUTFILENAME)
        try:
            for i,(runnb,lsnb) in enumerate(runslsinrange):
                if i>=conf.EVAL_NMAXPLOTS:
                    print('WARNING: maximum number of plots reached.')
                    break
                print('------------------------')
                fig,axs = histstruct.plot_ls( runnb, lsnb, recohist=conf.EVAL_RECO_MODE, refhists=refhists)
                # save before showing, since the figure may be closed once it has been shown
                if pdf is not None: pdf.savefig(fig)
                plt.show()
        finally:
            # always close the output file, also if plotting fails halfway
            if pdf is not None: pdf.close()