**Run the fitting of histogram classifier output scores using a configuration file**

See example\_configuration\_for\_autoencoder.py for an example of a valid configuration file.  

Note: this is step 2 of 3; it is preceded by run\_training.ipynb and followed by run\_testing.ipynb.

Import the proper configuration file in the second cell of this notebook. The rest should be fully automatic, just run all cells sequentially. (Note: at a later stage, the plan is to convert this notebook to a regular .py script that can simply be run with the configuration file as command line argument.)

In [None]:
### imports

# external modules
# (each import is announced right before it is executed, so the last
#  printed line identifies the import that is in progress)
print('importing external modules...')
print('  import os'); import os
print('  import sys'); import sys
print('  import pandas as pd'); import pandas as pd
print('  import numpy as np'); import numpy as np
print('  import matplotlib.pyplot as plt'); import matplotlib.pyplot as plt
print('  import pickle'); import pickle
print('  import importlib'); import importlib

# local modules: utils
print('importing utils...')
sys.path.append('../utils')
print('  import csv_utils as csvu'); import csv_utils as csvu
print('  import json_utils as jsonu'); import json_utils as jsonu
print('  import dataframe_utils as dfu'); import dataframe_utils as dfu
print('  import hist_utils as hu'); import hist_utils as hu
print('  import autoencoder_utils as aeu'); import autoencoder_utils as aeu
print('  import plot_utils as pu'); import plot_utils as pu
print('  import generate_data_utils as gdu'); import generate_data_utils as gdu
print('  import refruns_utils as rru'); import refruns_utils as rru
print('  import mask_utils as mu'); import mask_utils as mu
print('refreshing utils...')
importlib.reload(csvu)
importlib.reload(jsonu)
importlib.reload(dfu)
importlib.reload(hu)
importlib.reload(aeu)
importlib.reload(pu)
importlib.reload(gdu)
importlib.reload(rru)
importlib.reload(mu)

# local modules: src
print('importing src...')
sys.path.append('../src')
sys.path.append('../src/classifiers')
sys.path.append('../src/cloudfitters')
print('  import HistStruct'); import HistStruct
print('  import HistogramClassifier'); import HistogramClassifier
print('  import AutoEncoder'); import AutoEncoder
print('  import SeminormalFitter'); import SeminormalFitter
print('  import GaussianKdeFitter'); import GaussianKdeFitter
print('  import HyperRectangleFitter'); import HyperRectangleFitter
print('refreshing src...')
importlib.reload(HistStruct)
importlib.reload(HistogramClassifier)
importlib.reload(AutoEncoder)
importlib.reload(SeminormalFitter)
importlib.reload(GaussianKdeFitter)
importlib.reload(HyperRectangleFitter)

print('done')

In [None]:
### import the configuration file

print('--- importing configuration file... ---\n')
import example_configuration_for_autoencoder_cfg as conf
# reload so that edits to the configuration file are picked up when this cell is re-run
importlib.reload(conf)
print('\n--- done importing configuration file ---\n')

# load the histstruct
# (hasattr only reacts to a missing attribute, whereas a bare except
#  would also swallow unrelated errors such as a KeyboardInterrupt)
if not hasattr(conf, 'HISTSTRUCT_FILE_NAME'):
    raise Exception('ERROR: parameter HISTSTRUCT_FILE_NAME not found in the config file.'
                   +' Please set this parameter with the path to the input HistStruct')
# the input file is the configured name with its extension replaced by '_evaluated.zip'
histstruct = HistStruct.HistStruct.load( os.path.splitext(conf.HISTSTRUCT_FILE_NAME)[0]+'_evaluated.zip', verbose=True )

# other initializations:
# both test mask collections are mandatory, their lengths give the number of test sets
if not hasattr(conf, 'TEST_GOOD_MASKS'):
    raise Exception('ERROR: variable TEST_GOOD_MASKS not defined in the config file.'
                   +' You must define a double list of mask names for the good test set(s) with this name.')
ngoodsets = len(conf.TEST_GOOD_MASKS)
if not hasattr(conf, 'TEST_BAD_MASKS'):
    raise Exception('ERROR: variable TEST_BAD_MASKS not defined in the config file.'
                   +' You must define a double list of mask names for the bad test set(s) with this name.')
nbadsets = len(conf.TEST_BAD_MASKS)

In [None]:
### get the scores for the training set

# fail with a clear message if the training masks are not configured
# (same kind of check as for the other mandatory configuration parameters)
if not hasattr(conf, 'TRAINING_MASKS'):
    raise Exception('ERROR: parameter TRAINING_MASKS not found in the config file.'
                   +' Please set this parameter with the mask names that select the training set.')

# one entry per histogram type, restricted to the training masks
mse_train = [ histstruct.get_scores( histname=histname, masknames=conf.TRAINING_MASKS )
              for histname in histstruct.histnames ]

# transform to array with correct shape:
# np.array puts the histogram types along the first axis, the transpose reverses the axes
# (presumably giving one row per lumisection and one column per histogram type - to be confirmed)
mse_train = np.transpose(np.array(mse_train))
print('found mse array for training set of following shape: {}'.format(mse_train.shape))

In [None]:
### plot the multidimensional score and fit a distribution

# read the plotting switch (optional, defaults to False)
if not hasattr(conf, 'CLOUDFITTER_PLOT_TRAINING'):
    print('WARNING: parameter CLOUDFITTER_PLOT_TRAINING not found in config file, setting to False.')
    conf.CLOUDFITTER_PLOT_TRAINING = False

# validate the fitter settings BEFORE their first use:
# they are needed both for the optional 2D plots and for the full fit at the end of this cell
if not hasattr(conf, 'CLOUDFITTER_TYPE'):
    raise Exception('ERROR: parameter CLOUDFITTER_TYPE not found in config file.'
                   +' Please provide a valid cloud fitter class.')
if not hasattr(conf, 'CLOUDFITTER_OPTIONS'):
    print('WARNING: parameter CLOUDFITTER_OPTIONS not found in config file, setting to empty dict.')
    conf.CLOUDFITTER_OPTIONS = {}

if conf.CLOUDFITTER_PLOT_TRAINING:
    # all index pairs (i,j) with i<j over the histogram types, i.e. all 2D projections
    nhisttypes = len(histstruct.histnames)
    dimslist = [ (i,j) for i in range(nhisttypes-1) for j in range(i+1,nhisttypes) ]
    # partial fits, in the same order as dimslist (both lists are re-used for the test plots)
    fitfunclist = []

    plt.close('all')
    for dims in dimslist:
        # select the two columns of this projection and fit them separately
        thismse = mse_train[:,dims]
        partialfitfunc = conf.CLOUDFITTER_TYPE( thismse, **conf.CLOUDFITTER_OPTIONS )
        pu.plot_fit_2d(thismse, fitfunc=partialfitfunc, logprob=True, clipprob=True,
                    onlycontour=False, xlims=30, ylims=30, 
                    onlypositive=True, transparency=0.5,
                    xaxtitle=histstruct.histnames[dims[0]], 
                    yaxtitle=histstruct.histnames[dims[1]],
                    title='density fit of lumisection MSE')
        fitfunclist.append(partialfitfunc)

# the actual fit, using all histogram types at once
fitfunc = conf.CLOUDFITTER_TYPE( mse_train, **conf.CLOUDFITTER_OPTIONS )

In [None]:
### evaluate the fit function on the (non-extended) histstruct

# scores of every histogram type, requested without any mask selection
scores_per_type = [ histstruct.get_scores( histname=name ) for name in histstruct.histnames ]

# stack into one array and reverse the axes to get the correct shape
mse_all = np.transpose(np.array(scores_per_type))
print('found mse array for total set of following shape: {}'.format(mse_all.shape))

# the log of the fitted density is stored in the histstruct as global score
logprob_all = np.log(fitfunc.pdf(mse_all))
histstruct.add_globalscores( logprob_all )

In [None]:
### get the scores for the non-extended good and bad test sets

# one entry of global scores per good test set, in the order of conf.TEST_GOOD_MASKS
logprob_good_parts = [ histstruct.get_globalscores( masknames=goodmasks )
                       for goodmasks in conf.TEST_GOOD_MASKS ]
# same for the bad test sets, in the order of conf.TEST_BAD_MASKS
logprob_bad_parts = [ histstruct.get_globalscores( masknames=badmasks )
                      for badmasks in conf.TEST_BAD_MASKS ]

In [None]:
### get the mse for the non-extended good and bad test sets
# (not needed for scores but for plotting later on)

# good test sets: one array per set, built from the scores of all histogram types
mse_good = []
for goodmasks in conf.TEST_GOOD_MASKS:
    setscores = [ histstruct.get_scores( masknames=goodmasks, histname=name )
                  for name in histstruct.histnames ]
    # stack and reverse the axes, same layout as for the training set
    setscores = np.transpose(np.array(setscores))
    mse_good.append(setscores)
    print('found mse array for good set of following shape: {}'.format(setscores.shape))

# bad test sets: identical procedure
mse_bad = []
for badmasks in conf.TEST_BAD_MASKS:
    setscores = [ histstruct.get_scores( masknames=badmasks, histname=name )
                  for name in histstruct.histnames ]
    setscores = np.transpose(np.array(setscores))
    mse_bad.append(setscores)
    print('found mse array for bad set of following shape: {}'.format(setscores.shape))

In [None]:
### evaluate the fit function on the extended good and bad test sets

# both switches are optional and default to False
# (hasattr only reacts to a missing attribute, unlike a bare except)
if not hasattr(conf, 'EXTEND_TEST_GOOD'):
    print('WARNING: parameter EXTEND_TEST_GOOD not found in config file, setting to False.')
    conf.EXTEND_TEST_GOOD = False

if conf.EXTEND_TEST_GOOD:

    mse_good_ext = []
    logprob_good_ext_parts = []
    for goodmasks in conf.TEST_GOOD_MASKS:
        # identifier passed to the histstruct for the extended scores of this set
        # (computed once per set instead of once per histogram type)
        extname = mu.get_combined_name(goodmasks)
        thismse = [ histstruct.get_extscores( extname, histname=histname )
                    for histname in histstruct.histnames ]
        # stack and reverse the axes to get the correct shape
        thismse = np.transpose(np.array(thismse))
        mse_good_ext.append(thismse)
        print('found mse array for good set of following shape: {}'.format(thismse.shape))
        # log of the fitted density, kept locally and stored in the histstruct
        gscores = np.log(fitfunc.pdf(thismse))
        logprob_good_ext_parts.append(gscores)
        histstruct.add_extglobalscores( extname, gscores )

if not hasattr(conf, 'EXTEND_TEST_BAD'):
    print('WARNING: parameter EXTEND_TEST_BAD not found in config file, setting to False.')
    conf.EXTEND_TEST_BAD = False

if conf.EXTEND_TEST_BAD:

    mse_bad_ext = []
    logprob_bad_ext_parts = []
    for badmasks in conf.TEST_BAD_MASKS:
        # identical procedure as for the good test sets
        extname = mu.get_combined_name(badmasks)
        thismse = [ histstruct.get_extscores( extname, histname=histname )
                    for histname in histstruct.histnames ]
        thismse = np.transpose(np.array(thismse))
        mse_bad_ext.append(thismse)
        print('found mse array for bad set of following shape: {}'.format(thismse.shape))
        gscores = np.log(fitfunc.pdf(thismse))
        logprob_bad_ext_parts.append(gscores)
        histstruct.add_extglobalscores( extname, gscores )

In [None]:
### redefine evaluation set

# use the extended sets where they were requested, the plain test sets otherwise
mse_good_eval = mse_good_ext if conf.EXTEND_TEST_GOOD else mse_good
logprob_good_eval_parts = logprob_good_ext_parts if conf.EXTEND_TEST_GOOD else logprob_good_parts
mse_bad_eval = mse_bad_ext if conf.EXTEND_TEST_BAD else mse_bad
logprob_bad_eval_parts = logprob_bad_ext_parts if conf.EXTEND_TEST_BAD else logprob_bad_parts

In [None]:
### make a new plot of probability contours and overlay data points
### (only 2D projections!)  

# optional switch, defaults to False
# (hasattr only reacts to a missing attribute, unlike a bare except)
if not hasattr(conf, 'CLOUDFITTER_PLOT_TEST'):
    print('WARNING: parameter CLOUDFITTER_PLOT_TEST not found in config file, setting to False.')
    conf.CLOUDFITTER_PLOT_TEST = False

if conf.CLOUDFITTER_PLOT_TEST:

    # dimslist and fitfunclist are only created when the training plots are made,
    # so the test plots cannot be made without them
    if not conf.CLOUDFITTER_PLOT_TRAINING:
        raise Exception('ERROR: parameter CLOUDFITTER_PLOT_TEST is True while CLOUDFITTER_PLOT_TRAINING is False,'
                       +' this combination is not allowed.')

    plt.close('all')
    badcolorlist = ['red','lightcoral','firebrick','chocolate','fuchsia','orange','purple']
    goodcolorlist = ['blue']
    # fall back to one single color per category if there are more test sets than colors
    # (the replacement must be assigned to the same list that is indexed below)
    if len(badcolorlist)<len(mse_bad_eval):
        print('WARNING: too many bad test sets for available colors, putting all to red')
        badcolorlist = ['red']*len(mse_bad_eval)
    if len(goodcolorlist)<len(mse_good_eval):
        print('WARNING: too many good test sets for available colors, putting all to blue')
        goodcolorlist = ['blue']*len(mse_good_eval)

    for dims,partialfitfunc in zip(dimslist,fitfunclist):
        # contours of the partial fit for this 2D projection
        fig,ax = pu.plot_fit_2d(mse_train[:,dims], fitfunc=partialfitfunc, logprob=True, clipprob=True, 
                    onlycontour=True, xlims=60, ylims=60, 
                    onlypositive=True, transparency=0.5,
                    xaxtitle=histstruct.histnames[dims[0]], 
                    yaxtitle=histstruct.histnames[dims[1]],
                    title='density fit of lumisection MSE')
        # overlay the test points: first the bad sets, then the good sets
        for badset,color in zip(mse_bad_eval,badcolorlist):
            ax.plot(badset[:,dims[0]],badset[:,dims[1]],
                    '.',color=color,markersize=4)
        for goodset,color in zip(mse_good_eval,goodcolorlist):
            ax.plot(goodset[:,dims[0]],goodset[:,dims[1]],
                    '.',color=color,markersize=4)
    

# summary of the log probabilities, merged over all sets of each category
print('--- good lumisections ---')
logprob_good_eval = np.concatenate(tuple(logprob_good_eval_parts))
print('length of log prob array: '+str(len(logprob_good_eval)))
print('minimum of log prob: '+str(np.min(logprob_good_eval)))

print('--- bad lumisections ---')
logprob_bad_eval = np.concatenate(tuple(logprob_bad_eval_parts))
print('length of log prob array: '+str(len(logprob_bad_eval)))
print('maximum of log prob: '+str(np.max(logprob_bad_eval)))

In [None]:
### save the fully extended and evaluated histstruct for further testing

# output name: configured histstruct file name with its extension replaced by '_fitted.zip'
stem = os.path.splitext(conf.HISTSTRUCT_FILE_NAME)[0]
savename = stem+'_fitted.zip'
histstruct.save( savename )