**Run the training of histogram classifiers using a configuration file**

See example\_configuration\_for\_autoencoder.py for an example of a valid configuration file. 

Note: this is step 1/3, followed by run\_fitting.ipynb

Import the proper configuration file in the second cell of this notebook. The rest should be fully automatic, just run all cells sequentially. (Note: at a later stage, the plan is to convert this notebook to a regular .py script that can simply be run with the configuration file as command line argument.)

In [None]:
### imports

# external modules
# (the name of each module is printed right before the module is imported)
print('importing external modules...')
print('  import os'); import os
print('  import sys'); import sys
print('  import pandas as pd'); import pandas as pd
print('  import numpy as np'); import numpy as np
print('  import matplotlib.pyplot as plt'); import matplotlib.pyplot as plt
print('  import pickle'); import pickle
print('  import importlib'); import importlib

# local modules: utils
print('importing utils...')
sys.path.append('../utils')
print('  import csv_utils as csvu'); import csv_utils as csvu
print('  import json_utils as jsonu'); import json_utils as jsonu
print('  import dataframe_utils as dfu'); import dataframe_utils as dfu
print('  import hist_utils as hu'); import hist_utils as hu
print('  import autoencoder_utils as aeu'); import autoencoder_utils as aeu
print('  import plot_utils as pu'); import plot_utils as pu
print('  import generate_data_utils as gdu'); import generate_data_utils as gdu
print('  import refruns_utils as rru'); import refruns_utils as rru
print('  import mask_utils as mu'); import mask_utils as mu
print('refreshing utils...')
importlib.reload(csvu)
importlib.reload(jsonu)
importlib.reload(dfu)
importlib.reload(hu)
importlib.reload(aeu)
importlib.reload(pu)
importlib.reload(gdu)
importlib.reload(rru)
importlib.reload(mu)

# local modules: src
print('importing src...')
sys.path.append('../src')
sys.path.append('../src/classifiers')
sys.path.append('../src/cloudfitters')
print('  import HistStruct'); import HistStruct
print('  import HistogramClassifier'); import HistogramClassifier
print('  import AutoEncoder'); import AutoEncoder
print('  import SeminormalFitter'); import SeminormalFitter
print('  import GaussianKdeFitter'); import GaussianKdeFitter
print('  import HyperRectangleFitter'); import HyperRectangleFitter
print('refreshing src...')
importlib.reload(HistStruct)
importlib.reload(HistogramClassifier)
importlib.reload(AutoEncoder)
importlib.reload(SeminormalFitter)
importlib.reload(GaussianKdeFitter)
importlib.reload(HyperRectangleFitter)

print('done')

In [None]:
### import the configuration file

# import the configuration module under the name 'conf' and reload it,
# so that a re-run of this cell re-executes the configuration file
print('--- importing configuration file... ---\n')
import example_configuration_for_autoencoder_cfg as conf
importlib.reload(conf)
print('\n--- done importing configuration file ---\n')

# load the histstruct
# (mandatory parameter; the file that is read is HISTSTRUCT_FILE_NAME
#  with its extension replaced by '_configured.zip')
if not hasattr(conf, 'HISTSTRUCT_FILE_NAME'):
    raise Exception('ERROR: parameter HISTSTRUCT_FILE_NAME not found in the config file.'
                   +' Please set this parameter with the path to the input HistStruct')
histstruct = HistStruct.HistStruct.load( os.path.splitext(conf.HISTSTRUCT_FILE_NAME)[0]+'_configured.zip', verbose=True )

# other initializations
# (both mask lists are mandatory; their lengths are the number of good and bad test sets)
if not hasattr(conf, 'TEST_GOOD_MASKS'):
    raise Exception('ERROR: variable TEST_GOOD_MASKS not defined in the config file.'
                   +' You must define a double list of mask names for the good test set(s) with this name.')
ngoodsets = len(conf.TEST_GOOD_MASKS)
if not hasattr(conf, 'TEST_BAD_MASKS'):
    raise Exception('ERROR: variable TEST_BAD_MASKS not defined in the config file.'
                   +' You must define a double list of mask names for the bad test set(s) with this name.')
nbadsets = len(conf.TEST_BAD_MASKS)

In [None]:
### plot the training and/or test sets

# DO_INITIAL_PLOT is optional: when absent, a warning is printed and no plots are made
if not hasattr(conf, 'DO_INITIAL_PLOT'):
    print('WARNING: parameter DO_INITIAL_PLOT not defined in the config file, setting to False.')
    conf.DO_INITIAL_PLOT = False

if conf.DO_INITIAL_PLOT:

    print('making plot(s) of input histograms according to provided options...')

    # INITIAL_PLOT_SETTINGS is mandatory once plotting is requested
    if not hasattr(conf, 'INITIAL_PLOT_SETTINGS'):
        raise Exception('ERROR: parameter INITIAL_PLOT_SETTINGS not defined in the config file.'
                       +' Please provide valid options or set DO_INITIAL_PLOT to False.')

    # each element is unpacked as keyword arguments for one plot_histograms call
    for options in conf.INITIAL_PLOT_SETTINGS:
        print('  - making plot with following options: {}'.format(options))
        histstruct.plot_histograms( **options )

In [None]:
### extend the training set using artificial data

# EXTEND_TRAINING is optional: when absent, a warning is printed and the training set is not extended
if not hasattr(conf, 'EXTEND_TRAINING'):
    print('WARNING: parameter EXTEND_TRAINING not defined in config file, setting to False...')
    conf.EXTEND_TRAINING = False

if conf.EXTEND_TRAINING:

    print('extending training set...')

    if not hasattr(conf, 'EXTEND_TRAINING_PARTITIONS'):
        print('WARNING: parameter EXTEND_TRAINING_PARTITIONS not found in config file, setting to -1.')
        conf.EXTEND_TRAINING_PARTITIONS = -1
    if not hasattr(conf, 'EXTEND_TRAINING_FUNCTION'):
        print('WARNING: parameter EXTEND_TRAINING_FUNCTION not found in config file,'
                       +' setting it to None (will not extend this histogram set).')
        # apply the default announced in the warning,
        # otherwise the check in the loop below fails on the missing attribute
        conf.EXTEND_TRAINING_FUNCTION = None
    # the options are only needed when a resampling function is provided;
    # check them up front so a missing parameter gives a clear message before any data is generated
    if conf.EXTEND_TRAINING_FUNCTION is not None and not hasattr(conf, 'EXTEND_TRAINING_OPTIONS'):
        raise Exception('ERROR: parameter EXTEND_TRAINING_OPTIONS not found in config file.'
                       +' Please provide valid options to the resampling functions.')

    for histname in histstruct.histnames:
        print('generating artificial training data for '+histname)
        # start from the averaged training histograms ...
        trainhists = hu.averagehists( histstruct.get_histograms( histname=histname, masknames=conf.TRAINING_MASKS ),
                                      conf.EXTEND_TRAINING_PARTITIONS )
        # ... and pass them through the configured resampling function, if any
        if conf.EXTEND_TRAINING_FUNCTION is not None:
            trainhists = conf.EXTEND_TRAINING_FUNCTION( trainhists, **conf.EXTEND_TRAINING_OPTIONS )
        # store the result under the combined name of the training masks
        histstruct.add_exthistograms( mu.get_combined_name(conf.TRAINING_MASKS), histname, trainhists )
        print(' -> generated {} histograms'.format(len(trainhists)))

In [None]:
### define and train an autoencoder for each element

# DO_TRAINING is optional: when absent, a warning is printed and training is performed
if not hasattr(conf, 'DO_TRAINING'):
    print('WARNING: parameter DO_TRAINING not found in config file, setting to True.')
    conf.DO_TRAINING = True

if conf.DO_TRAINING:

    print('training a classifier for each histogram type...')

    if not hasattr(conf, 'TRAINING_OPTIONS'):
        print('WARNING: parameter TRAINING_OPTIONS not found in config file, setting to empty dict')
        conf.TRAINING_OPTIONS = {}

    # check the model saving parameters before any training is done,
    # so that a missing parameter does not surface only after the first classifier has been trained
    if not hasattr(conf, 'SAVE_MODELS'):
        print('WARNING: parameter SAVE_MODELS not found in config file, setting to False.')
        conf.SAVE_MODELS = False
    if conf.SAVE_MODELS:
        for parname in ('SAVE_MODELS_BASENAME', 'SAVE_MODELS_DIR'):
            if not hasattr(conf, parname):
                raise Exception('ERROR: parameter {} not found in config file.'.format(parname)
                               +' Please set this parameter or set SAVE_MODELS to False.')

    # EXTEND_TRAINING is normally defined (or defaulted) by the cell that extends the training set;
    # fall back to False so this cell does not fail on a missing attribute
    use_extended = getattr(conf, 'EXTEND_TRAINING', False)

    for histname in histstruct.histnames:
        # choose training set
        # (the extended histograms, stored under the combined name of the training masks,
        #  replace the original ones when the training set was extended)
        hists = histstruct.get_histograms( histname=histname, masknames=conf.TRAINING_MASKS )
        if use_extended: hists = histstruct.exthistograms[ mu.get_combined_name(conf.TRAINING_MASKS) ][histname]
        print('training a classifier for {}'.format(histname))
        print('size of training set: {}'.format(hists.shape))
        histstruct.classifiers[histname].train( hists, **conf.TRAINING_OPTIONS )
        # choose whether to save the model
        # (the output path is SAVE_MODELS_DIR/SAVE_MODELS_BASENAME_<histname>)
        if conf.SAVE_MODELS:
            savename = conf.SAVE_MODELS_BASENAME+'_'+histname
            savename = os.path.join(conf.SAVE_MODELS_DIR,savename)
            print('saving the trained model to {}'.format(savename))
            histstruct.classifiers[histname].save( savename )
        print('---------------------------\n')

In [None]:
### evaluate the models on all histograms in the (non-extended) histstruct

# parameters TEST_GOOD_PARTITIONS and TEST_BAD_PARTITIONS not used anymore

# the former checks on these parameters are kept as plain text in the string 'deprecated';
# the string is only assigned here and is not executed anywhere in the visible part of this notebook.
# NOTE(review): if this snippet is ever revived, len(ngoodsets) and len(nbadsets) need fixing,
# since ngoodsets and nbadsets are themselves integers (set from len(...) in the configuration cell).
deprecated = '''# do some checks for good test sets
try: conf.TEST_GOOD_PARTITIONS
except: 
    print('WARNING: parameter TEST_GOOD_PARTITIONS not found in config file, setting to -1.')
    conf.TEST_GOOD_PARTITIONS = [-1]*len(ngoodsets)
if len(conf.TEST_GOOD_PARTITIONS)!=ngoodsets:
    raise Exception('ERROR: found incompatible lengths of TEST_GOOD_PARTITIONS and TEST_GOOD_MASKS')

# do some check for bad test sets
try: conf.TEST_BAD_PARTITIONS
except: 
    print('WARNING: parameter TEST_BAD_PARTITIONS not found in config file, setting to -1.')
    conf.TEST_BAD_PARTITIONS = [-1]*len(nbadsets)
if len(conf.TEST_BAD_PARTITIONS)!=nbadsets:
    raise Exception('ERROR: found incompatible lengths of TEST_BAD_PARTITIONS and TEST_BAD_MASKS')'''
    
# call evaluate_classifier once per histogram type
# (no extname is given here, presumably selecting the non-extended histograms as stated in the cell title)
print('evaluating the classifiers on all histograms...') 
for histname in histstruct.histnames:
    print('evaluating model for '+histname)
    histstruct.evaluate_classifier( histname )

In [None]:
### extend the test set using artificial data generation and evaluate the model on the extended test set

# EXTEND_TEST_GOOD is optional: when absent, a warning is printed and the good test sets are not extended
if not hasattr(conf, 'EXTEND_TEST_GOOD'):
    print('WARNING: parameter EXTEND_TEST_GOOD not found in config file, setting to False.')
    conf.EXTEND_TEST_GOOD = False

if conf.EXTEND_TEST_GOOD:

    # one partitions value is needed per good test set
    if not hasattr(conf, 'EXTEND_TEST_GOOD_PARTITIONS'):
        print('WARNING: parameter EXTEND_TEST_GOOD_PARTITIONS not found in config file, setting to -1.')
        conf.EXTEND_TEST_GOOD_PARTITIONS = [-1]*ngoodsets
    if len(conf.EXTEND_TEST_GOOD_PARTITIONS)!=ngoodsets:
        raise Exception('ERROR: found incompatible lengths of EXTEND_TEST_GOOD_PARTITIONS and TEST_GOOD_MASKS')
    if not hasattr(conf, 'EXTEND_TEST_GOOD_FUNCTION'):
        print('WARNING: parameter EXTEND_TEST_GOOD_FUNCTION not found in config file,'
                       +' setting it to None (will not extend this histogram set).')
        conf.EXTEND_TEST_GOOD_FUNCTION = None
    # the options are only needed when a resampling function is provided;
    # this does not depend on the loop variables, so check it once before any data is generated
    if conf.EXTEND_TEST_GOOD_FUNCTION is not None and not hasattr(conf, 'EXTEND_TEST_GOOD_OPTIONS'):
        raise Exception('ERROR: parameter EXTEND_TEST_GOOD_OPTIONS not found in config file.'
                       +' Please provide valid options to the resampling functions.')

    for histname in histstruct.histnames:
        print('generating good data for '+histname)
        for i in range(ngoodsets):
            print('  generating set {} of {}...'.format(i+1,ngoodsets))
            # start from the averaged histograms of this test set ...
            goodhists = hu.averagehists( histstruct.get_histograms( histname=histname, masknames=conf.TEST_GOOD_MASKS[i] ),
                                         conf.EXTEND_TEST_GOOD_PARTITIONS[i] )
            # ... and pass them through the configured resampling function, if any
            if conf.EXTEND_TEST_GOOD_FUNCTION is not None:
                goodhists = conf.EXTEND_TEST_GOOD_FUNCTION( goodhists, **conf.EXTEND_TEST_GOOD_OPTIONS)
            # the extended set is stored and evaluated under the combined name of its masks
            extname = mu.get_combined_name(conf.TEST_GOOD_MASKS[i])
            histstruct.add_exthistograms( extname, histname, goodhists )
            print('  -> generated {} histograms'.format(len(goodhists)))
            print('  evaluating the classifier on this set...')
            histstruct.evaluate_classifier( histname, extname=extname )

        
# same procedure as for the good test sets, now for the bad test sets
# EXTEND_TEST_BAD is optional: when absent, a warning is printed and the bad test sets are not extended
if not hasattr(conf, 'EXTEND_TEST_BAD'):
    print('WARNING: parameter EXTEND_TEST_BAD not found in config file, setting to False.')
    conf.EXTEND_TEST_BAD = False

if conf.EXTEND_TEST_BAD:

    # one partitions value is needed per bad test set
    if not hasattr(conf, 'EXTEND_TEST_BAD_PARTITIONS'):
        print('WARNING: parameter EXTEND_TEST_BAD_PARTITIONS not found in config file, setting to -1.')
        conf.EXTEND_TEST_BAD_PARTITIONS = [-1]*nbadsets
    if len(conf.EXTEND_TEST_BAD_PARTITIONS)!=nbadsets:
        raise Exception('ERROR: found incompatible lengths of EXTEND_TEST_BAD_PARTITIONS and TEST_BAD_MASKS')
    if not hasattr(conf, 'EXTEND_TEST_BAD_FUNCTION'):
        print('WARNING: parameter EXTEND_TEST_BAD_FUNCTION not found in config file,'
                       +' setting it to None (will not extend this histogram set).')
        conf.EXTEND_TEST_BAD_FUNCTION = None
    # the options are only needed when a resampling function is provided;
    # this does not depend on the loop variables, so check it once before any data is generated
    if conf.EXTEND_TEST_BAD_FUNCTION is not None and not hasattr(conf, 'EXTEND_TEST_BAD_OPTIONS'):
        raise Exception('ERROR: parameter EXTEND_TEST_BAD_OPTIONS not found in config file.'
                       +' Please provide valid options to the resampling functions.')

    for histname in histstruct.histnames:
        print('generating bad data for '+histname)
        for i in range(nbadsets):
            print('  generating set {} of {}...'.format(i+1,nbadsets))
            # start from the averaged histograms of this test set ...
            badhists = hu.averagehists( histstruct.get_histograms( histname=histname, masknames=conf.TEST_BAD_MASKS[i] ),
                                        conf.EXTEND_TEST_BAD_PARTITIONS[i] )
            # ... and pass them through the configured resampling function, if any
            if conf.EXTEND_TEST_BAD_FUNCTION is not None:
                badhists = conf.EXTEND_TEST_BAD_FUNCTION( badhists, **conf.EXTEND_TEST_BAD_OPTIONS)
            # the extended set is stored and evaluated under the combined name of its masks
            extname = mu.get_combined_name(conf.TEST_BAD_MASKS[i])
            histstruct.add_exthistograms( extname, histname, badhists )
            print('  -> generated {} histograms'.format(len(badhists)))
            print('  evaluating the classifier on this set...')
            histstruct.evaluate_classifier( histname, extname=extname )

In [None]:
### save the fully extended and evaluated histstruct for further processing

# the output path is HISTSTRUCT_FILE_NAME with its extension replaced by '_evaluated.zip'
savename = '{}_evaluated.zip'.format( os.path.splitext(conf.HISTSTRUCT_FILE_NAME)[0] )
histstruct.save( savename )