**Train and test an NMF model for a particular type of histogram**

Notes:  
- This script is intended to be used with 1D histograms. (For the extension to 2D histograms, see nmf_2d.ipynb!)  
- We only consider one type of histogram at a time. (For the extension to multiple histograms, see nmf_combine.ipynb!)  
- In this case the use of the custom class HistStruct is not necessary and one can work with the dataframe and numpy arrays directly. However, we use it here as a proof-of-principle.

In [None]:
### imports

# external modules
import sys
import numpy as np
#import matplotlib.pyplot as plt
import importlib

# local modules
sys.path.append('../utils')
import csv_utils as csvu
import dataframe_utils as dfu
import json_utils as jsonu
import hist_utils as hu
import autoencoder_utils as aeu
import plot_utils as pu
import generate_data_utils as gdu
importlib.reload(csvu)
importlib.reload(dfu)
importlib.reload(jsonu)
importlib.reload(hu)
importlib.reload(aeu)
importlib.reload(pu)
importlib.reload(gdu)
sys.path.append('../src')
sys.path.append('../src/classifiers')
import HistStruct
importlib.reload(HistStruct)
import NMFClassifier
importlib.reload(NMFClassifier)

In [None]:
### define run properties
# This cell holds the main settings of the run: the lists of reference (good)
# and clearly bad runs, the year, the histogram type and the training mode.

# good 'reference' runs (found by eye), each mapped to the lumisection list [[-1]];
# should be replaced at some point by the reference runs defined by the DQM/DC team.
goodrunsls = {
    '2017': {
        run: [[-1]]
        for run in (
            "297056",
            "297177",
            "301449",
        )
    }
}

# core test set of clearly bad runs (found by eye), each mapped to [[-1]]
badrunsls = {
    '2017': {
        run: [[-1]]
        for run in (
            "297287",
            "297288",
            # "297289" (currently disabled)
            "299316",
            "299324",
            # "299326" (currently disabled)
            # "301086" with lumisections [[88,126]] (currently disabled):
            # only bad for size_PXDisk_+1 -> maybe do not use for now
            # (unclear what are real anomalies)
        )
    }
}

# year of data taking to use (key into the run dictionaries above)
year = '2017'

# name of the histogram type to use
histname = 'chargeInner_PXLayer_2'

# training mode: either 'global' or 'local'
training_mode = 'local'

# Define which runs/lumisections are used for training and for the good and
# bad test sets, depending on the chosen training mode.
if training_mode == 'global':
    runsls_training = None # use none to not add a mask for training (can e.g. use DCS-bit on mask)
    runsls_good = None # use none to not add a mask for good runs (can e.g. use templates)
    runsls_bad = badrunsls[year] # predefined bad runs
    print('selected runs/lumisections for training: all')
    
elif training_mode == 'local':
    # train locally on a small set of runs
    # for now on n runs preceding a chosen application run,
    # to be extended with choosing reference runs.
    
    # select application run
    # (list.index raises a ValueError if the application run is not among the available runs)
    available_runs = dfu.get_runs( dfu.select_dcson( csvu.read_csv('../data/DF'+year+'_'+histname+'.csv') ) )
    run_application = 305351
    run_application_index = available_runs.index(run_application)
    # select training set: the (at most) ntraining runs directly preceding the application run.
    # The start index is clamped at 0: a negative start would be interpreted as counting
    # from the end of the list and silently select a wrong (or empty) set of runs.
    ntraining = 5
    training_start_index = max(0, run_application_index-ntraining)
    training_runs = available_runs[training_start_index:run_application_index]
    if len(training_runs) < ntraining:
        print('WARNING: only {} run(s) available before application run {}'.format(len(training_runs), run_application)
              +' (requested {}).'.format(ntraining))
    runsls_training = jsonu.tuplelist_to_jsondict([(el,[-1]) for el in training_runs])
    runsls_bad = badrunsls[year]
    runsls_good = jsonu.tuplelist_to_jsondict([(run_application,[-1])])
    print('selected runs/lumisections for training: ')
    print(runsls_training)
    print('selected runs/lumisections as good test set:')
    print(runsls_good)
    print('selected runs/lumisections as bad test set:')
    print(runsls_bad)

else:
    # fail early instead of leaving the runsls_* variables undefined
    raise ValueError("unrecognized training_mode '{}' (expected 'global' or 'local')".format(training_mode))

In [None]:
### read the data based on the configuration defined above

readnew = True # if True, build a new HistStruct from the csv file
save = False # if True, save the newly built HistStruct to hsfilename
hsfilename = 'test.pkl'

if readnew:

    # create an empty HistStruct and fill it with the histograms of the chosen type
    histstruct = HistStruct.HistStruct()
    print('adding {}...'.format(histname))
    filename = '../data/DF'+year+'_'+histname+'.csv'
    df = csvu.read_csv( filename )
    # if all three run selections are defined (local training),
    # only the histograms belonging to those runs need to be kept
    selections = (runsls_training, runsls_good, runsls_bad)
    if all( selection is not None for selection in selections ):
        runsls_total = {}
        for selection in selections:
            # for a run present in several selections, the last one takes precedence
            runsls_total.update( selection )
        df = dfu.select_runsls( df, runsls_total )
    histstruct.add_dataframe( df )
    print('found {} histograms'.format(len(histstruct.runnbs)))

    # add the general masks
    histstruct.add_dcsonjson_mask( 'dcson' )
    histstruct.add_goldenjson_mask('golden' )
    # NOTE(review): the method name is spelled 'hightstat' here - confirm it matches the HistStruct API
    histstruct.add_hightstat_mask( 'highstat' )
    # add the masks for the selected training and test sets (only where defined)
    if runsls_training is not None:
        histstruct.add_json_mask( 'training', runsls_training )
    if runsls_good is not None:
        histstruct.add_json_mask( 'good', runsls_good )
    nbadruns = 0
    if runsls_bad is not None:
        histstruct.add_json_mask( 'bad', runsls_bad )
        # special case for bad runs: besides the combined 'bad' mask, add one mask per run
        # named 'bad0', 'bad1', ... (different bad runs have different characteristics)
        nbadruns = len(runsls_bad)
        for i, (badrun, badrunls) in enumerate(runsls_bad.items()):
            histstruct.add_json_mask( 'bad{}'.format(i), {badrun: badrunls} )

    if save:
        histstruct.save( hsfilename )
        
if not readnew:
    
    # load a previously saved HistStruct instead of building a new one
    histstruct = HistStruct.HistStruct.load( hsfilename )
    # Recover the number of bad runs from the per-run masks 'bad0', 'bad1', ...
    # Only names of the form 'bad<digits>' are counted: the combined 'bad' mask
    # (and any other mask that merely contains 'bad') must not be included,
    # otherwise nbadruns is too large and a non-existing mask is requested later on.
    nbadruns = len([name for name in histstruct.masks.keys()
                    if name.startswith('bad') and name[3:].isdigit()])
    
# print a short summary of the HistStruct that is used in the rest of the notebook
print('created a histstruct with the following properties:')
print('- number of histogram types: {}'.format(len(histstruct.histnames)))
print('- number of lumisections: {}'.format(len(histstruct.lsnbs)))
print('- masks: {}'.format(list(histstruct.masks.keys())))

In [None]:
skipthiscell = False # set to True to skip the plots in this cell

# the comparison plots below are only made in local training mode
if not skipthiscell and training_mode == 'local':

    # one entry per plot: (mask names per histogram set, labels, colors)
    plot_settings = [
        # training and application runs
        ( [['dcson','highstat','training'],['highstat','good']],
          ['training','testing'],
          ['blue','green'] ),
        # application run and bad test runs
        ( [['dcson','highstat','good'],['bad']],
          ['good','bad'],
          ['green','red'] ),
    ]
    for plot_masks, plot_labels, plot_colors in plot_settings:
        histstruct.plot_histograms( masknames=plot_masks,
                                    labellist=plot_labels,
                                    colorlist=plot_colors )

In [None]:
### extend the training set using artificial data

extendtraining = False # set to True to generate artificial training histograms

if extendtraining:
    # (re)initialize the container for the extended training set
    histstruct.exthistograms['training'] = {}
    print('generating artificial training data for '+histname)
    # the histograms of the training set are used as seeds for the generation
    hists = histstruct.get_histograms( histname=histname,
                                       masknames=['dcson','highstat','training'] )
    print('  original number of histograms: {}'.format(len(hists)))
    generated_hists = gdu.upsample_hist_set( hists, 5e4 )
    histstruct.exthistograms['training'][histname] = generated_hists
    print('  -> generated {} histograms'.format(len(histstruct.exthistograms['training'][histname])))

In [None]:
### define and train an NMF model

# choose the training histograms according to the training mode
if training_mode == 'local':
    hists_train = histstruct.get_histograms( histname=histname,
                                             masknames=['dcson','highstat','training'] )
elif training_mode == 'global':
    # Training on all available histograms (with DCS-on and statistics selection) is possible,
    # but can take a long time; therefore averaged histograms are used for training instead.
    hists_all = histstruct.get_histograms( histname=histname, masknames=['dcson','highstat'] )
    hists_train = hu.averagehists( hists_all, 1000 )

# if requested, the artificially extended training set replaces the selection above
if extendtraining:
    hists_train = histstruct.exthistograms['training'][histname]

# build the classifier and register it in the histstruct (without evaluating it yet)
classifier = NMFClassifier.NMFClassifier( hists_train, ncomponents=3, nmax=10 )
_ = histstruct.add_classifier( histname, classifier, evaluate=False )

In [None]:
### plot the NMF components

components = classifier.get_components()
# one color index per component
component_colors = list(range(len(components)))
_ = pu.plot_hists_multi( components,
                         colorlist=component_colors,
                         xaxtitle='bin number',
                         yaxtitle='arbitrary units',
                         title='NMF components' )

In [None]:
### evaluate the models on all histograms in the (non-extended) histstruct

# announce which histogram type is being evaluated, then run the evaluation
progress_message = 'evaluating model for '+histname
print(progress_message)
_ = histstruct.evaluate_classifier( histname )

In [None]:
# transform to arrays with correct shape
# (scores for the training set, the good test set and each of the bad runs)

if training_mode == 'local':
    # training and good sets are defined by their own masks
    mse_train = histstruct.get_scores( histname=histname,
                                       masknames=['dcson','highstat','training'] )
    print('found mse array for training set of following shape: {}'.format(mse_train.shape))
    mse_good = histstruct.get_scores( histname=histname,
                                      masknames=['dcson','highstat','good'] )
    print('found mse array for good set of following shape: {}'.format(mse_good.shape))

elif training_mode == 'global':
    # no dedicated training/good masks: use all histograms passing the dcson and highstat masks
    mse_train = histstruct.get_scores( histname=histname, masknames=['dcson','highstat'] )
    print('found mse array for training set of following shape: {}'.format(mse_train.shape))
    # the good set consists of averaged histograms, evaluated directly with the classifier
    hists_good = hu.averagehists(
        histstruct.get_histograms( histname=histname, masknames=['dcson','highstat'] ), 100 )
    mse_good = histstruct.classifiers[histname].evaluate( hists_good )
    print('found mse array for good set of following shape: {}'.format(mse_good.shape))

# the bad runs are treated identically in both training modes: one score array per bad run
if training_mode in ('local', 'global'):
    mse_bad = []
    for i in range(nbadruns):
        mse_this_badrun = histstruct.get_scores( histname=histname,
                                                 masknames=['dcson','bad{}'.format(i)] )
        mse_bad.append( mse_this_badrun )
        print('found mse array for bad set of following shape: {}'.format(mse_bad[i].shape))

In [None]:
### extend the test set using artificial data generation and evaluate the model on the extended test set

skipthiscell = False # to prevent running this cell by accident

if not skipthiscell:

    # names of the per-run bad sets
    badsetnames = ['bad{}'.format(i) for i in range(nbadruns)]

    # (re)initialize the containers for the extended good and bad sets
    histstruct.exthistograms['good'] = {}
    for badsetname in badsetnames:
        histstruct.exthistograms[badsetname] = {}

    print('generating data for '+histname)
    # seeds for the good set: the 'good' mask if it exists, else averaged histograms
    if 'good' in histstruct.masks.keys():
        goodhists = histstruct.get_histograms( histname=histname,
                                               masknames=['dcson','highstat','good'] )
    else:
        goodhists = hu.averagehists(
            histstruct.get_histograms( histname=histname, masknames=['dcson','highstat'] ), 15 )
    # the target size of the good set scales with the number of bad runs
    histstruct.exthistograms['good'][histname] = gdu.upsample_hist_set(
        goodhists, figname='', ntarget=nbadruns*5e3, fourierstdfactor=20. )
    # seeds for each bad set: the histograms of the corresponding bad run
    for badsetname in badsetnames:
        badhists = histstruct.get_histograms( histname=histname,
                                              masknames=['dcson',badsetname] )
        histstruct.exthistograms[badsetname][histname] = gdu.upsample_hist_set(
            badhists, figname='', ntarget=5e3, fourierstdfactor=20. )

    print('evaluating: '+histname)
    # calculate mse for good test set
    mse_good_ext = histstruct.classifiers[histname].evaluate(
        histstruct.exthistograms['good'][histname] )
    print('found mse array for good set of following shape: {}'.format(mse_good_ext.shape))
    # calculate mse for each of the bad test sets
    mse_bad_ext = []
    for badsetname in badsetnames:
        mse_bad_ext.append( histstruct.classifiers[histname].evaluate(
            histstruct.exthistograms[badsetname][histname] ) )
        print('found mse array for bad set of following shape: {}'.format(mse_bad_ext[-1].shape))

In [None]:
### make a roc curve based on the test results above

importlib.reload(aeu)

# choose between the original test sets and the artificially extended ones
use_ext = True
mse_good_eval, mse_bad_eval = mse_good, mse_bad
if use_ext:
    mse_good_eval, mse_bad_eval = mse_good_ext, mse_bad_ext

# merge the per-run bad scores into a single flat array
mse_bad_eval_flat = np.concatenate( mse_bad_eval )

# labels: 0 for background (good), 1 for signal (bad)
labels_good = np.zeros( len(mse_good_eval) )
labels_bad = np.ones( len(mse_bad_eval_flat) )

# good entries first, then bad entries, for both labels and scores
labels = np.concatenate( (labels_good, labels_bad) )
scores = np.concatenate( (mse_good_eval, mse_bad_eval_flat) )

# score distribution per label
pu.plot_score_dist( scores, labels, nbins=50, normalize=True )

# roc curve
auc = aeu.get_roc( scores, labels, mode='geom', doprint=False )

# confusion matrix at a fixed score threshold
threshold = 0.1e-3
aeu.get_confusion_matrix( scores, labels, threshold )

In [None]:
### plot some random examples
# For a random selection of lumisections, plot the histogram together with
# its reconstruction (recohist='auto') and a set of reference histograms.

# define reference histograms:
# averaged histograms of the 'good' mask if it exists, else of all dcson+highstat histograms
refhists = {}
if( 'good' in histstruct.masks.keys() ): 
    refhists[histname] = hu.averagehists( histstruct.get_histograms( histname=histname, masknames=['highstat','dcson','good']), 15 )
else: 
    refhists[histname] = hu.averagehists( histstruct.get_histograms( histname=histname, masknames=['dcson','highstat'] ), 15 )

# define (maximum) number of plots to make per category
nplot = 10

# make plots for good histograms
# NOTE(review): this selection uses only the 'highstat' and 'dcson' masks and is not
# restricted to the 'good' mask, so it may also pick lumisections of the training
# or bad runs - confirm that this is intended.
runnbs_good = histstruct.get_runnbs( masknames=['highstat','dcson'] )
lsnbs_good = histstruct.get_lsnbs( masknames=['highstat','dcson'] )
# never ask for more samples than available
# (np.random.choice with replace=False raises a ValueError in that case)
nplot_good = min( nplot, len(runnbs_good) )
indices = []
if nplot_good > 0:
    indices = np.random.choice( np.arange(len(runnbs_good)), size=nplot_good, replace=False )
for i in indices:
    _ = histstruct.plot_ls( runnbs_good[i], lsnbs_good[i], recohist='auto', refhists=refhists )
    
# make plots for bad histograms
runnbs_bad = histstruct.get_runnbs( masknames=['dcson','bad'] )
lsnbs_bad = histstruct.get_lsnbs( masknames=['dcson','bad'] )
# same protection as above against selections with fewer than nplot lumisections
nplot_bad = min( nplot, len(runnbs_bad) )
indices = []
if nplot_bad > 0:
    indices = np.random.choice( np.arange(len(runnbs_bad)), size=nplot_bad, replace=False )
for i in indices:
    _ = histstruct.plot_ls( runnbs_bad[i], lsnbs_bad[i], recohist='auto', refhists=refhists )