**Train an autoencoder (very similar to autoencoder.ipynb) on several types of histograms and study the combined prediction**

In [None]:
%run utils/utils.ipynb
%run utils/clustering_utils.ipynb
%run utils/autoencoder_utils.ipynb
%run utils/ae_combine_utils.ipynb
%run utils/generate_data_utils.ipynb
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [None]:
### define run properties
# all major run properties are set in this cell,
# e.g. which runs to train on and which runs to test on

# reference runs considered 'good' (found by eye), keyed by year.
# each entry is a tuple (run number, list of lumisections);
# [-1] presumably stands for 'all lumisections of the run' -- TODO confirm against the reading utilities.
# should be replaced at some point by the reference runs defined by the DQM/DC team.
goodrunsls = {
    '2017': [(run,[-1]) for run in (297056, 297177, 301449)],
    # 2018 needs to be re-checked, not guaranteed to be fully correct or representative.
    '2018': [(run,[-1]) for run in (315267,)],
}

# core test set of clearly bad runs (found by eye), same layout as goodrunsls
badrunsls = {
    '2017': (
        [(run,[-1]) for run in (297287, 297288, 297289, 299316, 299324, 299326)]
        # only bad for size_PXDisk_+1 -> maybe do not use for now (unclear what are real anomalies)
        + [(301086,list(range(88,126)))]
    ),
    # 2018 needs to be re-checked, not guaranteed to be fully correct or representative.
    # (317479,[-1]) and (319847,list(range(1,35))) are currently left out.
    '2018': [(run,[-1]) for run in (317480, 317481, 317482)],
}

# year to use (must be a key of goodrunsls / badrunsls)
year = '2017'

# names of the histogram types to use
histnames = [
    'chargeInner_PXLayer_2',
    'chargeInner_PXLayer_3',
    'charge_PXDisk_+1',
    'charge_PXDisk_+2',
    'charge_PXDisk_+3',
    'size_PXLayer_1',
    'size_PXLayer_2',
    'size_PXLayer_3',
]

# set whether to train globally or locally
# ('global': train on all runs of the year; 'local': train on the runs preceding an application run)
training_mode = 'local'

if training_mode == 'global':
    # train on the entire dataset (per year),
    # define good runs for testing using averages of this entire dataset,
    # define bad runs for testing using a fixed list of runs.
    runsls_training = [] # use empty list for all runs
    runsls_good = [-15] # use a negative integer for averaging instead of actual runs/ls.
    runsls_bad = badrunsls[year]
    print('selected runs/lumisections for training: all')

elif training_mode == 'local':
    # train locally on n runs preceding a chosen application run,
    # define good runs for testing as the application run (NOT guaranteed, check plots!),
    # define bad runs for testing using a fixed list of runs.
    ntraining = 5
    available_runs = get_runs(select_dcson(read_csv('data/DF'+year+'_'+histnames[0]+'.csv')))
    # the application run needs ntraining runs in front of it,
    # so fail with a clear message if there are not enough runs to choose from
    if len(available_runs) <= ntraining:
        raise ValueError('local training needs more than {} available runs, but found only {}'.format(
                            ntraining,len(available_runs)))
    # pick the application run at random among the runs that have enough predecessors
    runindex = np.random.choice(range(ntraining,len(available_runs)))
    runsls_good = [(available_runs[runindex],[-1])]
    runsls_training = [(el,[-1]) for el in available_runs[runindex-ntraining:runindex]]
    runsls_bad = badrunsls[year]
    print('selected runs/lumisections for training: ')
    for r in runsls_training:
        print(r)
    print('selected runs/lumisections for application:')
    print(runsls_good)

else:
    # without this branch the runsls_* variables would silently stay undefined (or stale)
    raise ValueError("unrecognized training_mode '{}', choose 'global' or 'local'".format(training_mode))

In [None]:
### read the data based on the configuration defined above
# todo: adapt reading and writing to new paradigm

# each flag controls (re-)reading of one set only;
# set a flag to False to keep a set that is already loaded in the current session.
readnewtraining = True
readnewgood = True
readnewbad = True
# note: save and structname are currently only referenced by the disabled pickle code at the bottom
save = False
structname = 'teststruct.pkl'

if readnewtraining:

    print('--- reading training set ---')
    histstruct_training = histstructure()
    histstruct_training.create(year,histnames,highstatonly=True,dcsononly=True,
                               runsls=runsls_training)
    print('found {} histograms'.format(len(histstruct_training.runnbs)))

if readnewgood:

    print('--- reading good test set ---')
    # (created here and not in the training branch, so the flags are independent of each other)
    histstruct_good = histstructure()
    histstruct_good.create(year,histnames,highstatonly=True,dcsononly=True,
                           runsls=runsls_good)
    print('found {} histograms'.format(len(histstruct_good.runnbs)))
    # make an additional link to the same set (useful for uniformity when testing)
    histstruct_good.custom['histograms'] = histstruct_good.histograms

if readnewbad:

    print('--- reading bad test set ---')
    histstruct_bad = histstructure()
    histstruct_bad.create(year,histnames,runsls=runsls_bad)
    print('found {} histograms'.format(len(histstruct_bad.runnbs)))
    # add a list of bad histograms split per run:
    # custom['histograms'][name] is a list with one array of histograms per run
    indpr = histstruct_bad.get_perrun_indices()
    nbadsets = len(indpr)
    histstruct_bad.custom['histograms'] = {
        name: [histograms[ind] for ind in indpr]
        for name,histograms in histstruct_bad.histograms.items()
    }
    print('split bad set into {} runs'.format(nbadsets))

# disabled saving/loading of the structure with pickle
# (kept as comments instead of a bare string literal, so the cell does not echo it as output;
#  note that it still refers to a single 'histstruct' object that does not exist in this notebook)
#if save:
#    with open(structname,'wb') as f:
#        pickle.dump(histstruct,f)
#
#else:
#    with open(structname,'rb') as f:
#        histstruct = pickle.load(f)
#    nhists = len(histstruct.lsnbs)
#    nhisttypes = len(histstruct.names)
#    print('found {} histogram types with {} histograms each'.format(nhisttypes,nhists))

In [None]:
### plot the training and/or test sets
# mainly useful in local mode: check whether the training set is relevant for the targeted application run,
# and whether the application run itself contains anomalies

if training_mode=='local':

    # figure 1: training runs versus application run, one panel per histogram type
    # (the histogram names are assumed to be the same for training, good and bad)
    fig,axs = plt.subplots(2,4,figsize=(24,12))
    for j,name in enumerate(histstruct_training.names):
        print('making plot for '+name)
        row,col = divmod(j,4)
        hists_training = histstruct_training.histograms[name]
        hists_testing = histstruct_good.histograms[name]
        plot_sets([hists_training,hists_testing],
                  ax=axs[row,col],
                  title=name,
                  colorlist=['blue','green'],
                  labellist=['training','testing'],
                  transparencylist=[1.,1.])

    # figure 2: application run versus bad test runs, same panel layout
    fig,axs = plt.subplots(2,4,figsize=(24,12))
    for j,name in enumerate(histstruct_good.names):
        print('making plot for '+name)
        row,col = divmod(j,4)
        hists_good = histstruct_good.histograms[name]
        hists_bad = histstruct_bad.histograms[name]
        plot_sets([hists_bad,hists_good],
                  ax=axs[row,col],
                  title=name,
                  colorlist=['red','green'],
                  labellist=['bad','testing'],
                  transparencylist=[1.,1.])

In [None]:
### extend the training set using artificial data
# the result is stored under custom['histograms_ext'] of the training structure

extendtraining = True

if not extendtraining:
    # no resampling requested: the 'extended' set is simply the original set
    histstruct_training.custom['histograms_ext'] = histstruct_training.histograms
else:
    histstruct_training.custom['histograms_ext'] = {}
    for name in histstruct_training.histograms:
        hists = histstruct_training.histograms[name]
        print('generating artificial training data for '+name)
        histstruct_training.custom['histograms_ext'][name] = upsample_hist_set(hists,5e4)

In [None]:
### define and train an autoencoder for each element
from keras.models import load_model

trainnew = True
savemodels = False
# used to name saved models (only if savemodels is True) and to find the models to load (if trainnew is False)
modelname_extension = 'dcson_five_runs'

# the same model objects are registered in all three structures
allstructs = (histstruct_training, histstruct_good, histstruct_bad)
for struct in allstructs:
    struct.custom['models'] = {}

if trainnew:
    #nepochs = -1 # automatic number of epochs
    nepochs = 40 # manual number of epochs
    for (name,hists) in histstruct_training.custom['histograms_ext'].items():
        # empty string means do not save models
        modelname = name+'_'+modelname_extension if savemodels else ''
        # modify training parameters depending on type of histogram
        #if 'chargeInner_PXLayer_2' in name:
            # increased statistics threshold
            #hists = hists[np.where( histstruct.entries_all[name]/len(hists[0]) > 1000 )]
        model = train_simple_autoencoder(hists,nepochs=nepochs,modelname=modelname)
        for struct in allstructs:
            struct.custom['models'][name] = model
else:
    for name in histstruct_training.names:
        print('loading model for '+name)
        # note: for now use 2017 models for 2018 data as well (to check behaviour on unseen data)
        model_name = name.replace('data','models')+'_'+modelname_extension+'.h5'
        #model_name = model_name.replace('2018','2017')
        model = load_model(model_name,custom_objects={'mseTop10': mseTop10})
        for struct in allstructs:
            struct.custom['models'][name] = model

In [None]:
### evaluate the models on the (non-extended) training set
# the mse values are stored per histogram type under custom['mse_histograms']

histstruct_training.custom['mse_histograms'] = {}
for name in histstruct_training.histograms:
    print('evaluating model for '+name)
    hists = histstruct_training.histograms[name]
    pred = histstruct_training.custom['models'][name].predict(hists)
    #mse = K.eval(mseTop10(hists,pred))
    histstruct_training.custom['mse_histograms'][name] = mseTop10Raw(hists,pred)

In [None]:
### plot the multidimensional mse and fit a distribution to it
### (the fit functions used below are of the 'seminormal' type)
import matplotlib as mpl

fitnew = True    # fit new distribution or load previously saved one
savefit = False  # save the new fit or not
valkey = 'mse_histograms' # perform fit on (non-extended) training set

nhisttypes = len(histstruct_training.names)
# all pairs of histogram type indices (i,j) with i<j, used for the 2D projections
dimslist = [(i,j) for i in range(nhisttypes-1) for j in range(i+1,nhisttypes)]
fitfunclist = []

# one fit and one plot per 2D projection
plt.close('all')
for dims in dimslist:
    fitfunc2d = fitseminormal(histstruct_training,valkey,dims,fitnew=fitnew,savefit=savefit)
    #fitfunc2d = fitgaussiankde(histstruct_training,valkey,dims,maxnpoints=10000)
    plotfit2d(histstruct_training,valkey,dims,fitfunc2d,doinitialplot=True)
    #plt.close('all') # release plot memory
    fitfunclist.append(fitfunc2d)

# do a total fit (in all dimensions at once); the result is kept in fitfunc
if not fitnew:
    fitfunc = seminormal()
    fitfunc.load('seminormal_fit_2017_8dim.npy')
else:
    fitfunc = seminormal( get_mse_array(histstruct_training,valkey) )
    #fitfunc = fitgaussiankde(histstruct_training,valkey,maxnpoints=10000)
    if savefit:
        fitfunc.save('seminormal_fit_2017_8dim.npy')

In [None]:
### extend the test set using artificial data generation
# results are stored under custom['histograms_ext'] of the good and bad structures;
# for the bad set this is a list with one resampled set per bad run

skipthiscell = False # to prevent running this cell by accident

if not skipthiscell:

    histstruct_good.custom['histograms_ext'] = {}
    histstruct_bad.custom['histograms_ext'] = {}
    for name in histstruct_training.names:
        print('generating data for '+name)
        # good set: one large sample, nbadsets times the target size of a single bad set
        histstruct_good.custom['histograms_ext'][name] = upsample_hist_set(
            histstruct_good.histograms[name],
            figname='',ntarget=nbadsets*4e3,fourierstdfactor=20.)
        # bad sets: resample each run separately
        histstruct_bad.custom['histograms_ext'][name] = [
            upsample_hist_set(badset,figname='',ntarget=4e3,fourierstdfactor=20.)
            for badset in histstruct_bad.custom['histograms'][name]
        ]

In [None]:
### evaluate the model on the (potentially extended) test set

use_ext = False
# (whether to calculate the mse on the extended datasets)
# (note that in any case the mse will (also) be calculated on the original sets and stored in the histstructs)

def _top10_mse(name,hists):
    # mse (via mseTopNRaw with n=10) between hists and their reconstruction
    # by the trained model of histogram type name
    model = histstruct_training.custom['models'][name]
    return mseTopNRaw(hists,model.predict(hists), n=10)

key = 'histograms_ext' if use_ext else 'histograms'

# mse arrays with one row per histogram and one column per histogram type;
# for the bad set there is one such array per bad run
ngood = len(histstruct_good.custom[key][histstruct_good.names[0]])
mse_good = np.zeros((ngood,nhisttypes))
mse_bad = [np.zeros((len(histstruct_bad.custom[key][histstruct_bad.names[0]][j]),nhisttypes))
           for j in range(nbadsets)]

histstruct_good.custom['mse_'+key] = {}
histstruct_bad.custom['mse_'+key] = {}
if use_ext:
    histstruct_good.custom['mse_histograms'] = {}
    histstruct_bad.custom['mse_histograms'] = {}

for i,name in enumerate(histstruct_training.names):
    print('evaluating: '+name)
    # good set
    mse_good[:,i] = _top10_mse(name,histstruct_good.custom[key][name])
    histstruct_good.custom['mse_'+key][name] = mse_good[:,i]
    if use_ext:
        histstruct_good.custom['mse_histograms'][name] = _top10_mse(name,histstruct_good.custom['histograms'][name])
    # bad sets (one entry per run)
    histstruct_bad.custom['mse_'+key][name] = []
    if use_ext:
        histstruct_bad.custom['mse_histograms'][name] = []
    for j in range(nbadsets):
        mse_bad[j][:,i] = _top10_mse(name,histstruct_bad.custom[key][name][j])
        histstruct_bad.custom['mse_'+key][name].append(mse_bad[j][:,i])
        if use_ext:
            histstruct_bad.custom['mse_histograms'][name].append(
                _top10_mse(name,histstruct_bad.custom['histograms'][name][j]) )

In [None]:
### make a new plot of probability contours and overlay data points
### (only 2D projections!)

plt.close('all')
# one color per bad run; if there are more bad runs than colors the list is cycled through
colorlist = ['red','lightcoral','firebrick','chocolate','fuchsia','orange','purple']
#colorlist = ['red']*nbadsets
if len(colorlist)<nbadsets:
    print('### WARNING ###: fewer colors than bad sets, some colors will be used more than once...')

for dims,partialfitfunc in zip(dimslist,fitfunclist):
    fig,ax = plotfit2d(histstruct_training,'mse_histograms',dims,partialfitfunc,doinitialplot=False,onlycontour=True,rangestd=50)
    for j,mse_badset in enumerate(mse_bad):
        # modulo: reuse colors instead of failing with an IndexError when colors run out
        ax.plot(mse_badset[:,dims[0]],mse_badset[:,dims[1]],'.',
                color=colorlist[j%len(colorlist)],markersize=4)
    ax.plot(mse_good[:,dims[0]],mse_good[:,dims[1]],'.b',markersize=4)

# get the minimum log probability of histograms in good set
# (evaluated with the total fit in all dimensions, not with the 2D projections)
print('--- good lumesections ---')
logprob_good = np.log(fitfunc.pdf(mse_good))
print('length of log prob array: '+str(len(logprob_good)))
print('minimum of log prob: '+str(np.min(logprob_good)))
#print(sorted(logprob_good))

# same for the bad set: evaluate per run, then merge all runs into one array
print('--- bad lumisections ---')
logprob_bad_parts = [np.log(fitfunc.pdf(mse_badset)) for mse_badset in mse_bad]
#for lp in logprob_bad_parts: print(str(sorted(lp))+'\n\n')
logprob_bad = np.concatenate(tuple(logprob_bad_parts))
print('length of log prob array: '+str(len(logprob_bad)))
print('maximum of log prob: '+str(np.max(logprob_bad)))
#print(sorted(logprob_bad))

In [None]:
### make a roc curve based on the test results above
# note: smaller logprob = less probable = more outlier = more anomalous
# so if anomalies are signal and good histograms are background, -logprob is a suitable score definition,
# since everything above a certain threshold will be considered signal and below it background.

# background (good): label = 0, signal (bad): label = 1
labels_good = np.zeros(len(logprob_good))
labels_bad = np.ones(len(logprob_bad))
labels = np.concatenate((labels_good,labels_bad))

# scores of +inf (i.e. logprob of -inf) are replaced by the largest finite score plus one
scores = np.concatenate((-logprob_good,-logprob_bad))
isinf = (scores==np.inf)
maxnoninf = np.max(np.where(isinf,np.min(scores),scores)) + 1
scores = np.where(isinf,maxnoninf,scores)
print('logprobs of -inf were reset to {}'.format(-maxnoninf))

auc = get_roc(scores,labels)
plt.show()

# confusion matrix at a fixed working point
plt.figure()
logprob_threshold = 84 # everything below this logprob will be considered signal (i.e. anomalous)
get_confusion_matrix(scores,labels,-logprob_threshold)

In [None]:
### investigate particular lumisections: functionality

def plotlsreco(histstruct, run, ls, histstruct_ref, doprint=False):
    """Plot the histograms of one lumisection together with their reconstruction.

    For each histogram type in histstruct.names one panel is drawn, showing the
    histograms of histstruct_ref (as reference), the histogram of the requested
    run/ls and its reconstruction by the model in histstruct.custom['models'].

    Arguments:
    - histstruct: structure holding runnbs, lsnbs, names, histograms and custom['models']
    - run, ls: run number and lumisection number to plot
    - histstruct_ref: structure whose histograms are drawn as reference
      (its custom['mse_histograms'] is needed as well if doprint is True)
    - doprint: if True, print the mse of each histogram and the average mse of the reference

    Returns:
    - a dict with 'msepoint' (list of mse values, one per histogram type, in the order
      of histstruct.names) and 'figure' (the matplotlib figure),
      or None if the run/ls combination does not occur exactly once in histstruct.
    """
    # find the index that the given run and ls number correspond to;
    # this is done before creating the figure, so a failed lookup does not leave an empty figure
    matches = np.nonzero((histstruct.runnbs==run) & (histstruct.lsnbs==ls))[0]
    if len(matches)!=1:
        print('index has unexpected shape: '+str(matches))
        return None
    index = int(matches[0])
    # 4 panels per row and at least 2 rows (the original fixed layout),
    # with extra rows when there are more than 8 histogram types
    ncols = 4
    nrows = max(2, -(-len(histstruct.names)//ncols))
    fig,axs = plt.subplots(nrows,ncols,figsize=(24,6*nrows))
    # loop over all histograms belonging to this lumisection and make the plots
    msepoint = []
    for j,name in enumerate(histstruct.names):
        # slice instead of plain index to keep a 2D array of one histogram
        hist = histstruct.histograms[name][index:index+1,:]
        reco = histstruct.custom['models'][name].predict(hist)
        mse = mseTop10Raw(hist,reco)[0]
        msepoint.append(mse)
        plot_sets([histstruct_ref.histograms[name],hist,reco],
                  ax=axs[j//ncols,j%ncols],
                  title=name,
                  colorlist=['blue','black','red'],labellist=['good hists','hist (run: '+str(int(run))+', ls: '+str(int(ls))+')','reco'],
                  transparencylist=[0.3,1.,1.])
        # additional prints
        if doprint:
            print('mse (this histogram): '+str(mse))
            print('mse (average good): '+str(np.average(histstruct_ref.custom['mse_histograms'][name])))
    return {'msepoint':msepoint,'figure':fig}

def plotrunreco(histstruct, run, histstruct_ref, doprint=False):
    """Call plotlsreco for every lumisection of the given run that is present in histstruct.

    The arguments histstruct, run, histstruct_ref and doprint are passed on to plotlsreco.
    Nothing is returned.
    """
    inrun = (histstruct.runnbs==run)
    lsnbs = histstruct.lsnbs[inrun]
    print('plotting {} lumisections...'.format(len(lsnbs)))
    for lsnb in lsnbs:
        plotlsreco(histstruct,run,lsnb,histstruct_ref,doprint=doprint)

In [None]:
### investigate particular lumisections: calling

run = 300674
ls = 1
mode = 'ls' # 'ls': plot one lumisection, 'run': plot all lumisections of the run
histstruct_eval = histstruct_good
# (choose histstruct_good or histstruct_training, depending on what you want to investigate)

# print the first few available run and ls numbers (of the structure that is actually evaluated),
# as a help for choosing valid values for run and ls above
print(histstruct_eval.runnbs[:10])
print(histstruct_eval.lsnbs[:10])

if mode=='ls':
    # plot this particular run/ls
    temp = plotlsreco(histstruct_eval,run,ls,histstruct_good,doprint=True)
    if temp is None:
        # plotlsreco returns None if the run/ls combination is not (uniquely) present
        print('run {}, ls {} not found in the selected set, nothing to evaluate'.format(run,ls))
    else:
        msepoint = temp['msepoint']
        # evaluate the total fit at this single point (array of shape (1, number of histogram types))
        logprob = np.log(fitfunc.pdf(np.array([msepoint])))
        print('logprob: '+str(logprob))

elif mode=='run':
    # plot given run
    plotrunreco(histstruct_eval,run,histstruct_good)

In [None]:
### evaluate the method on the golden json

histstruct_eval = histstruct_good
# (note: use either histstruct_good or histstruct_training
# when training on the entire dataset, the latter is informative (the former will not even work if using templates)
# when training on a subset, the target run, i.e. histstruct_good is more informative)

# collect the previously calculated mse values of the golden lumisections
# in one array, with one column per histogram type
ngolden = len(histstruct_eval.get_golden_indices())
mse_test_golden = np.zeros((ngolden,len(histstruct_eval.names)))
for i,name in enumerate(histstruct_eval.names):
    print('evaluating: '+name)
    mse_all = histstruct_eval.custom['mse_histograms'][name]
    mse_test_golden[:,i] = mse_all[histstruct_eval.get_golden_mask()]

In [None]:
### investigate how the method performs on golden json data (useful if not using golden json for training)

# log probability of each golden lumisection according to the total fit
print('evaluating pdf on mse points')
pdfvals_test_golden = fitfunc.pdf(mse_test_golden)
logprob_test_golden = np.log(pdfvals_test_golden)

def get_runsls_inrange(logprob,runnbs,lsnbs,logprob_up=None,logprob_down=None):
    """Select the lumisections with a log probability within a given (open) range.

    Arguments:
    - logprob, runnbs and lsnbs: equally long 1D arrays (or sequences)
    - logprob_up and logprob_down: upper and lower thresholds (both exclusive)
        if both are not None, the lumisections with logprob between the boundaries are selected
        if logprob_up is None, the lumisections with logprob > logprob_down are selected
        if logprob_down is None, the lumisections with logprob < logprob_up are selected
        if both are None, all lumisections are selected

    Returns:
    - a dict with 'indices' (array of selected positions in the input arrays)
      and 'runslsinrange' (list of tuples (run number, ls number) as plain ints)
    """
    logprob = np.asarray(logprob)
    runnbs = np.asarray(runnbs)
    lsnbs = np.asarray(lsnbs)
    # start from 'everything selected' and apply only the thresholds that are given
    inrange = np.ones(logprob.shape,dtype=bool)
    if logprob_down is not None:
        inrange &= (logprob>logprob_down)
    if logprob_up is not None:
        inrange &= (logprob<logprob_up)
    indices = np.nonzero(inrange)[0]
    runslsinrange = [(int(rr),int(lsr)) for rr,lsr in zip(runnbs[indices],lsnbs[indices])]
    return {'indices':indices,'runslsinrange':runslsinrange}

# select the golden lumisections with logprob in the range (logdown, logup); None means no boundary
logup = None
logdown = 84
goldenmask = histstruct_eval.get_golden_mask()
temp = get_runsls_inrange(logprob_test_golden, histstruct_eval.runnbs[goldenmask], histstruct_eval.lsnbs[goldenmask],
                          logprob_up = logup, logprob_down = logdown)

runslsinrange = temp['runslsinrange']
print('{} out of {} LS are within these boundaries'.format(len(runslsinrange),len(logprob_test_golden)))

# make plots (at most nplotsmax) and collect them in a pdf file
nplotsmax = 1
from matplotlib.backends.backend_pdf import PdfPages
# the with-statement makes sure the pdf file is closed, also if plotting fails halfway
with PdfPages('test.pdf') as pdf:
    for i,(runnb,lsnb) in enumerate(runslsinrange):
        if i>=nplotsmax:
            print('maximum number of plots reached')
            break
        print('------------------------')
        temp = plotlsreco(histstruct_eval,runnb,lsnb,histstruct_good)
        if temp is None:
            # plotlsreco could not uniquely identify this run/ls (it has printed a message), skip it
            continue
        msepoint = temp['msepoint']
        fig = temp['figure']
        fig.show()
        pdf.savefig(fig)
        # only for 2 dimensions: extra contour plot
        #if nhisttypes != 2: continue
        #fig,ax = plt.subplots()
        #contourplot = ax.contourf(x, y, np.log(fitfunc.pdfgrid(pos)))
        #plt.colorbar(contourplot)
        #ax.plot(msepoint[0],msepoint[1],'.k',markersize=10)
        #ax.set_xlim((0.,xlim))
        #ax.set_ylim((0.,ylim))

In [None]:
### investigate other 'pathological' histograms in the training set
# deprecated, not used anymore and not guaranteed to work anymore...
# NOTE(review): this cell uses names that are not defined in any of the visible notebook cells
# (mse_train, histstruct, CheckPredictions, x, y, pos, xlim, ylim); they may come from the
# utility notebooks or from an older version of this notebook -- TODO confirm before re-enabling.
# NOTE(review): histstruct is indexed in three different ways below
# (histstruct[0][...], histstruct['names'], histstruct[j][...]), so its expected layout is unclear.

# log probability of each training histogram according to the total fit
logprob_train = np.log(fitfunc.pdf(mse_train))
print('Minimum log prob value in training set: '+str(np.min(logprob_train)))
# select the histograms with logprob in the range (loglowboundary, logupboundary)
logupboundary = 0
loglowboundary = 71 # if larger than logupboundary, no lower boundary
if loglowboundary > logupboundary:
    # (with the values above this branch is taken: only the upper boundary is applied)
    indices = np.nonzero(logprob_train<logupboundary)[0]
else:
    indices = np.nonzero((logprob_train>loglowboundary) & (logprob_train<logupboundary))[0]
nout = len(indices)
print(str(nout)+' out of '+str(mse_train.shape[0])+' histograms are within these boundary.')

# make list of run and ls numbers
badruns = histstruct[0]['runnbs_train'][indices]
badls = histstruct[0]['lsnbs_train'][indices]
# NOTE(review): this rebinds badrunsls, which at the top of the notebook is the dict
# with the bad-run configuration per year; re-running earlier cells afterwards would break.
badrunsls = []
for br,bls in zip(badruns,badls):
    badrunsls.append((int(br),int(bls)))
print(badrunsls)

# make plots (at most nplotsmax) and collect them in a pdf file
nplotsmax = 20
from matplotlib.backends.backend_pdf import PdfPages
pdf = PdfPages('test.pdf')
#np.random.shuffle(indices)
for i,index in enumerate(indices):
    if i>=nplotsmax:
        print('maximum number of plots reached')
        break
    print('------------------------')
    # msepoint collects the mse of this histogram index for each histogram type
    msepoint = []
    fig,axs = plt.subplots(2,4,figsize=(24,12))
    for name in histstruct['names']:
        # slice instead of plain index to keep a 2D array of one histogram
        hist = histstruct['hists_all'][name][index:index+1,:]
        reco = histstruct['models'][name].predict(hist)
        mse = histstruct['mse_train'][name][index]
        # NOTE(review): j is not set by this loop; it is whatever value an earlier cell left behind.
        runnb = histstruct[j]['runnbs_all'][index]
        lsnb = histstruct[j]['lsnbs_all'][index]
        msepoint.append(mse)
        #fig,ax = plt.subplots()
        # NOTE(review): whether CheckPredictions draws on fig/axs created above cannot be told from here.
        CheckPredictions(hist[0],reco[0],mse,0,0,histstruct[j]['name']) 
        #plot_sets([hist,reco,histstruct[j]['X_test_good']],
        #          ax=axs[int(j/4),j%4],
        #          title=histstruct[j]['name'],
        #          colorlist=['black','red','blue'],labellist=['hist (run: '+str(int(runnb))+', ls: '+str(int(lsnb))+')','reco','good hists'],
        #          transparencylist=[1.,1.,0.3])
        # additional prints
        #print('mse (this histogram): '+str(mse))
        #print('mse (average good): '+str(np.average(histstruct[j]['mse_test_good'])))
    #fig.show()
    pdf.savefig(fig)
    # only for 2 dimensions: extra contour plot
    if nhisttypes != 2: continue
    fig,ax = plt.subplots()
    contourplot = ax.contourf(x, y, np.log(fitfunc.pdfgrid(pos)))
    plt.colorbar(contourplot)
    ax.plot(msepoint[0],msepoint[1],'.k',markersize=10)
    ax.set_xlim((0.,xlim))
    ax.set_ylim((0.,ylim))
# (the pdf is only closed if the loop above finishes without an exception)
pdf.close()