**Train an autoencoder (very similar to autoencoder.ipynb) on several types of histograms and study the combined prediction**

In [None]:
%run utils/utils.ipynb
%run utils/clustering_utils.ipynb
%run utils/autoencoder_utils.ipynb
%run utils/ae_combine_utils.ipynb
%run utils/generate_data_utils.ipynb
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [None]:
### read the data
# Either build a fresh histstructure from the csv files (readnew = True)
# or restore a previously pickled one from `structname` (readnew = False).

readnew = True
save = False # only has an effect if readnew is True
structname = 'teststruct.pkl'

if readnew:

    year = '2017'
    histnames = [
            'chargeInner_PXLayer_2','chargeInner_PXLayer_3',
             'charge_PXDisk_+1','charge_PXDisk_+2','charge_PXDisk_+3',
             'size_PXLayer_1','size_PXLayer_2','size_PXLayer_3'
            ]

    #selected_runs = [] # empty list means keeping all runs
    # keep only the first five runs found in the file of the first histogram type
    selected_runs = get_runs(read_csv('data/DF'+year+'_'+histnames[0]+'.csv'))[0:5]
    if len(selected_runs) < 50: print('selected runs: '+str(selected_runs))

    histstruct = histstructure()
    histstruct.create(year,histnames,highstat=True,runs=selected_runs)

else:
    # NOTE: pickle.load can execute arbitrary code; only load files you created yourself
    with open(structname,'rb') as f:
        histstruct = pickle.load(f)

# bookkeeping shared by both branches (used by later cells)
nhists = len(histstruct.lsnbs)
nhisttypes = len(histstruct.names)
print('found {} histogram types with {} histograms each'.format(nhisttypes,nhists))

if readnew and save:
    with open(structname,'wb') as f:
        pickle.dump(histstruct,f)

In [None]:
### extend the training set using artificial data
# Fills histstruct.custom['hists_train'] with one training set per histogram type:
# either an upsampled version of the original histograms or the original histograms themselves.

extendtraining = True

if extendtraining:
    histstruct.custom['hists_train'] = {}
    for (name,hists) in histstruct.histograms.items():
        print('generating artificial training data for '+name)
        histstruct.custom['hists_train'][name] = upsample_hist_set(hists,1e4)
else:
    # train on the original histograms; these live in histstruct.histograms
    # (the same container the upsampling branch iterates over).
    # a shallow copy is used so that hists_train is not an alias of histstruct.histograms
    histstruct.custom['hists_train'] = dict(histstruct.histograms)

In [None]:
### define and train an autoencoder for each element
# The resulting models are stored per histogram type in histstruct.custom['models'].
from keras.models import load_model

trainnew = True
savemodels = False
modelname_extension = 'dcson_five_runs' # ignored if savemodels is False
histstruct.custom['models'] = {}
if not trainnew:
    # load previously stored models instead of training
    for name in histstruct.names:
        print('loading model for '+name)
        # note: for now use 2017 models for 2018 data as well (to check behaviour on unseen data)
        model_name = 'models/'+name+'_'+modelname_extension+'.h5'
        #model_name = model_name.replace('2018','2017')
        histstruct.custom['models'][name] = load_model(model_name,custom_objects={'mseTop10': mseTop10})
else:
    for (name,hists) in histstruct.custom['hists_train'].items():
        # empty string means do not save models
        modelname = name+'_'+modelname_extension if savemodels else ''
        #nepochs = -1 # automatic number of epochs
        nepochs = 40 # manual number of epochs
        # training parameters can be modified here depending on the type of histogram, e.g.
        #if 'chargeInner_PXLayer_2' in name:
            # increased statistics threshold
            #hists = hists[np.where( histstruct.entries_all[name]/len(hists[0]) > 1000 )]
        model = train_simple_autoencoder(hists,nepochs=nepochs,modelname=modelname)
        histstruct.custom['models'][name] = model

In [None]:
### evaluate the models on the original set
# For each histogram type: reconstruct the original histograms with the corresponding model
# and store the mse returned by mseTop10Raw in histstruct.custom['mse_histograms'].

histstruct.custom['mse_histograms'] = {}
for name in histstruct.histograms:
    hists = histstruct.histograms[name]
    print('evaluating model for '+name)
    pred = histstruct.custom['models'][name].predict(hists)
    # alternative: K.eval(mseTop10(hists,pred))
    histstruct.custom['mse_histograms'][name] = mseTop10Raw(hists,pred)

In [None]:
### plot the multidimensional mse and fit a distribution to it (fitseminormal / seminormal)
import matplotlib as mpl

fitnew = True    # fit new distribution or load previously saved one
savefit = False  # save the new fit or not
valkey = 'mse_histograms' # perform fit on (non-extended) training set

# all pairs (i,j) of histogram type indices with i<j: one 2D projection per pair
dimslist = [(i,j) for i in range(nhisttypes) for j in range(i+1,nhisttypes)]
fitfunclist = []

plt.close('all')
for dims in dimslist:
    # fit and plot this 2D projection, keep the fit for the overlay plots later on
    fitfunc = fitseminormal(histstruct,valkey,dims,fitnew=fitnew,savefit=savefit)
    #fitfunc = fitgaussiankde(histstruct,valkey,dims,maxnpoints=10000)
    plotfit2d(histstruct,valkey,dims,fitfunc,doinitialplot=True)
    #plt.close('all') # release plot memory
    fitfunclist.append(fitfunc)

# do a total fit (over all dimensions at once); from here on fitfunc refers to this total fit
totalfitfile = 'seminormal_fit_2017_8dim.npy'
if not fitnew:
    fitfunc = seminormal()
    fitfunc.load(totalfitfile)
else:
    fitfunc = seminormal( get_mse_array(histstruct,valkey) )
    #fitfunc = fitgaussiankde(histstruct,valkey,maxnpoints=10000)
    if savefit: fitfunc.save(totalfitfile)

In [None]:
### get a test set
# Builds, per histogram type, one 'good' test set and a list of 'bad' test sets (one per bad run)
# and stores them in histstruct.custom['hists_test_good'] / histstruct.custom['hists_test_bad'].
# Also defines nbadsets (number of bad sets), which later cells rely on.

# good runs per year, as (run number, list of ls numbers) tuples
# presumably [-1] selects all lumisections of the run -- TODO confirm against select_runsls
goodrunsls = {'2017':
              [
                (297056,[-1]),
                # later added these as well for more representative set
                #(297177,[-1]),
                #(301449,[-1])
              ],
              '2018':
              [  (315267,[-1])]} 
# if training on full dataset, better use averaging method (see below)

# core test set of clearly bad runs
# NOTE(review): the comment on run 301086 says 'do not use for now', but the entry is active -- confirm intent
badrunsls = {'2017':
                [
                (297287,[-1]),
                (297288,[-1]),
                (297289,[-1]),
                (299316,[-1]),
                (299324,[-1]),
                (299326,[-1]),
                (301086,list(range(88,126))) # only bad for size_PXDisk_+1 -> do not use for now (unclear what are real anomalies)
                ],
            '2018':
                [
                #(317479,[-1]),
                (317480,[-1]),
                (317481,[-1]),
                (317482,[-1]),
                #(319847,list(range(1,35)))
            ]}

histstruct.custom['hists_test_good'] = {}
histstruct.custom['hists_test_bad'] = {}
for (name,hists) in histstruct.histograms.items():
    print('retrieving good and bad test set for '+name)
    # NOTE(review): the file is read as name+'.csv' here, whereas the data-reading cell reads
    # 'data/DF'+year+'_'+histname+'.csv' -- verify that the keys of histstruct.histograms map to existing files
    df = read_csv(name+'.csv')
    # NOTE(review): any key that does not contain '2017' is treated as 2018 -- verify the keys contain the year
    year = '2017' if '2017' in name else '2018'
    # get good test set from predefined run and ls numbers
    (ghists,runnbs,lsnbs) = preparedatafromdf(select_runsls(df,goodrunsls[year]),returnrunls=True,onlygolden=True)
    # alternative: get good test set from averaging all histograms
    #ntemplates = 15
    #ghists = np.zeros((ntemplates,hists.shape[1]))
    #nsub = int(len(hists)/ntemplates)
    #for i in range(ntemplates):
    #    startindex = i*nsub
    #    stopindex = (i+1)*nsub
    #    ghists[i,:] = np.mean(hists[startindex:stopindex,:],axis=0)
    histstruct.custom['hists_test_good'][name] = ghists
    # list of separate test sets per run (allows for better visualization)
    histstruct.custom['hists_test_bad'][name] = []
    for badrun in badrunsls[year]:
        # note: unlike the good set, no onlygolden argument is passed here
        (bhists,runnbs,lsnbs) = preparedatafromdf(select_runsls(df,[badrun]),returnrunls=True)
        histstruct.custom['hists_test_bad'][name].append(bhists)
    # re-assigned in every iteration; the value left after the loop is the one used by later cells
    nbadsets = len(badrunsls[year])
    # alternative version: one set of all bad runs together
    #(bhists,runnbs,lsnbs) = preparedatafromdf(select_runsls(df,badrunsls))
    #histdict.custom['hists_test_bad'][name] = bhists
    #nbadsets = 1

In [None]:
### extend the test set using artificial data generation
# Stores upsampled versions of the good and bad test sets under the '_ext' keys of histstruct.custom.

skipthiscell = False # to prevent running this cell by accident

if not skipthiscell:

    histstruct.custom['hists_test_good_ext'] = {}
    histstruct.custom['hists_test_bad_ext'] = {}
    for name in histstruct.names:
        print('generating data for '+name)
        # good set: target size scales with the number of bad sets
        histstruct.custom['hists_test_good_ext'][name] = upsample_hist_set(
            histstruct.custom['hists_test_good'][name],
            figname='',ntarget=nbadsets*4e3,fourierstdfactor=20.)
        # bad sets: one upsampled set per original bad set, in the same order
        histstruct.custom['hists_test_bad_ext'][name] = [
            upsample_hist_set(badset,figname='f',ntarget=4e3,fourierstdfactor=20.)
            for badset in histstruct.custom['hists_test_bad'][name]
        ]

In [None]:
### evaluate the model on the test set
# Fills mse_good (one row per good histogram, one column per histogram type) and
# mse_bad (list with one such array per bad set); the per-type columns are also
# stored in histstruct.custom['mse_test_good'] / histstruct.custom['mse_test_bad'].

use_ext = True

# keys of the test sets to use (extended or original)
xgood = 'hists_test_good' + ('_ext' if use_ext else '')
xbad = 'hists_test_bad' + ('_ext' if use_ext else '')

# the number of rows is taken from the first histogram type
firstname = histstruct.names[0]
mse_good = np.zeros((len(histstruct.custom[xgood][firstname]),nhisttypes))
mse_bad = [np.zeros((len(histstruct.custom[xbad][firstname][j]),nhisttypes)) for j in range(nbadsets)]

histstruct.custom['mse_test_good'] = {}
histstruct.custom['mse_test_bad'] = {}
for i,name in enumerate(histstruct.names):
    print('evaluating: '+name)
    autoencoder = histstruct.custom['models'][name]
    # good set
    goodhists = histstruct.custom[xgood][name]
    mse_good[:,i] = mseTopNRaw(goodhists,autoencoder.predict(goodhists), n=10)
    histstruct.custom['mse_test_good'][name] = mse_good[:,i]
    # bad sets
    histstruct.custom['mse_test_bad'][name] = []
    for j in range(nbadsets):
        badhists = histstruct.custom[xbad][name][j]
        mse_bad[j][:,i] = mseTopNRaw(badhists,autoencoder.predict(badhists), n=10)
        histstruct.custom['mse_test_bad'][name].append(mse_bad[j][:,i])

In [None]:
### make a new plot of probability contours and overlay data points
### (only 2D projections!)

plt.close('all')
colorlist = ['red','lightcoral','firebrick','chocolate','fuchsia','orange','purple']
#colorlist = ['red']*nbadsets
if len(colorlist)<nbadsets:
    # not fatal: colors are cycled below, so some bad sets will share a color
    print('### WARNING ###: fewer colors than bad test sets, colors will be reused...')

for dims,partialfitfunc in zip(dimslist,fitfunclist):
    fig,ax = plotfit2d(histstruct,'mse_histograms',dims,partialfitfunc,doinitialplot=False,onlycontour=True,rangestd=50)
    # bad sets: one color per set (modulo the number of available colors)
    for j in range(len(mse_bad)):
        ax.plot(mse_bad[j][:,dims[0]],mse_bad[j][:,dims[1]],'.',
                color=colorlist[j%len(colorlist)],markersize=4)
    # good set in blue
    ax.plot(mse_good[:,dims[0]],mse_good[:,dims[1]],'.b',markersize=4)

# log probabilities are evaluated with fitfunc, i.e. the total fit over all dimensions
# note: a pdf value of 0 gives a log probability of -inf

# get the minimum log probability of histograms in good set
print('--- good lumesections ---')
logprob_good = np.log(fitfunc.pdf(mse_good))
print('length of log prob array: '+str(len(logprob_good)))
print('minimum of log prob: '+str(np.min(logprob_good)))
#print(sorted(logprob_good))

# get the maximum log probability of histograms in the bad sets (all sets concatenated)
print('--- bad lumisections ---')
logprob_bad_parts = [np.log(fitfunc.pdf(mse_bad[j])) for j in range(len(mse_bad))]
#for lp in logprob_bad_parts: print(str(sorted(lp))+'\n\n')
logprob_bad = np.concatenate(tuple(logprob_bad_parts))
print('length of log prob array: '+str(len(logprob_bad)))
print('maximum of log prob: '+str(np.max(logprob_bad)))
#print(sorted(logprob_bad))
#print(sorted(logprob_bad))

In [None]:
### make a roc curve based on the test results above
# labels: 0 for the good set, 1 for the bad sets; score: minus the log probability

labels_good = np.zeros(len(logprob_good))
labels_bad = np.ones(len(logprob_bad))

labels = np.concatenate((labels_good,labels_bad))
scores = np.concatenate((-logprob_good,-logprob_bad))

# replace infinite scores by the largest non-infinite score
infmask = (scores==np.inf)
maxnoninf = np.max(np.where(infmask,np.min(scores),scores))
scores = np.where(infmask,maxnoninf,scores)

auc = get_roc(scores,labels)
plt.show()

plt.figure()
get_confusion_matrix(scores,labels,0)

In [None]:
### investigate particular lumisections: functionality

def plotlsreco(histstruct, run, ls, doprint=False):
    """Plot the histograms of one lumisection together with their reconstruction.

    For every histogram type in histstruct.names, the histogram of the given run/ls,
    its reconstruction by the corresponding model in histstruct.custom['models'] and
    (at most 20, randomly chosen) histograms from histstruct.custom['hists_test_good']
    are drawn in one panel of a common figure.

    Arguments:
    - histstruct: object holding runnbs, lsnbs, names, histograms and custom (see usage below)
    - run, ls: run number and lumisection number to plot
    - doprint: if True, print the mse of each histogram and the average mse of the good test set
               (requires histstruct.custom['mse_test_good'])

    Returns:
    - a dict with keys 'msepoint' (list of mse values, one per histogram type, in the order
      of histstruct.names) and 'figure' (the matplotlib figure),
      or None (after printing a message) if the run/ls combination does not occur exactly once.
    """
    # find index that given run and ls number correspond to
    # (done before creating the figure, so that no empty figure is left behind on failure)
    matches = np.nonzero( (np.asarray(histstruct.runnbs)==run) & (np.asarray(histstruct.lsnbs)==ls) )[0]
    if len(matches)!=1:
        print('index has unexpected shape: '+str(matches.tolist()))
        return None
    index = int(matches[0])
    names = histstruct.names
    # one panel per histogram type, four panels per row (2 x 4 grid for eight types)
    ncols = 4
    nrows = max(1, -(-len(names)//ncols))
    # squeeze=False keeps axs two-dimensional, also for a single row
    fig,axs = plt.subplots(nrows,ncols,figsize=(24,6*nrows),squeeze=False)
    # determine the indices of the 'reference' histograms to be plotted as baseline
    # (same random selection for all histogram types)
    refhistindices = np.arange( len(histstruct.custom['hists_test_good'][names[0]]) )
    np.random.shuffle(refhistindices)
    refhistindices = refhistindices[:20]
    # loop over all histograms belonging to this lumisection and make the plots
    msepoint = []
    for j,name in enumerate(names):
        # slicing (instead of indexing) keeps the 2D shape expected by predict
        hist = histstruct.histograms[name][index:index+1,:]
        reco = histstruct.custom['models'][name].predict(hist)
        mse = mseTop10Raw(hist,reco)[0]
        msepoint.append(mse)
        plot_sets([hist,reco,histstruct.custom['hists_test_good'][name][refhistindices]],
                  ax=axs[j//ncols,j%ncols],
                  title=name,
                  colorlist=['black','red','blue'],labellist=['hist (run: '+str(int(run))+', ls: '+str(int(ls))+')','reco','good hists'],
                  transparencylist=[1.,1.,0.3])
        # additional prints
        if doprint:
            print('mse (this histogram): '+str(mse))
            print('mse (average good): '+str(np.average(histstruct.custom['mse_test_good'][name])))
    return {'msepoint':msepoint,'figure':fig}

def plotrunreco(histstruct, run, doprint=False):
    """Call plotlsreco for every lumisection of the given run.

    Arguments:
    - histstruct: object holding (at least) the arrays runnbs and lsnbs
    - run: run number to plot
    - doprint: passed on to plotlsreco
    """
    # positions of all lumisections belonging to this run
    runindices = np.where(histstruct.runnbs==run)
    lsnbs = histstruct.lsnbs[runindices]
    print('plotting {} lumisections...'.format(len(lsnbs)))
    # the return value of plotlsreco is not used here
    for lsnb in lsnbs:
        plotlsreco(histstruct,run,lsnb,doprint=doprint)

In [None]:
### investigate particular lumisections: calling

run = 297047
ls = 10
mode = 'ls' # 'ls': plot one lumisection, 'run': plot all lumisections of the run

if mode=='ls':
    # plot this particular run/ls
    temp = plotlsreco(histstruct,run,ls,doprint=True)
    # plotlsreco returns None (and prints a message) if the run/ls combination is not found
    if temp is not None:
        msepoint = temp['msepoint']
        # log probability of this point according to the total fit
        logprob = np.log(fitfunc.pdf(np.array([msepoint])))
        print('logprob: '+str(logprob))

if mode=='run':
    # plot given run
    plotrunreco(histstruct,run)

In [None]:
### evaluate the method on the golden json
# Collects the mse values of the lumisections selected by the golden mask
# in one array: one row per selected lumisection, one column per histogram type.

ngolden = len(histstruct.get_golden_indices())
mse_test_golden = np.zeros((ngolden,len(histstruct.names)))

for i,name in enumerate(histstruct.names):
    mse_all = histstruct.custom['mse_histograms'][name]
    mse_test_golden[:,i] = mse_all[histstruct.get_golden_mask()]

In [None]:
### investigate how the method performs on golden json data (useful if not using golden json for training)

# log probability of every golden lumisection according to the total fit
print('evaluating pdf on mse points')
pdfvalues_test_golden = fitfunc.pdf(mse_test_golden)
logprob_test_golden = np.log(pdfvalues_test_golden)

def get_runsls_inrange(logprob,logup,logdown,runnbs,lsnbs):
    """Select the lumisections with a log probability within a given range.

    Arguments:
    - logprob: array of log probabilities
    - logup: upper boundary (exclusive)
    - logdown: lower boundary (exclusive); if logdown is larger than logup, no lower boundary is applied
    - runnbs, lsnbs: arrays of run and lumisection numbers, indexed in the same way as logprob

    Returns:
    - a dict with keys 'indices' (positions in logprob that are within range)
      and 'runslsinrange' (list of (run number, ls number) tuples of ints for those positions)
    """
    # the upper boundary always applies, the lower one only if it is not above the upper one
    inrange = (logprob<logup)
    if not (logdown > logup):
        inrange = inrange & (logprob>logdown)
    indices = np.nonzero(inrange)[0]
    runslsinrange = [(int(rr),int(lsr)) for rr,lsr in zip(runnbs[indices],lsnbs[indices])]
    return {'indices':indices,'runslsinrange':runslsinrange}

# select the golden lumisections with a log probability below 0
# (lower boundary above upper boundary, so no lower boundary is applied)
golden_mask = histstruct.get_golden_mask()
temp = get_runsls_inrange(logprob_test_golden, 0, 100, histstruct.runnbs[golden_mask], histstruct.lsnbs[golden_mask])
# kept under its own name: the name badrunsls is already used for the dict of bad runs
# in the test set cell, and overwriting it would break re-running that cell
flaggedrunsls = temp['runslsinrange']
print('{} out of {} LS are within these boundaries'.format(len(flaggedrunsls),len(logprob_test_golden)))

# make plots
nplotsmax = 1
from matplotlib.backends.backend_pdf import PdfPages
# the with-statement makes sure the pdf file is closed, also if plotting fails
with PdfPages('test.pdf') as pdf:
    for i,(runnb,lsnb) in enumerate(flaggedrunsls):
        if i>=nplotsmax:
            print('maximum number of plots reached')
            break
        print('------------------------')
        temp = plotlsreco(histstruct,runnb,lsnb)
        # plotlsreco returns None (and prints a message) if the run/ls combination is not found
        if temp is None: continue
        msepoint = temp['msepoint']
        fig = temp['figure']
        fig.show()
        pdf.savefig(fig)

In [None]:
### investigate other 'pathological' histograms in the training set
# deprecated, not used anymore and not guaranteed to work anymore...
# NOTE(review): this cell treats histstruct as a dict / list of dicts (histstruct['names'], histstruct[0][...]),
# whereas the other cells use attribute access (histstruct.names, histstruct.custom[...]).
# It also uses mse_train, x, y, pos, xlim and ylim, none of which are assigned in the visible cells
# of this notebook -- presumably left over from an older version; verify before re-enabling.

logprob_train = np.log(fitfunc.pdf(mse_train))
print('Minimum log prob value in training set: '+str(np.min(logprob_train)))
logupboundary = 0
loglowboundary = 71 # if larger than logupboundary, no lower boundary
# same range selection as in get_runsls_inrange
if loglowboundary > logupboundary:
    indices = np.nonzero(logprob_train<logupboundary)[0]
else:
    indices = np.nonzero((logprob_train>loglowboundary) & (logprob_train<logupboundary))[0]
nout = len(indices)
print(str(nout)+' out of '+str(mse_train.shape[0])+' histograms are within these boundary.')

# make list of run and ls numbers
# note: this assigns a list to the name badrunsls, which the test set cell uses for a dict
badruns = histstruct[0]['runnbs_train'][indices]
badls = histstruct[0]['lsnbs_train'][indices]
badrunsls = []
for br,bls in zip(badruns,badls):
    badrunsls.append((int(br),int(bls)))
print(badrunsls)

# make plots
nplotsmax = 20
from matplotlib.backends.backend_pdf import PdfPages
pdf = PdfPages('test.pdf')
#np.random.shuffle(indices)
for i,index in enumerate(indices):
    if i>=nplotsmax:
        print('maximum number of plots reached')
        break
    print('------------------------')
    msepoint = []
    fig,axs = plt.subplots(2,4,figsize=(24,12))
    for name in histstruct['names']:
        hist = histstruct['hists_all'][name][index:index+1,:]
        reco = histstruct['models'][name].predict(hist)
        mse = histstruct['mse_train'][name][index]
        # NOTE(review): j is not set by this loop (which iterates over name);
        # it only has a value if an earlier cell left one behind
        runnb = histstruct[j]['runnbs_all'][index]
        lsnb = histstruct[j]['lsnbs_all'][index]
        msepoint.append(mse)
        #fig,ax = plt.subplots()
        CheckPredictions(hist[0],reco[0],mse,0,0,histstruct[j]['name']) 
        #plot_sets([hist,reco,histstruct[j]['X_test_good']],
        #          ax=axs[int(j/4),j%4],
        #          title=histstruct[j]['name'],
        #          colorlist=['black','red','blue'],labellist=['hist (run: '+str(int(runnb))+', ls: '+str(int(lsnb))+')','reco','good hists'],
        #          transparencylist=[1.,1.,0.3])
        # additional prints
        #print('mse (this histogram): '+str(mse))
        #print('mse (average good): '+str(np.average(histstruct[j]['mse_test_good'])))
    #fig.show()
    # note: nothing in this cell draws on axs, since the plot_sets call above is commented out
    pdf.savefig(fig)
    # only for 2 dimensions: extra contour plot
    if nhisttypes != 2: continue
    fig,ax = plt.subplots()
    contourplot = ax.contourf(x, y, np.log(fitfunc.pdfgrid(pos)))
    plt.colorbar(contourplot)
    ax.plot(msepoint[0],msepoint[1],'.k',markersize=10)
    ax.set_xlim((0.,xlim))
    ax.set_ylim((0.,ylim))
# not reached if an exception is raised above; the pdf file then stays open
pdf.close()