**Train and test an autoencoder for a particular type of histogram**  
At this stage all of the available data (per year) is used to train the autoencoder.  
For the case where only a small subset of the data is used for training, see autoencoder_iterative.

In [None]:
### imports

# external modules
import sys
#import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
from keras import backend as K
import tensorflow as tf
import importlib

# local modules
sys.path.append('utils')
import csv_utils as csvu
import dataframe_utils as dfu
import hist_utils as hu
import clustering_utils as cu
import autoencoder_utils as aeu
import plot_utils as pu
import generate_data_utils as gdu
importlib.reload(csvu)
importlib.reload(dfu)
importlib.reload(hu)
importlib.reload(cu)
importlib.reload(aeu)
importlib.reload(pu)
importlib.reload(gdu)

In [None]:
### read and select data

# histogram type to study; it doubles as the stem of the csv file name
histtype = 'DF2017_chargeInner_PXLayer_2'

# load the csv file for this histogram type and report the shape of the histogram array
df = csvu.read_csv('data/'+histtype+'.csv')
rawshape = dfu.get_hist_values(df)[0].shape
print('raw input data shape: {}'.format( rawshape ))

# apply the dcs-on selection and report the shape of the histogram array again
df = dfu.select_dcson(df)
selectedshape = dfu.get_hist_values(df)[0].shape
print('input data shape: {}'.format( selectedshape ))

In [None]:
### filter step 1: filter out low statistics

# keep only what dfu.select_highstat returns for an entries-to-bins ratio of 1000
# (presumably the lumisections with at least 1000 entries per bin on average - confirm in dataframe_utils)
df = dfu.select_highstat(df,entries_to_bins_ratio=1000)
# df is overwritten, so all following cells work on the filtered dataframe
print('number of passing lumisections: {}'.format( len(df) ))

In [None]:
### filter step 2: filter out clearly anomalous data based on distance in moment space
# at this point this is a little deprecated, however, maybe re-introduce later on!

# current implementation is still experimental and probably not optimal (in particular it is slow),
# and it does not work properly yet.
# NOTE(review): get_runs, select_runs, get_hist_values, histmoments, gaussiankde and select_runsls
# are called below without a module prefix, while the import cell only binds the module aliases
# (csvu, dfu, hu, cu, aeu, pu, gdu). Unless these names are defined elsewhere, enabling this cell
# raises a NameError. dfu.get_hist_values and dfu.select_runsls are used in other cells; the home
# module of the remaining helpers still has to be confirmed.

# switch to disable this cell; while True nothing below is executed and dfpass is not set here
skipthiscell = True

if not skipthiscell:
    
    nprevious = 5 # number of previous runs to compare with
    runlist = get_runs(df)
    threshold = -100 # a lumisection passes if the log of its kde pdf value is above this
    selector = [] # collects (run, passing lumisections) tuples that are handed to select_runsls

    # settings for moments
    orders = [1,2] # moment orders handed to histmoments
    xmin = 0.
    xmax = 1.
    nbins = df.at[0,'Xbins']
    binwidth = (xmax-xmin)/nbins
    # centers of nbins equal-width bins on [xmin,xmax]
    bins = np.linspace(xmin+binwidth/2,xmax-binwidth/2,num=nbins,endpoint=True)

    for i,run in enumerate(runlist):
        #print('now investigating run '+str(run))
        # the first nprevious runs do not have enough predecessors and are skipped,
        # so they are never added to selector
        if i < nprevious: continue
        # get this run
        thisdf = select_runs(df,[run])
        (thishists,_,thisls) = get_hist_values(thisdf)
        # drop the first and last column (presumably under- and overflow bins - confirm)
        thishists = thishists[:,1:-1]
        thismoments = histmoments(bins,thishists,orders)
        # get nprevious runs
        pruns = select_runs(df,runlist[i-nprevious:i])
        (phists,_,_) = get_hist_values(pruns)
        phists = phists[:,1:-1]
        pmoments = histmoments(bins,phists,orders)
        # fit kde
        fitfunc = gaussiankde(pmoments,bw='scott')
        # evaluate on this run
        thislogprob = np.log(fitfunc.pdf(thismoments))
        # strict comparison: a value exactly equal to threshold is neither passing nor counted as rejected
        passingls = thisls[np.where(thislogprob>threshold)]
        # runs without any passing lumisection are left out of selector altogether
        if len(passingls)>0: selector.append((run,passingls))
        # print some info
        # nrejected is only used by the commented-out print below
        nrejected = len(np.asarray(thislogprob<threshold).nonzero()[0])
        #print('{} out of {} ls were rejected'.format(nrejected,len(thislogprob)))

    # restrict the dataframe to the collected (run, lumisections) pairs
    dfpass = select_runsls(df,selector)
    print('number of passing lumisections: '+str(len(dfpass)))

In [None]:
### alternative if not using moment method
# use the dataframe as it is after filter step 1; dfpass is the name the preprocessing cell reads.
# note: running this cell also overwrites any dfpass produced by the moment method cell.
dfpass = df

In [None]:
### preprocessing of the data: rebinning and normalizing

# rebinning factor handed to preparedatafromdf (1 presumably means no rebinning - confirm in hist_utils)
rebinningfactor = 1

# build the training array from the selected dataframe; doplot=True is passed on to the helper
X_train = hu.preparedatafromdf(dfpass,rebinningfactor=rebinningfactor,doplot=True)
# X_train is expected to be 2D: one row per training histogram, one column per bin
(ntrain,nbins) = X_train.shape
print('Size of training set: '+str(X_train.shape))

In [None]:
### build the model and train it, or load an already saved model

# choose whether to train new model or load one
trainnew = True
# when training: whether to write the trained model to disk afterwards
savemodel = False
# file name of the model, used both for saving and for loading
# NOTE(review): the name advertises 40 epochs while the fit below runs 2 epochs - confirm which is intended
modelname = histtype+'_dcson_40epochs.h5'

# imports
#import math
#from keras.callbacks import ModelCheckpoint, EarlyStopping
#from keras.layers import Input, Dense
#from keras.layers.advanced_activations import PReLU
#from keras.models import Model, load_model
from keras.models import load_model

# case 1: train new model
if trainnew:

    # the network input has one node per histogram bin
    input_size = X_train.shape[1]
    # a single hidden layer with half as many nodes as there are bins
    arch = [int(input_size/2.)]
    # one activation function per hidden layer
    act = ['tanh']*len(arch)
    opt = 'adam'
    loss = aeu.mseTop10
    autoencoder = aeu.getautoencoder(input_size,arch,act,opt,loss)

    # autoencoder training: the input is also the target; 10% of the data is held out for validation
    history = autoencoder.fit(X_train, X_train, epochs=2, batch_size=500, shuffle=False, verbose=1, validation_split=0.1)
    pu.plot_loss(history, title = 'model loss')
    # NOTE(review): the model is saved in the working directory but loaded from 'models/' below,
    # so a freshly saved model has to be moved there before it can be loaded again
    if savemodel: autoencoder.save(modelname)

# case 2: load existing model
else:
    # the custom loss must be handed to keras under the name it was saved with;
    # it lives in autoencoder_utils, so it has to be referenced through the aeu alias
    # (the bare name mseTop10 is not defined in this notebook and raised a NameError)
    autoencoder = load_model('models/'+modelname,custom_objects={'mseTop10': aeu.mseTop10})

In [None]:
### evaluate the model on the training set

# reconstruction of every training histogram by the autoencoder
predictionTrain = autoencoder.predict(X_train)
# per-histogram error between data and reconstruction
# NOTE(review): the test cells use aeu.mseTopNRaw(..., n=10) instead; presumably equivalent - confirm in autoencoder_utils
mseTrain = aeu.mseTop10Raw(X_train, predictionTrain)

In [None]:
### plot the global MSE trend

# first call: default settings (presumably makes the plot)
pu.plot_mse(mseTrain)
# second call: plotting disabled, only used to retrieve the mean and std that the next cell needs
(mean,std) = pu.plot_mse(mseTrain,doplot=False)
print('mean mse: {}'.format(mean))
print('std mse: {}'.format(std))

In [None]:
### impose a mse upper boundary and plot random examples of passing and failing histograms
# note: at this point, only the training set (usually golden json) is considered!
# for a test set: see cell below.

# threshold on the mse: mean plus three standard deviations of the training set mse
cutvalue = mean + 3*std
#cutvalue = 4.73e-6
print('The mse threshold is: '+str(cutvalue))
# indices (into X_train) of the histograms below and above the threshold
goodindices = np.arange(0,len(mseTrain))[mseTrain<cutvalue]
badindices = np.arange(0,len(mseTrain))[mseTrain>cutvalue]

print('Number of passing histograms: '+str(len(goodindices)))
print('Number of failing histograms: '+str(len(badindices)))

# maximum number of examples to plot per category
nplot = 20
labellist = ['data','reconstruction']
colorlist = ['black','blue']
for (title,indices) in [('examples of good histograms and reconstruction:',goodindices),
                        ('examples of bad histograms and reconstruction:',badindices)]:
    print(title)
    # np.random.choice with replace=False raises a ValueError if more samples are requested
    # than there are candidates (likely for the failing set), so never ask for more than available
    nshow = min(nplot,len(indices))
    # nothing to draw from; skip explicitly since older numpy versions reject an empty population
    if nshow==0: continue
    randint = np.random.choice(indices,size=nshow,replace=False)
    for i in randint:
        histlist = [X_train[int(i),:],predictionTrain[int(i),:]]
        pu.plot_hists(histlist,colorlist=colorlist,labellist=labellist)

In [None]:
### get a test set and evaluate the model

# runs and lumisection ranges that make up the test set labeled "good"
# ([[-1]] presumably selects all lumisections of a run - confirm in dataframe_utils.select_runsls)
goodrunsls = { "297056":[[-1]],
                "297177":[[-1]],
                "301449":[[-1]] 
             }
# runs and lumisection ranges that make up the test set labeled "bad"
badrunsls = {
                #"297048":[[-1]],
                #"297282":[[-1]],
                #"297283":[[-1]],
                #"297284":[[-1]],
                "297287":[[-1]],
                #"297288":[[-1]],
                #"297289":[[-1]],
                #"299316":[[-1]],
                #"299317":[[-1]],
                #"299318":[[-1]],
                #"299324":[[-1]],
                #"299326":[[-1]],
                #"301086":[[88,126]],
                #"301086":[[89,89]],
                #"303948":[[1710,1710]],
            }
# re-read the csv file so the test set is not affected by the filters applied to the training set
# (only the dcs-on selection is repeated); note that this overwrites df
df = csvu.read_csv('data/'+histtype+'.csv')
df = dfu.select_dcson(df)
# use the same rebinning factor as for the training set, otherwise the test histograms
# no longer match the input size of the network once the factor is changed
X_test_good = hu.preparedatafromdf( dfu.select_runsls(df,goodrunsls), rebinningfactor=rebinningfactor )
X_test_bad = hu.preparedatafromdf( dfu.select_runsls(df,badrunsls), rebinningfactor=rebinningfactor )

pu.plot_sets([X_test_good,X_test_bad],colorlist=['b','r'],
             labellist=['Histograms in test set labeled "good"','Histograms in test set labeled "bad"'])

# reconstruct both test sets and compute the per-histogram error
prediction_test_good = autoencoder.predict(X_test_good)
mse_test_good = aeu.mseTopNRaw(X_test_good, prediction_test_good, n=10 )
prediction_test_bad = autoencoder.predict(X_test_bad)
mse_test_bad = aeu.mseTopNRaw(X_test_bad, prediction_test_bad, n=10 )

print('average mse on good set: '+str(np.mean(mse_test_good)))
print('average mse on bad set: '+str(np.mean(mse_test_bad)))

# maximum number of examples to plot per test set
nplot = 10
labellist = ['data','reconstruction']
colorlist = ['black','blue']
for (title,hists,predictions) in [
        ('examples of good histograms and reconstruction:',X_test_good,prediction_test_good),
        ('examples of bad histograms and reconstruction:',X_test_bad,prediction_test_bad)]:
    print(title)
    # np.random.choice with replace=False raises a ValueError if more samples are requested
    # than there are histograms, so never ask for more than available
    nshow = min(nplot,len(hists))
    # nothing to draw from; skip explicitly since older numpy versions reject an empty population
    if nshow==0: continue
    randint = np.random.choice(np.arange(len(hists)),size=nshow,replace=False)
    for i in randint:
        histlist = [hists[int(i),:],predictions[int(i),:]]
        pu.plot_hists(histlist,colorlist=colorlist,labellist=labellist)

In [None]:
### use artificial data to assess the model performance

# generate artificial histograms from both labeled test sets with gdu.fourier_noise;
# the two calls share all settings except the number of resamples
noisesettings = dict(nonnegative=True,stdfactor=15.,figname='f')
goodhists = gdu.fourier_noise(X_test_good,nresamples=60,**noisesettings)
badhists = gdu.fourier_noise(X_test_bad,nresamples=2400,**noisesettings)
ngood = len(goodhists)
nbad = len(badhists)
print('number of good histograms: '+str(ngood))
print('number of bad histograms: '+str(nbad))

# stack both sets into one validation set; label 0 marks good and label 1 marks bad histograms
validation_data = np.vstack((goodhists,badhists))
labels = np.hstack((np.zeros(ngood),np.ones(nbad)))
# reconstruct the validation set and compute the per-histogram error
prediction = autoencoder.predict(validation_data)
mse = aeu.mseTopNRaw(validation_data, prediction, n=10 )
print('examples of artificial histograms and reconstruction:')
# apply one common random ordering to the data, labels, reconstructions and errors
shuffled_indices = np.arange(len(validation_data))
np.random.shuffle(shuffled_indices)
(validation_data,labels,prediction,mse) = [
    array[shuffled_indices] for array in (validation_data,labels,prediction,mse) ]

# distribution of output scores
pu.plot_score_dist(mse,labels,nbins=200,normalize=True)
# smallest error among the bad and largest error among the good histograms
print(np.amin(mse[labels==1]))
print(np.amax(mse[labels==0]))
# classical ROC curve: signal efficiency (good data marked as good) vs background efficiency (bad data marked as good)
auc = aeu.get_roc(mse, labels)

In [None]:
### continuation of previous cell: choose wp and plot confusion matrix

# working point: the mse value handed to the confusion matrix helper as threshold
msewp = 0.9e-3
# uses validation_data, labels and prediction from the previous cell
aeu.get_confusion_matrix_from_hists(validation_data,labels,prediction,msewp)

**Cells below are deprecated and not used anymore**  
No guarantee that they will still run or that any useful conclusions can be drawn from them.

In [None]:
### use the function above to train a network for different types 
### of histograms and compare the output
# part 1: process all histograms
# function definition has been moved to autoencoder_utils.py!
# it is therefore called through the aeu alias; the bare name fit_autoencoder
# is not defined in this notebook and raised a NameError.

# one call per histogram type; the next cell expects each result to have a 'passencoder' column
dfratio = aeu.fit_autoencoder('DF2018_MainDiagonal_Position.csv')
dfpixel = aeu.fit_autoencoder('DF2018_NumberOfClustersInPixel.csv',rebinningfactor=8)
dfstrip = aeu.fit_autoencoder('DF2018_NumberOfClustersInStrip.csv',rebinningfactor=10)

In [None]:
### use the function above to train a network for different types 
### of histograms and compare the output
# part 2: check consistency

# the three dataframes produced in part 1, in the order in which they are reported
frames = [dfratio,dfpixel,dfstrip]

# sum of the 'passencoder' column per histogram type
for frame in frames:
    print(np.sum(np.array(frame['passencoder'])))
print('----')

# per row: the number of histogram types (0 to 3) for which 'passencoder' is set
npass = np.zeros(len(dfratio))
for i in range(len(dfratio)):
    npass[i] = sum(1 for frame in frames if frame.at[i,'passencoder'])
# number of rows passing for exactly 3, 2, 1 and 0 histogram types
for ntypes in (3,2,1,0):
    print(np.sum(np.where(npass==ntypes,1,0)))
print('----')

# per pair of histogram types: number of rows where the two 'passencoder' values add up to 2
for (first,second) in [(dfratio,dfpixel),(dfratio,dfstrip),(dfpixel,dfstrip)]:
    pairsum = np.array(first['passencoder'])+np.array(second['passencoder'])
    print(np.sum(np.where(pairsum==2,1,0)))

In [None]:
### another idea: train a self-consistent autoencoder
# the autoencoder is trained repeatedly; after each training the instances with the largest
# reconstruction error are removed, until a training round removes at most 100 instances.
# NOTE(review): read_csv, get_hist_values, select_golden, filteranomalous, rebin2d,
# train_autoencoder and globalMSETrend are called without a module prefix, while the import cell
# only binds module aliases. Unless these names are defined elsewhere this cell raises a NameError.

datafilename = 'DF2017_MainDiagonal_Position.csv'
df = read_csv(datafilename)
print('raw input data shape: '+str(get_hist_values(df)[0].shape))
df = select_golden(df)
print('golden input data shape: '+str(get_hist_values(df)[0].shape))
# whether to pre-filter with the moment method before training
domoment = True
if domoment:
    (df,dfpass,_,_,_) = filteranomalous(df,rmlargest=0.005,doplot=True)
    # filteranomalous is expected to add a 'passmomentmethod' column to df
    momentmask = np.array(df['passmomentmethod'])
    # positions in df of the instances that passed the moment method (a tuple, as returned by np.nonzero)
    momentinds = np.nonzero(momentmask)
else:
    dfpass = df
    momentinds = np.arange(0,len(df))

print('number of passing lumisections: '+str(len(dfpass)))

rebinningfactor = 1

(histpass,_,_) = get_hist_values(dfpass)
# drop the first and last column (presumably under- and overflow bins - confirm)
histpass = histpass[:,1:-1]
print('histograms shape: '+str(histpass.shape))
rhistpass = rebin2d(histpass,rebinningfactor)

from sklearn.preprocessing import normalize
rhistpass = normalize(rhistpass, norm='l1', axis=1) #normalise the sample, i.e the rows

# start value that is large enough to enter the loop below
nremoved = 1e10
# one flag per row of df: 1 while the instance is still in the training sample, 0 once removed
encodermask = np.zeros(len(df))
encodermask[momentinds] = 1
while(nremoved>100):
    ninit = len(rhistpass)
    print('Starting network training for '+str(ninit)+' instances.')
    (_,mse) = train_autoencoder(rhistpass,datafilename)
    (gmean,gstd) = globalMSETrend(mse)
    # instances with an error of at least the mean plus five standard deviations are removed
    cutvalue = gmean+5*gstd
    # update only the flags that are still 1; this relies on mse holding exactly one value per
    # remaining instance, in the same order as the rows of df (assumption - confirm)
    encodermask[encodermask==1] = np.where(mse<cutvalue,1,0)
    rhistpass = rhistpass[mse<cutvalue]
    print('Network training finished, '+str(len(rhistpass))+' out of '+str(ninit)+' instances passed.')
    nremoved = ninit - len(rhistpass)
# store the final flags in the dataframe
df['passencoder'] = encodermask

In [None]:
### check histograms resulting from cell above
# NOTE(review): plt is used below, but the matplotlib import in the import cell is commented out,
# and get_hist_values is called without a module prefix; unless both names are defined elsewhere
# this cell raises a NameError.
# NOTE(review): np.random.choice with replace=False raises a ValueError if fewer than nplot
# histograms are available in a category.

# number of random examples to plot per category
nplot = 100
# instances that passed the autoencoder selection, plotted in blue
temp = df[df['passencoder']==1]
temp.reset_index(drop=True,inplace=True)
(ghists,_,_) = get_hist_values(temp)
print(ghists.shape)
randint = np.random.choice(np.arange(0,len(ghists)),size=nplot,replace=False)
for i in randint:
    plt.figure()
    plt.plot(ghists[i,:],color='b')

# instances that did not pass the autoencoder selection, plotted in red
temp = df[df['passencoder']==0]
temp.reset_index(drop=True,inplace=True)
(bhists,_,_) = get_hist_values(temp)
print(bhists.shape)
randint = np.random.choice(np.arange(0,len(bhists)),size=nplot,replace=False)
for i in randint:
    plt.figure()
    plt.plot(bhists[i,:],color='r')