**Train and test an autoencoder for a particular type of histogram**  
At this stage all of the available data (per year) is used to train the autoencoder.  
For the case where only a small subset of the data is used for training, see autoencoder_iterative.

In [None]:
### imports

# external modules
import sys
import numpy as np
import matplotlib.pyplot as plt
from keras import backend as K
import tensorflow as tf
import importlib

# local modules
sys.path.append('../utils')
import csv_utils as csvu
import dataframe_utils as dfu
import hist_utils as hu
import autoencoder_utils as aeu
import plot_utils as pu
import generate_data_utils as gdu
# re-execute the local utility modules so that the versions currently on disk are used
# (useful when the files in ../utils are edited while the notebook kernel keeps running)
importlib.reload(csvu)
importlib.reload(dfu)
importlib.reload(hu)
importlib.reload(aeu)
importlib.reload(pu)
importlib.reload(gdu)

In [None]:
### read and select data

# histogram type to study; it also determines the name of the input csv file
histtype = 'DF2017_chargeInner_PXLayer_2'
df = csvu.read_csv('../data/{}.csv'.format(histtype))

# report the shape of the first object returned by get_hist_values
rawshape = dfu.get_hist_values(df)[0].shape
print('raw input data shape: ' + str(rawshape))

In [None]:
### filtering: keep only DCS-on data and filter out low statistics

# the two selections are applied one after the other;
# the number of surviving entries is reported after each step

df = dfu.select_dcson(df)
print('number of passing lumisections after DCS selection: %d' % len(df))

df = dfu.select_highstat(df, entries_to_bins_ratio=100)
print('number of passing lumisections after high statistics selection: %d' % len(df))

In [None]:
### preprocessing of the data: rebinning and normalizing

# rebinning factor handed to preparedatafromdf
# (1 presumably means no rebinning -- TODO confirm in hist_utils)
rebinningfactor = 1

X_train = hu.preparedatafromdf(
    df,
    rebinningfactor=rebinningfactor,
    donormalize=True,
    doplot=True,
)

# the result is unpacked as a 2D array shape: (number of histograms, number of bins)
ntrain, nbins = X_train.shape
print('size of training set: {}'.format(X_train.shape))

In [None]:
### build the model and train it, or load an already saved model

# imports
import os
from keras.models import load_model

# choose whether to train new model or load one
trainnew = True
savemodel = False
# NOTE(review): the file name mentions 40 epochs while the fit below runs 20 epochs;
# the name is kept unchanged so that previously saved models are still found.
modelname = histtype+'_dcson_40epochs.h5'
# one single location for both saving and loading, so that a model stored with
# savemodel = True can be read back later by setting trainnew = False
modelpath = '../models/'+modelname

# case 1: train new model
if trainnew:

    # settings handed to aeu.getautoencoder: a one-element architecture list
    # (half the number of input bins), one 'tanh' activation per element of arch,
    # the adam optimizer and the custom loss aeu.mseTop10
    input_size = X_train.shape[1]
    arch = [input_size//2]
    act = ['tanh']*len(arch)
    opt = 'adam'
    loss = aeu.mseTop10
    autoencoder = aeu.getautoencoder(input_size,arch,act,opt,loss)

    # the input histograms are also the training target;
    # 10% of the samples are held out for validation (validation_split)
    history = autoencoder.fit(X_train, X_train, epochs=20, batch_size=500, shuffle=False, verbose=1, validation_split=0.1)
    pu.plot_loss(history, title = 'model loss')
    if savemodel:
        # make sure the target directory exists before writing the model file
        os.makedirs(os.path.dirname(modelpath), exist_ok=True)
        autoencoder.save(modelpath)

# case 2: load existing model
else:
    # the custom loss must be supplied explicitly under the name it was saved with
    autoencoder = load_model(modelpath,custom_objects={'mseTop10': aeu.mseTop10})

In [None]:
### evaluate the model on the training set

# run the autoencoder on the same histograms it was trained on
predictionTrain = autoencoder.predict(X_train)
# reconstruction error between input and prediction as computed by aeu.mseTop10Raw
# (presumably one value per histogram, based on the 10 largest per-bin errors -- TODO confirm in autoencoder_utils)
mseTrain = aeu.mseTop10Raw(X_train, predictionTrain)

In [None]:
### plot the global MSE trend

# first call: make the plot (return value not used)
pu.plot_mse(mseTrain, rmlargest=0.005)
# second call: plotting switched off, only the returned (mean, std) pair is kept
mean, std = pu.plot_mse(mseTrain, doplot=False, rmlargest=0.005)
print('mean mse: {}\nstd mse: {}'.format(mean, std))

In [None]:
### impose a mse upper boundary and plot random examples of passing and failing histograms
# note: at this point, only the training set is considered!
# for a test set: see cell below.

cutvalue = mean + 3*std
#cutvalue = 4.73e-6
print('The mse threshold is: '+str(cutvalue))

# a histogram passes if its mse is strictly below the threshold; everything else
# (including a value exactly at the threshold or a NaN) is counted as failing,
# so that every histogram ends up in exactly one of the two groups
passmask = (mseTrain<cutvalue)
allindices = np.arange(0,len(mseTrain))
goodindices = allindices[passmask]
badindices = allindices[~passmask]

print('Number of passing histograms: '+str(len(goodindices)))
print('Number of failing histograms: '+str(len(badindices)))

# maximum number of random examples to plot per group
nplot = 5
for (grouplabel,groupindices) in [('good',goodindices),('bad',badindices)]:
    print('examples of {} histograms and reconstruction:'.format(grouplabel))
    # never request more examples than available: np.random.choice with
    # replace=False raises a ValueError if size exceeds the number of candidates
    nshow = min(nplot,len(groupindices))
    if nshow==0: continue
    randint = np.random.choice(groupindices,size=nshow,replace=False)
    for i in randint:
        # overlay the input histogram and its reconstruction
        histlist = [X_train[int(i),:],predictionTrain[int(i),:]]
        pu.plot_hists(histlist,colorlist=['black','blue'],labellist=['data','reconstruction'])
        plt.show()

In [None]:
### get a test set and evaluate the model

# run / lumisection selections handed to dfu.select_runsls:
# keys are run numbers (as strings), values are lists of lumisection ranges
# ([[-1]] presumably selects the full run -- TODO confirm in dataframe_utils)
goodrunsls = { "297056":[[-1]],
                "297177":[[-1]],
                "301449":[[-1]] 
             }
badrunsls = {
                #"297048":[[-1]],
                #"297282":[[-1]],
                #"297283":[[-1]],
                #"297284":[[-1]],
                "297287":[[-1]],
                #"297288":[[-1]],
                #"297289":[[-1]],
                #"299316":[[-1]],
                #"299317":[[-1]],
                #"299318":[[-1]],
                #"299324":[[-1]],
                #"299326":[[-1]],
                #"301086":[[88,126]],
                #"301086":[[89,89]],
                #"303948":[[1710,1710]],
            }

# re-read the input file: the test set gets the DCS selection only,
# the high statistics selection used for the training set is not applied here
df = csvu.read_csv('../data/'+histtype+'.csv')
df = dfu.select_dcson(df)
# use the same rebinning as for the training set, so that the test histograms
# have the number of bins the autoencoder expects
X_test_good = hu.preparedatafromdf( dfu.select_runsls(df,goodrunsls),rebinningfactor=rebinningfactor,donormalize=True )
X_test_bad = hu.preparedatafromdf( dfu.select_runsls(df,badrunsls),rebinningfactor=rebinningfactor,donormalize=True )

pu.plot_sets([X_test_good,X_test_bad],colorlist=['b','r'],
             labellist=['Histograms in test set labeled "good"','Histograms in test set labeled "bad"'])

# reconstruct both test sets and compute the reconstruction error for each
prediction_test_good = autoencoder.predict(X_test_good)
mse_test_good = aeu.mseTopNRaw(X_test_good, prediction_test_good, n=10 )
prediction_test_bad = autoencoder.predict(X_test_bad)
mse_test_bad = aeu.mseTopNRaw(X_test_bad, prediction_test_bad, n=10 )

print('average mse on good set: '+str(np.mean(mse_test_good)))
print('average mse on bad set: '+str(np.mean(mse_test_bad)))

# maximum number of random examples to plot per test set
nplot = 10
testsets = [('good',X_test_good,prediction_test_good),
            ('bad',X_test_bad,prediction_test_bad)]
for (setlabel,X_test,prediction_test) in testsets:
    print('examples of {} histograms and reconstruction:'.format(setlabel))
    # never request more examples than available: np.random.choice with
    # replace=False raises a ValueError if size exceeds the number of candidates
    nshow = min(nplot,len(X_test))
    if nshow==0: continue
    randint = np.random.choice(np.arange(len(X_test)),size=nshow,replace=False)
    for i in randint:
        # overlay the input histogram and its reconstruction
        histlist = [X_test[int(i),:],prediction_test[int(i),:]]
        pu.plot_hists(histlist,colorlist=['black','blue'],labellist=['data','reconstruction'])
        plt.show()

In [None]:
### use artificial data to assess the model performance

# generate artificial histograms starting from the good and the bad test set
# NOTE(review): both calls receive figname='f'; check in generate_data_utils whether
# the second call overwrites a figure written by the first one.
goodhists = gdu.fourier_noise(X_test_good,nresamples=60,nonnegative=True,stdfactor=15.,figname='f')
badhists = gdu.fourier_noise(X_test_bad,nresamples=2400,nonnegative=True,stdfactor=15.,figname='f')
print('number of good histograms: '+str(len(goodhists)))
print('number of bad histograms: '+str(len(badhists)))

# stack both sets into one validation set; label convention: 0 = good, 1 = bad
validation_data = np.vstack((goodhists,badhists))
labels = np.hstack((np.zeros(len(goodhists)),np.ones(len(badhists))))
prediction = autoencoder.predict(validation_data)
mse = aeu.mseTopNRaw(validation_data, prediction, n=10 )
# NOTE(review): this message announces examples, but no example plots are made in this cell
print('examples of artificial histograms and reconstruction:')

# apply one common random permutation to all arrays so that they stay aligned
# (np.random.permutation avoids assigning the None returned by np.random.shuffle
#  to '_', which would shadow the last-output variable of IPython)
shuffled_indices = np.random.permutation(len(validation_data))
validation_data = validation_data[shuffled_indices]
labels = labels[shuffled_indices]
prediction = prediction[shuffled_indices]
mse = mse[shuffled_indices]

# distribution of output scores
pu.plot_score_dist(mse,labels,nbins=200,normalize=True)
# the gap (or overlap) between these two numbers shows how well a single mse cut can separate both classes
print('minimum mse among bad histograms: '+str(np.amin(mse[labels==1])))
print('maximum mse among good histograms: '+str(np.amax(mse[labels==0])))
# classical ROC curve: signal efficiency (good data marked as good) vs background efficiency (bad data marked as good)
auc = aeu.get_roc(mse, labels)

In [None]:
### continuation of previous cell: choose wp and plot confusion matrix

# working point (wp): the mse value handed to get_confusion_matrix_from_hists
# (presumably histograms with an mse above it are classified as bad -- TODO confirm in autoencoder_utils)
msewp = 1.5e-5
# uses validation_data, labels and prediction as left by the previous cell
aeu.get_confusion_matrix_from_hists(validation_data,labels,prediction,msewp)