**Grouping some useful functions for autoencoder_combine and autoencoder_iterative**  
These utilities mainly consist of a common data structure and functions to fit distributions to the point cloud and plot them

In [None]:
# imports

%run utils/utils.ipynb
%run utils/autoencoder_utils.ipynb
%run utils/generate_data_utils.ipynb
%run utils/clustering_utils.ipynb

In [None]:
### loading the main datastructure combining the information from several histograms
# DEPRECATED data structure, use alternative in cell below!

def make_hist_struct(year,histnames,runs=[],highstat=False):
    """Build the (deprecated) dict-based structure combining several histogram types.

    DEPRECATED: prefer the histstructure class defined further down in this file.

    Arguments:
    - year: data-taking year; converted with str() to build the file name 'data/DF<year>_<histname>.csv'
    - histnames: iterable of histogram names, one csv file is read per name
    - runs: if non-empty, passed to select_runs to restrict each dataframe
    - highstat: if True, the per-histogram selector comes from get_highstat, else from get_runsls

    Returns a dict with keys 'names', 'hists_all', 'hists_golden', 'entries_all' and
    (if at least one histogram type was processed) 'runnbs_all', 'lsnbs_all',
    'runnbs_golden', 'lsnbs_golden'.
    """
    
    dfstruct = {} # temporary structure for dataframes, keyed by file path without '.csv'
    histstruct = {'names':[],'hists_all':{},'hists_golden':{},'entries_all':{}} # store all histogram information in this structure
    selectorlist = []
    for histname in histnames:
        # for each type of histogram, add the full data to the structure 
        # also build additional selectors depending on all histogram types simultaneously 
        # note: str(year) so that both 2017 and '2017' are accepted
        f = 'data/DF'+str(year)+'_'+histname+'.csv'
        print('reading '+f)
        df = read_csv(f)
        # select runs if requested
        if len(runs) > 0:
            df = select_runs(df,runs)
        dfstruct[f.replace('.csv','')] = df
        # build high statistics selector if requested
        if highstat: selectorlist.append(get_highstat(df)) # select high statistics
        else: selectorlist.append(get_runsls(df)) # do no selection

    # make combined selector now that we know selectors for each histogram type individually
    totalselector = get_lcs(selectorlist)

    for i,name in enumerate(dfstruct):
        print('adding '+name)
        histstruct['names'].append(name)
        df_temp = select_runsls(dfstruct[name],totalselector) # apply selector
        df_temp = select_dcson(df_temp) # include only DCS-on json in training
        # determine statistics (must be done before normalizing)
        histstruct['entries_all'][name] = np.array(df_temp['entries'])
        # add the full dataset to the histogram structure
        (hists_all,runnbs_all,lsnbs_all) = preparedatafromdf(df_temp,returnrunls=True,onlygolden=False,rebinningfactor=1)
        histstruct['hists_all'][name] = hists_all
        # add the golden dataset to the histogram structure
        (hists_g,runnbs_g,lsnbs_g) = preparedatafromdf(df_temp,returnrunls=True,onlygolden=True,rebinningfactor=1)
        histstruct['hists_golden'][name] = hists_g
        # if processing first histogram type, add run numbers, lsnumbers and golden indices to histstruct
        if i==0:
            histstruct['runnbs_all'] = runnbs_all
            histstruct['lsnbs_all'] = lsnbs_all
            histstruct['runnbs_golden'] = runnbs_g
            histstruct['lsnbs_golden'] = lsnbs_g
        # else check consistency
        # (np.array_equal is used instead of (a==b).all() so that a length mismatch
        #  results in the warning below rather than in an exception)
        else:
            if not ( np.array_equal(runnbs_all,histstruct['runnbs_all']) 
                     and np.array_equal(lsnbs_all,histstruct['lsnbs_all']) ):
                print('### WARNING ###: incompatible run and lumisection numbers')
            if not ( np.array_equal(runnbs_g,histstruct['runnbs_golden']) 
                     and np.array_equal(lsnbs_g,histstruct['lsnbs_golden']) ):
                print('### WARNING ###: incompatible golden run and lumisection numbers')
    # return data structure
    return histstruct

In [None]:
### loading the main datastructure combining the information from several histograms

class histstructure:
    """Main data structure combining the information from several histogram types.

    Attributes:
    - names: list of histogram names (create() uses the csv file path without '.csv')
    - histograms: dict mapping histogram name to 2D numpy array of histograms
    - entries: dict mapping histogram name to 1D numpy array of number of entries per histogram
    - runnbs: 1D numpy array of run numbers (same length as histograms)
    - lsnbs: 1D numpy array of lumisection numbers (same length as histograms)
    - custom: dict that this class never fills, so the histstruct can be extended at runtime
    """
    
    # class-level defaults, kept only for backward compatibility;
    # __init__ gives every instance its own containers so that these are never mutated
    names = [] # list of histogram names
    histograms = {} # dict mapping histogram name to 2D numpy array of histograms
    entries = {} # dict mapping histogram name to 1D numpy array of number of entries per histogram
    runnbs = [] # 1D numpy array of run numbers (same length as histograms)
    lsnbs = [] # 1D numpy array of lumisection numbers (same length as histograms)
    custom = {} # this dict remains empty, so the histstruct can be extended at runtime
    
    def __init__(self):
        ### create empty per-instance containers
        # without this, writing e.g. inst.custom[key] would modify the dict on the class
        # and therefore leak into every other instance
        self.names = []
        self.histograms = {}
        self.entries = {}
        self.runnbs = []
        self.lsnbs = []
        self.custom = {}
    
    def create(self,year,histnames,runs=[],highstat=False):
        """Fill names, histograms, entries, runnbs and lsnbs from csv files.

        Arguments:
        - year: data-taking year, used to build the file name 'data/DF<year>_<histname>.csv'
        - histnames: iterable of histogram names, one csv file is read per name
        - runs: if non-empty, passed to select_runs to restrict each dataframe
        - highstat: if True, the per-histogram selector comes from get_highstat, else from get_runsls

        Prints a warning if the run/lumisection numbers of a histogram type differ from
        those of the first one. The custom dict is not touched.
        """
        
        dfstruct = {} # temporary structure for dataframes
        self.names = []
        self.histograms = {}
        self.entries = {}
        selectorlist = []
        for histname in histnames:
            histfile = 'data/DF'+str(year)+'_'+histname+'.csv'
            name = histfile.replace('.csv','')
            # for each type of histogram, add the full data to the structure 
            # also build additional selectors depending on all histogram types simultaneously 
            print('reading '+histfile)
            df = read_csv(histfile)
            # select runs if requested
            if len(runs) > 0:
                df = select_runs(df,runs)
            dfstruct[name] = df
            # build high statistics selector if requested
            if highstat: selectorlist.append(get_highstat(df)) # select high statistics
            else: selectorlist.append(get_runsls(df)) # do no selection

        # make combined selector now that we know selectors for each histogram type individually
        totalselector = get_lcs(selectorlist)

        for i,name in enumerate(dfstruct):
            print('adding '+name)
            self.names.append(name)
            df_temp = select_runsls(dfstruct[name],totalselector) # apply selector
            df_temp = select_dcson(df_temp) # include only DCS-on json in training
            # determine statistics (must be done before normalizing)
            self.entries[name] = np.array(df_temp['entries'])
            # add the full dataset to the histogram structure
            (hists_all,runnbs_all,lsnbs_all) = preparedatafromdf(df_temp,returnrunls=True,onlygolden=False,rebinningfactor=1)
            self.histograms[name] = hists_all
            runnbs_all = runnbs_all.astype(int)
            lsnbs_all = lsnbs_all.astype(int)
            # if processing first histogram type, add run numbers and lsnumbers to histstruct
            if i==0:
                self.runnbs = runnbs_all
                self.lsnbs = lsnbs_all
            # else check consistency
            # (np.array_equal is used instead of (a==b).all() so that a length mismatch
            #  results in the warning below rather than in an exception)
            else:
                if not ( np.array_equal(runnbs_all,self.runnbs) and np.array_equal(lsnbs_all,self.lsnbs) ):
                    print('### WARNING ###: incompatible run and lumisection numbers')
    
    def get_golden_mask(self):
        ### return isgolden(runnbs,lsnbs) as a numpy array
        # (presumably one boolean per lumisection; isgolden is defined outside this file)
        mask = np.array(isgolden(self.runnbs,self.lsnbs))
        return mask
    
    def get_golden_indices(self):
        ### return the positions (into runnbs/lsnbs) selected by the golden mask
        indices = np.arange(len(self.runnbs))[self.get_golden_mask()]
        return indices

In [None]:
### functions for fitting a normal-like or gaussian kernel distribution to a point cloud of mse's and making plots

def get_mse_array(histstruct,valkey,dims=[]):
    """Collect per-histogram values into one array with one column per dimension.

    Arguments:
    - histstruct: object with a `names` list and a `custom` dict,
      where custom[valkey] maps each histogram name to its array of values
    - valkey: key into histstruct.custom
    - dims: indices into histstruct.names to use as columns (in that order);
      if empty, all histogram names are used

    Returns the values of the selected histogram types joined along axis 1.
    """
    if len(dims)==0:
        dims = list(range(len(histstruct.names)))

    def column(dim):
        # values of a single histogram type with an extra axis inserted at position 1
        return np.expand_dims( histstruct.custom[valkey][histstruct.names[dim]], axis=1 )

    result = column(dims[0])
    for dim in dims[1:]:
        result = np.concatenate( ( result, column(dim) ), axis=1 )
    return result

def fitseminormal(histstruct,valkey,dims=[],fitnew=True,savefit=False):
    """Fit a seminormal distribution to the point cloud, or load a previously saved fit.

    Arguments:
    - histstruct: object with a `names` list and a `custom` dict (see get_mse_array)
    - valkey: key into histstruct.custom
    - dims: indices into histstruct.names; if empty, all histogram names are used
    - fitnew: if True, make a new fit on the point cloud; else load a fit from file
    - savefit: if True (and fitnew is True), save the new fit to file

    The file name is 'seminormal_fit_<name>_<name>..._.npy'-style: the histogram names
    of the requested dims joined with '_'. For loading, '2018' is replaced by '2017'
    in each name (presumably to apply a fit made on 2017 data to 2018 data - to be confirmed).

    Returns the seminormal object.
    """
    coords = get_mse_array(histstruct,valkey,dims=dims)
    # resolve the histogram names belonging to the requested dimensions;
    # these determine the file name (previously undefined variables were used here)
    if len(dims)==0:
        dims = list(range(len(histstruct.names)))
    fitnames = [histstruct.names[dim] for dim in dims]
    if fitnew:
        fitfunc = seminormal(coords)
        if savefit:
            fitfunc.save('seminormal_fit_'+'_'.join(fitnames)+'.npy')
    else:
        fitfunc = seminormal()
        loadnames = [fitname.replace('2018','2017') for fitname in fitnames]
        fitfunc.load('seminormal_fit_'+'_'.join(loadnames)+'.npy')
    
    return fitfunc

def fitgaussiankde(histstruct,valkey,dims=[],maxnpoints=-1):
    """Build a gaussiankde object from the point cloud, optionally on a random subset.

    Arguments:
    - histstruct, valkey, dims: forwarded to get_mse_array to obtain the points
    - maxnpoints: if positive and smaller than the number of points, only this many
      points (drawn randomly without replacement) are passed to gaussiankde

    Returns the gaussiankde object.
    """
    points = get_mse_array(histstruct,valkey,dims=dims)
    npoints = len(points)
    if 0 < maxnpoints < npoints:
        # random subset of row indices, each row used at most once
        keep = np.random.choice(np.arange(npoints),size=maxnpoints,replace=False)
        points = points[keep]
    return gaussiankde(points)

def plotfit2d(histstruct,valkey,dims,fitfunc,doinitialplot=True,onlycontour=False,rangestd=30):
    """Plot log-probability contours of a 2D fit, optionally overlaid with the data points.

    Arguments:
    - histstruct: object with a `names` list and a `custom` dict,
      where custom[valkey] maps each histogram name to the values to plot
    - valkey: key into histstruct.custom
    - dims: indices into histstruct.names; dims[0] is drawn on x, dims[1] on y
    - fitfunc: object providing `cov` (read at [0,0] and [1,1]) and `pdfgrid(pos)`
    - doinitialplot: if True, first make a separate scatter plot of the points
    - onlycontour: if True, do not draw the points on top of the contours
    - rangestd: both axes run from 0 to rangestd times the square root
      of the corresponding diagonal element of fitfunc.cov

    Returns (fig,ax) of the contour plot.
    """
    
    name_x = histstruct.names[dims[0]]
    name_y = histstruct.names[dims[1]]
    points_x = histstruct.custom[valkey][name_x]
    points_y = histstruct.custom[valkey][name_y]
    
    if doinitialplot:
        # stand-alone scatter plot of the data points (this figure is not returned)
        _,scatterax = plt.subplots()
        scatterax.plot(points_x,points_y,'.',markersize=1)
        plt.xticks(rotation=90)
        scatterax.set_xlabel(name_x+' MSE')
        scatterax.set_ylabel(name_y+' MSE')
        
    # upper axis limits: a fixed number of standard deviations of the fit
    upper_x = rangestd*np.sqrt(fitfunc.cov[0,0])
    upper_y = rangestd*np.sqrt(fitfunc.cov[1,1])
    
    # evaluation grid with a step of 1/100 of the axis range in each direction
    grid_x,grid_y = np.mgrid[0.:upper_x:upper_x/100.,
                             0.:upper_y:upper_y/100.]
    pos = np.dstack((grid_x, grid_y))

    # new figure: filled contours of the log of the fitted pdf with a colorbar
    fig,ax = plt.subplots()
    logpdf = np.log(fitfunc.pdfgrid(pos))
    filled = ax.contourf(grid_x, grid_y, logpdf, 30)
    plt.colorbar(filled)
    # overlay the data points unless only the contours were requested
    if not onlycontour:
        ax.plot(points_x,points_y,'.b',markersize=2)
    ax.set_xlim((0.,upper_x))
    ax.set_ylim((0.,upper_y))
    ax.set_xlabel(name_x+' MSE')
    ax.set_ylabel(name_y+' MSE')
    ax.ticklabel_format(axis='both', style='sci', scilimits=(0,0))
    
    return (fig,ax)

In [None]:
# wrapper for fourier_noise allowing for a fixed target number of histograms instead of a fixed resampling factor

def upsample_hist_set(hists,ntarget,fourierstdfactor=15.,figname='f'):
    """Call fourier_noise with a resampling factor derived from a target number of histograms.

    Arguments:
    - hists: input set of histograms (its len() is used as the current number)
    - ntarget: target number of histograms
    - fourierstdfactor: passed to fourier_noise as stdfactor
    - figname: passed to fourier_noise as figname

    Returns the output of fourier_noise.
    """
    # resampling factor: ratio of target to current number, truncated to an integer
    factor = int(float(ntarget)/len(hists))
    return fourier_noise(hists,figname=figname,nresamples=factor,nonnegative=True,stdfactor=fourierstdfactor)