**Two-dimensional histogram classification based on non-negative matrix factorization**  
Very preliminary version, intended as a first check / proof of principle.  
To do: pay more attention to the definition of the training and testing set(s) and to the choice of hyperparameters.

In [None]:
### imports

# external modules
import sys
import numpy as np
import matplotlib.pyplot as plt
import importlib

# local modules
# (the relative directory is appended to the module search path so the imports below can be resolved;
#  being relative, it is interpreted with respect to the current working directory)
sys.path.append('../utils')
import csv_utils as csvu
import dataframe_utils as dfu
import plot_utils as pu
import hist_utils as hu
import generate_data_2d_utils as g2u
# re-execute the local modules so that edits made to them since the first import
# are picked up when this cell is re-run, without restarting the session
importlib.reload(csvu)
importlib.reload(dfu)
importlib.reload(pu)
importlib.reload(hu)
importlib.reload(g2u)
# the classifier module lives in a different directory; same append / import / reload pattern
sys.path.append('../src/classifiers')
import NMFClassifier
importlib.reload(NMFClassifier)

In [None]:
### load the histograms

# note: you might need to change the path or file name, depending on where you have a valid csv file stored!


def _read_dcson(csvpath):
    """Read a histogram csv file and pass the result through dfu.select_dcson."""
    return dfu.select_dcson(csvu.read_csv(csvpath))


def _prepare(df, **extra):
    """Call hu.preparedatafromdf with the settings shared by all data sets in this cell.

    The crop slices, rebinning factor, normalization and plotting flags are
    the same for the reference, good and bad sets; they are built fresh on
    every call. Any additional keyword arguments (e.g. returnrunls=True) are
    forwarded unchanged, and the return value of hu.preparedatafromdf is
    returned as-is.
    """
    return hu.preparedatafromdf(
        df,
        cropslices=[slice(1, -1, None), slice(81, 221, None)],
        rebinningfactor=(2, 2),
        donormalize=True,
        doplot=False,
        **extra
    )


# reference set: the histograms handed to the NMF classifier further down
dffile = '../data/DF2017B_clusterposition_zphi_ontrack_PXLayer_1_subset.csv'
histdf = _read_dcson(dffile)
hists_ref = _prepare(histdf)
_ = pu.plot_hists_2d(hists_ref[:4], ncols=4, title='some example histograms for NMF model')
print('number of lumisections: ' + str(len(histdf)))

# good test set (single run); run and lumisection numbers are kept for the example plots
dffile = '../data/DF2017B_clusterposition_zphi_ontrack_PXLayer_1_run297056.csv'
histdf = _read_dcson(dffile)
hists_good, runnbs_good, lsnbs_good = _prepare(histdf, returnrunls=True)
_ = pu.plot_hists_2d(hists_good[:4], ncols=4, title='some example histograms in good test set')
print('number of lumisections: ' + str(len(histdf)))

# bad test set (single run); run and lumisection numbers are kept for the example plots
dffile = '../data/DF2017B_clusterposition_zphi_ontrack_PXLayer_1_run297289.csv'
histdf = _read_dcson(dffile)
hists_bad, runnbs_bad, lsnbs_bad = _prepare(histdf, returnrunls=True)
_ = pu.plot_hists_2d(hists_bad[:4], ncols=4, title='some example histograms in bad test set')
print('number of lumisections: ' + str(len(histdf)))

In [None]:
### build an NMF model

# number of NMF components requested from the classifier (a hyperparameter of this study)
n_nmf_components = 10

# the classifier is constructed from the reference histograms
classifier = NMFClassifier.NMFClassifier(hists_ref, ncomponents=n_nmf_components)

In [None]:
### plot some of the components

# retrieve the components from the classifier and show them in a grid of 4 columns
components = classifier.getcomponents()
_ = pu.plot_hists_2d(
    components,
    ncols=4,
    title='NMF model components',
)

In [None]:
### optionally enlarge the test set using resampling

# switch read again by the classification cell to decide which sets are scored
do_resampling = True

if do_resampling:
    # settings common to both calls; only nresamples and stdfactor differ between the sets
    resampling_kwargs = dict(nonnegative=True, kmaxscale=0.5, ncomponents=3)

    # good set: 4 resamples with stdfactor 10
    hists_good_ext = g2u.fourier_noise_nd(
        hists_good,
        nresamples=4,
        stdfactor=10.,
        **resampling_kwargs
    )
    print(hists_good_ext.shape)

    # bad set: 40 resamples with stdfactor 5
    hists_bad_ext = g2u.fourier_noise_nd(
        hists_bad,
        nresamples=40,
        stdfactor=5.,
        **resampling_kwargs
    )
    print(hists_bad_ext.shape)

In [None]:
### perform the classification

# histograms to score: the resampled sets when resampling is enabled, the original sets otherwise
hists_good_appl = hists_good_ext if do_resampling else hists_good
hists_bad_appl = hists_bad_ext if do_resampling else hists_bad

# score both sets with the same settings (good set first, then bad set)
scores_good = classifier.evaluate(hists_good_appl, nmax=50)
scores_bad = classifier.evaluate(hists_bad_appl, nmax=50)

# label convention used for the plot: 0 for the good set, 1 for the bad set
labels_good = np.zeros(len(scores_good))
labels_bad = np.ones(len(scores_bad))

# merge into one score array and one label array, good entries first
scores = np.concatenate([scores_good, scores_bad])
labels = np.concatenate([labels_good, labels_bad])

_ = pu.plot_score_dist(
    scores,
    labels,
    nbins=50,
    normalize=True,
    title='output score distributions for signal and background',
    xaxtitle='output score',
    yaxtitle=None,
)

In [None]:
### check some examples

nplot = 5


def _plot_examples(hists, runnbs, lsnbs, setname, nplot, nmax=50):
    """Plot randomly chosen histograms of one test set next to their NMF reconstruction.

    Parameters:
    - hists: the original (not resampled) histograms of the test set
    - runnbs, lsnbs: run and lumisection numbers, indexed like hists
    - setname: label used in the printed header and subtitles ('good' or 'bad')
    - nplot: maximum number of examples to show
    - nmax: forwarded to classifier.evaluate; keep it equal to the value used
      in the classification cell so the quoted scores are comparable to the
      plotted score distribution

    Returns the array of selected indices.
    """
    nhists = len(hists)
    # With an int as first argument np.random.choice samples from np.arange(nhists).
    # replace=False avoids showing the same histogram twice; the size is capped
    # so that sets with fewer than nplot histograms (including empty ones) do not raise.
    inds = np.random.choice(nhists, size=min(nplot, nhists), replace=False)
    print('example histograms from {} test set:'.format(setname))
    if len(inds) == 0:
        return inds
    # Score the original histograms here instead of reusing the scores from the
    # classification cell: when resampling is enabled those scores belong to the
    # resampled set, so their indices do not correspond to the histograms plotted below.
    scores = classifier.evaluate(hists, nmax=nmax)
    subtitles = ['{} test histogram'.format(setname), 'NMF reconstruction']
    for i in inds:
        # reconstruct is given a batch holding just this histogram; take its single result
        reco = classifier.reconstruct(np.array([hists[i]]))[0]
        title = 'index: {}, run: {}, lumisection: {}, MSE: {}'.format(i, runnbs[i], lsnbs[i], scores[i])
        pu.plot_hists_2d([hists[i], reco], ncols=2, title=title, subtitles=subtitles,
                         xaxtitle=None, yaxtitle=None)
    plt.show()
    return inds


inds_good = _plot_examples(hists_good, runnbs_good, lsnbs_good, 'good', nplot)
inds_bad = _plot_examples(hists_bad, runnbs_bad, lsnbs_bad, 'bad', nplot)