## baseline fit and covariance estimation

To assess the impact of various sources of systematic, we will rely on an Asimov dataset.

In [1]:
## imports and configuration
%cd '/home/naodell/work/wbr/analysis'
#%load_ext autoreload

from functools import partial
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from scipy.optimize import minimize
from tqdm import tqdm_notebook

import scripts.plot_tools as pt
import scripts.fit_helpers as fh
from nllfit.nllfitter import ScanParameters

np.set_printoptions(precision=3)
rc_params = {
             'figure.figsize': (10, 10),
             'axes.labelsize': 20,
             'axes.facecolor': 'white',
             'axes.titlesize':'x-large',
             'legend.fontsize': 20,
             'xtick.labelsize':18,
             'ytick.labelsize':18,
             'font.size':18,
             'font.sans-serif':['Arial', 'sans-serif'],
             'mathtext.sf':'Arial',
             'lines.markersize':8.,
             'lines.linewidth':2.5,
            }
matplotlib.rcParams.update(rc_params)

%connect_info

/home/naodell/work/wbr/analysis
{
  "shell_port": 35105,
  "iopub_port": 59127,
  "stdin_port": 56239,
  "control_port": 33281,
  "hb_port": 46369,
  "ip": "127.0.0.1",
  "key": "982f09b8-3cf4c1a21c3031b57a9e803b",
  "transport": "tcp",
  "signature_scheme": "hmac-sha256",
  "kernel_name": ""
}

Paste the above JSON into a file, and connect with:
    $> jupyter <app> --existing <file>
or, if you are local, you can connect with just:
    $> jupyter <app> --existing kernel-588856ca-10f2-4467-a221-1c8c98228bd8.json
or even just:
    $> jupyter <app> --existing
if this is the most recent Jupyter kernel you have started.


In [2]:
# configure, get the input data, and do any additional processing that is needed
processes = ['ttbar', 't', 'ww', 'wjets', 'zjets_alt', 'diboson', 'fakes'] 
selections = [
              'ee',  'mumu',  
              'emu', 
              'mutau', 'etau', 
              'mu4j', 'e4j'
             ]
plot_labels = fh.fancy_labels

# initialize fit data
input_dir  = f'local_data/templates/nominal/'
fit_data = fh.FitData(input_dir, selections, processes, process_cut=0.01)

In [3]:
# prepare Asimov dataset
params = fit_data._parameters
params_pre = fit_data.get_params_init().values
asimov_data = dict()
for category, model_data in fit_data._model_data.items():
    # parse lepton selection and jet category
    cat_split = category.split('_')
    sel = cat_split[0]
    jet_cat = '_'.join(cat_split[1:])
    
    # build prefit expectation from model_tensor
    expected_pre, expected_var = fit_data.mixture_model(params_pre, category)
    expected_test = fit_data.model_sums(sel, jet_cat) # for testing template removal
    
    asimov_data[category] = (expected_pre, np.sqrt(expected_pre))


In [4]:
# initialize veto list
fit_data.veto_list = [
    # baseline
    #'ee_cat_gt2_eq1_b', 'ee_cat_gt2_gt2_b', 
    #'mumu_cat_gt2_eq1_b', 'mumu_cat_gt2_gt2_b', 
    #'emu_cat_gt2_eq1_a', 'emu_cat_gt2_gt2_a', 
    #'etau_cat_eq2_eq1', 'etau_cat_gt3_eq1', 'etau_cat_eq2_gt2', 'etau_cat_gt3_gt2', 
    #'mutau_cat_eq2_eq1', 'mutau_cat_gt3_eq1', 'mutau_cat_eq2_gt2', 'mutau_cat_gt3_gt2', 
    #'e4j_cat_gt4_eq1', 'e4j_cat_gt4_gt2'
    #'mu4j_cat_gt4_eq1', 'mu4j_cat_gt4_gt2', 
    'e4j_cat_eq3_gt2', 'mu4j_cat_eq3_gt2',
    
    # e/mu DY CR
    'ee_cat_gt2_eq0',  'mumu_cat_gt2_eq0', 
    
    # e+mu additional ttbar
    #'emu_cat_gt2_eq0', 'emu_cat_eq1_eq0_a', 'emu_cat_eq1_eq1_a', 
    
    # e+mu WW
    #'emu_cat_eq0_eq0_a', 
    
    # e/mu+tau additional CR
    #'mutau_cat_eq0_eq0', 'mutau_cat_eq1_eq0', 
    #'mutau_cat_gt2_eq0', 'mutau_cat_eq1_eq1', 
    #'etau_cat_eq0_eq0', 'etau_cat_eq1_eq0', 
    #'etau_cat_gt2_eq0', 'etau_cat_eq1_eq1', 
]

In [5]:
# fit configuration #

# bounds
bounds = [(0.1, 0.12), (0.1, 0.12), (0.1, 0.12), (0.64, 0.7)]
bounds += [(0.16, 0.18), (0.16, 0.18), (0.64, 0.66)]
bounds += fit_data._nnorm*[(0.5, 1.5), ]
bounds += fit_data._nshape*[(-3., 3.), ]
bounds = np.array(bounds)

# minimizer options
min_options = dict(#eps=1e-9, 
                   #xtol=1e-3, 
                   #ftol=1e-12, 
                   #maxcor=2,
                   #maxls=30,
                   #stepmx=0.1, 
                   maxfun=1e5,
                   disp=None
                  )

# configure the objective
mask = params['active'].values.astype(bool)
fobj = partial(fh.reduced_objective,
               params_fixed = params_pre,
               mask = mask.copy(),
               objective=fit_data.objective,
               do_mc_stat = True,
               data = None
              )

# randomize initial values
pinit = params_pre + np.random.randn(params_pre.size)*params['err_init'].values
pinit[:4] = params_pre[:4]

In [13]:
# carry out fit for null hypothesis
fobj_null = partial(fh.objective_lu, test_type = 1)
mask[1:4] = False
result_null = minimize(fobj_null, pinit[mask],
                       method  = 'L-BFGS-B', 
                       options = min_options,
                       bounds  = bounds[mask],
                       args = (fobj)
                      )

print(' null : ', result_null.status, result_null.fun, result_null.x[0]*100)

 null :  1 541.336115834905 10.935748767142718


In [14]:
# carry out fit for alt 1
fobj_alt = partial(fh.objective_lu, test_type=2)
mask[1] = True
pinit[0:3] = result_null.x[0]
pinit[3] = 1 - 3*result_null.x[0]
#pinit[4:] = result_null.x[1:]
bnds = bounds[mask]
result_alt1 = minimize(fobj_alt, pinit[mask],
                       method  = 'L-BFGS-B', 
                       options = min_options,
                       bounds  = bnds,
                       args = (fobj)
                      )
        
print(' alt. 1: ', result_alt1.status, result_alt1.fun, result_alt1.x[:2]*100)

 alt. 1:  1 538.9659452950672 [10.845 11.249]


In [15]:
# carry out fit for alt 2
#pinit[mask][4:] = result_alt1.x[:2]
pinit[0:2] = result_alt1.x[0]
pinit[2] = result_alt1.x[1]
pinit[3] = 1 - pinit[:3].sum()
mask[:4] = True
bnds = bounds[mask]
result_alt2 =  minimize(fobj, pinit[mask],
                        method = 'L-BFGS-B', 
                        #options = min_options,
                        bounds = bnds,
                       )

print(' alt. 2: ', result_alt1.status, result_alt2.fun, result_alt2.x[:4]*100)

 alt. 2:  1 867.7802823217011 [10.487 10.842 10.792 67.878]


In [9]:
# Wilk's theorem
from scipy.stats import chi2, norm
q1 = 2*(result_null.fun - result_alt1.fun)
z1 = -norm.ppf(chi2.sf(q1, 1))

q2 = 2*(result_null.fun - result_alt2.fun)
z2 = -norm.ppf(chi2.sf(q2, 2))

print(' ', q1, z1, q2, z2)

  34.999448451408284 5.800900465479455 -311.38437490102365 -inf


In [10]:
# calculate covariance matrix from the inverse of the Hessian of the NLL
pinit = result_alt2.x
stderr, corr = fh.calculate_covariance(fobj, pinit)

In [11]:
# plot errors
parameters = fit_data._parameters.copy()[mask]
parameters['val_fit'] = result_alt2.x
parameters['err_fit'] = stderr

# scale branching fraction values
param_mask = np.array(parameters.type == 'poi', dtype=bool)
val_init = parameters.loc[param_mask, 'val_init']
parameters.loc[param_mask, 'err_init'] *= 100/val_init
parameters.loc[param_mask, 'err_fit'] *= 100/val_init

parameters['ratio'] = parameters.err_fit/parameters.err_init

# scale normalization values
param_mask = np.array(parameters.type == 'norm', dtype=bool)
#parameters.loc[param_mask, 'err_init'] *= 100
#parameters.loc[param_mask, 'err_fit'] *= 100

pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.max_columns', parameters.shape[0])
pd.set_option('display.max_rows', parameters.shape[0])

# plot nuisance parameter errors and constraints
parameters.set_index('label')
parameters.to_csv('local_data/pulls.csv')

parameters[['val_init', 'err_init', 'val_fit', 'err_fit']]
#parameters[['err_init', 'err_fit', 'ratio']]#.head(4)


Unnamed: 0_level_0,val_init,err_init,val_fit,err_fit
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
beta_e,0.108,99.715,0.108,0.972
beta_mu,0.108,99.715,0.109,0.57
beta_tau,0.113,95.587,0.12,1.744
beta_h,0.67,100.836,0.664,0.308
br_tau_e,0.178,0.224,0.179,0.224
br_tau_mu,0.174,0.23,0.174,0.23
br_tau_h,0.648,0.154,0.649,0.154
lumi,1.0,0.025,1.143,0.019
xs_diboson,1.0,0.1,0.86,0.098
xs_ww,1.0,0.1,0.879,0.025


In [12]:
# produce correlation matrix
p_labels = list(params.index)
p_labels_fancy = fit_data._parameters.label[mask]

df_corr = pd.DataFrame(corr, columns=p_labels_fancy, index=p_labels_fancy)
df_corr.to_latex('local_data/corr_table_full.tex')
df_corr.to_csv('local_data/corr_table_full.csv')

#import seaborn as sns
#fig, ax = plt.subplots(1, 1, facecolor='white', figsize=(40, 32))
#sns.heatmap(df_corr, cmap='Spectral', annot=False, fmt='.2f', ax=ax, cbar_kws={'label': r'correlation coefficient'})
#plt.plot([4, df_corr.shape[0]], [4, 4], 'r--')
#plt.plot([4, 4], [4, df_corr.shape[0]], 'r--')
#ax.set_xlabel('')
#ax.set_ylabel('')
#ax.xaxis.tick_top()
#plt.xticks(rotation=90)
#
#plt.tight_layout()
#plt.subplots_adjust(top=0.9)
#plt.savefig('plots/systematics/correlation_matrix.pdf')
#plt.show()

In [13]:
## helper functions and imports
from tqdm import tqdm_notebook, trange
from multiprocessing import Pool

def reduced_objective(p, mask, p_init, do_mc_stat=False):
    masked_p = p_init.copy()
    masked_p[mask] = p
    return fit_data.objective(masked_p, data=asimov_data, do_mc_stat=do_mc_stat)


In [14]:
# stat only
params_init = params_pre
mask = np.ones(len(params_init)).astype(bool)
mask[4:] = False

fobj = partial(reduced_objective, mask=mask, p_init=params_init, do_mc_stat=False)
err_stat, _ = fh.calculate_covariance(fobj, params_init[:4])
err_stat = np.concatenate([err_stat, np.zeros(params_init[4:].size)])

fobj = partial(fit_data.objective, 
               data=asimov_data, 
               cost_type='poisson', 
               no_shape=False,
               do_mc_stat=False
              )
err_no_bb, _ = fh.calculate_covariance(fobj, params_init)

err_syst = np.sqrt(err_no_bb**2 - err_stat**2)

  sig         = np.sqrt(hinv.diagonal())


## estimating individual systematic contributions

The effect of any individual systematic uncertainty is somewhat complicated by it's mutual covariance with the POI (i.e., the W branching fractions) and any other systematic uncertainty.  To get a rough idea of the percent-wise contribution to the total uncertainty from each individual systematic, I use the following scheme:

   * the fit is carried out as in the nominal case and $\sigma_{0}$ is estimated
   * the fit is carried out for for each of the $n$ nuisance parameters $\sigma_{theta}$
   * the difference between the nominal case and the ``$n-1$" case is calculated,
   * this quantity is normalized to $\sum_{\theta} \sigma_{\theta}^{2}$

In [15]:
# n-1 systematics
pool = Pool(processes=10)
results = []
for i in range(4, len(params_init)):
    mask = np.ones(len(params_init)).astype(bool)
    mask[i] = False
    fobj = partial(reduced_objective, mask=mask, p_init=params_init, do_mc_stat=False)
    #res = pool.apply_async(fh.calculate_covariance, args=(fobj, np.delete(params_init, i)))
    #results.append(res)
    
pool.close()
pool.join()

output = [r.get() for r in results]

In [16]:
# convert to errors
sub_errs = [o[0] for o in output]
errs = np.array([np.concatenate([sub_errs[i-4][:i], [0], sub_errs[i-4][i:]]) for i in range(4, params_init.size)])
errs = err_no_bb**2 - errs**2
errs[errs < 0] = 0

errs = np.sqrt(errs)
errs = np.vstack([errs, err_stat, stderr])

errs = pd.DataFrame(errs, columns=p_labels, index=p_labels[4:] + ['stat', 'total'])
beta_errs = errs.iloc[:,:4].multiply(100)/params_init[:4]

IndexError: list index out of range

In [None]:
# print table

#beta_errs.divide(params_init[:4]/100, axis=1).to_latex('local_data/errors.tex')
#beta_errs.divide(params_init[:4]/100, axis=1).to_csv('local_data/errors.csv')
#beta_errs.divide(params_init[:4]/100, axis=1)
beta_errs.to_latex('local_data/summary_errors.tex', escape=False)
beta_errs

In [None]:
jes_mask = np.array([True if ('jes' in pname and 'btag' not in pname) else False for pname in beta_errs.index])
btag_mask = np.array([True if 'btag' in pname else False for pname in beta_errs.index])
tau_misid_mask = np.array([True if ('misid_tau' in pname and pname not in ['misid_tau_e', 'misid_tau_h']) else False for pname in beta_errs.index])

btag_errs = beta_errs[btag_mask]
jes_errs = beta_errs[jes_mask]
tau_misid_errs = beta_errs[tau_misid_mask]

summary_errs = beta_errs[~btag_mask&~jes_mask&~tau_misid_mask].copy()
summary_errs.index = [fit_data._parameters.loc[p].label if p in fit_data._parameters.index else p for p in summary_errs.index]
summary_errs.loc['b-tag',:] = np.sqrt(np.sum(btag_errs**2))
summary_errs.loc['JES',:]  = np.sqrt(np.sum(jes_errs**2))
summary_errs.loc[r'$\sf jet\rightarrow\tau$',:]  = np.sqrt(np.sum(tau_misid_errs**2))

summary_errs = summary_errs.divide(params_init[:4]/100, axis=1)
summary_errs.to_latex('local_data/summary_errors.tex', escape=False)
summary_errs.to_csv('local_data/summary_errors.csv')
summary_errs