# bias studies with full systematics

To assess the impact of various sources of systematic, we will rely on an Asimov dataset.

In [1]:
## imports and configuration
%cd '/home/naodell/work/wbr/analysis'
#%load_ext autoreload

from functools import partial
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from scipy.optimize import minimize, basinhopping
from tqdm import tqdm_notebook

import scripts.plot_tools as pt
import scripts.fit_helpers as fh
from nllfit.nllfitter import ScanParameters

np.set_printoptions(precision=4)
rc_params = {
             'figure.figsize': (10, 10),
             'axes.labelsize': 20,
             'axes.facecolor': 'white',
             'axes.titlesize':'x-large',
             'legend.fontsize': 20,
             'xtick.labelsize':20,
             'ytick.labelsize':20,
             'font.size':18,
             'font.sans-serif':['Arial', 'sans-serif'],
             'mathtext.sf':'Arial',
             'lines.markersize':8.,
             'lines.linewidth':2.5,
            }
matplotlib.rcParams.update(rc_params)

%connect_info

/home/naodell/work/wbr/analysis
{
  "shell_port": 55553,
  "iopub_port": 56243,
  "stdin_port": 43121,
  "control_port": 46031,
  "hb_port": 58969,
  "ip": "127.0.0.1",
  "key": "35b2a0a0-70c361f1ead27a761d5aa9ae",
  "transport": "tcp",
  "signature_scheme": "hmac-sha256",
  "kernel_name": ""
}

Paste the above JSON into a file, and connect with:
    $> jupyter <app> --existing <file>
or, if you are local, you can connect with just:
    $> jupyter <app> --existing kernel-ab17eb1c-5c73-433c-910a-c909364aa829.json
or even just:
    $> jupyter <app> --existing
if this is the most recent Jupyter kernel you have started.


In [2]:
# configure, get the input data, and do any additional processing that is needed
input_dir  = f'local_data/templates/nominal/'
processes = ['ttbar', 't', 'ww', 'wjets', 'zjets_alt', 'diboson', 'fakes'] 
selections = [
              'ee', 'mumu',  
              'emu', 
              'mutau', 'etau', 
              'mu4j', 'e4j'
             ]
plot_labels = fh.fancy_labels

# initialize fit data
fit_data = fh.FitData(input_dir, selections, processes, process_cut=0.1)
params = fit_data._parameters
params_pre = fit_data.get_params_init().values.copy()
sample = {cat:fit_data.mixture_model(params_pre, cat) for cat in fit_data._model_data.keys()}

In [3]:
# configure fit

# minimizer options
min_options = dict(#eps=1e-9, 
                   #xtol=1e-3, 
                   #ftol=1e-9, 
                   #stepmx=0.1, 
                   #maxCGit=50, 
                   #accuracy=1e-10,
                   disp=None
                  )

# prepare scan data
pname = 'lumi'
iparam = list(params.index).index(pname)
if iparam <= 3:
    scan_vals = np.linspace(bounds[iparam][0], bounds[iparam][1], 11)
else:
    scan_vals = np.linspace(params_pre[iparam] - 2*params.err_init[iparam], params_pre[iparam] + 2*params.err_init[iparam], 11)
sample = {cat:fit_data.mixture_model(params_pre, cat) for cat in fit_data._model_data.keys()}

# configure the objective
mask = fit_data._pmask
mask[iparam] = False
fobj = partial(fit_data.objective,
               data = sample,
               do_bb_lite = True,
               lu_test = 2
              )

fobj_jac = partial(fit_data.objective_jacobian,
                   data = sample,
                   do_bb_lite = True,
                   lu_test = 2
                  )



In [4]:
# carry out scan and save results
results = []
cost = []
sv_accept = []
cost_cache = []
for sv in tqdm_notebook(scan_vals):
    # randomize n.p.
    #fit_data._pval_init[4:] = params_pre[4:] + fit_data._perr_init[4:]*np.random.randn(params_pre[4:].size)
    
    # produce sample with statistical randomization
    #sample = {cat:fit_data.mixture_model(params_pre, cat, randomize=True) for cat in fit_data._model_data.keys()}
    
    # use reduced objective to fix scan parameter
    fit_data._pval_fit[iparam] = sv
    pinit = fit_data._pval_init[mask]
    result = minimize(fobj, pinit,
                      jac = fobj_jac,
                      method = 'BFGS', 
                      options = min_options,
                     )
    
    print(fobj(pinit), result.fun, sv)
    if result.success or result.status == 1:
        sv_accept.append(sv)
        results.append(result.x)
        cost.append(result.fun)
        
        # unpack cost cache
        new_cache = []
        for cat, cache in fit_data._cache.items():
            new_cache.extend(cache['cost'])
        cost_cache.append(new_cache)
        
        #print(fit_data._cache['ee_cat_gt2_eq0']['cost'])
    else:
        print(result)
        print(sv)
        

HBox(children=(IntProgress(value=0, max=11), HTML(value='')))

2338.561473500402 802.4667008952227 0.95
      fun: 802.4667008952227
 hess_inv: array([[ 4.8610e-06,  2.3925e-06,  4.5148e-06, ..., -9.8625e-05,
        -1.6960e-05,  5.7740e-04],
       [ 2.3925e-06,  3.5379e-06,  2.9143e-06, ..., -8.1352e-05,
        -1.4654e-05,  4.9641e-04],
       [ 4.5148e-06,  2.9143e-06,  1.9260e-05, ..., -1.4450e-04,
        -2.4204e-05,  9.7641e-04],
       ...,
       [-9.8625e-05, -8.1352e-05, -1.4450e-04, ...,  9.9205e-01,
        -1.9470e-03,  4.9585e-02],
       [-1.6960e-05, -1.4654e-05, -2.4204e-05, ..., -1.9470e-03,
         9.9959e-01,  1.0492e-02],
       [ 5.7740e-04,  4.9641e-04,  9.7641e-04, ...,  4.9585e-02,
         1.0492e-02,  7.2492e-01]])
      jac: array([ 2.4213e+05,  8.1784e+04,  2.2715e+05,  2.2313e+05, -1.3854e+04,
       -2.1915e+04, -5.2440e+03, -8.6839e+01, -2.3074e+02, -4.4408e+02,
       -1.2068e+03, -4.2149e+01, -6.8204e+02, -7.4950e+01,  3.2495e+02,
       -1.7654e+04, -8.6984e+00,  2.9773e+01,  5.2643e+01, -4.4564e+00,
       

830.2745522279248 259.3762164757363 0.97
      fun: 259.3762164757363
 hess_inv: array([[ 1.9470e-06,  7.7209e-07,  1.8639e-06, ...,  1.8089e-06,
        -1.5222e-07,  4.9373e-05],
       [ 7.7209e-07,  2.1827e-06,  1.9200e-06, ..., -1.8509e-05,
        -5.0938e-06,  1.7297e-04],
       [ 1.8639e-06,  1.9200e-06,  2.1328e-05, ..., -2.5284e-05,
         4.7493e-06,  2.1002e-04],
       ...,
       [ 1.8089e-06, -1.8509e-05, -2.5284e-05, ...,  9.8571e-01,
        -2.7698e-03,  7.6320e-02],
       [-1.5222e-07, -5.0938e-06,  4.7493e-06, ..., -2.7698e-03,
         9.9945e-01,  1.4675e-02],
       [ 4.9373e-05,  1.7297e-04,  2.1002e-04, ...,  7.6320e-02,
         1.4675e-02,  5.9579e-01]])
      jac: array([ 1.0578e+05,  7.8819e+03,  1.2640e+05,  1.2677e+05, -1.0590e+04,
       -1.6198e+04, -1.4353e+02, -4.8395e+01, -2.8123e+02, -6.5291e+02,
       -1.0085e+03, -3.8516e+01, -6.7383e+02,  2.1869e+00,  2.8796e+01,
       -1.4689e+04, -1.0745e+00,  8.8722e+00,  2.8193e+01, -1.0905e+01,
       

91.02294976835523 43.251026546322905 0.99
      fun: 43.251026546322905
 hess_inv: array([[ 1.3546e-05,  9.1880e-06, -1.0365e-05, ..., -1.7022e-04,
        -2.9250e-05,  8.7615e-04],
       [ 9.1880e-06,  8.6286e-06, -5.4892e-06, ..., -1.0914e-04,
        -1.8779e-05,  6.1325e-04],
       [-1.0365e-05, -5.4892e-06,  7.9621e-05, ...,  6.7749e-04,
         1.3706e-04, -3.1274e-03],
       ...,
       [-1.7022e-04, -1.0914e-04,  6.7749e-04, ...,  1.0133e+00,
         2.1578e-03, -5.9553e-02],
       [-2.9250e-05, -1.8779e-05,  1.3706e-04, ...,  2.1578e-03,
         1.0003e+00, -9.4805e-03],
       [ 8.7615e-04,  6.1325e-04, -3.1274e-03, ..., -5.9553e-02,
        -9.4805e-03,  1.2640e+00]])
      jac: array([ 1.8944e+04, -2.4993e+04,  4.0585e+04,  4.2795e+04, -5.3548e+03,
       -7.8115e+03, -5.5269e+02, -1.4122e+01, -5.1274e+01, -4.2911e+02,
       -6.1702e+02, -4.9711e+01, -3.0201e+02, -3.8107e+01, -2.4263e+01,
       -8.4458e+03, -2.2580e+00,  4.8358e+00,  1.7284e+00, -3.3174e+00,
     

AttributeError: 'FitData' object has no attribute '_cache'

In [None]:
# fit scan data with a parabola and calculate curvature
results = np.array(results)
cost = np.array(cost)
sv_accept = np.array(sv_accept)
cost_cache = np.array(cost_cache)

mask = cost >= 0 
nll_coeff = np.polyfit(sv_accept[mask], cost[mask], deg=2)#, w=np.exp(-0.5*(params_init[ix] - scan_points)**2/sigma_fisher**2))
nll_poly = np.poly1d(nll_coeff)
d_nll_poly = np.polyder(nll_poly, m=2)
sigma_post = 1/np.sqrt(d_nll_poly(0))

if params_pre[iparam] != 0:
    err = 2*sigma_post*100/params_pre[iparam]
else:
    err = 2*sigma_post*100
print(err)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(8,8), facecolor='white')

ax.plot(sv_accept, cost, 'ko')
ax.plot(scan_vals, nll_poly(scan_vals), 'r--')
ax.grid()
ax.set_xlabel(params.loc[pname].label)
ax.set_ylabel('NLL')
ax.text(sv_accept[1], 0.95*np.max(cost), r'$\hat{\theta} = $' + f'{sv_accept[np.argmin(cost)]:.3f}')
ax.text(sv_accept[1], 0.89*np.max(cost), r'$\sigma_{\theta} = $' + f'{err/100:.2%}')

#ax.set_ylim(0, 10)

plt.savefig(f'plots/nll_scans/{pname}.png')
plt.show()

In [None]:
# unpack cached data

err = []
for i in range(cost_cache.shape[1]):
    #ax.plot(sv_accept, cost_cache[:,i], 'o')
    
    nll_coeff = np.polyfit(sv_accept, cost_cache[:,i], deg=2)#, w=np.exp(-0.5*(params_init[ix] - scan_points)**2/sigma_fisher**2))
    nll_poly = np.poly1d(nll_coeff)
    d_nll_poly = np.polyder(nll_poly, m=2)
    sigma = 1/np.sqrt(d_nll_poly(0))
    err.append((sigma**2)/(sigma_post**2))
    
err = np.array(err)
err[np.isnan(err)] = np.inf

fig, axes = plt.subplots(5, 1, facecolor='white', figsize=(20, 16))
selections = ['ll', 'emu', 'mutau', 'etau', 'lh']
for ix, selection in enumerate(selections):
    ax = axes[ix]
    if selection == 'll':
        ax.set_title(r'$ee/\mu\mu$', fontsize=16)
        imin, imax = 0, 90 # ee, mumu
    elif selection == 'emu':
        ax.set_title(r'$e\mu$', fontsize=16)
        imin, imax = 90, 178 # emu
    elif selection == 'mutau':
        ax.set_title(r'$\mu\tau$', fontsize=16)
        imin, imax = 178, 265 # mutau
    elif selection == 'etau':
        ax.set_title(r'$e\tau$', fontsize=16)
        imin, imax = 265, 345 # etau
    elif selection == 'lh':
        ax.set_title(r'$e/\mu+h$', fontsize=16)
        imin, imax = 345, 401 # e4j, mu4j

    # get bins per category
    ibin = 0
    err_max = np.max(1/err[imin:imax])
    for cat, data in fit_data._model_data.items():

        nbins = data['data'][0].size
        ibin += nbins
        if ibin <= imin or ibin > imax: 
            continue

        jet_cat = '_'.join(cat.split('_')[1:])
        jet_cat = pt.categories[jet_cat].label 
        ax.text(ibin-nbins+1, 1.05*err_max, jet_cat, rotation=0, fontsize=14)
        ax.plot([ibin, ibin], [0, 1.5*err_max], 'r:')

    #ax.bar(np.arange(cost_cache.shape[1]), 1/err)
    ax.bar(np.arange(imin, imax), 1/err[imin:imax], align='edge')

    ax.grid(axis='Y')
    ax.set_xlim(imin, imax)
    ax.set_ylim(0, 1.2*err_max)
    ax.set_xlabel('bin')
    if ix == 2:
        ax.set_ylabel(r'$\partial^{2}NLL_{bin}/\partial^{2}NLL_{total}$')

plt.tight_layout()
plt.savefig(f'plots/nll_scans/{pname}_scan_bins_{selection}.png')
plt.show()