## baseline fit and covariance estimation

To assess the impact of the various sources of systematic uncertainty, we will rely on an Asimov dataset.

In [1]:
## imports and configuration
# Change the working directory so that the relative paths used later in this
# notebook (e.g. 'local_data/...', 'data/...') and the `scripts` package resolve.
# NOTE(review): hard-coded absolute path to one user's checkout -- consider
# making this configurable before sharing the notebook.
%cd '/home/naodell/work/wbr/analysis'
#%load_ext autoreload

import pickle
from functools import partial
from multiprocessing import Pool

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from tqdm import tqdm_notebook, trange

import scripts.plot_tools as pt
import scripts.fit_helpers as fh

# print numpy arrays with three digits of precision
np.set_printoptions(precision=3)

# matplotlib defaults, grouped by what they control and merged into one dict
_figure_style = {
    'figure.figsize': (10, 10),
    'axes.facecolor': 'white',
}
_text_sizes = {
    'axes.labelsize': 20,
    'axes.titlesize': 'x-large',
    'legend.fontsize': 20,
    'xtick.labelsize': 18,
    'ytick.labelsize': 18,
    'font.size': 18,
}
_font_faces = {
    'font.sans-serif': ['Arial', 'sans-serif'],
    'mathtext.sf': 'Arial',
}
_line_style = {
    'lines.markersize': 8.,
    'lines.linewidth': 2.5,
}
rc_params = {**_figure_style, **_text_sizes, **_font_faces, **_line_style}

# apply the settings globally for every figure created below
matplotlib.rcParams.update(rc_params)

# print the kernel connection details so that another Jupyter frontend can be
# attached to this running kernel (see the instructions in the cell output)
%connect_info

/home/naodell/work/wbr/analysis
{
  "shell_port": 56691,
  "iopub_port": 46467,
  "stdin_port": 56425,
  "control_port": 56147,
  "hb_port": 35669,
  "ip": "127.0.0.1",
  "key": "d001c7b9-a9af4e0ebec9d919b7a07060",
  "transport": "tcp",
  "signature_scheme": "hmac-sha256",
  "kernel_name": ""
}

Paste the above JSON into a file, and connect with:
    $> jupyter <app> --existing <file>
or, if you are local, you can connect with just:
    $> jupyter <app> --existing kernel-6d6976e8-6847-412b-8cfe-217e2e93a495.json
or even just:
    $> jupyter <app> --existing
if this is the most recent Jupyter kernel you have started.


In [2]:
# configure, get the input data, and do any additional processing that is needed

# process names handed to FitData
processes = [
    'ttbar',
    't',
    'ww',
    'wjets',
    'zjets_alt',
    'diboson',
    'fakes',
]

# selection names handed to FitData
selections = ['ee', 'mumu', 'emu', 'mutau', 'etau', 'mu4j', 'e4j']

plot_labels = fh.fancy_labels

# initialize fit data from the template directory and the default parameter file
input_dir = 'local_data/templates/updated_e_sf/'
fit_data_options = dict(
    param_file='data/model_parameters_default.csv',
    use_prefit=False,
    process_cut=0.05,
)
fit_data = fh.FitData(input_dir, selections, processes, **fit_data_options)

In [3]:
# prepare Asimov dataset: evaluate the mixture model at the initial parameter
# values for every category known to fit_data
parameters = fit_data._parameters
asimov_data = dict(
    (cat, fit_data.mixture_model(parameters.val_init.values, cat))
    for cat in fit_data._model_data.keys()
)

In [4]:
# initialize veto list
# The list is assembled from named groups; commented-out category names are
# alternatives kept so they can be toggled quickly.

# baseline
vetoed_baseline = [
    #'ee_cat_gt2_eq1_b', 'ee_cat_gt2_gt2_b',
    #'mumu_cat_gt2_eq1_b', 'mumu_cat_gt2_gt2_b',
    #'emu_cat_gt2_eq1_a', 'emu_cat_gt2_gt2_a',
    #'etau_cat_eq2_eq1', 'etau_cat_gt3_eq1', 'etau_cat_eq2_gt2', 'etau_cat_gt3_gt2',
    #'mutau_cat_eq2_eq1', 'mutau_cat_gt3_eq1', 'mutau_cat_eq2_gt2', 'mutau_cat_gt3_gt2',
    #'e4j_cat_gt4_eq1', 'e4j_cat_gt4_gt2',
    #'mu4j_cat_gt4_eq1', 'mu4j_cat_gt4_gt2',
    'e4j_cat_eq3_gt2',
    'mu4j_cat_eq3_gt2',
]

# e/mu DY CR
vetoed_dy_cr = [
    'ee_cat_gt2_eq0',
    'mumu_cat_gt2_eq0',
]

# e+mu additional ttbar
vetoed_emu_ttbar = [
    #'emu_cat_gt2_eq0', 'emu_cat_eq1_eq0_a', 'emu_cat_eq1_eq1_a',
]

# e+mu WW
vetoed_emu_ww = [
    #'emu_cat_eq0_eq0_a',
]

# e/mu+tau additional CR
vetoed_ltau_cr = [
    #'mutau_cat_eq0_eq0', 'mutau_cat_eq1_eq0',
    #'mutau_cat_gt2_eq0', 'mutau_cat_eq1_eq1',
    #'etau_cat_eq0_eq0', 'etau_cat_eq1_eq0',
    #'etau_cat_gt2_eq0', 'etau_cat_eq1_eq1',
]

fit_data.veto_list = (
    vetoed_baseline
    + vetoed_dy_cr
    + vetoed_emu_ttbar
    + vetoed_emu_ww
    + vetoed_ltau_cr
)

In [5]:
# fit configuration #

# minimizer options (passed as `options` to scipy.optimize.minimize below)
min_options = {
    #'eps': 1e-9,
    'gtol': 1e-2,
    'disp': True,
}

# private copy of the parameter mask; it selects the entries of the initial
# parameter vector that are handed to the minimizer
mask = fit_data._pmask.copy()

# NOTE(review): `sample` is assigned here but the objectives below are bound
# with data=None, so the Asimov dataset is not passed explicitly -- confirm
# this is intended.
sample = asimov_data

# keyword arguments shared by the objective and its jacobian; each partial
# keeps its own copy, so later edits go through `.keywords` of each one
objective_kwargs = dict(data=None, do_bb_lite=True, lu_test=0)

# configure the objective
fobj = partial(fit_data.objective, **objective_kwargs)
fobj_jac = partial(fit_data.objective_jacobian, **objective_kwargs)



In [6]:
# carry out fit for null hypothesis -- DISABLED variant using 'trust-constr'
# NOTE(review): this cell contains only commented-out code.  As written it
# references `params_pre`, which is first assigned in the following cell, so it
# cannot simply be re-enabled in place.
#min_options['verbose'] = 2
#result_null = minimize(fobj, params_pre[mask],
#                       jac = fobj_jac,
#                       hess = '2-point',
#                       method  = 'trust-constr', 
#                       #method  = 'BFGS', 
#                       options = min_options,
#                       args = ()
#                      )
#
#print(' null : ', result_null.status, result_null.fun, result_null.x[:4]*100)

In [7]:
# carry out fit for null hypothesis (lu_test = 0 as configured above)

# starting point: initial values of all parameters; only the masked entries
# are passed to the minimizer
params_pre = parameters['val_init'].values

bfgs_settings = dict(
    jac=fobj_jac,
    method='BFGS',
    options=min_options,
    args=(),
)
result_null = minimize(fobj, params_pre[mask], **bfgs_settings)

# NOTE(review): the recorded output shows status 2; check the scipy
# documentation for whether that indicates the fit did not fully converge.
print(' null : ', result_null.status, result_null.fun, result_null.x[:4]*100)

         Current function value: 467.142150
         Iterations: 55
         Function evaluations: 133
         Gradient evaluations: 121
 null :  2 467.1421502832533 [10.908 10.908 10.908 67.276]


In [8]:
# carry out fit for alt 1: switch both the objective and its jacobian to
# lu_test = 1, then minimize from the same starting point as the null fit
for bound_func in (fobj, fobj_jac):
    bound_func.keywords['lu_test'] = 1

start_alt1 = params_pre[mask]
result_alt1 = minimize(
    fobj,
    start_alt1,
    jac=fobj_jac,
    method='BFGS',
    options=min_options,
    args=(),
)

print(' alt. 1: ', result_alt1.status, result_alt1.fun, result_alt1.x[:4]*100)

         Current function value: 462.606856
         Iterations: 52
         Function evaluations: 150
         Gradient evaluations: 139
 alt. 1:  2 462.60685573181377 [10.897 10.897 11.579 66.628]


In [9]:
# carry out fit for alt 2: switch both the objective and its jacobian to
# lu_test = 2, then minimize from the same starting point as the null fit
for bound_func in (fobj, fobj_jac):
    bound_func.keywords['lu_test'] = 2

start_alt2 = params_pre[mask]
result_alt2 = minimize(
    fobj,
    start_alt2,
    jac=fobj_jac,
    method='BFGS',
    options=min_options,
    #bounds=bnds,
)

print(' alt. 2: ', result_alt2.status, result_alt2.fun, result_alt2.x[:4]*100)

KeyboardInterrupt: 

In [None]:
# Wilks' theorem: turn the differences between the minimized objective values
# of the null and alternative fits into significances
from scipy.stats import chi2, norm

# test statistics: twice the difference of the minimized objective values
q1 = 2*(result_null.fun - result_alt1.fun)
q2 = 2*(result_null.fun - result_alt2.fun)

# chi-square tail probabilities for 1 (alt. 1) and 2 (alt. 2) degrees of freedom
pval1 = chi2.sf(q1, 1)
pval2 = chi2.sf(q2, 2)

# equivalent Gaussian z-scores
z1 = -norm.ppf(pval1)
z2 = -norm.ppf(pval2)

for index, q, z in ((1, q1, z1), (2, q2, z2)):
    print(f' test hypothesis {index}: q = {q:.1f}, z = {z:.1f}')

In [None]:
# calculate covariance matrix from the inverse of the Hessian of the NLL
#
# Four sets of uncertainties are computed with fh.calculate_covariance:
#   stderr          -- full fit (evaluated at the alt. 2 best-fit point)
#   err_no_bb       -- do_bb_lite switched off (refit first)
#   err_stat        -- do_bb_lite off and only the first four parameters unmasked
#   err_mc_stat_alt -- do_bb_lite on and only the first four parameters unmasked
# and then combined in quadrature differences at the end of the cell.
print('Calculating full covariance...')
p_mle = result_alt2.x
stderr, corr = fh.calculate_covariance(fobj, p_mle)

try:
    # no MC stat
    print('Calculating covariance without MC stats nuisance parameters...')
    fobj.keywords['do_bb_lite'] = False
    fobj_jac.keywords['do_bb_lite'] = False

    result_no_bb = minimize(fobj, params_pre[mask],
                            jac     = fobj_jac,
                            method  = 'BFGS',
                            options = min_options,
                            #bounds  = bnds,
                            )
    # FIX: the best-fit point of this refit was previously referenced as
    # `p_mle_no_bb` without ever being assigned (NameError)
    p_mle_no_bb = result_no_bb.x
    err_no_bb, _ = fh.calculate_covariance(fobj, p_mle_no_bb)

    # stat only
    print('Calculating covariance without any nuisance parameters...')
    mask = fit_data._pmask.copy()
    try:
        # temporarily mask out everything after the first four parameters
        fit_data._pmask[4:] = False
        err_stat, _ = fh.calculate_covariance(fobj, p_mle[:4])
        # pad with zeros so the array lines up with the full parameter vector
        err_stat = np.concatenate([err_stat, np.zeros(p_mle.size - 4)])

        # w/ MC stat, but no systematics
        print('Calculating covariance with only MC stat nuisance parameters...')
        fobj.keywords['do_bb_lite'] = True
        fobj_jac.keywords['do_bb_lite'] = True
        err_mc_stat_alt, _ = fh.calculate_covariance(fobj, p_mle[:4])
        err_mc_stat_alt = np.concatenate([err_mc_stat_alt, np.zeros(p_mle.size - 4)])
    finally:
        # always restore the parameter mask, even if a step above failed
        fit_data._pmask = mask
finally:
    # leave the objectives in their default configuration (the state the cell
    # ends in on success) so a failure here does not leak do_bb_lite = False
    fobj.keywords['do_bb_lite'] = True
    fobj_jac.keywords['do_bb_lite'] = True

# combine components (differences in quadrature)
# NOTE(review): a negative difference yields NaN here -- confirm the ordering
# of the error components is guaranteed.
err_syst = np.sqrt(err_no_bb**2 - err_stat**2)
err_mc_stat = np.sqrt(stderr**2 - err_no_bb**2)
err_mc_stat_alt = np.sqrt(err_mc_stat_alt**2 - err_stat**2)

In [None]:
# collect post-fit values and uncertainties in the parameter table, persist the
# fit data and the table, and display the summary columns
parameters = fit_data._parameters

# start from the initial values/errors and overwrite the masked entries with
# the results of the alt. 2 fit
parameters['val_fit'] = parameters['val_init'].copy()
parameters['err_fit'] = parameters['err_init'].copy()
parameters.loc[mask, 'val_fit'] = result_alt2.x
parameters.loc[mask, 'err_fit'] = stderr
parameters['ratio'] = parameters.err_fit/parameters.err_init

# uncertainty components; rows outside the mask keep err_init (err_no_bb) or
# are set to zero (err_stat)
parameters['err_no_bb'] = parameters['err_init'].copy()
parameters.loc[mask, 'err_no_bb'] = err_no_bb
parameters.loc[mask, 'err_stat'] = err_stat
parameters.loc[~mask, 'err_stat'] = 0.

# scale branching fraction values
#param_mask = np.array(parameters.type == 'poi', dtype=bool)
#val_init = parameters.loc[param_mask, 'val_init']
#parameters.loc[param_mask, 'err_init'] *= 100/val_init
#parameters.loc[param_mask, 'err_fit'] *= 100/val_init

# show every row/column of the parameter table with four decimals
pd.set_option('display.float_format', lambda x: '%.4f' % x)
pd.set_option('display.max_columns', parameters.shape[0])
pd.set_option('display.max_rows', parameters.shape[0])

# save fit data as input to full fit
# FIX: `pickle` was used without being imported; the file is now written
# inside a context manager so the handle is closed even if dumping fails
with open('local_data/fit_data_postfit.pkl', 'wb') as outfile:
    pickle.dump(fit_data, outfile)

# NOTE(review): the original called `parameters.set_index('label')` here and
# discarded the result, which has no effect.  The call is dropped rather than
# applied, because the next cell reads `label` as a regular column -- confirm
# whether the CSV was meant to be indexed by label.
parameters.to_csv('data/model_parameters_asimov.csv')

# table of nuisance parameter errors and constraints (cell output)
parameters[['val_init', 'val_fit', 'err_init', 'err_fit', 'err_no_bb', 'err_stat', 'ratio']]
#parameters[['err_init', 'err_fit', 'ratio']]#.head(4)

In [None]:
# produce correlation matrix: label the matrix, write it out as LaTeX and CSV,
# and draw it as a heatmap
p_labels = list(parameters.index)
p_labels_fancy = fit_data._parameters.label[mask]

df_corr = pd.DataFrame(corr, index=p_labels_fancy, columns=p_labels_fancy)
for writer, path in (
    (df_corr.to_latex, 'local_data/corr_table_full.tex'),
    (df_corr.to_csv, 'local_data/corr_table_full.csv'),
):
    writer(path)

import seaborn as sns
n_rows = df_corr.shape[0]
fig, ax = plt.subplots(1, 1, facecolor='white', figsize=(40, 40))
sns.heatmap(
    df_corr,
    cmap='Spectral',
    annot=False,
    fmt='.2f',
    ax=ax,
    cbar_kws={'label': r'correlation coefficient'},
)

# dashed red guide lines starting at position 4 on both axes
ax.plot([4, n_rows], [4, 4], 'r--')
ax.plot([4, 4], [4, n_rows], 'r--')

# no axis titles; parameter labels go on top, rotated to stay readable
ax.set_xlabel('')
ax.set_ylabel('')
ax.xaxis.tick_top()
#ax.xaxis.set_ticks(np.arange(0, n_rows, 1))
plt.xticks(rotation=90)

plt.tight_layout()
plt.subplots_adjust(top=0.9)
plt.savefig('plots/systematics/correlation_matrix.pdf')
plt.show()

In [None]:
# print each uncertainty component of the first four parameters as a
# percentage of the corresponding best-fit value
leading = slice(None, 4)
error_components = (
    (' stat. error: ', err_stat),
    (' syst. error: ', err_syst),
    (' MC stat. error: ', err_mc_stat),
    (' full error: ', stderr),
)
for component_label, component_err in error_components:
    print(component_label, 100*component_err[leading]/p_mle[leading])