In [11]:
#imports and config
%cd '/home/naodell/work/CMS/amumu'
%matplotlib notebook

from __future__ import division

import os, sys
from timeit import default_timer as timer

import numpy as np
from numpy.polynomial.legendre import legval
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from scipy.stats import norm, chi2
from scipy import integrate
from lmfit import Parameter, Parameters
from tqdm import tqdm_notebook

from nllfitter import Model, NLLFitter
import nllfitter.fit_tools as ft
import nllfitter.plot_tools as pt

# Apply the project's plotting style via nllfitter.plot_tools.
# NOTE(review): presumably sets matplotlib rcParams to a "TDR" look -- confirm in plot_tools.
pt.set_new_tdr()
# Default figure size (width, height), in inches per matplotlib's convention; set after the style call above.
matplotlib.rcParams['figure.figsize'] = (8,8)

/home/naodell/work/CMS/amumu


In [12]:
# Configuration for the data load, followed by the load itself.
# `do_data`, `output_path` and `features` are not referenced by any other cell visible in this notebook.
do_data     = False
ntuple_dir  = 'data/flatuples/mumu_2012'
output_path = 'plots/fits/mumu_2012'
datasets    = ['muon_2012A', 'muon_2012B', 'muon_2012C', 'muon_2012D']
# `signal` selects the MC mixture directory and is embedded in the saved plot name further down.
signal      = 'fcnc'
features    = ['dilepton_mass', 'dilepton_pt_over_m']
# Event selection as a single query expression; the same string is re-applied to the
# MC mixtures with DataFrame.query in a later cell.  It requires:
#   * both leptons with pt > 25 and |eta| < 2.1,
#   * lepton1_q != lepton2_q (presumably opposite charge -- confirm the meaning of `_q`),
#   * 12 < dilepton_mass < 70,
#   * and one of two jet categories:
#       - n_bjets == 1, n_fwdjets > 0, n_jets == 0, or
#       - n_bjets > 0, n_fwdjets == 0, n_bjets + n_jets == 2,
#         four_body_delta_phi > 2.5 and met_mag < 40.
# The trailing backslashes continue the string literal, so the indentation of the
# continuation lines ends up inside the string as extra spaces.
cuts        = '(lepton1_pt > 25 and abs(lepton1_eta) < 2.1 \
               and lepton2_pt > 25 and abs(lepton2_eta) < 2.1 \
               and lepton1_q != lepton2_q \
               and 12 < dilepton_mass < 70) \
               and ((n_bjets == 1 and n_fwdjets > 0 and n_jets == 0) or \
               (n_bjets > 0 and n_fwdjets == 0 and (n_bjets + n_jets) == 2 \
               and four_body_delta_phi > 2.5 and met_mag < 40))'


# Build the project's DataManager for the listed datasets with the selection string above,
# then pull the frame registered under the name 'data'.
# NOTE(review): how DataManager applies `cuts`, `selection` and `period` is not visible here.
data_manager = pt.DataManager(input_dir     = ntuple_dir,
                              dataset_names = datasets,
                              selection     = 'mumu',
                              period        = 2012,
                              cuts          = cuts
                             )
df_data = data_manager.get_dataframe('data')

Loading dataframes: 100%|███████████████| 4.00/4.00 [00:04<00:00, 1.06it/s]


In [13]:
# Build the background-only and signal+background fitters, then scan a lower
# threshold on dilepton_pt_over_m over the data and record the test statistic
# q = 2*(NLL_bg - NLL_sig) at every threshold.

# Background-only model: two free, unbounded parameters starting at 0.
bg_specs = [
    ('a1', 0., True, None, None, None),
    ('a2', 0., True, None, None, None),
]
bg_params = Parameters()
bg_params.add_many(*bg_specs)
bg_model  = Model(ft.bg_pdf, bg_params)
bg_fitter = NLLFitter(bg_model, verbose=False)

# Signal+background model: three bounded signal parameters plus a copy of the
# background parameters.  Tuple layout follows lmfit: (name, value, vary, min, max, expr).
sig_specs = [
    ('A',     0.01, True, 0.0, 1.,  None),
    ('mu',    29.,  True, 20., 50., None),
    ('gamma', 1.9,  True, 0.1, 3.,  None),
]
sig_params = Parameters()
sig_params.add_many(*sig_specs)
sig_params += bg_params.copy()
sig_model  = Model(ft.sig_pdf_alt, sig_params)
sig_fitter = NLLFitter(sig_model, verbose=False)

# Thresholds to scan: 60 evenly spaced values from 0 to 3 (inclusive).
scan_var = np.linspace(0., 3., 60)

# NOTE: despite its name this list stores q, not z; the square root is taken
# when the scan is plotted.
z_scores_data = []
progress = tqdm_notebook(scan_var, desc='scanning', unit_scale=True, total=len(scan_var))
for threshold in progress:
    # Dilepton masses of the events passing the current threshold.
    selected = df_data.query('dilepton_pt_over_m > {0}'.format(threshold))
    masses   = selected['dilepton_mass'].values

    # Fit both hypotheses (background first, as the NLLs are read from the models afterwards).
    bg_result  = bg_fitter.fit(masses, calculate_corr=False)
    sig_result = sig_fitter.fit(masses, calculate_corr=False)

    nll_bg  = bg_model.calc_nll(masses)
    nll_sig = sig_model.calc_nll(masses)
    z_scores_data.append(2*(nll_bg - nll_sig))





In [14]:
### Get the ensemble of MC (generated in mc_mixture.ipynb)
# Every entry of the mixture directory is read as a space-separated table and
# reduced with the same selection string (`cuts`) that is applied to the data.
input_dir = 'data/mc_mixes_{0}/'.format(signal)
mc_mixtures = []
# os.listdir returns entries in arbitrary order; sort them so that the order of
# `mc_mixtures` (and therefore any slice taken from it later) is reproducible
# from one run/machine to the next.
for f in tqdm_notebook(sorted(os.listdir(input_dir))):
    df_mc   = pd.read_csv('{0}/{1}'.format(input_dir, f), sep=' ')
    df_mc   = df_mc.query(cuts)
    mc_mixtures.append(df_mc)




In [15]:
# set up fitters for mc (I would just recycle them but I was getting some strange slow-down and it is easier to just re-initialize them)
#
# For each of the first 50 MC mixtures, repeat the threshold scan on
# dilepton_pt_over_m and store q = 2*(NLL_bg - NLL_sig) per threshold.
# Result: q_mc with one row per mixture and one column per entry of scan_var.

q_mc = []
for df_mc in tqdm_notebook(mc_mixtures[:50]):

    # Fresh background-only model for this mixture: two free, unbounded parameters.
    bg_params = Parameters()
    bg_params.add_many(
                       ('a1', 0., True, None, None, None),
                       ('a2', 0., True, None, None, None)
                      )
    bg_model  = Model(ft.bg_pdf, bg_params)
    bg_fitter = NLLFitter(bg_model, verbose=False)

    # Fresh signal+background model: bounded signal parameters plus a copy of
    # the background parameters.
    sig_params = Parameters()
    sig_params.add_many(
                        ('A'     , 0.01 , True , 0.0 , 1.  , None) ,
                        ('mu'    , 29.  , True , 20. , 50. , None) ,
                        ('gamma' , 1.9  , True , 0.1 , 3.  , None)
                       )
    sig_params += bg_params.copy()
    sig_model  = Model(ft.sig_pdf_alt, sig_params)
    sig_fitter = NLLFitter(sig_model, verbose=False)

    q = []
    for x in tqdm_notebook(scan_var,
                           desc       = 'scanning cuts',
                           unit_scale = True,
                           leave      = False,
                           #ncols      = 75,
                           total      = len(scan_var)
                      ):

        # Select from the untouched mixture into a separate name.  Previously the
        # loop variable `df_mc` itself was overwritten with the filtered frame, so
        # the cuts accumulated across iterations; that only gave the right events
        # because scan_var happens to be monotonically increasing.
        df_cut     = df_mc.query('dilepton_pt_over_m > {0}'.format(x))
        mass       = df_cut['dilepton_mass'].values
        bg_result  = bg_fitter.fit(mass, calculate_corr=False)
        sig_result = sig_fitter.fit(mass, calculate_corr=False)
        q0 = 2*(bg_model.calc_nll(mass) - sig_model.calc_nll(mass))
        q.append(q0)
    q_mc.append(q)
q_mc = np.array(q_mc)




In [16]:
# Clean the MC test statistics and convert them to z scores.
# A mixture (row) is dropped entirely if any of its scan points is NaN;
# negative q values are not dropped but set to zero.
has_nan = np.isnan(q_mc).any(axis=1)
q_mc    = q_mc[~has_nan]
q_mc    = np.where(q_mc < 0., 0., q_mc)

# z = sqrt(q); mean and standard deviation are taken over mixtures (axis 0),
# giving one value per scan point.
z_scores_mc   = np.sqrt(q_mc)
z_scores_mean = z_scores_mc.mean(axis=0)
z_scores_std  = z_scores_mc.std(axis=0)

In [21]:
# Significance versus the dilepton_pt_over_m threshold: data curve, MC mean and
# a +/- one standard deviation band from the MC ensemble.

# `z_scores_data` holds q values, not z.  Clip negative q to zero before the
# square root, exactly as is done for the MC ensemble; otherwise a negative q
# yields NaN (a gap in the curve) and a RuntimeWarning.  NaN entries pass
# through np.maximum unchanged.
z_data = np.sqrt(np.maximum(z_scores_data, 0.))

# Labels are attached to the artists themselves so the legend cannot get out of
# sync with the plotting order.
plt.plot(scan_var, z_data, 'b-', linewidth=2., label='data')
plt.plot(scan_var, z_scores_mean, 'r-', linewidth=2., label='simulation')
plt.fill_between(scan_var, z_scores_mean+z_scores_std, z_scores_mean-z_scores_std,
                 color='m',
                 alpha=0.25,
                 interpolate=True,
                 label=r'$\sf \pm \sigma$'
                )

plt.legend(loc=2)

plt.xlabel(r'$\sf p_{T}/M_{\mu\mu}$')
#plt.xlabel(r'$\sf p_{T,\,\mu2}$')
plt.ylabel(r'$\sf z_{0}$')
plt.grid()
plt.savefig('plots/pt_over_m_significance_{0}.pdf'.format(signal))
plt.show()

<IPython.core.display.Javascript object>

In [20]:
# Compare, mixture by mixture, the z score at the first scan point with the one
# at scan point 40 and histogram the absolute difference.
# NOTE(review): column 40 is hard-coded; presumably it is where the significance
# peaks -- confirm, or derive it from the scan instead.
nominal_idx, peak_idx = 0, 40
z0    = z_scores_mc[:, nominal_idx]
z_max = z_scores_mc[:, peak_idx]
z_diff = np.abs(z0 - z_max)

# NOTE(review): `normed` has been superseded by `density` in newer matplotlib
# releases -- check the installed version before running this on a fresh setup.
plt.hist(z_diff, bins=20, histtype='step', normed=True)
plt.xlabel(r'$\sf |z_{0} - z_{max}|$')
plt.grid()
plt.savefig('plots/z_diff_validate.pdf')
plt.show()

<IPython.core.display.Javascript object>