In [None]:
import os
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as sps
import scipy.optimize as optimize

# Searching for the Higgs boson

Recall that in week 7 of this course we searched for dark matter in a simplified data set.

This week we are going to search for the Higgs boson.  In spirit this is similar to the dark matter search,
but there are going to be a few differences.

1. The detectors are quite different.  The detectors at CERN that were used to discover the Higgs particle are massive, basically the size of a 6-story building (about 150 feet long and 75 feet tall).  The detectors capture information from many particles that are produced when two protons collide at very nearly the speed of light.  Complicated data analysis algorithms then filter that data to find events that might have contained a Higgs particle.  The last step of the analysis is to combine the particles that were flagged as possibly resulting from the Higgs particle's decay, and see if they are consistent with the mass of the Higgs particle.

2. In the dark matter search in week 7, we used the log(s2) (i.e., the log of the electron charge signal) to distinguish signal from background.  For the Higgs search we are going to be using the mass computed by combining different sets of particles collected in the detector.  For background events that mass will be randomly distributed, and we will model it as a linear function.  For signal events that mass will be centered on the mass of the Higgs particle ( $m_{H} = 125.1 \frac{\rm GeV}{c^2}$ ).   This means that in the Higgs search, we will be cutting on both sides of the signal, i.e., the signal events are those with $ | m - m_{H} | < w $, where $w$ is the width of the cut window.

3. In this search, we have much more data, even at the last step of the data analysis.  Both the rate of signal events and the rate of background events are much higher than they were for the dark matter search.


You can try two different ways of doing the search.

1. "Cut and count".  Basically you define a signal region and a background region and show that there are more events in the signal region than in the background region.  In our case, since the background is not constant, it makes sense to pick two background regions, one on either side of the signal, and to average them.

<img src="figures/higgs_cut_and_count_2.png" width="400"/>


2. "Fitting".  In this case you make signal and background models and use a fit optimizer to extract the size of the signal.

<img src="figures/Higgs_Mass_fit.png" width="400"/>


For comparison, here is a figure from the Higgs discovery paper:

<img src="figures/2012Higgsplot.png" width="400"/>


Some goals for this project could include:

1. Optimize the width of the cut window

2. Estimate the significance you might expect as a function of time using the "cut and count" analysis

3. Apply the "cut and count" analysis to the real data.

4. Apply the "fitting" analysis to the real data, and compare the results with the "cut and count" analysis.

# Some useful stuff

First make our lives easy by using the true values for the Higgs particle mass and the width of the mass peak.

Then we will suppose that we are going to be histogramming the Higgs mass data in bins of $1 \frac{\rm GeV}{c^2}$, as was done in the plots above, and that we will look at all the data between 90 and 160 $\frac{\rm GeV}{c^2}$.

In [None]:
Higgs_Mass = 125.1  # Higgs boson mass, in units of GeV / c**2
Higgs_Width = 4.2   # Width of the mass peak (Gaussian sigma of the signal model), in units of GeV / c**2

# 71 evenly spaced mass points from 90 to 160, i.e. one point per 1 GeV / c**2
mass_grid = np.linspace(90., 160., 71)

# Some useful functions for cut and counting

You can use these three functions to:

1. `passed_cuts`: tells how many signal and background events from the idealized data pass your cuts if you use a cut window of a particular width, using models for how much signal and background we expect.   

2. `extract_sig_from_data`: tells you how many events are in your signal region in the "real" data.  Since it is real data, you don't know if they are signal or background.

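3. `estimate_bkg_from_data`: tells you how many background events to expect in your signal region, estimated by averaging the two background regions in the "real" data, together with the uncertainty on that number.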

In [None]:
def passed_cuts(cut_width, masses, model_sig, model_bkg):
    """Sum the modelled signal and background inside the signal window.

    The window is every mass point strictly closer than ``cut_width`` to 125.

    Parameters
    ----------
    cut_width : float
        Half-width of the window around 125.
    masses : numpy array
        Mass value of each point.
    model_sig, model_bkg : numpy arrays
        Modelled signal / background at each mass point (same length as ``masses``).

    Returns
    -------
    (n_sig, n_bkg) : summed signal and summed background inside the window.
    """
    in_window = np.abs(masses - 125) < cut_width
    return model_sig[in_window].sum(), model_bkg[in_window].sum()

def extract_sig_from_data(cut_width, masses, nevts):
    """Count the observed events inside the signal window.

    The window is every mass point strictly closer than ``cut_width`` to 125.
    ``nevts`` holds the observed count at each entry of ``masses``.
    """
    distance_from_peak = np.abs(masses - 125)
    selected = nevts[distance_from_peak < cut_width]
    return selected.sum()

def estimate_bkg_from_data(cut_width, masses, nevts):
    """Estimate the background from two sidebands centred on 105 and 145.

    Each sideband holds the mass points strictly closer than ``cut_width`` to
    its centre.  The estimate is half of the total count in the union of the
    two sidebands; points that fall in both sidebands are counted once.

    Returns
    -------
    (estimate, sqrt(estimate))
    """
    near_low_centre = np.abs(masses - 105) < cut_width
    near_high_centre = np.abs(masses - 145) < cut_width
    in_sideband = near_low_centre | near_high_centre
    bkg_estimate = nevts[in_sideband].sum() / 2
    return (bkg_estimate, np.sqrt(bkg_estimate))

# Some useful functions for optimizing the cuts and setting expectations

You can use these three functions to:

1. `plot_nexp_passed_cuts`: makes a plot of how many signal and background events from the idealized data pass your cuts, as a function of the cut width.

2. `find_sig2noise`: makes a plot of the signal to noise, i.e., the number of signal events divided by the uncertainty in the background, as this is the key quantity for understanding the significance of the signal.

3. `sig2noise_v_time`: shows you how the signal to noise will increase with time, assuming you keep taking data.

In [None]:
def plot_nexp_passed_cuts(masses, model_sig, model_bkg):
    """Plot the modelled signal and background passing the cut versus cut width.

    Cut widths 0, 1, ..., 25 are scanned; for each one ``passed_cuts`` gives
    the summed signal and background inside the window.  Both curves are
    drawn on the current matplotlib axes with a logarithmic y axis.

    Parameters
    ----------
    masses : numpy array
        Mass value of each point.
    model_sig, model_bkg : numpy arrays
        Modelled signal / background at each mass point.
    """
    widths = np.linspace(0, 25, 26)
    sig_cts = np.zeros(len(widths))
    bkg_cts = np.zeros(len(widths))
    for i, width in enumerate(widths):
        sig_cts[i], bkg_cts[i] = passed_cuts(width, masses, model_sig, model_bkg)
    _ = plt.plot(widths, sig_cts, label="Signal")
    _ = plt.plot(widths, bkg_cts, label="Background")
    _ = plt.yscale('log')
    _ = plt.xlabel(r"Cut Width [GeV / $c^2$]")
    _ = plt.ylabel(r"Events [per GeV / $c^2$ / month]")
    # The curves carry labels, so draw the legend; without it the two lines
    # cannot be told apart.
    _ = plt.legend()
    
def find_sig2noise(mass_grid, model_sig, model_bkg, plot=True):
    """Compute signal / sqrt(background) for cut widths 0, 1, ..., 25.

    Parameters
    ----------
    mass_grid : numpy array
        Mass value of each point.
    model_sig, model_bkg : numpy arrays
        Modelled signal / background at each mass point.
    plot : bool
        If True, also draw the curve on the current matplotlib axes.

    Returns
    -------
    numpy array of length 26; entry ``i`` is the signal-to-noise for cut
    width ``i``.  Entry 0 (empty window) is 0, as is any entry whose
    background sum is not positive.
    """
    widths = np.linspace(0, 25, 26)
    sig_cts = np.zeros(len(widths))
    bkg_cts = np.zeros(len(widths))
    for i, width in enumerate(widths):
        if i == 0:
            # A zero-width window selects nothing; leave both counts at 0.
            continue
        sig_cts[i], bkg_cts[i] = passed_cuts(width, mass_grid, model_sig, model_bkg)
    sig2noise = np.zeros(len(widths))
    # Only divide where the background is positive, so that no 0/0 or
    # sqrt(negative) is ever evaluated.
    has_bkg = bkg_cts > 0
    has_bkg[0] = False
    sig2noise[has_bkg] = sig_cts[has_bkg]/np.sqrt(bkg_cts[has_bkg])
    if plot:
        # Plot the guarded values; the width-0 point is undefined and skipped.
        _ = plt.plot(widths[1:], sig2noise[1:])
        _ = plt.xlabel(r"Cut Width [GeV / $c^2$]")
        _ = plt.ylabel(r"$\frac{n_{\rm sig}}{\sqrt{n_{\rm bkg}}}$ for one month")
    return sig2noise

def sig2noise_v_time(mass_grid, model_sig, model_bkg, plot=True):
    """Best achievable signal-to-noise after 0, 1, ..., 23 months of data.

    For each number of months the per-month models are scaled by that number
    and ``find_sig2noise`` is scanned over cut widths; the maximum over the
    cut widths is kept.

    Parameters
    ----------
    mass_grid : numpy array
        Mass value of each point.
    model_sig, model_bkg : numpy arrays
        Modelled signal / background per month at each mass point.
    plot : bool
        If True, scatter-plot the result on the current matplotlib axes.

    Returns
    -------
    numpy array of length 24; entry ``n`` is the maximum signal-to-noise for
    ``n`` months.  Entry 0 (no data) is left at 0.
    """
    n_months_array = np.arange(24)
    max_s2n = np.zeros(len(n_months_array))
    # Start at one month: with zero months there is no data to evaluate.
    for n_months in n_months_array[1:]:
        s2n = find_sig2noise(mass_grid, n_months*model_sig, n_months*model_bkg, plot=False)
        max_s2n[n_months] = np.max(s2n)
    if plot:
        _ = plt.scatter(n_months_array, max_s2n)
        _ = plt.xlabel(r"Time [months]")
        _ = plt.ylabel(r"$\frac{n_{\rm sig}}{\sqrt{n_{\rm bkg}}}$ for N months")

    return max_s2n

# Some useful functions for fitting the data

The important one here is `fitAndPlotResult`, which will fit your data and plot the results.

You need to provide it with:

1. masses: the array of mass points

2. nevts: the number of events observed at each mass

3. ref_mass: the reference mass for the background model (use $125 \frac{\rm GeV}{c^2}$)

4. init_pars: guesses for the initial parameters.

The three parameters are: 

1. The total number of signal events.

2. The number of background events in the bin at the reference mass.

3. The slope of the background model, in events per bin.

In [None]:
from functools import partial

def Gauss(x, nsig, mu, sigma):
    """Gaussian peak: the normal pdf with mean ``mu`` and standard deviation
    ``sigma`` evaluated at ``x``, scaled by ``nsig``."""
    unit_area_peak = sps.norm.pdf(x, loc=mu, scale=sigma)
    return nsig * unit_area_peak

def poly1(x, ref_mass, offset, slope):
    """Straight line in ``x`` that takes the value ``offset`` at ``x == ref_mass``
    and changes by ``slope`` per unit of ``x``."""
    distance_from_ref = x - ref_mass
    return slope * distance_from_ref + offset

def model_func(x, ref_mass, nsig, offset, slope):
    """Full model: Gaussian signal plus linear background.

    The signal peak position and width are fixed to the notebook-level
    ``Higgs_Mass`` and ``Higgs_Width``; ``nsig`` scales the peak, while
    ``offset`` and ``slope`` describe the background line around ``ref_mass``.
    """
    signal = Gauss(x, nsig, Higgs_Mass, Higgs_Width)
    background = poly1(x, ref_mass, offset, slope)
    return signal + background

def generic_chi2(params, data_vals, model, x, ref_mass):
    """Chi-square between observed counts and a model, using the observed
    count in each bin as that bin's variance.

    Parameters
    ----------
    params : sequence of float
        Free parameters, passed to ``model`` after ``x`` and ``ref_mass``.
    data_vals : array-like
        Observed counts.
    model : callable
        Called as ``model(x, ref_mass, *params)``; returns the predicted counts.
    x : array-like
        Points at which the model is evaluated (one per entry of ``data_vals``).
    ref_mass : float
        Reference mass forwarded to ``model``.

    Returns
    -------
    float : sum over bins of (data - model)**2 / variance.
    """
    data_vals = np.asarray(data_vals, dtype=float)
    model_vals = model(x, ref_mass, *params)
    # An empty bin would give a zero variance and hence a division by zero
    # (inf / nan cost that derails the minimizer); use a variance of 1 there.
    variance = np.where(data_vals > 0, data_vals, 1.0)
    return np.sum(((data_vals - model_vals)**2)/variance)

def cost_func(data_vals, model, x, ref_mass):
    """Build a one-argument cost function for the minimizer.

    Returns ``generic_chi2`` with the data, model, evaluation points and
    reference mass already bound, so that only ``params`` remains free.
    """
    bound_arguments = {
        "data_vals": data_vals,
        "model": model,
        "x": x,
        "ref_mass": ref_mass,
    }
    return partial(generic_chi2, **bound_arguments)

def fitAndPlotResult(masses, nevts, ref_mass, init_pars):
    """Fit the signal-plus-background model to binned data, print and plot it.

    Parameters
    ----------
    masses : numpy array
        Mass value of each bin.
    nevts : numpy array
        Observed number of events in each bin.
    ref_mass : float
        Reference mass of the linear background model (the background equals
        its offset parameter at this mass).
    init_pars : sequence of 3 floats
        Starting values, in order: number of signal events, background at
        ``ref_mass``, background slope.

    Returns
    -------
    (nsig, nsig_err) : fitted number of signal events and its uncertainty.
    """
    our_cost_func = cost_func(nevts, model_func, masses, ref_mass=ref_mass)
    result = optimize.minimize(our_cost_func, x0=np.array(init_pars, dtype=float))
    fit_pars = result['x']
    # The cost is a chi-square, so a one-sigma shift raises it by 1:
    # chi2 ~ chi2_min + 0.5 * d^T H d, hence covariance = 2 * H^-1.
    # Using H^-1 directly would understate every uncertainty by sqrt(2).
    cov = 2.0 * np.asarray(result['hess_inv'])
    model_fit = model_func(masses, ref_mass, *fit_pars)
    background_fit = poly1(masses, ref_mass, fit_pars[1], fit_pars[2])
    print("Best Fit ---------")
    print("N Signal: %.1f [Events]" % fit_pars[0])
    print(r"Higgs Peak: %.4f $[\frac{\rm GeV}{c^2}]$" % Higgs_Mass)
    print(r"Higgs Width: %.4f $[\frac{\rm GeV}{c^2}]$" % Higgs_Width)
    # The offset parameter is the background at ref_mass, which need not be 125.
    print(r"Background at %.1f GeV: %.2f $[{\rm Events} / \frac{\rm GeV}{c^2}]$" % (ref_mass, fit_pars[1]))
    print(r"Background slope: %.2f $[{\rm Events} / \frac{\rm GeV}{c^2} / \frac{\rm GeV}{c^2}]$" % fit_pars[2])
    _ = plt.errorbar(masses, nevts, yerr=np.sqrt(nevts), fmt='.', label="data")
    _ = plt.plot(masses, background_fit, label="background model")
    _ = plt.plot(masses, model_fit, label="full model")
    _ = plt.xlabel(r"mass $[\frac{\rm GeV}{c^2}]$")
    _ = plt.ylabel(r"Events $[{\rm per }\frac{\rm GeV}{c^2}]$")
    _ = plt.legend()
    return (fit_pars[0], np.sqrt(cov[0,0]))

# Here are your models for the signal and background.

Note that they are expressed in terms of events per month.

In [None]:
# Idealized one-month expectation for signal and background, evaluated on mass_grid.
# NOTE(review): the "_mev_" in the two background names looks like a typo for GeV --
# the mass axis and the plot labels below are in GeV/c^2; confirm before renaming.
ref_mass = 130.  # mass at which the linear background equals its offset
nsig_per_month = 20.  # scale factor applied to the unit-area Gaussian signal
nbkg_per_mev_per_month = 40.  # background level at ref_mass
bkg_slope_per_mev_per_month = -0.2  # change in background per unit mass
model_bkg = poly1(mass_grid, ref_mass, nbkg_per_mev_per_month, bkg_slope_per_mev_per_month)
model_sig = Gauss(mass_grid, nsig_per_month, Higgs_Mass, Higgs_Width)

# Overlay the signal, the background and their sum on one figure.
_ = plt.scatter(mass_grid, model_sig, label="Signal", marker='.')
_ = plt.scatter(mass_grid, model_bkg, label="Background", marker='.')
_ = plt.scatter(mass_grid, model_sig+model_bkg, label="Combined", marker='.')
_ = plt.xlabel(r"Mass [GeV/$c^2$]")
_ = plt.ylabel(r"Counts [per GeV/$c^2$ / month]")
_ = plt.legend()

# Here is your "real" data.

Note that this is simulated for 24 months of data.

In [None]:
# Read the data table: column 0 is used as the mass, column 1 as the event count.
data = np.loadtxt('../data/Higgs.txt')
masses = data[:,0]
nevts = data[:,1]
# Error bars are the square root of the counts.
errors = np.sqrt(nevts)
_ = plt.errorbar(masses, nevts, yerr=errors, fmt='.')
_ = plt.xlabel(r"Mass [GeV/$c^2$]")
_ = plt.ylabel(r"Counts [per GeV/$c^2$]")

In [None]:
# Initial guesses, in order: number of signal events, background at ref_mass, background slope.
fitAndPlotResult(masses, nevts, ref_mass, [480, 1000, -5.])

In [None]:
# NOTE(review): this cell repeats the earlier data-loading cell verbatim -- it re-reads
# the same file, rebinds the same names and redraws the same plot.  Presumably left as a
# starting point for further work; confirm, or drop the reload and reuse the arrays.
data = np.loadtxt('../data/Higgs.txt')
masses = data[:,0]
nevts = data[:,1]
# Error bars are the square root of the counts.
errors = np.sqrt(nevts)
_ = plt.errorbar(masses, nevts, yerr=errors, fmt='.')
_ = plt.xlabel(r"Mass [GeV/$c^2$]")
_ = plt.ylabel(r"Counts [per GeV/$c^2$]")