# Useful functions to make plots

In [None]:
# python libraries
import glob
import numpy as np
import h5py
import math
import time
import matplotlib.pyplot as plt
import os
import getpass
from scipy.stats import chi2, norm, poisson

In [None]:
def Plot_Percentiles(tvalues_check, patience=1, title='', ymax=300, ymin=0, save=False, output_path=''):
    '''
    Plot the evolution over the training epochs of the [2.5%, 25%, 50%, 75%, 97.5%] quantiles of the toy sample distribution.

    tvalues_check: array of t=-2*loss, shape = (N_toys, N_check_points)
    patience: interval between two check points (epochs); check point i is drawn at epoch patience*(i+1).
    title: prefix of the output file name (used only when the figure is saved).
    ymin, ymax: limits of the y axis.
    save: if True, the figure is written to output_path+title+'_PlotPercentiles.png'.
    output_path: destination prefix, joined to the file name by plain string concatenation
                 (no path separator is added). If it is empty, a message is printed and nothing is saved.

    The figure is shown and then closed; nothing is returned.
    '''
    tvalues_check = np.asarray(tvalues_check)
    quantiles = [2.5, 25, 50, 75, 97.5]
    epochs_check = patience*np.arange(1, tvalues_check.shape[1]+1)
    # A single vectorized call replaces the per-check-point concatenation:
    # the result has one row per quantile and one column per check point.
    percentiles = np.percentile(tvalues_check, quantiles, axis=0)

    fig = plt.figure(figsize=(8, 8))
    plt.xlabel('Epoch', fontsize=12)
    plt.ylabel('t', fontsize=12)
    plt.ylim(ymin, ymax)
    for quantile, curve in zip(quantiles, percentiles):
        plt.plot(epochs_check, curve, marker='.', label=str(quantile)+' % quantile')
    plt.legend(fontsize=13)
    plt.grid()
    if save:
        if output_path=='':
            print('argument output_path is not defined. The figure will not be saved.')
        else:
            fig.savefig(output_path+title+'_PlotPercentiles.png')
    plt.show()
    plt.close(fig)
    return

def Plot_Percentiles_ref(tvalues_check, dof, patience=1, title='', wc=None, ymax=300, ymin=0, save=False, output_path=''):
    '''
    Plot the evolution over the training epochs of the [2.5%, 25%, 50%, 75%, 97.5%] quantiles of the toy sample distribution.
    The same quantiles of the target chi2 distribution (dof required!) are drawn as dashed grey reference lines.

    tvalues_check: array of t=-2*loss, shape = (N_toys, N_check_points)
    dof: degrees of freedom of the target chi2 distribution.
    patience: interval between two check points (epochs); check point i is drawn at epoch patience*(i+1).
    title: prefix of the output file name (used only when the figure is saved).
    wc: weight clipping value shown in the figure title; any value convertible with str() is accepted.
        If None, no title is drawn.
    ymin, ymax: limits of the y axis.
    save: if True, the figure is written to output_path+title+'_PlotPercentiles.png'.
    output_path: destination prefix, joined to the file name by plain string concatenation
                 (no path separator is added). If it is empty, a message is printed and nothing is saved.

    The figure is shown and then closed; nothing is returned.
    '''
    tvalues_check = np.asarray(tvalues_check)
    quantiles = [2.5, 25, 50, 75, 97.5]
    epochs_check = patience*np.arange(1, tvalues_check.shape[1]+1)
    # One row per quantile, one column per check point.
    percentiles = np.percentile(tvalues_check, quantiles, axis=0)

    fig = plt.figure(figsize=(8, 8))
    plt.xlabel('Training Epochs', fontsize=16)
    plt.ylabel('t', fontsize=16)
    plt.ylim(ymin, ymax)
    # Identity check against None, and str() so that a numeric wc does not break the concatenation.
    if wc is not None:
        plt.title('Weight Clipping = '+str(wc), fontsize=16)
    for quantile, curve in zip(quantiles, percentiles):
        plt.plot(epochs_check, curve, marker='.', linewidth=3, label=str(quantile)+' % quantile')
    for j, quantile in enumerate(quantiles):
        target = chi2.ppf(quantile/100., df=dof, loc=0, scale=1)
        # Only the first reference line gets a legend entry, so the target appears once.
        reference_label = "Target "+r"$\chi^2(dof=$"+str(dof)+")" if j==0 else None
        plt.plot(epochs_check, np.full(epochs_check.shape, target),
                 color='grey', ls='--', linewidth=1, label=reference_label)

    plt.legend(fontsize=16)
    if save:
        if output_path=='':
            print('argument output_path is not defined. The figure will not be saved.')
        else:
            fig.savefig(output_path+title+'_PlotPercentiles.png')
    plt.show()
    plt.close(fig)
    return


def plot_1distribution(t, df, xmin=0, xmax=300, nbins=10, save=False, output_path='', label='', title=''):
    '''
    Plot the histogram of a test statistics sample (t) and the target chi2 distribution (df must be specified!).
    The median and the error on the median are calculated in order to calculate the median Z-score and its error.

    t: 1D array with the test statistic values of the sample (its length is used as the sample size).
    df: degrees of freedom of the target chi2 distribution.
    xmin, xmax, nbins: range and number of equal-width bins of the histogram.
    save: if True, the figure is written to output_path+title+'_distribution.png'.
    output_path: destination prefix, joined to the file name by plain string concatenation
                 (no path separator is added). If it is empty, a message is printed and nothing is saved.
    label: name of the sample, reported in the legend.
    title: prefix of the output file name (used only when the figure is saved).

    The figure is shown and then closed; nothing is returned.
    '''
    t = np.asarray(t)
    n_toys = t.shape[0]
    median = np.median(t)
    std = np.std(t)
    fig  = plt.figure(figsize=(12, 9))
    # plot distribution histogram
    bins      = np.linspace(xmin, xmax, nbins+1)
    binswidth = (xmax-xmin)*1./nbins
    # Z-score of the median and of the median shifted by +/- its error
    Z_obs     = norm.ppf(chi2.cdf(median, df))
    # error on the median: 1.2533 (~sqrt(pi/2)) times the standard error of the mean
    t_obs_err = 1.2533*std*1./np.sqrt(n_toys)
    Z_obs_p   = norm.ppf(chi2.cdf(median+t_obs_err, df))
    Z_obs_m   = norm.ppf(chi2.cdf(median-t_obs_err, df))
    label  = 'sample %s\nsize: %i\nmedian: %s\nstd: %s\n'%(label, n_toys, str(np.around(median, 2)),str(np.around(std, 2)))
    label += 'Z = %s (+%s/-%s)'%(str(np.around(Z_obs, 2)), str(np.around(Z_obs_p-Z_obs, 2)), str(np.around(Z_obs-Z_obs_m, 2)))
    # each entry is weighted so that the histogram is normalized to unit area over [xmin, xmax]
    h = plt.hist(t, weights=np.full(t.shape, 1./(n_toys*binswidth)), color='lightblue', ec='dodgerblue',
                 bins=bins, label=label)
    # sqrt(counts) uncertainty expressed in the same normalized units as the bin heights
    err = np.sqrt(h[0]/(n_toys*binswidth))
    x   = 0.5*(bins[1:]+bins[:-1])
    plt.errorbar(x, h[0], yerr = err, color='dodgerblue', marker='o', ls='')
    # plot reference chi2
    x  = np.linspace(chi2.ppf(0.0001, df), chi2.ppf(0.9999, df), 100)
    plt.plot(x, chi2.pdf(x, df),'darkorange', lw=5, alpha=0.8, label=r'$\chi^2$('+str(df)+')')
    plt.legend(fontsize=14)
    plt.xlabel('t', fontsize=14)
    plt.ylabel('Probability', fontsize=14)
    if save:
        if output_path=='':
            print('argument output_path is not defined. The figure will not be saved.')
        else:
            # 'title' used to be an undefined name here (NameError); it is now a keyword argument
            fig.savefig(output_path+ title+'_distribution.png')
    plt.show()
    plt.close(fig)
    return

def _plot_sample_histogram(t, df, bins, binswidth, sample_label, facecolor, edgecolor):
    '''
    Draw on the current axes the normalized histogram (with error bars) of one test statistic sample.
    The legend entry reports sample size, median, std and the median Z-score (with its error) with respect to a chi2 with df degrees of freedom.
    '''
    n_toys = t.shape[0]
    median = np.median(t)
    std = np.std(t)
    # Z-score of the median and of the median shifted by +/- its error
    Z_obs     = norm.ppf(chi2.cdf(median, df))
    # error on the median: 1.2533 (~sqrt(pi/2)) times the standard error of the mean
    t_obs_err = 1.2533*std*1./np.sqrt(n_toys)
    Z_obs_p   = norm.ppf(chi2.cdf(median+t_obs_err, df))
    Z_obs_m   = norm.ppf(chi2.cdf(median-t_obs_err, df))
    label  = 'sample %s\nsize: %i\nmedian: %s\nstd: %s\n'%(sample_label, n_toys, str(np.around(median, 2)),str(np.around(std, 2)))
    label += 'Z = %s (+%s/-%s)'%(str(np.around(Z_obs, 2)), str(np.around(Z_obs_p-Z_obs, 2)), str(np.around(Z_obs-Z_obs_m, 2)))
    # each entry is weighted so that the histogram is normalized to unit area over the binned range
    h = plt.hist(t, weights=np.full(t.shape, 1./(n_toys*binswidth)), color=facecolor, ec=edgecolor,
                 bins=bins, label=label)
    # sqrt(counts) uncertainty expressed in the same normalized units as the bin heights
    err = np.sqrt(h[0]/(n_toys*binswidth))
    x   = 0.5*(bins[1:]+bins[:-1])
    plt.errorbar(x, h[0], yerr = err, color=edgecolor, marker='o', ls='')


def plot_2distribution(t1, t2, df, xmin=0, xmax=300, nbins=10, save=False, output_path='', label1='1', label2='2', title=''):
    '''
    Plot the histograms of two test statistics samples (t1, t2) and the target chi2 distribution (df must be specified!).
    For each sample the median and the error on the median are calculated in order to calculate the median Z-score and its error.

    t1, t2: 1D arrays with the test statistic values of the two samples.
    df: degrees of freedom of the target chi2 distribution.
    xmin, xmax, nbins: range and number of equal-width bins shared by the two histograms.
    save: if True, the figure is written to output_path+title+'_2distribution.png'.
    output_path: destination prefix, joined to the file name by plain string concatenation
                 (no path separator is added). If it is empty, a message is printed and nothing is saved.
    label1, label2: names of the two samples, reported in the legend.
    title: prefix of the output file name (used only when the figure is saved).

    The figure is shown and then closed; nothing is returned.
    '''
    fig  = plt.figure(figsize=(12, 9))
    # plot distribution histograms
    bins      = np.linspace(xmin, xmax, nbins+1)
    binswidth = (xmax-xmin)*1./nbins
    _plot_sample_histogram(np.asarray(t1), df, bins, binswidth, label1, 'lightblue', 'dodgerblue')
    _plot_sample_histogram(np.asarray(t2), df, bins, binswidth, label2, 'lightgreen', 'green')
    # plot reference chi2
    x  = np.linspace(chi2.ppf(0.0001, df), chi2.ppf(0.9999, df), 100)
    plt.plot(x, chi2.pdf(x, df),'darkorange', lw=5, alpha=0.8, label=r'$\chi^2$('+str(df)+')')
    plt.legend(fontsize=14, ncol=3, loc='upper center')
    plt.xlabel('t', fontsize=14)
    plt.ylabel('Probability', fontsize=14)
    # leave head-room above the chi2 peak for the legend
    plt.ylim(0., np.max(chi2.pdf(x, df))*1.3)
    if save:
        if output_path=='':
            print('argument output_path is not defined. The figure will not be saved.')
        else:
            # 'title' used to be an undefined name here (NameError); it is now a keyword argument
            fig.savefig(output_path+ title+'_2distribution.png')
    plt.show()
    plt.close(fig)
    return

def get_percntiles_Zscore(t, df, percentage_list=[], verbose=False):
    '''
    Compute the percentiles of a test statistic sample and the Z-score associated to each of them.

    t: test statistic sample.
    df: degrees of freedom of the chi2 distribution used to convert a percentile into a Z-score.
    percentage_list: percentages (in the range 0-100) at which the percentiles are evaluated.
    verbose: if True, one line per requested percentage is printed.

    Returns the array of percentiles and the array of the corresponding Z-scores.
    '''
    percentile_values = np.percentile(t, percentage_list)
    # chi2 cumulative probability of each percentile, expressed as a standard normal quantile
    cumulative = chi2.cdf(percentile_values, df)
    zscores = norm.ppf(cumulative)
    if not verbose:
        return percentile_values, zscores
    n_requested = percentile_values.shape[0]
    for k in range(n_requested):
        fields = (str(np.around(percentage_list[k], 2)),
                  str(np.around(percentile_values[k], 2)),
                  str(np.around(zscores[k], 2)))
        print('%s percentile: %s, Z-score: %s'%fields)
    return percentile_values, zscores

def get_percentage_from_Zscore (t, df, Zscore_star_list=[], verbose=False):
    '''
    For each Z-score-star in Zscore_star_list, compute the fraction of toys in the test statistic sample (t)
    whose value is strictly greater than the corresponding threshold t-star.

    t: test statistic sample (array; its first dimension is used as the number of toys).
    df: degrees of freedom of the chi2 distribution used to convert a Z-score into a threshold on t.
    Zscore_star_list: Z-score thresholds.
    verbose: if True, one line per threshold is printed.

    Returns the array of t-star thresholds and the array of fractions (values between 0 and 1).
    '''
    zscore_stars = np.array(Zscore_star_list)
    # Z-score -> standard normal cumulative probability -> chi2 quantile
    t_star_list = chi2.ppf(norm.cdf(zscore_stars), df)
    n_toys = t.shape[0]
    fractions = []
    for threshold in t_star_list:
        fractions.append(np.sum(t>threshold)*1./n_toys)
    percentage = np.array(fractions)
    if verbose:
        for k in range(percentage.shape[0]):
            fields = (str(np.around(Zscore_star_list[k], 2)),
                      str(np.around(t_star_list[k], 2)),
                      str(np.around(percentage[k], 2)))
            print('Z-score > %s: t > %s, percentage: %s'%fields)
    return t_star_list, percentage

In [None]:
def get_expected_Zref_Zprime(fs, fb, N_S, N_B):
    '''
    Expected reference Z-score for a Zprime kind of signal.
    Note: a signal region (SR) must be selected to use this function.

    fs: signal fraction in the SR
    fb: background fraction in the SR
    N_S: total expected number of signal events
    N_B: total expected number of background events

    The Poisson cumulative probability (mean = expected background in the SR) evaluated at the
    expected signal+background yield in the SR is converted into a standard normal quantile.
    '''
    background_in_SR = fb*N_B
    total_in_SR = fs*N_S+background_in_SR
    cumulative = poisson.cdf(total_in_SR, background_in_SR)
    return norm.ppf(cumulative)

def get_observed_Zref_Zprime(n_obs, fb, N_B):
    '''
    Observed reference Z-score for a Zprime kind of signal.
    Note: a signal region (SR) must be selected to use this function.

    n_obs: observed number of events in the SR
    fb: background fraction in the SR
    N_B: total expected number of background events

    The Poisson cumulative probability (mean = expected background in the SR) evaluated at n_obs
    is converted into a standard normal quantile.
    '''
    background_in_SR = fb*N_B
    cumulative = poisson.cdf(n_obs, background_in_SR)
    return norm.ppf(cumulative)
