In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rc
rc('text', usetex=True)
font = {'size'   : 18}
rc('font', **font)
plt.rcParams['text.latex.preamble'] = r'\usepackage{amsmath}'
import scipy.stats as st

from sklearn.model_selection import cross_val_predict, KFold
import os
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import torch
from torch import nn
from torch.autograd.functional import jacobian
from functions import *
from copy import deepcopy

This notebook trains SA-CWoLA on the previously generated input.

# Data loader

Let's define the directory where we'll save the results of training

In [None]:
# Directory where every output of this notebook (.npy files) is written.
results_dir='training/'
# np.save does not create missing folders, so make sure the directory exists
# before any result is written (no-op when it is already there).
os.makedirs(results_dir, exist_ok=True)

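Load the data (`*_RandD.npy`) and simulation (`*_BB1.npy`) arrays. From the simulation only the events with label 0 are kept.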

In [None]:
# --- Data sample: features (x), invariant mass (y) and truth labels ---
x_together = np.load('x_RandD.npy')
y_together = np.load('y_RandD.npy')
labels_together = np.load('labels_RandD.npy')
# Replace the first feature by the difference of the first two columns ($\Delta m$).
x_together[:, 0] -= x_together[:, 1]

# Signal injection: number of label-0 (background) events and the number of
# signal events corresponding to the requested S/B.
S_over_B = 0.0 # can at most be 0.1 if you want to keep approx 1M background events
nB = np.sum(labels_together == 0.0)
nS = int(S_over_B * nB)
print(nB, nS)

# --- Simulation sample: same preprocessing, then keep label-0 events only ---
x_sim = np.load('x_BB1.npy')
y_sim = np.load('y_BB1.npy')
labels_sim = np.load('labels_BB1.npy')
x_sim[:, 0] -= x_sim[:, 1]
is_background_sim = labels_sim == 0.0
x_sim = x_sim[is_background_sim]
y_sim = y_sim[is_background_sim]

In [None]:
# Keep the first nB+nS events of the data sample.
# NOTE(review): this presumably relies on the arrays being ordered with all
# background events first, followed by signal — confirm against the generator.
n_keep = nB + nS
# Sum of the labels in the kept slice (a quick cross-check of the injected signal).
print(np.sum(labels_together[:n_keep]))
x_together_bis = x_together[:n_keep]
y_together_bis = y_together[:n_keep]
labels_together_bis = labels_together[:n_keep]

I standardize the features for training (the scaler is fit on data only and then applied to the simulation)

In [None]:
# Fit the standardization on data only (I only learn from data, not from simulation)
# and apply the very same transformation to both samples.
scaler = StandardScaler().fit(x_together_bis)
x_sim_bis = scaler.transform(x_sim)
x_together_bis = scaler.transform(x_together_bis)

# The invariant mass is left unscaled; y_together_bis is already defined,
# the simulation just gets its "_bis" alias.
y_sim_bis = y_sim

Now let's shorten the mass range

In [None]:
# same as 2009.02205
# Mass window, same as 2009.02205
y_low = 3100.0
y_high = 3900.0

# Data: keep events with y_low <= y <= y_high (both edges inclusive).
in_window = (y_together_bis >= y_low) & (y_together_bis <= y_high)
x_together_bis = x_together_bis[in_window]
labels_together_bis = labels_together_bis[in_window]
y_together_bis = y_together_bis[in_window]

# Simulation: same window.
in_window_sim = (y_sim_bis >= y_low) & (y_sim_bis <= y_high)
x_sim_bis = x_sim_bis[in_window_sim]
y_sim_bis = y_sim_bis[in_window_sim]

Invariant mass binning

In [None]:
# y_nbins equally spaced quantile levels between 0 and 1 give y_nbins bin
# edges (i.e. y_nbins-1 equally populated bins) of the data invariant mass.
y_nbins = 25
y_bins = np.quantile(y_together_bis, np.arange(y_nbins)*1.0/(y_nbins-1))


In [None]:
# Sum of labels over number of events inside the mass window.
# NOTE(review): for 0/1 labels this is S/(S+B) rather than S/B, despite the variable name.
n_events = len(labels_together_bis)
signal_fraction = np.sum(labels_together_bis)/n_events
print(signal_fraction, n_events)
SoverB = np.round(signal_fraction, 3)
print(SoverB)

Define $M_{1}$ and $M_{2}$

In [None]:
# Requested signal-region limits; the actual limits are snapped to the closest bin edges.
SR_min = 3300.0
SR_max = 3700.0
bins_SR = [np.argmin(np.abs(y_bins - edge)) for edge in (SR_min, SR_max)]
SR = [y_bins[bins_SR[0]], y_bins[bins_SR[1]]]

# Data: label 1 inside the signal region (SR[0] < y <= SR[1]), 0 outside.
in_SR = (y_together_bis > SR[0]) & (y_together_bis <= SR[1])
labels_mixture_together = in_SR.astype(int)
# Simulation: inverted convention, 0 inside the signal region and 1 outside.
in_SR_sim = (y_sim_bis > SR[0]) & (y_sim_bis <= SR[1])
labels_sim_mixture = np.where(in_SR_sim, 0, 1)


In [None]:
# Show the snapped signal-region limits, the indices of the corresponding
# bin edges and the edge values themselves.
print(SR)
bins_SR = [np.argmin(np.abs(y_bins - edge)) for edge in (SR_min, SR_max)]
print(bins_SR)
print(y_bins[bins_SR])


# CWoLA

I group everything together

In [None]:
# Training set: data events first, simulation events appended afterwards.
# The same ordering is used for the features and for the mixture labels.
x_train = np.concatenate([x_together_bis, x_sim_bis], axis=0)
y_train = np.concatenate([labels_mixture_together, labels_sim_mixture])
print(x_train.shape)
print(y_train.shape)

## NN architecture and definitions

In [None]:
# (width, activation) per layer: three hidden layers of 64 ReLU units and a
# single sigmoid output unit.
# Alternative that was tried:
# [(8, nn.ReLU()),(16, nn.ReLU()),(8, nn.ReLU()),(4, nn.ReLU()),(1, nn.Sigmoid())]
layers_data = [(64, nn.ReLU()) for _ in range(3)] + [(1, nn.Sigmoid())]

In [None]:
# Number of input features = number of columns of the training matrix.
dim_input = x_train.shape[1]
# NeuralNetwork comes from the local `functions` module (star import above).
model = NeuralNetwork(dim_input=dim_input,layers_data=layers_data)
# Start from freshly (re)initialized weights.
model.reset_weights()

In [None]:
# List every entry of the model's state_dict together with its tensor size.
print("Model's state_dict:")
for param_name, param_value in model.state_dict().items():
    print(param_name, "\t", param_value.size())

In [None]:
#KFolding
kf = KFold(n_splits=10,shuffle=True,random_state=42)

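For every value of the regularization hyper-parameter $\lambda$, this function runs CWoLA with K-fold cross-validation. Within each fold it first runs several short trainings with different random initializations, keeps the one with the lowest loss on the held-out data events, and trains it further. It returns the network output for every data and simulation event, always evaluated in the fold where that event was held out.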

In [None]:
def SA_CWoLA_optimization(lambda_values,nseeds=2,model=model,kf=kf,x_train=x_train,y_train=y_train,x_together_bis=x_together_bis,x_sim_bis=x_sim_bis,y_together_bis=y_together_bis,y_sim_bis=y_sim_bis,labels_mixture_together=labels_mixture_together,labels_sim_mixture=labels_sim_mixture):
    """Run K-fold SA-CWoLA for every value of the regularization strength.

    For each ``lambda_val`` the per-event training weights are built so that the
    four classes (data M1/M2, simulation M1/M2) carry the same total weight, with
    the simulation weights additionally multiplied by ``lambda_val``. For each
    fold, ``nseeds`` short trainings (5 epochs) are run from different seeds, the
    one with the lowest loss on the held-out *data* events is restored and trained
    for 15 more epochs, and the network output is stored for the held-out events.

    Note: the default values of the keyword arguments are the notebook globals
    captured when this cell is executed; pass the arrays explicitly if they
    have been rebuilt afterwards.

    Parameters
    ----------
    lambda_values : sequence of float
        Relative weights of the simulation term to scan.
    nseeds : int
        Number of initializations tried per fold (must be >= 1).
    model : object
        Network exposing ``reset_weights()``, ``Train(dataset, batch_size, epochs,
        learning_rate)``, ``loss_function(pred, target, weights)``,
        ``state_dict()`` / ``load_state_dict()`` and being callable on a tensor.
    kf : object
        Splitter exposing ``split(indices)`` yielding (train_idx, test_idx).
    x_train, y_train : ndarray
        Features and mixture labels of data followed by simulation; the first
        ``len(x_together_bis)`` rows are treated as data.
    x_together_bis, x_sim_bis : ndarray
        Data and simulation features (only their lengths are used here).
    y_together_bis, y_sim_bis : ndarray
        Not used by the function body; kept for interface compatibility.
    labels_mixture_together : ndarray
        Data mixture labels (1 = M1, 0 = M2).
    labels_sim_mixture : ndarray
        Simulation mixture labels, inverted (0 = M1, 1 = M2).

    Returns
    -------
    s_values : ndarray, shape (len(lambda_values), len(x_together_bis))
        Out-of-fold network output for every data event.
    s_sim : ndarray, shape (len(lambda_values), len(x_sim_bis))
        Out-of-fold network output for every simulation event.

    Raises
    ------
    ValueError
        If ``nseeds < 1`` or if any of the four classes is empty.
    """
    if nseeds < 1:
        raise ValueError("nseeds must be at least 1")

    n_data = len(x_together_bis)
    n_data_labels = len(labels_mixture_together)
    n_sim_labels = len(labels_sim_mixture)

    s_values = np.zeros((len(lambda_values), n_data))
    s_sim = np.zeros((len(lambda_values), len(x_sim_bis)))

    # Class populations do not depend on lambda, so they are computed once.
    # M1 and M2 labelling (inverted for simulation!)
    nM1 = np.sum(labels_mixture_together == 1.0)
    nM2 = np.sum(labels_mixture_together == 0.0)
    nM1_sim = np.sum(labels_sim_mixture == 0.0)
    nM2_sim = np.sum(labels_sim_mixture == 1.0)
    list_of_n = np.array([nM1, nM2, nM1_sim, nM2_sim])
    if np.any(list_of_n == 0):
        # An empty class would turn the weights below into 0 or NaN.
        raise ValueError("Every class (data M1/M2, simulation M1/M2) needs at least one event")
    # Every class is scaled down to the size of the smallest one.
    list_of_weights = np.min(list_of_n) / list_of_n

    # Per-event weights before applying lambda to the simulation part.
    data_weights = np.where(labels_mixture_together == 1.0, list_of_weights[0], list_of_weights[1])
    sim_unit_weights = np.where(labels_sim_mixture == 0.0, list_of_weights[2], list_of_weights[3])

    for nlambda_val, lambda_val in enumerate(lambda_values):
        print(nlambda_val, lambda_val)

        # Every class weighs the same; lambda sets the relative size of the
        # simulation term with respect to the data term.
        weights_training = np.zeros(y_train.shape)
        weights_training[:n_data_labels] = data_weights
        weights_training[n_data_labels:n_data_labels + n_sim_labels] = lambda_val * sim_unit_weights

        # KFolding
        for fold, (train_idx, test_idx) in enumerate(kf.split(np.arange(len(x_train)))):
            print('------------fold no---------{}----------------------'.format(fold))
            # Build the training dataset
            train = xyDataset(
                x_train[train_idx],
                y_train[train_idx].reshape(-1, 1),
                weights_training[train_idx].reshape(-1, 1),
            )

            # Separate the held-out indices into data and simulation
            test_idx_mod = test_idx[test_idx < n_data]
            test_idx_mod_sim = test_idx[test_idx >= n_data]

            # Held-out tensors are the same for every seed: build them once per fold.
            x_test_tensor = torch.from_numpy(np.array(x_train[test_idx_mod], dtype=np.float32))
            x_test_tensor_sim = torch.from_numpy(np.array(x_train[test_idx_mod_sim], dtype=np.float32))
            y_test_tensor = torch.from_numpy(np.array(y_train[test_idx_mod].reshape(-1, 1), dtype=np.float32))
            weights_test_tensor = torch.from_numpy(np.array(weights_training[test_idx_mod].reshape(-1, 1), dtype=np.float32))

            # Training begins, first nseeds short trainings
            initial_losses = np.zeros(nseeds)
            model_aux_states = [None] * nseeds
            for nseed in range(nseeds):
                torch.manual_seed(nseed)
                print(nseed)
                model.reset_weights()

                model.Train(train, batch_size=200, epochs=5, learning_rate=0.001)
                model_aux_states[nseed] = deepcopy(model.state_dict())

                # NOTE(review): the seed is ranked on the held-out data events of
                # this fold, i.e. the same events that are scored below.
                # Pure evaluation: no autograd graph is needed.
                with torch.no_grad():
                    s_values_aux = model(x_test_tensor)
                    initial_losses[nseed] = model.loss_function(s_values_aux, y_test_tensor, weights_test_tensor).detach().numpy()

            # Get the best seed and train a little bit more
            best_seed = int(np.argmin(initial_losses))
            print("Min seed")
            print(best_seed)
            torch.manual_seed(best_seed)
            # reset_weights() is kept before load_state_dict() so that the random
            # number stream seen by the subsequent training is unchanged.
            model.reset_weights()
            model.load_state_dict(model_aux_states[best_seed])

            model.Train(train, batch_size=200, epochs=15, learning_rate=0.001)

            # Once training is done, evaluate on unseen data and save the values of the output
            with torch.no_grad():
                s_values[nlambda_val, test_idx_mod] = model(x_test_tensor).detach().numpy()[:, 0]
                s_sim[nlambda_val, test_idx_mod_sim - n_data] = model(x_test_tensor_sim).detach().numpy()[:, 0]

    return s_values, s_sim

Now we perform the scan in both $S/B$ and $\lambda$

In [None]:
# Values of S/B used to size the injected signal (nS = int(soverb*nB) in the
# scan loop) and the labels stored alongside them.
# NOTE(review): the labels '0.0025' and '0.005' pair with 2x and 4x 0.00035;
# with that scaling the last value, 0.0035, would pair with 0.0125 rather than
# '0.01' (which would be 8x 0.00035 = 0.0028) — confirm which one is intended.
soverbs = [0.0, 2*0.00035, 4*0.00035, 0.0035]
soverbs_labels = ['0.0', '0.0025', '0.005', '0.01']
np.save(results_dir+'soverbs.npy', soverbs)
np.save(results_dir+'soverbs_labels.npy', soverbs_labels)

In [None]:
# Values of lambda to scan. lambda multiplies the simulation weights in
# SA_CWoLA_optimization, so 0.0 switches the simulation term off.
lambda_values = np.array([0.0,1.0])
# Stored next to the results so the scan can be interpreted later.
np.save(results_dir+'lambda_values.npy',lambda_values)

For each $S/B$ we rebuild the dataset and perform SA-CWoLA

In [None]:
for nsoverb, soverb in enumerate(soverbs):
    print(soverb)
    # Number of signal events injected for this S/B
    nS = int(soverb*nB)
    print(nB, nS)

    # Keep the first nB+nS events of the data sample
    n_keep = nB + nS
    y_together_bis = y_together[:n_keep]
    labels_together_bis = labels_together[:n_keep]

    # Standardization is learned on data only and then applied to the simulation
    scaler = StandardScaler()
    x_together_bis = scaler.fit_transform(x_together[:n_keep])
    x_sim_bis = scaler.transform(x_sim)
    y_sim_bis = y_sim

    # Mass window y_low <= y <= y_high, for data ...
    in_window = (y_together_bis >= y_low) & (y_together_bis <= y_high)
    x_together_bis = x_together_bis[in_window]
    labels_together_bis = labels_together_bis[in_window]
    y_together_bis = y_together_bis[in_window]

    # ... and for simulation
    in_window_sim = (y_sim_bis >= y_low) & (y_sim_bis <= y_high)
    x_sim_bis = x_sim_bis[in_window_sim]
    y_sim_bis = y_sim_bis[in_window_sim]

    # Quantile bin edges of the data invariant mass
    y_nbins = 25
    y_bins = np.quantile(y_together_bis, np.arange(y_nbins)*1.0/(y_nbins-1))

    print(np.sum(labels_together_bis)/len(labels_together_bis), len(labels_together_bis))

    # Signal region snapped to the closest bin edges
    bins_SR = [np.argmin(np.abs(y_bins - edge)) for edge in (SR_min, SR_max)]
    SR = [y_bins[bins_SR[0]], y_bins[bins_SR[1]]]

    # Mixture labels: 1 inside the SR for data, inverted (0 inside) for simulation
    in_SR = (y_together_bis > SR[0]) & (y_together_bis <= SR[1])
    labels_mixture_together = in_SR.astype(int)
    in_SR_sim = (y_sim_bis > SR[0]) & (y_sim_bis <= SR[1])
    labels_sim_mixture = np.where(in_SR_sim, 0, 1)

    # Data first, simulation afterwards
    x_train = np.concatenate([x_together_bis, x_sim_bis], axis=0)
    y_train = np.concatenate([labels_mixture_together, labels_sim_mixture])

    s_values_first_batch, s_sim_first_batch = SA_CWoLA_optimization(
        lambda_values,
        nseeds=20,
        model=model,
        kf=kf,
        x_train=x_train,
        y_train=y_train,
        x_together_bis=x_together_bis,
        x_sim_bis=x_sim_bis,
        y_together_bis=y_together_bis,
        y_sim_bis=y_sim_bis,
        labels_mixture_together=labels_mixture_together,
        labels_sim_mixture=labels_sim_mixture,
    )

    # One pair of files per S/B value, indexed by its position in `soverbs`
    np.save(results_dir+'s_values_'+str(nsoverb)+'.npy', s_values_first_batch)
    np.save(results_dir+'s_values_sim_'+str(nsoverb)+'.npy', s_sim_first_batch)
