# Read multiple CSV files and concatenate them (skip this part if you already have the combined data)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import numpy as np

import glob

In [2]:
#Normalise data sets
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
def scaleColumns(df, cols_to_scale):
    """Min-max scale the given columns of ``df`` in place and return ``df``.

    Each column is fitted and transformed on its own with the module-level
    ``min_max_scaler``. Because the scaler is refitted for every column, after
    the call it only holds the fit of the last column processed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the listed columns are overwritten.
    cols_to_scale : iterable of column labels
        Columns to rescale.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for col in cols_to_scale:
        scaled = min_max_scaler.fit_transform(df[[col]])
        # Assign positionally as a flat array. Wrapping the result in a new
        # DataFrame (which gets a fresh 0..n-1 index) makes pandas align on
        # index labels, writing NaN/misplaced values whenever ``df`` does not
        # itself carry a default RangeIndex (e.g. after filtering or a split).
        df[col] = np.ravel(scaled)
    return df

In [3]:
# Directory that holds the per-sample CSV files.
path = "/media/joao/TOSHIBA EXT/Background_samples/CSV_data"

# One list of matching CSV paths per sample, selected by file-name prefix.
all_Signal = glob.glob(f"{path}/LQ_*.csv")
all_ttbar = glob.glob(f"{path}/ttbar_*.csv")
all_WW = glob.glob(f"{path}/WW_*.csv")
all_ZZ = glob.glob(f"{path}/ZZ_*.csv")
all_ZW = glob.glob(f"{path}/ZW_*.csv")
all_Zplusjets = glob.glob(f"{path}/Z_plus_jets_*.csv")

In [4]:
# Columns kept from every input CSV, in this order. 'xs_weight' is carried
# along at this stage and removed in a later cell before training.
Vars_to_use = [
    'pt_l1', 'pt_l2',
    'E_l1', 'E_l2',
    'DeltaR_l1l2', 'DeltaPhi_l1l2', 'DeltaTheta_l1l2',
    'cos_l1l2', 'DeltaTheta_l1l2_CM', 'M_l1l2',
    'b2_l1l2', 'b4_l1l2',
    'SpinCorr_Rcosl1', 'SpinCorr_Rcosl2',
    'SpinCorr_Ncosl1', 'SpinCorr_Ncosl2',
    'xs_weight',
]

In [5]:
def _load_sample(csv_paths, columns, label):
    """Read every CSV in ``csv_paths`` and stack them into a single frame.

    Files are read in sorted path order so that the row order of the result
    does not depend on the order in which ``glob`` happened to list them.

    Parameters
    ----------
    csv_paths : list of str
        Paths of the CSV files that make up one sample.
    columns : list of str
        Columns to keep in the stacked frame.
    label : str
        Sample name, used only in the error message.

    Returns
    -------
    (list of pandas.DataFrame, pandas.DataFrame)
        The individual frames as read, and their row-wise concatenation
        restricted to ``columns`` with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``csv_paths`` is empty (the glob pattern matched no file). This is
        the exception type ``pd.concat`` raises for an empty list, but the
        message names the sample that is missing.
    """
    if not csv_paths:
        raise ValueError(
            "No CSV files found for sample '%s' - check the data path and glob pattern" % label
        )
    frames = [pd.read_csv(name, index_col=None, header=0) for name in sorted(csv_paths)]
    stacked = pd.concat(frames, axis=0, ignore_index=True)
    return frames, stacked[columns]


# Each li_* list keeps the raw per-file frames; the second name is the
# concatenated sample restricted to Vars_to_use.
li_Signal, Signal = _load_sample(all_Signal, Vars_to_use, "Signal")
li_ttbar, ttbar = _load_sample(all_ttbar, Vars_to_use, "ttbar")
li_WW, WW = _load_sample(all_WW, Vars_to_use, "WW")
li_ZZ, ZZ = _load_sample(all_ZZ, Vars_to_use, "ZZ")
li_ZW, ZW = _load_sample(all_ZW, Vars_to_use, "ZW")
li_Zplusjets, Z_plus_jets = _load_sample(all_Zplusjets, Vars_to_use, "Z plus jets")



In [6]:
# Rescaling of the background event weights and printed totals.
# For each background, 'xs_weight' is multiplied by the constant named after the
# higher order (N3LO / NNLO / NLO) and divided by the matching *_LO constant.
# The rescaled weights are then summed, multiplied by 1000 and divided by 10;
# the totals are printed with an "(fb)" label.
# Per the author's original note, every process comes as a batch of 10 samples
# of 100K events each, and the sum is divided by the number of samples.
# NOTE(review): that note also said "divide the result by 30", but the code
# divides by 10 - confirm which is intended.
# NOTE(review): the Signal total below is not divided by a sample count - confirm.
# NOTE(review): the WW/ZW/ZZ totals are named xsec_NNLO_* although they are
# built from the *_NLO constants.

#ttbar
ttbar_N3LO = 988.57
ttbar_LO = 93.32
ttbar_new_weight = ttbar['xs_weight'].mul(ttbar_N3LO).div(ttbar_LO)
xsec_N3LO_ttbar = ttbar_new_weight.sum() * 1000 / 10

#Z plus jets
Zjets_NNLO = 6.33*1e4
Zjets_LO = 4128.0
Zjets_new_weight = Z_plus_jets['xs_weight'].mul(Zjets_NNLO).div(Zjets_LO)
xsec_NNLO_Zjets = Zjets_new_weight.sum() * 1000 / 10

#VV + jets: each diboson process is rescaled separately
#WW + jets
WW_NLO = 124.31
WW_LO = 77.22
WW_new_weight = WW['xs_weight'].mul(WW_NLO).div(WW_LO)
xsec_NNLO_WW = WW_new_weight.sum() * 1000 / 10

#WZ + jets
ZW_NLO = 51.82
ZW_LO = 0.9827
ZW_new_weight = ZW['xs_weight'].mul(ZW_NLO).div(ZW_LO)
xsec_NNLO_ZW = ZW_new_weight.sum() * 1000 / 10

#ZZ + jets
ZZ_NLO = 17.72
ZZ_LO = 0.03654
ZZ_new_weight = ZZ['xs_weight'].mul(ZZ_NLO).div(ZZ_LO)
xsec_NNLO_ZZ = ZZ_new_weight.sum() * 1000 / 10

# Totals; the diboson figure adds the three VV processes (ZZ, then ZW, then WW).
signal_total = Signal['xs_weight'].sum()*1000
diboson_total = xsec_NNLO_ZZ + xsec_NNLO_ZW + xsec_NNLO_WW
print("Signal (fb):", signal_total)
print("Diboson (fb):", diboson_total)
print("ttbar (fb):", xsec_N3LO_ttbar)
print("Z plus jets (fb):", xsec_NNLO_Zjets)

Signal (fb): 0.23040527818452006
Diboson (fb): 32191.469264447667
ttbar (fb): 75269.65792903882
Z plus jets (fb): 10819462.000118723


In [7]:
# Overwrite each background's 'xs_weight' column with its rescaled weights.
for frame, new_weight in (
    (ttbar, ttbar_new_weight),
    (Z_plus_jets, Zjets_new_weight),
    (WW, WW_new_weight),
    (ZW, ZW_new_weight),
    (ZZ, ZZ_new_weight),
):
    frame['xs_weight'] = new_weight

In [8]:
# Merge the three diboson samples (WW, ZZ, ZW) row-wise into one frame with a fresh index.
diboson_parts = (WW, ZZ, ZW)
Diboson = pd.concat(diboson_parts, axis=0, ignore_index=True)


In [9]:
# For each frame, print its position in List and whether it holds any missing value.
List = [Diboson, Signal, ttbar, Z_plus_jets]
for i, frame in enumerate(List):
    print(i, frame.isnull().values.any())

0 False
1 False
2 False
3 False


In [10]:
# Write each prepared frame to its own comma-separated file, without the index column.
for frame, csv_file in (
    (Signal, '/media/joao/TOSHIBA EXT/Machine_learning/Data/Signal_1p5TeV.csv'),
    (Diboson, '/media/joao/TOSHIBA EXT/Machine_learning/Data/Diboson.csv'),
    (ttbar, '/media/joao/TOSHIBA EXT/Machine_learning/Data/ttbar.csv'),
    (Z_plus_jets, '/media/joao/TOSHIBA EXT/Machine_learning/Data/Z_plus_jets.csv'),
):
    frame.to_csv(csv_file, sep=',', index=False)

# Read the CSV data that has already been concatenated

In [11]:
# Load the four prepared samples back from their comma-separated files.
Signal, Diboson, ttbar, Z_plus_jets = (
    pd.read_csv(csv_file, sep=',')
    for csv_file in (
        '/media/joao/TOSHIBA EXT/Machine_learning/Data/Signal_1p5TeV.csv',
        '/media/joao/TOSHIBA EXT/Machine_learning/Data/Diboson.csv',
        '/media/joao/TOSHIBA EXT/Machine_learning/Data/ttbar.csv',
        '/media/joao/TOSHIBA EXT/Machine_learning/Data/Z_plus_jets.csv',
    )
)


In [12]:
# Drop the weight column so it is not part of the machine-learning inputs.
# Each name is rebound to a new frame (no inplace=True) and errors='ignore'
# is used so that re-running this cell is a no-op instead of a KeyError.
# Side effect of errors='ignore': a frame that never had 'xs_weight' passes
# through unchanged without any message.
Signal = Signal.drop(columns=['xs_weight'], errors='ignore')
Diboson = Diboson.drop(columns=['xs_weight'], errors='ignore')
ttbar = ttbar.drop(columns=['xs_weight'], errors='ignore')
Z_plus_jets = Z_plus_jets.drop(columns=['xs_weight'], errors='ignore')

In [13]:
# Sanity check on the column labels.
# Every frame's columns are put in sorted order, then the labels are compared
# across the four frames. The cell evaluates to True only when all frames have
# identical column names in identical order; only names are compared, not values.
# A False result means the tabular data of the samples does not line up.
Signal = Signal.reindex(columns=sorted(Signal.columns))
Diboson = Diboson.reindex(columns=sorted(Diboson.columns))
ttbar = ttbar.reindex(columns=sorted(ttbar.columns))
Z_plus_jets = Z_plus_jets.reindex(columns=sorted(Z_plus_jets.columns))

len({tuple(frame.columns) for frame in (Signal, Diboson, ttbar, Z_plus_jets)}) == 1

True

In [14]:
# Tag every row with an integer class id in a new 'signal' column:
# 0 = Signal, 1 = Diboson, 2 = ttbar, 3 = Z_plus_jets.
for class_id, frame in enumerate((Signal, Diboson, ttbar, Z_plus_jets)):
    frame['signal'] = class_id

In [15]:
# Display the column labels of Signal as a NumPy array.
np.array(Signal.columns.tolist())

array(['DeltaPhi_l1l2', 'DeltaR_l1l2', 'DeltaTheta_l1l2',
       'DeltaTheta_l1l2_CM', 'E_l1', 'E_l2', 'M_l1l2', 'SpinCorr_Ncosl1',
       'SpinCorr_Ncosl2', 'SpinCorr_Rcosl1', 'SpinCorr_Rcosl2', 'b2_l1l2',
       'b4_l1l2', 'cos_l1l2', 'pt_l1', 'pt_l2', 'signal'], dtype='<U18')

In [16]:
# Stack the four labelled samples row-wise into one frame with a fresh 0..n-1 index.
labelled_samples = (Signal, Diboson, ttbar, Z_plus_jets)
combined = pd.concat(labelled_samples, ignore_index=True)

In [17]:
# Min-max scale every feature column; the 'signal' label column is not listed
# and is therefore left as is.
# scaleColumns modifies the frame it receives and returns that same frame, so
# `analysis` and `combined` refer to one object after this cell.
# NOTE(review): the scaling is fitted on the full dataset, i.e. before the
# train/test split performed in a later cell - confirm this is intended.
features_to_scale = [
    'DeltaPhi_l1l2', 'DeltaR_l1l2', 'DeltaTheta_l1l2', 'DeltaTheta_l1l2_CM',
    'E_l1', 'E_l2', 'M_l1l2',
    'SpinCorr_Ncosl1', 'SpinCorr_Ncosl2', 'SpinCorr_Rcosl1', 'SpinCorr_Rcosl2',
    'b2_l1l2', 'b4_l1l2', 'cos_l1l2',
    'pt_l1', 'pt_l2',
]
analysis = scaleColumns(combined, features_to_scale)

In [18]:
# Features are every column except the class id; the class id is the target.
X = analysis.drop(['signal'], axis=1)
y = analysis['signal']
# Divide into train and test/validation data: 80% for train, 20% for validation.
# random_state pins the shuffle so a re-run reproduces the same split, and
# stratify=y keeps the class proportions equal in both parts, which matters
# here because the classes are heavily imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=42, stratify=y
)

In [19]:
# Dimensions of the training feature matrix, as (rows, columns).
X_train.shape

(1505133, 16)

In [20]:
# Number of training rows per class; position k in the result is the count for class id k.
np.bincount(y_train)

array([   6118, 1129094,  194402,  175519])

In [21]:
# Resample the training split with the SMOTE algorithm; the test split is not resampled.
# random_state pins the synthetic-sample generation so a re-run gives the same data.
X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [22]:
# Dimensions of the resampled training feature matrix, as (rows, columns).
X_resampled.shape

(4516376, 16)

In [23]:
# Store features and labels as NumPy .npz archives under the keys 'x' and 'y':
# the resampled training data in one file, the untouched test split in the other.
for npz_file, features, labels in (
    ('/media/joao/TOSHIBA EXT/Machine_learning/Data/X_train_1p5TeV.npz', X_resampled, y_resampled),
    ('/media/joao/TOSHIBA EXT/Machine_learning/Data/X_test_1p5TeV.npz', X_test, y_test),
):
    np.savez(npz_file, x=features, y=labels)