In [80]:
%config InlineBackend.figure_format = "svg"

import matplotlib.pyplot as plt
from glob import glob
from tqdm import tqdm
import pandas as pd
import numpy as np

# Import data

In [3]:
# Directory that holds the input samples
datapath = "data"

# Collect every HDF5 file found directly inside that directory
files = glob(f"{datapath}/*.h5")

# Show which files were picked up
print(files)

['data/wohg_hq1200.h5', 'data/hg3000_hq1000.h5', 'data/fcnc.h5', 'data/wohg_hq1400.h5', 'data/hg3000_hq1400.h5', 'data/hg3000_hq1200.h5', 'data/wohg_hq1000.h5', 'data/bkg.h5']


In [91]:
def df_datasplit(df, filename, result_dict=None, sup_split=0.3, seed=42):
    """
    Apply the selection cuts to one split of a sample and store its features,
    labels and weights in a nested dictionary.

    Open questions (TODO):
        - Do I have to sort dataframe initially? How so?
        - Do I have to recalculate weights for future support and query splits?

    Data-cuts applied according to model transferability paper: https://arxiv.org/pdf/1912.04220.pdf
        - At least 2 final state leptons
        - At least 1 b-tagged jet
        - Must have a large scalar sum of transverse momentum (>500GeV)

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single split. Must contain the cut columns
        ("Electron_Multi", "Muon_Multi", "Jet1_BTag" ... "Jet5_BTag",
        "ScalarHT_HT") and the "gen_split", "gen_label" and "gen_weights"
        columns. The split name is read from the first row of "gen_split".
    filename : str
        Sample name used as the second-level key of the result.
    result_dict : dict, optional
        Dictionary to update in place. A new one is created when omitted.
    sup_split : float, optional
        Currently unused; kept so existing callers keep working.
    seed : int, optional
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    dict
        ``result_dict`` with ``result_dict[split][filename]`` set to a dict
        holding "columns" (list of feature names), "features", "labels" and
        "weights" (numpy arrays), where ``split`` is the lower-cased split name.
        If ``df`` has no rows, ``result_dict`` is returned unchanged.
    """

    # A `{}` default would be created once and shared by every call that
    # omits the argument, leaking results between unrelated calls.
    if result_dict is None:
        result_dict = {}

    # Initial size
    initial_size = len(df)

    # Nothing to cut and no row to read the split name from
    if initial_size == 0:
        print(f"File: {filename} | Empty input, nothing to process")
        return result_dict

    # Read the split name before cutting, so that a split whose rows are all
    # rejected by the cuts can still be labelled and stored (as empty arrays).
    split = df["gen_split"].iloc[0].capitalize()

    # Apply data cuts
    n_leptons = df["Electron_Multi"] + df["Muon_Multi"]
    n_btags = df["Jet1_BTag"] + df["Jet2_BTag"] + df["Jet3_BTag"] + df["Jet4_BTag"] + df["Jet5_BTag"]
    df = df[(n_leptons > 1) & (n_btags > 0) & (df["ScalarHT_HT"] > 500)]

    # Determine cut ratio (percentage of rows removed by the cuts)
    cut_ratio = 100 * (initial_size - len(df)) / initial_size

    # Display cut ratio information (padding keeps the columns aligned)
    spacing = 0 if split == "Train" else (1 if split == "Test" else 2)
    print(f"File: {filename} | Split: {split}" + spacing*" " + f" | Drop ratio: {cut_ratio}%")

    # Gather weights and labels
    w = df["gen_weights"].to_numpy()
    y = df["gen_label"]

    # Replace label strings with integers
    y = y.replace({"signal": 1, "bkg": 0}).to_numpy()

    # Gather features (drop every column whose name contains "gen")
    drop_cols = [col for col in df if "gen" in col]
    features_df = df.drop(columns=drop_cols)
    columns = features_df.columns.to_list()
    X = features_df.to_numpy()

    # Join everything in a dictionary
    split = split.lower()
    result_dict.setdefault(split, {})[filename] = {
        "columns": columns,
        "features": X,
        "labels": y,
        "weights": w
    }

    return result_dict

In [92]:
# Parameters shared by every call to df_datasplit
data_dict = {}
sup_split = 0.3
random_seed = 42

# Fill data_dict with one entry per (split, sample)
for file in tqdm(files, total=len(files), desc="Processing data"):
    # Load the sample; its name is the file's basename up to the first dot
    df = pd.read_hdf(file)
    filename = file.split("/")[-1].split(".")[0]

    # Separate the rows by the split they were assigned to
    split_column = df["gen_split"]
    df_train = df[split_column == "train"]
    df_val = df[split_column == "val"]
    df_test = df[split_column == "test"]

    # Process the splits in the fixed order train -> val -> test
    for split_df in (df_train, df_val, df_test):
        data_dict = df_datasplit(split_df, filename, data_dict, sup_split, random_seed)

Processing data:  12%|█▎        | 1/8 [00:00<00:02,  3.06it/s]

File: wohg_hq1200 | Split: Train | Drop ratio: 3.191828917969997%
File: wohg_hq1200 | Split: Val   | Drop ratio: 3.1449213769655757%
File: wohg_hq1200 | Split: Test  | Drop ratio: 3.1729131175468486%


Processing data:  25%|██▌       | 2/8 [00:00<00:02,  2.96it/s]

File: hg3000_hq1000 | Split: Train | Drop ratio: 2.610543357280178%
File: hg3000_hq1000 | Split: Val   | Drop ratio: 2.3605150214592276%
File: hg3000_hq1000 | Split: Test  | Drop ratio: 2.3593095700132776%


Processing data:  38%|███▊      | 3/8 [00:01<00:02,  2.29it/s]

File: fcnc | Split: Train | Drop ratio: 0.0%
File: fcnc | Split: Val   | Drop ratio: 0.0%
File: fcnc | Split: Test  | Drop ratio: 0.0%


Processing data:  50%|█████     | 4/8 [00:01<00:01,  2.42it/s]

File: wohg_hq1400 | Split: Train | Drop ratio: 2.958243258618728%
File: wohg_hq1400 | Split: Val   | Drop ratio: 2.508591065292096%
File: wohg_hq1400 | Split: Test  | Drop ratio: 2.8042753706470522%


Processing data:  62%|██████▎   | 5/8 [00:02<00:01,  2.54it/s]

File: hg3000_hq1400 | Split: Train | Drop ratio: 2.058660763696735%
File: hg3000_hq1400 | Split: Val   | Drop ratio: 2.178649237472767%
File: hg3000_hq1400 | Split: Test  | Drop ratio: 2.102933038184837%


Processing data:  75%|███████▌  | 6/8 [00:02<00:00,  2.71it/s]

File: hg3000_hq1200 | Split: Train | Drop ratio: 2.4547945205479453%
File: hg3000_hq1200 | Split: Val   | Drop ratio: 2.3273855702094646%
File: hg3000_hq1200 | Split: Test  | Drop ratio: 2.311624805512336%


Processing data:  88%|████████▊ | 7/8 [00:02<00:00,  2.75it/s]

File: wohg_hq1000 | Split: Train | Drop ratio: 3.354134165366615%
File: wohg_hq1000 | Split: Val   | Drop ratio: 3.087629876494805%
File: wohg_hq1000 | Split: Test  | Drop ratio: 3.8438967136150235%
File: bkg | Split: Train | Drop ratio: 0.3194953551864649%
File: bkg | Split: Val   | Drop ratio: 0.33745518401990193%
File: bkg | Split: Test  | Drop ratio: 0.33878341185800254%


Processing data: 100%|██████████| 8/8 [00:19<00:00,  2.38s/it]


In [96]:
# Report how many feature columns each training sample ended up with
for sample, content in data_dict["train"].items():
    n_columns = len(content["columns"])
    print(f"Sample: {sample} | Number of columns: " + str(n_columns))

Sample: wohg_hq1200 | Number of columns: 118
Sample: hg3000_hq1000 | Number of columns: 118
Sample: fcnc | Number of columns: 69
Sample: wohg_hq1400 | Number of columns: 118
Sample: hg3000_hq1400 | Number of columns: 118
Sample: hg3000_hq1200 | Number of columns: 118
Sample: wohg_hq1000 | Number of columns: 118
Sample: bkg | Number of columns: 118


In [None]:
def plot_histogram(data, filename, split, axis):
    X, w = data[split][filename]["features"], data[split][filename]["weights"]
    
    # Do stuff