# Example for Higgs Challenge with PyTorch

In [None]:
from pathlib import Path
import urllib

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler
import torch
from torch import nn
import torch.nn.functional as F
from tqdm.auto import tqdm

In [None]:
# local file name of the gzipped CSV dataset, relative to the working directory
path = Path("atlas-higgs-challenge-2014-v2.csv.gz")
# location the file is downloaded from if no local copy can be found
url = "http://opendata.cern.ch/record/328/files/atlas-higgs-challenge-2014-v2.csv.gz"

def prepare_data():
    """
    Make sure the dataset file is available at `path`.

    Order of preference:
    1. `path` already exists -> nothing to do
    2. a copy in the directory of the previous tutorial exists -> symlink to it
    3. otherwise download the file from `url`
    """
    # `import urllib` alone does not guarantee that the `request` submodule is loaded
    import urllib.request

    if path.exists():
        return
    if path.is_symlink():
        # `exists()` is False for a dangling link; remove it, otherwise
        # `symlink_to` below fails with FileExistsError
        path.unlink()
    path_prev_tutorial = Path("../05-overfitting-validation-metrics") / path
    if path_prev_tutorial.exists():
        path.symlink_to(path_prev_tutorial)
    if not path.exists():
        # download to a temporary name first, so that an interrupted download
        # is not mistaken for the complete file on the next run
        path_partial = path.with_name(path.name + ".part")
        urllib.request.urlretrieve(url, path_partial)
        path_partial.replace(path)

# make sure the file is available locally before reading it
prepare_data()

# read the complete dataset into a DataFrame
df = pd.read_csv(path)

In [None]:
# input features: every column whose name starts with "DER" or "PRI"
feature_names = [name for name in df.columns if name.startswith(("DER", "PRI"))]
feature_names

In [None]:
# input features, target label and per-event weight
X = df[feature_names]
y = df['Label']
weight = df['Weight']

# hold out 33% of the events as test set
# features, targets and weights are split consistently; the seed is fixed
(
    X_train,
    X_test,
    y_train,
    y_test,
    weight_train,
    weight_test,
) = train_test_split(
    X.to_numpy(dtype=np.float32),
    (y == "s").to_numpy(dtype=np.float32),  # 1 for label "s", 0 otherwise
    weight.to_numpy(dtype=np.float32),
    test_size=0.33,
    random_state=42
)

# set "missing" values to 0
# (entries equal to -999 are treated as the placeholder for "missing")
X_train[X_train==-999] = 0
X_test[X_test==-999] = 0

# scale
# (the scaler is fitted on the training part only and then applied to both parts)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# to balance weighted sum of signal and background
# index 0: factor for events with y == 0, index 1: factor for events with y == 1
class_weight = np.array([
    len(y_train) / weight_train[y_train==0].sum(),
    len(y_train) / weight_train[y_train==1].sum(),
])

# use this weight in the fit:
# event weight times the factor of the class the event belongs to ...
weight_for_fit = weight_train * class_weight[y_train.astype(int)]
# ... normalized to have average weight = 1
weight_for_fit /= weight_for_fit.mean()

## Model Definition

In [None]:
# number of units in each hidden layer
neurons = 128
# dropout probability used in the hidden layers
dropout = 0.05

# Fully connected network with three hidden layers and a single output.
# The output is a logit (there is no final sigmoid): the training loss works
# on logits and for predictions the sigmoid has to be applied explicitly.
model = nn.Sequential(
    nn.Linear(len(feature_names), neurons),
    nn.ReLU(),
    nn.Linear(neurons, neurons),
    nn.BatchNorm1d(neurons),
    # Element-wise dropout. `nn.Dropout1d` is meant for (N, C, L) / (C, L)
    # input: a 2D (batch, features) tensor is interpreted as unbatched
    # (C, L) and complete rows, i.e. all activations of a sample, are zeroed.
    nn.Dropout(dropout),
    nn.ReLU(),
    nn.Linear(neurons, neurons),
    nn.BatchNorm1d(neurons),
    nn.Dropout(dropout),
    nn.ReLU(),
    nn.Linear(neurons, 1)
)

## Data loading

We will also split the training data once more, into a training and a validation part.

Docs:
- [torch.utils.data](https://pytorch.org/docs/stable/data.html)
- [DataLoader](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader)
- [StackDataset](https://pytorch.org/docs/stable/data.html#torch.utils.data.StackDataset)

In [None]:
from torch.utils.data import DataLoader, Dataset

In [None]:
class StackDataset(Dataset):
    """
    manual implementation of StackDataset in newer torch versions:
    https://pytorch.org/docs/stable/data.html#torch.utils.data.StackDataset

    Every keyword argument is an indexable collection. Item `i` of the
    dataset is a dict that holds item `i` of each collection under the
    keyword it was passed with.

    Raises:
        ValueError: if no collection is passed or if the collections differ
            in length.
    """
    def __init__(self, **kwargs):
        if not kwargs:
            raise ValueError("At least one dataset should be passed")
        # all collections have to be aligned, otherwise items would be
        # combined wrongly or indexing would fail somewhere mid-epoch
        if len({len(v) for v in kwargs.values()}) > 1:
            raise ValueError("Size mismatch between datasets")
        self.dict = kwargs

    def __len__(self):
        # all collections have the same length, so any of them will do
        return len(next(iter(self.dict.values())))

    def __getitem__(self, i):
        return {k: v[i] for k, v in self.dict.items()}

In [None]:
batch_size = 512
# Split the training data once more, into the part used for the fit and a
# validation part. The seed is fixed (as for the train/test split) so that
# the same events end up in the validation part on every run.
splits = train_test_split(X_train, y_train, weight_for_fit, random_state=42)
# train_test_split returns a (train, validation) pair per input array
X_fit, X_val, y_fit, y_val, w_fit, w_val = splits
dl_train = DataLoader(
    StackDataset(X=X_fit, y=y_fit, w=w_fit),
    batch_size=batch_size,
    shuffle=True,  # new order of the events in every epoch
)
dl_val = DataLoader(
    StackDataset(X=X_val, y=y_val, w=w_val),
    batch_size=batch_size,
)

In [None]:
# fetch a single batch to inspect what the training loader yields
batch = next(iter(dl_train))

In [None]:
# shapes of the features, targets and weights of the batch
batch["X"].shape, batch["y"].shape, batch["w"].shape

## Optimization loop

In [None]:
# device the model and the batches are moved to
device = "cpu"

In [None]:
# move the model to the selected device
model.to(device)

In [None]:
# Adam over all model parameters, with the default settings of torch
optimizer = torch.optim.Adam(model.parameters())

In [None]:
# collects one dict with the losses per epoch; kept outside of fit so that
# repeated calls of fit keep appending to the same list
history = []

In [None]:
def fit(dl_train, dl_val, opt, history, epochs=1, device="cpu", patience=5):
    """
    Train the module-level `model` and record the losses of every epoch.

    Args:
        dl_train: iterable of training batches, dicts with the keys "X", "y" and "w"
        dl_val: iterable of validation batches with the same keys
        opt: optimizer; `zero_grad` and `step` are called for every training batch
        history: list to which one dict {"loss": ..., "val_loss": ...} per
            epoch is appended (means over the per-batch losses)
        epochs: maximum number of epochs to run
        device: device the batches are moved to (the model is not moved here)
        patience: stop as soon as the validation loss did not improve for this
            many consecutive epochs within this call; `None` disables early stopping
    """
    # weighted binary cross entropy on the raw model output (logits)
    loss_fn = F.binary_cross_entropy_with_logits

    def train_step(batch):
        opt.zero_grad()
        logits = model(batch["X"]).squeeze(1)
        loss = loss_fn(logits, batch["y"], weight=batch["w"])
        loss.backward()
        opt.step()
        return loss.item()

    def val_step(batch):
        with torch.no_grad():
            logits = model(batch["X"]).squeeze(1)
            loss = loss_fn(logits, batch["y"], weight=batch["w"])
        return loss.item()

    best_val_loss = float("inf")
    epochs_without_improvement = 0

    for epoch in range(epochs):
        # training mode: dropout active, batch norm uses batch statistics
        model.train()
        losses = []
        for batch in tqdm(dl_train, desc=f"Epoch {epoch}"):
            batch = {k: v.to(device) for k, v in batch.items()}
            losses.append(train_step(batch))

        # evaluation mode: no dropout, batch norm uses running statistics
        model.eval()
        val_losses = []
        for batch in dl_val:
            batch = {k: v.to(device) for k, v in batch.items()}
            val_losses.append(val_step(batch))

        history.append({"loss": np.mean(losses), "val_loss": np.mean(val_losses)})
        print(history[-1])

        if patience is None:
            continue
        val_loss = history[-1]["val_loss"]
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                print(f"Stopping early: no improvement of val_loss for {patience} epochs")
                break

In [None]:
# train for up to 5 epochs; the losses per epoch are appended to history
fit(dl_train, dl_val, optimizer, history=history, epochs=5, device=device)

In [None]:
# loss curves: one line per key of the history entries, one point per epoch
pd.DataFrame(history).plot()

## Evaluation

In [None]:
def ams(s, b, b_reg=10.0):
    """
    Approximate median significance, as defined in Higgs Kaggle challenge

        AMS = sqrt(2 * ((s + b + b_reg) * ln(1 + s / (b + b_reg)) - s))

    Args:
        s: signal yield (scalar or numpy array)
        b: background yield (scalar or numpy array)
        b_reg: regularization term added to the background yield to decrease
            the variance of the AMS. The default of 10 is the number used in
            the challenge.

    Returns:
        The AMS, element-wise for array input.
    """
    b_regularized = b + b_reg
    return np.sqrt(2 * ((s + b_regularized) * np.log(1 + s / b_regularized) - s))

# sum of the event weights per label over the complete dataset
sumw = df.groupby("Label")["Weight"].sum()
# totals for the labels "s" and "b"
nsig_tot, nbkg_tot = sumw["s"], sumw["b"]

In [None]:
# loader over the test features only (no targets or weights), in fixed order
dl_test = DataLoader(
    torch.tensor(X_test, dtype=torch.float32),
    batch_size=batch_size
)

In [None]:
def predict(dl, device="cpu"):
    """
    Evaluate the module-level `model` on all batches of `dl`.

    Args:
        dl: iterable of input tensors (features only)
        device: device the batches are moved to before the forward pass

    Returns:
        CPU tensor with the sigmoid of the model output for all batches,
        concatenated along the first dimension.
    """
    # Make sure dropout and batch norm are in inference mode, independent of
    # the mode the model was left in by whatever ran before.
    model.eval()
    out = []
    with torch.no_grad():
        for X in dl:
            logits = model(X.to(device))
            # the model returns logits -> need to apply activation function here
            out.append(logits.sigmoid().cpu())
    return torch.cat(out)

In [None]:
# model predictions (after the sigmoid) for the test set, as numpy array
p_test = predict(dl_test, device=device).numpy()

In [None]:
# area under the ROC curve on the test set, using the event weights
roc_auc_score(y_test, p_test, sample_weight=weight_test)

In [None]:
# weighted false/true positive rates and the corresponding thresholds on the prediction
fpr, tpr, thr = roc_curve(y_test, p_test, sample_weight=weight_test)

In [None]:
# best AMS over all thresholds; the rates are scaled to the summed signal
# and background weights of the complete dataset
ams(tpr * nsig_tot, fpr * nbkg_tot).max()