# Goals
- train predictor on training trusted and untrusted sets 
- use confidence as tampering score

In [5]:
import os
from cupbearer import data, detectors, models, scripts, tasks, utils
from torch.utils.data import DataLoader
import transformers
import torch
import submitit

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
LOCAL_HPARAMS = {
    "model": "pythia-14m",
    "batch_size_on_device": 4,
    "num_epochs": 1,
    "dataset_len": 2,
    "slurm_params": {}
}
REAL_HPARAMS = {
    "model": "code-gen",
    "batch_size_on_device": 4, 
    "num_epochs": 5, 
    "dataset_len": None,
    "slurm_params": {
        "slurm_mem_gb": 80, 
        "gres": "gpu:A100-SXM4-80GB:1",
        "num_nodes": 1, 
        "tiemout_min": 60 * 10,
        "job_name": "bash",
        "qos": "high"
    }
}

HPARAMS = LOCAL_HPARAMS

# Model

In [7]:
transformer, tokenizer, emb_dim, max_len = models.transformers_hf.load_transformer(
    HPARAMS["model"]
)
model = models.TamperingPredictionTransformer(
        model=transformer,
        embed_dim=emb_dim
    )
tokenizer = model.set_tokenizer(tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Data

In [8]:
train_data = data.TamperingDataset("diamonds", tokenizer=tokenizer, max_length=max_len, 
                                   train=True, dataset_len=HPARAMS["dataset_len"])
val_data = data.TamperingDataset("diamonds", tokenizer=tokenizer, max_length=max_len, 
                                 train=False, dataset_len=HPARAMS["dataset_len"])

# Set Experiment Directory

In [9]:
exp_dir = os.path.abspath(utils.log_path("logs/tampering/predictor"))

# Train Measurement Predictor

In [10]:
from lightning.pytorch.callbacks import DeviceStatsMonitor

In [11]:
train_pred_dir = os.path.join(exp_dir, "train_pred")
os.makedirs(train_pred_dir, exist_ok=True)

In [12]:
lr = 2e-5
weight_decay = 2e-2
num_warmup_steps = 64
batch_size_base = 32
precision="16-mixed"

batch_size_on_device = HPARAMS["batch_size_on_device"]
accumulate_grad_batches = batch_size_base // batch_size_on_device
num_epochs = HPARAMS["num_epochs"]
loss_weights = [0.7, 0.3]

In [13]:
train_loader = DataLoader(train_data, batch_size=batch_size_on_device, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size_on_device, shuffle=False)
total_steps = num_epochs * len(train_loader)

In [14]:
loss_func = lambda logits, labels: \
    torch.nn.functional.binary_cross_entropy_with_logits(logits[:, :3], labels[:, :3]) * loss_weights[0] + \
    torch.nn.functional.binary_cross_entropy_with_logits(logits[:, 3], labels[:, 3]) * loss_weights[1]

In [16]:
executor = submitit.AutoExecutor(folder=train_pred_dir)
executor.update_parameters(**HPARAMS["slurm_params"])

In [19]:
job = executor.submit(scripts.train_classifier,
    path=exp_dir,
    model=model,
    train_loader=train_loader,
    task="multilabel",
    num_labels=4,
    val_loaders=val_loader,
    optim_builder=torch.optim.AdamW,
    optim_conf={"lr": lr, "weight_decay": weight_decay},
    lr_scheduler_conf={
        "num_warmup_steps": num_warmup_steps,
        "num_training_steps": total_steps
    },
    lr_scheduler_builder=transformers.optimization.get_cosine_schedule_with_warmup,
    max_epochs=num_epochs,
    wandb=False,
    callbacks=[DeviceStatsMonitor()],
    precision=precision,
    accumulate_grad_batches=accumulate_grad_batches,
    loss_func=loss_func
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [21]:
job.result()

{'train/loss': tensor(16.7149),
 'train/acc_step': tensor(0.2500),
 'val/loss': tensor(11.3901),
 'val/acc_step': tensor(0.5000),
 'val/acc_epoch': tensor(0.5000),
 'train/acc_epoch': tensor(0.2500)}

# Eval Measurement Predictor

In [24]:
eval_pred_dir = os.path.join(exp_dir, "eval_job")

In [22]:
val_data_dirty = [el for el in val_data if not el["info"]["clean"]]

In [25]:
executor = submitit.AutoExecutor(folder=eval_pred_dir)
executor.update_parameters(**HPARAMS["slurm_params"])

In [26]:
eval_pred_job = executor.submit(scripts.eval_classifier(
    data=val_data_dirty,
    model=model, 
    path=exp_dir,
    batch_size=HPARAMS["batch_size_on_device"]
))

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/oliverdaniels-koch/miniforge3/envs/cupbearer/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Testing DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s]



Testing DataLoader 0: 100%|██████████| 1/1 [00:04<00:00,  0.23it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


# Train Ground-Truth Probe

# Eval Ground-Truth Probe