# Goals
- Train the measurement predictor on the trusted and untrusted training sets.
- Use the predictor's confidence as the tampering score.

In [1]:
from cupbearer import data, detectors, models, scripts, tasks, utils
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


# Train Measurement Predictor

In [2]:
import os
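# Uncomment the first line below to set CUDA_VISIBLE_DEVICES (selects which GPUs
# are visible); uncomment the second line to display the current value.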
# os.environ["CUDA_VISIBLE_DEVICES"] = "7"
# os.environ["CUDA_VISIBLE_DEVICES"]

In [3]:
# Load the "pythia-14m" transformer along with its tokenizer and the two size
# values returned by the loader (named emb_dim and max_len here).
(
    transformer,
    tokenizer,
    emb_dim,
    max_len,
) = models.transformers_hf.load_transformer("pythia-14m")

# Build the tampering predictor around the loaded transformer.
model = models.TamperingPredictionTransformer(model=transformer, embed_dim=emb_dim)

# From here on, use the tokenizer handed back by the model. The cell's stderr
# output reports that special tokens were added to the vocabulary.
tokenizer = model.set_tokenizer(tokenizer)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Build the train and validation "diamonds" tampering datasets. Both use the
# same tokenizer and max length; only the `train` flag differs.
# NOTE(review): dataset_len=2 looks like a debugging-size setting -- confirm
# before using this notebook for a real training run.
train_data, val_data = (
    data.TamperingDataset(
        "diamonds",
        tokenizer=tokenizer,
        max_length=max_len,
        train=is_train,
        dataset_len=2,
    )
    for is_train in (True, False)
)

In [5]:
# Materialise every shuffled batch (batch_size=2) so the batch structure is
# shown as the cell output.
list(DataLoader(train_data, batch_size=2, shuffle=True))

[{'x': {'input_ids': tensor([[    0,     0,     0,  ...,   544, 35991,   187],
           [    0,     0,     0,  ...,   544, 35991,   187]]),
   'attention_mask': tensor([[0, 0, 0,  ..., 1, 1, 1],
           [0, 0, 0,  ..., 1, 1, 1]]),
   'sensor_inds': tensor([[2035, 2038, 2046],
           [2035, 2038, 2046]])},
  'y': tensor([[1., 1., 1., 1.],
          [1., 1., 1., 1.]]),
  'info': {'correct': tensor([1., 1.]), 'clean': tensor([0., 1.])}},
 {'x': {'input_ids': tensor([[    0,     0,     0,  ...,   544, 35991,   187],
           [    0,     0,     0,  ...,   544, 35991,   187]]),
   'attention_mask': tensor([[0, 0, 0,  ..., 1, 1, 1],
           [0, 0, 0,  ..., 1, 1, 1]]),
   'sensor_inds': tensor([[2035, 2038, 2046],
           [2035, 2038, 2046]])},
  'y': tensor([[0., 0., 0., 0.],
          [1., 1., 0., 0.]]),
  'info': {'correct': tensor([0., 0.]), 'clean': tensor([0., 0.])}},
 {'x': {'input_ids': tensor([[    0,     0,     0,  ...,   544, 35991,   187]]),
   'attention_mask': te

In [9]:
# --- Training hyperparameters -------------------------------------------------
num_epochs = 1
lr = 2e-5

# --- Data ---------------------------------------------------------------------
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
total_steps = num_epochs * len(train_loader)

# Directory for this run's logs; kept in a variable so later cells can reuse it.
classifier_path = utils.log_path("logs/tampering/predictor")

# NOTE: the paper uses 64 warmup steps; the original author remarked that this
# "seems hard" (presumably hard to reach with few training steps -- TODO confirm).
# NOTE(review): when total_steps < 64 the warmup is longer than the whole run --
# confirm how CosineWarmUpBuilder handles that case.
scripts.train_classifier(
    path=classifier_path,
    model=model,
    train_loader=train_loader,
    task="multilabel",
    num_labels=4,
    val_loaders=DataLoader(val_data, batch_size=1024, shuffle=False),
    lr=lr,
    lr_scheduler_conf=dict(
        lr_warmup_steps=64,
        total_steps=total_steps,
        lr=lr,
    ),
    lr_scheduler_builder=scripts.lr_scheduler.CosineWarmUpBuilder,
    max_epochs=num_epochs,
)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name           | Type                           | Params
------------------------------------------------------------------
0 | model          | TamperingPredictionTransformer | 7.6 M 
1 | train_accuracy | MultilabelAccuracy             | 0     
2 | val_accuracy   | ModuleList                     | 0     
3 | test_accuracy  | ModuleList                     | 0     
------------------------------------------------------------------
7.6 M     Trainable params
0         Non-trainable params
7.6 M     Total params
30.517    Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s]

/Users/oliverdaniels-koch/miniforge3/envs/cupbearer/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


                                                                           

/Users/oliverdaniels-koch/miniforge3/envs/cupbearer/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
/Users/oliverdaniels-koch/miniforge3/envs/cupbearer/lib/python3.10/site-packages/lightning/pytorch/loops/fit_loop.py:293: The number of training batches (1) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 0: 100%|██████████| 1/1 [00:00<00:00,  1.28it/s, train/loss=6.070]

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 1/1 [00:00<00:00,  1.04it/s, train/loss=6.070]


{'train/loss': tensor(6.0702),
 'train/acc_step': tensor(0.4500),
 'val/loss': tensor(3.3786),
 'val/acc_step': tensor(0.6000),
 'val/acc_epoch': tensor(0.6000),
 'train/acc_epoch': tensor(0.4500)}