In [1]:
import os
import math

In [12]:
# Work from the current user's cupbearer checkout under /nas/ucb, so that the
# relative paths used later (e.g. "logs/...") resolve against it.
# Raises KeyError if the USER environment variable is not set.
os.chdir(f"/nas/ucb/{os.environ['USER']}/cupbearer")

# Train Measurement Predictor

In [3]:
# TODO: refactor so that the scripts function can be run directly by submitit
def train_classifier(log_path, lr=2e-5, warmup_steps=64, batch_size=16, accumulate_grad_batches=2,
                     weight_decay=2e-2, num_epochs=1, precision="16-mixed", model_name="pythia-14m"):
    """Train a transformer-based multilabel predictor on the "diamonds" tampering dataset.

    Loads the named transformer, wraps it in a ``TamperingPredictionTransformer``,
    builds train/validation loaders and delegates the training run to
    ``cupbearer.scripts.train_classifier`` with AdamW and a cosine schedule with warmup.

    Every non-stdlib import lives inside the function body; presumably this keeps
    the function self-contained when it is shipped to a remote job (TODO confirm).

    Args:
        log_path: Directory handed to ``scripts.train_classifier`` as ``path``.
        lr: Learning rate passed to the training script.
        warmup_steps: Number of warmup steps of the cosine schedule.
        batch_size: Per-step batch size of the training DataLoader.
        accumulate_grad_batches: Number of batches accumulated per optimizer step.
        weight_decay: AdamW weight decay.
        num_epochs: Number of training epochs.
        precision: Precision setting forwarded to the training script.
        model_name: Name understood by ``models.transformers_hf.load_transformer``.

    Returns:
        Whatever ``scripts.train_classifier`` returns.
    """
    from cupbearer import data, models, scripts
    from torch.utils.data import DataLoader
    import torch.optim as optim
    import transformers
    from lightning.pytorch.callbacks import DeviceStatsMonitor

    transformer, tokenizer, emb_dim, max_len = models.transformers_hf.load_transformer(
        model_name
    )
    model = models.TamperingPredictionTransformer(
        model=transformer,
        embed_dim=emb_dim,
    )
    # set_tokenizer returns the tokenizer that the datasets below must use.
    tokenizer = model.set_tokenizer(tokenizer)

    train_data = data.TamperingDataset("diamonds", tokenizer=tokenizer, max_length=max_len, train=True)
    val_data = data.TamperingDataset("diamonds", tokenizer=tokenizer, max_length=max_len, train=False)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=1, shuffle=False)

    # Size the schedule in optimizer steps rather than batches. With gradient
    # accumulation one optimizer step is taken per `accumulate_grad_batches`
    # batches, plus one for a trailing partial group at the end of each epoch,
    # hence the ceiling division. Counting batches instead would stretch the
    # cosine schedule so the learning rate never finishes decaying.
    # NOTE(review): assumes scripts.train_classifier steps the scheduler once
    # per optimizer step -- confirm against its scheduler configuration.
    steps_per_epoch = (len(train_loader) + accumulate_grad_batches - 1) // accumulate_grad_batches
    total_steps = num_epochs * steps_per_epoch

    return scripts.train_classifier( # NOTE: - paper uses 64 warmup steps, but seems hard
        path=log_path,
        model=model,
        train_loader=train_loader,
        task="multilabel",
        num_labels=4,
        val_loaders=val_loader,
        lr=lr,
        optim_builder=optim.AdamW,
        optim_conf={"weight_decay": weight_decay},
        lr_scheduler_conf={
            "num_warmup_steps": warmup_steps,
            "num_training_steps": total_steps
        },
        lr_scheduler_builder=transformers.optimization.get_cosine_schedule_with_warmup,
        max_epochs=num_epochs,
        wandb=False,
        callbacks=[DeviceStatsMonitor()],
        precision=precision,
        accumulate_grad_batches=accumulate_grad_batches
    )

In [4]:
import submitit
from cupbearer import utils

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# test distributed with smaller model
# Creates a fresh experiment directory, derives the training hyperparameters
# and submits train_classifier as a single job through submitit.
exp_dir = os.path.abspath(utils.log_path("logs/tampering/predictor"))
job_dir = os.path.join(exp_dir, "job")  # submitit writes its job files here
os.makedirs(job_dir, exist_ok=True)

# job hypers
# gpus_per_node = 3
gres="gpu:A100-SXM4-80GB:1"
num_nodes = 1
mem_gb=80
time_min=60
qos = "high"

# train hypers
lr_base = 2e-5          # reference learning rate, tuned for batch_size_base
batch_size_base = 32    # batch size the reference learning rate corresponds to
precision="16-mixed"

grad_batch_size = 32    # effective batch size per optimizer step
# Square-root scaling: the learning rate grows with the effective batch size
# (lr / sqrt(batch) stays constant). The ratio must be new / reference;
# the inverse would shrink the learning rate as the batch gets larger.
lr = lr_base * math.sqrt(grad_batch_size / batch_size_base)

accumulate_grad_batches = 8
# The floor division below would silently change the effective batch size otherwise.
assert grad_batch_size % accumulate_grad_batches == 0, (
    "grad_batch_size must be divisible by accumulate_grad_batches"
)
batch_size = grad_batch_size // accumulate_grad_batches # apply gradient accumulation

num_epochs = 5
model_name = "code-gen"

executor = submitit.AutoExecutor(folder=job_dir)
#TODO: add gpu memory required
# NOTE(review): stock submitit expects executor-specific options to carry the
# executor prefix (e.g. slurm_gres, slurm_qos) -- confirm that the unprefixed
# names below are accepted by the installed version.
executor.update_parameters(slurm_mem_gb=mem_gb,gres=gres, 
                           nodes=num_nodes, timeout_min=time_min, job_name="bash", qos=qos)
job = executor.submit(train_classifier, batch_size=batch_size,lr=lr, num_epochs=num_epochs,
                      precision=precision, accumulate_grad_batches=accumulate_grad_batches, 
                      model_name=model_name, log_path=exp_dir)



In [46]:
# Display the absolute experiment directory created for this run.
exp_dir

'/nas/ucb/oliveradk/cupbearer/logs/tampering/predictor/2024-04-14_15-53-35'

In [7]:
# Display the id of the submitted job.
job.job_id

'194393'

In [None]:
# Fetch the return value of train_classifier from the submitted job.
# Per submitit's Job API, result() waits for the job to finish, so this cell
# may run for as long as the job does.
out = job.result()