# Basic demo: teaching to copy (random characters)

In [1]:
# Add the candidate src directories to the import path
import sys
import os

# The project sources may sit at one of several relative locations depending
# on where this is launched from, so register all candidates (same order).
sys.path.extend(["./src", "../src", "../../src"])

if __name__ == "__main__":
    # Name the log directory after the SLURM job id when one is set in the
    # environment; otherwise fall back to "debug". A plain lookup with a
    # default replaces the former bare `except:`, which would also have
    # swallowed unrelated errors such as KeyboardInterrupt.
    job_id = os.environ.get("SLURM_JOB_ID", "debug")
    logdir = f"logs/{job_id}"
    # exist_ok=True: re-running with the same job id reuses the directory.
    os.makedirs(logdir, exist_ok=True)

## Setup and hyperparameters

In [2]:
# --- Reproducibility ---
seed = 39  # fixed seed so runs are reproducible

# --- Synthetic dataset ---
dataset_size = 2000
# min == max, so the requested sequence length is fixed at 50
dataset_min_len = 50
dataset_max_len = 50

# --- Model size ---
d_model = 256
d_ff = 256
n_layers = 1
n_decoder_layers = 1

# --- Optimisation ---
train_epochs = 50
lr = 1e-4
betas = (0.9, 0.999)

# --- Runtime / evaluation switches ---
device = "cuda"
evaluate_on_test = False

## Data

In [None]:
import preprocessing
import ByT5Dataset
import torch.utils.data

# Generate the synthetic dataset from the hyperparameters defined earlier.
dataset = preprocessing.generate_random_dataset(
    rows=dataset_size,
    min_length=dataset_min_len,
    max_length=dataset_max_len,
    space_frequency=0.15,
    seed=seed,
)

# 80/10/10 train/dev/test split. The test size is taken as the remainder so
# the three lengths always add up to dataset_size; rounding each share
# independently can overshoot (e.g. 15 rows -> 12 + 2 + 2 = 16), which makes
# random_split raise. For dataset_size = 2000 this is still 1600/200/200.
n_train = round(0.8 * dataset_size)
n_dev = round(0.1 * dataset_size)
n_test = dataset_size - n_train - n_dev

# Seeded generator so the split is the same on every run.
generator1 = torch.Generator().manual_seed(seed)
train_ex, dev_ex, test_ex = torch.utils.data.random_split(
    dataset,
    [n_train, n_dev, n_test],
    generator=generator1,
)

# Wrap each split in the project's copy-task dataset.
train = ByT5Dataset.ByT5CopyDataset(train_ex, max_length=dataset_max_len)
dev = ByT5Dataset.ByT5CopyDataset(dev_ex, max_length=dataset_max_len)
test = ByT5Dataset.ByT5CopyDataset(test_ex, max_length=dataset_max_len)

## Model architecture

In [None]:
# We want a T5 architecture, but severely reduced in size
from transformers import T5ForConditionalGeneration, T5Config, ByT5Tokenizer

tokenizer = ByT5Tokenizer()

# Start from the google/byt5-small configuration and shrink it using the
# hyperparameters defined earlier; every other field keeps its loaded value.
config = T5Config.from_pretrained("google/byt5-small")
config.update(
    {
        "num_layers": n_layers,
        "num_decoder_layers": n_decoder_layers,
        "d_model": d_model,
        "d_ff": d_ff,
        "num_heads": 2,
    }
)

# The model is built from the config alone; no pretrained weights are loaded.
model = T5ForConditionalGeneration(config=config)

## Training setup

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import (
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

# Optimizer over all model parameters, using lr/betas from the setup cell.
optim = torch.optim.Adam(model.parameters(), lr=lr, betas=betas)

# Collator that batches examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    # Outputs and logs both live under the per-run log directory.
    output_dir=f"{logdir}/output",
    logging_dir=logdir,
    # Schedule: evaluate once per epoch.
    num_train_epochs=train_epochs,
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # Batching: accumulate gradients to simulate a higher batch size.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4,
    # Checkpointing / publishing.
    save_total_limit=0,
    push_to_hub=False,
)

## Training

In [None]:
# Assemble the trainer from the pieces prepared in the previous cells.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train,
    eval_dataset=dev,
    # Our own optimizer; None in the scheduler slot means no scheduler is
    # supplied from here.
    optimizers=(optim, None),
)

# Train, then store the final model next to the run's logs.
trainer.train()
trainer.save_model(f"{logdir}/model")

## Evaluation

In [None]:
# Unless evaluation on the held-out test split is explicitly enabled,
# rebind `test` to the dev split so the cells below evaluate on dev.
if not evaluate_on_test:
    test = dev

In [None]:
import numpy as np
from utils import print_diffs
# Print expected-vs-generated diffs for up to the first 50 evaluation examples.
# Put the model in eval mode so dropout does not perturb generation.
model.eval()
# Cap at the split size so a split with fewer than 50 examples does not raise
# IndexError.
for index in range(min(50, len(test))):
    example = test[index]
    # generate() expects a (batch, sequence) tensor on the same device as the
    # model. Previously the reshape and device move were applied to the
    # *output* of generate(), leaving the input 1-D and on the CPU.
    input_ids = torch.tensor(example["input_ids"]).view(1, -1).to(model.device)
    # NOTE(review): max_length bounds the generated sequence including any
    # special tokens -- confirm dataset_max_len leaves room for a full copy.
    preds = model.generate(input_ids=input_ids, max_length=dataset_max_len)
    generated = tokenizer.decode(np.array(preds.cpu()[0]))
    expected = tokenizer.decode(np.array(example["labels"]))
    print_diffs(expected, generated)