# Basic demo: teaching to copy (real dataset)

In [1]:
import sys
import os

# Register the candidate locations of the project's src directory (all
# relative to the current working directory) on the import path.
sys.path.extend(
    [
        "./enigma-transformed/src",
        "./src",
        "../src",
        "../../src",
    ]
)

if __name__ == "__main__":
    # Name the log directory after the SLURM job when one is set in the
    # environment; otherwise fall back to "debug". Using .get() with a default
    # avoids a bare `except:` that would also hide unrelated errors.
    job_id = os.environ.get("SLURM_JOB_ID", "debug")
    logdir = f"logs/slurm_{job_id}"
    # exist_ok=True keeps re-runs with the same job id from failing.
    os.makedirs(logdir, exist_ok=True)

In [None]:
# 0. (optional) get data and preprocess it
import os
import utils
from preprocessing import preprocess_file

# Corpus file, looked up relative to the current working directory.
data_path = "news.2012.en.shuffled.deduped"

# Download and preprocess only when the corpus file is not present yet.
if not os.path.exists(data_path):
    utils.download_newscrawl(2012, "en")
    preprocess_file(data_path)
    

In [None]:
import torch

# Experiment configuration used by the cells below.
dataset_size = 2000  # number of examples requested from load_dataset
dataset_min_len = 50  # passed to load_dataset (presumably the minimum example length)
dataset_max_len = 50  # passed to load_dataset and used as max_length of the datasets
seed = 39  # reproducible
evaluate_on_test = False  # when False, the final evaluation runs on the dev split
# Prefer the first GPU, but fall back to the CPU so the evaluation pipeline
# does not fail on machines without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
train_epochs = 10
lr = 1e-3  # intended optimizer learning rate

In [None]:
import ByT5Dataset
import torch.utils.data
from preprocessing import load_dataset

# Load the examples, then split them roughly 80/10/10 into train/dev/test.
dataset = load_dataset(dataset_size, dataset_min_len, dataset_max_len, data_path, seed)

# random_split requires the lengths to sum to len(dataset) exactly. Size the
# splits from the actual dataset length and give the test split the remainder,
# so neither rounding nor a dataset shorter than dataset_size causes a mismatch.
num_examples = len(dataset)
num_train = round(0.8 * num_examples)
num_dev = round(0.1 * num_examples)
num_test = num_examples - num_train - num_dev

# Seeded generator so the same examples land in the same split on every run.
generator1 = torch.Generator().manual_seed(seed)
train_ex, dev_ex, test_ex = torch.utils.data.random_split(
    dataset,
    [num_train, num_dev, num_test],
    generator=generator1,
)

# Wrap each split in the copy-task dataset.
train = ByT5Dataset.ByT5CopyDataset(train_ex, max_length=dataset_max_len)
dev = ByT5Dataset.ByT5CopyDataset(dev_ex, max_length=dataset_max_len)
test = ByT5Dataset.ByT5CopyDataset(test_ex, max_length=dataset_max_len)

In [None]:
# We want a T5 architecture, but severely reduced in size
from transformers import ByT5Tokenizer, AutoModelForSeq2SeqLM

# Load the pretrained "google/byt5-small" checkpoint as the seq2seq model.
model = AutoModelForSeq2SeqLM.from_pretrained("google/byt5-small")
# The ByT5 tokenizer is constructed with its default settings.
tokenizer = ByT5Tokenizer()

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import (
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

# Collator that batches examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Training configuration; checkpoints go under logdir/output and logs under logdir.
training_args = Seq2SeqTrainingArguments(
    output_dir=logdir + "/output",
    evaluation_strategy="epoch",
    num_train_epochs=train_epochs,
    # Use the learning rate from the configuration cell; previously `lr` was
    # defined but never passed, so the library default was silently used.
    learning_rate=lr,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # accumulate gradients to simulate higher batch size
    gradient_accumulation_steps=4,
    save_total_limit=0,
    predict_with_generate=True,
    push_to_hub=False,
    logging_dir=logdir,
    save_steps=10000,
)

In [None]:
# Build the trainer: train on the train split, evaluate on the dev split.
trainer = Seq2SeqTrainer(
    tokenizer=tokenizer,
    data_collator=data_collator,
    eval_dataset=dev,
    train_dataset=train,
    args=training_args,
    model=model,
)

# Run training, then store the resulting model under the log directory.
trainer.train()
trainer.save_model(f"{logdir}/model")

In [None]:
# Keep the test split only when evaluate_on_test is set; otherwise the
# evaluation below runs on the dev split.
if not evaluate_on_test:
    test = dev

In [None]:
from utils import levensthein_distance, print_avg_median_mode_error
from transformers import pipeline, logging
# Restrict transformers logging to errors for the evaluation run.
logging.set_verbosity(logging.ERROR)

# Edit distance between generated and expected text, one entry per example.
error_counts = []
translate = pipeline("translation", model=model, tokenizer=tokenizer, device=device)
# Loop-invariant generation limit, computed once.
generation_max_length = (dataset_max_len + 1) * 2
for index in range(len(test)):
    # Fetch the example once instead of indexing the dataset repeatedly.
    example = test[index]
    source_text = example["input_text"]
    expected = example["output_text"]

    generated = translate(source_text, max_length=generation_max_length)[0]["translation_text"]
    errors = levensthein_distance(generated, expected)
    error_counts.append(errors)

    if errors > 0:
        # Show input, generation and reference only for imperfect copies.
        print(f"Example {index}, error count {errors}")
        print("In :", source_text)
        print("Gen:", generated)
        print("Exp:", expected)
    else:
        print(f"Example {index} OK")
    print("-----------------------")


# Summarise the collected per-example error counts.
print_avg_median_mode_error(error_counts)