# Exploring models that are already trained

In [1]:
# Add the candidate src/ directories to the import path
import sys
import os

# Register each candidate location of the src/ directory, relative to the
# current working directory (same level, one level up, two levels up).
for _src_dir in ("./src", "../src", "../../src"):
    # Skip entries that are already registered so that re-running this cell
    # does not keep growing sys.path with duplicates.
    if _src_dir not in sys.path:
        sys.path.append(_src_dir)

if __name__ == "__main__":
    # Name the log directory after the SLURM_JOB_ID environment variable when
    # it is set; otherwise fall back to "debug". Using .get() instead of a bare
    # `except:` means only the "variable is missing" case triggers the fallback.
    job_id = os.environ.get("SLURM_JOB_ID", "debug")
    logdir = f"logs/slurm_{job_id}"
    # exist_ok=True makes this safe to re-run for the same job id.
    os.makedirs(logdir, exist_ok=True)

## Setup and hyperparameters

In [2]:
# Synthetic dataset: number of rows, plus the length bounds and seed that are
# handed to the dataset generator in the data cell.
seed = 39  # fixed for reproducibility
dataset_size = 2_000
dataset_min_len = 50
dataset_max_len = 50

# Which split the evaluation cell runs on: False substitutes the dev split
# for the test split.
evaluate_on_test = False

# Architecture and training settings. No cell visible in this notebook reads
# these; presumably they mirror the training run's configuration — TODO confirm.
d_model = 256
d_ff = 256
n_layers = 1
n_decoder_layers = 1
train_epochs = 50
lr = 1e-4
betas = (0.9, 0.999)
device = "cuda"

## Data

In [None]:
import preprocessing
import ByT5Dataset
import torch.utils.data

# Generate the synthetic corpus; the same seed is passed to the generator and
# to the split below so both are repeatable.
dataset = preprocessing.generate_random_dataset(
    rows=dataset_size,
    min_length=dataset_min_len,
    max_length=dataset_max_len,
    space_frequency=0.15,
    seed=seed,
)

# 80/10/10 train/dev/test split. The test split takes whatever remains after
# train and dev, so the three sizes always add up to dataset_size. Rounding
# each share independently can miss that total (e.g. 4 + 0 + 0 for 5 rows),
# and random_split raises when the lengths do not sum to len(dataset).
# NOTE(review): assumes generate_random_dataset returns exactly `rows`
# examples — confirm.
n_train = round(0.8 * dataset_size)
n_dev = round(0.1 * dataset_size)
n_test = dataset_size - n_train - n_dev

generator1 = torch.Generator().manual_seed(seed)
train_ex, dev_ex, test_ex = torch.utils.data.random_split(
    dataset,
    [n_train, n_dev, n_test],
    generator=generator1,
)

# Wrap each split in the copy-task dataset class.
train = ByT5Dataset.ByT5CopyDataset(train_ex, max_length=dataset_max_len)
dev = ByT5Dataset.ByT5CopyDataset(dev_ex, max_length=dataset_max_len)
test = ByT5Dataset.ByT5CopyDataset(test_ex, max_length=dataset_max_len)

## Model architecture

In [None]:
from transformers import AutoModel, AutoTokenizer, T5ForConditionalGeneration, AutoModelForSeq2SeqLM

# Directory from which both the model and its tokenizer are loaded.
# Alternative checkpoint kept for reference:
#   "logs/slurm_16581/output/checkpoint-500"
model_path = "logs/slurm_16587/model/"

# The two loads are independent of each other; both read from model_path.
model: AutoModelForSeq2SeqLM
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)


## Evaluation

In [None]:
# Evaluate on the dev split unless the test split was explicitly requested:
# when evaluate_on_test is False, `test` is rebound to the dev split.
if not evaluate_on_test:
    test = dev

In [None]:
import numpy as np
from utils import print_diffs
# Keyword arguments forwarded unchanged to model.generate().
# NOTE(review): max_length may also count the leading token that is stripped
# before decoding below, which would cap outputs short of dataset_max_len —
# confirm against the generation documentation.
generation_config = {
    "max_length": dataset_max_len,  # Set the maximum length of the generated output
    # "num_beams": 4,  # Set the number of beams for beam search
}
for index in range(len(test)):
    # Fetch the example once and reuse it below rather than re-indexing the
    # dataset for every field.
    example = test[index]

    # unsqueeze(0) adds a leading batch dimension of size 1 for generate().
    preds = model.generate(example["input_ids"].unsqueeze(0), **generation_config)
    # Drop the first generated token before decoding
    # (presumably the decoder start token — confirm).
    generated = tokenizer.decode(preds[0][1:])

    print("Input:", example["input_text"])
    print("Generated:", generated)
    expected = example["output_text"]
    print("Expected:", expected)
    print("-----------------------")
    print_diffs(expected, generated)