In [2]:
%load_ext autoreload
%load_ext rich
from rich import print
import datasets as ds
import model_utils as mu
import transformers as tr
import torch
from omegaconf import OmegaConf
# NOTE(review): "medium" presumably trades float32 matmul precision for speed;
# confirm against the PyTorch documentation for the hardware in use.
torch.set_float32_matmul_precision("medium")

2024-03-27 14:49:08.236004: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-27 14:49:08.283833: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-27 14:49:08.283863: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-27 14:49:08.283889: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-27 14:49:08.300984: I tensorflow/core/platform/cpu_feature_g

In [2]:
%%writefile config/debug.yaml
# Run-level settings read by the notebook cells that follow.
misc:
    # Sub-directory of ./output/ used as the trainer's output_dir.
    model_name: "debug"
    # Checkpoint name passed to from_pretrained for both model and tokenizer.
    base_model_name: "google/mt5-small"
    # Forwarded to mu.get_tokenize_fn and mu.TaskPrefixTrainer.
    kind: "2way"
    # Forwarded to mu.TaskPrefixTrainer; presumably loss weights (confirm in model_utils).
    alpha1: 0.5
    alpha2: 0.25
    # Path handed to ds.Dataset.from_json.
    dataset: "./data/dataset.jsonl.zst"
    # Passed as test_size to train_test_split.
    dev_size: 0.05
    # When true, the notebook keeps only the first 1000 rows of the dataset.
    subsample: true
    # Seed for the train/test split.
    seed: 0

# Per-device batch sizes and gradient accumulation steps.
batch:
    train_size: 8
    eval_size: 8
    accumulation: 1

# Optimizer and schedule settings forwarded to Seq2SeqTrainingArguments.
train:
    lr: 5e-5
    weight_decay: 0.1
    max_grad_norm: 1.0
    max_steps: 500
    lr_scheduler: "cosine"
    warmup_ratio: 0.06

Overwriting config/debug.yaml


In [4]:
# Read the YAML file back; later cells access it by attribute
# (config.misc.*, config.batch.*, config.train.*).
config = OmegaConf.load("config/debug.yaml")

In [4]:
# Training arguments: tunable values come from `config`; step cadences and
# precision flags are fixed in this cell.
arguments = tr.Seq2SeqTrainingArguments(
    # -- output location --
    output_dir=f"./output/{config.misc.model_name}",
    overwrite_output_dir=True,
    # -- what to run --
    do_train=True,
    do_eval=True,
    # Presumably needed so the expl_* columns read by the custom collator
    # are not dropped before collation (confirm).
    remove_unused_columns=False,
    # -- batching --
    per_device_train_batch_size=config.batch.train_size,
    per_device_eval_batch_size=config.batch.eval_size,
    gradient_accumulation_steps=config.batch.accumulation,
    group_by_length=True,
    length_column_name='length',
    # -- optimisation and schedule --
    learning_rate=config.train.lr,
    weight_decay=config.train.weight_decay,
    max_grad_norm=config.train.max_grad_norm,
    max_steps=config.train.max_steps,
    lr_scheduler_type=config.train.lr_scheduler,
    warmup_ratio=config.train.warmup_ratio,
    # -- evaluation, logging and checkpointing all share a 100-step cadence --
    evaluation_strategy='steps',
    eval_steps=100,
    logging_strategy='steps',
    logging_steps=100,
    save_strategy='steps',
    save_steps=100,
    save_safetensors=True,
    # -- generation-based evaluation --
    predict_with_generate=True,
    prediction_loss_only=False,
    # -- mixed precision --
    # NOTE(review): fp16_opt_level looks like an Apex-only option; confirm it
    # has any effect with the AMP backend actually in use.
    fp16=True,
    fp16_opt_level='O2',
)

In [5]:
# Load the dataset from the path named in the config (misc.dataset).
dataset = ds.Dataset.from_json(config.misc.dataset)

In [6]:
# Debug runs keep only the leading rows so the whole pipeline finishes quickly.
# The cap is clamped to the dataset size: select() rejects out-of-range
# indices, so a fixed range(1000) fails on datasets with fewer rows.
if config.misc.subsample:
    dataset = dataset.select(range(min(1000, len(dataset))))

In [7]:
# Seeded train/test split; the 'test' part is later used as the evaluation set.
# NOTE(review): `dataset` is rebound from the loaded Dataset to the split
# result, so re-running this cell on its own will likely fail; re-run the
# load cell first (confirm).
dataset = dataset.train_test_split(test_size=config.misc.dev_size, seed=config.misc.seed)

In [8]:
# Samples consumed per optimizer step. `train_batch_size` already scales the
# per-device size by the number of visible GPUs and `world_size` accounts for
# distributed processes; using `per_device_train_batch_size` alone
# under-reports the epoch count on multi-GPU hosts.
effective_batch_size = (
    arguments.train_batch_size
    * arguments.gradient_accumulation_steps
    * arguments.world_size
)
n_epochs = (effective_batch_size * arguments.max_steps) / len(dataset['train'])
print(f"Effective # of epochs: {n_epochs:.4f}")

In [5]:
# Load the pretrained seq2seq model and its tokenizer from the checkpoint
# named in the config (misc.base_model_name).
model = tr.AutoModelForSeq2SeqLM.from_pretrained(config.misc.base_model_name)
tokenizer = tr.AutoTokenizer.from_pretrained(config.misc.base_model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [10]:
# Build the batched tokenisation function for the configured task kind.
# Presumably it emits input_ids/attention_mask/labels plus their expl_*
# counterparts, which later cells read (confirm in model_utils).
tokenize_fn = mu.get_tokenize_fn(
    tokenizer=tokenizer,
    kind=config.misc.kind
)

In [11]:
# Tokenise both splits in batches of 128 across 8 worker processes; the
# original columns are dropped so only tokenize_fn's outputs remain.
dataset = dataset.map(
    tokenize_fn, 
    batched=True, 
    batch_size=128, 
    num_proc=8,
    remove_columns=dataset['train'].column_names
)

In [12]:
# Apply mu.length_fn to both splits. Presumably this adds the 'length' column
# that group_by_length / length_column_name rely on (confirm in model_utils).
dataset = dataset.map(
    mu.length_fn,
    batched=True,
    batch_size=128,
    num_proc=8
)

In [13]:
class DataCollator2Way(tr.DataCollatorForSeq2Seq):
    """Collate every example twice: once per task view.

    Each incoming item must provide ``input_ids``, ``attention_mask`` and
    ``labels`` as well as the same three fields prefixed with ``expl_``.
    Both views are padded independently by the parent seq2seq collator.
    """

    def _collate_task(self, inputs, prefix, return_tensors):
        """Pad the three fields stored under ``prefix`` using the parent collator."""
        # Fresh dicts are built so the parent only ever sees the three
        # standard keys of the requested view.
        return super().__call__(
            [
                {
                    'input_ids': item[f'{prefix}input_ids'],
                    'attention_mask': item[f'{prefix}attention_mask'],
                    'labels': item[f'{prefix}labels'],
                }
                for item in inputs
            ],
            return_tensors=return_tensors,
        )

    def __call__(self, inputs, return_tensors=None):
        """Return the two collated batches.

        Args:
            inputs: List of per-example dicts holding both task views.
            return_tensors: Optional tensor format forwarded to the parent
                collator. ``None`` keeps the collator's configured default,
                which matches the previous behavior of this override.

        Returns:
            A dict with the collated batch of the unprefixed fields under
            ``'pred'`` and that of the ``expl_``-prefixed fields under ``'expl'``.
        """
        return {
            'pred': self._collate_task(inputs, '', return_tensors),
            'expl': self._collate_task(inputs, 'expl_', return_tensors),
        }

In [14]:
# Dynamic padding with padded lengths rounded up to a multiple of 8.
# NOTE(review): max_length is presumably only honored with
# padding='max_length'; confirm it has any effect alongside padding=True.
data_collator = DataCollator2Way(tokenizer, padding=True, pad_to_multiple_of=8, max_length=1024)

In [15]:
import numpy as np
def compute_metrics_text(tokenizer):
    """Build a ``compute_metrics`` callback that scores exact-match accuracy on decoded text.

    Args:
        tokenizer: Object exposing ``batch_decode`` and ``pad_token_id``.

    Returns:
        A function taking ``(predictions, labels)`` and returning
        ``{'accuracy': ...}``. Only element 0 of ``predictions`` and of
        ``labels`` is scored (presumably the prediction-task outputs;
        confirm against the trainer).
    """
    def _decode(token_ids):
        # -100 marks ignored/padded positions and is not a valid vocabulary
        # id. Generated ids can carry it too once batches of different
        # lengths are gathered, so predictions get the same treatment as
        # labels before decoding.
        token_ids = np.asarray(token_ids)
        token_ids = np.where(token_ids != -100, token_ids, tokenizer.pad_token_id)
        return tokenizer.batch_decode(token_ids, skip_special_tokens=True)

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        decoded_preds = _decode(predictions[0])
        decoded_labels = _decode(labels[0])

        # Exact string match per example, averaged over the evaluation set.
        acc = np.mean(np.array(decoded_preds) == np.array(decoded_labels))

        return {'accuracy': acc}

    return compute_metrics

In [16]:
# Custom trainer from model_utils: on top of the usual Trainer arguments it
# receives the two alpha values and the task kind from the config. The 'test'
# split serves as the evaluation set.
trainer = mu.TaskPrefixTrainer(
    alpha1=config.misc.alpha1,
    alpha2=config.misc.alpha2,
    kind=config.misc.kind,
    data_collator=data_collator,
    model=model,
    args=arguments,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_text(tokenizer)
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [17]:
# Run training for the configured max_steps; evaluation, logging and
# checkpointing each happen every 100 steps per the training arguments.
trainer.train()

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Accuracy
100,19.9652,11.556708,0.85
200,10.1062,6.222573,0.9
300,5.3863,3.643426,0.8
400,4.0608,2.937597,0.85
500,3.7867,2.864522,0.85




Checkpoint destination directory ./output/debug/checkpoint-100 already exists and is non-empty. Saving will proceed but saved results may be invalid.




Checkpoint destination directory ./output/debug/checkpoint-200 already exists and is non-empty. Saving will proceed but saved results may be invalid.




Checkpoint destination directory ./output/debug/checkpoint-300 already exists and is non-empty. Saving will proceed but saved results may be invalid.




Checkpoint destination directory ./output/debug/checkpoint-400 already exists and is non-empty. Saving will proceed but saved results may be invalid.




Checkpoint destination directory ./output/debug/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.



TrainOutput(
    global_step=500,
    training_loss=8.661045349121094,
    metrics={
        'train_runtime': 192.516,
        'train_samples_per_second': 41.555,
        'train_steps_per_second': 2.597,
        'total_flos': 0.0,
        'train_loss': 8.661045349121094,
        'epoch': 8.33
    }
)

In [48]:
def do_test(idx: int = 0, n: int = 2, **generate_kwargs):
    """Print input, gold label and generated prediction for a slice of the test split.

    Reads the notebook-level ``dataset``, ``tokenizer`` and ``model``.

    Args:
        idx: Index of the first test example to show.
        n: Number of consecutive examples to show.
        **generate_kwargs: Extra keyword arguments forwarded to
            ``model.generate``. None are passed by default, which keeps the
            model's default generation settings.

    Returns:
        None. One line per example is printed; nothing is printed for an
        empty slice.
    """
    batch = dataset['test'][idx:idx + n]
    if not batch['input_ids']:
        # Nothing to pad or generate for an empty slice.
        return

    lines = [
        f"{i} => {tokenizer.decode(ids)} => [{tokenizer.decode(batch['labels'][i])}]"
        for i, ids in enumerate(batch['input_ids'])
    ]

    # Keep only the fields generate() consumes. Whitelisting avoids a KeyError
    # when an auxiliary column (labels, length, expl_*) is absent, which
    # popping a fixed list of names would raise.
    model_inputs = {
        key: batch[key] for key in ('input_ids', 'attention_mask') if key in batch
    }
    model_inputs = tokenizer.pad(model_inputs, return_attention_mask=True, return_tensors='pt')

    with torch.inference_mode():
        model_inputs = {
            name: tensor.to(model.device) for name, tensor in model_inputs.items()
        }
        outputs = model.generate(**model_inputs, **generate_kwargs)
        outputs = outputs.cpu().numpy()

    for i, out in enumerate(outputs):
        lines[i] += f" *Predicted*: [{tokenizer.decode(out, skip_special_tokens=True)}]"

    for line in lines:
        print(line)

In [49]:
# Show the first ten test examples next to the model's generations.
do_test(0, 10)