In [7]:
import torch
import numpy as np
import evaluate
import nltk

from nlp481 import T5BiLDModel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset, Dataset, load_from_disk
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from functools import partial

In [2]:
model_large = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-large")

In [3]:
model_small = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")

In [5]:
model_large.to("cuda:0")
model_small.to("cuda:0")

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [4]:
model_bild = T5BiLDModel(
    model_large,
    model_small
)



In [5]:
cnn_dataset = load_dataset("cnn_dailymail", "1.0.0")

In [6]:
cnn_dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [59]:
t5_tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")


def preprocess_dataset(
    examples: Dataset,
    tokenizer: AutoTokenizer,
    input_key: str = "article",
    output_key: str = "highlights",
    prefix: str = "summarize: "
) -> Dataset:
    inputs = [prefix + doc for doc in examples[input_key]]
    model_inputs = tokenizer(
        inputs,
        padding="max_length",
        max_length=1024,
        truncation=True
    )

    labels = tokenizer(
        text_target=examples[output_key],
        padding='max_length',
        max_length = 128, 
        truncation = True
    )
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

tokenized_cnn = cnn_dataset.map(
    partial(preprocess_dataset, tokenizer = t5_tokenizer), 
    batched = True
)


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
  0%|          | 0/500000 [05:41<?, ?it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

In [13]:
t5_tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
tokenized_cnn = load_from_disk("tokenized-datasets/cnn-dm")

In [9]:
tokenized_cnn = tokenized_cnn.remove_columns(cnn_dataset["train"].column_names)
tokenized_cnn.save_to_disk("tokenized-datasets/cnn-dm")

ValueError: Column name article not in the dataset. Current columns in the dataset: ['input_ids', 'attention_mask', 'labels']

In [14]:
training_args = Seq2SeqTrainingArguments(
    output_dir='./train-results',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    logging_steps=1,
    save_steps=5,
    eval_steps=1,
    max_steps=500000,
    evaluation_strategy="steps",
    predict_with_generate=True,
    report_to=None,
    metric_for_best_model="rouge_l",
    load_best_model_at_end=True,
)

In [15]:

metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # decode preds and labels
    labels = np.where(labels != -100, labels, t5_tokenizer.pad_token_id)
    decoded_preds = t5_tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = t5_tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return result

trainer = Seq2SeqTrainer(
    model = model_bild,
    args = training_args,
    train_dataset = tokenized_cnn["train"],
    eval_dataset = tokenized_cnn["validation"],
    tokenizer = t5_tokenizer,
    compute_metrics = compute_metrics,
)

AttributeError: 'T5BiLDModel' object has no attribute 'device'