In [2]:
import torch
import os
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoModelForPreTraining, pipeline
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [3]:
ds = load_dataset("cnn_dailymail", "1.0.0", split='train')
ds = ds.train_test_split(test_size=.05, shuffle=True)
ds

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 272757
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 14356
    })
})

In [4]:
torch.cuda.is_available()

True

In [5]:
tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small", max_seq_len=1024)
#model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small", max_memory = 1024)
#model = T5ForConditionalGeneration.from_pretrained("./summary/last-checkpoint-10240", max_memory = 1024)
model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small", max_memory = 1024)
print("Model weights loaded...\n")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Model weights loaded...


In [6]:
def process_func(examples):
    contents = ['Generate summary: \n' + e for e in examples['article']]
    inputs = tokenizer(contents, max_length=1024, truncation=True)
    labels = tokenizer(text_target=examples['highlights'], max_length=128, truncation=True)
    inputs['labels'] = labels['input_ids']
    return inputs

In [7]:
tokenized_ds = ds.map(process_func, batched=True)


Map:   0%|          | 0/272757 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [None]:
tokenizer.decode(tokenized_ds['train'][0]['input_ids'])

In [None]:
tokenizer.decode(tokenized_ds['train'][0]['labels'])

In [None]:
ds['train'][0]['highlights']

In [None]:
import numpy as np
from rouge import Rouge

rouge = Rouge()

In [None]:
def compute_metric(evalPred):
    preds, labels = evalPred
    decode_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decode_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decode_preds = [" ".join(p) for p in decode_preds]
    decode_labels = [" ".join(p) for p in decode_labels]
    scores = rouge.get_scores(decode_preds, decode_labels, avg=True)
    return {
        "rouge-1": scores['rouge-1']['f'],
        "rouge-2": scores['rouge-2']['f'],
        "rouge-l": scores['rouge-l']['f']
    }

In [None]:
args = Seq2SeqTrainingArguments(
    output_dir="./summary",
    learning_rate=1e-4,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=8,
    warmup_steps=128,
    logging_steps=512,
    logging_dir="./logging",
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=512,
    save_total_limit=5,     # save the last 5 model
    metric_for_best_model="rouge-l",
    predict_with_generate=True,  # must set True
    #load_best_model_at_end=True
)

In [95]:
trainer = Seq2SeqTrainer(
    args=args,
    model=model,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['test'],
    compute_metrics=compute_metric,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    max_seq_length = 1024
)

TypeError: __init__() got an unexpected keyword argument 'max_seq_length'

In [28]:
trainer.train()

Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss,Rouge-1,Rouge-2,Rouge-l
512,1.9641,1.699828,0.75309,0.383804,0.674303


Step,Training Loss,Validation Loss,Rouge-1,Rouge-2,Rouge-l
512,1.9641,1.699828,0.75309,0.383804,0.674303
1024,1.88,1.688986,0.752873,0.382462,0.673876


Step,Training Loss,Validation Loss,Rouge-1,Rouge-2,Rouge-l
512,1.9641,1.699828,0.75309,0.383804,0.674303
1024,1.88,1.688986,0.752873,0.382462,0.673876
1536,1.864,1.678948,0.752369,0.381655,0.673445


Step,Training Loss,Validation Loss,Rouge-1,Rouge-2,Rouge-l
512,1.9641,1.699828,0.75309,0.383804,0.674303
1024,1.88,1.688986,0.752873,0.382462,0.673876
1536,1.864,1.678948,0.752369,0.381655,0.673445
2048,1.857,1.67848,0.753513,0.382921,0.674309


Step,Training Loss,Validation Loss,Rouge-1,Rouge-2,Rouge-l
512,1.9641,1.699828,0.75309,0.383804,0.674303
1024,1.88,1.688986,0.752873,0.382462,0.673876
1536,1.864,1.678948,0.752369,0.381655,0.673445
2048,1.857,1.67848,0.753513,0.382921,0.674309
2560,1.8545,1.675138,0.753197,0.383015,0.674771


KeyboardInterrupt: 

In [81]:
original_model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small", max_memory =1024)
trained_model = T5ForConditionalGeneration.from_pretrained("./summary/checkpoint-2560", max_memory = 1024)
trained_model2 = T5ForConditionalGeneration.from_pretrained("./summary/last-checkpoint-10240", max_memory = 1024)

In [78]:
def comparison(model1, model2, tokenizer, num):
    text = ds['test'][num]['article']
    target = ds['test'][num]['highlights']
    pipe1 = pipeline('text2text-generation', model=model1, tokenizer=tokenizer, device=0)
    pipe2 = pipeline('text2text-generation', model=model2, tokenizer=tokenizer, device=0)
    print(text)
    print(f"-"*50)
    print(target)
    print(f"-"*50)
    pip_res1 = pipe1("Generate summary:\n" + text, max_length = 128)
    t5_summary1 = pip_res1[0]['generated_text']
    print(t5_summary1)
    print(f"-"*50)
    print("Rouge-L between label and generate summary with original model is ", rouge.get_scores(target, t5_summary1)[0]['rouge-l'])
    print(f"-"*50)
    
    pip_res2 = pipe2("Generate summary:\n" + text, max_length = 128)
    t5_summary2 = pip_res2[0]['generated_text']
    print(t5_summary2)
    print(f"-"*50)
    print("Rouge-L between label and generate summary with trained model is ", rouge.get_scores(target, t5_summary2)[0]['rouge-l'])
    print(f"-"*50)

In [85]:
num = 2000
comparison(original_model, trained_model, tokenizer, num)

(CNN) -- Alfred Hitchcock, one of the most famous film directors in the world, has re-emerged into the zeitgeist this year. In August, Hitchcock's "Vertigo" displaced "Citizen Kane" from its perch at the top of the prestigious Sight & Sound list of the greatest films ever made. There also had been renewed interest in "Psycho" on its 50th anniversary two years ago as well as the release of shiny new Blu-ray packages of his films. But not all the attention has been flattering. In September, HBO's film "The Girl" dramatized Tippi Hedren's behind-the-scenes horror stories from the making of "The Birds" (1963). According to the movie, Hitchcock was obsessed with his protege and sexually abusive. And now comes "Hitchcock," an Oscar wannabe scripted by John J. McLauglin ("Black Swan") and directed by Sacha Gervasi ("Anvil"), with Anthony Hopkins as the portly English master of suspense and Scarlett Johansson (very good) as "Psycho" star Janet Leigh. (By the way, Hitch never won an Academy Awa

In [86]:
comparison(original_model, trained_model2, tokenizer, num)

(CNN) -- Alfred Hitchcock, one of the most famous film directors in the world, has re-emerged into the zeitgeist this year. In August, Hitchcock's "Vertigo" displaced "Citizen Kane" from its perch at the top of the prestigious Sight & Sound list of the greatest films ever made. There also had been renewed interest in "Psycho" on its 50th anniversary two years ago as well as the release of shiny new Blu-ray packages of his films. But not all the attention has been flattering. In September, HBO's film "The Girl" dramatized Tippi Hedren's behind-the-scenes horror stories from the making of "The Birds" (1963). According to the movie, Hitchcock was obsessed with his protege and sexually abusive. And now comes "Hitchcock," an Oscar wannabe scripted by John J. McLauglin ("Black Swan") and directed by Sacha Gervasi ("Anvil"), with Anthony Hopkins as the portly English master of suspense and Scarlett Johansson (very good) as "Psycho" star Janet Leigh. (By the way, Hitch never won an Academy Awa

In [1]:
tokenized_ds['train'][2000]['input_ids']

NameError: name 'tokenized_ds' is not defined

In [96]:
trained_model.generate(ds['test'][num]['article'])

AttributeError: 'str' object has no attribute 'shape'

In [94]:
original_model.train()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop