In [1]:
kwargs = {
    "seed": 42,
    "data_dir": "data/",
    "model_file": "outputs/pytorch_model.bin",
    "train_dir": "outputs/",
    "epoch": 6,
    "learning_rate": 1e-5,
    "batch_size": 8,
    "do_train": False,
    "checkpoint": "google/pegasus-cnn_dailymail",
    "max_output_length": 100
}

In [2]:
from google.colab import drive
if kwargs["do_train"]:
  drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
!pip install datasets
!pip install transformers
!pip install route_score
!pip install evaluate
!pip install rouge_score
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.9.0-py3-none-any.whl (462 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 KB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloa

In [4]:
!python -m torch.utils.collect_env

Collecting environment information...
PyTorch version: 1.13.1+cu116
Is debug build: False
CUDA used to build PyTorch: 11.6
ROCM used to build PyTorch: N/A

OS: Ubuntu 20.04.5 LTS (x86_64)
GCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0
Clang version: 10.0.0-4ubuntu1 
CMake version: version 3.22.6
Libc version: glibc-2.31

Python version: 3.8.10 (default, Nov 14 2022, 12:59:47)  [GCC 9.4.0] (64-bit runtime)
Python platform: Linux-5.10.147+-x86_64-with-glibc2.29
Is CUDA available: True
CUDA runtime version: 11.2.152
CUDA_MODULE_LOADING set to: LAZY
GPU models and configuration: GPU 0: Tesla T4
Nvidia driver version: 510.47.03
cuDNN version: Probably one of the following:
/usr/lib/x86_64-linux-gnu/libcudnn.so.8.1.1
/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.1.1
/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.1.1
/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.1.1
/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.1.1
/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.1.1
/usr/

In [2]:
import numpy as np
import torch
from transformers import AutoTokenizer, Seq2SeqTrainingArguments, AutoModelForSeq2SeqLM, AutoConfig, DataCollatorForSeq2Seq, \
    Seq2SeqTrainer
import evaluate
from datasets import load_dataset, DatasetDict, concatenate_datasets
import json
from rouge_score import rouge_scorer
from nltk.tokenize import sent_tokenize
from nltk import download

In [None]:
download('punkt')

In [3]:
train_dataset = load_dataset('json', data_files='dataset.json', field="train", split="train")
eval_dataset = load_dataset('json', data_files='dataset.json', field="validation", split="train")
test_dataset = load_dataset('json', data_files='dataset.json', field="test", split="train")



In [4]:
ds = DatasetDict({"train":train_dataset,"test":test_dataset, "validation":eval_dataset})
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'summary'],
        num_rows: 44
    })
    test: Dataset({
        features: ['id', 'text', 'summary'],
        num_rows: 46
    })
    validation: Dataset({
        features: ['id', 'text', 'summary'],
        num_rows: 44
    })
})

In [5]:
tokenizer = AutoTokenizer.from_pretrained(
    kwargs["checkpoint"],
    mask_token_sent="[MASK]"
    )


def tokenize__data(data):
    input_feature = tokenizer(data["text"], truncation=True, padding=True, max_length=1024)
    label = tokenizer(data["summary"], truncation=True, padding=True, max_length=kwargs["max_output_length"])
    return {
        "input_ids": input_feature["input_ids"],
        "attention_mask": input_feature["attention_mask"],
        "labels": label["input_ids"],
    }

1

In [6]:
ds_for_train = ds.map(
    tokenize__data,
    remove_columns=["id", "summary", "text"],
    batched=True,
    batch_size=kwargs["batch_size"])
ds_for_train



  0%|          | 0/6 [00:00<?, ?ba/s]



DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 44
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 46
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 44
    })
})

In [7]:
# Check GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [10]:
config = AutoConfig.from_pretrained(
    kwargs["checkpoint"],
    max_length=kwargs["max_output_length"]
)
model = AutoModelForSeq2SeqLM.from_pretrained(kwargs["checkpoint"], config=config)
model.resize_token_embeddings(len(tokenizer.vocab))

Embedding(96104, 1024)

In [11]:
if not kwargs["do_train"]:
    model.load_state_dict(torch.load(kwargs["model_file"], map_location=torch.device(device)))

In [12]:
model.to(device)

PegasusForConditionalGeneration(
  (model): PegasusModel(
    (shared): Embedding(96104, 1024)
    (encoder): PegasusEncoder(
      (embed_tokens): Embedding(96104, 1024)
      (embed_positions): PegasusSinusoidalPositionalEmbedding(1024, 1024)
      (layers): ModuleList(
        (0): PegasusEncoderLayer(
          (self_attn): PegasusAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((1024,), eps=1e-05, ele

In [13]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

training_args = Seq2SeqTrainingArguments(
    output_dir=kwargs["train_dir"],
    seed=kwargs["seed"],
    overwrite_output_dir=True,
    label_names=["labels"],
    learning_rate=kwargs["learning_rate"],
    num_train_epochs=kwargs["epoch"],
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    generation_max_length = kwargs["max_output_length"],
    predict_with_generate=True,
)

rouge_metric = evaluate.load("rouge")


def tokenize_sentence(arg):
    encoded_arg = tokenizer(arg)
    return tokenizer.convert_ids_to_tokens(encoded_arg.input_ids)


def get_pred_label(predictions, labels):
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    text_predicitons = ["\n".join(np.char.strip(p)) for p in sent_tokenize(predictions)]
    text_labels = ["\n".join(np.char.strip(l)) for l in sent_tokenize(labels)]


    print(text_predicitons)
    print(text_labels)
    return text_predicitons, text_labels


def compute_metrics(eval_preds):
    predictions, labels = eval_preds
    predictions, labels = get_pred_label(predictions, labels)
    return rouge_metric.compute(predictions=predictions, references=labels, tokenizer=tokenize_sentence)


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=ds_for_train["train"],
    eval_dataset=ds_for_train["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [16]:
torch.cuda.empty_cache()
if kwargs["do_train"]:
    trainer.train()
    trainer.save_model()

***** Running training *****
  Num examples = 44
  Num Epochs = 6
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 264
  Number of trainable parameters = 568700928
You're using a PegasusTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,7.253953,0.181131,0.01705,0.133413,0.133335
2,No log,7.027465,0.185745,0.021262,0.134085,0.13407
3,No log,6.91448,0.196647,0.024916,0.139178,0.138983
4,No log,6.840398,0.197369,0.021835,0.142055,0.142053
5,No log,6.797301,0.201912,0.026084,0.147214,0.147534
6,No log,6.788785,0.19682,0.025993,0.143686,0.143842


***** Running Evaluation *****
  Num examples = 46
  Batch size = 1
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "forced_eos_token_id": 1,
  "length_penalty": 0.8,
  "max_length": 128,
  "min_length": 32,
  "num_beams": 8,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "forced_eos_token_id": 1,
  "length_penalty": 0.8,
  "max_length": 128,
  "min_length": 32,
  "num_beams": 8,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "forced_eos_token_id": 1,
  "length_penalty": 0.8,
  "max_length": 128,
  "min_length": 32,
  "num_beams": 8,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_t

["The shooting took place at Marjory Stoneman Douglas High School in Parkland, Florida, on Wednesday morning.<n>Fifteen students and three adults were killed in the shooting, which took place on the school's football field.", 'Police officers from across the country took to the streets in protest of gun laws in the U.S.<n>Officers from across the country took to the streets in protest of gun laws in the U.S.<n>Police officers from across the country took to the streets in protest of gun laws in the U.S.<n>Officers from across the country took to the streets in protest of gun laws in the U.S.<n>Police officers from across the country took to the streets in protest of gun laws in the U.S.<n>Officers from across the country took to the streets in protest of gun laws in the U.', "China is the world's largest producer of pandas, accounting for more than half of the world's pandas.<n>China has become the world's largest producer of pandas.<n>China has become the world's largest producer of p

Saving model checkpoint to outputs/checkpoint-44
Configuration saved in outputs/checkpoint-44/config.json
Configuration saved in outputs/checkpoint-44/generation_config.json
Model weights saved in outputs/checkpoint-44/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-44/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-44/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 46
  Batch size = 1
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "forced_eos_token_id": 1,
  "length_penalty": 0.8,
  "max_length": 128,
  "min_length": 32,
  "num_beams": 8,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "forced_eos_token_id": 1,
  "length_penalty": 0.8,
  "max_length": 128,
  "min_length": 32,
  "num_beams": 8,
  "pad_token_id": 0,
  "transformers_version":

['At least 12 people were killed in a shooting at a high school in Virginia on Wednesday morning.<n>The shooting took place at Columbine High School, which is located in Littleton, Colorado.', 'Police have been accused of using excessive force during protests over the death of Michael Brown in Ferguson, Missouri.<n>Police have been accused of using excessive force during protests over the death of Michael Brown in Ferguson, Missouri.<n>Police have been accused of using excessive force during protests over the death of Michael Brown in Ferguson, Missouri.<n>Police have been accused of using excessive force during protests over the death of Michael Brown in Ferguson, Missouri.<n>Police have been accused of using excessive force during protests over the death of Michael Brown in Ferguson, Missouri.<n>Police have been accused of using excessive force during protests over the death of', 'China is home to about 4,000 giant pandas, more than any other country in the world.<n>The number of gia

Saving model checkpoint to outputs/checkpoint-88
Configuration saved in outputs/checkpoint-88/config.json
Configuration saved in outputs/checkpoint-88/generation_config.json
Model weights saved in outputs/checkpoint-88/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-88/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-88/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 46
  Batch size = 1
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "forced_eos_token_id": 1,
  "length_penalty": 0.8,
  "max_length": 128,
  "min_length": 32,
  "num_beams": 8,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "forced_eos_token_id": 1,
  "length_penalty": 0.8,
  "max_length": 128,
  "min_length": 32,
  "num_beams": 8,
  "pad_token_id": 0,
  "transformers_version":



Saving model checkpoint to outputs/checkpoint-132
Configuration saved in outputs/checkpoint-132/config.json
Configuration saved in outputs/checkpoint-132/generation_config.json
Model weights saved in outputs/checkpoint-132/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-132/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-132/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 46
  Batch size = 1
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "forced_eos_token_id": 1,
  "length_penalty": 0.8,
  "max_length": 128,
  "min_length": 32,
  "num_beams": 8,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "forced_eos_token_id": 1,
  "length_penalty": 0.8,
  "max_length": 128,
  "min_length": 32,
  "num_beams": 8,
  "pad_token_id": 0,
  "transformers_ver

['Columbine High School in Littleton, Colorado, was the site of a massacre in 1999 that left 13 people dead.<n>The massacre took place on April 20, 1999 and was followed by a series of other mass shootings across the U.S. in recent years.<n>The Columbine High School massacre took place on April 20, 1999 and was followed by a series of other mass shootings across the U.S. in recent years.<n> Columbine High School in Littleton, Colorado, was the site of a massacre in 1999 that left 13 people dead.<n>The massacre took place on April 20, 1999 and was followed by a series of', "The New York Police Department has been under fire for its handling of the Michael Brown case.<n>Brown, a black man, was fatally shot by a white police officer in Brown's patrol car in Brown's hometown of Ferguson, Missouri.<n>The police department has been under fire for its handling of the Michael Brown case.<n>Brown's mother, Donna Brown, said her son's death was a tragic accident.<n>The police department has been

Saving model checkpoint to outputs/checkpoint-176
Configuration saved in outputs/checkpoint-176/config.json
Configuration saved in outputs/checkpoint-176/generation_config.json
Model weights saved in outputs/checkpoint-176/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-176/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-176/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 46
  Batch size = 1
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "forced_eos_token_id": 1,
  "length_penalty": 0.8,
  "max_length": 128,
  "min_length": 32,
  "num_beams": 8,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "forced_eos_token_id": 1,
  "length_penalty": 0.8,
  "max_length": 128,
  "min_length": 32,
  "num_beams": 8,
  "pad_token_id": 0,
  "transformers_ver

['Columbine High School in Littleton, Colorado, was the site of a mass shooting in 1999 that killed 13 students and a teacher.<n>The shooter, who has not been identified, is believed to have been a student at the school at the time of the shooting.', 'The trial of former New York City Mayor Michael Bloomberg is set to begin on Monday in federal court in New York City.<n> Bloomberg is accused of misappropriating millions of dollars in public funds while serving as mayor between 2001 and 2007.<n>The New York City Police Department has been criticized for its handling of the case.', 'China is home to more pandas than any other country in the world.<n>The number of giant pandas in captivity in China has more than doubled in the past 30 years.<n>China is home to more than 2,000 endangered pandas.<n>The number of giant pandas in captivity in China has more than doubled in the past 30 years.<n>China is home to more than 2,000 endangered pandas.<n>The number of giant pandas in captivity in Chi

Saving model checkpoint to outputs/checkpoint-220
Configuration saved in outputs/checkpoint-220/config.json
Configuration saved in outputs/checkpoint-220/generation_config.json
Model weights saved in outputs/checkpoint-220/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-220/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-220/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 46
  Batch size = 1
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "forced_eos_token_id": 1,
  "length_penalty": 0.8,
  "max_length": 128,
  "min_length": 32,
  "num_beams": 8,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "forced_eos_token_id": 1,
  "length_penalty": 0.8,
  "max_length": 128,
  "min_length": 32,
  "num_beams": 8,
  "pad_token_id": 0,
  "transformers_ver

['Columbine High School in Littleton, Colorado, was the site of a mass shooting in 1999 that killed 13 students and three adults.<n>The shooter, who has not been identified, is believed to have been a student at the school at the time of the shooting.', 'The trial of former New York City Mayor Michael Bloomberg is set to begin on Monday in federal court in New York City.<n> Bloomberg is accused of misappropriating millions of dollars in public funds while serving as mayor between 2001 and 2007.<n>The New York City Police Department has been criticized for its handling of the case.', 'China is home to more pandas than any other country in the world.<n>The number of giant pandas in China has more than doubled in the last 30 years.<n>China is home to more than 1,000 giant pandas.<n>The number of giant pandas in China has more than doubled in the last 30 years.<n>China is home to more than 1,000 giant pandas.<n>The number of giant pandas in China has more than doubled in the last 30 years.

Saving model checkpoint to outputs/checkpoint-264
Configuration saved in outputs/checkpoint-264/config.json
Configuration saved in outputs/checkpoint-264/generation_config.json
Model weights saved in outputs/checkpoint-264/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-264/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-264/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from outputs/checkpoint-220 (score: 0.2019119441263447).


In [17]:
!cp "/content/outputs/pytorch_model.bin" "/content/gdrive/MyDrive/pytorch_model_sum.bin"

Saving model checkpoint to outputs/
Configuration saved in outputs/config.json
Configuration saved in outputs/generation_config.json
Model weights saved in outputs/pytorch_model.bin
tokenizer config file saved in outputs/tokenizer_config.json
Special tokens file saved in outputs/special_tokens_map.json


In [21]:
final_validation_predictions = trainer.predict(ds_for_train["validation"])

***** Running Prediction *****
  Num examples = 44
  Batch size = 1
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "forced_eos_token_id": 1,
  "length_penalty": 0.8,
  "max_length": 128,
  "min_length": 32,
  "num_beams": 8,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}



Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "forced_eos_token_id": 1,
  "length_penalty": 0.8,
  "max_length": 128,
  "min_length": 32,
  "num_beams": 8,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "forced_eos_token_id": 1,
  "length_penalty": 0.8,
  "max_length": 128,
  "min_length": 32,
  "num_beams": 8,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "forced_eos_token_id": 1,
  "length_penalty": 0.8,
  "max_length": 128,
  "min_length": 32,
  "num_beams": 8,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "forced_eos_token_id": 1,
  "length_penalty": 0.8,
  



In [23]:
from torch.utils.data import DataLoader
from pathlib import Path

torch.cuda.empty_cache()

Path("outputs/D3").mkdir(parents=True, exist_ok=True)

print(final_validation_predictions)

validation_predictions, validation_labels, validation_metrics = final_validation_predictions

predictions, labels = get_pred_label(validation_predictions, validation_labels)

ids = eval_dataset["id"]

for i in range(0, len(eval_dataset)):
  print("***** Summary Text (Gold Text) *****")
  print(labels[i])
  print("***** Summary Text (Generated Text) *****")
  print(predictions[i])

  with open("outputs/D3/{}-A.M.100.{}.3".format(ids[i][:-1], ids[i][-1]), "w") as output_file:
    output_file.write(predictions[i])


PredictionOutput(predictions=array([[    0,   139,   475, ...,   279,   109,     1],
       [    0,  1224,   117, ...,  1330,   115,     1],
       [    0,  2474,   121, ...,     0,     0,     0],
       ...,
       [    0,  1144,   148, ...,     0,     0,     0],
       [    0,  1276, 12998, ...,  8258,   115,     1],
       [    0,   139,  8717, ...,  4858,  1575,     1]]), label_ids=array([[  651,  1350,  7361, ..., 94172,   189,     1],
       [ 1469,   750,   148, ...,     0,     0,     0],
       [  222,  1538,  4001, ...,   149,  2699,     1],
       ...,
       [ 2128,  3740, 10253, ...,     0,     0,     0],
       [61345, 17161,   111, ...,  1410,   107,     1],
       [ 1027,   785,  4858, ...,     0,     0,     0]]), metrics={'test_loss': 6.688015460968018, 'test_rouge1': 0.20050148883265168, 'test_rouge2': 0.02245072538660339, 'test_rougeL': 0.14544309105022094, 'test_rougeLsum': 0.14586569733271593, 'test_runtime': 130.3061, 'test_samples_per_second': 0.338, 'test_steps_p

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
