In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

In [None]:
from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this notebook uses to tell a high-RAM runtime from a standard one.
if ram_gb >= 20:
  print('You are using a high-RAM runtime!')
else:
  print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')

In [None]:
# This run uses PyTorch Lightning to fine-tune the model.
# NOTE(review): versions are not pinned, so the newest releases get installed — confirm they
# still match the Trainer arguments used further down.
!pip install -q pytorch-lightning
!pip install -q transformers

In [None]:
# imports
import transformers
from torch.utils.data import DataLoader, TensorDataset, random_split, RandomSampler, Dataset
import pandas as pd
import numpy as np

import torch.nn.functional as F
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint

import math
import random
import re

# install datasets
!pip install datasets

from datasets import list_datasets, list_metrics, load_dataset, load_metric

from pprint import pprint

from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm import tqdm

import argparse

# Firing up Google Drive
Mount your Google Drive so the notebook can load the training dataset and save the model weights.

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; force_remount=False leaves an existing mount in place.
drive.mount('/content/gdrive', force_remount=False)
# Folder on Drive that the later cells read datasets from and write checkpoints/results to.
root_dir = "/content/gdrive/My Drive/masters_thesis/"
# Alias of root_dir used by the checkpoint / model-saving code.
base_dir = root_dir

# Pytorch Lightning for running the training

In [None]:
class LitModel(pl.LightningModule):
  ''' LightningModule wrapping a seq2seq model and its tokenizer for fine-tuning.

      Args: learning_rate - learning rate handed to the Adam optimizer
            tokenizer - tokenizer; its pad/eos token ids are used for the loss and for generation
            model - the seq2seq model to train (the notebook passes a T5ForConditionalGeneration)
            freeze_encoder - if truthy, freeze the parameters of model.get_encoder()
            freeze_embeds - if truthy, freeze the shared / token / positional embeddings
  '''
  # Instantiate the model
  def __init__(self, learning_rate, tokenizer, model, freeze_encoder, freeze_embeds):
    super().__init__()
    self.tokenizer = tokenizer
    self.model = model
    self.learning_rate = learning_rate
    self.freeze_encoder = freeze_encoder
    # This flag keeps the name `freeze_embeds`; the method doing the work is `_freeze_embeds`.
    # Previously both shared one name, so the flag shadowed the method and calling it failed
    # with "'bool' object is not callable" whenever freeze_embeds was True.
    self.freeze_embeds = freeze_embeds

    if self.freeze_encoder:
      freeze_params(self.model.get_encoder())

    if self.freeze_embeds:
      self._freeze_embeds()

  def _freeze_embeds(self):
    ''' freeze the embedding parameters of the model; adapted from finetune.py '''
    # BART-style models nest shared/encoder/decoder under `.model`; T5 exposes them directly.
    core = getattr(self.model, "model", self.model)
    freeze_params(core.shared)
    for stack in (core.encoder, core.decoder):
      # Not every architecture has both embedding modules (T5 has no embed_positions).
      for attr_name in ("embed_positions", "embed_tokens"):
        embedding = getattr(stack, attr_name, None)
        if embedding is not None:
          freeze_params(embedding)

  # Do a forward pass through the model
  def forward(self, input_ids, **kwargs):
    return self.model(input_ids, **kwargs)

  def configure_optimizers(self):
    ''' Adam over all parameters of this module, using the configured learning rate '''
    optimizer = torch.optim.Adam(self.parameters(), lr = self.learning_rate)
    return optimizer

  def _compute_loss(self, batch):
    ''' Cross-entropy between the model logits and the un-shifted target ids.
        batch is indexed as (source ids, source attention mask, target ids). '''
    src_ids, src_mask = batch[0], batch[1]
    tgt_ids = batch[2]
    # Use the tokenizer owned by this module rather than a notebook-level global.
    pad_token_id = self.tokenizer.pad_token_id
    # Shift the decoder tokens right (but NOT the tgt_ids)
    decoder_input_ids = shift_tokens_right(tgt_ids, pad_token_id)

    # Run the model and get the logits
    outputs = self(src_ids, attention_mask=src_mask, decoder_input_ids=decoder_input_ids, use_cache=False)
    lm_logits = outputs[0]
    # Padding positions are ignored by the loss
    ce_loss_fct = torch.nn.CrossEntropyLoss(ignore_index=pad_token_id)
    # Calculate the loss on the un-shifted tokens
    return ce_loss_fct(lm_logits.view(-1, lm_logits.shape[-1]), tgt_ids.view(-1))

  def training_step(self, batch, batch_idx):
    ''' Returns {'loss': training loss} for one batch '''
    return {'loss': self._compute_loss(batch)}

  def validation_step(self, batch, batch_idx):
    ''' Returns {'loss': validation loss} for one batch '''
    return {'loss': self._compute_loss(batch)}

  # Method that generates text using the T5ForConditionalGeneration's generate() method
  def generate_text(self, text, eval_beams, early_stopping = True, max_len = 512):
    ''' Function to generate text
        Args: text - mapping with "input_ids" and "attention_mask" tensors
              eval_beams - number of beams for beam search
              early_stopping, max_len - forwarded to generate()
        Returns: list of decoded strings, one per generated sequence
    '''
    # Send the inputs to wherever the model lives instead of guessing from CUDA availability,
    # which mismatches when the model is on the CPU of a GPU machine.
    device = next(self.model.parameters()).device
    generated_ids = self.model.generate(
        text["input_ids"].to(device),
        attention_mask=text["attention_mask"].to(device),
        num_beams= eval_beams,
        length_penalty=2.0,
        max_length = max_len,
        min_length= 120,
        no_repeat_ngram_size=3,
        early_stopping = early_stopping,
        # Matches shift_tokens_right, which puts the last non-pad token (usually <eos>) first
        decoder_start_token_id= self.tokenizer.eos_token_id
    )
    return [self.tokenizer.decode(w, skip_special_tokens=True, clean_up_tokenization_spaces=False) for w in generated_ids]

def freeze_params(model):
  ''' Function that takes a model as input (or part of a model) and freezes its parameters
      (sets requires_grad to False on each) for faster training; adapted from finetune.py '''
  for param in model.parameters():
    # The attribute is `requires_grad`; the former `requires_grade` typo only created an
    # unused attribute and left every parameter trainable.
    param.requires_grad = False


In [None]:
# Create a dataloading module as per the PyTorch Lightning Docs
class SummaryDataModule(pl.LightningDataModule):
  ''' Data module that reads a CSV with 'source' and 'target' columns, splits it 80/20 into
      training and validation data, tokenizes both and serves them as DataLoaders.

      Args: tokenizer - tokenizer forwarded to encode_sentences
            data_file - path of the CSV file
            batch_size - batch size of both dataloaders
            num_examples - only the first num_examples rows (after dropping NaNs) are used
  '''
  def __init__(self, tokenizer, data_file, batch_size, num_examples = 7000):
    super().__init__()
    self.tokenizer = tokenizer
    self.data_file = data_file
    self.batch_size = batch_size
    self.num_examples = num_examples

  # Loads the data and splits it into training and validation sets with an 80/20 split
  def prepare_data(self):
    # astype returns a new frame, so its result must be kept; the previous bare
    # `self.data['source'].astype(str)` calls discarded it and changed nothing.
    self.data = (
        pd.read_csv(self.data_file)
        .dropna()
        .iloc[:self.num_examples]
        .astype({'source': str, 'target': str})
    )
    # train and validation only; the fixed random_state keeps the split reproducible
    self.train = self.data.sample(frac=0.8, random_state=200)
    self.validate = self.data.drop(self.train.index)

  # encode the sentences using the tokenizer
  def setup(self, stage=None):
    # Encode only while the split is still a DataFrame, so a repeated setup() call does not
    # try to index the already-encoded dict by 'source'.
    if isinstance(self.train, pd.DataFrame):
      self.train = encode_sentences(self.tokenizer, self.train['source'], self.train['target'])
    if isinstance(self.validate, pd.DataFrame):
      self.validate = encode_sentences(self.tokenizer, self.validate['source'], self.validate['target'])

  # Load the training and validation sets in Pytorch Dataset objects
  def train_dataloader(self):
    dataset = TensorDataset(self.train['input_ids'], self.train['attention_mask'], self.train['labels'])
    # Training batches are drawn in random order
    train_data = DataLoader(dataset, sampler = RandomSampler(dataset), batch_size = self.batch_size)
    return train_data

  def val_dataloader(self):
    dataset = TensorDataset(self.validate['input_ids'], self.validate['attention_mask'], self.validate['labels'])
    val_data = DataLoader(dataset, batch_size = self.batch_size)
    return val_data

  # def test_dataloader(self):
  #   dataset = TensorDataset(self.test['input_ids'], self.test['attention_mask'], self.test['labels']) 
  #   test_data = DataLoader(dataset, batch_size = self.batch_size)                   
  #   return test_data



In [None]:
def shift_tokens_right(input_ids, pad_token_id):
  """ Build decoder inputs from target ids: every token moves one position to the right and
      the last non-pad token of each row (usually <eos>) is wrapped around to position 0.
      The input tensor is left untouched; a new tensor is returned.
      Logic follows modeling_bart.py.
  """
  # Column index of the last non-pad token in each row, shaped (rows, 1) for gather()
  non_pad_per_row = input_ids.ne(pad_token_id).sum(dim=1)
  last_token_pos = (non_pad_per_row - 1).unsqueeze(-1)
  wrapped_first = input_ids.gather(1, last_token_pos)
  # New first column followed by everything except the original last column
  return torch.cat([wrapped_first, input_ids[:, :-1]], dim=1)

def encode_sentences(tokenizer, source_sentences, target_sentences, max_length=512, pad_to_max_length=True, return_tensors="pt"):
  ''' Function that tokenizes source and target sentences one at a time
      Args: tokenizer - the T5 tokenizer (any callable with the same keyword interface)
            source_sentences / target_sentences - iterables of sentences; each item is
              converted with str() before tokenizing
            max_length - truncation length (and padding length when padding is on)
            pad_to_max_length - pad every sentence to max_length; needed for the per-sentence
              tensors to be concatenated unless they already share one length
            return_tensors - forwarded to the tokenizer
      Returns: Dictionary with keys: input_ids, attention_mask, labels
  '''
  # False is the documented "no padding" value; None is not one of the documented options.
  padding = "max_length" if pad_to_max_length else False

  def _encode(sentence):
    # str() guards against non-string cells (e.g. numbers) on the source side as well;
    # previously only targets were converted.
    return tokenizer(
          str(sentence),
          max_length=max_length,
          padding=padding,
          truncation=True,
          return_tensors=return_tensors
      )

  source_encodings = [_encode(sentence) for sentence in source_sentences]
  target_encodings = [_encode(sentence) for sentence in target_sentences]

  # Each encoding is concatenated along dim 0 to form the batch tensors.
  batch = {
      "input_ids": torch.cat([enc['input_ids'] for enc in source_encodings], dim = 0),
      "attention_mask": torch.cat([enc['attention_mask'] for enc in source_encodings], dim = 0),
      # Targets are stored un-shifted; shifting happens in the training/validation step.
      "labels": torch.cat([enc['input_ids'] for enc in target_encodings], dim = 0),
  }

  return batch


def noise_sentence(sentence_, percent_words, replacement_token = "<mask>"):
  '''
  Function that noises a sentence by replacing words with replacement_token
  Args: sentence_ - the sentence to noise (split on single spaces)
        percent_words - the fraction of words to replace; the number is rounded up using math.ceil
                        and capped at the number of replaceable positions
        replacement_token - the token written in place of the chosen words
  Returns a noised sentence
  '''
  # split() already returns a fresh list, so the caller's string is never modified
  words = sentence_.split(' ')

  # Every position except the last one may be noised (a single-word sentence keeps position 0)
  candidate_positions = range(max(1, len(words) - 1))

  # Clamp the sample size: asking random.sample for more items than exist (e.g. with
  # percent_words = 1.0) or for a negative count raises ValueError.
  num_words = math.ceil(len(words) * percent_words)
  num_words = max(0, min(num_words, len(candidate_positions)))

  # random.sample needs a sequence; sampling from a set raises TypeError on Python 3.11+
  words_to_noise = random.sample(candidate_positions, num_words)

  # Swap out words, but not full stops
  for pos in words_to_noise:
    if words[pos] != '.':
      words[pos] = replacement_token

  # Remove redundant spaces
  sentence = re.sub(r' {2,}', ' ', ' '.join(words))

  # Combine any run of consecutive replacement tokens into a single token. The token is
  # escaped so custom tokens are matched literally, and the replacement is given as a
  # function so backslashes in the token are not interpreted.
  token_pattern = re.escape(replacement_token)
  sentence = re.sub(r'{0}(?: {0})+'.format(token_pattern), lambda _match: replacement_token, sentence)
  return sentence
  

In [None]:
# Load the model
# sentencepiece is installed here, right before the T5 tokenizer is created.
!pip install sentencepiece
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Tokenizer for the pretrained "t5-base" checkpoint; later cells use it as a notebook-level name.
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Pretrained "t5-base" weights; this is the model handed to LitModel for fine-tuning.
t5_model = T5ForConditionalGeneration.from_pretrained("t5-base")



In [None]:
# Load the data into the model for training
data_path = root_dir + "setup6_training.csv"
# num_examples is left at the SummaryDataModule default.
summary_data = SummaryDataModule(tokenizer, data_path,
                                 batch_size = 3)

# Load the model from a pre-saved checkpoint; alternatively use the code below to start training from scratch
# NOTE(review): the commented call refers to `bart_model` and `hparams`, neither of which is
# defined in the cells shown here, and LitModel.__init__ takes no `hparams` — update before re-enabling.
# model = LitModel.load_from_checkpoint(base_dir + "checkpoint_files_2/8_ep_140k_simple_0210.ckpt",
#                                       learning_rate = 2e-5, tokenizer = tokenizer, model = bart_model, hparams = hparams)

# Fine-tune t5_model with the encoder frozen and the embeddings left trainable.
custom_model = LitModel(learning_rate = 2e-5, tokenizer = tokenizer, model = t5_model, freeze_encoder = True, freeze_embeds = False)

# Training the model with Pytorch Lightning
The below code utilises Pytorch Lightning's fantastic Trainer module that helps to control the training process. After creating a ModelCheckpoint object, the other options are fed into the Trainer module. I found that my colab crashed when I didn't explicitly set progress_bar_refresh_rate to something and I found that setting it to 500 seemed to work just fine.

In [None]:
# Training is skipped unless this flag is switched to True.
is_training = False
if is_training:
  # Checkpoints are written below base_dir/checkpoint_files/.
  checkpoint = ModelCheckpoint(dirpath=base_dir + 'checkpoint_files/')
  # NOTE(review): `gpus`, `auto_lr_find`, `checkpoint_callback=<callback>` and
  # `progress_bar_refresh_rate` look tied to an older pytorch-lightning release — confirm
  # against the installed version (the install cell does not pin one).
  trainer = pl.Trainer(gpus = 1,
                      max_epochs = 3,
                      min_epochs = 3,
                      auto_lr_find = False,
                      checkpoint_callback = checkpoint,
                      progress_bar_refresh_rate = 500)
  print(summary_data)
  # Fit the instantiated model to the data
  trainer.fit(custom_model, summary_data)
  # Saves the whole LitModel object (not just a state dict) below base_dir/models/.
  torch.save(custom_model, base_dir + "models/t5_pretrained_setup6_training_model_april_23_beam_10.pt")

In [None]:
#import re
import pickle
from datetime import datetime
# import copy
import csv

# Default device for generate_summaries: the GPU when CUDA is available, otherwise the CPU.
DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Manual overrides kept for debugging:
#DEFAULT_DEVICE = "cpu"
#DEFAULT_DEVICE = "cuda"
def create_csv(all_sentences, targets, file_to_write):
  ''' Write a two-column CSV ("source", "target") with a header row.
      Args: all_sentences - values for the "source" column, indexed 0..len(targets)-1
            targets - values for the "target" column; its length decides the number of rows
            file_to_write - path of the CSV file to create (overwritten if it exists)
      Raises: IndexError if all_sentences has fewer items than targets
  '''
  fieldnames = ["source", "target"]
  # Build all rows before opening the file, so a length mismatch cannot leave a truncated file.
  rows = [{"source": all_sentences[t], "target": targets[t]} for t in range(len(targets))]
  # newline='' is what the csv module requires for files it writes (otherwise embedded
  # newlines and line endings can be mangled); utf-8 keeps the result platform-independent.
  with open(file_to_write, 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.DictWriter(csvfile, delimiter=',', fieldnames=fieldnames)
    csvwriter.writeheader()
    csvwriter.writerows(rows)


def chunks(lst, n):
    """Lazily split lst into consecutive slices of length n (the final slice may be shorter)."""
    slice_starts = range(0, len(lst), n)
    yield from (lst[begin : begin + n] for begin in slice_starts)

def generate_summaries(lns, metric, batch_size=1, device=DEFAULT_DEVICE, finetuned_model=None, result_file=None):
    """Summarize every 'source' entry, score the summaries and save them to a CSV file.

    Args:
        lns: object indexable by 'source' and 'target' (the notebook passes a DataFrame);
            both entries must support len() and slicing.
        metric: object providing add_batch(predictions=..., references=...) and compute().
        batch_size: number of articles summarized per generate() call.
        device: device the model and the encoded inputs are moved to.
        finetuned_model: optional model providing .to(device) and
            generate_text(encoded_batch, num_beams) (e.g. a LitModel). When None, the
            pretrained "t5-base" model is used, which was the only reachable path before.
        result_file: path of the CSV to write; defaults to the original file below root_dir.

    Returns:
        Whatever metric.compute() returns.
    """
    tokenizer = T5Tokenizer.from_pretrained('t5-base')
    use_finetuned = finetuned_model is not None
    if use_finetuned:
        model = finetuned_model.to(device)
    else:
        model = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)

    article_batches = list(chunks(lns['source'], batch_size))
    target_batches = list(chunks(lns['target'], batch_size))
    ls_prediction = []
    ls_groundtruth = []

    for article_batch, target_batch in tqdm(zip(article_batches, target_batches),
                                            total=len(article_batches)):
        # list(...) hands the tokenizer a plain list even when the batch is a pandas slice
        dct = tokenizer.batch_encode_plus(list(article_batch),
                                          max_length=1024,
                                          truncation=True,
                                          padding='max_length',
                                          return_tensors="pt")
        if use_finetuned:
            dec = model.generate_text(dct, 10)
        else:
            summaries = model.generate(
                input_ids=dct["input_ids"].to(device),
                attention_mask=dct["attention_mask"].to(device),
                num_beams=10,
                length_penalty=2.0,
                max_length=512,
                min_length=120,
                no_repeat_ngram_size=3,
                early_stopping=True,
                decoder_start_token_id=tokenizer.eos_token_id,
            )
            dec = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summaries]
        # Put each sentence of a summary on its own line
        dec = [d.replace('. ', '.\n') for d in dec]

        ls_prediction.extend(dec)
        ls_groundtruth.extend(target_batch)

    # Both sides go through the same CoreNLP tokenization before scoring
    ls_prediction_tokenized = coreNLP_tokenizer(ls_prediction)
    target_batch_tokenized = coreNLP_tokenizer(ls_groundtruth)

    metric.add_batch(predictions=ls_prediction_tokenized, references=target_batch_tokenized)
    score = metric.compute()

    if result_file is None:
        result_file = root_dir + "t5_pretrained_setup6_testing_custom_model_generated_summaries_april_24_beam_10.csv"
    # Note: the "source" column of this file holds the reference summaries and the "target"
    # column holds the generated ones (the raw, untokenized strings).
    create_csv(ls_groundtruth, ls_prediction, result_file)

    print("ls prediction: ")
    print(ls_prediction)
    print("ls groundtruth: ")
    print(ls_groundtruth)
    return score

In [None]:
# rouge_score is installed here, right before the 'rouge' metric is loaded.
!pip install rouge_score
from datasets import list_metrics
# List the metric names known to the installed `datasets` package.
# NOTE(review): list_metrics / load_metric may be deprecated or missing in newer `datasets`
# releases — confirm against the installed version (the install is not pinned).
metrics_list = list_metrics()
# The value of this expression is not displayed because it is not the last line of the cell.
len(metrics_list)
print (metrics_list)
# Metric object passed to generate_summaries.
rouge_metric = load_metric('rouge')

In [None]:
# Install stanza; note that the prefix "!" is not needed if you are running in a terminal
!pip install stanza

# Import stanza
import stanza

In [None]:
# Download the Stanford CoreNLP package with Stanza's installation command
# This'll take several minutes, depending on the network speed
# The path is relative to the notebook's current working directory.
corenlp_dir = './corenlp'
stanza.install_corenlp(dir=corenlp_dir)

# Set the CORENLP_HOME environment variable to point to the installation location
# (set through os.environ so that processes started later from this kernel inherit it)
import os
os.environ["CORENLP_HOME"] = corenlp_dir

In [None]:
# Examine the CoreNLP installation folder to make sure the installation is successful
!export CORENLP_HOME='./corenlp'
!ls $CORENLP_HOME

In [None]:
# Import client module
from stanza.server import CoreNLPClient

In [None]:
texts = ["Albert Einstein was a German-born theoretical physicist.", "He was going to the school!"]
def coreNLP_tokenizer(inputDocsList):
  ''' Tokenize documents with a CoreNLP server and lowercase the tokens.
      Args: inputDocsList - iterable of document strings
      Returns: list with one string per document; tokens are separated by single spaces and
               the sentences found by CoreNLP are separated by newlines
  '''
  tokenizedDocsList = []
  # Only tokenization and sentence splitting are needed for the token text; the extra
  # pos/lemma/ner/depparse annotators were computed but never read.
  with CoreNLPClient(annotators=['tokenize', 'ssplit'], memory='4G', endpoint='http://localhost:9001', be_quiet=True) as client:
    for d in inputDocsList:
      ann = client.annotate(d)
      # Keep every sentence of the document. Previously only ann.sentence[0] was used, so
      # everything after the first sentence was dropped before scoring, and a document with
      # no sentences raised IndexError (it now yields an empty string).
      tokenized_sentences = [
          ' '.join(token.word.lower() for token in sentence.token)
          for sentence in ann.sentence
      ]
      tokenizedDocsList.append('\n'.join(tokenized_sentences))
  return tokenizedDocsList

print(coreNLP_tokenizer(texts))

In [None]:
import pandas as pd

# Load the test split and prepare it for evaluation in one chained expression:
#   1. drop rows with a missing source or target. DataFrame.dropna returns a new
#      frame, so calling it without using the result leaves the NaN rows in place
#      and the string cast below would turn them into the literal text "nan";
#   2. force both text columns to str;
#   3. keep only the first 1000 remaining rows.
df = (
    pd.read_csv(root_dir + "setup6_testing.csv")
    .dropna(subset=['source', 'target'])
    .astype({'source': str, 'target': str})
    .iloc[:1000, :]
)

# Generate summaries for the selected rows and score them with the ROUGE metric.
score = generate_summaries(df, rouge_metric) #tokenizer, bart_model

In [None]:
# Show the scores returned by generate_summaries in the previous cell.
print(score)
# ---- Scores recorded from earlier runs are kept below for reference ----

setup 4 in t5 (first 1000):
{'rouge1': AggregateScore(low=Score(precision=0.2588368848446092, recall=0.25122987056987783, fmeasure=0.24159818208918032), mid=Score(precision=0.26938102512722606, recall=0.26047505143685223, fmeasure=0.25007490042949343), high=Score(precision=0.28046001420673206, recall=0.2692709678688867, fmeasure=0.25917614865595223)), 'rouge2': AggregateScore(low=Score(precision=0.11255725678461885, recall=0.10476904845262759, fmeasure=0.10241127135610223), mid=Score(precision=0.12159803173368736, recall=0.11288291429396852, fmeasure=0.11012266509929843), high=Score(precision=0.13010995021653968, recall=0.12099060585571902, fmeasure=0.11759502895433997)), 'rougeL': AggregateScore(low=Score(precision=0.23253059700324577, recall=0.22681866514459234, fmeasure=0.21757340423495033), mid=Score(precision=0.24297955804783145, recall=0.23621253829663416, fmeasure=0.22585626245431717), high=Score(precision=0.2543204353100846, recall=0.24520426381322527, fmeasure=0.234950310186456)), 'rougeLsum': AggregateScore(low=Score(precision=0.23292684022002408, recall=0.22692990905414706, fmeasure=0.21789613310281677), mid=Score(precision=0.24349228577121668, recall=0.23603027794796122, fmeasure=0.22615060752782296), high=Score(precision=0.2536384238150224, recall=0.245574382028144, fmeasure=0.23461694249608914))}

Changed the number of training epochs to 6.

setup 4 in t5 (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.24532993130828096, recall=0.23419495869906115, fmeasure=0.22629296089386128), mid=Score(precision=0.2549001763718447, recall=0.24293340467657282, fmeasure=0.23458867826333243), high=Score(precision=0.26570077183329144, recall=0.2516751389195236, fmeasure=0.24316837664654664)), 'rouge2': AggregateScore(low=Score(precision=0.10423858436663155, recall=0.09565089473538853, fmeasure=0.09407981394057427), mid=Score(precision=0.11191813321289143, recall=0.10258799830516005, fmeasure=0.1005735296715049), high=Score(precision=0.12023558417894827, recall=0.10970805845506043, fmeasure=0.10743926538791972)), 'rougeL': AggregateScore(low=Score(precision=0.22197892144095474, recall=0.21261308592646552, fmeasure=0.20484986814970946), mid=Score(precision=0.23160497177345007, recall=0.220859012872395, fmeasure=0.21287705676699625), high=Score(precision=0.24175020241453177, recall=0.22947441018493234, fmeasure=0.2209461440380066)), 'rougeLsum': AggregateScore(low=Score(precision=0.22158425247935534, recall=0.21200687228817555, fmeasure=0.20502817338804402), mid=Score(precision=0.23150246217664958, recall=0.2208251204600968, fmeasure=0.21287791895767444), high=Score(precision=0.2409962145802621, recall=0.22988400549480406, fmeasure=0.22126249686953672))}


setup 6 in t5 (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.2734287542926567, recall=0.2560322198466661, fmeasure=0.25134801112034877), mid=Score(precision=0.2836585496954529, recall=0.26475044966664607, fmeasure=0.2593615284826105), high=Score(precision=0.29395048853554884, recall=0.273451584557087, fmeasure=0.26737474721328086)), 'rouge2': AggregateScore(low=Score(precision=0.11937515196316832, recall=0.10741783714732911, fmeasure=0.10731777287275295), mid=Score(precision=0.12763559115106982, recall=0.11402597242108645, fmeasure=0.11393743257377573), high=Score(precision=0.13606010007446168, recall=0.12157016516965323, fmeasure=0.12102690058365138)), 'rougeL': AggregateScore(low=Score(precision=0.2468508258387302, recall=0.23163107438130756, fmeasure=0.22715665377144975), mid=Score(precision=0.25694137699027964, recall=0.2405508046403888, fmeasure=0.23491926467522134), high=Score(precision=0.2669517594507381, recall=0.2482987064499052, fmeasure=0.24306475182805207)), 'rougeLsum': AggregateScore(low=Score(precision=0.2471958001378389, recall=0.23238549149887847, fmeasure=0.22713919499685092), mid=Score(precision=0.25694767376473054, recall=0.24040916821297198, fmeasure=0.23498766678060393), high=Score(precision=0.26666459105940304, recall=0.24896723054549014, fmeasure=0.24277021987299077))}

Changed the maximum encoded input length to 512, the number of training epochs to 3, and the number of beams to 10.

setup 1 t5 all (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.5255436538642831, recall=0.31372830779483274, fmeasure=0.3741903870515088), mid=Score(precision=0.5381175478122207, recall=0.322924044736426, fmeasure=0.38310615596935815), high=Score(precision=0.5505313341326222, recall=0.33165142893557314, fmeasure=0.39250453240400385)), 'rouge2': AggregateScore(low=Score(precision=0.32402092911145985, recall=0.19225491066135683, fmeasure=0.22999322301978498), mid=Score(precision=0.33668758016172284, recall=0.1996699962939506, fmeasure=0.23838104409281535), high=Score(precision=0.3502850407120836, recall=0.20818032169942782, fmeasure=0.2485942135992869)), 'rougeL': AggregateScore(low=Score(precision=0.47738684407827436, recall=0.2847449466731032, fmeasure=0.3398285211832176), mid=Score(precision=0.4907681752111453, recall=0.2937962918619609, fmeasure=0.3490004228674), high=Score(precision=0.5033991493536203, recall=0.3024189565071685, fmeasure=0.3583983921086392)), 'rougeLsum': AggregateScore(low=Score(precision=0.4788480166813028, recall=0.2847086027338496, fmeasure=0.3397681813523527), mid=Score(precision=0.4906212661613313, recall=0.29327093631127077, fmeasure=0.34880545339702745), high=Score(precision=0.5030227994090598, recall=0.30252853418064973, fmeasure=0.3582996187283443))}


setup 7 t5 all (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.37224088131233374, recall=0.1580935928744946, fmeasure=0.21365619290578733), mid=Score(precision=0.3875441188474467, recall=0.16527337388357893, fmeasure=0.22294215895952463), high=Score(precision=0.4032160889651148, recall=0.1721600229296267, fmeasure=0.23221812561668298)), 'rouge2': AggregateScore(low=Score(precision=0.20553078959477716, recall=0.08491251307415748, fmeasure=0.11641891818497116), mid=Score(precision=0.2199322191856558, recall=0.09143309296161443, fmeasure=0.12507105537820828), high=Score(precision=0.23641600061115342, recall=0.09845427011167458, fmeasure=0.13485539730184978)), 'rougeL': AggregateScore(low=Score(precision=0.30563901257556336, recall=0.1289683578483327, fmeasure=0.17518982612025216), mid=Score(precision=0.32060140999465314, recall=0.1353417214372577, fmeasure=0.18352570811852528), high=Score(precision=0.33666192346098617, recall=0.14291469117687597, fmeasure=0.1934991766136125)), 'rougeLsum': AggregateScore(low=Score(precision=0.3060298257396111, recall=0.1289780963553509, fmeasure=0.17496327088771826), mid=Score(precision=0.321326729799685, recall=0.13562040613921328, fmeasure=0.18394195588391662), high=Score(precision=0.3367126767363716, recall=0.14219119987993198, fmeasure=0.192790284383627))}


setup 8 t5 all (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.6162713632909924, recall=0.1266557737201762, fmeasure=0.20605166018056056), mid=Score(precision=0.6339809118990576, recall=0.13095583154030843, fmeasure=0.21277389794421325), high=Score(precision=0.6493731388658893, recall=0.13494452572805915, fmeasure=0.21856212595048505)), 'rouge2': AggregateScore(low=Score(precision=0.5067363618634424, recall=0.09757335355860502, fmeasure=0.16067631099166677), mid=Score(precision=0.5313657575333209, recall=0.10266901101255266, fmeasure=0.1689853905681598), high=Score(precision=0.5543027252648117, recall=0.10787953061726172, fmeasure=0.17693813972150899)), 'rougeL': AggregateScore(low=Score(precision=0.5855503497000452, recall=0.12062974455020485, fmeasure=0.19619373194419093), mid=Score(precision=0.6041583161719428, recall=0.1251030815223586, fmeasure=0.20307408561649948), high=Score(precision=0.6231587890308702, recall=0.12993460821807917, fmeasure=0.21056610816933352)), 'rougeLsum': AggregateScore(low=Score(precision=0.5863443055978745, recall=0.12060796871360516, fmeasure=0.19664766743783946), mid=Score(precision=0.6047781067306466, recall=0.12536234315443476, fmeasure=0.20356775294913482), high=Score(precision=0.6229572549602016, recall=0.12967396359670952, fmeasure=0.21026593302354818))}


setup 4 t5 all (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.3582225240192015, recall=0.347989443175754, fmeasure=0.33544990336941205), mid=Score(precision=0.3752857390724563, recall=0.3655119596419827, fmeasure=0.35231212769568865), high=Score(precision=0.3918719314706983, recall=0.3822758862147936, fmeasure=0.36887532587875)), 'rouge2': AggregateScore(low=Score(precision=0.21260369953206304, recall=0.2070055755763381, fmeasure=0.20090761964949902), mid=Score(precision=0.23073571658980224, recall=0.22430266903179868, fmeasure=0.21831471299258756), high=Score(precision=0.2482252005195825, recall=0.24252897591685055, fmeasure=0.2361606852944207)), 'rougeL': AggregateScore(low=Score(precision=0.3323185971142232, recall=0.3240178716149586, fmeasure=0.3126665256272832), mid=Score(precision=0.3491036220993431, recall=0.34072992220070963, fmeasure=0.3282461689626657), high=Score(precision=0.3676753645270214, recall=0.35750524964648295, fmeasure=0.3451881159306764)), 'rougeLsum': AggregateScore(low=Score(precision=0.3309744701221845, recall=0.32283049947498005, fmeasure=0.3118686679753952), mid=Score(precision=0.3489514882415937, recall=0.34067494121398906, fmeasure=0.3282679893382323), high=Score(precision=0.3674213554274379, recall=0.3563216250707289, fmeasure=0.34436605566047596))}


setup 6 t5 all (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.39011294210532266, recall=0.3905932170145604, fmeasure=0.3721957905579733), mid=Score(precision=0.4077396391683317, recall=0.4066657422841817, fmeasure=0.3880129714768519), high=Score(precision=0.4269929298713382, recall=0.4246648390411941, fmeasure=0.4057429048499838)), 'rouge2': AggregateScore(low=Score(precision=0.24701839990747487, recall=0.24193062860804307, fmeasure=0.23479748287996438), mid=Score(precision=0.2667013425820912, recall=0.26051425091990693, fmeasure=0.25376123451317867), high=Score(precision=0.28577782466587975, recall=0.2781975122448364, fmeasure=0.27138393665583205)), 'rougeL': AggregateScore(low=Score(precision=0.3640200665043463, recall=0.3634759813666907, fmeasure=0.34668810897366464), mid=Score(precision=0.38248024610129805, recall=0.38056506032879867, fmeasure=0.36422830971145115), high=Score(precision=0.4022069836191019, recall=0.39933484425572113, fmeasure=0.3825026916079375)), 'rougeLsum': AggregateScore(low=Score(precision=0.3650504829316673, recall=0.3645361015485961, fmeasure=0.34847448241503554), mid=Score(precision=0.3827158199117455, recall=0.3817142913721612, fmeasure=0.3651245573212353), high=Score(precision=0.401213920761873, recall=0.3981881946674005, fmeasure=0.38160681457297796))}


setup 1 pretrain (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.03259547786872938, recall=0.0016706522989682635, fmeasure=0.0029989873955616644), mid=Score(precision=0.041912725829529436, recall=0.002135173317364351, fmeasure=0.003816674941054876), high=Score(precision=0.05108167583584909, recall=0.0025878158759678254, fmeasure=0.004553363355053297)), 'rouge2': AggregateScore(low=Score(precision=7.125576036866374e-05, recall=1.3333333333333335e-05, fmeasure=2.5641025641025643e-05), mid=Score(precision=0.0018043394777265743, recall=6.285159500693481e-05, fmeasure=0.00010952260458839406), high=Score(precision=0.004774020737327189, recall=0.0001223517643163274, fmeasure=0.00021905357328692952)), 'rougeL': AggregateScore(low=Score(precision=0.029875902676496285, recall=0.0015133829156594987, fmeasure=0.0027204596805181685), mid=Score(precision=0.03935367047630661, recall=0.0019008395455825744, fmeasure=0.0033935768353228063), high=Score(precision=0.04898549118569132, recall=0.0022851825280042733, fmeasure=0.004084403433700073)), 'rougeLsum': AggregateScore(low=Score(precision=0.030114075896663015, recall=0.0014895098226557846, fmeasure=0.0026845839274942575), mid=Score(precision=0.03884936370146141, recall=0.0018788242205798758, fmeasure=0.0033577439734692825), high=Score(precision=0.048762832769284295, recall=0.0022918990703361818, fmeasure=0.00406835296167083))}


setup 7 pretrain (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.030992870246383673, recall=0.0024665710321339, fmeasure=0.003998513489365846), mid=Score(precision=0.03964802378455144, recall=0.003187447607871147, fmeasure=0.005019866082347868), high=Score(precision=0.0494274982070367, recall=0.003921897800185747, fmeasure=0.006003897165363594)), 'rouge2': AggregateScore(low=Score(precision=0.0004905393217893218, recall=8.262796799910006e-05, fmeasure=0.00013138530684600477), mid=Score(precision=0.0019618559286242216, recall=0.00016287886758631711, fmeasure=0.00026777642248022865), high=Score(precision=0.004625879183540473, recall=0.0002768266685804522, fmeasure=0.00045236160324821926)), 'rougeL': AggregateScore(low=Score(precision=0.02927608742165716, recall=0.002236362588623754, fmeasure=0.0036176224555653893), mid=Score(precision=0.03746676149329313, recall=0.002808029673002293, fmeasure=0.004433425646349449), high=Score(precision=0.047090823039561275, recall=0.003454211466429874, fmeasure=0.005346585818330104)), 'rougeLsum': AggregateScore(low=Score(precision=0.02884691663976738, recall=0.0022007259006459785, fmeasure=0.0035965532696290944), mid=Score(precision=0.03754718029131955, recall=0.002811135867842785, fmeasure=0.004421114709213617), high=Score(precision=0.047460064485485856, recall=0.0034396208807995297, fmeasure=0.005339585102256928))}


setup 8 pretrain (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.03091683423860939, recall=0.0025142563393685713, fmeasure=0.004048965845486613), mid=Score(precision=0.03966581631230347, recall=0.0031814087644110875, fmeasure=0.005004501168629086), high=Score(precision=0.050309799221484014, recall=0.0039037898451492452, fmeasure=0.006030902945748073)), 'rouge2': AggregateScore(low=Score(precision=0.0005193102948389141, recall=7.563314896762498e-05, fmeasure=0.00012449489896024392), mid=Score(precision=0.0019243646422639343, recall=0.00016320793035253138, fmeasure=0.00026761681735320306), high=Score(precision=0.004461120566261695, recall=0.00026909629622543944, fmeasure=0.00044424131284745265)), 'rougeL': AggregateScore(low=Score(precision=0.02901157063436032, recall=0.0022090000322053586, fmeasure=0.00358546134796503), mid=Score(precision=0.03751561917476741, recall=0.0028053642592990435, fmeasure=0.004421083963753408), high=Score(precision=0.04713143385218611, recall=0.003447489984385766, fmeasure=0.005360962581723463)), 'rougeLsum': AggregateScore(low=Score(precision=0.028854213973295895, recall=0.002274766545794876, fmeasure=0.0036587949855803096), mid=Score(precision=0.03773258913978569, recall=0.002798515994299667, fmeasure=0.0044088888627801985), high=Score(precision=0.047390460587802274, recall=0.003442136768710669, fmeasure=0.0053193395999020995))}


setup 4 pretrain (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.018925643670780676, recall=0.0032940894023704803, fmeasure=0.004759878955667924), mid=Score(precision=0.025762516790534776, recall=0.004282842559666898, fmeasure=0.006182844135155321), high=Score(precision=0.033473717841486464, recall=0.005462770556811458, fmeasure=0.007784247980584912)), 'rouge2': AggregateScore(low=Score(precision=1.1904761904761903e-05, recall=2.631578947368421e-05, fmeasure=1.7241379310344828e-05), mid=Score(precision=0.0004904761904761905, recall=0.0001388888888888889, fmeasure=0.00017733990147783253), high=Score(precision=0.001183630952380952, recall=0.00030266812865497076, fmeasure=0.00039819376026272574)), 'rougeL': AggregateScore(low=Score(precision=0.017579443225376197, recall=0.002958348445087641, fmeasure=0.0043371326903011865), mid=Score(precision=0.024789165278209603, recall=0.003936384145540256, fmeasure=0.00569274741028688), high=Score(precision=0.033054215119657636, recall=0.004953625313713798, fmeasure=0.007072376120417085)), 'rougeLsum': AggregateScore(low=Score(precision=0.017424819145004142, recall=0.002958173664274011, fmeasure=0.0042704284069091), mid=Score(precision=0.024909211160130975, recall=0.00390500235449091, fmeasure=0.0056597149714497155), high=Score(precision=0.03201574174040795, recall=0.004971711249632905, fmeasure=0.0070926954973514235))}


setup 6 pretrain (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.02718192730880231, recall=0.0026717142148058636, fmeasure=0.0045562897201596566), mid=Score(precision=0.03591679015429016, recall=0.003489515114214574, fmeasure=0.00590481992109375), high=Score(precision=0.04599801587301587, recall=0.004392681221245657, fmeasure=0.007342675102130404)), 'rouge2': AggregateScore(low=Score(precision=0.0, recall=0.0, fmeasure=0.0), mid=Score(precision=0.0013, recall=0.0001253387533875339, fmeasure=0.0002265446224256293), high=Score(precision=0.0035666666666666663, recall=0.0003865282012195121, fmeasure=0.0006800343249427916)), 'rougeL': AggregateScore(low=Score(precision=0.024741345806970808, recall=0.0024378641268484927, fmeasure=0.004158288779800289), mid=Score(precision=0.03449874847374848, recall=0.003216194103782987, fmeasure=0.0054687982004725175), high=Score(precision=0.04444560592185592, recall=0.004065869224296131, fmeasure=0.006880409615877091)), 'rougeLsum': AggregateScore(low=Score(precision=0.025836417402042396, recall=0.0025075401660765747, fmeasure=0.004305123654097008), mid=Score(precision=0.03461459651459651, recall=0.003222597709792241, fmeasure=0.005451766170243225), high=Score(precision=0.04507080502830505, recall=0.004067099565297772, fmeasure=0.006880476218548522))}
