In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [2]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Fri May  6 05:38:11 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 27.3 gigabytes of available RAM

You are using a high-RAM runtime!


In [4]:
# install datasets
!pip install datasets

from datasets import list_datasets, list_metrics, load_dataset, load_metric
from pprint import pprint
!pip install torch
!pip install -q pytorch-lightning
!pip install -q transformers


import transformers
from torch.utils.data import DataLoader, TensorDataset, random_split, RandomSampler, Dataset
import pandas as pd
import numpy as np

from transformers import LEDForConditionalGeneration, LEDTokenizer
from tqdm import tqdm
import torch

#import torch.nn.functional as F
import pytorch_lightning as pl
import torch
#from pytorch_lightning.callbacks import ModelCheckpoint

#import math
#import random
#import re

import argparse

Collecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[?25l[K     |█                               | 10 kB 31.2 MB/s eta 0:00:01[K     |██                              | 20 kB 34.2 MB/s eta 0:00:01[K     |███                             | 30 kB 19.9 MB/s eta 0:00:01[K     |████                            | 40 kB 12.2 MB/s eta 0:00:01[K     |█████                           | 51 kB 10.2 MB/s eta 0:00:01[K     |██████                          | 61 kB 11.8 MB/s eta 0:00:01[K     |███████                         | 71 kB 13.0 MB/s eta 0:00:01[K     |████████                        | 81 kB 12.4 MB/s eta 0:00:01[K     |█████████                       | 92 kB 13.6 MB/s eta 0:00:01[K     |██████████                      | 102 kB 13.4 MB/s eta 0:00:01[K     |███████████                     | 112 kB 13.4 MB/s eta 0:00:01[K     |████████████                    | 122 kB 13.4 MB/s eta 0:00:01[K     |█████████████                   | 133 kB 13.4 MB/s eta

In [5]:
# Downloading and loading a dataset
#dataset = load_dataset('cnn_dailymail', '3.0.0')

In [6]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=False)
root_dir = "/content/gdrive/My Drive/masters_thesis/"

Mounted at /content/gdrive


In [7]:
class LitModel(pl.LightningModule):
  # Instantiate the model
  def __init__(self, learning_rate, tokenizer, model, freeze_encoder, freeze_embeds):
    super().__init__()
    self.tokenizer = tokenizer
    self.model = model
    self.learning_rate = learning_rate
    self.freeze_encoder = freeze_encoder
    self.freeze_embeds = freeze_embeds

    if self.freeze_encoder:
      freeze_params(self.model.get_encoder())

    if self.freeze_embeds:
      self.freeze_embeds()
  
  def freeze_embeds(self):
    ''' freeze the positional embedding parameters of the model; adapted from finetune.py '''
    freeze_params(self.model.model.shared)
    for d in [self.model.model.encoder, self.model.model.decoder]:
      freeze_params(d.embed_positions)
      freeze_params(d.embed_tokens)

  # Do a forward pass through the model
  def forward(self, input_ids, **kwargs):
    return self.model(input_ids, **kwargs)
  
  def configure_optimizers(self):
    optimizer = torch.optim.Adam(self.parameters(), lr = self.learning_rate)
    return optimizer

  def training_step(self, batch, batch_idx):
    # Load the data into variables
    src_ids, src_mask = batch[0], batch[1]
    tgt_ids = batch[2]
    # Shift the decoder tokens right (but NOT the tgt_ids)
    decoder_input_ids = shift_tokens_right(tgt_ids, tokenizer.pad_token_id)

    # Run the model and get the logits
    outputs = self(src_ids, attention_mask=src_mask, decoder_input_ids=decoder_input_ids, use_cache=False)
    lm_logits = outputs[0]
    # Create the loss function
    ce_loss_fct = torch.nn.CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)
    # Calculate the loss on the un-shifted tokens
    loss = ce_loss_fct(lm_logits.view(-1, lm_logits.shape[-1]), tgt_ids.view(-1))

    return {'loss':loss}

  def validation_step(self, batch, batch_idx):

    src_ids, src_mask = batch[0], batch[1]
    tgt_ids = batch[2]

    decoder_input_ids = shift_tokens_right(tgt_ids, tokenizer.pad_token_id)
    
    # Run the model and get the logits
    outputs = self(src_ids, attention_mask=src_mask, decoder_input_ids=decoder_input_ids, use_cache=False)
    lm_logits = outputs[0]

    ce_loss_fct = torch.nn.CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)
    val_loss = ce_loss_fct(lm_logits.view(-1, lm_logits.shape[-1]), tgt_ids.view(-1))

    return {'loss': val_loss}
  
  # Method that generates text using the LEDForConditionalGeneration's generate() method
  def generate_text(self, text, eval_beams, early_stopping = True, max_len = 1024):
    ''' Function to generate text '''
    device = "cuda" if torch.cuda.is_available() else "cpu"
    #device = "cpu"
    #device = "cuda"
    generated_ids = self.model.generate(
        text["input_ids"].to(device),
        attention_mask=text["attention_mask"].to(device),
        num_beams= eval_beams,
        length_penalty=2.0,
        max_length = max_len,
        min_length= 120,
        no_repeat_ngram_size=3,
        early_stopping = early_stopping,
        decoder_start_token_id= self.tokenizer.eos_token_id
    )
    return [self.tokenizer.decode(w, skip_special_tokens=True, clean_up_tokenization_spaces=False) for w in generated_ids]

def freeze_params(model):
  ''' Function that takes a model as input (or part of a model) and freezes the layers for faster training
      adapted from finetune.py '''
  for layer in model.parameters():
    layer.requires_grade = False

In [8]:
# Create a dataloading module as per the PyTorch Lightning Docs
class SummaryDataModule(pl.LightningDataModule):
  def __init__(self, tokenizer, data_file, batch_size, num_examples = 7000):
    super().__init__()
    self.tokenizer = tokenizer
    self.data_file = data_file
    self.batch_size = batch_size
    self.num_examples = num_examples
  
  # Loads and splits the data into training, validation and test sets with a 60/20/20 split
  def prepare_data(self):
    self.data = pd.read_csv(self.data_file).dropna()[:self.num_examples]

    self.data['source'].astype(str)
    self.data['target'].astype(str)
    self.train, self.validate, self.test = np.split(self.data.sample(frac=1), [int(.6*len(self.data)), int(.8*len(self.data))])

  # encode the sentences using the tokenizer  
  def setup(self, stage):
    self.train = encode_sentences(self.tokenizer, self.train['source'], self.train['target'])
    self.validate = encode_sentences(self.tokenizer, self.validate['source'], self.validate['target'])
    self.test = encode_sentences(self.tokenizer, self.test['source'], self.test['target'])

  # Load the training, validation and test sets in Pytorch Dataset objects
  def train_dataloader(self):
    dataset = TensorDataset(self.train['input_ids'], self.train['attention_mask'], self.train['labels'])                          
    train_data = DataLoader(dataset, sampler = RandomSampler(dataset), batch_size = self.batch_size)
    return train_data

  def val_dataloader(self):
    dataset = TensorDataset(self.validate['input_ids'], self.validate['attention_mask'], self.validate['labels']) 
    val_data = DataLoader(dataset, batch_size = self.batch_size)                       
    return val_data

  def test_dataloader(self):
    dataset = TensorDataset(self.test['input_ids'], self.test['attention_mask'], self.test['labels']) 
    test_data = DataLoader(dataset, batch_size = self.batch_size)                   
    return test_data


In [9]:
def shift_tokens_right(input_ids, pad_token_id):
  """ Shift input ids one token to the right, and wrap the last non pad token (usually <eos>).
      This is taken directly from modeling_bart.py
  """
  prev_output_tokens = input_ids.clone()
  index_of_eos = (input_ids.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1)
  prev_output_tokens[:, 0] = input_ids.gather(1, index_of_eos).squeeze()
  prev_output_tokens[:, 1:] = input_ids[:, :-1]
  return prev_output_tokens

def encode_sentences(tokenizer, source_sentences, target_sentences, max_length=1024, pad_to_max_length=True, return_tensors="pt"):
  ''' Function that tokenizes a sentence 
      Args: tokenizer - the LED tokenizer; source and target sentences are the source and target sentences
      Returns: Dictionary with keys: input_ids, attention_mask, target_ids
  '''

  input_ids = []
  attention_masks = []
  target_ids = []
  tokenized_sentences = {}

  for sentence in source_sentences:
    encoded_dict = tokenizer(
          sentence,
          max_length=max_length,
          padding="max_length" if pad_to_max_length else None,
          truncation=True,
          return_tensors=return_tensors,
          add_prefix_space = True
      )

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

  input_ids = torch.cat(input_ids, dim = 0)
  attention_masks = torch.cat(attention_masks, dim = 0)

  for sentence in target_sentences:
    encoded_dict = tokenizer(
          str(sentence),
          max_length=max_length,
          padding="max_length" if pad_to_max_length else None,
          truncation=True,
          return_tensors=return_tensors,
          add_prefix_space = True
      )
    # Shift the target ids to the right
    # shifted_target_ids = shift_tokens_right(encoded_dict['input_ids'], tokenizer.pad_token_id)
    target_ids.append(encoded_dict['input_ids'])

  target_ids = torch.cat(target_ids, dim = 0)
  

  batch = {
      "input_ids": input_ids,
      "attention_mask": attention_masks,
      "labels": target_ids,
  }

  return batch


def noise_sentence(sentence_, percent_words, replacement_token = "<mask>"):
  '''
  Function that noises a sentence by adding <mask> tokens
  Args: sentence - the sentence to noise
        percent_words - the percent of words to replace with <mask> tokens; the number is rounded up using math.ceil
  Returns a noised sentence
  '''
  # Create a list item and copy
  sentence_ = sentence_.split(' ')
  sentence = sentence_.copy()
  
  num_words = math.ceil(len(sentence) * percent_words)
  
  # Create an array of tokens to sample from; don't include the last word as an option because in the case of lyrics
  # that word is often a rhyming word and plays an important role in song construction
  sample_tokens = set(np.arange(0, np.maximum(1, len(sentence)-1)))
  
  words_to_noise = random.sample(sample_tokens, num_words)
  
  # Swap out words, but not full stops
  for pos in words_to_noise:
      if sentence[pos] != '.':
          sentence[pos] = replacement_token
  
  # Remove redundant spaces
  sentence = re.sub(r' {2,5}', ' ', ' '.join(sentence))
  
  # Combine concurrent <mask> tokens into a single token; this just does two rounds of this; more could be done
  sentence = re.sub(r'<mask> <mask>', "<mask>", sentence)
  sentence = re.sub(r'<mask> <mask>', "<mask>", sentence)
  return sentence

In [10]:
# Load the model
from transformers import AdamW

tokenizer = LEDTokenizer.from_pretrained('allenai/led-base-16384')

led_model = LEDForConditionalGeneration.from_pretrained("allenai/led-base-16384")

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/618M [00:00<?, ?B/s]

In [11]:
is_training = True

In [12]:
# Load the data into the model for training
data_path = root_dir + "setup1_training.csv"
summary_data = SummaryDataModule(tokenizer, data_path,
                                 batch_size = 3)

# Load the model from a pre-saved checkpoint; alternatively use the code below to start training from scratch
# model = LitModel.load_from_checkpoint(base_dir + "checkpoint_files_2/8_ep_140k_simple_0210.ckpt",
#                                       learning_rate = 2e-5, tokenizer = tokenizer, model = bart_model, hparams = hparams)

custom_model = LitModel(learning_rate = 2e-5, tokenizer = tokenizer, model = led_model, freeze_encoder = True, freeze_embeds = False)

In [13]:
from pytorch_lightning.callbacks import ModelCheckpoint
if is_training:
  checkpoint = ModelCheckpoint(dirpath=root_dir + 'checkpoint_files/')
  trainer = pl.Trainer(gpus = 1,
                      max_epochs = 3,
                      min_epochs = 3,
                      auto_lr_find = False,
                      checkpoint_callback = checkpoint,
                      progress_bar_refresh_rate = 500)
  trainer.fit(custom_model, summary_data)
  torch.save(custom_model, root_dir + "models/longformer_all_setup1_training_model_may_5_beam_10.pt")

  f"Setting `Trainer(checkpoint_callback={checkpoint_callback})` is deprecated in v1.5 and will "
  f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /content/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                        | Params
------------------------------------------------------
0 | model | LEDForConditionalGeneration | 161 M 
------------------------------------------------------
161 M     Trainable params
0         Non-trainable params
161 M     Total params
647.378   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [14]:
#import re
import pickle
from datetime import datetime
# import copy
import csv

DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
#DEFAULT_DEVICE = "cpu"
def create_csv(all_sentences, targets, file_to_write):
  sources = []
  fieldnames = ["source", "target"]
  test_array = []
  with open(file_to_write, 'w') as csvfile:
    csvwriter = csv.DictWriter(csvfile, delimiter=',', fieldnames=fieldnames)
    for t in range(len(targets)):
        test_array.append({"source": all_sentences[t], "target": targets[t]})
        #writer.writerow({sources[t], targets[t]})
    csvwriter.writerow(dict((fn,fn) for fn in fieldnames))
    for row in test_array:
      csvwriter.writerow(row)


def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i : i + n]

def generate_summaries(lns, metric, batch_size=1, device=DEFAULT_DEVICE):
    model = None
    tokenizer = LEDTokenizer.from_pretrained('allenai/led-base-16384')
    if is_training:
      model = custom_model.to(device)
    else:
      model = LEDForConditionalGeneration.from_pretrained("allenai/led-base-16384").to(device)
    article_batches = list(chunks(lns['source'], batch_size))
    target_batches = list(chunks(lns['target'], batch_size))
    ls_prediction = []
    ls_groundtruth = []

    dec_batches_untokenized = []
    target_batches_untokenized = []
    for article_batch, target_batch in tqdm(zip(article_batches, target_batches)
    , total=len(article_batches)):
        with tokenizer.as_target_tokenizer():
          dct = tokenizer(str(article_batch), truncation=True, padding='longest', return_tensors="pt")
          dec = []
          if is_training:
            dec = model.generate_text(dct,10)
          else:
            summaries = model.generate(
                input_ids=dct.input_ids.to(device),
                attention_mask=dct["attention_mask"].to(device),
                num_beams=10,
                length_penalty=2.0,
                max_length=4096,
                min_length=120,
                no_repeat_ngram_size=3,
                early_stopping=True,
                decoder_start_token_id=tokenizer.eos_token_id,
            )
            dec = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summaries]  
          dec = [d.replace('. ', '.\n') for d in dec]

          dec_batches_untokenized.append(dec)
          target_batches_untokenized.append(target_batch)
          

          ls_prediction.extend(dec)
          ls_groundtruth.extend(target_batch)

    
    
    ls_prediction_tokenized = coreNLP_tokenizer(ls_prediction)
    target_batch_tokenized = coreNLP_tokenizer(ls_groundtruth)

    #for dec_batch, target_batch in tqdm(zip(dec_batches, target_batches), total=len(dec_batches)):
    metric.add_batch(predictions=ls_prediction_tokenized, references=target_batch_tokenized)

    score = metric.compute()
    str_now = str(datetime.now())
    result_file_name = root_dir + "longformer_all_setup1_testing_model_generated_summaries_may_5_beam_10.csv"
    create_csv(ls_groundtruth, ls_prediction, result_file_name)
    return score

In [15]:
!pip install rouge_score
from datasets import list_metrics
metrics_list = list_metrics()
len(metrics_list)
print (metrics_list)
rouge_metric = load_metric('rouge')

Collecting rouge_score
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl (22 kB)
Installing collected packages: rouge-score
Successfully installed rouge-score-0.0.4
['accuracy', 'bertscore', 'bleu', 'bleurt', 'cer', 'chrf', 'code_eval', 'comet', 'competition_math', 'coval', 'cuad', 'exact_match', 'f1', 'frugalscore', 'glue', 'google_bleu', 'indic_glue', 'mae', 'mahalanobis', 'matthews_correlation', 'mauve', 'mean_iou', 'meteor', 'mse', 'pearsonr', 'perplexity', 'precision', 'recall', 'roc_auc', 'rouge', 'sacrebleu', 'sari', 'seqeval', 'spearmanr', 'squad', 'squad_v2', 'super_glue', 'ter', 'wer', 'wiki_split', 'xnli', 'xtreme_s']


Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [16]:
# Install stanza; note that the prefix "!" is not needed if you are running in a terminal
!pip install stanza

# Import stanza
import stanza

Collecting stanza
  Downloading stanza-1.4.0-py3-none-any.whl (574 kB)
[?25l[K     |▋                               | 10 kB 34.2 MB/s eta 0:00:01[K     |█▏                              | 20 kB 38.1 MB/s eta 0:00:01[K     |█▊                              | 30 kB 26.5 MB/s eta 0:00:01[K     |██▎                             | 40 kB 13.8 MB/s eta 0:00:01[K     |██▉                             | 51 kB 13.1 MB/s eta 0:00:01[K     |███▍                            | 61 kB 15.3 MB/s eta 0:00:01[K     |████                            | 71 kB 15.0 MB/s eta 0:00:01[K     |████▋                           | 81 kB 14.3 MB/s eta 0:00:01[K     |█████▏                          | 92 kB 15.8 MB/s eta 0:00:01[K     |█████▊                          | 102 kB 14.4 MB/s eta 0:00:01[K     |██████▎                         | 112 kB 14.4 MB/s eta 0:00:01[K     |██████▉                         | 122 kB 14.4 MB/s eta 0:00:01[K     |███████▍                        | 133 kB 14.4 MB/s eta 0:0

In [17]:
# Download the Stanford CoreNLP package with Stanza's installation command
# This'll take several minutes, depending on the network speed
corenlp_dir = './corenlp'
stanza.install_corenlp(dir=corenlp_dir)

# Set the CORENLP_HOME environment variable to point to the installation location
import os
os.environ["CORENLP_HOME"] = corenlp_dir

2022-05-06 07:43:09 INFO: Installing CoreNLP package into ./corenlp


Downloading https://huggingface.co/stanfordnlp/CoreNLP/resolve/main/stanford-corenlp-latest.zip:   0%|        …



In [18]:
# Examine the CoreNLP installation folder to make sure the installation is successful
!export CORENLP_HOME='./corenlp'
!ls $CORENLP_HOME

build.xml				  jollyday.jar
corenlp.sh				  LIBRARY-LICENSES
CoreNLP-to-HTML.xsl			  LICENSE.txt
ejml-core-0.39.jar			  Makefile
ejml-core-0.39-sources.jar		  patterns
ejml-ddense-0.39.jar			  pom-java-11.xml
ejml-ddense-0.39-sources.jar		  pom-java-17.xml
ejml-simple-0.39.jar			  pom.xml
ejml-simple-0.39-sources.jar		  protobuf-java-3.19.2.jar
input.txt				  README.txt
input.txt.out				  RESOURCE-LICENSES
input.txt.xml				  SemgrexDemo.java
istack-commons-runtime-3.0.7.jar	  ShiftReduceDemo.java
istack-commons-runtime-3.0.7-sources.jar  slf4j-api.jar
javax.activation-api-1.2.0.jar		  slf4j-simple.jar
javax.activation-api-1.2.0-sources.jar	  stanford-corenlp-4.4.0.jar
javax.json-api-1.0-sources.jar		  stanford-corenlp-4.4.0-javadoc.jar
javax.json.jar				  stanford-corenlp-4.4.0-models.jar
jaxb-api-2.4.0-b180830.0359.jar		  stanford-corenlp-4.4.0-sources.jar
jaxb-api-2.4.0-b180830.0359-sources.jar   StanfordCoreNlpDemo.java
jaxb-impl-2.4.0-b180830.0438.jar	  StanfordDependenciesManual.p

In [19]:
# Import client module
from stanza.server import CoreNLPClient

In [20]:
texts = ["Albert Einstein was a German-born theoretical physicist.", "He was going to the school!"]
def coreNLP_tokenizer(inputDocsList):
  tokenizedDocsList = []
  with CoreNLPClient(annotators="tokenize ssplit pos lemma ner depparse".split(), memory='4G', endpoint='http://localhost:9001', be_quiet=True) as client:
    for d in inputDocsList:
      ann = client.annotate(d)

      # You can access annotations using ann.
      sentence = ann.sentence[0]

      # You can access any property within a sentence.
      #print(sentence.text)

      # Likewise for tokens
      #token = sentence.token[0]
      #print (token)
      tokenizedDocsList.append(' '.join([token.word.lower() for token in sentence.token]))
  return tokenizedDocsList

#print(coreNLP_tokenizer(texts))



In [21]:
import pandas as pd

#CUDA_LAUNCH_BLOCKING = 1

df = pd.read_csv(root_dir + "setup1_testing.csv")
df.dropna()
df['source'] = df['source'].astype(str)
df['target'] = df['target'].astype(str)
df = df.iloc[:1000,:]
score = generate_summaries(df, rouge_metric)

100%|██████████| 1000/1000 [6:30:15<00:00, 23.42s/it]
2022-05-06 14:13:43 INFO: Writing properties to tmp file: corenlp_server-275ccefd475a4ae9.props
2022-05-06 14:13:43 INFO: Starting server with command: java -Xmx4G -cp ./corenlp/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9001 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-275ccefd475a4ae9.props -annotators tokenize,ssplit,pos,lemma,ner,depparse -preload -outputFormat serialized
2022-05-06 14:40:07 INFO: Writing properties to tmp file: corenlp_server-5f4dc9931c57496e.props
2022-05-06 14:40:07 INFO: Starting server with command: java -Xmx4G -cp ./corenlp/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9001 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-5f4dc9931c57496e.props -annotators tokenize,ssplit,pos,lemma,ner,depparse -preload -outputFormat serialized


In [22]:
print(score)
#########.........//////.....

{'rouge1': AggregateScore(low=Score(precision=0.36741981932054457, recall=0.2411219344539743, fmeasure=0.26426464596958127), mid=Score(precision=0.377265172600492, recall=0.25099803017008027, fmeasure=0.2729717070771628), high=Score(precision=0.38686443599408404, recall=0.25963379549328014, fmeasure=0.28072302740794974)), 'rouge2': AggregateScore(low=Score(precision=0.19515614158488404, recall=0.13238633757282703, fmeasure=0.14316685480249208), mid=Score(precision=0.2030294862144325, recall=0.1393367645109842, fmeasure=0.14953839309625772), high=Score(precision=0.2114526450052862, recall=0.14705671752277477, fmeasure=0.15668164191146322)), 'rougeL': AggregateScore(low=Score(precision=0.3089240869581699, recall=0.2005793497450505, fmeasure=0.21922397448427783), mid=Score(precision=0.3184304739125602, recall=0.2093513113661263, fmeasure=0.22734699878436287), high=Score(precision=0.3285496207185787, recall=0.21800339418733122, fmeasure=0.23536441331682095)), 'rougeLsum': AggregateScore(lo

setup 4 (first 1000): 
{'rouge1': AggregateScore(low=Score(precision=0.12676402749186647, recall=0.1243895771018444, fmeasure=0.10616343632842366), mid=Score(precision=0.1374734172869369, recall=0.13308975943829415, fmeasure=0.11371971241464641), high=Score(precision=0.1491361253114805, recall=0.14238259396567488, fmeasure=0.12192471636024398)), 'rouge2': AggregateScore(low=Score(precision=0.06763095844029772, recall=0.06306909627539771, fmeasure=0.055072135040352514), mid=Score(precision=0.07724502113588028, recall=0.07109708569045564, fmeasure=0.06232977530140962), high=Score(precision=0.08769973674024152, recall=0.07979096473471814, fmeasure=0.06947080937910077)), 'rougeL': AggregateScore(low=Score(precision=0.12131761505902182, recall=0.1174768978814088, fmeasure=0.10098369305260012), mid=Score(precision=0.13158133320361337, recall=0.126660483585323, fmeasure=0.1084044319408635), high=Score(precision=0.1431784588584353, recall=0.1353204374497491, fmeasure=0.11646913914239637)), 'rougeLsum': AggregateScore(low=Score(precision=0.12006980312156863, recall=0.11709664594441264, fmeasure=0.0998254621848517), mid=Score(precision=0.1312656232122225, recall=0.1261369172848047, fmeasure=0.10799070880559006), high=Score(precision=0.14347230071754044, recall=0.1361034697818301, fmeasure=0.1169094080568949))}


setup 7 (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.11001208657683142, recall=0.04045481686863176, fmeasure=0.05237701500133258), mid=Score(precision=0.1194466562211498, recall=0.04302819905099328, fmeasure=0.05578340711606748), high=Score(precision=0.1292037026629098, recall=0.04567050649885161, fmeasure=0.05913484681833105)), 'rouge2': AggregateScore(low=Score(precision=0.046948508219031675, recall=0.015413266101110448, fmeasure=0.020719472564778105), mid=Score(precision=0.05485353160703156, recall=0.01736357628235217, fmeasure=0.023394284111719864), high=Score(precision=0.063666518462795, recall=0.019477039699428326, fmeasure=0.026273134065780513)), 'rougeL': AggregateScore(low=Score(precision=0.09636069235602283, recall=0.03483178713338002, fmeasure=0.04526095198313408), mid=Score(precision=0.10551445130054268, recall=0.03705465446587998, fmeasure=0.04839128892779364), high=Score(precision=0.11499993023092485, recall=0.03937464761585254, fmeasure=0.051622887285731035)), 'rougeLsum': AggregateScore(low=Score(precision=0.09639365060616946, recall=0.03488481013289982, fmeasure=0.045550986384301835), mid=Score(precision=0.10533836423014764, recall=0.037136149978025265, fmeasure=0.04851131834110149), high=Score(precision=0.11462710118088025, recall=0.03943424485318742, fmeasure=0.0516170263420659))}


changed min length to 120

setup 1 (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.11687732289105966, recall=0.04014398160461853, fmeasure=0.05465367080095843), mid=Score(precision=0.1265933728855697, recall=0.04270572058375072, fmeasure=0.05801835106201137), high=Score(precision=0.13679797507158223, recall=0.045182368668333306, fmeasure=0.06145759179595859)), 'rouge2': AggregateScore(low=Score(precision=0.051868612117803324, recall=0.015808002505048175, fmeasure=0.021943016695097087), mid=Score(precision=0.05971351413557231, recall=0.017676904986553874, fmeasure=0.024669552087712543), high=Score(precision=0.06887206069821124, recall=0.019852127570665925, fmeasure=0.027584237824726483)), 'rougeL': AggregateScore(low=Score(precision=0.10274686276156642, recall=0.034853435483270585, fmeasure=0.047486266220433004), mid=Score(precision=0.11218288949929132, recall=0.036878331643355924, fmeasure=0.0503254101138548), high=Score(precision=0.12165771962612837, recall=0.03918461373553703, fmeasure=0.053606501218243736)), 'rougeLsum': AggregateScore(low=Score(precision=0.10271881078073201, recall=0.03463093520354525, fmeasure=0.04720201732571146), mid=Score(precision=0.11186515734589028, recall=0.03693152213802084, fmeasure=0.05040554251621235), high=Score(precision=0.1215055238360565, recall=0.03910123127081614, fmeasure=0.05336639892867457))}


setup 4 (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.12849397403489038, recall=0.12270366552907798, fmeasure=0.10829120802919376), mid=Score(precision=0.140821431485676, recall=0.13137548389680376, fmeasure=0.11671142888419178), high=Score(precision=0.15246166763009875, recall=0.14147683382638573, fmeasure=0.12490570212097199)), 'rouge2': AggregateScore(low=Score(precision=0.06971420497405997, recall=0.06254955371536859, fmeasure=0.05708198199096667), mid=Score(precision=0.07894477232974675, recall=0.07085531046989284, fmeasure=0.06385896193628214), high=Score(precision=0.08828802395556645, recall=0.07908251926648725, fmeasure=0.07110458473943607)), 'rougeL': AggregateScore(low=Score(precision=0.12439181587016536, recall=0.11677445465279772, fmeasure=0.10367764341488736), mid=Score(precision=0.13469536399414372, recall=0.1253660941753088, fmeasure=0.11155678076244473), high=Score(precision=0.14667602221723405, recall=0.13506661252110905, fmeasure=0.11905509642501769)), 'rougeLsum': AggregateScore(low=Score(precision=0.12399153764743331, recall=0.11658736987457856, fmeasure=0.1036338937130028), mid=Score(precision=0.13467593946297157, recall=0.12520010289226813, fmeasure=0.11132459068533773), high=Score(precision=0.14658937907247752, recall=0.13526400377849546, fmeasure=0.11964983219874523))}


setup 6 (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.12601862892492321, recall=0.12070698851152085, fmeasure=0.10557046274762849), mid=Score(precision=0.1372413679627249, recall=0.12939003949590488, fmeasure=0.11352699175023533), high=Score(precision=0.1485579632047288, recall=0.13929270813132713, fmeasure=0.1215063476549872)), 'rouge2': AggregateScore(low=Score(precision=0.06670723076696676, recall=0.06382670150082534, fmeasure=0.05515348933587496), mid=Score(precision=0.07618088568653203, recall=0.07108878602589082, fmeasure=0.06186477767003744), high=Score(precision=0.08602347189118403, recall=0.0798876933937434, fmeasure=0.06893467026529361)), 'rougeL': AggregateScore(low=Score(precision=0.12064352064757146, recall=0.11515314765773622, fmeasure=0.10070949771620794), mid=Score(precision=0.13112781307681176, recall=0.12400208127458631, fmeasure=0.10826928291723247), high=Score(precision=0.1423538716413128, recall=0.13331341013971537, fmeasure=0.11612373321593479)), 'rougeLsum': AggregateScore(low=Score(precision=0.12059938943675269, recall=0.11573433118896029, fmeasure=0.10113808051483987), mid=Score(precision=0.13134564862677936, recall=0.12381879528023762, fmeasure=0.10855163143223544), high=Score(precision=0.14355320698040513, recall=0.13331423887782756, fmeasure=0.11624232748101224))}


setup 7 (first 1000): 
{'rouge1': AggregateScore(low=Score(precision=0.11393711786848833, recall=0.03947783913378618, fmeasure=0.05368971971999923), mid=Score(precision=0.1235161564714315, recall=0.042052954553266346, fmeasure=0.05722932125028428), high=Score(precision=0.13301857057855784, recall=0.04460666500046138, fmeasure=0.06068823236305559)), 'rouge2': AggregateScore(low=Score(precision=0.0488114738625811, recall=0.015413266101110448, fmeasure=0.02155846338949785), mid=Score(precision=0.05661566928553223, recall=0.01736357628235217, fmeasure=0.024275653102911444), high=Score(precision=0.06565073587734113, recall=0.019477039699428326, fmeasure=0.027213279424686796)), 'rougeL': AggregateScore(low=Score(precision=0.09934090112635985, recall=0.03408877057771483, fmeasure=0.04653913904622801), mid=Score(precision=0.10872231263786207, recall=0.03633307490356384, fmeasure=0.04965210858233325), high=Score(precision=0.11836488886492418, recall=0.0386558203959537, fmeasure=0.052908752207389304)), 'rougeLsum': AggregateScore(low=Score(precision=0.09970007348669532, recall=0.034145861009419125, fmeasure=0.046613980757117285), mid=Score(precision=0.10859067854182662, recall=0.03639878993999923, fmeasure=0.04973670444481135), high=Score(precision=0.11771109570351279, recall=0.03869591648712051, fmeasure=0.05294469220930701))}


setup 8 (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.11388365492449819, recall=0.039561718974868806, fmeasure=0.053678564272793036), mid=Score(precision=0.12339863480879322, recall=0.04212245753205353, fmeasure=0.05725323936775174), high=Score(precision=0.13370716079039746, recall=0.04466059813570184, fmeasure=0.060778046330899886)), 'rouge2': AggregateScore(low=Score(precision=0.04937110782070203, recall=0.015585328993421091, fmeasure=0.02176702560016567), mid=Score(precision=0.05673204213813374, recall=0.017404486422061237, fmeasure=0.02429341886715373), high=Score(precision=0.06507325538412874, recall=0.01950427094949566, fmeasure=0.02725147077070526)), 'rougeL': AggregateScore(low=Score(precision=0.09917085262246589, recall=0.03401001205094645, fmeasure=0.04633282463461707), mid=Score(precision=0.10834963228689595, recall=0.03635971455869416, fmeasure=0.04964390457059806), high=Score(precision=0.11848261340988835, recall=0.03861124808757903, fmeasure=0.0529738122010208)), 'rougeLsum': AggregateScore(low=Score(precision=0.09965632960710162, recall=0.03408026498474715, fmeasure=0.046601087764754405), mid=Score(precision=0.10869675122084535, recall=0.03633775054631419, fmeasure=0.04976251818208844), high=Score(precision=0.11812962292052906, recall=0.03865613161519378, fmeasure=0.05292000363049567))}


changed beam size to 10

setup 1 led pretrain (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.1102898914433309, recall=0.039811379363591026, fmeasure=0.05274529467405813), mid=Score(precision=0.1196852650317021, recall=0.04232781704879806, fmeasure=0.056034108329345925), high=Score(precision=0.1305563130132194, recall=0.04491251197364141, fmeasure=0.05957784524140242)), 'rouge2': AggregateScore(low=Score(precision=0.04912289916105421, recall=0.015690069777642108, fmeasure=0.02132375788937462), mid=Score(precision=0.057207419692128336, recall=0.017611925402195954, fmeasure=0.02401028095521615), high=Score(precision=0.06616574084642274, recall=0.019546524413899768, fmeasure=0.02679756429876761)), 'rougeL': AggregateScore(low=Score(precision=0.09675283685764945, recall=0.03452070345100315, fmeasure=0.045762383183067336), mid=Score(precision=0.10624292366191432, recall=0.03679456487320833, fmeasure=0.04886068638193852), high=Score(precision=0.11645300052527718, recall=0.03897172789717304, fmeasure=0.05196954897849838)), 'rougeLsum': AggregateScore(low=Score(precision=0.09654085172159185, recall=0.03455884104697888, fmeasure=0.04555201456542956), mid=Score(precision=0.1061991069796288, recall=0.03668145976928683, fmeasure=0.04871067222098954), high=Score(precision=0.11584201947129528, recall=0.038990941496436664, fmeasure=0.051989338796642616))}


setup 4 pretrain (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.11656255442773193, recall=0.11957724318556308, fmeasure=0.09956799779624041), mid=Score(precision=0.12830157138969658, recall=0.12888823568201768, fmeasure=0.10751390746127663), high=Score(precision=0.13942615726081037, recall=0.13879559218684898, fmeasure=0.1160157749428302)), 'rouge2': AggregateScore(low=Score(precision=0.062389753461162045, recall=0.06235844536602485, fmeasure=0.05197182138415283), mid=Score(precision=0.07170230564288867, recall=0.07075036112316273, fmeasure=0.05900042336040234), high=Score(precision=0.08192398702814235, recall=0.07925988705075095, fmeasure=0.06632558610794548)), 'rougeL': AggregateScore(low=Score(precision=0.11242664603912138, recall=0.11437861051238106, fmeasure=0.09550782979924025), mid=Score(precision=0.12315089248527095, recall=0.12326094220180966, fmeasure=0.10307579838712755), high=Score(precision=0.1341543847394124, recall=0.13273605798303578, fmeasure=0.11108102701893095)), 'rougeLsum': AggregateScore(low=Score(precision=0.11204661150899915, recall=0.1145005529582778, fmeasure=0.09541742105028406), mid=Score(precision=0.12287463987817077, recall=0.12368573631533053, fmeasure=0.10300907368748652), high=Score(precision=0.13483979709425828, recall=0.13331005128916315, fmeasure=0.11131215463411326))}


setup 6 pretrain (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.11448877418215049, recall=0.12078815911800149, fmeasure=0.09892135837601854), mid=Score(precision=0.12616298801337922, recall=0.1291952468393606, fmeasure=0.10606865014033859), high=Score(precision=0.1372819859978413, recall=0.13816652477537272, fmeasure=0.11347392592668104)), 'rouge2': AggregateScore(low=Score(precision=0.06098883028701353, recall=0.06431091261541241, fmeasure=0.05169537218057904), mid=Score(precision=0.07018998361771908, recall=0.07182209898512787, fmeasure=0.058130744513861424), high=Score(precision=0.08005216668535753, recall=0.07989095915339807, fmeasure=0.06486307508805753)), 'rougeL': AggregateScore(low=Score(precision=0.11024178584048723, recall=0.11540148586756681, fmeasure=0.0942620688256891), mid=Score(precision=0.12049433923424843, recall=0.12376942284444556, fmeasure=0.10118254454574072), high=Score(precision=0.13134763736299182, recall=0.1330500936994045, fmeasure=0.10804190307589415)), 'rougeLsum': AggregateScore(low=Score(precision=0.11032290739239496, recall=0.11455845161913424, fmeasure=0.09397503967575796), mid=Score(precision=0.12037956803027683, recall=0.12357368805415643, fmeasure=0.10121237327459205), high=Score(precision=0.13078849915001994, recall=0.13236527008987978, fmeasure=0.10799187342879495))}


setup 7 pretrain (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.10585352409651193, recall=0.038774313254995404, fmeasure=0.05135452736228228), mid=Score(precision=0.11578028524070172, recall=0.04134765237010897, fmeasure=0.054893354870034654), high=Score(precision=0.12554822288137202, recall=0.04387949926291936, fmeasure=0.05824363401754222)), 'rouge2': AggregateScore(low=Score(precision=0.045822826498453456, recall=0.015415500079728157, fmeasure=0.020897317724525546), mid=Score(precision=0.05321644727738433, recall=0.017331231121082286, fmeasure=0.0234467178534419), high=Score(precision=0.06139354655738272, recall=0.01930826290135928, fmeasure=0.026239008538554143)), 'rougeL': AggregateScore(low=Score(precision=0.09237631373498117, recall=0.03371278741731466, fmeasure=0.04483218798650849), mid=Score(precision=0.1014765747502962, recall=0.035744952548982536, fmeasure=0.04750294616052084), high=Score(precision=0.11047331516545919, recall=0.03805674576942187, fmeasure=0.05059823422150759)), 'rougeLsum': AggregateScore(low=Score(precision=0.09220758463329325, recall=0.033634270193815714, fmeasure=0.04461432619285068), mid=Score(precision=0.1016156400430542, recall=0.03576267056100264, fmeasure=0.04754239998928384), high=Score(precision=0.11058117422887431, recall=0.038013899876562576, fmeasure=0.05059033263428229))}


setup 8 pretrain (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.10661531782778043, recall=0.03897264337884068, fmeasure=0.051695340977995365), mid=Score(precision=0.11594013941501805, recall=0.04148788070454498, fmeasure=0.05499215463358089), high=Score(precision=0.1256128040842046, recall=0.04389407687744274, fmeasure=0.05850980754432481)), 'rouge2': AggregateScore(low=Score(precision=0.04566550458882463, recall=0.015475102257720152, fmeasure=0.020828313534158514), mid=Score(precision=0.05314756854919865, recall=0.017295237500847375, fmeasure=0.023462298929750665), high=Score(precision=0.060828266528268504, recall=0.019256216541315964, fmeasure=0.02613259028209841)), 'rougeL': AggregateScore(low=Score(precision=0.09228550664885217, recall=0.033804528710642594, fmeasure=0.04474762826917471), mid=Score(precision=0.10130553329665662, recall=0.03580824882800744, fmeasure=0.04760151100148376), high=Score(precision=0.11108819031634436, recall=0.037975214229086535, fmeasure=0.050694816299376916)), 'rougeLsum': AggregateScore(low=Score(precision=0.09295453765782483, recall=0.03369185458016215, fmeasure=0.04475826647436695), mid=Score(precision=0.10154732903514693, recall=0.03577502406652523, fmeasure=0.047537743791587546), high=Score(precision=0.11116723727889279, recall=0.03798612702347863, fmeasure=0.05071368189246481))}


now fine-tuning with custom data (1024 encoding length)

setup 6 all (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.2299274385988509, recall=0.24480616026924798, fmeasure=0.21008398661976518), mid=Score(precision=0.24149810947214323, recall=0.25418591225337156, fmeasure=0.21867586508683834), high=Score(precision=0.2545487798975191, recall=0.2641984776494098, fmeasure=0.22852456780645541)), 'rouge2': AggregateScore(low=Score(precision=0.10813204221744069, recall=0.10506748572145153, fmeasure=0.09448626599277889), mid=Score(precision=0.11802567565062193, recall=0.11292871943565323, fmeasure=0.10159667280687933), high=Score(precision=0.12785712404910127, recall=0.12140820151137183, fmeasure=0.10915251659301868)), 'rougeL': AggregateScore(low=Score(precision=0.21023597229547686, recall=0.2200078195332527, fmeasure=0.19029585753801617), mid=Score(precision=0.22168637253765736, recall=0.2292071837624754, fmeasure=0.19951387086443478), high=Score(precision=0.23378589191643268, recall=0.2386712762007025, fmeasure=0.20813416229768322)), 'rougeLsum': AggregateScore(low=Score(precision=0.210607701103706, recall=0.2200937134424895, fmeasure=0.1909758972252164), mid=Score(precision=0.22155788594966053, recall=0.22966777715823497, fmeasure=0.19941946840077207), high=Score(precision=0.23426710022999506, recall=0.23874505953631067, fmeasure=0.2080408289699081))}


setup 4 all (first 1000): 
{'rouge1': AggregateScore(low=Score(precision=0.25684771974270865, recall=0.29501472898513165, fmeasure=0.260115735448153), mid=Score(precision=0.266524002779728, recall=0.30449822549063826, fmeasure=0.2675315643666451), high=Score(precision=0.27562376465052857, recall=0.31441606824352325, fmeasure=0.27520266134260546)), 'rouge2': AggregateScore(low=Score(precision=0.11373369868161086, recall=0.12732785674033953, fmeasure=0.11237756540959457), mid=Score(precision=0.12096549935529746, recall=0.13519828476726667, fmeasure=0.11904420088452541), high=Score(precision=0.1283220995370017, recall=0.14356228499323856, fmeasure=0.12538194590600896)), 'rougeL': AggregateScore(low=Score(precision=0.23094416477835117, recall=0.2661573865078696, fmeasure=0.23362351106306084), mid=Score(precision=0.24051713371597916, recall=0.2758231309759618, fmeasure=0.24146710029863205), high=Score(precision=0.24993661468928882, recall=0.285119916650107, fmeasure=0.2496520056443254)), 'rougeLsum': AggregateScore(low=Score(precision=0.2312998326251717, recall=0.26599066159707085, fmeasure=0.23343375326341276), mid=Score(precision=0.240906570474671, recall=0.2759829799646702, fmeasure=0.24172720951179383), high=Score(precision=0.24939854438135944, recall=0.28533928882972687, fmeasure=0.24917913058047592))}


setup 7 all (first 1000):
{'rouge1': AggregateScore(low=Score(precision=0.23848372358030187, recall=0.116203733064573, fmeasure=0.14652831026816487), mid=Score(precision=0.2496566797236288, recall=0.12125655100559302, fmeasure=0.15262611711508262), high=Score(precision=0.2613669587962916, recall=0.12603506267066766, fmeasure=0.15897077395770345)), 'rouge2': AggregateScore(low=Score(precision=0.10415555606207981, recall=0.04707150359244188, fmeasure=0.06112328805918762), mid=Score(precision=0.11257075489253199, recall=0.050713847139136244, fmeasure=0.06588266537104556), high=Score(precision=0.1210673836462916, recall=0.05405635223940679, fmeasure=0.07056728624214115)), 'rougeL': AggregateScore(low=Score(precision=0.18965115773201233, recall=0.08923315807419371, fmeasure=0.11440076310548612), mid=Score(precision=0.20074747036670249, recall=0.09308392577499458, fmeasure=0.1198450453544593), high=Score(precision=0.21132454306868032, recall=0.09755798918835348, fmeasure=0.12582084683161124)), 'rougeLsum': AggregateScore(low=Score(precision=0.19041882435283822, recall=0.08930709028260499, fmeasure=0.11467834252191085), mid=Score(precision=0.2011208183354828, recall=0.09332231890960088, fmeasure=0.12013314892016799), high=Score(precision=0.21260492406103768, recall=0.0975096644494061, fmeasure=0.1258654349260666))}


setup 1 all (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.3425980036758402, recall=0.2344876555238033, fmeasure=0.25582566305121085), mid=Score(precision=0.35230153701648154, recall=0.24458458408834932, fmeasure=0.2653038545409604), high=Score(precision=0.36262445440903984, recall=0.25456721835681323, fmeasure=0.2751058010312279)), 'rouge2': AggregateScore(low=Score(precision=0.18543514553515225, recall=0.13152461882167307, fmeasure=0.14214303155154892), mid=Score(precision=0.19459119083707183, recall=0.13853099297349716, fmeasure=0.1496318080257765), high=Score(precision=0.2031652588328815, recall=0.14657893988535375, fmeasure=0.15712160811403103)), 'rougeL': AggregateScore(low=Score(precision=0.2850551106851154, recall=0.1942123796170479, fmeasure=0.21294448626536422), mid=Score(precision=0.2949259534235572, recall=0.2029518517580875, fmeasure=0.22062553192259976), high=Score(precision=0.30475697957092573, recall=0.21181000141401612, fmeasure=0.22920887281512767)), 'rougeLsum': AggregateScore(low=Score(precision=0.28546682336764695, recall=0.1939819532668823, fmeasure=0.21172709007741952), mid=Score(precision=0.2949476502926327, recall=0.20289877835057213, fmeasure=0.2206760484859417), high=Score(precision=0.3040340037484996, recall=0.21180782666770814, fmeasure=0.2295344096468712))}


changed generate text max len to 1024

setup 8 all (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.6477042398777727, recall=0.13046887910089194, fmeasure=0.21417678530531586), mid=Score(precision=0.6643746753246795, recall=0.13455889689842332, fmeasure=0.2207103073917106), high=Score(precision=0.6809470301757109, recall=0.13871816560434166, fmeasure=0.2269253080102008)), 'rouge2': AggregateScore(low=Score(precision=0.5454512912087879, recall=0.10388033485845145, fmeasure=0.17235379771652878), mid=Score(precision=0.5694820054945029, recall=0.10864960193501111, fmeasure=0.18001861142570974), high=Score(precision=0.589266291208789, recall=0.11292375628216621, fmeasure=0.18679838862315462)), 'rougeL': AggregateScore(low=Score(precision=0.6170471600458378, recall=0.12480881858576069, fmeasure=0.20509276298451762), mid=Score(precision=0.6353011459129134, recall=0.12925891707611326, fmeasure=0.2117679033342737), high=Score(precision=0.6543994174942733, recall=0.13368981462116564, fmeasure=0.21862741293637955)), 'rougeLsum': AggregateScore(low=Score(precision=0.6171193716577558, recall=0.12491283209705982, fmeasure=0.20516830046282597), mid=Score(precision=0.6359871275783062, recall=0.1295110468079277, fmeasure=0.21222081411791388), high=Score(precision=0.6536715106951901, recall=0.13389007956275747, fmeasure=0.2186259127456063))}


setup 7 all (first 1000):
{'rouge1': AggregateScore(low=Score(precision=0.25946790312166396, recall=0.148583745663492, fmeasure=0.17599028488054375), mid=Score(precision=0.269010249316047, recall=0.15363059663345605, fmeasure=0.18187769663397557), high=Score(precision=0.2806462710310483, recall=0.15854611984941094, fmeasure=0.18782307786037755)), 'rouge2': AggregateScore(low=Score(precision=0.11584966931662759, recall=0.06198210589996406, fmeasure=0.07610626187738395), mid=Score(precision=0.12326021841939902, recall=0.06514442198439252, fmeasure=0.08023669683065358), high=Score(precision=0.13043076718523589, recall=0.06863876084717674, fmeasure=0.08438915541025743)), 'rougeL': AggregateScore(low=Score(precision=0.20121473388477742, recall=0.1085555495220172, fmeasure=0.13252536697681896), mid=Score(precision=0.21065443391342112, recall=0.11263727500890047, fmeasure=0.13804781401036365), high=Score(precision=0.22072637024769748, recall=0.11656034906373333, fmeasure=0.14331766354168726)), 'rougeLsum': AggregateScore(low=Score(precision=0.20115956393856302, recall=0.10851332215492504, fmeasure=0.13264795184748493), mid=Score(precision=0.21074579699917625, recall=0.11266186750988068, fmeasure=0.13796024904625886), high=Score(precision=0.22002729478377042, recall=0.1164524595205137, fmeasure=0.14307115695263542))}


setup 6 all (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.24085131561454856, recall=0.24417942278211283, fmeasure=0.2133641857523802), mid=Score(precision=0.2528688833457693, recall=0.25289004000438675, fmeasure=0.2219860541519942), high=Score(precision=0.2649580172455238, recall=0.2619254615190368, fmeasure=0.2302336445970384)), 'rouge2': AggregateScore(low=Score(precision=0.11448406527833123, recall=0.10695362271702485, fmeasure=0.09789364234423875), mid=Score(precision=0.12508427714832776, recall=0.11507849551848852, fmeasure=0.10528749166495789), high=Score(precision=0.13583601388667854, recall=0.12326362779028631, fmeasure=0.11320757190386396)), 'rougeL': AggregateScore(low=Score(precision=0.22062373461236187, recall=0.21935804467861267, fmeasure=0.1935835590952713), mid=Score(precision=0.23206521091238552, recall=0.22875470552069838, fmeasure=0.2022656612963093), high=Score(precision=0.24356787572182234, recall=0.2385974936783993, fmeasure=0.2106591801295899)), 'rougeLsum': AggregateScore(low=Score(precision=0.22007992658401865, recall=0.2191996951394736, fmeasure=0.19377029285046424), mid=Score(precision=0.23188423997679164, recall=0.22896763285273009, fmeasure=0.20211578142335296), high=Score(precision=0.24403833140237652, recall=0.238830412588623, fmeasure=0.21122064485390446))}


setup 4 all (first 1000): 
{'rouge1': AggregateScore(low=Score(precision=0.25373819042997525, recall=0.30635836619467094, fmeasure=0.26304592358092166), mid=Score(precision=0.2627243664724488, recall=0.3160756022522333, fmeasure=0.2711203008977511), high=Score(precision=0.27217864838021477, recall=0.3257835467890547, fmeasure=0.2794549256325368)), 'rouge2': AggregateScore(low=Score(precision=0.10486084152454479, recall=0.12398708294533804, fmeasure=0.10721469367537963), mid=Score(precision=0.11161532565207385, recall=0.13165474607971345, fmeasure=0.1135346795576222), high=Score(precision=0.11870037354145443, recall=0.14018658886994315, fmeasure=0.12075936266885984)), 'rougeL': AggregateScore(low=Score(precision=0.2266602000979277, recall=0.27551209297812984, fmeasure=0.2356384144794056), mid=Score(precision=0.2353225870166536, recall=0.28480353870142516, fmeasure=0.2434518298948588), high=Score(precision=0.24437620576171248, recall=0.2947469539188875, fmeasure=0.25178780425290087)), 'rougeLsum': AggregateScore(low=Score(precision=0.22657125823945548, recall=0.2756812716479142, fmeasure=0.23594026641537477), mid=Score(precision=0.23529559238468056, recall=0.284708010329467, fmeasure=0.2436382469586134), high=Score(precision=0.24409649917584955, recall=0.2945824701040448, fmeasure=0.2512008437659664))}


setup 1 all (first 1000):
{'rouge1': AggregateScore(low=Score(precision=0.36741981932054457, recall=0.2411219344539743, fmeasure=0.26426464596958127), mid=Score(precision=0.377265172600492, recall=0.25099803017008027, fmeasure=0.2729717070771628), high=Score(precision=0.38686443599408404, recall=0.25963379549328014, fmeasure=0.28072302740794974)), 'rouge2': AggregateScore(low=Score(precision=0.19515614158488404, recall=0.13238633757282703, fmeasure=0.14316685480249208), mid=Score(precision=0.2030294862144325, recall=0.1393367645109842, fmeasure=0.14953839309625772), high=Score(precision=0.2114526450052862, recall=0.14705671752277477, fmeasure=0.15668164191146322)), 'rougeL': AggregateScore(low=Score(precision=0.3089240869581699, recall=0.2005793497450505, fmeasure=0.21922397448427783), mid=Score(precision=0.3184304739125602, recall=0.2093513113661263, fmeasure=0.22734699878436287), high=Score(precision=0.3285496207185787, recall=0.21800339418733122, fmeasure=0.23536441331682095)), 'rougeLsum': AggregateScore(low=Score(precision=0.3082582220488033, recall=0.20030451847860145, fmeasure=0.21945295212734417), mid=Score(precision=0.3179568864270045, recall=0.20891997166237486, fmeasure=0.2271553317842516), high=Score(precision=0.3282999097905381, recall=0.21738726973755018, fmeasure=0.23437172885751284))}
