In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Thu Apr 21 06:22:44 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from psutil import virtual_memory

# Total memory reported by psutil, expressed in decimal gigabytes (1e9 bytes).
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# Runtimes with at least 20 GB are treated as "high-RAM"; anything below gets
# instructions for switching the runtime shape.
if ram_gb >= 20:
  print('You are using a high-RAM runtime!')
else:
  print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')

Your runtime has 13.6 gigabytes of available RAM

To enable a high-RAM runtime, select the Runtime > "Change runtime type"
menu, and then select High-RAM in the Runtime shape dropdown. Then, 
re-execute this cell.


In [None]:
# This run uses PyTorch Lightning to fine-tune the model; transformers provides BART and its tokenizer
!pip install -q pytorch-lightning
!pip install -q transformers

In [None]:
# imports
import transformers
from torch.utils.data import DataLoader, TensorDataset, random_split, RandomSampler, Dataset
import pandas as pd
import numpy as np

import torch.nn.functional as F
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint

import math
import random
import re

# install datasets
!pip install datasets

from datasets import list_datasets, list_metrics, load_dataset, load_metric

from pprint import pprint

from transformers import BartForConditionalGeneration, BartTokenizer
from tqdm import tqdm
import argparse



# Firing up Google Drive
Mount your Google Drive so the notebook can load the training data and save the model weights.

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive (no forced remount if it is already mounted).
drive.mount('/content/gdrive', force_remount=False)
# Project folder on Drive; the training/testing CSVs are read from here and results are written here.
root_dir = "/content/gdrive/My Drive/masters_thesis/"
# Alias of root_dir, used below as the prefix for checkpoint and saved-model paths.
base_dir = root_dir

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Pytorch Lightning for running the training
The code below uses PyTorch Lightning to train the model, which is explained very well (and simply) at https://pytorch-lightning.readthedocs.io/en/latest/. Very briefly, most of the usual methods one would set up for a PyTorch class are set up in a pl.LightningModule class. Lightning then automates much of the training loop, for example updating the optimizer, clearing gradients, etc.

In [None]:
class LitModel(pl.LightningModule):
  ''' LightningModule that wraps a seq2seq model (BART in this notebook) and its tokenizer for fine-tuning.

      Args: learning_rate - learning rate handed to the Adam optimizer
            tokenizer - tokenizer whose pad/eos token ids are used for the loss and for generation
            model - the model to fine-tune (e.g. a BartForConditionalGeneration instance)
            freeze_encoder - if truthy, freeze_params() is applied to model.get_encoder()
            freeze_embeds - if truthy, freeze_params() is applied to the shared, positional and
                            token embeddings of the encoder and decoder
  '''

  # Instantiate the model
  def __init__(self, learning_rate, tokenizer, model, freeze_encoder, freeze_embeds):
    super().__init__()
    self.tokenizer = tokenizer
    self.model = model
    self.learning_rate = learning_rate
    self.freeze_encoder = freeze_encoder
    # NOTE: this boolean attribute shadows any method of the same name on the instance,
    # which is why the freezing routine below is called `_freeze_embeds`.
    self.freeze_embeds = freeze_embeds

    if self.freeze_encoder:
      freeze_params(self.model.get_encoder())

    if self.freeze_embeds:
      self._freeze_embeds()

  def _freeze_embeds(self):
    ''' freeze the shared, positional and token embedding parameters of the model; adapted from finetune.py '''
    freeze_params(self.model.model.shared)
    for d in [self.model.model.encoder, self.model.model.decoder]:
      freeze_params(d.embed_positions)
      freeze_params(d.embed_tokens)

  # Do a forward pass through the model
  def forward(self, input_ids, **kwargs):
    return self.model(input_ids, **kwargs)

  def configure_optimizers(self):
    ''' Adam over all module parameters with the configured learning rate '''
    optimizer = torch.optim.Adam(self.parameters(), lr = self.learning_rate)
    return optimizer

  def _compute_loss(self, batch):
    ''' Shared by training_step and validation_step.
        Args: batch - indexable as (source ids, source attention mask, target ids)
        Returns: cross-entropy between the logits and the un-shifted target ids, ignoring pad positions
    '''
    src_ids, src_mask, tgt_ids = batch[0], batch[1], batch[2]
    # Use the tokenizer owned by this module instead of a notebook-level global
    pad_token_id = self.tokenizer.pad_token_id
    # Shift the decoder tokens right (but NOT the tgt_ids)
    decoder_input_ids = shift_tokens_right(tgt_ids, pad_token_id)

    # Run the model and get the logits
    outputs = self(src_ids, attention_mask=src_mask, decoder_input_ids=decoder_input_ids, use_cache=False)
    lm_logits = outputs[0]

    ce_loss_fct = torch.nn.CrossEntropyLoss(ignore_index=pad_token_id)
    # Calculate the loss on the un-shifted tokens
    return ce_loss_fct(lm_logits.view(-1, lm_logits.shape[-1]), tgt_ids.view(-1))

  def training_step(self, batch, batch_idx):
    ''' Returns {'loss': training loss for this batch} '''
    return {'loss': self._compute_loss(batch)}

  def validation_step(self, batch, batch_idx):
    ''' Returns {'loss': validation loss for this batch} '''
    return {'loss': self._compute_loss(batch)}

  # Method that generates text using the wrapped model's generate() method
  def generate_text(self, text, eval_beams, early_stopping = True, max_len = 512):
    ''' Function to generate text
        Args: text - mapping with "input_ids" and "attention_mask" tensors
              eval_beams - number of beams for beam search
              early_stopping, max_len - forwarded to generate() as early_stopping / max_length
        Returns: list of decoded strings, one per generated sequence
    '''
    # Send the inputs to wherever the model's weights actually live, rather than guessing from
    # CUDA availability (the two differ when a GPU exists but the model is still on the CPU).
    device = next(self.model.parameters()).device
    # Generate with dropout disabled, then restore whatever mode the model was in.
    was_training = self.model.training
    self.model.eval()
    try:
      generated_ids = self.model.generate(
          text["input_ids"].to(device),
          attention_mask=text["attention_mask"].to(device),
          num_beams= eval_beams,
          length_penalty=2.0,
          max_length = max_len,
          min_length= 120,
          no_repeat_ngram_size=3,
          early_stopping = early_stopping,
          decoder_start_token_id= self.tokenizer.eos_token_id
      )
    finally:
      self.model.train(was_training)
    return [self.tokenizer.decode(w, skip_special_tokens=True, clean_up_tokenization_spaces=False) for w in generated_ids]

def freeze_params(model):
  ''' Function that takes a model as input (or part of a model) and freezes its parameters
      (sets requires_grad to False on each of them) for faster training; adapted from finetune.py
      Args: model - any object exposing .parameters(), e.g. a torch.nn.Module
  '''
  for param in model.parameters():
    # The attribute autograd reads is `requires_grad`; a misspelt name would just
    # create an unused attribute and leave the parameter trainable.
    param.requires_grad = False


In [None]:
# Create a dataloading module as per the PyTorch Lightning Docs
class SummaryDataModule(pl.LightningDataModule):
  ''' Data module that reads a CSV with 'source' and 'target' columns, splits it 80/20 into
      training and validation sets and serves them as tokenized TensorDatasets.

      Args: tokenizer - tokenizer passed on to encode_sentences
            data_file - path of the CSV file to read
            batch_size - batch size of both dataloaders
            num_examples - only the first num_examples rows (after dropping NaNs) are used
  '''
  def __init__(self, tokenizer, data_file, batch_size, num_examples = 7000):
    super().__init__()
    self.tokenizer = tokenizer
    self.data_file = data_file
    self.batch_size = batch_size
    self.num_examples = num_examples
    # Filled in by prepare_data() (DataFrames) and then by setup() (dicts of tensors)
    self.data = None
    self.train = None
    self.validate = None

  # Loads the data and splits it into training and validation sets with an 80/20 split
  def prepare_data(self):
    data = pd.read_csv(self.data_file).dropna()[:self.num_examples]
    # astype returns a new frame, so the result has to be kept for the cast to take effect
    self.data = data.astype({'source': str, 'target': str})
    # train and validation only; fixed random_state keeps the split reproducible
    self.train = self.data.sample(frac=0.8,random_state=200)
    self.validate = self.data.drop(self.train.index)

  # encode the sentences using the tokenizer
  def setup(self, stage = None):
    # Load the splits here if prepare_data() has not populated them on this object
    if self.train is None or self.validate is None:
      self.prepare_data()
    # Only encode splits that are still raw DataFrames, so calling setup() again is harmless
    if isinstance(self.train, pd.DataFrame):
      self.train = encode_sentences(self.tokenizer, self.train['source'], self.train['target'])
    if isinstance(self.validate, pd.DataFrame):
      self.validate = encode_sentences(self.tokenizer, self.validate['source'], self.validate['target'])

  # Load the training and validation sets in Pytorch Dataset objects
  def train_dataloader(self):
    dataset = TensorDataset(self.train['input_ids'], self.train['attention_mask'], self.train['labels'])
    # RandomSampler shuffles the training examples each epoch
    train_data = DataLoader(dataset, sampler = RandomSampler(dataset), batch_size = self.batch_size)
    return train_data

  def val_dataloader(self):
    dataset = TensorDataset(self.validate['input_ids'], self.validate['attention_mask'], self.validate['labels'])
    val_data = DataLoader(dataset, batch_size = self.batch_size)
    return val_data



In [None]:
# Create the hparams dictionary to pass in the model
# I realise that this isn't really how this is meant to be used, but having this here reminds me that I can edit it when I need
# hparams = argparse.Namespace()

# hparams.freeze_encoder = True
# hparams.freeze_embeds = True
# hparams.eval_beams = 4

In [None]:
def shift_tokens_right(input_ids, pad_token_id):
  """ Build decoder inputs from target ids: every token moves one position to the right and
      the last non-pad token of each row (usually <eos>) is wrapped around to position 0.
      The input tensor is left unmodified. Logic follows modeling_bart.py.
  """
  # Column index of the last non-pad token in every row, shaped (rows, 1) for gather()
  non_pad_per_row = input_ids.ne(pad_token_id).sum(dim=1)
  last_token_col = (non_pad_per_row - 1).unsqueeze(-1)
  wrapped_tokens = input_ids.gather(1, last_token_col).squeeze()

  shifted = input_ids.clone()
  shifted[:, 1:] = input_ids[:, :-1]
  shifted[:, 0] = wrapped_tokens
  return shifted
# TODO: revisit the default max_length (512) used below
def encode_sentences(tokenizer, source_sentences, target_sentences, max_length=512, pad_to_max_length=True, return_tensors="pt"):
  ''' Function that tokenizes the source and target sentences
      Args: tokenizer - the BART tokenizer (any callable with the same call signature works)
            source_sentences, target_sentences - iterables of sentences; every item is converted with str()
            max_length - sequences are truncated to this many tokens
            pad_to_max_length - pad every sequence to max_length if True, otherwise to the longest
                                sequence of its own group (sources and targets are padded separately)
            return_tensors - tensor format requested from the tokenizer
      Returns: Dictionary with keys: input_ids, attention_mask (both from the sources) and
               labels (the un-shifted input ids of the targets)
  '''
  # `None` is not a valid "no padding" value for the tokenizer, and unpadded rows of different
  # lengths could not be stacked anyway, so fall back to padding to the longest sequence.
  padding = "max_length" if pad_to_max_length else "longest"

  def _encode(sentences):
    # Cast every item so that non-string cells (numbers, etc.) are handled like the targets were
    texts = [str(sentence) for sentence in sentences]
    return tokenizer(
          texts,
          max_length=max_length,
          padding=padding,
          truncation=True,
          return_tensors=return_tensors,
          add_prefix_space = True
      )

  encoded_sources = _encode(source_sentences)
  encoded_targets = _encode(target_sentences)

  batch = {
      "input_ids": encoded_sources['input_ids'],
      "attention_mask": encoded_sources['attention_mask'],
      # The targets are NOT shifted here; shifting happens in the training/validation step
      "labels": encoded_targets['input_ids'],
  }

  return batch


def noise_sentence(sentence_, percent_words, replacement_token = "<mask>"):
  '''
  Function that noises a sentence by replacing randomly chosen words with replacement_token
  Args: sentence_ - the sentence to noise (split into words on single spaces)
        percent_words - the fraction of words to replace; the number of words is rounded up using
                        math.ceil and capped at the number of words that are eligible for replacement
        replacement_token - the token written in place of the chosen words
  Returns a noised sentence in which runs of consecutive replacement tokens are merged into one
  '''
  words = sentence_.split(' ')

  # Positions eligible for noising; the last word is excluded (unless it is the only word) because
  # in the case of lyrics that word is often a rhyming word and plays an important role in song construction
  candidate_positions = range(max(1, len(words) - 1))

  # Never ask for more positions than exist (random.sample raises otherwise), nor for a negative amount
  num_words = math.ceil(len(words) * percent_words)
  num_words = max(0, min(num_words, len(candidate_positions)))

  # random.sample needs a sequence: sampling from a set is rejected by recent Python versions
  for pos in random.sample(candidate_positions, num_words):
    # Swap out words, but not full stops
    if words[pos] != '.':
      words[pos] = replacement_token

  # Remove redundant spaces
  sentence = re.sub(r' {2,}', ' ', ' '.join(words))

  # Merge any run of space-separated replacement tokens into a single token. The look-arounds make
  # sure only whole, whitespace-delimited tokens are merged.
  token_pattern = re.escape(replacement_token)
  run_pattern = r'(?<!\S){0}(?: {0})+(?!\S)'.format(token_pattern)
  sentence = re.sub(run_pattern, lambda match: replacement_token, sentence)
  return sentence
  

# Load BART
Here we load the model. I used "bart-base" because I had memory issues using "bart-large". "bart-base" appears to load without the use_cache argument, which by necessity must be turned to "False" for "bart-large".

In [None]:
# Load the model
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, BartConfig

# Tokenizer for facebook/bart-base; add_prefix_space=True matches the flag that
# encode_sentences passes on every tokenizer call.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base', add_prefix_space=True)

# Pretrained bart-base weights that LitModel wraps and fine-tunes below.
bart_model = BartForConditionalGeneration.from_pretrained(
    "facebook/bart-base")



In [None]:
# Build the data module that feeds the training CSV to the model (batches of 3 examples)
data_path = root_dir + "setup6_training.csv"
summary_data = SummaryDataModule(tokenizer, data_path,
                                 batch_size = 3)

# To resume from a pre-saved checkpoint instead of the pretrained weights, use something like the
# following (the keyword arguments must match LitModel.__init__):
# model = LitModel.load_from_checkpoint(base_dir + "checkpoint_files_2/8_ep_140k_simple_0210.ckpt",
#                                       learning_rate = 2e-5, tokenizer = tokenizer, model = bart_model,
#                                       freeze_encoder = True, freeze_embeds = False)

# Wrap the pretrained BART model for fine-tuning; the encoder is requested frozen, the embeddings are not
custom_model = LitModel(learning_rate = 2e-5, tokenizer = tokenizer, model = bart_model, freeze_encoder = True, freeze_embeds = False)

# Training the model with Pytorch Lightning
The below code utilises Pytorch Lightning's fantastic Trainer module that helps to control the training process. After creating a ModelCheckpoint object, the other options are fed into the Trainer module. I found that my colab crashed when I didn't explicitly set progress_bar_refresh_rate to something and I found that setting it to 500 seemed to work just fine.

In [None]:

# Checkpoints are written to the checkpoint_files/ folder on Google Drive
checkpoint = ModelCheckpoint(dirpath=base_dir + 'checkpoint_files/')
# Callbacks are registered through `callbacks=[...]`: the `checkpoint_callback` and
# `progress_bar_refresh_rate` Trainer arguments are deprecated since Lightning v1.5, and passing the
# ModelCheckpoint through `checkpoint_callback` does not reliably register it.
# The progress bar is refreshed every 500 batches to keep Colab responsive.
trainer = pl.Trainer(gpus = 1,
                     max_epochs = 3,
                     min_epochs = 3,
                     auto_lr_find = False,
                     callbacks = [checkpoint,
                                  pl.callbacks.TQDMProgressBar(refresh_rate = 500)])

  f"Setting `Trainer(checkpoint_callback={checkpoint_callback})` is deprecated in v1.5 and will "
  f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
# Sanity check that the data module exists (prints the default object repr)
print(summary_data)

<__main__.SummaryDataModule object at 0x7efd5e7aa590>


In [None]:
# Fine-tune custom_model on the training/validation batches served by summary_data,
# using the Trainer configured above
trainer.fit(custom_model, summary_data)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                         | Params
-------------------------------------------------------
0 | model | BartForConditionalGeneration | 139 M 
-------------------------------------------------------
139 M     Trainable params
0         Non-trainable params
139 M     Total params
557.682   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [None]:
# If you want to manually save a Lightning checkpoint, the commented line below works, although the
# ModelCheckpoint callback should already save checkpoints as training moves through the epochs...
#trainer.save_checkpoint(base_dir + "checkpoint_files_complete/setup3_training_model.ckpt")
# Serialize the entire LitModel object (wrapper, BART weights and tokenizer) to Drive.
# NOTE(review): torch.save on a whole module pickles the object, so loading it later presumably
# requires the LitModel class definition to be available - confirm before relying on this file.
torch.save(custom_model, base_dir + "models/bart_all_setup6_training_model_april_21_beam_10.pt")

In [None]:
#import re
import pickle
from datetime import datetime
# import copy
import csv

# Default device for generation: the GPU when CUDA is available, otherwise the CPU.
# Uncomment one of the two lines below to force a specific device.
DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
#DEFAULT_DEVICE = "cpu"
#DEFAULT_DEVICE = "cuda"
def create_csv(all_sentences, targets, file_to_write):
  """Write paired texts to a two-column CSV file with the header row ``source,target``.

  Args:
    all_sentences: sequence of values for the "source" column.
    targets: sequence of values for the "target" column; one row is written per target.
    file_to_write: path of the CSV file to create (an existing file is overwritten).

  Raises:
    IndexError: if there are fewer entries in all_sentences than in targets.
  """
  fieldnames = ["source", "target"]
  if len(all_sentences) < len(targets):
    raise IndexError("all_sentences has fewer entries than targets")
  # newline='' lets the csv module control line endings, which matters for fields that contain
  # embedded newlines; an explicit encoding keeps the output independent of the platform default.
  with open(file_to_write, 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.DictWriter(csvfile, delimiter=',', fieldnames=fieldnames)
    csvwriter.writeheader()
    # zip stops at the last target; surplus sources are ignored
    for source, target in zip(all_sentences, targets):
      csvwriter.writerow({"source": source, "target": target})


def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of length ``n`` (the last one may be shorter)."""
    total = len(lst)
    starts = range(0, total, n)
    yield from (lst[start : start + n] for start in starts)

def generate_summaries(lns, metric, batch_size=1, device=DEFAULT_DEVICE):
    """Generate a summary for every source text with the fine-tuned model and score the results.

    Args:
        lns: mapping/DataFrame with a 'source' and a 'target' column of strings.
        metric: metric object exposing add_batch(predictions=..., references=...) and compute().
        batch_size: number of source texts encoded and generated per step.
        device: device the notebook-level `custom_model` is moved to before generating.

    Returns:
        Whatever metric.compute() returns for the tokenized predictions and references.

    Side effects: writes the reference/prediction pairs to a CSV under root_dir and prints both lists.
    """
    # NOTE(review): this tokenizer is loaded without add_prefix_space=True and pads to 1024 tokens,
    # whereas the training data was encoded with add_prefix_space=True and max_length=512 - confirm
    # that the mismatch is intended.
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
    model = custom_model.to(device)

    # Plain lists keep batching independent of the container type / index of the columns
    article_batches = list(chunks(list(lns['source']), batch_size))
    target_batches = list(chunks(list(lns['target']), batch_size))
    ls_prediction = []
    ls_groundtruth = []

    for article_batch, target_batch in tqdm(zip(article_batches, target_batches),
                                            total=len(article_batches)):
        dct = tokenizer.batch_encode_plus(article_batch,
                                          max_length=1024,
                                          truncation=True,
                                          padding='max_length',
                                          return_tensors="pt")
        dec = model.generate_text(dct,eval_beams=10)
        # Put each generated sentence on its own line
        dec = [d.replace('. ', '.\n') for d in dec]

        ls_prediction.extend(dec)
        ls_groundtruth.extend(target_batch)

    # Predictions and references go through the same CoreNLP tokenization before scoring
    ls_prediction_tokenized = coreNLP_tokenizer(ls_prediction)
    target_batch_tokenized = coreNLP_tokenizer(ls_groundtruth)

    metric.add_batch(predictions=ls_prediction_tokenized, references=target_batch_tokenized)
    score = metric.compute()

    result_file_name = root_dir + "bart_all_setup6_testing_custom_model_generated_summaries_april_21_beam_10.csv"
    # NOTE: create_csv labels its first argument "source" and its second "target", so in this file
    # the "source" column holds the reference summaries and the "target" column the generated ones.
    create_csv(ls_groundtruth, ls_prediction, result_file_name)

    print("ls prediction: ")
    print(ls_prediction)
    print("ls groundtruth: ")
    print(ls_groundtruth)
    return score

In [None]:
# rouge_score is installed here so that the 'rouge' metric can be loaded below
!pip install rouge_score
from datasets import list_metrics
# List the metric names known to the datasets library
metrics_list = list_metrics()
# NOTE: this expression is not the last one in the cell, so its value is not displayed
len(metrics_list)
print (metrics_list)
# ROUGE metric object handed to generate_summaries for scoring
rouge_metric = load_metric('rouge')

['accuracy', 'bertscore', 'bleu', 'bleurt', 'cer', 'chrf', 'code_eval', 'comet', 'competition_math', 'coval', 'cuad', 'exact_match', 'f1', 'frugalscore', 'glue', 'google_bleu', 'indic_glue', 'mae', 'mahalanobis', 'matthews_correlation', 'mauve', 'mean_iou', 'meteor', 'mse', 'pearsonr', 'perplexity', 'precision', 'recall', 'rouge', 'sacrebleu', 'sari', 'seqeval', 'spearmanr', 'squad', 'squad_v2', 'super_glue', 'ter', 'wer', 'wiki_split', 'xnli', 'xtreme_s']


In [None]:
# Install stanza; note that the prefix "!" is not needed if you are running in a terminal
!pip install stanza

# Import stanza
import stanza



In [None]:
# Download the Stanford CoreNLP package with Stanza's installation command
# This'll take several minutes, depending on the network speed
# NOTE: './corenlp' is relative to the notebook's current working directory
corenlp_dir = './corenlp'
stanza.install_corenlp(dir=corenlp_dir)

# Set the CORENLP_HOME environment variable to point to the installation location
# (os.environ changes are visible to processes started from this kernel afterwards)
import os
os.environ["CORENLP_HOME"] = corenlp_dir



In [None]:
# Examine the CoreNLP installation folder to make sure the installation is successful
!export CORENLP_HOME='./corenlp'
!ls $CORENLP_HOME

build.xml				  jollyday.jar
corenlp.sh				  LIBRARY-LICENSES
CoreNLP-to-HTML.xsl			  LICENSE.txt
ejml-core-0.39.jar			  Makefile
ejml-core-0.39-sources.jar		  patterns
ejml-ddense-0.39.jar			  pom-java-11.xml
ejml-ddense-0.39-sources.jar		  pom-java-17.xml
ejml-simple-0.39.jar			  pom.xml
ejml-simple-0.39-sources.jar		  protobuf-java-3.19.2.jar
input.txt				  README.txt
input.txt.out				  RESOURCE-LICENSES
input.txt.xml				  SemgrexDemo.java
istack-commons-runtime-3.0.7.jar	  ShiftReduceDemo.java
istack-commons-runtime-3.0.7-sources.jar  slf4j-api.jar
javax.activation-api-1.2.0.jar		  slf4j-simple.jar
javax.activation-api-1.2.0-sources.jar	  stanford-corenlp-4.4.0.jar
javax.json-api-1.0-sources.jar		  stanford-corenlp-4.4.0-javadoc.jar
javax.json.jar				  stanford-corenlp-4.4.0-models.jar
jaxb-api-2.4.0-b180830.0359.jar		  stanford-corenlp-4.4.0-sources.jar
jaxb-api-2.4.0-b180830.0359-sources.jar   StanfordCoreNlpDemo.java
jaxb-impl-2.4.0-b180830.0438.jar	  StanfordDependenciesManual.p

In [None]:
# Import client module
from stanza.server import CoreNLPClient

In [None]:
texts = ["Albert Einstein was a German-born theoretical physicist.", "He was going to the school!"]
def coreNLP_tokenizer(inputDocsList):
  ''' Tokenize documents with Stanford CoreNLP.
      Args: inputDocsList - iterable of document strings
      Returns: list with one string per document: the lower-cased tokens of ALL of its sentences
               joined by single spaces (an empty string for a document without sentences)
  '''
  tokenizedDocsList = []
  # Only the token text is read from the annotation, so only tokenization and sentence
  # splitting are requested from the server.
  with CoreNLPClient(annotators="tokenize ssplit".split(), memory='4G', endpoint='http://localhost:9001', be_quiet=True) as client:
    for d in inputDocsList:
      ann = client.annotate(d)
      # Walk over every sentence of the document; reading only ann.sentence[0] would drop all
      # text after the first sentence and fail on documents that have no sentence at all.
      tokens = [token.word.lower() for sentence in ann.sentence for token in sentence.token]
      tokenizedDocsList.append(' '.join(tokens))
  return tokenizedDocsList

print(coreNLP_tokenizer(texts))

2022-04-21 06:56:14 INFO: Writing properties to tmp file: corenlp_server-5f08aaef57844aa2.props
2022-04-21 06:56:14 INFO: Starting server with command: java -Xmx4G -cp ./corenlp/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9001 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-5f08aaef57844aa2.props -annotators tokenize,ssplit,pos,lemma,ner,depparse -preload -outputFormat serialized


['albert einstein was a german - born theoretical physicist .', 'he was going to the school !']


In [None]:
import pandas as pd

# Load the test set. dropna() returns a new frame, so it has to be part of the chain: otherwise
# missing values survive and are turned into the literal string 'nan' by the str cast.
df = (
    pd.read_csv(root_dir + "setup6_testing.csv")
    .dropna()
    .astype({'source': str, 'target': str})
    .iloc[:1000,:]  # evaluate on the first 1000 complete rows only
)
score = generate_summaries(df, rouge_metric) #tokenizer, bart_model

100%|██████████| 1000/1000 [46:41<00:00,  2.80s/it]
2022-04-21 07:43:46 INFO: Writing properties to tmp file: corenlp_server-84ac213ae63e4b49.props
2022-04-21 07:43:46 INFO: Starting server with command: java -Xmx4G -cp ./corenlp/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9001 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-84ac213ae63e4b49.props -annotators tokenize,ssplit,pos,lemma,ner,depparse -preload -outputFormat serialized
2022-04-21 07:51:14 INFO: Writing properties to tmp file: corenlp_server-167f6849e9b64d2c.props
2022-04-21 07:51:14 INFO: Starting server with command: java -Xmx4G -cp ./corenlp/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9001 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-167f6849e9b64d2c.props -annotators tokenize,ssplit,pos,lemma,ner,depparse -preload -outputFormat serialized


ls prediction: 
[' 52y F with PMHx dwarfism, asthma, COPD, OSA\non CPAP presents with c/o chronic dry cough, wheezing and dyspnea with\nintermitted dysphagia.\n\nPast \n_NNEWNOTE\nPlease shower daily including washing incisions gently with mild\nsoap, no baths or swimming, and look at your incisions\nEach morning you should weigh yourself and then in the evening\ntake your temperature, these should be written down on the chart\n\nNo driving for approximately one month until follow up\nwith\nsurgeon\nNo lifting more than 10 pounds for 10 weeks\nPlease call with any questions or concerns [**Telephone/Fax (1) 181**]\n\n', ' Mr [**Known patient lastname **] is an 88 year old man with a history of HTN, CAD,\ngout, dementia, h/o stent [**30**] who was recently\ntransferred from [**Hospital1 22**] for further care.\n\nPast \n_NNEWNOTE\nPlease shower daily including washing incisions, no baths or\nswimming\nMonitor wounds for infection - redness, drainage, weight\ngain, or increased pain\nRepo

In [None]:
# Show the ROUGE scores returned by generate_summaries
print(score)
# Setup names used for the recorded results:
#setup 1
#new setup 2: setup 7
#new setup 3: setup 8
#setup 4
#setup 6

{'rouge1': AggregateScore(low=Score(precision=0.37581370931348695, recall=0.3845346073794345, fmeasure=0.36309276153015), mid=Score(precision=0.3930433603117144, recall=0.4000555436394585, fmeasure=0.3785663876013402), high=Score(precision=0.40965195865261317, recall=0.4163818686220701, fmeasure=0.3953994590510462)), 'rouge2': AggregateScore(low=Score(precision=0.2294718462563322, recall=0.22591345872136737, fmeasure=0.21924642796195365), mid=Score(precision=0.2465348235766111, recall=0.24333897423928583, fmeasure=0.2363358861926953), high=Score(precision=0.26580906284692074, recall=0.2620747576394299, fmeasure=0.2549936983366994)), 'rougeL': AggregateScore(low=Score(precision=0.3502006603301243, recall=0.35620259882177546, fmeasure=0.3371380177073741), mid=Score(precision=0.3663646191124005, recall=0.3723331956570879, fmeasure=0.3534177676575071), high=Score(precision=0.3821210155572312, recall=0.38759040436024533, fmeasure=0.3679563568002151)), 'rougeLsum': AggregateScore(low=Score(p

Pretrained setup 3 (batch 1 of size 500):
{'rouge1': AggregateScore(low=Score(precision=0.6950903192640693, recall=0.048831310290852456, fmeasure=0.08732959998835234), mid=Score(precision=0.7289609848484848, recall=0.05393380784165647, fmeasure=0.09558538981379487), high=Score(precision=0.7648061120129871, recall=0.05912893945955717, fmeasure=0.10375137688667864)), 'rouge2': AggregateScore(low=Score(precision=0.35927304532198107, recall=0.029461636141038843, fmeasure=0.052644727388255735), mid=Score(precision=0.395699009212322, recall=0.03355580706567919, fmeasure=0.059648008048935414), high=Score(precision=0.4315215480378752, recall=0.038254910479508686, fmeasure=0.06727818696738792)), 'rougeL': AggregateScore(low=Score(precision=0.6469413726551227, recall=0.04526303291085474, fmeasure=0.08071866313315211), mid=Score(precision=0.6817815836940839, recall=0.050410641014290894, fmeasure=0.08900172657293498), high=Score(precision=0.7166864448051954, recall=0.05557108657419433, fmeasure=0.09753763763673581)), 'rougeLsum': AggregateScore(low=Score(precision=0.6473862644300148, recall=0.04558611443458879, fmeasure=0.0813840883289109), mid=Score(precision=0.6809511544011545, recall=0.05028635309939497, fmeasure=0.08901723649352875), high=Score(precision=0.7132885479797983, recall=0.055371791250069385, fmeasure=0.09714082064251803))}

Pretrained setup 2 (batch 1 of size 500) ~ try again; may have used the wrong training data (i.e., setup 3's):
{'rouge1': AggregateScore(low=Score(precision=0.7880784987368151, recall=0.09838452641091526, fmeasure=0.1671336510400838), mid=Score(precision=0.8091164988090687, recall=0.10440408407582427, fmeasure=0.1758818851117194), high=Score(precision=0.8286983220859798, recall=0.11051978314102531, fmeasure=0.18450823177578923)), 'rouge2': AggregateScore(low=Score(precision=0.6373537793731479, recall=0.06977243423213733, fmeasure=0.12063455006540436), mid=Score(precision=0.6632454610576912, recall=0.07451725002011936, fmeasure=0.12718368449843556), high=Score(precision=0.6870271745606921, recall=0.07944019604844037, fmeasure=0.13414861178856236)), 'rougeL': AggregateScore(low=Score(precision=0.7663269375436649, recall=0.09521719604369266, fmeasure=0.16253204258662152), mid=Score(precision=0.7897348469341114, recall=0.1008015127748148, fmeasure=0.17010535111032027), high=Score(precision=0.8088913524227317, recall=0.10655214688094755, fmeasure=0.17792153886802753)), 'rougeLsum': AggregateScore(low=Score(precision=0.7671920364136634, recall=0.0958359462870331, fmeasure=0.16295829664719408), mid=Score(precision=0.7885820087033713, recall=0.10089343145792931, fmeasure=0.17012112366618415), high=Score(precision=0.8105306661529472, recall=0.10608718064086926, fmeasure=0.17749317504997136))}

Made new changes: 3 epochs, min length 40, and training now uses only the training and validation sets

Batch 1 of size 200 // Setup 1
{'rouge1': AggregateScore(low=Score(precision=0.5155331667515496, recall=0.09231520644110583, fmeasure=0.1551412691056812), mid=Score(precision=0.5330986484920308, recall=0.09691373512218522, fmeasure=0.16191744719534873), high=Score(precision=0.5489829975579976, recall=0.10189727718722359, fmeasure=0.16902710258477993)), 'rouge2': AggregateScore(low=Score(precision=0.2373596122627374, recall=0.04018188598347949, fmeasure=0.0679016213979583), mid=Score(precision=0.25234310134310134, recall=0.04346693055968569, fmeasure=0.07316604892190157), high=Score(precision=0.26687518419080913, recall=0.04680679260852081, fmeasure=0.07829769033276567)), 'rougeL': AggregateScore(low=Score(precision=0.47828232841504903, recall=0.08589009144538472, fmeasure=0.14415168828269792), mid=Score(precision=0.4920547650878534, recall=0.08983976959630877, fmeasure=0.15003521547099585), high=Score(precision=0.5063340749528249, recall=0.09482923514198997, fmeasure=0.15709000264456097)), 'rougeLsum': AggregateScore(low=Score(precision=0.4773365952368711, recall=0.08539903457401692, fmeasure=0.14338733601782713), mid=Score(precision=0.4921249004263711, recall=0.08997188682276334, fmeasure=0.15020562433453405), high=Score(precision=0.5067925096451752, recall=0.09426151125742911, fmeasure=0.1566315984158779))}

for setup 2 first 500: {'rouge1': AggregateScore(low=Score(precision=0.33701996687384894, recall=0.11348454125386573, fmeasure=0.16600939194132977), mid=Score(precision=0.3550242965018823, recall=0.11919127644225685, fmeasure=0.17385931562957677), high=Score(precision=0.37345242087723957, recall=0.12538065372937474, fmeasure=0.18279776642099813)), 'rouge2': AggregateScore(low=Score(precision=0.17481586054773038, recall=0.054739521479604536, fmeasure=0.08191123756365556), mid=Score(precision=0.1889181078550854, recall=0.05953488600016738, fmeasure=0.08866385609994074), high=Score(precision=0.2031308159861683, recall=0.06440732610881321, fmeasure=0.09562641799442465)), 'rougeL': AggregateScore(low=Score(precision=0.2951295263896105, recall=0.10002661196265167, fmeasure=0.1456751238557045), mid=Score(precision=0.3121159656732555, recall=0.10560202761305443, fmeasure=0.15372366212477842), high=Score(precision=0.32827961037043396, recall=0.11104436591503991, fmeasure=0.1617214610608334)), 'rougeLsum': AggregateScore(low=Score(precision=0.29491977931022384, recall=0.10003439194620055, fmeasure=0.14593284915479504), mid=Score(precision=0.3116989552425705, recall=0.10547527503678816, fmeasure=0.15355707685930647), high=Score(precision=0.33085683350283435, recall=0.11119156379236014, fmeasure=0.16223443911072286))}

back to 3 epochs

setup 3 batch first 500: {'rouge1': AggregateScore(low=Score(precision=0.7993845394041542, recall=0.07142084849066445, fmeasure=0.12190686846280871), mid=Score(precision=0.8333140807510645, recall=0.08047555344612556, fmeasure=0.13437101754712832), high=Score(precision=0.8616421004378206, recall=0.08992493666106946, fmeasure=0.14748954100841466)), 'rouge2': AggregateScore(low=Score(precision=0.5813529417293235, recall=0.057402902546535586, fmeasure=0.09751820106972423), mid=Score(precision=0.6222520107085897, recall=0.06609539293825424, fmeasure=0.10958161605086791), high=Score(precision=0.6654680626756284, recall=0.07457156754174807, fmeasure=0.12181973722083508)), 'rougeL': AggregateScore(low=Score(precision=0.7992955510661824, recall=0.07146135136958172, fmeasure=0.12102209263537897), mid=Score(precision=0.8303607950965669, recall=0.08025109936578204, fmeasure=0.1338654657859127), high=Score(precision=0.8624487885910999, recall=0.0897757786606955, fmeasure=0.1474539708420152)), 'rougeLsum': AggregateScore(low=Score(precision=0.7985170347913758, recall=0.0716930119381343, fmeasure=0.12200517538386395), mid=Score(precision=0.8297169004865925, recall=0.08005882098902037, fmeasure=0.13356446118966764), high=Score(precision=0.8606226642767242, recall=0.08931115646383732, fmeasure=0.14739669910448874))}

changed min length to be 120 and max length to be 512

setup 4 batch first 1000:
{'rouge1': AggregateScore(low=Score(precision=0.2959319105633148, recall=0.3026005306290997, fmeasure=0.2816402229760728), mid=Score(precision=0.30911825608392623, recall=0.3133017857111212, fmeasure=0.29309364347066325), high=Score(precision=0.3222664183341534, recall=0.32382317796450244, fmeasure=0.3028627438747777)), 'rouge2': AggregateScore(low=Score(precision=0.1475429635893302, recall=0.14363358111581775, fmeasure=0.1367787524412716), mid=Score(precision=0.1583779811748498, recall=0.15341952745789816, fmeasure=0.14637314955473557), high=Score(precision=0.16979642529306452, recall=0.1639964274946964, fmeasure=0.15628245183136655)), 'rougeL': AggregateScore(low=Score(precision=0.27028277898963077, recall=0.2762736128937273, fmeasure=0.25767024972677766), mid=Score(precision=0.28247456496066803, recall=0.2862311281496536, fmeasure=0.26750830185795194), high=Score(precision=0.29550358140645444, recall=0.29842595865106103, fmeasure=0.2789473466878995)), 'rougeLsum': AggregateScore(low=Score(precision=0.2706675586706507, recall=0.27552443705723423, fmeasure=0.25716073154494556), mid=Score(precision=0.2829239370952667, recall=0.28669433067849615, fmeasure=0.2678849916489016), high=Score(precision=0.2946071485562838, recall=0.29768792619828993, fmeasure=0.27816931689472285))}


changed to 6 epochs in training stage

setup 6 (first 1000): 
{'rouge1': AggregateScore(low=Score(precision=0.2701154727290618, recall=0.2792953233433652, fmeasure=0.25843358706546715), mid=Score(precision=0.2804681454306134, recall=0.28839194222597353, fmeasure=0.2667090873919774), high=Score(precision=0.2910680628524381, recall=0.2981480860069484, fmeasure=0.27543124768610344)), 'rouge2': AggregateScore(low=Score(precision=0.1243194883800042, recall=0.12190764407195818, fmeasure=0.11598649118343289), mid=Score(precision=0.1331162941505215, recall=0.13012201541630164, fmeasure=0.12330720913946881), high=Score(precision=0.1422702176696457, recall=0.1380206608405504, fmeasure=0.1308608052231107)), 'rougeL': AggregateScore(low=Score(precision=0.2438849449044746, recall=0.25153110775439824, fmeasure=0.23296471574185967), mid=Score(precision=0.25461505754704805, recall=0.26086802922815366, fmeasure=0.24125222728601808), high=Score(precision=0.26471658020176403, recall=0.2702851103455747, fmeasure=0.2499070378133557)), 'rougeLsum': AggregateScore(low=Score(precision=0.24414231209082746, recall=0.25184747387260753, fmeasure=0.23259246492049584), mid=Score(precision=0.25420562717490325, recall=0.26074845647170647, fmeasure=0.2409968733026791), high=Score(precision=0.2658812795379027, recall=0.2705434771054577, fmeasure=0.2505086845858193))}


setup 7 (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.19957466459727305, recall=0.1032716944739904, fmeasure=0.12880725228658255), mid=Score(precision=0.20874510605899185, recall=0.10696239611019814, fmeasure=0.13351527798559937), high=Score(precision=0.2177663260669316, recall=0.11091422324705633, fmeasure=0.13821464481945206)), 'rouge2': AggregateScore(low=Score(precision=0.06663129192425958, recall=0.031284804617388313, fmeasure=0.04057011387114406), mid=Score(precision=0.07279631403106995, recall=0.033744664517297795, fmeasure=0.043781497867035245), high=Score(precision=0.0790963394865811, recall=0.036314000204128956, fmeasure=0.04725350697018035)), 'rougeL': AggregateScore(low=Score(precision=0.14455673719804552, recall=0.07267833738703093, fmeasure=0.09158733803055573), mid=Score(precision=0.1531187853810193, recall=0.07576995426519223, fmeasure=0.09553796841291519), high=Score(precision=0.16085713250377542, recall=0.07899851970837177, fmeasure=0.099549234240269)), 'rougeLsum': AggregateScore(low=Score(precision=0.14467354771209406, recall=0.07252076116495239, fmeasure=0.09118573203916294), mid=Score(precision=0.15266796289243761, recall=0.07570559816185866, fmeasure=0.09548640058220967), high=Score(precision=0.16066229327924245, recall=0.07881938525951122, fmeasure=0.09943217384153433))}


setup 8 (first 1000: {'rouge1': AggregateScore(low=Score(precision=0.6476312849805026, recall=0.1310905021553869, fmeasure=0.2151238230700855), mid=Score(precision=0.6646193088838441, recall=0.1353901644761426, fmeasure=0.22178144366020036), high=Score(precision=0.6810071831316302, recall=0.13961562257799845, fmeasure=0.2279456265223907)), 'rouge2': AggregateScore(low=Score(precision=0.5474444586827614, recall=0.10480266255619512, fmeasure=0.1735333662673038), mid=Score(precision=0.5675278738418414, recall=0.10920528556170667, fmeasure=0.18061147448219328), high=Score(precision=0.5903708168221723, recall=0.11411242961175752, fmeasure=0.18837338471712803)), 'rougeL': AggregateScore(low=Score(precision=0.6172894551454963, recall=0.1259834729439327, fmeasure=0.2067062880047165), mid=Score(precision=0.6356091051213173, recall=0.13015516606968702, fmeasure=0.2130141326693556), high=Score(precision=0.6521853185618376, recall=0.13424981769448513, fmeasure=0.21924564212267897)), 'rougeLsum': AggregateScore(low=Score(precision=0.618736559562622, recall=0.1256490381833889, fmeasure=0.20612921596177694), mid=Score(precision=0.636008135730372, recall=0.13023917512525993, fmeasure=0.2131984806299711), high=Score(precision=0.653872566811416, recall=0.1348076782318806, fmeasure=0.21996028393907865))}


Changed beam size to 4 // min and max epochs both to 3 // max encoding length in encode_sentences to 512

setup 8 (first 1000 beam 4): {'rouge1': AggregateScore(low=Score(precision=0.6461131181318723, recall=0.1296977626916271, fmeasure=0.213224892319574), mid=Score(precision=0.6629382918552071, recall=0.13373353188279657, fmeasure=0.21933750101882393), high=Score(precision=0.680875920329674, recall=0.13822388819688533, fmeasure=0.2261626164654231)), 'rouge2': AggregateScore(low=Score(precision=0.5438704647435868, recall=0.10270680027289553, fmeasure=0.17027334458895144), mid=Score(precision=0.5670909340659311, recall=0.10771120774091275, fmeasure=0.1785041547270986), high=Score(precision=0.5889753777472501, recall=0.11266191891009676, fmeasure=0.18623242622887032)), 'rougeL': AggregateScore(low=Score(precision=0.6179378143629272, recall=0.12445556568413133, fmeasure=0.20454298834529955), mid=Score(precision=0.6349767830209033, recall=0.12872159427677343, fmeasure=0.21093301447005455), high=Score(precision=0.6520038792375953, recall=0.13314241754499645, fmeasure=0.21758389253478716)), 'rougeLsum': AggregateScore(low=Score(precision=0.6167856378384701, recall=0.12418663070404537, fmeasure=0.20408753710773186), mid=Score(precision=0.6348866910866942, recall=0.12880884666242687, fmeasure=0.21107272548571385), high=Score(precision=0.6528032354287896, recall=0.13300988218428372, fmeasure=0.21772963990067648))}


setup 8 (first 1000 beam 5): {'rouge1': AggregateScore(low=Score(precision=0.6491581268209277, recall=0.1302863396304659, fmeasure=0.21435784963488122), mid=Score(precision=0.6656853479853524, recall=0.13437195424765402, fmeasure=0.22038132689866702), high=Score(precision=0.6805409634727247, recall=0.13879665293214907, fmeasure=0.2269363578048806)), 'rouge2': AggregateScore(low=Score(precision=0.5484223652342648, recall=0.10415385989886482, fmeasure=0.17294133324201463), mid=Score(precision=0.570160652780053, recall=0.10856032054714157, fmeasure=0.17982904971263192), high=Score(precision=0.5932658324449158, recall=0.11360453166417285, fmeasure=0.187992466764137)), 'rougeL': AggregateScore(low=Score(precision=0.6180103984725405, recall=0.12503753956164548, fmeasure=0.20519139076436524), mid=Score(precision=0.6364696482972492, recall=0.129231946562048, fmeasure=0.21176981744779255), high=Score(precision=0.6551473732455697, recall=0.13350815513096428, fmeasure=0.21845253363573755)), 'rougeLsum': AggregateScore(low=Score(precision=0.619142041142302, recall=0.12482654600664968, fmeasure=0.2048616835750532), mid=Score(precision=0.6371564363991369, recall=0.1293692820716894, fmeasure=0.2119370575323161), high=Score(precision=0.6538731058463011, recall=0.13400769958111436, fmeasure=0.21891185801454824))}


setup 8 (first 1000 beam 6): {'rouge1': AggregateScore(low=Score(precision=0.6503172419737161, recall=0.13080189633785666, fmeasure=0.21479269633973075), mid=Score(precision=0.6660516187244174, recall=0.13456786170568363, fmeasure=0.22077918166465144), high=Score(precision=0.6822686337804407, recall=0.13861529061990502, fmeasure=0.2270689592282383)), 'rouge2': AggregateScore(low=Score(precision=0.5489643028846123, recall=0.10430426120472643, fmeasure=0.1728515708469239), mid=Score(precision=0.5707999313186789, recall=0.10881451849348998, fmeasure=0.1802891582182756), high=Score(precision=0.5918718967490812, recall=0.11358030098821623, fmeasure=0.1877314801974851)), 'rougeL': AggregateScore(low=Score(precision=0.620275741354236, recall=0.12543795597333152, fmeasure=0.20576528119209483), mid=Score(precision=0.6376027661064451, recall=0.12970958173666455, fmeasure=0.21247173148145077), high=Score(precision=0.6548432099763013, recall=0.13417823589016536, fmeasure=0.21910757564840688)), 'rougeLsum': AggregateScore(low=Score(precision=0.6209938426524484, recall=0.12536595623236996, fmeasure=0.20577547923601397), mid=Score(precision=0.6383086188321505, recall=0.1298433025087508, fmeasure=0.2126724788142485), high=Score(precision=0.6554045063025244, recall=0.133994301590516, fmeasure=0.2191405107574221))}


setup 8 (first 1000 beam 7): {'rouge1': AggregateScore(low=Score(precision=0.6507811940000396, recall=0.1304707508127916, fmeasure=0.21478224361263004), mid=Score(precision=0.6669972217733285, recall=0.13478244957185373, fmeasure=0.2210323829992657), high=Score(precision=0.6828034533011966, recall=0.13873797025717427, fmeasure=0.22695315137548377)), 'rouge2': AggregateScore(low=Score(precision=0.5499269963369934, recall=0.10446857335066087, fmeasure=0.1732743707049391), mid=Score(precision=0.5717579441391915, recall=0.1089495061192372, fmeasure=0.18041055326296818), high=Score(precision=0.5939325492216092, recall=0.11367088855217519, fmeasure=0.1879445410293667)), 'rougeL': AggregateScore(low=Score(precision=0.6195224715623446, recall=0.12531638375548304, fmeasure=0.2056728392887232), mid=Score(precision=0.6390434128779743, recall=0.1296445520740192, fmeasure=0.21241662573472836), high=Score(precision=0.6552698626373664, recall=0.13398847101239034, fmeasure=0.21851460725031052)), 'rougeLsum': AggregateScore(low=Score(precision=0.6198454948646143, recall=0.12507066632562291, fmeasure=0.20514833316818143), mid=Score(precision=0.6379802000287321, recall=0.129669517935999, fmeasure=0.21232385669396342), high=Score(precision=0.6565006926488567, recall=0.13412645683393806, fmeasure=0.21945757583193604))}

setup 8 (first 1000 beam 8) : {'rouge1': AggregateScore(low=Score(precision=0.64983254578755, recall=0.13040380093279666, fmeasure=0.21426461906209598), mid=Score(precision=0.664915689865694, recall=0.1345004493459558, fmeasure=0.22047220522183975), high=Score(precision=0.6814915079365124, recall=0.13859773215479332, fmeasure=0.2266377613414076)), 'rouge2': AggregateScore(low=Score(precision=0.5473119871794841, recall=0.10419120808015181, fmeasure=0.1729596817557206), mid=Score(precision=0.5706633699633678, recall=0.10887134749451602, fmeasure=0.18028015645771678), high=Score(precision=0.5909680494505478, recall=0.11354629482896222, fmeasure=0.18743962198977593)), 'rougeL': AggregateScore(low=Score(precision=0.6195827869352888, recall=0.125389436392501, fmeasure=0.2056835520524333), mid=Score(precision=0.6372615995116018, recall=0.1295378502694847, fmeasure=0.21228126454698906), high=Score(precision=0.6553115506715532, recall=0.13376691776078334, fmeasure=0.21875129900259213)), 'rougeLsum': AggregateScore(low=Score(precision=0.6183291575091594, recall=0.12497820012413463, fmeasure=0.20517189295962307), mid=Score(precision=0.6376412087912106, recall=0.12948209155882856, fmeasure=0.21221113986766674), high=Score(precision=0.6568591910866941, recall=0.13435401295919086, fmeasure=0.2193400638056553))}


same beam 8, epoch 10: {'rouge1': AggregateScore(low=Score(precision=0.6542441432178966, recall=0.13041916462270775, fmeasure=0.21456302549039796), mid=Score(precision=0.6705441738816781, recall=0.13456823060828405, fmeasure=0.2208643972924661), high=Score(precision=0.6868473971861517, recall=0.13834707600329085, fmeasure=0.22664375850447807)), 'rouge2': AggregateScore(low=Score(precision=0.5538785943223413, recall=0.10409342350141472, fmeasure=0.17307144510975978), mid=Score(precision=0.5757556166056135, recall=0.1089100551030506, fmeasure=0.18046020990692627), high=Score(precision=0.5958640086996323, recall=0.11339673500404898, fmeasure=0.18750412284727577)), 'rougeL': AggregateScore(low=Score(precision=0.6252806006493521, recall=0.12509552153113282, fmeasure=0.20573704682018484), mid=Score(precision=0.6430133838383867, recall=0.12953468002032592, fmeasure=0.21239320805471582), high=Score(precision=0.6621749377705662, recall=0.1340243796011916, fmeasure=0.21931728319273977)), 'rougeLsum': AggregateScore(low=Score(precision=0.6239502967171743, recall=0.12532707956111327, fmeasure=0.2060670222275071), mid=Score(precision=0.6424088924963949, recall=0.12953727166939716, fmeasure=0.21245951283649334), high=Score(precision=0.6592111498917773, recall=0.13388510807489296, fmeasure=0.21901982644784126))}


changed back to 3 epochs

setup 8 (first 1000 beam 1): {'rouge1': AggregateScore(low=Score(precision=0.6509224101939761, recall=0.12589605989028374, fmeasure=0.20761072042657677), mid=Score(precision=0.6666025738150776, recall=0.130196534863738, fmeasure=0.21438476019343414), high=Score(precision=0.6821492651792693, recall=0.13419069417116025, fmeasure=0.22014563496726705)), 'rouge2': AggregateScore(low=Score(precision=0.5496499633699609, recall=0.10043329540324869, fmeasure=0.1672769779821741), mid=Score(precision=0.5726804736929711, recall=0.10511884316049722, fmeasure=0.17473209443436566), high=Score(precision=0.5928077014651987, recall=0.10970310975773115, fmeasure=0.18205865285326192)), 'rougeL': AggregateScore(low=Score(precision=0.6227781152181165, recall=0.12136196300776916, fmeasure=0.19977396820964083), mid=Score(precision=0.640134120740373, recall=0.125428469625535, fmeasure=0.20628431514423148), high=Score(precision=0.6577153161768814, recall=0.13016202568992025, fmeasure=0.21355564164955612)), 'rougeLsum': AggregateScore(low=Score(precision=0.6233763799741942, recall=0.12127938431050024, fmeasure=0.19974095543255446), mid=Score(precision=0.6404275675713192, recall=0.12554169332478415, fmeasure=0.20646558054112513), high=Score(precision=0.6577203891941422, recall=0.13009692808450662, fmeasure=0.2134283129151689))}

setup 8 (first 1000 beam 2): {'rouge1': AggregateScore(low=Score(precision=0.6460002213064753, recall=0.12888943302558822, fmeasure=0.21197932590793306), mid=Score(precision=0.6631601800976838, recall=0.1331446467773457, fmeasure=0.21843299390924573), high=Score(precision=0.6799970276251576, recall=0.1371658200268925, fmeasure=0.22459991644966512)), 'rouge2': AggregateScore(low=Score(precision=0.5460076677489146, recall=0.10246391650306244, fmeasure=0.170321884729657), mid=Score(precision=0.5674195461401318, recall=0.1075128674886305, fmeasure=0.17808025405648362), high=Score(precision=0.5900260323255152, recall=0.11233692940237101, fmeasure=0.18579981485021527)), 'rougeL': AggregateScore(low=Score(precision=0.6177262782356555, recall=0.12403862626480873, fmeasure=0.20377151143396757), mid=Score(precision=0.635069139194142, recall=0.12833582576282704, fmeasure=0.21028537225454497), high=Score(precision=0.6527165636446924, recall=0.1326101485309869, fmeasure=0.21692727292878197)), 'rougeLsum': AggregateScore(low=Score(precision=0.616347252747254, recall=0.12361857216217305, fmeasure=0.20331856734467874), mid=Score(precision=0.6347557234432264, recall=0.12817180464550648, fmeasure=0.2100252446588419), high=Score(precision=0.652882505341883, recall=0.13272037185468247, fmeasure=0.21698858975244587))}


setup 8 (first 1000 beam 3): 
{'rouge1': AggregateScore(low=Score(precision=0.6509626179891702, recall=0.12990401423262818, fmeasure=0.21376769987364502), mid=Score(precision=0.6677052435009128, recall=0.13411361329646176, fmeasure=0.2200733116925112), high=Score(precision=0.6836831127002121, recall=0.13844162647434, fmeasure=0.2268737647293975)), 'rouge2': AggregateScore(low=Score(precision=0.551018572573257, recall=0.1037885920850994, fmeasure=0.1723093421817247), mid=Score(precision=0.572833050976798, recall=0.10858052065049587, fmeasure=0.17994002303146833), high=Score(precision=0.5950681299603152, recall=0.11328856649747236, fmeasure=0.18727294592532606)), 'rougeL': AggregateScore(low=Score(precision=0.622666118350176, recall=0.12494061120160425, fmeasure=0.205360411561291), mid=Score(precision=0.6399128078019355, recall=0.12928015171803786, fmeasure=0.21191716725498444), high=Score(precision=0.6582536634643769, recall=0.13359052566441862, fmeasure=0.2184562918714991)), 'rougeLsum': AggregateScore(low=Score(precision=0.6224858603309763, recall=0.12526630497424843, fmeasure=0.2055053579208858), mid=Score(precision=0.6403669498501179, recall=0.12934703168654488, fmeasure=0.2120483504720989), high=Score(precision=0.6567511734299019, recall=0.13387214125736296, fmeasure=0.2188743896845523))}


setup 8 (first 1000 beam 9): {'rouge1': AggregateScore(low=Score(precision=0.6518409967320297, recall=0.13029919867259218, fmeasure=0.2145777588760571), mid=Score(precision=0.6676997809380205, recall=0.13448860696567322, fmeasure=0.22063722496962396), high=Score(precision=0.6833209748258323, recall=0.1385975196214294, fmeasure=0.22694665119939086)), 'rouge2': AggregateScore(low=Score(precision=0.5535594391025608, recall=0.10463397344882025, fmeasure=0.17362364353643184), mid=Score(precision=0.5735111263736237, recall=0.10898707371041862, fmeasure=0.1806107185522928), high=Score(precision=0.5944686217948699, recall=0.11365260307604573, fmeasure=0.18783984605697637)), 'rougeL': AggregateScore(low=Score(precision=0.6219185315664739, recall=0.1253139588821414, fmeasure=0.2057916249178261), mid=Score(precision=0.6404044782015398, recall=0.12975065575340555, fmeasure=0.21264647775012868), high=Score(precision=0.6573630823098505, recall=0.13435580814663337, fmeasure=0.21948010693912523)), 'rougeLsum': AggregateScore(low=Score(precision=0.6215039287509889, recall=0.12498043735576295, fmeasure=0.20543131860252603), mid=Score(precision=0.6396321949292564, recall=0.1296193399709042, fmeasure=0.21242192994520798), high=Score(precision=0.6578826380808764, recall=0.13384370314450073, fmeasure=0.21893544753201785))}


setup 8 (beam 10): {'rouge1': AggregateScore(low=Score(precision=0.6496093527795771, recall=0.1307385745978583, fmeasure=0.21473197346450623), mid=Score(precision=0.6669077569489374, recall=0.1347407588314311, fmeasure=0.22098648645933505), high=Score(precision=0.6843906088396936, recall=0.13919295723820452, fmeasure=0.2276794960979539)), 'rouge2': AggregateScore(low=Score(precision=0.5508045993589715, recall=0.10446693612769034, fmeasure=0.17301703265574042), mid=Score(precision=0.5718509074259046, recall=0.1090581577388838, fmeasure=0.18067086959124767), high=Score(precision=0.5944161627955359, recall=0.11385141546182453, fmeasure=0.18812697997114397)), 'rougeL': AggregateScore(low=Score(precision=0.6227593615330765, recall=0.1255899055751816, fmeasure=0.206284205457681), mid=Score(precision=0.6389274267399295, recall=0.12990045394596303, fmeasure=0.21278752616142277), high=Score(precision=0.6559100612745133, recall=0.13404995003631465, fmeasure=0.2191989653171376)), 'rougeLsum': AggregateScore(low=Score(precision=0.621227056130147, recall=0.12543988428616662, fmeasure=0.20598274741371309), mid=Score(precision=0.638523093083389, recall=0.1299037208598179, fmeasure=0.21285053605923282), high=Score(precision=0.6563996431264845, recall=0.13428670307715762, fmeasure=0.21935758898801244))}


setup 1 (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.48229438597125546, recall=0.31742321533679146, fmeasure=0.34882732629260754), mid=Score(precision=0.49335598085977495, recall=0.3296087438915013, fmeasure=0.3610705008226629), high=Score(precision=0.5047281637834604, recall=0.34072180741508384, fmeasure=0.3716456636740299)), 'rouge2': AggregateScore(low=Score(precision=0.2988678611856588, recall=0.2064939073297723, fmeasure=0.2240742943628778), mid=Score(precision=0.31089234780740005, recall=0.21665732971954957, fmeasure=0.23424048740546252), high=Score(precision=0.3234101747541477, recall=0.22684639762431788, fmeasure=0.244866077774931)), 'rougeL': AggregateScore(low=Score(precision=0.4387813590236205, recall=0.2885071367098172, fmeasure=0.31832042942047756), mid=Score(precision=0.4507713282711956, recall=0.3008426649306472, fmeasure=0.3294357830521927), high=Score(precision=0.4632478391772703, recall=0.312624397864685, fmeasure=0.3408575132648243)), 'rougeLsum': AggregateScore(low=Score(precision=0.43894403069680216, recall=0.2890438835979335, fmeasure=0.3177217640028891), mid=Score(precision=0.4509427573416873, recall=0.30069767104832923, fmeasure=0.32888036895131256), high=Score(precision=0.4641343769356566, recall=0.3121668956254859, fmeasure=0.34089234013774444))}


setup 7 (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.3200080789895378, recall=0.15526547434789895, fmeasure=0.19861493843168818), mid=Score(precision=0.3371754173981245, recall=0.16327874070065485, fmeasure=0.20983878085952068), high=Score(precision=0.35368323182715883, recall=0.1717739942186453, fmeasure=0.22048102588928506)), 'rouge2': AggregateScore(low=Score(precision=0.17642550028999565, recall=0.0834776752566317, fmeasure=0.10895847353207287), mid=Score(precision=0.1924381727585076, recall=0.09044882682380201, fmeasure=0.11860642657252837), high=Score(precision=0.20833690672356708, recall=0.0978212529429341, fmeasure=0.12885286071224647)), 'rougeL': AggregateScore(low=Score(precision=0.26443130247595265, recall=0.12480381974282286, fmeasure=0.16273416268888372), mid=Score(precision=0.2797516636484494, recall=0.13252955604988054, fmeasure=0.17246597108766754), high=Score(precision=0.29574773234558166, recall=0.1398198688952513, fmeasure=0.18259465126915372)), 'rougeLsum': AggregateScore(low=Score(precision=0.2641336156941893, recall=0.12510566099732542, fmeasure=0.1627412959367001), mid=Score(precision=0.27946446247629986, recall=0.13231837584826944, fmeasure=0.17253741412186457), high=Score(precision=0.2962560355086074, recall=0.13987664299628266, fmeasure=0.1822724301477083))}


setup 4 (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.35838241177647745, recall=0.36456802556044643, fmeasure=0.34416957765498446), mid=Score(precision=0.37494041551835056, recall=0.38105774601814535, fmeasure=0.359705191198717), high=Score(precision=0.3928869919412025, recall=0.398614979677985, fmeasure=0.37710604408916637)), 'rouge2': AggregateScore(low=Score(precision=0.21544745206355648, recall=0.21579381998989866, fmeasure=0.2061638159902275), mid=Score(precision=0.23340528906102495, recall=0.23452745129258226, fmeasure=0.2243738660557978), high=Score(precision=0.25140114932788304, recall=0.25230053502314426, fmeasure=0.24222988602670442)), 'rougeL': AggregateScore(low=Score(precision=0.33103143499094195, recall=0.33675389617460677, fmeasure=0.3179519154508725), mid=Score(precision=0.3488910480388456, recall=0.35539410061935073, fmeasure=0.3355597855973167), high=Score(precision=0.3660225040535281, recall=0.371737478433544, fmeasure=0.3514992878022417)), 'rougeLsum': AggregateScore(low=Score(precision=0.33013873754570544, recall=0.33760795237449653, fmeasure=0.31741783837671017), mid=Score(precision=0.3489177453786196, recall=0.3553808274149345, fmeasure=0.3355750274626767), high=Score(precision=0.3658987880586951, recall=0.3729545697947856, fmeasure=0.3512399345565972))}


setup 6 (first 1000): {'rouge1': AggregateScore(low=Score(precision=0.37581370931348695, recall=0.3845346073794345, fmeasure=0.36309276153015), mid=Score(precision=0.3930433603117144, recall=0.4000555436394585, fmeasure=0.3785663876013402), high=Score(precision=0.40965195865261317, recall=0.4163818686220701, fmeasure=0.3953994590510462)), 'rouge2': AggregateScore(low=Score(precision=0.2294718462563322, recall=0.22591345872136737, fmeasure=0.21924642796195365), mid=Score(precision=0.2465348235766111, recall=0.24333897423928583, fmeasure=0.2363358861926953), high=Score(precision=0.26580906284692074, recall=0.2620747576394299, fmeasure=0.2549936983366994)), 'rougeL': AggregateScore(low=Score(precision=0.3502006603301243, recall=0.35620259882177546, fmeasure=0.3371380177073741), mid=Score(precision=0.3663646191124005, recall=0.3723331956570879, fmeasure=0.3534177676575071), high=Score(precision=0.3821210155572312, recall=0.38759040436024533, fmeasure=0.3679563568002151)), 'rougeLsum': AggregateScore(low=Score(precision=0.34994101803231725, recall=0.35689828303276455, fmeasure=0.33741551566632416), mid=Score(precision=0.36694567660450805, recall=0.37326685377771845, fmeasure=0.3539942152870409), high=Score(precision=0.3828066557755826, recall=0.38912457294159775, fmeasure=0.369526484035887))}
