# 1. Setup

## 1.1. Installing and loading necessary libraries


In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.9.0-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 22.8 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 30.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 52.4 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 53.0 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninst

In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-1.10.2-py3-none-any.whl (542 kB)
[K     |████████████████████████████████| 542 kB 23.5 MB/s 
Collecting fsspec>=2021.05.0
  Downloading fsspec-2021.7.0-py3-none-any.whl (118 kB)
[K     |████████████████████████████████| 118 kB 53.8 MB/s 
[?25hCollecting tqdm>=4.42
  Downloading tqdm-4.61.2-py2.py3-none-any.whl (76 kB)
[K     |████████████████████████████████| 76 kB 4.6 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 44.7 MB/s 
Installing collected packages: tqdm, xxhash, fsspec, datasets
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.41.1
    Uninstalling tqdm-4.41.1:
      Successfully uninstalled tqdm-4.41.1
Successfully installed datasets-1.10.2 fsspec-2021.7.0 tqdm-4.61.2 xxhash-2.0.2


In [3]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl (22 kB)
Installing collected packages: rouge-score
Successfully installed rouge-score-0.0.4


In [4]:
# Importing necessary librairies
import datasets
from datasets import load_metric
from google.colab import drive
import io
import matplotlib.pyplot as plt
import numpy as np
import pickle 
import pandas as pd
import torch 
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm import tqdm, trange
import seaborn as sns
from sklearn.model_selection import train_test_split

## 1.2. Importing personal drive to file in order to later load the data and the saved fine tuned model


In [5]:
# Mount Google Drive to this Notebook instance.
drive.mount('/content/drive')

Mounted at /content/drive


## 2. Load and Format data

## 2.1. Loading the clean data

In [6]:
# We load the cleaned data from a pickle format
file ='/content/drive/MyDrive/Colab Notebooks/Data/final_clean.pickle'
df = pd.read_pickle(file)

In [7]:
# Looking at the data
df

Unnamed: 0,description_narrative,TLDR,text_clean,text_embedding,summary_clean,summary_embedding,labels,labels_idx_list,doc_label
0,The training improved women’s knowledge on the...,The training improved women’s knowledge on the...,[The training improved women’s knowledge on th...,"[[0.76747984, -0.1894493, 0.51285785, -0.02116...",[The training improved women’s knowledge on th...,"[[0.76747984, -0.1894493, 0.51285785, -0.02116...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[0],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Illegal oil refining in the Niger Delta is inc...,CEHRD in an effort to create awareness on the ...,[Illegal oil refining in the Niger Delta is in...,"[[-0.26888534, -0.6092957, -0.23817927, -1.167...",[CEHRD in an effort to create awareness on the...,"[[-0.43648192, -0.071132354, -0.18081759, -0.4...","[0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2, 3]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,CEHRD successfully set-up 4 formal and 4 infor...,CEHRD set-up the environmental clubs with the ...,[CEHRD successfully set-up 4 formal and 4 info...,"[[-0.48964322, -1.2085572, 1.0374501, 0.021369...",[CEHRD set-up the environmental clubs with the...,"[[-0.15335692, -0.29430717, 0.58692193, -1.126...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[10],"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]"
3,PROJECT NAME : Delivering Accelerated Family P...,Delivering Accelerated Family Planning in Paki...,"[PROJECT NAME :, Delivering Accelerated Family...","[[-0.23698506, 0.15983863, -0.071198896, -1.20...",[Delivering Accelerated Family Planning in Pak...,"[[-0.50266284, -1.2923895, 0.4206834, -1.25673...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[1],"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
4,Road traffic injuries are world's eighth leadi...,The Cardiff Trauma Pack Research and Develop...,[Road traffic injuries are world's eighth lead...,"[[-0.4666822, -1.19177, 0.9945381, -0.95759714...",[ The Cardiff Trauma Pack Research and Develo...,"[[-0.4679708, -0.43502182, 0.8388376, -1.47547...","[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...","[4, 5]","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ..."
...,...,...,...,...,...,...,...,...,...
2980,Metta has been responding to the humanitarian ...,"ProjectGoal: To improve the condition of 2,854...",[Metta has been responding to the humanitarian...,"[[-0.6875755, -1.0828758, 0.32236806, -1.33987...","[ProjectGoal:, To improve the condition of 2,8...","[[-0.58209735, 0.027911462, 0.29285246, -1.098...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, ...","[7, 8]","[2980, 2980, 2980, 2980, 2980, 2980, 2980, 298..."
2981,"Destined Women is local not for profit, non-re...",GOAL: Contribute towards changing the socioeco...,"[Destined Women is local not for profit, non-r...","[[-0.3787009, -0.7928037, 0.16012278, -0.16681...","[GOAL:, Contribute towards changing the socioe...","[[-0.2746246, 0.04018628, 0.4832729, -0.996766...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[1, 7]","[2981, 2981, 2981, 2981, 2981, 2981, 2981, 298..."
2982,The project will empower secondary school stud...,The project will empower secondary school stud...,[The project will empower secondary school stu...,"[[0.3152278, -0.62204534, 1.0845532, -0.392865...",[The project will empower secondary school stu...,"[[0.3152278, -0.62204534, 1.0845532, -0.392865...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",[0],"[2982, 2982, 2982, 2982, 2982, 2982, 2982, 2982]"
2983,The project has been specifically designed to ...,The project has been specifically designed to ...,[The project has been specifically designed to...,"[[0.2448853, -0.043037124, 0.5578783, -0.51006...",[The project has been specifically designed to...,"[[0.2448853, -0.043037124, 0.5578783, -0.51006...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",[0],"[2983, 2983, 2983, 2983, 2983, 2983]"


## 2.2. Creating the train and test sets in the right format

### 2.2.1. Creating the train set, with each row being a sentence 

In [8]:
# Splitting the dataframe into a train and a test set, keeping 20% for the test set. 
train_dataset, test_dataset = train_test_split(df, test_size=0.2)

# We wish to split each sentence individually for the purpose of classification. 
#for this reason, we create three empty lists to later feed them with the individual sentences, associated with their label and document label 
sentences, labels, document_labels = [], [], [] 

# We do the below to feed the empty lists
#looping through the embeddings, the document labels, the text and the target labels
for doc_x, doc_y, doc_label in zip(train_dataset['text_clean'], 
                                   train_dataset['labels'], 
                                   train_dataset['doc_label']):
                                          
#looping through each item within the text, the target labels, the document labels
    for i, (sent, target, label) in enumerate(zip(doc_x, doc_y, doc_label)):
        
        #appending each item to the empty lists previously created 
        sentences.append(sent) #this will be used to train the data points 
        labels.append(target) #this will be used to train the data points
        document_labels.append(label) #this will be used to retrieve the intial document the sentence belongs to 

# We then create a dataframe out of the created lists 
train_dataset = pd.DataFrame({'sentence': sentences,
                             'label' :  labels,
                             'document_label': document_labels,
                             })

# For precision, we convert the current float labels to integers 
#creation of an empty list
new_label = []

#looping in the label column, converting each label to an integer and appending to the new_label list
for i in train_dataset['label']:
  i = int(i)
  new_label.append(i)  

#updating the label column with the new_label list
train_dataset['label'] = new_label

# Viewing the pre-formatted train_dataset
train_dataset



Unnamed: 0,sentence,label,document_label
0,The project is designed to support government ...,1,2825
1,It targets 600 households in the regions of Di...,0,2825
2,Theproject will also implement awareness-raisi...,0,2825
3,The development objective of the Integrated Na...,1,972
4,"Likewise, the global environment objective see...",0,972
...,...,...,...
25021,Progammes will include peace building and advo...,0,2658
25022,Oxfam will aim at ensuring that activities in ...,0,2658
25023,This is essential to reduce conflict between r...,0,2658
25024,# To train 25 refugees and host-community yout...,0,2658


### 2.2.2. Creating the test set, with each row being a sentence

In [9]:
# We repeat the same process for the test_dataset

# Creating empty lists 
sentences, labels, document_labels = [], [], [] 

#looping through the embeddings, the document labels, the text and the target labels
for doc_x, doc_y, doc_label in zip(test_dataset['text_clean'], 
                                   test_dataset['labels'], 
                                   test_dataset['doc_label']):
                                          
    #looping through each item within the text, the target labels, the document labels
    for i, (sent, target, label) in enumerate(zip(doc_x, doc_y, doc_label )):
        
        #appending each item to the empty lists previously created 
        sentences.append(sent) #this will be used to train the data points 
        labels.append(target) #this will be used to train the data points
        document_labels.append(label) #this will be used to retrieve the intial document the sentence belongs to 

# Creating a new test_dataset dataframe with appropriate lists 
test_dataset = pd.DataFrame({'sentence': sentences,
                             'label' :  labels,
                             'document_label': document_labels,
                             })

# Converting float labels to integers 
new_label = []

for i in test_dataset['label']:
  i = int(i)
  new_label.append(i)   

#updating the label column with the new_label list
test_dataset['label'] = new_label

# Viewing the pre-formatted train_dataset
test_dataset

Unnamed: 0,sentence,label,document_label
0,The development objective of the Climate Resil...,1,1010
1,It has four components.,0,1010
2,"First component, Sectoral and Spatial Planning...",0,1010
3,"Second component, Climate Resilient Infrastruc...",0,1010
4,"Third component, Strengthening the Enabling En...",0,1010
...,...,...,...
6138,This component will assist pastoralist/agro-pa...,0,1723
6139,The third component is the development learnin...,0,1723
6140,The third component comprises a set of interve...,0,1723
6141,The component will have two sub-components: (i...,0,1723


# 3. Loading the pre-trained BertForSequenceClassification model


We'll be using [BertForSequenceClassification](https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#bertforsequenceclassification). This is the normal BERT model with an added single linear layer on top for classification that we will use as a sentence classifier. We will not train this model, as it will be used as baseline model to evaluate the added value of training and fine-tuning it to our specific task of summarising development bank project papers. 


In [11]:
# Loading the previsouly pre-trained BERTForSequenceClassification model using the from_pretrained method of hugginface's transformer library
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30522, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

# 4. Performance On Test Set

### 4.1 Prepare the dataset

BERT requires a specific format for its input, namely: 

- Tokenize the sentences at a word level
- Special tokens [CLS] and [SEP] respectively at the begggining and end of each sentence (i.e. input) 
- Map all tokens to their input IDs 
- Pad all sentences for each input to match the same length; the one of the longest sentence. 
- Add Attention mask to differentitate normal tokens from padded tokens.

We implement these pre-processing steps in the below cells. 

In [12]:
# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) #we choose bert-base-uncased as it has shown to be more effective for classification purposes than the bert-base-cased version.

# Now computing the longest sentence's length to later pad to that length 
#setting the max length to 0
max_len = 0

#create sentence and label lists from test dataset
sentences = test_dataset.sentence.values
labels = test_dataset.label.values

#for every sentence...
for sent in sentences:

    #tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    #update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

#printing the maximum sentence length. 
print('Max sentence length: ', max_len)

Loading BERT tokenizer...


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Max sentence length:  203


Now that we have the maximum sentence length, we can pre-process all the sentences as per defined above. 

We use the encode_plus method of huggingface's BERT tokenizer to carry these steps.


In [13]:
# Report the number of sentences.
print('Number of test sentences: {:,}\n'.format(test_dataset.shape[0]))

# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in sentences:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = max_len,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Set the batch size.  
batch_size = 32  

# Create the DataLoader.
prediction_data = TensorDataset(input_ids, attention_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Number of test sentences: 6,143





## 4.2. Evaluate on Test Set


## 4.2.1. Making predictions on the test set


With the test set prepared, we can apply the pre-trained BertForSequenceClassifiction model to generate predictions on the test set.

In [14]:
# Prediction on test set

print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions  = []

# Predict 
for batch in prediction_dataloader:
  
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask = batch
  
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction
  with torch.no_grad():
    # Forward pass, calculate logit predictions.
    result = model(b_input_ids, 
                   token_type_ids=None, 
                   attention_mask=b_input_mask,
                   return_dict=True)
    
  logits = result.logits

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  
  # Store predictions and true labels
  predictions.append(logits)

print('    DONE.')

Predicting labels for 6,143 test sentences...
    DONE.


## 4.2.2. Computing the ROUGE metrics to assess the model's performance

### 4.2.2.1. Formatting the data to allow metric computation

In [15]:
# Combine the results across all batches. 
flat_predictions = np.concatenate(predictions, axis=0)

# For each sample, pick the label (0 or 1) with the higher score.
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()


# Adding flat_prediction to the test_datasets
test_dataset['prediction'] = flat_predictions

# Create a new dataframe which groups the predictions based on their document_label to map the prediction to the original text 
df_summaries = test_dataset.groupby('document_label').agg(list)
#resetting the index
df_summaries = df_summaries.reset_index()

# Now we wish to extract the summaries only 
# Keeping the sentences that were predicted with 1
#creating empty lists
predicted_labels, predicted_sentences, actual_labels, actual_sentences, doc_labels =[],[],[],[],[]

#looping in every line of the df_summaries simultaneously 
for predicted_label, predicted_text, actual_label, doc_label in zip(df_summaries['prediction'], 
                                                                    df_summaries['sentence'], 
                                                                    df_summaries['label'], 
                                                                    df_summaries['document_label']):
    #looping in each item of each line of df_summaries
    for pred_label, pred_sent, act_label in zip(predicted_label, 
                                                              predicted_text, 
                                                              actual_label):
        #setting the condition that only if the item equals 1 in the predicted_labels
        #assing the text, the predicted label, the actual label and the document label to the lists 
        if pred_label == 1:
            predicted_labels.append(pred_label)
            predicted_sentences.append(pred_sent)
            actual_labels.append(act_label)
            doc_labels.append(doc_label)

# Creating a new dataframe with only the predicted sentences kept, against the actual label 
df_summaries = pd.DataFrame({'predicted_label': predicted_labels,
                             'predicted_summary': predicted_sentences,
                             'actual_label' : actual_labels,
                             'document_label': doc_labels})

#grouping by the document label and resetting the index 
df_summaries = df_summaries.groupby('document_label').agg(list).reset_index()

# Adding the actual summaries for each document to the dataframe
# This is critical for the evaluation 

#creataing an empty list 
actual_summaries = []

#looping though the document labels in the df_summaries 
for label in df_summaries['document_label']:
  #looping through both the indexes and the text of the original summaries in the initial dataframe
  for index, summ in zip(df.index, df['summary_clean']):
        #setting the condition that if the index of the actual summary in the original document equals 
        #to the label of the document in the results (i.e. summaries) dataframe 
        #append the summary to the empty list
        if label == index:
            actual_summaries.append(summ)

# Adding the list to the df_summaries dataframe
df_summaries['actual_summary'] = actual_summaries



### 4.2.2.1. Computing ROUGE scores

In [16]:
# Computing rouge scores for the results 

#load Huggingface's rouge metric 
metric = load_metric("rouge")

#creating empty lists for the predicted and the actual summaries 
predicted_summ = []
label_summ = []

#looping through both the predicted and the actual summaries 
for doc_pred, doc_label in zip(df_summaries['predicted_summary'], df_summaries['actual_summary']):
    #looping through each sentence within the predicted and the actual summaries  
    for pred_summ, labelled_summ in zip(doc_pred, doc_label):
        #appending these to the empty lists 
        predicted_summ.append(pred_summ)
        label_summ.append(labelled_summ)

#computing and saving the rouge scores in rouge_scores object 
rouge_scores = metric.compute(predictions=predicted_summ, references=label_summ)

#printing the scores 
print(rouge_scores)

Downloading:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

{'rouge1': AggregateScore(low=Score(precision=0.6882904587369563, recall=0.7088594225759833, fmeasure=0.6838830922865757), mid=Score(precision=0.7160452242895889, recall=0.7381408908145205, fmeasure=0.7119409549265565), high=Score(precision=0.7448572067552555, recall=0.7651313799528078, fmeasure=0.7405446045186203)), 'rouge2': AggregateScore(low=Score(precision=0.6243055764616225, recall=0.6520068397301512, fmeasure=0.6304768090012144), mid=Score(precision=0.6576149535328037, recall=0.6844964279029998, fmeasure=0.663322264586433), high=Score(precision=0.6891820762222407, recall=0.7168115660227306, fmeasure=0.6952255709340731)), 'rougeL': AggregateScore(low=Score(precision=0.6716178448169996, recall=0.6919025067744066, fmeasure=0.6683438663028815), mid=Score(precision=0.6996408141404824, recall=0.7207982513607604, fmeasure=0.6972045148484238), high=Score(precision=0.7275200583682546, recall=0.748488153994005, fmeasure=0.7253441467814902)), 'rougeLsum': AggregateScore(low=Score(precision