# 1. Setup

## 1.1. Installing and loading necessary libraries


In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 27.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 52.9 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 47.2 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 52.9 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninst

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-1.10.2-py3-none-any.whl (542 kB)
[?25l[K     |▋                               | 10 kB 25.6 MB/s eta 0:00:01[K     |█▏                              | 20 kB 27.5 MB/s eta 0:00:01[K     |█▉                              | 30 kB 29.1 MB/s eta 0:00:01[K     |██▍                             | 40 kB 31.1 MB/s eta 0:00:01[K     |███                             | 51 kB 33.2 MB/s eta 0:00:01[K     |███▋                            | 61 kB 35.8 MB/s eta 0:00:01[K     |████▎                           | 71 kB 35.9 MB/s eta 0:00:01[K     |████▉                           | 81 kB 37.6 MB/s eta 0:00:01[K     |█████▍                          | 92 kB 34.1 MB/s eta 0:00:01[K     |██████                          | 102 kB 32.5 MB/s eta 0:00:01[K     |██████▋                         | 112 kB 32.5 MB/s eta 0:00:01[K     |███████▎                        | 122 kB 32.5 MB/s eta 0:00:01[K     |███████▉                        | 133 kB 32.5 MB/s et

In [None]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl (22 kB)
Installing collected packages: rouge-score
Successfully installed rouge-score-0.0.4


In [None]:
# Importing necessary librairies
import datasets
from datasets import load_metric
from google.colab import drive
import io
import matplotlib.pyplot as plt
import numpy as np
import pickle 
import pandas as pd
import torch 
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm import tqdm, trange
import seaborn as sns
from sklearn.model_selection import train_test_split

## 1.2. Importing personal drive to file in order to later load the data and the saved fine tuned model


In [3]:
# Mount Google Drive to this Notebook instance.
drive.mount('/content/drive')

Mounted at /content/drive


## 2. Load and Format data

## 2.1. Loading the testing and the reference summaries datasets

In [4]:
import pandas as pd
df = pd.read_pickle('/content/drive/MyDrive/Data/SummaDevDocs_preprocessed.pickle') #this will be used to compute the ROUGE scores of the generated summaries 
df

Unnamed: 0,document_text,summary_text,text_clean,text_embedding,summary_clean,summary_embedding,labels,doc_label
0,The training improved women’s knowledge on the...,The training improved women’s knowledge on the...,[The training improved women’s knowledge on th...,"[[0.76747984, -0.18944956, 0.51285803, -0.0211...",[The training improved women’s knowledge on th...,"[[0.76747984, -0.18944956, 0.51285803, -0.0211...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Illegal oil refining in the Niger Delta is inc...,CEHRD in an effort to create awareness on the ...,[Illegal oil refining in the Niger Delta is in...,"[[-0.34167996, -0.6055787, -0.20679495, -1.148...",[CEHRD in an effort to create awareness on the...,"[[-0.436482, -0.07113252, -0.18081762, -0.4864...","[0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,CEHRD successfully set-up 4 formal and 4 infor...,CEHRD set-up the environmental clubs with the ...,[CEHRD successfully set-up 4 formal and 4 info...,"[[-0.4896432, -1.2085572, 1.0374498, 0.0213696...",[CEHRD set-up the environmental clubs with the...,"[[-0.15335679, -0.2943071, 0.58692193, -1.1263...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
3,PROJECT NAME : Delivering Accelerated Family P...,Delivering Accelerated Family Planning in Paki...,"[PROJECT NAME :, Delivering Accelerated Family...","[[-0.23698464, 0.15983887, -0.07119872, -1.200...",[Delivering Accelerated Family Planning in Pak...,"[[-0.50266284, -1.2923898, 0.42068344, -1.2567...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
4,Road traffic injuries are world's eighth leadi...,The Cardiff Trauma Pack Research and Develop...,[Road traffic injuries are world's eighth lead...,"[[-0.466682, -1.1917696, 0.99453795, -0.957597...",[ The Cardiff Trauma Pack Research and Develo...,"[[-0.4679709, -0.43502185, 0.83883774, -1.4754...","[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ..."
...,...,...,...,...,...,...,...,...
2980,Metta has been responding to the humanitarian ...,"ProjectGoal: To improve the condition of 2,854...",[Metta has been responding to the humanitarian...,"[[-0.6875753, -1.0828757, 0.32236812, -1.33987...","[ProjectGoal:, To improve the condition of 2,8...","[[-0.5820974, 0.02791187, 0.2928526, -1.098855...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, ...","[2980, 2980, 2980, 2980, 2980, 2980, 2980, 298..."
2981,"Destined Women is local not for profit\, non-r...",GOAL: Contribute towards changing the socioeco...,"[Destined Women is local not for profit\,, non...","[[-0.91105515, -0.780988, -0.013189635, -0.504...",[GOAL: Contribute towards changing the socioec...,"[[0.06080835, -0.2223898, 0.48705336, -1.54801...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2981, 2981, 2981, 2981, 2981, 2981, 2981, 298..."
2982,The project will empower secondary school stud...,The project will empower secondary school stud...,[The project will empower secondary school stu...,"[[0.2446245, -0.58844894, 1.0700818, -0.348946...",[The project will empower secondary school stu...,"[[0.31522802, -0.6220454, 1.084553, -0.3928656...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[2982, 2982, 2982, 2982, 2982, 2982, 2982]"
2983,The project has been specifically designed to ...,The project has been specifically designed to ...,[The project has been specifically designed to...,"[[0.22308932, -0.042834148, 0.5815844, -0.4814...",[The project has been specifically designed to...,"[[0.24488513, -0.04303686, 0.55787843, -0.5100...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[2983, 2983, 2983, 2983, 2983, 2983]"


In [None]:
test_dataset = pd.read_csv('/content/drive/MyDrive/Data/test_dataset.csv') #this will be used to generate predictions 
df = pd.read_pickle('/content/drive/MyDrive/Data/SummaDevDocs_preprocessed.pickle') #this will be used to compute the ROUGE scores of the generated summaries 

# 3. Loading the pre-trained BertForSequenceClassification model


We'll be using [BertForSequenceClassification](https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#bertforsequenceclassification). This is the normal BERT model with an added single linear layer on top for classification that we will use as a sentence classifier. We will not train this model, as it will be used as baseline model to evaluate the added value of training and fine-tuning it to our specific task of summarising development bank project papers. 


In [None]:
# Loading the previsouly pre-trained BERTForSequenceClassification model using the from_pretrained method of hugginface's transformer library
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30522, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

# 4. Performance On Test Set

### 4.1 Prepare the dataset

BERT requires a specific format for its input, namely: 

- Tokenize the sentences at a word level
- Special tokens [CLS] and [SEP] respectively at the begggining and end of each sentence (i.e. input) 
- Map all tokens to their input IDs 
- Pad all sentences for each input to match the same length; the one of the longest sentence. 
- Add Attention mask to differentitate normal tokens from padded tokens.

We implement these pre-processing steps in the below cells. 

In [None]:
# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) #we choose bert-base-uncased as it has shown to be more effective for classification purposes than the bert-base-cased version.

# Now computing the longest sentence's length to later pad to that length 
#setting the max length to 0
max_len = 0

#create sentence and label lists from test dataset
sentences = test_dataset.sentence.values
labels = test_dataset.label.values

#for every sentence...
for sent in sentences:

    #tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    #update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

#printing the maximum sentence length. 
print('Max sentence length: ', max_len)

Loading BERT tokenizer...


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Max sentence length:  203


Now that we have the maximum sentence length, we can pre-process all the sentences as per defined above. 

We use the encode_plus method of huggingface's BERT tokenizer to carry these steps.


In [None]:
# Report the number of sentences.
print('Number of test sentences: {:,}\n'.format(test_dataset.shape[0]))

# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in sentences:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = max_len,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Set the batch size.  
batch_size = 32  

# Create the DataLoader.
prediction_data = TensorDataset(input_ids, attention_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Number of test sentences: 5,973





## 4.2. Evaluate on Test Set


## 4.2.1. Making predictions on the test set


With the test set prepared, we can apply the pre-trained BertForSequenceClassifiction model to generate predictions on the test set.

In [None]:
# Prediction on test set

print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions  = []

# Predict 
for batch in prediction_dataloader:
  
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask = batch
  
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction
  with torch.no_grad():
    # Forward pass, calculate logit predictions.
    result = model(b_input_ids, 
                   token_type_ids=None, 
                   attention_mask=b_input_mask,
                   return_dict=True)
    
  logits = result.logits

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  
  # Store predictions and true labels
  predictions.append(logits)

print('    DONE.')

Predicting labels for 5,973 test sentences...
    DONE.


## 4.2.2. Computing the ROUGE metrics to assess the model's performance

### 4.2.2.1. Formatting the data to allow metric computation

In [None]:
# Combine the results across all batches. 
flat_predictions = np.concatenate(predictions, axis=0)

# For each sample, pick the label (0 or 1) with the higher score.
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()


# Adding flat_prediction to the test_datasets
test_dataset['prediction'] = flat_predictions

# Create a new dataframe which groups the predictions based on their document_label to map the prediction to the original text 
df_summaries = test_dataset.groupby('document_label').agg(list)
#resetting the index
df_summaries = df_summaries.reset_index()

# Now we wish to extract the summaries only 
# Keeping the sentences that were predicted with 1
#creating empty lists
predicted_labels, predicted_sentences, actual_labels, actual_sentences, doc_labels =[],[],[],[],[]

#looping in every line of the df_summaries simultaneously 
for predicted_label, predicted_text, actual_label, doc_label in zip(df_summaries['prediction'], 
                                                                    df_summaries['sentence'], 
                                                                    df_summaries['label'], 
                                                                    df_summaries['document_label']):
    #looping in each item of each line of df_summaries
    for pred_label, pred_sent, act_label in zip(predicted_label, 
                                                              predicted_text, 
                                                              actual_label):
        #setting the condition that only if the item equals 1 in the predicted_labels
        #assing the text, the predicted label, the actual label and the document label to the lists 
        if pred_label == 1:
            predicted_labels.append(pred_label)
            predicted_sentences.append(pred_sent)
            actual_labels.append(act_label)
            doc_labels.append(doc_label)

# Creating a new dataframe with only the predicted sentences kept, against the actual label 
df_summaries = pd.DataFrame({'predicted_label': predicted_labels,
                             'predicted_summary': predicted_sentences,
                             'actual_label' : actual_labels,
                             'document_label': doc_labels})

#grouping by the document label and resetting the index 
df_summaries = df_summaries.groupby('document_label').agg(list).reset_index()

# Adding the actual summaries for each document to the dataframe
# This is critical for the evaluation 

#creataing an empty list 
reference_summaries = []

#looping though the document labels in the df_summaries 
for label in df_summaries['document_label']:
  #looping through both the indexes and the text of the original summaries in the initial dataframe
  for index, summ in zip(df.index, df['summary_clean']):
        #setting the condition that if the index of the actual summary in the original document equals 
        #to the label of the document in the results (i.e. summaries) dataframe 
        #append the summary to the empty list
        if label == index:
            reference_summaries.append(summ)

# Adding the list to the df_summaries dataframe
df_summaries['reference_summary'] = reference_summaries



### 4.2.2.1. Computing ROUGE scores

In [None]:
# Computing rouge scores for the results 

#load Huggingface's rouge metric 
metric = load_metric("rouge")

#creating empty lists for the predicted and the actual summaries 
predicted_summaries = []
reference_summaries = []

#looping through both the predicted and the actual summaries 
for doc_pred, doc_label in zip(df_summaries['predicted_summary'], df_summaries['reference_summary']):
    #looping through each sentence within the predicted and the actual summaries  
    for pred_summ, labelled_summ in zip(doc_pred, doc_label):
        #appending these to the empty lists 
        predicted_summaries.append(pred_summ)
        reference_summaries.append(labelled_summ)

#computing and saving the rouge scores in rouge_scores object 
rouge_scores = metric.compute(predictions=predicted_summaries, references=reference_summaries)

#printing the scores 
print(rouge_scores)

Downloading:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

{'rouge1': AggregateScore(low=Score(precision=0.3356039408195568, recall=0.37951295504888194, fmeasure=0.3196245550146586), mid=Score(precision=0.35739602778264096, recall=0.40559375646400486, fmeasure=0.3421246343198156), high=Score(precision=0.3820151975123291, recall=0.4333673465210733, fmeasure=0.3669738149287004)), 'rouge2': AggregateScore(low=Score(precision=0.16963203651195258, recall=0.20748429391636075, fmeasure=0.1764105300931812), mid=Score(precision=0.19716567395071571, recall=0.24000115920448267, fmeasure=0.20580621466031687), high=Score(precision=0.2252685993478366, recall=0.2703532525756018, fmeasure=0.23427400760362183)), 'rougeL': AggregateScore(low=Score(precision=0.2857601896007371, recall=0.3247965557850347, fmeasure=0.2741428824910158), mid=Score(precision=0.31167118856941944, recall=0.3501280697750182, fmeasure=0.2995874383425019), high=Score(precision=0.33570455684384237, recall=0.3763636627944342, fmeasure=0.3231189345868737)), 'rougeLsum': AggregateScore(low=Sc