## Step 3: Automatic and Human Evaluation

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
import pandas as pd
import evaluate
import numpy as np

In [2]:
data_frame = pd.read_csv("step2.csv")

In [3]:
data_frame.head(20)

Unnamed: 0.1,Unnamed: 0,summary,generated_sequence_greedy,generated_sequence_beam,generated_sequence_topk,generated_sequence_topp
0,0,"There is a ""chronic"" need for more housing for...",",,,,,,,,, who has been in prison for 20 years,...","in Wales, a Welsh charity says. ""I think the k...",has been criticized for lack of help in the re...,", the Welsh Government has warned that it is i..."
1,1,"A man has appeared in court after firearms, am...","Police have recovered three firearms, ammuniti...","Police have recovered three firearms, ammuniti...",Police arrested an eight-year-old man who was ...,Detectives say a man has been arrested and cha...
2,2,Four people accused of kidnapping and torturin...,",,, have been charged with hate crimes and agg...","Hill, 18, and Cooper, all 18, have been charge...",", 22,'s Witness said their victim. had been ac...","has been charged with murder, kidnapping and b..."
3,3,West Brom have appointed Nicky Hammond as tech...,keeper David West Brom has been appointed a yo...,'s former Arsenal goalkeeper has played for th...,keeper Michael Warfinger has signed a new mana...,keeper David Warres has been appointed Junior ...
4,4,The pancreas can be triggered to regenerate it...,",,,,,,,,,,,,, and the ability",". ""I was glad to have something to eat,"" he sa...",and to find out what to do. on of eating a fas...,"from food and drinks. ""The aim is to improve b..."
5,5,Since their impending merger was announced in ...,",,,,,,,,, and,,, and a merger",",,,,,,,,,, a merger between the two optician","by German optician Specsavers as a merger, is ...",and becoming the biggest optical business in E...
6,6,"A ""medal at any cost"" approach created a ""cult...",",,,,, has apologised for the sexist behaviour ...",has apologised to British Cycling's new chair ...,is to get to work at British Cycling's Champio...,", Scotland's Olympic Gold medallist Wendy Houv..."
7,7,Have you heard the one about the computer prog...,",,,,,,, and a new generation of comedians,,,,","a comedy club in the United States.,,.,,s are ...",(eh really). The concept for a comedy club in ...,"it in the US, or some other world, such as the..."
8,8,The reaction from BT's investors told us much ...,",,, and a new board to run Openreach.,,,,, BT","of BT's giant pension scheme, Openreach, is a ...",. the first BT retirement scheme and that's a ...,The Andes have shown that the rulings are a ve...
9,9,Manager Brendan Rodgers is sure Celtic can exp...,has been a key player in the Scottish Cup and ...,is looking forward to the Scottish Cup semi-fi...,'s first Premier League title for Celtic in Sc...,as the German has praised his position as one ...


In [4]:
references = list(data_frame["summary"])

generated_sequence_greedy = list(data_frame["generated_sequence_greedy"])
generated_sequence_beam = list(data_frame["generated_sequence_beam"])
generated_sequence_topk = list(data_frame["generated_sequence_topk"])
generated_sequence_topp = list(data_frame["generated_sequence_topp"])

## Computing ROUGE Score

In [5]:
rouge = evaluate.load("rouge") 

In [6]:
rouge_greedy = rouge.compute(predictions=generated_sequence_greedy, references=references) 
print(rouge_greedy)

{'rouge1': 0.2029126071053451, 'rouge2': 0.04577778659590956, 'rougeL': 0.15521367360472338, 'rougeLsum': 0.1554106254095839}


In [7]:
rouge_beam = rouge.compute(predictions=generated_sequence_beam, references=references) 
print(rouge_beam)

{'rouge1': 0.20485106201261247, 'rouge2': 0.046903186854296536, 'rougeL': 0.16137002212589063, 'rougeLsum': 0.16126149341537582}


In [8]:
rouge_topk = rouge.compute(predictions=generated_sequence_topk, references=references) 
print(rouge_topk)

{'rouge1': 0.18775090236850012, 'rouge2': 0.028978603026998, 'rougeL': 0.13815566083384, 'rougeLsum': 0.1382690090819329}


In [9]:
rouge_topp = rouge.compute(predictions=generated_sequence_topp, references=references) 
print(rouge_topp)

{'rouge1': 0.18894895032636985, 'rouge2': 0.02871813447468976, 'rougeL': 0.13845965355946627, 'rougeLsum': 0.13873953825191768}


In [10]:
def compute_rouge(sentence, references):
    return rouge.compute(predictions=[sentence], references=[references]) 


rouge_scores = []

for index, item in enumerate(generated_sequence_greedy):
    if index%10==0:
        print("Current index: {}".format(index + 1))
    rouge_greedy = generated_sequence_greedy[index]
    rouge_beam = generated_sequence_beam[index]
    rouge_topk = generated_sequence_topk[index]
    rouge_topp = generated_sequence_topp[index]
    
    rouge_scores.append([compute_rouge(rouge_greedy, references)["rougeL"], 
                         compute_rouge(rouge_beam, references)["rougeL"], 
                         compute_rouge(rouge_topk, references)["rougeL"], 
                         compute_rouge(rouge_topp, references)["rougeL"]])
    
rouge_score_df = pd.DataFrame(rouge_scores, columns = ["rougel_greedy", "rougel_beam", "rougel_topk", "rougel_topp"])

Current index: 1
Current index: 11
Current index: 21
Current index: 31
Current index: 41
Current index: 51
Current index: 61
Current index: 71
Current index: 81
Current index: 91


In [11]:
rouge_score_df.head()

Unnamed: 0,rougel_greedy,rougel_beam,rougel_topk,rougel_topp
0,0.24,0.210526,0.294118,0.235294
1,0.214286,0.214286,0.25,0.285714
2,0.285714,0.210526,0.188679,0.263158
3,0.258065,0.216216,0.210526,0.222222
4,0.173913,0.171429,0.2,0.2


## Computing BERT score

In [12]:
bertscore = evaluate.load("bertscore")

In [13]:
bert_score_greedy = bertscore.compute(predictions=generated_sequence_greedy, references=references, lang="en") 
bert_score_greedy["precision"] = np.array(bert_score_greedy["precision"]).mean()
bert_score_greedy["recall"] = np.array(bert_score_greedy["recall"]).mean()
bert_score_greedy["f1"] = np.array(bert_score_greedy["f1"]).mean()
print(bert_score_greedy)

{'precision': 0.8636398160457611, 'recall': 0.8537258714437485, 'f1': 0.8584919589757919, 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.20.1)'}


In [14]:
bert_score_beam = bertscore.compute(predictions=generated_sequence_beam, references=references, lang="en") 
bert_score_beam["precision"] = np.array(bert_score_beam["precision"]).mean()
bert_score_beam["recall"] = np.array(bert_score_beam["recall"]).mean()
bert_score_beam["f1"] = np.array(bert_score_beam["f1"]).mean()
print(bert_score_beam)

{'precision': 0.8640277189016342, 'recall': 0.8592779016494752, 'f1': 0.8614707732200623, 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.20.1)'}


In [15]:
bert_score_topk = bertscore.compute(predictions=generated_sequence_topk, references=references, lang="en") 
bert_score_topk["precision"] = np.array(bert_score_topk["precision"]).mean()
bert_score_topk["recall"] = np.array(bert_score_topk["recall"]).mean()
bert_score_topk["f1"] = np.array(bert_score_topk["f1"]).mean()
print(bert_score_topk)

{'precision': 0.850326600074768, 'recall': 0.8554176151752472, 'f1': 0.8526835870742798, 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.20.1)'}


In [16]:
bert_score_topp = bertscore.compute(predictions=generated_sequence_topp, references=references, lang="en") 
bert_score_topp["precision"] = np.array(bert_score_topp["precision"]).mean()
bert_score_topp["recall"] = np.array(bert_score_topp["recall"]).mean()
bert_score_topp["f1"] = np.array(bert_score_topp["f1"]).mean()
print(bert_score_topp)

{'precision': 0.8610235536098481, 'recall': 0.8533705633878708, 'f1': 0.8570091015100479, 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.20.1)'}


In [17]:
def compute_bert_score(sentence, references):
    return bertscore.compute(predictions=[sentence], references=[references], lang="en")


bertscores = []

for index, item in enumerate(generated_sequence_greedy):
    if index%10==0:
        print("Current index: {}".format(index + 1))
    bertscore_greedy = generated_sequence_greedy[index]
    bertscore_beam = generated_sequence_beam[index]
    bertscore_topk = generated_sequence_topk[index]
    bertscore_topp = generated_sequence_topp[index]
    
    bertscores.append([compute_bert_score(bertscore_greedy, references)["f1"][0], 
                         compute_bert_score(bertscore_beam, references)["f1"][0], 
                         compute_bert_score(bertscore_topk, references)["f1"][0], 
                         compute_bert_score(bertscore_topp, references)["f1"][0]])

Current index: 1
Current index: 11
Current index: 21
Current index: 31
Current index: 41
Current index: 51
Current index: 61
Current index: 71
Current index: 81
Current index: 91


In [18]:
bert_score_df_f1 = pd.DataFrame(bertscores, columns = ["bert_score_f1_greedy", "bert_score_f1_beam", "bert_score_f1_topk", "bert_score_f1_topp"])

In [19]:
bert_score_df_f1

Unnamed: 0,bert_score_f1_greedy,bert_score_f1_beam,bert_score_f1_topk,bert_score_f1_topp
0,0.851630,0.885002,0.872510,0.872255
1,0.913446,0.913446,0.890272,0.900306
2,0.848532,0.856651,0.832835,0.857496
3,0.871094,0.873786,0.857377,0.881984
4,0.854376,0.866062,0.842327,0.873964
...,...,...,...,...
95,0.879924,0.881419,0.876053,0.870580
96,0.880048,0.854513,0.886684,0.866757
97,0.877925,0.876371,0.862268,0.838461
98,0.883520,0.882250,0.881713,0.868850


In [20]:
concat_data_frame = pd.concat([rouge_score_df, bert_score_df_f1], axis=1)
concat_data_frame.head()

Unnamed: 0,rougel_greedy,rougel_beam,rougel_topk,rougel_topp,bert_score_f1_greedy,bert_score_f1_beam,bert_score_f1_topk,bert_score_f1_topp
0,0.24,0.210526,0.294118,0.235294,0.85163,0.885002,0.87251,0.872255
1,0.214286,0.214286,0.25,0.285714,0.913446,0.913446,0.890272,0.900306
2,0.285714,0.210526,0.188679,0.263158,0.848532,0.856651,0.832835,0.857496
3,0.258065,0.216216,0.210526,0.222222,0.871094,0.873786,0.857377,0.881984
4,0.173913,0.171429,0.2,0.2,0.854376,0.866062,0.842327,0.873964


In [21]:
new_data_frame = pd.concat([data_frame, concat_data_frame], axis=1)
new_data_frame.head()

Unnamed: 0.1,Unnamed: 0,summary,generated_sequence_greedy,generated_sequence_beam,generated_sequence_topk,generated_sequence_topp,rougel_greedy,rougel_beam,rougel_topk,rougel_topp,bert_score_f1_greedy,bert_score_f1_beam,bert_score_f1_topk,bert_score_f1_topp
0,0,"There is a ""chronic"" need for more housing for...",",,,,,,,,, who has been in prison for 20 years,...","in Wales, a Welsh charity says. ""I think the k...",has been criticized for lack of help in the re...,", the Welsh Government has warned that it is i...",0.24,0.210526,0.294118,0.235294,0.85163,0.885002,0.87251,0.872255
1,1,"A man has appeared in court after firearms, am...","Police have recovered three firearms, ammuniti...","Police have recovered three firearms, ammuniti...",Police arrested an eight-year-old man who was ...,Detectives say a man has been arrested and cha...,0.214286,0.214286,0.25,0.285714,0.913446,0.913446,0.890272,0.900306
2,2,Four people accused of kidnapping and torturin...,",,, have been charged with hate crimes and agg...","Hill, 18, and Cooper, all 18, have been charge...",", 22,'s Witness said their victim. had been ac...","has been charged with murder, kidnapping and b...",0.285714,0.210526,0.188679,0.263158,0.848532,0.856651,0.832835,0.857496
3,3,West Brom have appointed Nicky Hammond as tech...,keeper David West Brom has been appointed a yo...,'s former Arsenal goalkeeper has played for th...,keeper Michael Warfinger has signed a new mana...,keeper David Warres has been appointed Junior ...,0.258065,0.216216,0.210526,0.222222,0.871094,0.873786,0.857377,0.881984
4,4,The pancreas can be triggered to regenerate it...,",,,,,,,,,,,,, and the ability",". ""I was glad to have something to eat,"" he sa...",and to find out what to do. on of eating a fas...,"from food and drinks. ""The aim is to improve b...",0.173913,0.171429,0.2,0.2,0.854376,0.866062,0.842327,0.873964


In [22]:
new_data_frame.to_csv("step3a.csv", encoding='utf-8')