In [1]:
import os
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration
from rouge_score import rouge_scorer
import torch
from rouge_score import rouge_scorer
from nltk.translate import chrf_score
from nltk.translate import meteor_score
import string

In [2]:
# specify data path and change directory
data_path = 'Data'
data_path = "../Datasets/OA-STM-Corpus/SimpleText/SimpleText_auto"
os.chdir(data_path)

In [3]:
# extract titles and abstracts and create array of entire texts
titles = []
abstracts = []
texts = []

for filename in os.listdir():
        with open(filename, 'r', encoding='utf-8') as file:
            lines = file.readlines()
            
            # extract titles
            title = lines[0].strip()
            titles.append(title)
            
            # loop through lines to extract text & abstracts
            text = ''
            abstract_text = ''
            capturing_abstract = False
            capturing_intro = False
            
            for line in lines:
                # add entire text into array
                text += line.strip() + ' '
                
                # check to see if currently in abstract or introduction block
                #if 'Abstract' in line:
                if (line.strip() == "Abstract" or line.strip() == "Highlights" or line.strip() == "Background" or line.strip() == "Summary") and line.strip()[-1] not in string.punctuation and capturing_intro == False:
                    capturing_abstract = True
                #elif 'Introduction' in line:
                elif (line.strip()[:len("Introduction")] == "Introduction" or line.strip() == "Summary" or line.strip() == "Methods") and line.strip()[-1] not in string.punctuation and capturing_abstract == True:
                    capturing_intro = True
                    
                # only add abstract text (not introduction text) into array
                if capturing_abstract and not capturing_intro:
                    abstract_text += line.strip() + ' '
            
            # append entire text into texts array & abstract text into abstracts array
            if capturing_abstract and capturing_intro:
                texts.append(text)
                abstracts.append(abstract_text)
            else:
                print("Investigate:")
                print(filename)
                print(title)
                print(len(abstract_text))
                print(len(text))
                print()
            

Investigate:
S0024379513003716.txt
Two cores of a nonnegative matrix
728
763



In [4]:
print("Titles:\n", titles)
print("Abstracts:\n", abstracts)
print("Entires Texts:\n", texts)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [5]:
# initialize BART tokenizer and model
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

In [6]:
# generate summaries & append into list
summaries = []
#counter for the reserach papers completed
i = 1
for input_text in texts:
    print(i)
    # Tokenize the input text
    inputs = tokenizer([input_text], max_length=1024, truncation=True, padding='max_length', return_tensors='pt')

    # Generate summary using BART model
    with torch.no_grad():
        summary_ids = bart_model.generate(inputs['input_ids'], max_length=150, num_beams=6, early_stopping=True)
        
    # Decode the summary tokens
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    summaries.append(summary)
    i += 1

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109


In [7]:
for i, summary in enumerate(summaries):
    print(f"Summary for '{titles[i]}': {summary}")

Summary for 'Integrality of the Chern character in small codimension': We prove an integrality property of the Chern character with values in Chow groups. We obtain a construction of the p-1 first homological Steenrod operations on Chow groups modulo p and p-primary torsion, over an arbitrary field. We provide applications to the study of correspondences between algebraic varieties.
Summary for 'Bergman spaces of natural G-manifolds': Bergman spaces of natural G-manifolds are large. Natural examples include the gauged G-complexifications of Heinzner, Huckleberry, and Kutzschebauch. Our method of analyzing the Bergman space is derived from a partial differential equations approach to several complex variables.
Summary for 'Generalized Macdonald-Ruijsenaars systems': Generalized Macdonald-Ruijsenaars systems are based on special representations of Double Affine Hecke Algebras (DAHAs) DAHA is a very rich mathematical subject with deep connections and interactions with combinatorics, theor

In [8]:
# ROUGE-3 evaluation
# initialize scorer
scorer = rouge_scorer.RougeScorer(['rouge3'])

In [9]:
# calculate rouge score for each summary
rouge3_scores_recall = []
rouge3_scores_fmeasure = []
meteor_scores = []
chrf_scores = []
for title, summary, abstract in zip(titles, summaries, abstracts):
    scores = scorer.score(summary, abstract)
    meteor = meteor_score.single_meteor_score(summary.split(), abstract.split())
    chrf = chrf_score.sentence_chrf(summary.split(), abstract.split())
    
    rouge3_scores_recall.append(scores['rouge3'].recall)
    rouge3_scores_fmeasure.append(scores['rouge3'].fmeasure)
    meteor_scores.append(meteor)
    chrf_scores.append(chrf)
    
    
    print(title, " : ", scores['rouge3'].recall)
    print(title, " : ", scores['rouge3'].fmeasure)
    print(title, " : ", meteor)
    print(title, " : ", chrf)

Integrality of the Chern character in small codimension  :  0.9583333333333334
Integrality of the Chern character in small codimension  :  0.9199999999999999
Integrality of the Chern character in small codimension  :  0.9916996671258035
Integrality of the Chern character in small codimension  :  0.9782190012016877
Bergman spaces of natural G-manifolds  :  0.32432432432432434
Bergman spaces of natural G-manifolds  :  0.20168067226890754
Bergman spaces of natural G-manifolds  :  0.4481615083482949
Bergman spaces of natural G-manifolds  :  0.5726415077817301
Generalized Macdonald-Ruijsenaars systems  :  0.16666666666666666
Generalized Macdonald-Ruijsenaars systems  :  0.08219178082191782
Generalized Macdonald-Ruijsenaars systems  :  0.2373168536797998
Generalized Macdonald-Ruijsenaars systems  :  0.5509105983252596
Vascular Risk Status as a Predictor of Later-Life Depressive Symptoms: A Cohort Study  :  0.47692307692307695
Vascular Risk Status as a Predictor of Later-Life Depressive Sympt

Two cores of a nonnegative matrix  :  0.38181818181818183
Two cores of a nonnegative matrix  :  0.13548387096774195
Two cores of a nonnegative matrix  :  0.39418507128878966
Two cores of a nonnegative matrix  :  0.5434750202834676
Chemical weathering and provenance evolution of Holocene-Recent sediments from the Western Indus Shelf, Northern Arabian Sea inferred from physical and mineralogical properties  :  0.34782608695652173
Chemical weathering and provenance evolution of Holocene-Recent sediments from the Western Indus Shelf, Northern Arabian Sea inferred from physical and mineralogical properties  :  0.08767123287671233
Chemical weathering and provenance evolution of Holocene-Recent sediments from the Western Indus Shelf, Northern Arabian Sea inferred from physical and mineralogical properties  :  0.37233099381775075
Chemical weathering and provenance evolution of Holocene-Recent sediments from the Western Indus Shelf, Northern Arabian Sea inferred from physical and mineralogical 

SpiNNaker: Fault tolerance in a power- and area- constrained large-scale neuromimetic architecture  :  0.7241379310344828
SpiNNaker: Fault tolerance in a power- and area- constrained large-scale neuromimetic architecture  :  0.1822125813449024
SpiNNaker: Fault tolerance in a power- and area- constrained large-scale neuromimetic architecture  :  0.3349979647203995
SpiNNaker: Fault tolerance in a power- and area- constrained large-scale neuromimetic architecture  :  0.519200676825705
Sink or source-The potential of coffee agroforestry systems to sequester atmospheric CO2 into soil organic carbon  :  0.7037037037037037
Sink or source-The potential of coffee agroforestry systems to sequester atmospheric CO2 into soil organic carbon  :  0.3636363636363636
Sink or source-The potential of coffee agroforestry systems to sequester atmospheric CO2 into soil organic carbon  :  0.6900090771558246
Sink or source-The potential of coffee agroforestry systems to sequester atmospheric CO2 into soil org

Morphology control in co-evaporated bulk heterojunction solar cells  :  0.9402985074626866
Morphology control in co-evaporated bulk heterojunction solar cells  :  0.28125000000000006
Morphology control in co-evaporated bulk heterojunction solar cells  :  0.4597596701685636
Morphology control in co-evaporated bulk heterojunction solar cells  :  0.6714109940166703
Influences on the energy delivery of thin film photovoltaic modules  :  0.5681818181818182
Influences on the energy delivery of thin film photovoltaic modules  :  0.19083969465648856
Influences on the energy delivery of thin film photovoltaic modules  :  0.543323635634029
Influences on the energy delivery of thin film photovoltaic modules  :  0.6619909522051491
Minority carrier lifetime in silicon photovoltaics: The effect of oxygen precipitation  :  0.7619047619047619
Minority carrier lifetime in silicon photovoltaics: The effect of oxygen precipitation  :  0.18550724637681162
Minority carrier lifetime in silicon photovoltaics

Generation of familial amyloidotic polyneuropathy-specific induced pluripotent stem cells  :  0.18461538461538463
Generation of familial amyloidotic polyneuropathy-specific induced pluripotent stem cells  :  0.0670391061452514
Generation of familial amyloidotic polyneuropathy-specific induced pluripotent stem cells  :  0.33268262972488816
Generation of familial amyloidotic polyneuropathy-specific induced pluripotent stem cells  :  0.43491327784318307
Induced Pluripotent Stem Cell Models of Progranulin-Deficient Frontotemporal Dementia Uncover Specific Reversible Neuronal Defects  :  0.7121212121212122
Induced Pluripotent Stem Cell Models of Progranulin-Deficient Frontotemporal Dementia Uncover Specific Reversible Neuronal Defects  :  0.26330532212885155
Induced Pluripotent Stem Cell Models of Progranulin-Deficient Frontotemporal Dementia Uncover Specific Reversible Neuronal Defects  :  0.4654246997055977
Induced Pluripotent Stem Cell Models of Progranulin-Deficient Frontotemporal Demen

In [10]:
# calculate average ROUGE-3 recall score for all summaries
average_rouge3_recall = sum(rouge3_scores_recall) / len(rouge3_scores_recall)
average_rouge3_fmeasure = sum(rouge3_scores_fmeasure) / len(rouge3_scores_fmeasure)
average_meteor = sum(meteor_scores) / len(meteor_scores)
average_chrf = sum(chrf_scores) / len(chrf_scores)


print("Average ROUGE-3 Recall:", average_rouge3_recall)
print("Average ROUGE-3 F-Score:", average_rouge3_fmeasure)
print("Average METEOR F-Score:", average_meteor)
print("Average CHRF F-Score:", average_chrf)

Average ROUGE-3 Recall: 0.6532880339503321
Average ROUGE-3 F-Score: 0.26189244723731553
Average METEOR F-Score: 0.5160756954806082
Average CHRF F-Score: 0.6419233739925494
