### Requirements

In [2]:
! pip install transformers
! pip install torch===1.7.0 torchvision===0.8.1 torchaudio===0.7.0 -f https://download.pytorch.org/whl/torch_stable.html

Looking in links: https://download.pytorch.org/whl/torch_stable.html


### Importing the libraries

In [3]:
import statistics
import xml.etree.ElementTree as et
from glob import glob
from pathlib import Path

import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig

### Data Pre-processing

In [3]:
# Collect the per-article data folders and derive each article ID from its folder name.
# Adjust the directory according to the location of the data.
articles = glob("/content/drive/My Drive/Thesis/Data/*/")
# Path(...).name returns the last path component and ignores the trailing separator
# left by the "*/" pattern, so the ID no longer depends on a fixed 8-character slice.
articles_id = [Path(article).name for article in articles]

In [4]:
# Read every article's gold summary from disk and store them all in one dataframe.
# Rows are gathered in a plain list and the frame is built once at the end:
# DataFrame.append copied the whole frame on every call and was removed in pandas 2.0.
gold_records = []
for article in articles_id:
    with open("/content/drive/My Drive/Thesis/Data/{0}/summary/{0}.gold.txt".format(article), "r") as file:
        # Flatten the summary onto a single line.
        data = file.read().replace('\n', ' ')
    # 'Length' is the number of characters in the flattened summary.
    gold_records.append({'Article_ID': article, 'GoldSummary': data, 'Length': len(data)})
gold_summary = pd.DataFrame(gold_records, columns=['Article_ID', 'GoldSummary', 'Length'])

In [13]:
# Most frequent gold-summary length, measured in characters.
# `statistics` is imported in the notebook's import cell rather than here.
statistics.mode(list(gold_summary['Length'].values))

1008

In [14]:
# Preview the first five gold-summary rows.
gold_summary.head(n=5)

Unnamed: 0,Article_ID,GoldSummary,Length
0,P05-3026,Multi-Engine Machine Translation Guided By Exp...,1110
1,P06-1004,Minimum Cut Model For Spoken Lecture Segmentat...,782
2,P06-1005,Bootstrapping Path-Based Pronoun Resolution We...,1562
3,P06-1009,Discriminative Word Alignment With Conditional...,931
4,P06-1010,Named Entity Transliteration With Comparable C...,1096


In [15]:
# Read every article's annotated citing sentences and store them in one dataframe.
# Rows are gathered in a plain list and the frame is built once at the end:
# DataFrame.append copied the whole frame on every call and was removed in pandas 2.0.
citing_records = []
for article in articles_id:
    path = '/content/drive/My Drive/Thesis/Data/{0}/citing_sentences_annotated.json'.format(article)
    temp = pd.read_json(path)
    # Columns are taken by position, as before: 0 -> Cit_no, 4 -> Raw_text, 5 -> Clean_text.
    # NOTE(review): positional access depends on the JSON key order — confirm against the data.
    for row in temp.itertuples(index=False, name=None):
        citing_records.append({
            'Article_ID': article,
            'Cit_no': row[0],
            'Raw_text': row[4],
            'Clean_text': row[5],
        })
citing_sentences = pd.DataFrame(citing_records, columns=['Article_ID', 'Cit_no', 'Raw_text', 'Clean_text'])

In [16]:
# Count how many citing sentences were collected for each article.
# The former leading `citing_sentences.head()` was removed: a cell renders only its
# last expression, so that preview was computed and then thrown away.
citing_sentences.groupby('Article_ID').count()

Unnamed: 0_level_0,Cit_no,Raw_text,Clean_text
Article_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A00-1031,20,20,20
A00-1043,18,18,18
A00-2004,20,20,20
A00-2009,14,14,14
A00-2018,20,20,20
...,...,...,...
W99-0612,20,20,20
W99-0613,20,20,20
W99-0623,20,20,20
W99-0625,11,11,11


In [17]:
# Get the title and abstract text of every article and store them in a dataframe.
# Articles whose XML has no <ABSTRACT> element are skipped (no row is added for them).
# Rows are gathered in a plain list and the frame is built once at the end:
# DataFrame.append copied the whole frame on every call and was removed in pandas 2.0.
abstract_records = []
for article in articles_id:
    xtree = et.parse("/content/drive/My Drive/Thesis/Data/{0}/Documents_xml/{0}.xml".format(article))
    xroot = xtree.getroot()
    # The title is the text of the root's first child. Fall back to None when the root
    # has no children, instead of reusing the previous article's title (or hitting an
    # unbound name on the very first article).
    first_child = next(iter(xroot), None)
    title = first_child.text if first_child is not None else None
    # Named `abstract_node` so the builtin `abs` is no longer shadowed.
    abstract_node = xroot.find('ABSTRACT')
    if abstract_node is None:
        continue
    # Join the text of each <S> child; elements without text are skipped because
    # a None entry would make str.join raise TypeError.
    abs_text = [sentence.text for sentence in abstract_node.findall('S') if sentence.text is not None]
    abstractText = ' '.join(abs_text)
    abstract_records.append({'Article_ID': article, 'Title': title, 'Abstract': abstractText})
abstract = pd.DataFrame(abstract_records, columns=['Article_ID', 'Title', 'Abstract'])

In [18]:
# Display the assembled title/abstract dataframe.
abstract

Unnamed: 0,Article_ID,Title,Abstract
0,P05-3026,Multi-Engine Machine Translation Guided By Exp...,We describe a new approach for synthetically c...
1,P06-1004,Minimum Cut Model For Spoken Lecture Segmentation,We consider the task of unsupervised lecture s...
2,P06-1005,Bootstrapping Path-Based Pronoun Resolution,We present an approach to pronoun resolution b...
3,P06-1009,Discriminative Word Alignment With Conditional...,In this paper we present a novel approach for ...
4,P06-1010,Named Entity Transliteration With Comparable C...,In this paper we investigate Chinesename trans...
...,...,...,...
942,C90-2067,Word Sense Disambiguation With Very Large Neur...,"In this paper, we describe a means for automat..."
943,C90-3045,Synchronous Tree-Adjoining Grammars,The unique properties of lree-adjoining gramma...
944,C92-1019,Word Identification For Mandarin Chinese Sente...,Keh- J iann Chen Sh ing- l luan Liu Institute ...
945,C92-2066,Stochastic Lexicalized Tree-Adjoining Grammars,"Aho, A. V. 1968. lndexed grammars - An extensi..."


In [19]:
# Find out which articles have no abstract, i.e. no row in `abstract`.
print(len(articles_id))
print(len(abstract['Article_ID']))
# Build the lookup set once; the previous version rebuilt a list from the column
# for every article and scanned it linearly.
ids_with_abstract = set(abstract['Article_ID'])
# Order of `articles_id` is preserved.
no_abs = [x for x in articles_id if x not in ids_with_abstract]
len(no_abs)

1009
947


62

In [168]:
# List the IDs of the articles for which no abstract row was created.
no_abs

['P85-1011',
 'P89-1009',
 'P90-1010',
 'W08-2123',
 'W97-0703',
 'C94-2174',
 'C96-1005',
 'C96-1055',
 'C96-1058',
 'C96-1079',
 'C96-2183',
 'D07-1074',
 'H01-1035',
 'H05-2018',
 'H91-1026',
 'H91-1060',
 'H92-1026',
 'H92-1045',
 'H93-1051',
 'H93-1052',
 'H93-1061',
 'H94-1020',
 'H94-1046',
 'H94-1048',
 'I05-3025',
 'J05-1004',
 'J06-1003',
 'J08-2005',
 'J93-2004',
 'J94-2003',
 'J94-4004',
 'M95-1005',
 'M95-1012',
 'C00-1007',
 'C00-1044',
 'C00-1072',
 'C00-2136',
 'C00-2137',
 'C00-2163',
 'C02-1139',
 'C02-2025',
 'C04-1046',
 'C04-1073',
 'C04-1100',
 'C86-1016',
 'C88-1016',
 'C88-2121',
 'C88-2128',
 'C88-2147',
 'C90-3030',
 'C90-3044',
 'C90-3052',
 'C90-3063',
 'C92-1025',
 'C92-1038',
 'C92-2070',
 'C92-2082',
 'C92-3126',
 'C92-3150',
 'C94-1027',
 'C94-1032',
 'C94-1042']

### Setting the model up 

In [4]:
# Load the pretrained BART summarization checkpoint and its tokenizer, then summarize
# one sample abstract as a smoke test.
# see ``examples/summarization/bart/run_eval.py`` for a longer example
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

ARTICLE_TO_SUMMARIZE = "In statistical machine translation, the currently best performing systems are based in some way on phrases or word groups.We describe the baseline phrase-based translation system and various refinements. We describe a highly efficient monotone search algorithm with a complexity linear in the input sentence length. We present translation results for three tasks: Verbmobil, Xerox and the Canadian Hansards. For the Xerox task, it takes less than 7 seconds to translate the whole test set consisting of more than 10K words. The translation results for the Xerox and Canadian Hansards task are very promising.The system even outperforms the alignment template system"
# truncation=True is required for max_length to take effect; without it the tokenizer
# does not cut inputs longer than 1024 tokens.
inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, truncation=True, return_tensors='pt')

# Generate Summary: beam search with 3 beams, output capped at 75 tokens.
summary_ids = model.generate(inputs['input_ids'], num_beams=3, max_length=75, early_stopping=True)
# Decode each generated sequence back to text, dropping special tokens.
print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])

NameError: ignored