# I. Load datasets and import libraries

In [1]:
# Mount Google Drive
import os, sys
from IPython.display import clear_output
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
! pip install evaluate
! pip install sacrebleu
! pip install -r rouge/requirements.txt
! pip install rouge-score
from rouge_score import rouge_scorer
import evaluate
clear_output()

In [4]:
# Load amrlib library from drive
amrlib_path = '/content/amrlib'
os.symlink('/content/drive/MyDrive/IP/amrlib', amrlib_path)
sys.path.insert(0, amrlib_path)

# Install and import dependencies to environment
! pip install --target=$amrlib_path jdc
os.chdir('/content/amrlib')
! pip install -r requirements.txt

import amrlib
import penman
import transformers
import unidecode
import word2number
import sentencepiece

clear_output()

In [5]:
# Import all other necessary libraries
import seaborn as sns
sns.set_theme()
import pandas as pd
import numpy as np
import torch
import requests
import tqdm
import time
import urllib.parse
import json
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
%matplotlib inline
import random
import tarfile
import csv
from ast import literal_eval
import pickle
import torch

# Preprocessing
import nltk
import pprint
import re
from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import regexp_span_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')

clear_output()

### Choose data input: AMR Proxy or CNNDM

Each data folder should contain .txt files of the AMRs (with annotations), tokenised text and non-tokenised document text. They should be labelled `amr.txt`, `tok_stories.txt` and `snt_stories.txt` respectively. The story text can be obtained by running `pipeline.py` of the Unsupervised SAS program up to and including the `save_stories()` function, with the `amr.txt` file as input.



In [6]:
# Uncomment dataset to process
# Exactly one `dataset = ...` line should be active; its value selects the
# `<dataset>_inputs` / `<dataset>_outputs` folders built below.
# dataset = 'amr_proxy'
# dataset = 'cnndm'
# dataset = 'test'
dataset = 'moral_test'
# Folder the pipeline inputs are read from and written to (amr text, stories, IDF, triples, ...)
data_path = f'/content/drive/MyDrive/IP/datasets/{dataset}_inputs'
# Folder the predicted/target summaries are read from and written to
output_path = f'/content/drive/MyDrive/IP/datasets/{dataset}_outputs'

In [7]:
# Extract AMR bank files from tgz file in drive
filename = '/content/drive/MyDrive/IP/amr_annotation_3.0_LDC2020T02.tgz'
if filename.endswith("tgz"):
    # The context manager closes the archive even if extraction fails part-way,
    # which the explicit open()/close() pair did not guarantee.
    with tarfile.open(filename, "r:gz") as tar:
        tar.extractall('/content')

*End of setup*

--------------------------------------------------------------------------------

### Load datasets

In [8]:
# Load CNNDM dataset from HuggingFace
! pip install datasets
# NOTE(review): a later cell defines its own `load_dataset(filepath)` function,
# which shadows this import for the rest of the session; re-run this cell
# before calling the HuggingFace loader again.
from datasets import load_dataset
cnndm_data = load_dataset("cnn_dailymail","3.0.0")
clear_output()

In [9]:
# Report the number of articles in each split of the loaded dataset
print('Training articles =', len(cnndm_data['train']))
print('Validation articles =', len(cnndm_data['validation']))
print('Testing articles =', len(cnndm_data['test']))

Training articles = 287113
Validation articles = 13368
Testing articles = 11490


### Filter datasets for opinion articles

In [19]:
# For CNNDM dataset, use keyphrase 'opinions expressed' to identify opinion pieces.
def filter_opinions(dataset):
    """Return the subset of `dataset` whose article text is an opinion piece.

    An article is kept when its 'article' field contains the phrase
    'opinions expressed in this'. The elapsed time and the size of the
    filtered corpus are printed after clearing the cell output.

    Args:
        dataset: object exposing `.filter(fn)` over records with an 'article' key.

    Returns:
        The filtered dataset returned by `dataset.filter`.
    """
    def is_opinion(record):
        return 'opinions expressed in this' in record['article']

    began = time.time()
    opinions = dataset.filter(is_opinion)
    clear_output()
    print('Finished in %s seconds' % (time.time() - began))
    print('New corpus size =', len(opinions))
    return opinions

In [20]:
# Keep only the opinion pieces of the training split; `corpus` feeds the AMR parsing stage
corpus = filter_opinions(cnndm_data['train'])

Finished in 0.008241415023803711 seconds
New corpus size = 4898


### Preprocess articles

In [None]:
def preprocess(article):
    """Split `article` into sentences and strip the leading CNN dateline.

    The text is sentence-tokenised with NLTK. If a sentence contains the
    marker "(CNN) --", everything up to and including the marker and the one
    character that follows it is removed from that sentence, all earlier
    sentences are discarded, and the final sentence of the article is
    dropped as well (the original slice was `[i:-1]`).
    NOTE(review): dropping the last sentence presumably removes a trailing
    disclaimer; confirm this is intended.

    Text without the marker is returned as the plain sentence list.

    Args:
        article: raw text (article, highlights or summary).

    Returns:
        list[str]: the cleaned sentences.
    """
    marker = "(CNN) --"

    # Split into list of sentences
    sent_list = nltk.tokenize.sent_tokenize(article)

    # Remove extraneous info from start of article
    for i, sent in enumerate(sent_list):
        index = sent.find(marker)
        if index != -1:
            # Skip the marker plus the single character after it (normally a space).
            sent_list[i] = sent[index + len(marker) + 1:]
            sent_list = sent_list[i:-1]
            # Stop at the first marker: the list has just been re-sliced, so the
            # loop index no longer lines up with it and a second match would
            # rewrite the wrong sentence (or raise IndexError).
            break

    return sent_list

# Stage 1: Text-to-AMR parsing (for CNNDM)

https://github.com/bjascob/amrlib

* Input: corpus of articles (HuggingFace dataset)
* Intermediate: table of article AMRs, sentences and graphs (.csv file)
* Output: AMR Proxy text format (.txt file)

### Functions

In [None]:
def process_test_corpus(corpus, csv_file):
    """Parse the selected test articles to AMRs and write them to `csv_file`.

    The article ids (column 4) and hand-written moral summaries (column 5)
    are read from `test_selection.csv`, skipping its header row. For each
    selected id, the matching article(s) in `corpus` are preprocessed and the
    article, highlights and moral summary are each parsed sentence-by-sentence
    with the module-level `parser`. One CSV row is written per article; list
    values are stored via their string form.

    Args:
        corpus: iterable of records with 'id', 'article' and 'highlights' keys.
        csv_file: path of the CSV file to (re)create. A header row is written
            so the columns can later be addressed by name.
    """
    fieldnames = ['article', 'highlights', 'id', 'article_sentences', 'article_penmans', 'summary_sentences', 'summary_penmans', 'moral_sentences', 'moral_penmans']

    with open('/content/drive/MyDrive/IP/datasets/test_selection.csv', 'r', encoding='latin-1') as file:
        csv_reader = csv.reader(file, quotechar = '"')
        next(csv_reader, None)  # skip the header row
        selection = [(line[4], line[5]) for line in csv_reader]

    # Index the wanted articles in a single pass instead of rescanning the
    # whole corpus once per selected id.
    wanted_ids = {article_id for article_id, _ in selection}
    articles_by_id = {}
    for art in corpus:
        if art['id'] in wanted_ids:
            articles_by_id.setdefault(art['id'], []).append(art)

    with tqdm.tqdm(total=len(selection)) as bar:

        # 'w' (not 'r+'): the previous mode wrote from offset 0 over whatever
        # the file already held and never emitted a header row.
        with open(csv_file, 'w', newline='') as file:
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            writer.writeheader()

            for article_id, moral_summary in selection:
                for art in articles_by_id.get(article_id, []):

                    # Preprocess the article
                    article_sents = preprocess(art['article'])
                    # Parse each article sentence to an AMR
                    article_penmans = parser.parse_sents(article_sents)
                    # Preprocess the highlights
                    summary_sents = preprocess(art['highlights'])
                    # Parse each summary sentence to an AMR
                    summary_penmans = parser.parse_sents(summary_sents)
                    # Preprocess moral summary sentences
                    moral_sents = preprocess(moral_summary)
                    # Parse each moral summary sentence to an AMR
                    moral_penmans = parser.parse_sents(moral_sents)

                    writer.writerow({'article': art['article'],
                                     'highlights': art['highlights'],
                                     'id': art['id'],
                                     'article_sentences': article_sents,
                                     'article_penmans': article_penmans,
                                     'summary_sentences': summary_sents,
                                     'summary_penmans': summary_penmans,
                                     'moral_sentences': moral_sents,
                                     'moral_penmans': moral_penmans})
                # Release cached GPU memory between articles
                torch.cuda.empty_cache()
                bar.update(1)

### Parse articles

In [None]:
# Instantiate parser: 'parse_xfm_bart_large' model
# `parser` is used as a module-level global by process_test_corpus and the SMATCH archive cell.
parser = amrlib.load_stog_model(model_dir='/content/drive/MyDrive/IP/amrlib/data/model_stog')
clear_output()

In [None]:
# Process articles to csv file
# Requires `parser` (previous cell) and `corpus` (filtered opinion articles) to be defined.
process_test_corpus(corpus, csv_file='/content/drive/MyDrive/IP/datasets/processed_test.csv')

### Load previously processed dataset

In [None]:
def load_dataset(filepath):
    """Read a processed corpus CSV into a DataFrame and tag it with a name.

    NOTE(review): this function shadows `datasets.load_dataset` imported in an
    earlier cell; after this cell runs, the HuggingFace loader is no longer
    reachable under that name.

    Args:
        filepath: path to a CSV file.

    Returns:
        pandas.DataFrame with an extra `name` attribute set to the file's base
        name without its extension (e.g. 'processed_test' for
        '.../processed_test.csv').
    """
    corpus = pd.read_csv(filepath)
    # splitext handles extensions of any length, unlike slicing off 4 characters.
    corpus.name = os.path.splitext(os.path.basename(filepath))[0] # Give name
    return corpus

In [None]:
# Load from datasets folder in drive
test_corpus = load_dataset('/content/drive/MyDrive/IP/datasets/processed_test.csv')
# Override the file-derived name; write_penman_doc uses `corpus.name` as the
# output file name, so the AMR document is written to `test_corpus.txt`.
test_corpus.name = 'test_corpus'

### Alignments

https://github.com/clab/fast_align
* Input: tokenised sentences and PENMANs (from .csv file)
* Output: list of alignments (written into .txt file)

In [None]:
# Compile C++ code and build binaries
from os.path import exists
path = '/content/drive/MyDrive/IP/amrlib/alignments/faa_aligner/fast_align-master'
os.chdir(path)
if exists(f"{path}/fast_align"):
    ! rm "{path}/fast_align"
    ! rm "{path}/atools"
    ! mkdir build
    ! cd build
    ! cmake .
    ! make
clear_output()

In [None]:
# Set environment variable to location of binaries
# This is the same directory in which fast_align is compiled in the previous cell.
os.environ['FABIN_DIR'] = '/content/drive/MyDrive/IP/amrlib/alignments/faa_aligner/fast_align-master'
from amrlib.alignments.faa_aligner import FAA_Aligner
from amrlib.evaluate.alignment_scorer import AlignmentScorer

### Write sentence AMRs to PENMAN .txt file for summary graph extraction

In [None]:
def amr_write(corpus, art_index, sent_index, penman, penman_number, output, aligner, key, snt_type):
    """Write one annotated AMR entry to `output`.

    The entry consists of an `::id`/`::snt-type` line, a `::tok` line, an
    `::alignments` line and the PENMAN graph, followed by a blank line.

    Args:
        corpus: DataFrame with an 'id' column and a `key` column holding the
            string form of a list of sentences.
        art_index: positional row of the article in `corpus`.
        sent_index: index of the sentence within the article's sentence list.
        penman: AMR graph (PENMAN) of the sentence.
        penman_number: running number appended to the entry id.
        output: writable text stream.
        aligner: object exposing `align_sents(sentences, graphs)`.
        key: name of the column holding the sentences.
        snt_type: label written after `::snt-type`.
    """
    record = corpus.iloc[art_index]

    # Assign ID and sentence type (ID = first 8 characters of the article id + running number)
    output.write(f"# ::id {record['id'][:8]}.{penman_number} ::snt-type {snt_type}\n")

    # Assign tokens
    sentences = literal_eval(record[key])
    joined_tokens = " ".join(nltk.tokenize.word_tokenize(sentences[sent_index]))
    output.write(f"# ::tok {joined_tokens}\n")

    # Create alignments (only the alignment strings are used)
    _, alignment_strings = aligner.align_sents([joined_tokens], [penman])
    output.write(f"# ::alignments {alignment_strings[0]}\n")

    # Write PENMAN
    output.write(f"{penman}\n\n")


def write_penman_doc(corpus, path, unaligned_articles=None):
    """Write all summary and article AMRs of `corpus` to `{path}/{corpus.name}.txt`.

    Every article starts with a '# ::snt-type date' separator entry. For
    articles not listed in `unaligned_articles`, the moral-summary AMRs
    (snt-type 'summary') are written first, then the article AMRs
    (snt-type 'body'). A summary sentence whose alignment raises
    AttributeError is skipped and the article index is recorded.

    Args:
        corpus: DataFrame with a `name` attribute and the columns 'id',
            'moral_sentences', 'moral_penmans', 'article_sentences' and
            'article_penmans' (list columns stored as strings).
        path: directory in which the output file is created.
        unaligned_articles: optional iterable of article indices to skip
            entirely. Defaults to [25], the index hard-coded in the previous
            version of this function.

    Returns:
        list[int]: the skipped indices plus any index for which a summary
        alignment failed during this run (each index appears once).
    """
    inference = FAA_Aligner() # Instantiate aligner object

    # SOME ARTICLES ARE NOT SUCCESSFULLY ALIGNED
    # Identify these articles and pass them in via `unaligned_articles`
    if unaligned_articles is None:
        unaligned_articles = [25]
    else:
        unaligned_articles = list(unaligned_articles)  # never mutate the caller's list

    with tqdm.tqdm(total=len(corpus)) as bar:
        with open(f"{path}/{corpus.name}.txt", "w") as output:
            output.write('CNNDM AMRs'+'\n\n')
            # Loop over articles
            for i in range(len(corpus)):
                penman_num = 1
                output.write('# ::snt-type date [signifies start of new document]'+'\n\n')
                if i not in unaligned_articles:
                    row = corpus.iloc[i]
                    # Write summaries
                    for j, penman in enumerate(literal_eval(row['moral_penmans'])):
                        try:
                            amr_write(corpus, i, j, penman, penman_num, output, inference, key='moral_sentences', snt_type='summary')
                            penman_num += 1
                        except AttributeError:
                            # Record the article once, however many of its summary sentences fail
                            if i not in unaligned_articles:
                                unaligned_articles.append(i)
                            continue
                    # Write articles
                    for k, penman in enumerate(literal_eval(row['article_penmans'])):
                        amr_write(corpus, i, k, penman, penman_num, output, inference, key='article_sentences', snt_type='body')
                        penman_num += 1

                bar.update(1)

    return unaligned_articles

In [None]:
# Write the document
# The output file is `{data_path}/{test_corpus.name}.txt`; the returned list holds the skipped article indices.
unaligned_articles = write_penman_doc(test_corpus, data_path)

100%|██████████| 50/50 [01:36<00:00,  1.92s/it]


### Measure alignments

In [None]:
inference = FAA_Aligner()

# NOTE(review): pickle.load can execute arbitrary code; only load files you created yourself.
with open ('/content/drive/MyDrive/IP/datasets/amr_proxy_dict.txt', 'rb') as data:
    amr_dict = pickle.load(data)

# Collect the reference tokens, graphs and gold alignments sentence by sentence
ref_toks = []
ref_penmans = []
ref_alignments = []
ref_alignments_list = []

for doc in amr_dict:
    for sent in doc:
        ref_alignments.append(" ".join(sent['alignments']))
        ref_alignments_list.append(sent['alignments'])
        # NOTE(review): tokens are joined WITHOUT a separator; this is only
        # correct if sent['tok'] is already a single string - confirm.
        ref_toks.append("".join(sent['tok']))
        ref_penmans.append(" ".join(sent['amr']))

# Re-align the reference sentences and score the result against the gold alignments
amr_surface_aligns, test_alignments = inference.align_sents(ref_toks, ref_penmans)
# Fixed: the loop previously iterated over the misspelled name `test_aligments`,
# which raises NameError on a fresh kernel.
test_alignments_list = list(test_alignments)

scorer = AlignmentScorer(ref_alignments_list, test_alignments_list)
scores = scorer.get_precision_recall_f1()
print(scores)

(0.7438520351727903, 0.5604987354609473, 0.6392883486116707)


# II: Create pipeline inputs

### a. Coreference resolution 

*(3-4 articles per min)*

https://github.com/kentonl/e2e-coref

* Input: tokenised stories (.txt file)
* Output: dataframe of sentences, predicted_clusters, top_spans, head_scores (.csv file)

In [None]:
# Clone the e2e-coref repository into the Colab workspace and enter it
os.chdir('/content')
! git clone https://github.com/kentonl/e2e-coref
%cd e2e-coref

# Temporary hack
# Rewrite three pinned versions in requirements.txt (MarkupSafe, scikit-learn, scipy) ...
! sed 's/MarkupSafe==1.0/MarkupSafe==1.1.1/; s/scikit-learn==0.19.1/scikit-learn==0.21/; s/scipy==1.0.0/scipy==1.6.2/' < requirements.txt > tmp
! mv tmp requirements.txt
# ... and strip the `-D_GLIBCXX_USE_CXX11_ABI=0`-style flag from setup_all.sh (the pattern uses `.` wildcards)
! sed 's/.D.GLIBCXX.USE.CXX11.ABI.0//' < setup_all.sh  > tmp
! mv tmp setup_all.sh 
! chmod u+x setup_all.sh 

# Set environment variables
os.environ['data_dir'] = "."
# $TAR is consumed by the `tar xvzf $TAR` command in the setup cell below
os.environ['TAR'] = '/content/drive/MyDrive/IP/e2e-coref/e2e-coref.tgz'
clear_output()

In [None]:
# Copy edited python files and requirements from drive to Colab space
# These overwrite same-named files in the freshly cloned repository
# (including the requirements.txt patched by `sed` in the previous cell).
! cp '/content/drive/MyDrive/IP/e2e-coref/demo.py' '/content/e2e-coref'
! cp '/content/drive/MyDrive/IP/e2e-coref/requirements.txt' '/content/e2e-coref'
! cp '/content/drive/MyDrive/IP/e2e-coref/char_vocab.english.txt' '/content/e2e-coref'
! cp '/content/drive/MyDrive/IP/e2e-coref/resolver.py' '/content/e2e-coref'

In [None]:
# Run setup from clean slate
! pip uninstall -y tensorflow
# Install the requirements from the Drive copy of the project (install log is written there too)
os.chdir('/content/drive/MyDrive/IP/e2e-coref/')
! pip install -r requirements.txt --log install-log.txt -q
! pip install scikit-learn==0.22.2 # Takes care of error
# Back in the cloned repository: run its setup script and unpack the archive named by $TAR
os.chdir('/content/e2e-coref/')
! ./setup_all.sh
! tar xvzf $TAR
clear_output()

In [None]:
# Create coref input
# Each line of tok_stories.txt is one story; split every story into sentences.
with open(f'{data_path}/tok_stories.txt', 'r', encoding='latin-1') as file:
    stories = [nltk.tokenize.sent_tokenize(line) for line in file]

# One row per story, with the sentence list stored in the 'sentences' column
amr_proxy = pd.DataFrame()
amr_proxy['sentences'] = stories
amr_proxy.to_csv('/content/e2e-coref/coref_input.csv')


In [None]:
# Run the coreference resolution program
# resolver.py is the edited script copied from Drive; it is run from the repository root.
os.chdir('/content/e2e-coref')
! GPU=1 python resolver.py final
clear_output()

In [None]:
# Copy result to data folder
# `{data_path}` is interpolated by IPython from the Python variable of the same name.
! cp "/content/e2e-coref/coref.csv" {data_path}

### b. TF-IDF

*(~800 articles per min)*
* Input: non-tokenised stories (.txt file)
* Output: dictionary of IDF scores over vocab (.json file)

In [None]:
# Materialise the CNNDM training split as a DataFrame; the IDF cell iterates over train_set['article']
train_set = pd.DataFrame(cnndm_data['train'])

In [None]:
# Calculating IDF scores for each word in the corpus

df_dict = {}
idf_dict = {}

# Build the stop-word set ONCE. The previous version called
# stopwords.words('english') and scanned the resulting list for every single
# token, which dominated the run time of this cell.
english_stopwords = set(stopwords.words('english'))


def _count_document_frequencies(texts):
    """Add 1 to df_dict for every distinct non-stop-word found in one document.

    `texts` is an iterable of strings (sentences, or a single whole story)
    that together make up the document. Words are lower-cased before counting.
    """
    seen = set()
    for text in texts:
        for word in nltk.tokenize.word_tokenize(text):
            lowered = word.lower()
            if lowered not in english_stopwords and lowered not in seen:
                seen.add(lowered)
                df_dict[lowered] = df_dict.get(lowered, 0) + 1


# Add words from dataset if AMR Proxy
if dataset == 'amr_proxy':
    with open(f'{data_path}/snt_stories.txt', 'r', encoding='latin-1') as file:
        for story in file.readlines():
            _count_document_frequencies([story])

for article in tqdm.tqdm(train_set['article'], total=len(train_set)):
    _count_document_frequencies(preprocess(article))

# Using the equation given by the authors
# NOTE(review): the numerator counts only the CNNDM training articles, even
# when AMR Proxy stories were added to the document frequencies above.
for word in df_dict.keys():
    idf_dict[word] = np.log(len(train_set) / (df_dict[word] + 1))


100%|██████████| 287113/287113 [6:16:35<00:00, 12.71it/s]


In [None]:
# Export dictionary as json file to data folder
# idf_dict maps each lower-cased word to its IDF score computed in the previous cell.
with open(f'{data_path}/idf_dict.json', 'w') as out:
    json.dump(idf_dict, out)

### c. OpenIE

*(~60 articles per min)*

https://colab.research.google.com/github/stanfordnlp/stanza/blob/master/demo/Stanza_CoreNLP_Interface.ipynb#scrollTo=s194RnNg5z95

https://pypi.org/project/stanford-openie/1.0.1/

* Input: non-tokenised stories (.txt file)
* Output: list of list of list of tuples (.txt file, pickled)

In [None]:
# Download the Stanford CoreNLP package with Stanza's installation command
! pip install stanza
import stanza
# NOTE(review): install_corenlp() runs BEFORE CORENLP_HOME is set below, so it
# presumably installs to Stanza's default location rather than the Drive
# folder - confirm which installation the client actually uses.
stanza.install_corenlp()

# Set the CORENLP_HOME environment variable to point to the installation location
os.environ["CORENLP_HOME"] = '/content/drive/MyDrive/IP/stanford-corenlp-full-2018-10-05'
clear_output()

In [None]:
# Import client module
from stanza.server import CoreNLPClient

In [None]:
# Construct a CoreNLPClient with the OpenIE annotator and port number 9001
# NOTE(review): the very large `timeout` effectively disables request timeouts;
# presumably the unit is milliseconds - confirm against the Stanza docs.
client = CoreNLPClient(timeout=150000000, be_quiet=True, annotators=['openie'], 
endpoint='http://localhost:9001')
# Start the background server and wait for some time
client.start()
time.sleep(10)
clear_output()

In [None]:
# Find relational tuples
with open(f'{data_path}/snt_stories.txt', 'r', encoding='latin-1') as file:
    stories = file.readlines()

# document_triples[story][sentence] is a list of {relation: [[start, end]]} dicts
document_triples = []

for story in stories:
    document = client.annotate(story, output_format='json')
    story_triples = []

    for sentence in document['sentences']:
        sentence_triples = []

        for triple in sentence['openie']:
            # Append triples in format {Relation: [[subject start index, object end index]]}
            entry = {triple['relation']: [[triple['subjectSpan'][0], triple['objectSpan'][-1]]]}
            # Compare the converted ENTRY with what is already stored. The
            # previous check compared the raw OpenIE triple against the
            # converted entries, so it never matched and duplicates were kept.
            if entry not in sentence_triples:
                sentence_triples.append(entry)

        if sentence_triples:
            story_triples.append(sentence_triples)

    # NOTE(review): sentences/stories without triples are dropped, so list
    # positions do not necessarily line up with sentence/story indices.
    if story_triples:
        document_triples.append(story_triples)

In [None]:
# Export list as pickled text file to data folder
# Despite the .txt extension the file is a binary pickle and must be read back with pickle.load in 'rb' mode.
with open(f'{data_path}/triples.txt', 'wb') as out:
    pickle.dump(document_triples, out)

### d. (i): MFD-2 keywords

In [None]:
# Load MFD-2 dictionary
from nltk.stem import WordNetLemmatizer, PorterStemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Write dictionary
# Each line is "<word>\t<value>"; both the lemma and the stem of the word are
# mapped to the value, unless the stem is already present or shorter than 3 characters.
with open('/content/drive/MyDrive/mfd2.0.txt','r', encoding='latin-1') as f:
    mfd_dict = {}
    for line in f:
        fields = line.split('\t')
        lemma = lemmatizer.lemmatize(fields[0])
        stem = stemmer.stem(fields[0])
        value = fields[1].split('\n')[0]
        if len(stem) > 2 and stem not in mfd_dict:
            mfd_dict[lemma] = value # lemmas
            mfd_dict[stem] = value # stems

In [None]:
# Export dictionary as json file to data folder
# mfd_dict here is the lemma/stem -> value mapping built in the previous cell (values are strings).
with open(f'{data_path}/mft_dict.json', 'w') as out:
    json.dump(mfd_dict, out)

### d. (ii): BERT similarity embeddings for sentences

In [None]:
# Load the pretrained 'bert-base-uncased' tokenizer and encoder used for the embedding cells below
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
clear_output()

In [None]:
# Load MFD2.0 dictionary
from nltk.stem import WordNetLemmatizer, PorterStemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Map every raw dictionary word to its integer value (second tab-separated field).
# This rebinds `mfd_dict`, replacing the lemma/stem dictionary built earlier.
with open('/content/drive/MyDrive/mfd2.0.txt','r', encoding='latin-1') as f:
    mfd_dict = {}
    for line in f:
        fields = line.split('\t')
        mfd_dict[fields[0]] = int(fields[1].split('\n')[0])

In [None]:
# Make dictionary of BERT word embeddings
mfd_embeddings = {}
model.eval()
# Inference only: without no_grad every stored embedding keeps its full
# autograd graph alive, which wastes memory for each dictionary word.
with torch.no_grad():
    for key in mfd_dict.keys():
        input_ids = tokenizer(key, return_tensors="pt")
        output = model(**input_ids)
        final_layer = output.last_hidden_state
        # Average over the token dimension to get one vector per dictionary word
        av_embedding = torch.mean(final_layer, dim=1)
        mfd_embeddings[key] = av_embedding

In [None]:
# Calculate average foundation embeddings and make dictionary
# For every distinct dictionary value, average the embeddings of all words carrying that value.
unique_idx = sorted(set(mfd_dict.values()))
foundation_embeddings = {}
for index in unique_idx:
    members = [mfd_embeddings[word] for word, foundation in mfd_dict.items() if foundation == index]
    foundation_embeddings[str(index)] = torch.mean(torch.stack(members), dim=0)

In [None]:
# Load previously saved foundation embeddings; this REPLACES the
# `foundation_embeddings` computed in the preceding cell.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open ('/content/drive/MyDrive/IP/datasets/foundation_embeddings', 'rb') as file:
    foundation_embeddings = pickle.load(file)

In [None]:
# Calculate average sentence embeddings and make dictionary
# Step 1: read the stories (one per line) and split each into sentences.
# NOTE(review): the path is hard-coded to `test_inputs` rather than `data_path`.
with open('/content/drive/MyDrive/IP/datasets/test_inputs/snt_stories.txt', 'r', encoding='latin-1') as f:
    stories = f.readlines()
    test_stories = [nltk.tokenize.sent_tokenize(story) for story in stories]

In [None]:
# corpus_embeddings[doc_idx][sentence_idx] -> mean BERT embedding of that sentence
corpus_embeddings = {}
model.eval()
# Inference only: no_grad stops each stored tensor from retaining its autograd graph.
with torch.no_grad():
    for doc_idx, story in enumerate(test_stories):
        story_embeddings = {}
        for story_idx, sent in enumerate(story):
            input_ids = tokenizer(sent, return_tensors="pt")
            output = model(**input_ids)
            final_layer = output.last_hidden_state
            # Average over the token dimension to get one vector per sentence
            av_embedding = torch.mean(final_layer, dim=1)
            story_embeddings[story_idx] = av_embedding
        corpus_embeddings[doc_idx] = story_embeddings


In [None]:
# Load previously saved sentence embeddings; this replaces any `corpus_embeddings` computed above.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open (f'{data_path}/corpus_embeddings', 'rb') as file:
    corpus_embeddings = pickle.load(file)

In [None]:
# Find cosine similarity between each sentence embedding and each foundation embedding

# Build a fresh nested dictionary. The previous `corpus_embeddings.copy()` was
# a SHALLOW copy, so writing the scores also overwrote the sentence tensors
# inside `corpus_embeddings` and the cell could not be re-run.
mft_embed_dict = {}
cos = torch.nn.CosineSimilarity(eps=1e-6)

# Loop over documents and sentences
for doc_idx, sentence_embeddings in corpus_embeddings.items():
    doc_scores = {}
    for sent_idx, sent_embed in sentence_embeddings.items():
        # Loop over foundations: {foundation key: cosine score as a Python float}
        doc_scores[sent_idx] = {
            key: cos(sent_embed, foundation_embed).item()
            for key, foundation_embed in foundation_embeddings.items()
        }
    mft_embed_dict[doc_idx] = doc_scores


In [None]:
# Load previously saved cosine scores; this replaces any `mft_embed_dict` computed above.
# After a JSON round trip all dictionary keys are strings.
with open (f'{data_path}/mft_embeddings.json', 'r') as file:
    mft_embed_dict = json.load(file)

In [None]:
	def softmax(z):
    t = np.exp(z)
    a = np.exp(z) / np.sum(t, axis=0)
    return a

In [None]:
# Gather the cosine scores of every sentence (one row per sentence) and
# report their overall mean, standard deviation and mean + 1 std.
score_list = np.array([
    list(sent_dict.values())
    for story_dict in mft_embed_dict.values()
    for sent_dict in story_dict.values()
])

print(np.mean(score_list))
print(np.std(score_list))
print(round(np.mean(score_list) + np.std(score_list), 4))
        

0.442458133870336
0.0904558085993614
0.5329


# Stage 2: SGE 

https://github.com/vgupta123/Unsupervised-SAS
* Input: AMRs in Proxy text format (.txt file)
* Output: predicted summary AMRs and predicted nodes (.txt files)

*Download pipeline input files to Unsupervised SAS folder of the same name to run on VSCode or run with version mounted on Google Drive.*

### Run the algorithm

In [None]:
# Choose dataset
# Re-declares the same three variables as the setup section so this stage can be run on its own.
# dataset = 'test'
dataset = 'moral_test'
data_path = f'/content/drive/MyDrive/IP/datasets/{dataset}_inputs'
output_path = f'/content/drive/MyDrive/IP/datasets/{dataset}_outputs'

In [13]:
# Ensure correct graph library
# Replace whatever networkx is installed with version 2.3 before running the pipeline
! pip uninstall -y networkx
! pip install networkx==2.3
clear_output()

In [16]:
# Run the Unsupervised SAS pipeline from its Drive folder; `{dataset}` is interpolated by IPython
os.chdir('/content/drive/MyDrive/IP/u-sas')
! python pipeline.py --dataset='{dataset}'

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Article 1 done
Article 2 done
Article 3 done
Article 4 done
Article 5 done
Article 6 done
Article 7 done
Article 8 done
Article 9 done
Article 10 done
Article 11 done
Article 12 done
Article 13 done
Article 14 done
Article 15 done
Article 16 done
Article 17 done
Article 18 done
Article 19 done
Article 20 done
Article 21 done
Article 22 done
Article 23 done
Article 24 done
Article 25 done
Article 26 done
Article 27 done
Article 28 done
Article 29 done
Article 30 done
Article 31 done
Article 32 done
Article 33 done
Article 34 done
Article 35 done
Article 36 done
Article 37 done
Article 38 done
Article 39 done
Article 40 done
Article 41 done
Article 42 done
Article 43 done
Article 44 done
Ar

In [None]:
# Copy the predicted summary AMRs and nodes from the u-sas output folder to `output_path`
! cp '/content/drive/MyDrive/IP/u-sas/{dataset}_outputs/predicted_summary_amrs.txt' '{output_path}'
! cp '/content/drive/MyDrive/IP/u-sas/{dataset}_outputs/predicted_summary_nodes.txt' '{output_path}'

# Stage 3: AMR-to-Text generation

https://github.com/bjascob/amrlib

* Input: predicted summary AMRs (.txt file)
* Output: predicted summary sentences (.txt file)

### Generate summaries

In [None]:
# Instantiate generator: 'generate_t5wtense' model
# `generator` is used as a global by the generation cell and the archive cell that reproduces generator results.
generator = amrlib.load_gtos_model(model_dir='/content/drive/MyDrive/IP/amrlib/data/model_gtos')
clear_output()

In [None]:
# Full SGE algorithm
# `suffix` is appended to `output_path` when locating the prediction files; the empty string selects the default output folder.
suffix = ''

In [None]:
from ast import literal_eval
pred_summaries = []
with open(f'{output_path}{suffix}/predicted_summary_amrs.txt', 'r', encoding='latin-1') as file:
    # Each line holds the string form of a list of AMR lines for one summary
    for amr in file:
        # Join them into the correct format for generation
        formatted_amr = ["\n".join(literal_eval(amr))]
        # Generate text
        regenerated, _ = generator.generate(formatted_amr)
        # Build the text from THIS AMR only: each sentence is prefixed with a
        # space, as before. The previous loop assigned the result inside the
        # `for`, so an empty generation silently reused the previous summary
        # (or raised NameError on the first AMR).
        text_from_amr = "".join(" " + sentence for sentence in regenerated)
        pred_summaries.append(text_from_amr)

clear_output()
# One predicted summary per line
with open(f'{output_path}{suffix}/predicted_summary_sentences.txt', 'w') as out:
    for summary in pred_summaries:
        out.write(summary + '\n')

# for text in pred_summaries:
#     print(text)

### Post-processing

In [None]:
# # OPTIONAL: load list of predicted summaries
# with open(f'{output_path}/predicted_summary_sentences.txt', 'r', encoding='latin-1') as file:
#     pred_summaries = file.readlines()

In [None]:
# Load list of reference summaries from output folder
# One summary per line; entries are paired by position with `pred_summaries` in the cells below.
with open(f'{output_path}/target_summaries.txt', 'r', encoding='latin-1') as file:
    ref_summaries = file.readlines()

In [None]:
# Drop faulty generations. A predicted summary is removed when it
#   - has fewer than 10 tokens (probably faulty generation), or
#   - starts with the same 50 characters as an earlier summary (duplicate), or
#   - contains some token more than 10 times.
# NOTE: the deletions below modify pred_summaries / ref_summaries in place.
to_remove = []
duplicate_starts = set()

for idx, text in enumerate(pred_summaries):
    tokens = nltk.tokenize.word_tokenize(text)
    prefix = text[:50]

    if (len(tokens) < 10
            or prefix in duplicate_starts
            or any(tokens.count(word) > 10 for word in tokens)):
        to_remove.append(idx)

    duplicate_starts.add(prefix)

# Remove from summary lists (highest index first so earlier positions stay valid)
for idx in sorted(to_remove, reverse=True):
    del pred_summaries[idx]
    del ref_summaries[idx]

In [None]:
# Calculate lengths of summaries
# Token counts are taken over the first len(ref_summaries) predicted/reference pairs.
num_summaries = len(ref_summaries)
pred_len = sum(len(nltk.tokenize.word_tokenize(pred_summaries[i])) for i in range(num_summaries))
ref_len = sum(len(nltk.tokenize.word_tokenize(ref_summaries[i])) for i in range(num_summaries))

avg_pred = pred_len / num_summaries
avg_ref = ref_len / num_summaries
pred_vs_ref = 100 * pred_len / ref_len

print('Number of summaries -> %5.0f' % (num_summaries))
print('Total pred words -> %5.0f' % (pred_len))
print('Average pred summary words -> %5.1f' % (avg_pred))
print('Total ref words -> %5.0f' % (ref_len))
print('Average ref summary words -> %5.1f' % (avg_ref))
print('Average pred summary words as percentage of average ref summary words -> %5.1f' %  (pred_vs_ref))

Number of summaries ->    45
Total pred words ->  2452
Average pred summary words ->  54.5
Total ref words ->  3698
Average ref summary words ->  82.2
Average pred summary words as percentage of average ref summary words ->  66.3


# Evaluation

### ROUGE

In [None]:
# Calculate average ROUGE scores (P, R, F1) over a dataset
rouge_types = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
# average_scores[metric] = [precision, recall, F-measure]
average_scores = {rouge_type: [0, 0, 0] for rouge_type in rouge_types}
N = len(ref_summaries)
# Number of components in each score (precision, recall, F-measure). This was
# previously derived from the number of ROUGE metrics, which only happened to
# be 3 as well.
K = 3

for i in range(N):
    # RougeScorer.score takes (target, prediction)
    scores = scorer.score(ref_summaries[i],
                          pred_summaries[i])
    for key in rouge_types:
        for j in range(K):
            average_scores[key][j] += scores[key][j]

# Average over the dataset; iterate the accumulator itself rather than the
# `scores` variable leaked from the loop, and leave zeros when there is no data.
if N > 0:
    for key in rouge_types:
        for j in range(K):
            average_scores[key][j] = round((average_scores[key][j] / N) , 3)

# print(average_scores)

# print(average_scores)

### Node F1 scores

In [None]:
# Define function for finding overlapping nodes (non-unique)
def intersection(lst1, lst2):
    """Return the multiset intersection of two lists.

    Every value present in both lists appears in the result as many times as
    it occurs in the list where it is rarer. The order of the result is not
    defined.
    """
    shared = list(set(lst1) & set(lst2))
    extras = []
    for node in shared:
        # One occurrence is already in `shared`; add the remaining common ones.
        repeats = min(lst1.count(node), lst2.count(node)) - 1
        if repeats > 0:
            extras += [node] * repeats
    shared += extras
    return shared

In [None]:
# Load predicted and reference summary nodes (one whitespace-separated line per summary)
with open(f'{output_path}{suffix}/predicted_summary_nodes.txt', 'r') as pred_file:
    pred_nodes = pred_file.readlines()

with open(f'{output_path}{suffix}/target_summary_nodes.txt', 'r') as ref_file:
    ref_nodes = ref_file.readlines()

# Apply the same removals as the post-processing cell, but only if the node
# lists have not been filtered yet (they are then longer than pred_summaries).
if len(to_remove) >= 1 and len(pred_nodes) > len(pred_summaries):
    for idx in sorted(to_remove)[::-1]:
        del pred_nodes[idx]
        del ref_nodes[idx]

# Accumulated [precision, recall, F1]
node_scores = [0,0,0]

for i in range(len(pred_nodes)):

    ref_tokens = ref_nodes[i].split()
    pred_tokens = pred_nodes[i].split()

    correct_nodes = intersection(ref_tokens, pred_tokens)
    # An empty line scores 0 instead of raising ZeroDivisionError
    p = len(correct_nodes) / len(pred_tokens) if pred_tokens else 0.0
    r = len(correct_nodes) / len(ref_tokens) if ref_tokens else 0.0
    # F1 is 0 when there is no overlap; previously `f1` kept the value of the
    # preceding summary (or was undefined for the first one) in that case.
    f1 = (2*p*r) / (p+r) if p+r != 0 else 0.0
    node_scores[0] += p
    node_scores[1] += r
    node_scores[2] += f1

# Average over the summaries (left at zero when there are none)
if pred_nodes:
    for i, score in enumerate(node_scores):
        node_scores[i] = round(score / len(pred_nodes), 3)

# print(node_scores)

# print(node_scores)

### BLEU, CHRF++, Meteor

In [None]:
# Keep the metric objects under their own names so that they are not
# overwritten by the float results (`bleu`, `chrf`, `meteor`) computed below.
chrf_metric = evaluate.load('chrf')
meteor_metric = evaluate.load('meteor')
bleu_metric = evaluate.load('bleu')
clear_output()

# Compute BLEU scores from reference and predicted sentences
results = bleu_metric.compute(predictions=pred_summaries, references=ref_summaries, smooth=True)
bleu = round(results['bleu'],3)
# print('BLEU score -> %5.3f' % (results['bleu']))

# Compute CHRF++ scores from reference and predicted sentences
results = chrf_metric.compute(predictions=pred_summaries, references=ref_summaries, word_order=2)
# The chrF score is divided by 100 before rounding
chrf = round(results['score']/100,3)
# print('CHRF++ score -> %5.3f' % (results['score']/100))

# Compute METEOR scores from reference and predicted sentences
results = meteor_metric.compute(predictions=pred_summaries, references=ref_summaries)
meteor = round(results['meteor'],3)
# print('METEOR score -> %5.3f' % (results['meteor']))

### Experiment admin

In [None]:
# Print all results
# Order: ROUGE-1 [1], ROUGE-1 [0], ROUGE-2 [1], ROUGE-2 [0], ROUGE-L [2],
# node F1, BLEU, CHRF++, METEOR.
# NOTE(review): indices 0/1/2 presumably are precision/recall/F-measure
# (the field order of rouge_score's Score tuple) - confirm.
print(average_scores['rouge1'][1],
      average_scores['rouge1'][0],
      average_scores['rouge2'][1],
      average_scores['rouge2'][0],
      average_scores['rougeL'][2],
      node_scores[2],
      bleu,
      chrf,
      meteor)

0.135 0.35 0.019 0.055 0.121 0.156 0.007 0.133 0.099


In [None]:
# Copy predictions to archive for safekeeping
# Choose number experiment
# `exp` becomes the file-name prefix of the archived copies
exp = '38'
# NOTE(review): these paths use `output_path` without `suffix`, unlike the
# cells that read/write the prediction files.
sent_location = f'{output_path}/predicted_summary_sentences.txt'
nodes_location = f'{output_path}/predicted_summary_nodes.txt'
amrs_location = f'{output_path}/predicted_summary_amrs.txt'
archive_location = '/content/drive/MyDrive/IP/datasets/DS predictions'

# Copy the three files into the archive folder, then rename each copy to `<exp>_<original name>`
! cp  '{sent_location}'  '{archive_location}'
! cp  '{nodes_location}'  '{archive_location}'
! cp  '{amrs_location}'  '{archive_location}'
! mv '{archive_location}/predicted_summary_sentences.txt' '{archive_location}/{exp}_predicted_summary_sentences.txt'
! mv '{archive_location}/predicted_summary_amrs.txt' '{archive_location}/{exp}_predicted_summary_amrs.txt'
! mv '{archive_location}/predicted_summary_nodes.txt' '{archive_location}/{exp}_predicted_summary_nodes.txt'

# *Archive*

*The archive contains earlier versions of functions for reference or code that is no longer needed.*

### *Reproduce parser results (SMATCH)*

In [None]:
# Parse some test sentences and write them to a file for testing
test_file = '/content/amr_annotation_3.0/data/amrs/split/test/amr-release-3.0-amrs-test-proxy.txt'

with open(test_file, encoding = 'utf-8') as infile:
    # Collect the sentence text of every '# ::snt' metadata line; the first 8
    # characters of such a line are dropped.
    test_sents = [line.rstrip()[8:] for line in infile if '# ::snt' in line]

    start_time = time.time()
    test_penmans = parser.parse_sents(test_sents)
    clear_output()
    # Should process ~2 sentences/s
    print(f'--- Parsed {len(test_sents)} sentences in %s seconds ---' % (time.time() - start_time))

# One predicted graph per block, separated by blank lines
with open(f"/content/drive/MyDrive/IP/datasets/predicted_proxy_penmans.txt", "w") as output:
    output.writelines(str(penman) + "\n\n" for penman in test_penmans)

--- Parsed 823 sentences in 471.56675267219543 seconds ---


In [None]:
# Compute the SMATCH scores and other metrics
# (`get_entries` is imported but not used in this cell.)
from amrlib.evaluate.smatch_enhanced import compute_scores, get_entries
# Gold AMRs vs. the graphs written by the parsing cell above
test_file = '/content/amr_annotation_3.0/data/amrs/split/test/amr-release-3.0-amrs-test-proxy.txt'
predict_file = '/content/drive/MyDrive/IP/datasets/predicted_proxy_penmans.txt'
compute_scores(test_file, predict_file)

Smatch           -> P: 0.825,  R: 0.885,  F: 0.854
Unlabeled        -> P: 0.848,  R: 0.910,  F: 0.878
No WSD           -> P: 0.828,  R: 0.889,  F: 0.857
Non_sense_frames -> P: 0.943,  R: 0.936,  F: 0.939
Wikification     -> P: 0.000,  R: 0.000,  F: 0.000
Named Ent.       -> P: 0.917,  R: 0.929,  F: 0.923
Negations        -> P: 0.731,  R: 0.721,  F: 0.726
IgnoreVars       -> P: 0.722,  R: 0.803,  F: 0.760
Concepts         -> P: 0.925,  R: 0.932,  F: 0.928
Frames           -> P: 0.919,  R: 0.913,  F: 0.916
Reentrancies     -> P: 0.756,  R: 0.757,  F: 0.757
SRL              -> P: 0.834,  R: 0.833,  F: 0.833


### *Reproduce generator results (BLEU, CHRF++, Meteor)*

In [None]:
# Regenerate sentences from the reference AMRs of the proxy test split
test_file = '/content/amr_annotation_3.0/data/amrs/split/test/amr-release-3.0-amrs-test-proxy.txt'
test_amrs, amr = [], ''

with open(test_file, encoding = 'utf-8') as infile:
    for raw_line in infile:
        stripped = raw_line.rstrip()
        if '# ::id' in stripped:
            # An id line opens a new entry: store whatever was collected before it
            test_amrs.append(amr)
            amr = ''
        amr += stripped + '\n'
# Store the final entry, which no later id line flushes
test_amrs.append(amr)

# The first element holds everything before the first id line, so it is not an AMR
reference_amrs = test_amrs[1:]

start_time = time.time()
pred_sents, _ = generator.generate(reference_amrs, use_tense=False)
clear_output()
print(f'--- Generated {len(reference_amrs)} sentences in %s seconds ---' % (time.time() - start_time))

--- Generated 823 sentences in 66.9390721321106 seconds ---


In [None]:
# Score the generated sentences against the reference sentences (BLEU, CHRF++, METEOR)
chrf = evaluate.load('chrf')
meteor = evaluate.load('meteor')
from amrlib.evaluate.bleu_scorer import BLEUScorer

# Reference sentences: the text after the 8-character '# ::snt ' prefix
with open(test_file, encoding = 'utf-8') as infile:
    stripped_lines = [raw.rstrip() for raw in infile]
test_sents = [entry[8:] for entry in stripped_lines if '# ::snt' in entry]

# BLEU is computed on word tokens; one pair per generated sentence
tokenize_words = nltk.tokenize.word_tokenize
pair_positions = range(len(pred_sents))
test_token_list = [tokenize_words(test_sents[k]) for k in pair_positions]
pred_token_list = [tokenize_words(pred_sents[k]) for k in pair_positions]

bleu_scorer = BLEUScorer()
bleu_score, ref_len, hyp_len = bleu_scorer.compute_bleu(test_token_list, pred_token_list)
print('BLEU score -> %5.3f' % (bleu_score))

# CHRF++ on the untokenised sentences; the reported score is divided by 100
results = chrf.compute(predictions=pred_sents, references=test_sents, word_order=2)
print('CHRF++ score -> %5.3f' % (results['score']/100))

# METEOR on the untokenised sentences
results = meteor.compute(predictions=pred_sents, references=test_sents)
print('METEOR score -> %5.3f' % (results['meteor']))

### *Transformer summarisation*

In [None]:
# Number of word tokens in each reference summary.
# Iterate over ref_summaries directly so the counts do not depend on how many
# predicted summaries exist.
word_counts = [len(nltk.tokenize.word_tokenize(ref_summary)) for ref_summary in ref_summaries]
# Length of the longest reference summary (0 when there are no references)
max(word_counts, default=0)

In [None]:
# Build a Hugging Face summarisation pipeline from the facebook/bart-large-cnn checkpoint
from transformers import pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# Clear whatever the pipeline printed while it was being created
clear_output()

In [None]:
def load_dataset(filepath):
    """Read a CSV file into a DataFrame and tag it with a name taken from the file name.

    Args:
        filepath: Path of the file to read with ``pd.read_csv``.

    Returns:
        The loaded DataFrame. Its ``name`` attribute holds the file name
        without the directory part and without the extension.
    """
    corpus = pd.read_csv(filepath)
    # splitext removes the extension whatever its length; a fixed [:-4] slice is
    # only right for three-letter extensions such as '.csv'
    corpus.name = os.path.splitext(os.path.basename(filepath))[0] # Give name
    return corpus

In [None]:
# Load from datasets folder in drive
# readlines() returns one list entry per line of the file, trailing newline included.
# NOTE(review): presumably one story per line — confirm against how snt_stories.txt is written
with open ('/content/drive/MyDrive/IP/datasets/cnndm_inputs/snt_stories.txt', 'r', encoding='latin-1') as file:
    stories = file.readlines()

In [None]:
# Summarise every story with the BART pipeline, keeping one output entry per story
transformer_summaries = []
for story in stories:
    try:
        summary = summarizer(story, max_length=64, do_sample=False)[0]['summary_text']
    except IndexError:
        # Keep the list aligned with `stories` by storing the placeholder string
        print('INDEX ERROR')
        summary = "None"
    else:
        print(summary)
    transformer_summaries.append(summary)

In [None]:
# Use the BART summaries as the predicted summaries (same list object, not a copy)
pred_summaries = transformer_summaries

### *Test: SPRING API*

In [None]:
def sentence_to_amr(sentence_list, timeout=60):
    """Parse sentences to AMRs with the SPRING web API, one request per sentence.

    Args:
        sentence_list: Sentences (strings) to send to the API.
        timeout: Seconds to wait for each HTTP response. Without a timeout a
            stalled connection would block the cell indefinitely.

    Returns:
        A list holding the decoded JSON body of each response, in input order.

    Raises:
        requests.exceptions.RequestException: If a request fails or times out.
        ValueError: If a response body is not valid JSON.
    """
    url_base = "https://nlp.uniroma1.it/spring/api/text-to-amr?sentence="
    amrs = []
    # tqdm wraps the iterable directly, so the bar advances once per sentence
    for sent in tqdm.tqdm(sentence_list):
        # URL encode the sentence and attach it to the GET request
        url = url_base + urllib.parse.quote(sent)
        response = requests.get(url, timeout=timeout)
        amrs.append(response.json())
    return amrs

In [None]:
# Send every sentence of `sent_list` (not defined in this cell) to the SPRING parser
amrs = sentence_to_amr(sent_list)

In [None]:
def amr_to_text(amr_list, timeout=60):
    """Generate text from AMRs with the SPRING web API, one request per AMR.

    Args:
        amr_list: Records whose ``'penman'`` entry holds the AMR string to send.
        timeout: Seconds to wait for each HTTP response. Without a timeout a
            stalled connection would block the cell indefinitely.

    Returns:
        A list holding the decoded JSON body of each response that could be
        decoded. AMRs whose response is not valid JSON are skipped after
        printing "Decoder error", so the result can be shorter than the input.

    Raises:
        requests.exceptions.RequestException: If a request fails or times out.
    """
    url_base = "https://nlp.uniroma1.it/spring/api/amr-to-text?penman="
    sentences = []
    # tqdm wraps the iterable directly, so the bar advances once per AMR
    for amr in tqdm.tqdm(amr_list):
        # URL encode the AMR and attach it to the GET request
        url = url_base + urllib.parse.quote(amr['penman'])
        response = requests.get(url, timeout=timeout)
        try:
            sentences.append(response.json())
        except ValueError:
            # JSON decoding errors are ValueError subclasses; other errors propagate
            print("Decoder error")
    return sentences

In [None]:
# Turn the parsed AMRs back into text through the SPRING API
regenerated = amr_to_text(amrs)

In [None]:
# Concatenate the regenerated sentences into one article string.
# join() also defines article_from_amr (as "") when `regenerated` is empty.
# NOTE(review): sentences are joined without a separator — confirm the API output
# already ends each sentence with whitespace
article_from_amr = "".join(x['sentence'] for x in regenerated)

In [None]:
# Print the two texts one after the other for comparison.
# NOTE(review): `article` is not defined in this section — presumably the source text
# behind `sent_list`; confirm
print(article)
print(article_from_amr)

### *Combining AMRs and alignments*

In [None]:
amrs = '/content/amr_annotation_3.0/data/amrs/split/dev/amr-release-3.0-amrs-dev-proxy.txt'
alignments = '/content/amr_annotation_3.0/data/alignments/split/dev/amr-release-3.0-alignments-dev-proxy.txt'
outfile = '/content/sample_data/combined_dev.txt'

# Read the alignments file once; `alignments` keeps holding the path so the cell
# can be re-run
with open(alignments, 'r') as align_file:
    data = align_file.readlines()

# Index the alignments file: sentence id -> positions of its '::id' line(s).
# One pass here replaces a full rescan of `data` for every AMR.
id_positions = {}
for pos, align_line in enumerate(data):
    if '::id' in align_line:
        align_id = align_line.split('id')[1].split(' ')[1].strip()
        id_positions.setdefault(align_id, []).append(pos)

with open(amrs, 'r') as in_1, open(outfile, 'w') as out:
    for line in in_1:
        # Every line of the AMR file is copied through unchanged
        out.write(line)

        if '::id' in line:
            sent_id = line.split('id')[1].split(' ')[1].strip()
            # Append the two lines that follow the matching id in the alignments file
            for pos in id_positions.get(sent_id, []):
                out.write(data[pos + 1])
                out.write(data[pos + 2])


### *Writing predicted AMRs and alignments to input file*

In [None]:
# Parse the sentences of the AMR proxy test split
test_file = '/content/amr_annotation_3.0/data/amrs/split/test/amr-release-3.0-amrs-test-proxy.txt'

# Keep the text after the 8-character '# ::snt ' prefix of every sentence line
with open(test_file, encoding = 'utf-8') as infile:
    test_sents = [entry[8:] for entry in map(str.rstrip, infile) if '# ::snt' in entry]

start_time = time.time()
test_penmans = parser.parse_sents(test_sents)
clear_output()
# Should process ~2 sentences/s
print(f'--- Parsed {len(test_sents)} sentences in %s seconds ---' % (time.time() - start_time))

# The predictions are not written to disk here; the 'Reproduce parser results'
# cell writes them to predicted_proxy_penmans.txt

--- Parsed 823 sentences in 479.1572017669678 seconds ---


In [None]:
inference = FAA_Aligner()

# NOTE(review): pickle.load can execute code embedded in the file — only load a trusted file
with open ('/content/drive/MyDrive/IP/datasets/amr_proxy_dict.txt', 'rb') as data:
    amr_dict = pickle.load(data)

# Flatten the documents into one list of sentence records
sentence_records = [sent for doc in amr_dict for sent in doc]

ref_alignments_list = [record['alignments'] for record in sentence_records]
ref_alignments = [" ".join(entry) for entry in ref_alignments_list]
# NOTE(review): 'tok' is joined WITHOUT a separator while the other fields use a space —
# confirm sent['tok'] is already a single string
ref_toks = ["".join(record['tok']) for record in sentence_records]
ref_penmans = [" ".join(record['amr']) for record in sentence_records]

# NOTE(review): the last predicted AMR is dropped before aligning — confirm why
amr_surface_aligns, test_alignments = inference.align_sents(ref_toks, test_penmans[:-1])
test_alignments_list = list(test_alignments)


In [None]:
amrs = '/content/amr_annotation_3.0/data/amrs/split/test/amr-release-3.0-amrs-test-proxy.txt'
alignments = '/content/amr_annotation_3.0/data/alignments/split/test/amr-release-3.0-alignments-test-proxy.txt'
outfile = '/content/sample_data/proxy_pred_amrs.txt'

# Write input with predicted penmans but reference alignments
# `alignments` keeps holding the path so the cell can be re-run
with open(alignments, 'r') as align_file:
    data = align_file.readlines()

# Index the alignments file: sentence id -> positions of its '::id' line(s).
# One pass here replaces a full rescan of `data` for every AMR.
id_positions = {}
for pos, align_line in enumerate(data):
    if '::id' in align_line:
        align_id = align_line.split('id')[1].split(' ')[1].strip()
        id_positions.setdefault(align_id, []).append(pos)

with open(amrs, 'r') as in_1, open(outfile, 'w') as out:

    out.write('AMR release; corpus: proxy; section: test; number of AMRs: 823' + '\n\n')
    # Position of the next predicted AMR to write
    counter = 0
    for line in in_1:

        # Copy the metadata lines, except the sentence line
        if line.startswith('# ::') and 'snt ' not in line:
            out.write(line)

        if '::id' in line:
            sent_id = line.split('id')[1].split(' ')[1].strip()
            # Append the two lines that follow the matching id in the alignments file
            for pos in id_positions.get(sent_id, []):
                out.write(data[pos + 1])
                out.write(data[pos + 2])

        # `line` is still the AMR-file line here (the lookup above does not rebind it),
        # so only the opening line of a reference graph triggers a write
        if line.startswith('('):
            # The reference graph is replaced by the predicted one
            out.writelines(test_penmans[counter])
            out.write('\n\n')
            counter += 1


In [None]:
# Inspect the first predicted AMR: a '# ::snt' line followed by the graph (see the output below)
test_penmans[0]

'# ::snt 2007-08-21\n(d / date-entity\n      :year 2007\n      :month 8\n      :day 21)'

In [None]:
amrs = '/content/amr_annotation_3.0/data/amrs/split/test/amr-release-3.0-amrs-test-proxy.txt'
alignments = '/content/amr_annotation_3.0/data/alignments/split/test/amr-release-3.0-alignments-test-proxy.txt'
outfile = '/content/sample_data/proxy_pred_amrs_2.txt'

# Write input with predicted penmans and predicted alignments
# `alignments` keeps holding the path so the cell can be re-run
with open(alignments, 'r') as align_file:
    data = align_file.readlines()

# Index the alignments file: sentence id -> positions of its '::id' line(s).
# One pass here replaces a full rescan of `data` for every AMR.
id_positions = {}
for pos, align_line in enumerate(data):
    if '::id' in align_line:
        align_id = align_line.split('id')[1].split(' ')[1].strip()
        id_positions.setdefault(align_id, []).append(pos)

with open(amrs, 'r') as in_1, open(outfile, 'w') as out:

    out.write('AMR release; corpus: proxy; section: test; number of AMRs: 823' + '\n\n')
    # Position of the next predicted AMR / alignment to write
    counter = 0
    for line in in_1:

        # Copy the metadata lines, except the sentence line
        if line.startswith('# ::') and 'snt ' not in line:
            out.write(line)

        if '::id' in line:
            sent_id = line.split('id')[1].split(' ')[1].strip()
            for pos in id_positions.get(sent_id, []):
                # Line after the id from the alignments file, then the PREDICTED alignment
                out.write(data[pos + 1])
                out.write(f'# ::alignments {test_alignments_list[counter]}'+'\n')

        # `line` is still the AMR-file line here (the lookup above does not rebind it),
        # so only the opening line of a reference graph triggers a write
        if line.startswith('('):
            # The reference graph is replaced by the predicted one
            out.writelines(test_penmans[counter])
            out.write('\n\n')
            counter += 1

        # NOTE(review): stops once counter reaches len(test_alignments_list) - 1, so the
        # entry for the last predicted alignment is never written — confirm this is intended
        if counter == (len(test_alignments_list) - 1):
            break


        

### *Export selection of articles from corpus*

In [None]:
# Cell for choosing and exporting random articles from corpus

import random

df_temp = pd.DataFrame(corpus)

# random.sample draws distinct row positions from range(len(df_temp)), so every one is
# valid for .iloc. random.randint(0, len(df_temp)) would include len(df_temp) itself,
# which is out of bounds.
# Capping at the corpus size avoids an error when there are fewer than 50 rows.
sample_size = min(50, len(df_temp))
random_ints = random.sample(range(len(df_temp)), sample_size)
print(random_ints)
random_select = df_temp.iloc[random_ints]
random_select.to_csv('/content/drive/MyDrive/IP/datasets/test_selection.csv')

[2866, 2443, 2382, 1766, 1474, 4807, 2100, 1048, 3154, 1448, 539, 1764, 2996, 1728, 4113, 875, 2076, 3428, 4456, 3525, 881, 1717, 418, 1801, 2332, 1373, 1478, 4478, 812, 2901, 4068, 745, 1128, 1684, 799, 1970, 3702, 3917, 1020, 1083, 3501, 3970, 2861, 3119, 527, 582, 2785, 2794, 4110, 165]


### *Processing to dataframes*

In [None]:
# Function for parsing sentences of an article and compiling a dataframe
def process_corpus(corpus, start_idx, end_idx, key='article', visualise=False, regenerate=False):
    """ Takes a corpus as a Huggingface dataset and returns a subset as a Pandas 
        dataframe with: 
            (1) a list of Penman format AMRs for each sentence, 
            (2) a list of graphical AMRs for each sentence (optional)
            (3) a regenerated article (optional).

        Args:
            corpus: Dataset offering ``select(indices=...)``; iterating the
                selection yields records indexable by ``key``.
            start_idx: Position of the first record to process (inclusive).
            end_idx: Position after the last record to process (exclusive).
            key: Field of each record that holds the text to process.
            visualise: If True, add a "graphs" column built with ``produce_graph``.
            regenerate: If True, add a "regen" column with the text generated
                back from the AMRs of each record.

        Returns:
            A DataFrame of the selected records with the added columns
            "sentences" and "penmans" (plus "graphs" / "regen" when requested).
            Its ``name`` attribute is set to ``'{key}_{start_idx}_{end_idx}'``.

        Relies on the notebook-level names ``preprocess``, ``parser``,
        ``produce_graph`` and ``generator``.
    """
    start_time = time.time()
    num_records = int(end_idx - start_idx)
    corpus_subset = corpus.select(indices=list(range(start_idx, end_idx)))

    # One entry per processed record, in record order
    sentences = []
    penmans = []
    graphs = []
    regen = []

    with tqdm.tqdm(total=num_records) as bar:

        for art in corpus_subset:

            # Preprocess the article
            sentence_list = preprocess(art[key])
            sentences.append(sentence_list)

            # Parse each sentence to an AMR
            penman_list = parser.parse_sents(sentence_list)
            penmans.append(penman_list)

            # Draw a graph for each sentence
            if visualise is True:
                try:
                    graphs.append([produce_graph(amr_string) for amr_string in penman_list])
                except IndexError:
                    # Keep one entry per record even when drawing fails
                    print("Could not draw graph.")
                    graphs.append("Missing graph")

            # Regenerate the article from the AMRs
            if regenerate is True:
                regenerated, _ = generator.generate(penman_list)
                # Each sentence is prefixed with a space; join() also yields ""
                # (instead of a stale or missing value) when nothing was generated
                regen.append("".join(" " + sent for sent in regenerated))

            bar.update(1)

    # Convert to dataframe in order to add graph column
    corpus_subset = pd.DataFrame(corpus_subset)
    corpus_subset["sentences"] = sentences
    corpus_subset["penmans"] = penmans
    if visualise is True:
        corpus_subset["graphs"] = graphs
    # Test the `regenerate` flag: `regen` is a list and is never identical to True,
    # so testing it would always skip this column
    if regenerate is True:
        corpus_subset["regen"] = regen

    clear_output()

    print(f'---Processed {num_records} {key} in {(time.time() - start_time)}seconds ---')

    corpus_subset.name = f'{key}_{start_idx}_{end_idx}'

    return corpus_subset

### *Understanding OpenIE*

In [None]:
# Annotate a two-sentence example and gather the OpenIE triples of all its sentences
text = "Albert Einstein was a German-born theoretical physicist. He developed the theory of relativity."
document = client.annotate(text, output_format='json')
# Keep only the three fields of interest from each triple
triples = [
    {field: triple[field] for field in ('subject', 'relation', 'object')}
    for sentence in document['sentences']
    for triple in sentence['openie']
]
print(triples)

[{'subject': 'Albert Einstein', 'relation': 'was', 'object': 'theoretical physicist'}, {'subject': 'Einstein', 'relation': 'was', 'object': 'born'}, {'subject': 'Albert Einstein', 'relation': 'was', 'object': 'born theoretical physicist'}, {'subject': 'He', 'relation': 'developed', 'object': 'theory of relativity'}, {'subject': 'He', 'relation': 'developed', 'object': 'theory'}]


### *Redundant TF-IDF*

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

In [None]:
# TF-IDF weight of every vocabulary term for the first document of `corpus`.
# NOTE(review): stop_words expects 'english', a list of words or None; the `stopwords`
# name imported at the top of the notebook is the NLTK corpus reader — confirm it has
# been rebound to a word list, e.g. stopwords.words('english')
tfIdfVectorizer=TfidfVectorizer(use_idf=True, stop_words=stopwords)
tfIdf = tfIdfVectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); support both, since this notebook also installs
# scikit-learn 0.23.1, which only has the old method
if hasattr(tfIdfVectorizer, 'get_feature_names_out'):
    feature_names = tfIdfVectorizer.get_feature_names_out()
else:
    feature_names = tfIdfVectorizer.get_feature_names()
# Row 0 of the matrix is the first document; transpose it into one row per term
df = pd.DataFrame(tfIdf[0].T.todense(), index=feature_names, columns=["TF-IDF"])
# term -> TF-IDF weight (the name `idf` is kept, but the values are TF-IDF weights)
idf = df.to_dict()
idf = idf['TF-IDF']
print(idf)





### *SUPERT*

In [None]:
# Clone the SUPERT repository into /content and install its requirements
os.chdir('/content')
! git clone 'https://github.com/yg211/acl20-ref-free-eval.git'
os.chdir('/content/acl20-ref-free-eval')
! pip install -r requirements.txt
# Replace the installed scikit-learn with version 0.23.1.
# NOTE(review): presumably the version SUPERT needs — confirm; the pin applies to the
# whole environment, not just to SUPERT
! pip uninstall scikit-learn -y
! pip install scikit-learn==0.23.1
clear_output()

In [None]:
# Write stories and summaries to individual file for ease of use with SUPERT program
from os import path
from nltk.tokenize import sent_tokenize
os.chdir('/content/acl20-ref-free-eval')

# NOTE(review): the BLANC cell reads this same file with encoding='latin-1'; here the
# platform default is used — confirm which one is right
with open(f'{data_path}/snt_stories.txt', 'r') as file:
    stories = file.readlines()

for idx, story in enumerate(stories):
    story_dir = f'data/{dataset}/{idx}'
    # makedirs(exist_ok=True) also creates missing parent folders and adds sub-folders
    # that are missing from a story folder that already exists
    os.makedirs(path.join(story_dir, 'summaries'), exist_ok=True)
    os.makedirs(path.join(story_dir, 'input_docs'), exist_ok=True)
    with open(f'{story_dir}/input_docs/story.txt', 'w') as out:
        # Each sentence goes on its own line between <P> and </P>
        # NOTE(review): the file ends with an opening <P> that is never closed —
        # confirm the SUPERT reader tolerates this
        out.write('<P>' + '\n')
        for sent in sent_tokenize(story):
            out.write(sent + '\n')
            out.write('</P>' + '\n')
            out.write('<P>' + '\n')

# One summary file per story folder; the folder must exist from the loop above
for idx, summary in enumerate(pred_summaries):
    with open(f'data/{dataset}/{idx}/summaries/summary.txt', 'w') as out:
        out.write(summary)

In [None]:
# Score every predicted summary against its source story with SUPERT
from ref_free_metrics.supert import Supert
from utils.data_reader import CorpusReader

# read docs and summaries
# reader = CorpusReader(f'{data_path}/snt_summaries.txt')
for idx, _ in enumerate(pred_summaries):
    # One reader per story folder (data/<dataset>/<idx>)
    reader = CorpusReader(f'data/{dataset}/{idx}')
    source_docs = reader()
    summaries = reader.readSummaries() 

    # compute the Supert scores
    supert = Supert(source_docs, ref_metric='top15') 
    supert_scores = supert(summaries)
    # Scores are only printed; `supert_scores` is overwritten on every iteration
    print(supert_scores)
# NOTE(review): the disabled average below would only cover the LAST story, because
# `supert_scores` is not accumulated across iterations
# av_supert_score = sum(supert_scores)/len(supert_scores)
# print('SUPERT -> %5.3f' % (av_supert_score))

### *Reference free scores (BLANC, SUPERT)*

In [None]:
# Install BLANC and pytorch_transformers, then create the BLANC-help scorer
! pip install blanc
! pip install pytorch_transformers
# BlancTune is imported but not used in this cell
from blanc import BlancHelp, BlancTune
blanc_help = BlancHelp()
clear_output()

In [None]:
# Read the stories from `data_path` (not defined in this cell), one list entry per line
with open(f'{data_path}/snt_stories.txt', 'r', encoding='latin-1') as file:
    stories = file.readlines()

In [None]:
# Score each (story, predicted summary) pair with BLANC-help and report the mean
blanc_scores = blanc_help.eval_pairs(stories, pred_summaries)
# Arithmetic mean of the scores (raises ZeroDivisionError if there are none)
av_blanc_score = sum(blanc_scores)/len(blanc_scores)
print('BLANC -> %5.3f' % (av_blanc_score))

100%|██████████| 2700/2700 [08:05<00:00,  5.56it/s]


BLANC -> 0.090


In [None]:
# Clone the reference-free summary rewarder and make its folder the working directory,
# so the import and the relative model path in the next cell resolve
os.chdir('/content')
! git clone 'https://github.com/yg211/summary-reward-no-reference.git'
os.chdir('/content/summary-reward-no-reference')
clear_output()

In [None]:
from rewarder import Rewarder
rewarder = Rewarder(os.path.join('trained_models','sample.model'))
# For every story, print the rewarder score of the predicted summary next to the
# score of the reference summary
for idx, article in enumerate(stories):
    summary, ref = pred_summaries[idx], ref_summaries[idx]
    print(rewarder(article, summary), rewarder(article, ref))