In [None]:
# !pip install transformers

In [None]:
import json
import os
from typing import List
import ast
import nltk
import re
import torch
import pandas as pd
from tqdm import tqdm
from transformers import T5Tokenizer, T5ForConditionalGeneration
# from transformers import PegasusForConditionalGeneration, PegasusTokenizer
# from transformers import TransfoXLTokenizer, TransfoXLLMHeadModel
# from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import AutoModel, AutoTokenizer
import bert_score
import functools
import networkx as nx

from pyrouge import Rouge155
import time
import shutil
import numpy as np
import nltk
from simcse import SimCSE
import argparse
from scipy import spatial
from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize

import matplotlib.pyplot as plt


from IPython.utils import io
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
 # Define the target device. Use GPU if available.
# Restrict this process to GPU index 0 through the CUDA_VISIBLE_DEVICES environment variable.
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
# `device` is the string passed to every model/tensor `.to(...)` call further down.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [None]:
def generate_salient_texts(
        text,
        model,
        tokenizer,
        device,
        num_texts_per_section=7,
        temperature=0.5,
        max_len=64
    ):
    """
    Sample several short "salient texts" for one passage with a seq2seq model.

    Params:
        text: <string> : passage the generation is conditioned on
        model: object exposing `generate(...)` (a T5 model in this notebook)
        tokenizer: object exposing `encode(...)` and `decode(...)`
        device: device the encoded input is moved to
        num_texts_per_section: <int> : number of sequences sampled for the passage
        temperature: <number> : sampling temperature
        max_len: <int> : maximum length of each generated sequence

    Returns:
        Array[<string>] : decoded generations; an empty list when generation
        (or decoding) raised a RuntimeError, in which case a diagnostic is printed
    """

    input_ids = tokenizer.encode(text, return_tensors='pt').to(device)

    salient_texts = []
    try:
        outputs = model.generate(
            input_ids=input_ids,
            max_length=max_len,
            do_sample=True,
            top_k=10,
            num_return_sequences=num_texts_per_section,
            temperature=temperature
        )
        salient_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    except RuntimeError as err:
        # The encoded input is 2-D (one row per passage), so len(input_ids) only
        # reports the batch size; the last dimension is the actual token count.
        print(f"Generation failed for an input of {input_ids.shape[-1]} tokens: {err}")

    return salient_texts

In [None]:
def get_T5_model(model,device):
    """
    Load a T5 tokenizer and conditional-generation model and move the model to `device`.

    Params:
        model: name or path handed to both `from_pretrained` calls
        device: target passed to the model's `.to(...)`

    Returns:
        (model_T5, tokenizer_T5) : the loaded model followed by its tokenizer
    """
    tokenizer_T5 = T5Tokenizer.from_pretrained(model)
    model_T5 = T5ForConditionalGeneration.from_pretrained(model)
    # Moved to the device for its side effect only; the return value is not needed.
    model_T5.to(device)
    return model_T5, tokenizer_T5


In [None]:
# Checkpoint used to generate the salient texts. The commented names are alternatives
# that were tried; note that get_T5_model loads with the T5 classes only.
# generative_model_base = "facebook/bart-large-cnn"
generative_model_base = "doc2query/S2ORC-t5-base-v1"
# generative_model_base ="google/pegasus-multi_news"
# generative_model_base = "mrm8488/t5-base-finetuned-summarize-news"
 # define model for gen_text generation
generative_model, generative_tokenizer = get_T5_model(generative_model_base,device)

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/702 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [None]:
# !pip install --upgrade numpy

In [None]:
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from scipy.spatial import distance
from nltk.corpus import stopwords

# K-means model used to cluster the generated salient texts; one representative
# sentence is later picked per cluster, so n_clusters also bounds how many are kept.
# n_clusters=5
n_clusters=7
model_kmeans = KMeans(n_clusters, init = 'k-means++', random_state = 42)

In [None]:
def get_cluster_representative_sentence_weights(sentence,
                                                model_kmeans,
                                                n_clusters):
    """
    Cluster sentences and return one representative sentence per cluster.

    Every sentence is lower-cased, stripped of non-letters and English stop
    words, and embedded as the mean of its Word2Vec word vectors (the Word2Vec
    model is trained here on the given sentences, without a fixed seed). The
    embeddings are clustered with `model_kmeans`; for every cluster the sentence
    closest (Euclidean) to the cluster centre is kept.

    Params:
        sentence: Array[<string>] : sentences to cluster
        model_kmeans: clustering model exposing `fit_predict` and `cluster_centers_`
        n_clusters: <int> : number of cluster labels to look up (0 .. n_clusters-1)

    Returns:
        dict[<string>, <int>] : representative sentence -> number of sentences in
        its cluster. Clusters without members are skipped.
    """
    # Build the stop-word set once instead of re-reading the list for every token.
    stop_words = set(stopwords.words('english'))

    corpus = []
    for raw_sentence in sentence:
        tokens = re.sub('[^a-zA-Z]', " ", raw_sentence).lower().split()
        corpus.append(' '.join(tok for tok in tokens if tok not in stop_words))

    n = 300
    all_words = [cleaned.split() for cleaned in corpus]
    try:
        # gensim >= 4 names the embedding dimension `vector_size` ...
        model_word_vec = Word2Vec(all_words, min_count=1, vector_size=n)
    except TypeError:
        # ... while gensim 3.x only accepts `size`.
        model_word_vec = Word2Vec(all_words, min_count=1, size=n)

    sen_vector = []
    for words in all_words:
        if words:
            # Mean word vector: divide by the number of words. The previous
            # `plus/len(plus)` divided by the vector dimension (300) instead.
            sen_vector.append(np.mean([model_word_vec.wv[word] for word in words], axis=0))
        else:
            # Nothing left after cleaning: use a zero vector instead of crashing.
            sen_vector.append(np.zeros(n))

    y_kmeans = model_kmeans.fit_predict(sen_vector)

    cluster_rep = {}
    for cluster_id in range(n_clusters):
        centre = model_kmeans.cluster_centers_[cluster_id]
        distances = {
            idx: distance.euclidean(centre, sen_vector[idx])
            for idx, label in enumerate(y_kmeans)
            if label == cluster_id
        }
        if not distances:
            # min() on an empty cluster would raise; there is nothing to represent.
            continue
        closest_idx = min(distances, key=distances.get)
        # Weight = cluster size, i.e. how many generated texts agree with this one.
        cluster_rep[sentence[closest_idx]] = len(distances)

    return cluster_rep

In [None]:
def flatten(t):
    """Concatenate the items of every sub-iterable of `t` into one flat list."""
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat

def calculate_similarity_bert_score(
        model_bertscore,
        tokenizer_bertscore,
        salient_texts,
        doc_text,
        model_type,
        all_layers,
        num_layers,
        device
    ):
    """
    Score every (salient text, document sentence) pair with `bert_score.score`.

    Both inputs are expanded to flat, equally long lists ordered salient-text
    major: all sentences for the first salient text, then all sentences for the
    second, and so on.

    Returns:
        the F1 value returned by `bert_score.score` for the expanded pairs
    """
    n_sentences = len(doc_text)

    # each salient text repeated once per sentence -> no_salient_texts*num_sentences entries
    repeated_texts = np.array(
        [[str(gen_text)] * n_sentences for gen_text in salient_texts]
    ).astype('str')
    salient_texts_compare_array = [cell for row in repeated_texts for cell in row]

    # the full sentence list repeated once per salient text -> same length as above
    tiled_sentences = np.array([doc_text for _ in salient_texts]).astype('str')
    sentences_compare_array = [cell for row in tiled_sentences for cell in row]

    _precision, _recall, f1 = bert_score.score(
        salient_texts_compare_array,
        sentences_compare_array,
        model_bertscore,
        tokenizer_bertscore,
        model_type,
        num_layers=num_layers,
        device=device,
        verbose=False,
        all_layers=all_layers,
        batch_size=64
    )
    return f1

In [None]:

def calculate_similarity_simsce(model_simcse,salient_texts,doc_text,device):
    """
    Return `model_simcse.similarity` between the salient texts and the sentences.

    Both inputs are copied into plain lists first. `device` is accepted but not used.
    """
    queries = list(salient_texts)
    candidates = list(doc_text)
    return model_simcse.similarity(queries, candidates)

def calculate_similarity_sentence_transformers(model,salient_texts,doc_text):
    """
    Build a matrix of cosine *distances* between salient texts and sentences.

    Returns:
        np.array of shape (len(salient_texts), len(doc_text)); entry [i, j] is
        `spatial.distance.cosine` between sentence j and salient text i, so
        smaller means more similar.
    """
    sentence_vectors = model.encode(doc_text)
    text_vectors = model.encode(salient_texts)
    distances = np.zeros((len(salient_texts), len(doc_text)))
    for row, text_vector in enumerate(text_vectors):
        for col, sentence_vector in enumerate(sentence_vectors):
            distances[row, col] = spatial.distance.cosine(sentence_vector, text_vector)
    return distances

In [None]:
# Load the dataset index; the summarisation loop reads the "documents" and "summary" columns of each row.
df = pd.read_csv("DUC2004.csv")

In [None]:
similarity_model_name = 'bert_score'
similarity_model_path = "bert-base-uncased"

all_layers=False
# Build the similarity backend selected by `similarity_model_name`.
# Only the bert_score backend provides a tokenizer and a layer count.
if similarity_model_name == 'bert_score':
    num_layers, similarity_model, similarity_tokenizer = bert_score.get_model_and_tokenizer(
        similarity_model_path,
        device,
        all_layers=all_layers
    )
elif similarity_model_name == 'simcse':
    similarity_model = SimCSE(similarity_model_path)
    similarity_tokenizer, num_layers = None, None
elif similarity_model_name == 'sentence_transformers':
    similarity_model = SentenceTransformer(similarity_model_path)
    similarity_model.to(device)
    similarity_tokenizer, num_layers = None, None

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def rank_answers_based_on_similarity_scores(doc_text,salient_texts,scores,similarity_model_name,gen_text_weights):
    """
    Rank document sentences by their weighted mean score over all salient texts.

    Params:
        doc_text: Array[<string>] : sentences of the article
        salient_texts: Array[<string>] : salient texts summarising the article
        scores: Array[<number>] : flat scores, one per (salient text, sentence) pair,
                ordered salient-text major
        similarity_model_name: <string> : 'sentence_transformers' means the scores
                are distances (lower is better); anything else means similarities
        gen_text_weights: Array[<number>] : np.array with one weight per salient text

    Returns:
        sorted_idxs: Array[<int>] : sentence indices, most important first
    """
    n_texts = len(salient_texts)
    n_sentences = len(doc_text)
    # rows = salient texts, columns = sentences
    score_matrix = np.array(scores.reshape(n_texts, n_sentences))
    # a column vector of weights scales each salient text's row
    weight_column = gen_text_weights.reshape(gen_text_weights.size, 1)
    per_sentence = np.mean(score_matrix * weight_column, axis=0)

    if similarity_model_name == 'sentence_transformers':
        # distances: ascending order puts the best sentence first
        return np.argsort(per_sentence)
    # similarities: descending order puts the best sentence first
    return np.argsort(-per_sentence)

In [None]:

def select_sentences(
    doc_text,
    sorted_idxs,
    top_k_sentences,
    ):
    """
    Pick the `top_k_sentences` best-ranked sentences and return them in document order.

    Params:
        doc_text: Array[<string>] : sentences of the article
        sorted_idxs: Array[<int>] : sentence indices, most important first
        top_k_sentences: <int> : number of sentences wanted in the summary

    Returns:
        `doc_text` itself when the article has at most `top_k_sentences`
        sentences; otherwise an np.array holding the selected sentences in their
        original document order.
    """
    max_len = top_k_sentences

    #  if article is too short, whole thing is the summary
    if len(doc_text) <= max_len:
        return doc_text

    # Take the first k ranked indices directly rather than scanning the whole
    # ranking. The integer dtype keeps an empty selection usable as an index.
    top_idxs = np.asarray(sorted_idxs, dtype=int)[:max(max_len, 0)]
    # Restore document order so the summary reads in the source's sequence.
    return np.array(doc_text)[np.sort(top_idxs)]

In [None]:
# output_sentences = 7
# ref_sum = "1"

In [None]:
# Number of sentences kept in each generated summary.
output_sentences = 7
# Suffix of the reference-summary file to compare against (row["summary"] + ref_sum + ".txt").
ref_sum = "1"
# Parallel lists: reference summaries and generated summaries, one entry per processed row.
ground_summary_list =[]
gen_summary_list=[]
for index, row in df.iterrows():
    # Salient texts and sentences pooled over every file of this row's document directory.
    sal_txt_combined=[]
    doc_txt_combined=[]
    count=0
    # NOTE(review): os.listdir order is arbitrary and the path is built by plain string
    # concatenation, so row["documents"] presumably ends with a path separator - confirm.
    for file in os.listdir(row["documents"]):
        with open(row["documents"]+file) as f:
            # Join all lines with spaces and drop the newline characters.
            full_txt = (' '.join(f.readlines())).replace('\n','')
            count+=1
            print(count)
            print("file:", full_txt)
            # The whole file is passed to the generator in one piece (no chunking here).
            sal_txt_combined = sal_txt_combined + generate_salient_texts(full_txt, generative_model, generative_tokenizer, device,)
            doc_txt_combined = doc_txt_combined + sent_tokenize(full_txt)
    print("****************************")
    # print(doc_txt_combined)
    # Reduce the generated texts to one representative per cluster, weighted by cluster size.
    clus_rep_sent_dict = get_cluster_representative_sentence_weights(sal_txt_combined, model_kmeans, n_clusters)
    # NOTE(review): bert_score is always used here, whatever similarity_model_name is set to.
    bert_mat = calculate_similarity_bert_score(similarity_model, similarity_tokenizer, list(clus_rep_sent_dict.keys()), doc_txt_combined, similarity_model_path, all_layers, num_layers, device )
    sorted_idxs = rank_answers_based_on_similarity_scores(doc_txt_combined,np.array(list(clus_rep_sent_dict.keys())),bert_mat,similarity_model_name,np.array(list(clus_rep_sent_dict.values())))
    output_summary = select_sentences(doc_txt_combined, sorted_idxs, output_sentences)
    output_summary =  (' '.join(output_summary)).replace('\n','')
    gen_summary_list.append(output_summary)

    # Load the matching reference summary for this row.
    with open(row["summary"]+ref_sum+".txt") as f1:
        gsum=(' '.join(f1.readlines())).replace('\n','')
        ground_summary_list.append(gsum)

    # Both prints show the full lists accumulated so far, not just this row's entry.
    print("gsum:", ground_summary_list)
    print("****************************")
    print("output_summary:", gen_summary_list)
    print("****************************")
    # break

    # print(clus_rep_sent_dict)
    # NOTE(review): this unconditional break stops after the first row, yet the saved ROUGE
    # output further down reports 50 summaries - looks like a debugging leftover; confirm.
    break

Token indices sequence length is longer than the specified maximum sequence length for this model (1729 > 512). Running this sequence through the model will result in indexing errors


1
file:  In little more than a week, the world's leaders will converge on this  businesslike city in the heart of Southeast Asia for the annual meeting  of the Asia Pacific Economic Cooperation forum. They could hardly  be meeting in a more provocative place. On Sept. 1, Malaysia discontinued  trading in its currency, the ringgit, and imposed sweeping controls  on the flow of capital in its stock and currency markets, particularly  on investment from overseas. In doing so, the Malaysian prime minister,  Mahathir Mohamad, in effect slammed the door on the global economy  that President Clinton and the other leaders are coming here to champion.  Mahathir's decision drew jeers from international investors and policy-  makers, who warned that Malaysia was seeking a quick fix that would  retard its desperately needed reforms and leave it the odd man out  when Asia finally recovered from the regional malaise. Now, though,  Mahathir's allies are marshaling new economic data that they say indi



****************************
gsum: ["Prospects for the Asia-Pacific Economic Cooperation (APEC) forum scheduled for Nov. 14-18, 1998 in Malaysia were cast in doubt in September when the Malaysian Prime Minister fired and then arrested his deputy and expected successor who was very popular at home and abroad. Widespread demonstrations occurred in Malaysia while presidents of Indonesia and the Philippines spoke of skipping the APEC meeting. APEC also faced a gloomy financial picture with many of the region's economies mired in recession and high unemployment. On the way to the forum a group of high powered U.S. investors made a pep talk in Thailand, but prospects remained dim."]
****************************
output_summary: ["The worsening  financial gloom is likely to dominate talks at next week's summit  in Kuala Lumpur of leaders from the 18-nation Asia-Pacific Economic  Cooperation forum.  The agenda might be global, but the menu will be Malaysian when world  leaders meet next week fo

In [None]:
def test_rouge(predicted_summaries, gold_summaries):
    """
    Calculate ROUGE scores for predicted summaries using pyrouge's Rouge155.

    Every summary pair is written to a temporary directory in the current
    working directory, scored, and the directory is removed again afterwards
    (also when scoring fails).

    Params:
        predicted_summaries: iterable of <string> : system summaries
        gold_summaries: iterable of <string> : reference summaries, same length
                        and order as `predicted_summaries`

    Returns:
        dict : the result of `Rouge155.output_to_dict` for the evaluation

    Raises:
        AssertionError: if the two inputs differ in length
    """
    # Materialise the inputs so that any iterable works; len() is needed for
    # the progress bars and would fail on a generator or an open file.
    predicted_summaries = list(predicted_summaries)
    gold_summaries = list(gold_summaries)

    current_time = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
    tmp_dir = ".rouge-tmp-{}".format(current_time)
    try:
        # exist_ok keeps this safe when the directory (or one of the
        # sub-directories) is already there from a run in the same second.
        os.makedirs(tmp_dir + "/candidate", exist_ok=True)
        os.makedirs(tmp_dir + "/reference", exist_ok=True)
        print('preparing predicted summaries')
        candidates = [line.strip() for line in tqdm(predicted_summaries,total=len(predicted_summaries))]
        print('preparing gold summaries')
        gold = [line.strip() for line in tqdm(gold_summaries,total=len(gold_summaries))]
        assert len(candidates) == len(gold)
        cnt = len(candidates)
        print('Writing temp files')
        for i in tqdm(range(cnt)):
            # Pairs with an empty reference are left out of the evaluation.
            if len(gold[i]) < 1:
                continue
            with open(tmp_dir + "/candidate/cand.{}.txt".format(i), "w",
                      encoding="utf-8") as f:
                f.write(candidates[i])
            with open(tmp_dir + "/reference/ref.{}.txt".format(i), "w",
                      encoding="utf-8") as f:
                f.write(gold[i])
        print("Doing ROUGE calculation")
        # `io` is IPython.utils.io (imported at the top of the notebook); the
        # context manager captures what the ROUGE run writes to the cell output.
        with io.capture_output() as captured:
            r = Rouge155()
            r.model_dir = tmp_dir + "/reference/"
            r.system_dir = tmp_dir + "/candidate/"
            r.model_filename_pattern = 'ref.#ID#.txt'
            r.system_filename_pattern = r'cand.(\d+).txt'
            rouge_results = r.convert_and_evaluate()
            results_dict = r.output_to_dict(rouge_results)
        return results_dict
    finally:
        if os.path.isdir(tmp_dir):
            shutil.rmtree(tmp_dir)


In [None]:
def format_rouge_results(results):
    """
    Format the ROUGE F-scores of a results dict as one '/'-separated line.

    Params:
        results: dict with the keys 'rouge_1_f_score', 'rouge_2_f_score',
                 'rouge_3_f_score', 'rouge_4_f_score', 'rouge_s*_f_score' and
                 'rouge_su*_f_score'

    Returns:
        <string> : "ROUGE-F(1/2/3/4/s*/su*): " followed by the six values in that order
    """
    f_score_keys = (
        'rouge_1_f_score',
        'rouge_2_f_score',
        'rouge_3_f_score',
        'rouge_4_f_score',
        'rouge_s*_f_score',
        'rouge_su*_f_score',
    )
    # The label lists only what is actually emitted. It used to also announce
    # ROUGE-R and ROUGE-P values that were never appended.
    values = "/".join(f"{results[key]}" for key in f_score_keys)
    return f"ROUGE-F(1/2/3/4/s*/su*): {values}"

In [None]:
# Score the generated summaries against the reference summaries collected by the summarisation loop.
our_pred = test_rouge(gen_summary_list,ground_summary_list)

preparing predicted summaries


100%|██████████| 50/50 [00:00<00:00, 222627.60it/s]


preparing gold summaries


100%|██████████| 50/50 [00:00<00:00, 430626.69it/s]


Writing temp files


100%|██████████| 50/50 [00:00<00:00, 5490.36it/s]
INFO:global:Writing summaries.
INFO:global:Processing summaries. Saving system files to /tmp/tmp1bagg6ua/system and model files to /tmp/tmp1bagg6ua/model.
INFO:global:Processing files in .rouge-tmp-2022-09-30-06-09-46/candidate/.
INFO:global:Processing cand.45.txt.
INFO:global:Processing cand.9.txt.
INFO:global:Processing cand.7.txt.
INFO:global:Processing cand.39.txt.
INFO:global:Processing cand.6.txt.
INFO:global:Processing cand.23.txt.
INFO:global:Processing cand.22.txt.
INFO:global:Processing cand.4.txt.
INFO:global:Processing cand.42.txt.
INFO:global:Processing cand.44.txt.
INFO:global:Processing cand.47.txt.
INFO:global:Processing cand.36.txt.
INFO:global:Processing cand.46.txt.
INFO:global:Processing cand.16.txt.
INFO:global:Processing cand.40.txt.
INFO:global:Processing cand.32.txt.
INFO:global:Processing cand.3.txt.
INFO:global:Processing cand.33.txt.
INFO:global:Processing cand.38.txt.
INFO:global:Processing cand.29.txt.
INFO:

Doing ROUGE calculation


INFO:global:Processing ref.33.txt.
INFO:global:Processing ref.39.txt.
INFO:global:Processing ref.48.txt.
INFO:global:Processing ref.44.txt.
INFO:global:Processing ref.8.txt.
INFO:global:Processing ref.42.txt.
INFO:global:Processing ref.49.txt.
INFO:global:Processing ref.12.txt.
INFO:global:Processing ref.9.txt.
INFO:global:Processing ref.46.txt.
INFO:global:Processing ref.28.txt.
INFO:global:Processing ref.16.txt.
INFO:global:Processing ref.14.txt.
INFO:global:Processing ref.32.txt.
INFO:global:Processing ref.10.txt.
INFO:global:Saved processed files to /tmp/tmp1bagg6ua/model.
INFO:global:Written ROUGE configuration to /tmp/tmpmul83dew/rouge_conf.xml
INFO:global:Running ROUGE with command /content/GenCompareSum/src/rouge/tools/ROUGE-1.5.5/ROUGE-1.5.5.pl -e /content/GenCompareSum/src/rouge/tools/ROUGE-1.5.5/data -c 95 -2 -1 -U -r 1000 -n 4 -w 1.2 -a -m /tmp/tmpmul83dew/rouge_conf.xml


In [1]:
# NOTE(review): the saved NameError below shows this cell was last run (as In [1]) on a kernel
# where format_rouge_results had not been defined yet; run the defining cell first.
format_rouge_results(our_pred)

NameError: name 'format_rouge_results' is not defined


GENCOMPARESUM

In [None]:
# !python3 /content/GenCompareSum/GenCompareSum.py --num_generated_texts 10 --block_n_gram_generated_texts 4 --col_name new_heuristic --summary_len_metric sentences --num_sentences 7 --block_n_gram_sum 4   --visible_device 0 --texts_per_section 3 --temperature 0.5 --stride 4 --gen_text_weights 1 --data_path  /content/gdrive/MyDrive/COVIDSUM/FINALDATASET/compareCORD19.csv --generative_model_path doc2query/S2ORC-t5-base-v1  --similarity_model_name bert_score --similarity_model_path bert-base-uncased --inference_only False --save_predictions False

MODEL

In [None]:
import json
import os
from typing import List
import ast
import nltk
import torch
import pandas as pd
from tqdm import tqdm
from transformers import T5Tokenizer, T5ForConditionalGeneration
# from transformers import TransfoXLTokenizer, TransfoXLLMHeadModel
# from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import AutoModel, AutoTokenizer
import bert_score
import functools
import networkx as nx

from pyrouge import Rouge155
import time
import shutil
import numpy as np
import nltk
from simcse import SimCSE
import argparse
from scipy import spatial
from sentence_transformers import SentenceTransformer

from IPython.utils import io
nltk.download('punkt')

In [None]:
def timer(func):
    """Decorator that prints how long each call to `func` took and returns its result."""
    @functools.wraps(func)
    def wrapper_timer(*args, **kwargs):
        started = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed_time = time.perf_counter() - started
        print(f"Elapsed time: {elapsed_time:0.4f} seconds")
        return result
    return wrapper_timer

In [None]:
def preprocess(document: str, stride=5, list=False) -> List[str]:
    """
    Split a document into generation spans of `stride` sentences each.

    Params:
        document: the text to split, or an already split sequence of sentences
                  when `list` is set
        stride: number of sentences joined (with a space) into each span
        list: pass True when `document` is already a sequence of sentences;
              when it equals False the text is sentence-tokenized with nltk

    Returns:
        the spans in document order; the last one may hold fewer sentences
    """
    # `== False` is kept (instead of `not list`) so values such as None behave as before.
    sentences = nltk.tokenize.sent_tokenize(document) if list == False else document

    chunks = []
    for start in range(0, len(sentences), stride):
        chunks.append(" ".join(sentences[start:start + stride]))
    return chunks


In [None]:
def generate_salient_texts(
        text,
        model,
        tokenizer,
        device,
        num_texts_per_section,
        temperature,
        max_len
    ):
    """
    Sample several short "salient texts" for one passage with a seq2seq model.

    Params:
        text: <string> : passage the generation is conditioned on
        model: object exposing `generate(...)` (a T5 model in this notebook)
        tokenizer: object exposing `encode(...)` and `decode(...)`
        device: device the encoded input is moved to
        num_texts_per_section: <int> : number of sequences sampled for the passage
        temperature: <number> : sampling temperature
        max_len: <int> : maximum length of each generated sequence

    Returns:
        Array[<string>] : decoded generations; an empty list when generation
        (or decoding) raised a RuntimeError, in which case a diagnostic is printed
    """

    input_ids = tokenizer.encode(text, return_tensors='pt').to(device)

    salient_texts = []
    try:
        outputs = model.generate(
            input_ids=input_ids,
            max_length=max_len,
            do_sample=True,
            top_k=10,
            num_return_sequences=num_texts_per_section,
            temperature=temperature
        )
        salient_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    except RuntimeError as err:
        # The encoded input is 2-D (one row per passage), so len(input_ids) only
        # reports the batch size; the last dimension is the actual token count.
        print(f"Generation failed for an input of {input_ids.shape[-1]} tokens: {err}")

    return salient_texts

In [None]:
def get_salient_texts_across_corpus(
        model,
        tokenizer,
        doc_text,
        device,
        stride,
        num_texts_per_section,
        temperature,
        top_k_salient_texts,
        block_n_gram
    ):
    """
    Generate salient texts for every section of a document and keep the most frequent ones.

    The document arrives pre-split into an array with one sentence per element.
    Sentences are combined into sections of `stride` sentences, several salient
    texts are generated per section, and the texts are then counted across the
    whole document; the `top_k_salient_texts` most frequent ones are returned
    together with their counts.

    Params:
        model, tokenizer, device: passed through to generate_salient_texts
        doc_text: Array[<string>] : sentences of the document
        stride: <int> : number of sentences per section
        num_texts_per_section: <int> : generations sampled per section
        temperature: <number> : sampling temperature
        top_k_salient_texts: <int> : number of salient texts to keep
        block_n_gram: int or falsy : when truthy, an n-gram-blocked selection is
                      computed as well (passed as first argument to _block_n_gram)

    Returns:
        (top_salient_texts, top_weight) when `block_n_gram` is falsy, otherwise
        (top_salient_texts, top_weight, top_salient_texts_trigram_block, weight_trigram_block).
        Callers have to handle both arities.
    """
    text_split_into_sections = preprocess(doc_text,stride=stride,list=True)
    # One list of generations per section; max_len is fixed to 64 here.
    salient_texts = [
        generate_salient_texts(
            span,
            model,
            tokenizer,
            device,
            num_texts_per_section,
            temperature=temperature,
            max_len=64
        ) for span in text_split_into_sections
    ]
    # One row per generated text. In this loop `doc_idx` enumerates sections and
    # `span_idx` the generations within a section; span_id and gen_id are built
    # from the same f-string and therefore hold identical values.
    gen_text_df = pd.DataFrame([
        dict(
            document_id=doc_idx,
            span_id=f"{doc_idx}:{span_idx}",
            gen_id=f"{doc_idx}:{span_idx}",
            gen_text=gen_text,
        )
        for doc_idx, document_gen in enumerate(salient_texts)
        for span_idx, gen_text in enumerate(document_gen)
    ])

    # Count, per distinct text, how many different gen_ids produced it, most frequent first.
    # NOTE(review): if nothing was generated the frame has no "gen_text" column - confirm
    # whether that case can reach this point.
    salient_texts_grouped_tbl = gen_text_df \
        .groupby("gen_text") \
        .nunique() \
        .sort_values("gen_id", ascending=False)

    top_salient_texts = list(salient_texts_grouped_tbl.index[0:top_k_salient_texts])
    top_weight = np.array(salient_texts_grouped_tbl.gen_id[0:top_k_salient_texts])


    # experiment with trigram blocking version
    if (block_n_gram):
        _pred = []
        _weight = []
        for candidate, weight in zip(salient_texts_grouped_tbl.index,salient_texts_grouped_tbl.gen_id):
            # _block_n_gram is defined elsewhere; presumably it returns the index of the
            # already kept text that shares an n-gram with `candidate`, or False - TODO confirm.
            idx_ngram_blocker = _block_n_gram(block_n_gram,candidate, _pred,True)
            # NOTE(review): if the helper can return index 0, `0 != False` is False, so a
            # match with the first kept text would be appended instead of merged - verify.
            if (idx_ngram_blocker != False):
                # Fold the weight of the blocked candidate into the text it overlaps with.
                _weight[idx_ngram_blocker] =  _weight[idx_ngram_blocker] + weight
            else:
                _pred.append(candidate)
                _weight.append(weight)
        _pred = np.array(_pred)
        _weight = np.array(_weight)
        # Re-sort by the merged weights, heaviest first (same ordering applied to both arrays).
        _pred = _pred[np.argsort(-_weight)]
        _weight = _weight[np.argsort(-_weight)]
        top_salient_texts_trigram_block = _pred[0:top_k_salient_texts]
        weight_trigram_block = _weight[0:top_k_salient_texts]

        return top_salient_texts, top_weight, top_salient_texts_trigram_block, weight_trigram_block

    return top_salient_texts, top_weight

In [None]:
def flatten(t):
    """Concatenate the items of every sub-iterable of `t` into one flat list."""
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat


def dedupe_doc_text(seq):
    """
    Drop repeated entries of `seq` (first occurrence wins, order preserved) and
    strip newline characters from the entries that are kept.

    Duplicates are detected on the original strings, before the newlines are removed.
    """
    seen = set()
    unique = []
    for item in seq:
        if item in seen:
            continue
        seen.add(item)
        unique.append(item.replace('\n',''))
    return unique


def calculate_similarity_bert_score(
        model_bertscore,
        tokenizer_bertscore,
        salient_texts,
        doc_text,
        model_type,
        all_layers,
        num_layers,
        device
    ):
    """
    Score every (salient text, document sentence) pair with `bert_score.score`.

    Both inputs are expanded to flat, equally long lists ordered salient-text
    major: all sentences for the first salient text, then all sentences for the
    second, and so on.

    Returns:
        the F1 value returned by `bert_score.score` for the expanded pairs
    """
    n_sentences = len(doc_text)

    # each salient text repeated once per sentence -> no_salient_texts*num_sentences entries
    repeated_texts = np.array(
        [[str(gen_text)] * n_sentences for gen_text in salient_texts]
    ).astype('str')
    salient_texts_compare_array = [cell for row in repeated_texts for cell in row]

    # the full sentence list repeated once per salient text -> same length as above
    tiled_sentences = np.array([doc_text for _ in salient_texts]).astype('str')
    sentences_compare_array = [cell for row in tiled_sentences for cell in row]

    _precision, _recall, f1 = bert_score.score(
        salient_texts_compare_array,
        sentences_compare_array,
        model_bertscore,
        tokenizer_bertscore,
        model_type,
        num_layers=num_layers,
        device=device,
        verbose=False,
        all_layers=all_layers,
        batch_size=64
    )
    return f1

In [None]:

def calculate_similarity_simsce(model_simcse,salient_texts,doc_text,device):
    """
    Return `model_simcse.similarity` between the salient texts and the sentences.

    Both inputs are copied into plain lists first. `device` is accepted but not used.
    """
    queries = list(salient_texts)
    candidates = list(doc_text)
    return model_simcse.similarity(queries, candidates)

def calculate_similarity_sentence_transformers(model,salient_texts,doc_text):
    """
    Build a matrix of cosine *distances* between salient texts and sentences.

    Returns:
        np.array of shape (len(salient_texts), len(doc_text)); entry [i, j] is
        `spatial.distance.cosine` between sentence j and salient text i, so
        smaller means more similar.
    """
    sentence_vectors = model.encode(doc_text)
    text_vectors = model.encode(salient_texts)
    distances = np.zeros((len(salient_texts), len(doc_text)))
    for row, text_vector in enumerate(text_vectors):
        for col, sentence_vector in enumerate(sentence_vectors):
            distances[row, col] = spatial.distance.cosine(sentence_vector, text_vector)
    return distances

In [None]:
# def rank_answers_based_on_similarity_scores(doc_text,salient_texts,scores,similarity_model_name,gen_text_weights=np.array([])):
#     """
#     Params:
#         doc_text: Array[<string>] : array of sentences in article
#         salient_texts: Array[<string>] : array of salient_texts summarising the article
#         scores: Array[<number>] : np.array of simialirty scores between each sentence and each gen_text
#         gen_text_weights Array[<number>] : np.array of weights associated with importance of each gen_text

#     Returns:
#         sorted_idxs: Array[<int>] : np.array of indicies of sentencs in doc_text,
#                      sorted in order of importance for summary
#     """
#     # reshape so that rows represent different salient_texts and columns represent different sentences in source text

#     scores_reshaped = np.array(scores.reshape(len(salient_texts),len(doc_text)))
#     # np.pad(scores_reshaped, [(0,0),(0,10)], mode='constant', constant_values=0)
#     # optionally multiply by weights associated with salient_texts
#     if len(gen_text_weights>0):
#         # print(gen_text_weights)
#         scores_reshaped = scores_reshaped*gen_text_weights.reshape(gen_text_weights.size,1)

#     scores_reshaped = np.pad(scores_reshaped, [(0,len(doc_text)-len(salient_texts)),(0,0)], mode='constant', constant_values=0)


#     nx_graph = nx.from_numpy_array(scores_reshaped)
#     scores = nx.pagerank(nx_graph)
#     sorted_idxs = np.argsort(list(scores.values()))
#     return sorted_idxs


In [None]:
def rank_answers_based_on_similarity_scores(doc_text,salient_texts,scores,similarity_model_name,gen_text_weights=np.array([])):
    """
    Rank document sentences by their (optionally weighted) mean score over all salient texts.

    Params:
        doc_text: Array[<string>] : array of sentences in article
        salient_texts: Array[<string>] : array of salient_texts summarising the article
        scores: Array[<number>] : flat similarity scores between each sentence and
                each gen_text, ordered salient-text major
        similarity_model_name: <string> : 'sentence_transformers' means the scores
                are distances (lower is better); anything else means similarities
        gen_text_weights: Array[<number>] : weights associated with the importance of
                each gen_text (np.array, list or tuple); empty or None means unweighted

    Returns:
        sorted_idxs: Array[<int>] : np.array of indices of sentences in doc_text,
                     sorted in order of importance for summary
    """
    # reshape so that rows represent different salient_texts and columns represent different sentences in source text
    scores_reshaped = np.array(scores.reshape(len(salient_texts),len(doc_text)))
    # optionally multiply by weights associated with salient_texts
    if gen_text_weights is not None:
        # The old check `len(gen_text_weights>0)` measured a boolean mask and only
        # worked for arrays; test the number of weights and accept plain sequences.
        weights = np.asarray(gen_text_weights)
        if weights.size > 0:
            scores_reshaped = scores_reshaped*weights.reshape(weights.size,1)
    # Take mean score for sentences across all salient_texts
    scores_average = np.mean(scores_reshaped,axis=0)
    if similarity_model_name =='sentence_transformers':
        # distance based - minimize scores
        sorted_idxs = np.argsort(scores_average)
    else:
        # similarity based - maximize scores
        sorted_idxs = np.argsort(-scores_average)
    return sorted_idxs


In [None]:

def select_sentences(
    doc_text,
    sorted_idxs,
    metric,
    target_tokens,
    top_k_sentences,
    block_n_gram
    ):
    """
    Pick the highest ranked sentences until the length budget is used up and
    return them in the order in which they appear in the article.

    Params:
        doc_text: Array[<string>] : array of sentences in article
        sorted_idxs: Array[<int>] : np.array of indices of sentences in doc_text,
                     sorted in order of importance for summary
        metric: 'tokens' or 'sentences' : how to calculate how long summary should be
        target_tokens: int : number of tokens to aim for in target summary
                       (used when metric == 'tokens')
        top_k_sentences: int : number of sentences to aim for in target summary
                         (used when metric == 'sentences')
        block_n_gram: int or None: if int, number of consecutive words
                      required to match for a sentence to be blocked

    Returns:
        pred: the selected sentences in document order. When the article is not
              longer than the budget, doc_text itself is returned unchanged;
              otherwise an np.array of the chosen sentences.

    Raises:
        ValueError: if metric is neither 'tokens' nor 'sentences'.
    """
    if metric == 'tokens':
        len_sentences = np.array([len(nltk.word_tokenize(s)) for s in doc_text])
        article_len = np.sum(len_sentences)
        max_len = target_tokens
    elif metric == 'sentences':
        len_sentences = None
        article_len = len(doc_text)
        max_len = top_k_sentences
    else:
        raise ValueError("metric must be 'tokens' or 'sentences'.")

    #  if article is too short, whole thing is the summary
    if article_len <= max_len:
        return doc_text

    budget_used = 0
    chosen_sentences = []
    chosen_idxs = []
    for sentence_idx in sorted_idxs:
        # the budget never shrinks, so nothing further can be added once it is reached
        if budget_used >= max_len:
            break
        candidate = doc_text[sentence_idx]
        if block_n_gram:
            blocker = _block_n_gram(block_n_gram, candidate, chosen_sentences, False)
            # _block_n_gram returns the index of the overlapping sentence or False.
            # Index 0 is falsy and only index 1 equals True, so the result has to
            # be compared by identity to catch every match.
            if blocker is not False:
                continue
        budget_used += len_sentences[sentence_idx] if metric == 'tokens' else 1
        chosen_sentences.append(candidate)
        chosen_idxs.append(sentence_idx)

    # restore document order; the explicit int dtype keeps an empty selection indexable
    sorted_pred_idxs = np.sort(np.asarray(chosen_idxs, dtype=int))
    return np.array(doc_text)[sorted_pred_idxs]

In [None]:
def pick_top_sentences_and_join_into_one_str(
    model,
    tokenizer,
    doc_text,
    salient_texts,
    device,
    num_layers,
    all_layers,
    metric,
    top_k_sentences,
    weights,
    similarity_model_name,
    block_n_gram,
    target_tokens,
    similarity_model_path
    ):
    """
    Build an extractive summary of an article.

    The sentences of the article (doc_text) are scored against the salient
    texts (salient_texts) with the backend named by similarity_model_name
    ('bert_score', 'simcse' or 'sentence_transformers'), ranked, and the best
    ones are selected and joined into a single string in document order.

    Returns:
        (summary string, ranked sentence indices)

    Raises:
        ValueError: if similarity_model_name is not one of the three backends.
    """

    # score every sentence against every salient text; whatever the backends
    # print while doing so is captured instead of shown
    with io.capture_output():
        if similarity_model_name == 'bert_score':
            similarity_scores = calculate_similarity_bert_score(
                model,
                tokenizer,
                salient_texts,
                doc_text,
                similarity_model_path,
                all_layers,
                num_layers,
                device
            )
        elif similarity_model_name == 'simcse':
            similarity_scores = calculate_similarity_simsce(model, salient_texts, doc_text, device)
        elif similarity_model_name == 'sentence_transformers':
            similarity_scores = calculate_similarity_sentence_transformers(model, salient_texts, doc_text)
        else:
            raise ValueError('similarity_model name not recognised.')

    # order the sentence indices from most to least important
    ranked_idxs = rank_answers_based_on_similarity_scores(
        doc_text, salient_texts, similarity_scores, similarity_model_name, weights
    )

    # keep as many sentences / tokens as requested, with optional n-gram blocking
    selected = select_sentences(
        doc_text, ranked_idxs, metric, target_tokens, top_k_sentences, block_n_gram
    )

    return combine_array_sentences(selected), ranked_idxs

In [None]:
def test_rouge(predicted_summaries, gold_summaries):
    """Calculate ROUGE scores for predicted summaries against gold summaries.

    Params:
        predicted_summaries: sized sequence of strings (len() is taken of it)
        gold_summaries: sized sequence of strings with the same length

    Every pair is written to a private temporary directory below the current
    working directory, scored with Rouge155 and the directory is removed again,
    also when scoring fails. Pairs whose gold summary is empty after stripping
    are left out.

    Returns:
        dict produced by Rouge155.output_to_dict
    """
    import tempfile  # local import: only this function needs it

    current_time = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
    # mkdtemp appends a random suffix, so two runs started within the same
    # second can neither collide nor pick up each other's stale files
    tmp_dir = tempfile.mkdtemp(prefix=".rouge-tmp-{}-".format(current_time), dir=".")
    candidate_dir = os.path.join(tmp_dir, "candidate")
    reference_dir = os.path.join(tmp_dir, "reference")
    try:
        os.mkdir(candidate_dir)
        os.mkdir(reference_dir)
        print('preparing predicted summaries')
        candidates = [line.strip() for line in tqdm(predicted_summaries,total=len(predicted_summaries))]
        print('preparing gold summaries')
        gold = [line.strip() for line in tqdm(gold_summaries,total=len(gold_summaries))]
        assert len(candidates) == len(gold)
        cnt = len(candidates)
        print('Writing temp files')
        for i in tqdm(range(cnt)):
            if len(gold[i]) < 1:
                continue
            with open(os.path.join(candidate_dir, "cand.{}.txt".format(i)), "w",
                      encoding="utf-8") as f:
                f.write(candidates[i])
            with open(os.path.join(reference_dir, "ref.{}.txt".format(i)), "w",
                      encoding="utf-8") as f:
                f.write(gold[i])
        print("Doing ROUGE calculation")
        # capture the output emitted during evaluation instead of showing it
        with io.capture_output():
            r = Rouge155()
            r.model_dir = reference_dir + "/"
            r.system_dir = candidate_dir + "/"
            r.model_filename_pattern = 'ref.#ID#.txt'
            r.system_filename_pattern = r'cand.(\d+).txt'
            rouge_results = r.convert_and_evaluate()
            results_dict = r.output_to_dict(rouge_results)
        return results_dict
    finally:
        if os.path.isdir(tmp_dir):
            shutil.rmtree(tmp_dir)


In [None]:
def format_rouge_results(results):
    """Format the F-score, recall and precision of ROUGE-1/2/L as a single line.

    `results` must provide the keys 'rouge_<1|2|l>_<f_score|recall|precision>'.
    """
    def _triple(measure):
        # values of ROUGE-1, ROUGE-2 and ROUGE-L for one measure, joined by '/'
        keys = ['rouge_{}_{}'.format(variant, measure) for variant in ('1', '2', 'l')]
        return '/'.join(f"{results[key]}" for key in keys)

    header = "ROUGE-F(1/2/l)/ROUGE-R(1/2/l)/ROUGE-P(1/2/l): "
    return header + _triple('f_score') + " /" + _triple('recall') + " /" + _triple('precision')
    # return f"ROUGE-F(1/2/l)/ROUGE-R(1/2/l): {results['rouge_1_f_score']}/{results['rouge_2_f_score']}/{results['rouge_l_f_score']} /{results['rouge_1_recall']}/{results['rouge_2_recall']}/{results['rouge_l_recall']}"

def combine_array_sentences(sentence_array):
    """Join sentences into one string, each sentence preceded by ' \\n'.

    An empty input gives ''. str.join builds the result in a single pass
    instead of repeated concatenation and always returns a built-in str.
    """
    return ''.join(' \n' + sentence for sentence in sentence_array)

def _get_ngrams(n, text):
    ngram_set = set()
    text_length = len(text)
    max_index_ngram_start = text_length - n
    for i in range(max_index_ngram_start + 1):
        ngram_set.add(tuple(text[i:i + n]))
    return ngram_set

def _block_n_gram(n, c, p,salient_texts=False):
    """
    Params:
        n: int : number of consecutive words required to match for a sentence to be blocked
        c: string : candidate string to compare to array of strings
        p: Array[<string>]: array of strings to compare to
        q: Bool: indicates whether the canddiate and prediction strings are salient_texts
    """
    tri_c = _get_ngrams(n, c.split())
    for idx, s in enumerate(p):
        if (salient_texts):
            s = s.replace('what is',' ')
            s = s.replace('why is',' ')
            s = s.replace('what is the',' ')
            s = s.replace('how long does', ' ')
        tri_s = _get_ngrams(n, s.split())
        if len(tri_c.intersection(tri_s)) > 0:
            return idx
    return False


In [None]:
def get_T5_model(model,device):
    """
    Load a T5 checkpoint together with its tokenizer.

    Params:
        model: name or path handed to `from_pretrained` for both tokenizer and model
        device: device the loaded model is moved to

    Returns:
        (model, tokenizer) - note the model comes first
    """
    t5_tokenizer = T5Tokenizer.from_pretrained(model)
    t5_model = T5ForConditionalGeneration.from_pretrained(model)
    t5_model.to(device)
    return t5_model, t5_tokenizer

    # tokenizer_T5= BartTokenizer.from_pretrained(model)
    # model_T5 = BartForConditionalGeneration.from_pretrained(model)
    # # model_T5.resize_token_embeddings(len(tokenizer_T5))
    # output = model_T5.to(device)
    # return model_T5, tokenizer_T5

    # tokenizer_T5 = GPT2Tokenizer.from_pretrained(model)
    # model_T5 = GPT2LMHeadModel.from_pretrained(model)
    # # model_T5.resize_token_embeddings(len(tokenizer_T5))
    # output = model_T5.to(device)
    # return model_T5, tokenizer_T5

In [None]:
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip glove*.zip

In [None]:

# # Extract word vectors
# word_embeddings = {}
# f = open('glove.6B.100d.txt', encoding='utf-8')
# for line in f:
#     values = line.split()
#     word = values[0]
#     coefs = np.asarray(values[1:], dtype='float32')
#     word_embeddings[word] = coefs
# f.close()

# nltk.download('stopwords')
# from nltk.corpus import stopwords
# stop_words = stopwords.words('english')
# # function to remove stopwords
# def remove_stopwords(sen):
#     sen_new = " ".join([i for i in sen if i not in stop_words])
#     return sen_new

In [None]:
# from nltk.tokenize import sent_tokenize
# from sklearn.metrics.pairwise import cosine_similarity
# import networkx as nx
# def pangerank_extract_best_sentences(text, best_sentences_number):
#     sentences = sent_tokenize(text)
#     clean_sent=[]
#     for sen in sentences:
#         clean_sentences = pd.Series(sen).str.replace("[^a-zA-Z]", " ")
#         clean_sentences = [s.lower() for s in clean_sentences]
#         clean_sent.append(clean_sentences)
#     cleaned=[]
#     for j in clean_sent:
#         clean_sentences = [remove_stopwords(r.split()) for r in j]
#         cleaned.append(clean_sentences)
#     sentence_vectors = []
#     for clean_sentences in cleaned:
#         for i in clean_sentences:
#             if len(i) != 0:
#                 v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
#             else:
#                 v = np.zeros((100,))
#             sentence_vectors.append(v)
#     sim_mat = np.zeros([len(sentences), len(sentences)])
#     for i in range(len(sentences)):
#         for j in range(len(sentences)):
#             if i != j:
#                 sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))[0,0]
#     nx_graph = nx.from_numpy_array(sim_mat)
#     scores = nx.pagerank(nx_graph, tol=1.0e-2)
#     ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
#     final_sentences = ''
#     if len(ranked_sentences)<best_sentences_number:
#         best_sentences_number=len(ranked_sentences)
#     for i in range(best_sentences_number):
#         final_sentences = final_sentences+(ranked_sentences[i][1])

#     return final_sentences

In [None]:

def main(df,
        generative_model,
        generative_tokenizer,
        stride,
        num_texts_per_section,
        temperature,
        num_salient_texts,
        block_n_gram_generated_texts,
        similarity_model,
        similarity_tokenizer,
        num_layers,
        all_layers,
        summary_len_metric,
        num_sentences,
        target_tokens,
        block_n_gram_sum,
        similarity_model_path,
        device,
        col_name,
        gen_text_weights,
        inference_only,
        save_predictions,
        similarity_model_name=None,
        # best_sentences_number
        ):
    """
    Summarise every article in `df`, score the summaries with ROUGE and print a report.

    For each row the article in `row[col_name]` (a string holding a Python list
    of sentences) is parsed, salient texts are generated from it, and the
    sentences that score best against those texts form the predicted summary.
    The gold summary is read from the 'summary_text_combined' column.

    Params (only what this function itself does with them):
        df: DataFrame with the columns `col_name` and 'summary_text_combined'
        generative_model, generative_tokenizer, stride, num_texts_per_section,
        temperature, num_salient_texts: forwarded to get_salient_texts_across_corpus
        block_n_gram_generated_texts: forwarded as block_n_gram; when truthy the
            n-gram filtered texts and weights of the 4-tuple result are used
        similarity_model, similarity_tokenizer, num_layers, all_layers,
        similarity_model_path, device: forwarded to
            pick_top_sentences_and_join_into_one_str
        summary_len_metric, num_sentences, target_tokens, block_n_gram_sum:
            summary length budget and n-gram blocking of the summary
        gen_text_weights: truthy to weight the salient texts by the weights
            returned together with them
        inference_only, save_predictions: accepted for compatibility, currently unused
        similarity_model_name: similarity backend name. When None, the
            notebook-level global `similarity_model_name` is used, which is what
            this function relied on before the parameter existed.

    Raises:
        ValueError: if no similarity_model_name is passed and no global is defined.
    """
    if similarity_model_name is None:
        # backward compatibility with calls that depend on the notebook global
        similarity_model_name = globals().get('similarity_model_name')
        if similarity_model_name is None:
            raise ValueError(
                'similarity_model_name must be passed to main() or defined as a global.'
            )

    gold_summaries = []
    our_predictions = []
    our_summary_lens = []

    for _, row in tqdm(df.iterrows(),total=len(df)):
        #  read in article to summarise
        doc_text = np.array(ast.literal_eval(row[col_name]))

        #  generate salient text fragments
        generated = get_salient_texts_across_corpus(
            generative_model,
            generative_tokenizer,
            doc_text,
            device,
            stride=stride,
            num_texts_per_section=num_texts_per_section,
            temperature=temperature,
            top_k_salient_texts=num_salient_texts,
            block_n_gram=block_n_gram_generated_texts,
        )
        if block_n_gram_generated_texts:
            # with blocking, four values come back; the last two are the
            # n-gram filtered texts and their weights
            _, _, salient_texts, text_weights = generated
        else:
            salient_texts, text_weights = generated
        weights = text_weights if gen_text_weights else np.array([])

        #  generate summary
        pred_sum, _ = pick_top_sentences_and_join_into_one_str(
            similarity_model,
            similarity_tokenizer,
            doc_text,
            salient_texts,
            device,
            num_layers=num_layers,
            all_layers=all_layers,
            metric=summary_len_metric,
            top_k_sentences=num_sentences,
            weights=weights,
            similarity_model_name=similarity_model_name,
            block_n_gram=block_n_gram_sum,
            target_tokens=target_tokens,
            similarity_model_path=similarity_model_path
        )

        #  collect predicted summary and gold summary for evaluation
        our_summary_lens.append(len(nltk.word_tokenize(pred_sum)))
        our_predictions.append(pred_sum)
        # taken from the row itself, which stays a single value even when the
        # index of df is not unique
        gold_summaries.append(row['summary_text_combined'])

    #  calculate ROUGE scores
    model_type = similarity_model_path.split('/')[-1]
    our_pred = test_rouge(our_predictions,gold_summaries)

    # print summaries
    our_summary_lens = np.array(our_summary_lens)
    print(f'\n\nData col: {col_name}.\n'+
        f'Num salient_texts: {num_salient_texts}.\n'+
        f'block_n_gram_generated_texts: {block_n_gram_generated_texts}.\n'+
        f'Similarity model type: {model_type}.\n'+
        f'block_n_gram_sum: {block_n_gram_sum}.\n'+
        f'summary_len_metric: {summary_len_metric}.\n'+
        f'Num sentences: {num_sentences}.\n'+
        f'target_tokens: {target_tokens}.\n'+
        f'{format_rouge_results(our_pred)}\n'+
        f'Average length of summary: {np.mean(our_summary_lens)}'
        )
    print(f'gen_text weights: {gen_text_weights}')





# # if __name__=='__main__':

# #     # -------- DEFINE EXPERIMENT PARAMS ---------------

# #     parser = argparse.ArgumentParser()
# #     parser.add_argument("--num_generated_texts",default=10)
# #     parser.add_argument("--block_n_gram_generated_texts",default=None)
# #     parser.add_argument("--col_name",default='article_text')
# #     parser.add_argument("--num_sentences",default=9)
# #     parser.add_argument("--summary_len_metric",default='sentences')
# #     parser.add_argument("--similarity_model_path",default='bert-base-uncased')
# #     parser.add_argument("--target_tokens",default=250)
# #     parser.add_argument("--block_n_gram_sum",default=4)
# #     parser.add_argument("--visible_device",default='0')
# #     parser.add_argument("--gen_text_weights",default=None)
# #     parser.add_argument("--temperature",default=0.5)
# #     parser.add_argument("--texts_per_section",default=3)
# #     parser.add_argument("--stride",default=4)
# #     parser.add_argument("--data_path")
# #     parser.add_argument("--generative_model_path")
# #     parser.add_argument("--similarity_model_name")
# #     parser.add_argument('--inference_only',default=False)
# #     parser.add_argument("--save_predictions",default=False)
# #     args = parser.parse_args()

#     # data params
#     path = args.data_path
#     col_name = args.col_name

#     # gen_text generation params
#     num_generated_texts = int(args.num_generated_texts)
#     block_n_gram_generated_texts = int(args.block_n_gram_generated_texts) if (args.block_n_gram_generated_texts != None) else None
#     temperature = float(args.temperature)
#     num_texts_per_section = int(args.texts_per_section)
#     stride = int(args.stride)


#     # extractive summarisation (similarity and ranking) params
#     num_sentences = int(args.num_sentences)
#     summary_len_metric = args.summary_len_metric
#     similarity_model_name = args.similarity_model_name
#     similarity_model_path = args.similarity_model_path

#     block_n_gram_sum = int(args.block_n_gram_sum) if (args.block_n_gram_sum != None) else None
#     target_tokens = int(args.target_tokens)
#     gen_text_weights = bool(args.gen_text_weights)

#     # other
#     inference_only = bool(args.inference_only)
#     save_predictions = bool(args.save_predictions)



#     # -------- LOAD MODELS ---------------






#     # define similarity model
#     if (
#         )
#     if (similarity_model_name =='simcse'):
#         similarity_model = SimCSE(similarity_model_path)
#         similarity_tokenizer, num_layers = None, None
#     if (similarity_model_name=='sentence_transformers'):
#         similarity_model = SentenceTransformer(similarity_model_path)
#         similarity_model.to(device)
#         similarity_tokenizer, num_layers = None, None


In [None]:
 # Define the target device. Use GPU if available.
# Same device selection as the cell near the top of the notebook:
# only GPU index '0' is made visible to CUDA, then 'cuda' is chosen when
# torch reports an available GPU and 'cpu' otherwise.
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Display the selected device string ('cuda' or 'cpu').
device

In [None]:
# Similarity backend configuration.
# `similarity_model_name` is read inside main() as a notebook-level global (it is
# not in the argument list of the main(...) call below), so this cell has to
# run before main().
all_layers=False
similarity_model_name='bert_score'
similarity_model_path = 'bert-base-uncased'
# The three values unpacked here are handed to main() as num_layers,
# similarity_model and similarity_tokenizer.
num_layers, similarity_model, similarity_tokenizer =  bert_score.get_model_and_tokenizer(similarity_model_path,device,all_layers=all_layers)

In [None]:
# Checkpoint used to generate the salient texts. It is loaded by get_T5_model,
# which uses T5Tokenizer / T5ForConditionalGeneration.
# NOTE(review): the BART alternative below presumably needs the commented-out
# BART loader in get_T5_model instead - confirm before switching.
# generative_model_base = "facebook/bart-large-cnn"
generative_model_base = "doc2query/S2ORC-t5-base-v1"
 # define model for gen_text generation
generative_model, generative_tokenizer = get_T5_model(generative_model_base,device)


In [None]:
 # -------- LOAD DATA ---------------
# main() reads two columns from this file: the article column named by
# `col_name` (a string holding a Python list of sentences, parsed with
# ast.literal_eval) and the gold summary column 'summary_text_combined'.
# Alternative input files:
# path="19.csv"
# path="a.csv"
# path="preprocessed_840.csv"
path="ref4_gencompareDUC2004_4.csv"
df = pd.read_csv(path)

In [None]:
     # -------- RUN EXPERIMENT ---------------
# Generates a summary for every row of df, scores them with ROUGE and prints
# the report. The similarity backend name is taken from the global
# `similarity_model_name` defined in the configuration cell above.
main(df,
      generative_model,
      generative_tokenizer,
      stride=4,
      num_texts_per_section=3,
      temperature=0.5,
      num_salient_texts=10,
      block_n_gram_generated_texts=4,  # truthy: the n-gram filtered salient texts are used
      similarity_model=similarity_model,
      similarity_tokenizer=similarity_tokenizer,
      num_layers=num_layers,
      all_layers=all_layers,
      summary_len_metric='sentences',  # length budget counted in sentences
      num_sentences=6,
      target_tokens=250,  # only used when summary_len_metric == 'tokens'
      block_n_gram_sum=4,
      similarity_model_path=similarity_model_path,
      device=device,
      col_name='main_txt',
      gen_text_weights=1,  # truthy: weight salient texts by the weights returned with them
      inference_only=False,  # currently not used inside main()
      save_predictions=False,  # currently not used inside main()
      # best_sentences_number=7
      )