In [43]:
import random
import more_itertools as mit
import pickle
import itertools
import numpy as np
import pandas as pd
import re
from rouge import Rouge
import tqdm.notebook as tq

In [44]:
# Load the pickled corpus and split every document into sentences.
# NOTE: pickle.load can execute arbitrary code while unpickling; only point
# this at a file from a trusted source.
with open("/project/Data/unlabelled_terms_medium.p", "rb") as corpus_file:
    texts = pickle.load(corpus_file)

# A "sentence" is the shortest run of characters ending in '.', '!' or '?'.
# '.' does not match a newline here, so a match never spans lines: text on a
# line without a terminator is dropped, and a sentence that wraps over several
# lines keeps only the part on the line holding its terminator.
# Presumably every element of `texts` is a str -- verify against the pickle.
texts = [np.array(re.findall('.*?[.!?]', text)) for text in texts]

In [45]:
# Remove tabs, newlines and carriage returns from every sentence, then trim
# the surrounding whitespace. Each document becomes a plain list of strings.
cleaned = []
for text in texts:
    stripped = []
    for sentence in text:
        for control_char in ("\t", "\n", "\r"):
            sentence = sentence.replace(control_char, "")
        stripped.append(sentence.strip())
    cleaned.append(stripped)
texts = cleaned

In [46]:
# Keep only sentences longer than four characters and store each document
# as a numpy array again.
long_enough = []
for text in texts:
    kept = [sentence for sentence in text if len(sentence) > 4]
    long_enough.append(np.array(kept))
texts = long_enough

In [47]:
# Discard documents that have two sentences or fewer left.
texts = list(filter(lambda text: len(text) > 2, texts))

In [48]:
# Number of documents remaining after the sentence and document filters.
len(texts)

29988

In [49]:
# Build PEGASUS-style gap-sentence pairs. For every document the highest
# scoring sentences are replaced by a mask token in the input text, and the
# same sentences, each followed by "</s>", are concatenated into the target.
#
# The lists are pre-sized so that row i always corresponds to texts[i]; a
# document whose scoring fails keeps None in both columns and is counted in
# `failed`.
prepared_texts = {'input_text':[None]*len(texts),
                  'target_text':[None]*len(texts)}
failed = 0

# Loop-invariant: one scorer is reused for every document.
rouge = Rouge()

for index, text in enumerate(tq.tqdm(texts)):

    # The original PEGASUS paper scores each sentence in a document with
    # rouge-1 f-1, with respect to the rest of the document, minus the
    # sentence being scored. Then the top-m high scoring sentences are masked.
    # Here this process is replicated.

    # hyps: the sentence being scored
    # refs: the document minus the sentence being scored
    # NOTE(review): the remaining sentences are joined without a separator;
    # confirm this does not fuse words across '!'/'?' boundaries for the scorer.
    hyps = list(text)
    refs = [''.join(np.delete(text, i)) for i in range(len(text))]

    # compute rouge scores for each sentence
    try:
        rouge_scores = rouge.get_scores(hyps, refs)
    except Exception:
        # Best effort: skip documents the scorer cannot handle, but record
        # them. Unlike a bare `except`, this lets KeyboardInterrupt and
        # SystemExit through, so the long-running loop can still be stopped.
        failed += 1
        continue

    # define m to be equal to 30% of the number of sentences, as in the original paper
    m = round(len(text)*0.3)

    # indices of the sentences with the top-m rouge-1 f-1 scores, in document
    # order; these sentences will be masked
    f1_scores = np.array([score['rouge-1']['f'] for score in rouge_scores])
    masked = sorted(f1_scores.argsort()[-m:])

    # `text` is a fixed-width unicode array, so assigning the mask token into
    # a plain copy would silently truncate it whenever every sentence in the
    # document is shorter than the token. An object-dtype copy has no width
    # limit.
    input_text = text.astype(object)

    # insert masking tokens
    input_text[masked] = ' <mask_1> '
    prepared_texts['input_text'][index] = ''.join(input_text)

    # retrieve masked sentences and append end of sentence tokens
    target_text = text[masked]
    prepared_texts['target_text'][index] = ''.join([sentence + "</s>" for sentence in target_text])
    

  0%|          | 0/29988 [00:00<?, ?it/s]

In [50]:
# Tabulate the prepared pairs: one row per document, with one column for the
# masked input and one for the target. Preview the first five rows.
prepared_df = pd.DataFrame(data=prepared_texts)
prepared_df.head(n=5)

Unnamed: 0,input_text,target_text
0,For orders that contain any fresh grocery item...,Orders that contain both fresh grocery items a...
1,evaluate site performance.provided by the cook...,information is stored to enable historical rep...
2,Where our Sites include links to other website...,If you submit Personal Data to any of those ot...
3,You may view information provided through the ...,You agree not to access the Services by any me...
4,"Under the CCPA, California Consumers have cert...",CCPA rights requests are subject to an identif...


In [51]:
# (rows, columns) of the prepared table; rows match the pre-sized lists in
# prepared_texts, i.e. one per document in `texts`.
prepared_df.shape

(29988, 2)

In [52]:
# Persist the training pairs as CSV. The DataFrame index is written as the
# first (unnamed) column because `index` is left at its default.
# NOTE(review): documents skipped during ROUGE scoring still hold None in both
# columns and are written out as empty fields -- confirm the training code
# tolerates or filters them.
prepared_df.to_csv('/project/Data/Pegasus_training_data_medium.csv')