In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import time
import torch
from nltk.tokenize import sent_tokenize

class DipperParaphraser(object):
    """Thin wrapper around the DIPPER T5 paraphraser.

    The constructor loads the tokenizer and the half-precision model, moves
    the model to the GPU and switches it to eval mode. `paraphrase` then
    rewrites a text a few sentences at a time.
    """

    def __init__(self, model="kalpeshk2011/dipper-paraphraser-xxl", verbose=True):
        """Load the tokenizer and model weights and place the model on the GPU.

        Args:
            model (str): Checkpoint identifier handed to
                `T5ForConditionalGeneration.from_pretrained`.
            verbose (bool): When true, print how long loading took. The timer
                stops before the model is moved to the GPU.
        """
        time1 = time.time()
        self.tokenizer = T5Tokenizer.from_pretrained('google/t5-v1_1-xxl')
        self.model = T5ForConditionalGeneration.from_pretrained(model, torch_dtype=torch.float16)
        if verbose:
            print(f"{model} model loaded in {time.time() - time1}")
        self.model.cuda()
        self.model.eval()

    def paraphrase(self, input_text, lex_diversity, order_diversity, prefix="", sent_interval=3, **kwargs):
        """Paraphrase a text using the DIPPER model.

        The text is split into sentences and processed in consecutive windows
        of `sent_interval` sentences. Each window is wrapped in
        `<sent> ... </sent>` by this method (callers must NOT add the tags
        themselves) and is preceded by the control codes and the running
        context. The paraphrase produced for one window is appended to the
        context used for the next one.

        Args:
            input_text (str): The plain text to paraphrase.
            lex_diversity (int): The lexical diversity of the output, choose multiples of 20 from 0 to 100. 0 means no diversity, 100 means maximum diversity.
            order_diversity (int): The order diversity of the output, choose multiples of 20 from 0 to 100. 0 means no diversity, 100 means maximum diversity.
            prefix (str): Optional context placed before the first window.
                Whitespace (including newlines) is collapsed to single spaces.
            sent_interval (int): Number of sentences paraphrased per model
                call. Must be at least 1.
            **kwargs: Additional keyword arguments like top_p, top_k, max_length.
                They are forwarded unchanged to `self.model.generate`.

        Returns:
            str: The paraphrased windows joined by single spaces, or an empty
            string when `input_text` contains no sentences.

        Raises:
            AssertionError: If a diversity value is not one of the allowed steps.
            ValueError: If `sent_interval` is smaller than 1.
        """
        assert lex_diversity in [0, 20, 40, 60, 80, 100], "Lexical diversity must be one of 0, 20, 40, 60, 80, 100."
        assert order_diversity in [0, 20, 40, 60, 80, 100], "Order diversity must be one of 0, 20, 40, 60, 80, 100."
        # Fail early and clearly: a zero step makes range() raise a cryptic
        # error and a negative step would silently yield an empty result.
        if sent_interval < 1:
            raise ValueError("sent_interval must be a positive integer.")

        # The prompt carries the complement of the requested diversity.
        lex_code = int(100 - lex_diversity)
        order_code = int(100 - order_diversity)

        input_text = " ".join(input_text.split())
        sentences = sent_tokenize(input_text)
        prefix = " ".join(prefix.replace("\n", " ").split())
        paraphrased_windows = []

        for sent_idx in range(0, len(sentences), sent_interval):
            curr_sent_window = " ".join(sentences[sent_idx:sent_idx + sent_interval])
            final_input_text = f"lexical = {lex_code}, order = {order_code}"
            if prefix:
                final_input_text += f" {prefix}"
            final_input_text += f" <sent> {curr_sent_window} </sent>"

            final_input = self.tokenizer([final_input_text], return_tensors="pt")
            final_input = {k: v.cuda() for k, v in final_input.items()}

            with torch.inference_mode():
                outputs = self.model.generate(**final_input, **kwargs)
            outputs = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
            paraphrased = outputs[0]

            # Grow the context without introducing a leading space when the
            # prefix started out empty (that produced a double space in the
            # next prompt).
            prefix = f"{prefix} {paraphrased}".strip()
            paraphrased_windows.append(paraphrased)

        # Join once at the end: no stray leading space, no repeated string
        # concatenation.
        return " ".join(paraphrased_windows)

if __name__ == "__main__":
    # Smoke test: load the paraphraser once and rewrite a short passage,
    # using `prompt` as the context that precedes the text being paraphrased.
    dp = DipperParaphraser()

    prompt = "In a shocking finding, scientist discovered a herd of unicorns living in a remote valley."
    input_text = "They have never been known to mingle with humans. Today, it is believed these unicorns live in an unspoilt environment which is surrounded by mountains. Its edge is protected by a thick wattle of wattle trees, giving it a majestic appearance. Along with their so-called miracle of multicolored coat, their golden coloured feather makes them look like mirages. Some of them are rumored to be capable of speaking a large amount of different languages. They feed on elk and goats as they were selected from those animals that possess a fierceness to them, and can \"eat\" them with their long horns."

    # Show exactly what is being paraphrased and in which context.
    print(f"Input = {prompt} <sent> {input_text} </sent>\n")

    # Lexical diversity 60, original sentence order, nucleus sampling p=0.75.
    output_l60_sample = dp.paraphrase(
        input_text,
        lex_diversity=60,
        order_diversity=0,
        prefix=prompt,
        do_sample=True,
        top_p=0.75,
        top_k=None,
        max_length=512,
    )
    print(f"Output (Lexical diversity = 60, Sample p = 0.75) = {output_l60_sample}\n")


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

kalpeshk2011/dipper-paraphraser-xxl model loaded in 167.92706298828125
Input = In a shocking finding, scientist discovered a herd of unicorns living in a remote valley. <sent> They have never been known to mingle with humans. Today, it is believed these unicorns live in an unspoilt environment which is surrounded by mountains. Its edge is protected by a thick wattle of wattle trees, giving it a majestic appearance. Along with their so-called miracle of multicolored coat, their golden coloured feather makes them look like mirages. Some of them are rumored to be capable of speaking a large amount of different languages. They feed on elk and goats as they were selected from those animals that possess a fierceness to them, and can "eat" them with their long horns. </sent>

Output (Lexical diversity = 60, Sample p = 0.75) =  Their life is said to be unsullied, and they have never been known to mix with humans. It is believed that they live in an isolated and unspoiled valley. The valley is 

In [2]:
import json
import os

def load_json_file(file_path):
    """Read a JSON document from disk and return the decoded object.

    The file is always decoded as UTF-8 instead of the platform's default
    encoding, so non-ASCII text loads the same way on every machine. It is
    opened directly rather than checked with `os.path.exists` first, which
    avoids a race between the check and the open.

    Args:
        file_path (str): Path of the JSON file to read.

    Returns:
        The Python object produced by `json.load` (dict, list, ...).

    Raises:
        FileNotFoundError: If `file_path` does not exist.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except FileNotFoundError:
        # Keep the original, friendlier message for a missing file.
        raise FileNotFoundError(f"The file '{file_path}' does not exist.") from None
        
        
# Load the raw gpt-3.5-turbo data of every benchmark, keyed by dataset name.
# Each file lives at ./datasets/<name>/<name>_gpt-3.5-turbo.raw_data.json.
datasets = {
    name: load_json_file(f"./datasets/{name}/{name}_gpt-3.5-turbo.raw_data.json")
    for name in ('abstract', 'squad', 'xsum', 'writing')
}

In [3]:
from tqdm import tqdm

# Paraphrase every entry of each dataset's 'sampled' list with DIPPER and
# store the results under the 'evade' key, checkpointing to disk as we go.
for dataset_name, dataset in datasets.items():
    # Start from an empty list so re-running the cell does not append to the
    # results of a previous run.
    dataset['evade'] = []

    save_path = os.path.join(f'./datasets/{dataset_name}/', f'{dataset_name}_evasion_dipper.json')
    tmp_path = save_path + '.tmp'

    # Wrap the list itself (not enumerate) so tqdm knows the total and can
    # show a real progress bar.
    for data in tqdm(dataset['sampled'], desc=dataset_name):
        # No context prefix: the `prompt` variable left over from the demo
        # cell is an unrelated sentence and must not be fed to the model as
        # context for these texts.
        data_evade = dp.paraphrase(data, lex_diversity=20, order_diversity=60, prefix="", do_sample=True, top_p=0.75, top_k=None, max_length=512)
        dataset['evade'].append(data_evade)

        # Checkpoint after every sample so an interrupted run keeps its
        # progress. Write to a temp file and swap it in, so a crash
        # mid-write cannot leave a truncated JSON file behind.
        with open(tmp_path, 'w') as output_file:
            json.dump(dataset, output_file)
        os.replace(tmp_path, save_path)
        
        
    

350it [17:00,  2.91s/it]
200it [18:00,  5.40s/it]
150it [17:30,  7.01s/it]
150it [19:06,  7.65s/it]
