In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [None]:
# Checkpoint used for both the tokenizer and the seq2seq model.
MODEL_NAME = "humarin/chatgpt_paraphraser_on_T5_base"

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs (more slowly) on machines without CUDA instead of failing.
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)

In [None]:
def paraphrase(
    question,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=5,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=128
):
    """Paraphrase a single sentence with the module-level tokenizer and model.

    The sentence is prefixed with ``paraphrase: ``, tokenized (truncated to
    ``max_length`` tokens), moved to ``device`` and passed to
    ``model.generate``; the generated ids are decoded without special tokens.

    :param question: Sentence to paraphrase.
    :param num_beams: Forwarded to ``model.generate``.
    :param num_beam_groups: Forwarded to ``model.generate``.
    :param num_return_sequences: Forwarded to ``model.generate``.
    :param repetition_penalty: Forwarded to ``model.generate``.
    :param diversity_penalty: Forwarded to ``model.generate``.
    :param no_repeat_ngram_size: Forwarded to ``model.generate``.
    :param temperature: Forwarded to ``model.generate``.
        NOTE(review): sampling is not enabled in this call, so this value may
        have no effect -- confirm against the transformers generation docs.
    :param max_length: Used both as the tokenizer truncation length and as the
        ``max_length`` passed to ``model.generate``.
    :return: The decoded sequences produced by ``tokenizer.batch_decode``.
    """
    prompt = f'paraphrase: {question}'
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        padding="longest",
        truncation=True,
        max_length=max_length,
    )

    generation_kwargs = dict(
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams,
        num_beam_groups=num_beam_groups,
        max_length=max_length,
        diversity_penalty=diversity_penalty,
    )
    generated = model.generate(encoded.input_ids.to(device), **generation_kwargs)

    return tokenizer.batch_decode(generated, skip_special_tokens=True)

In [None]:
##text

In [None]:
# Smoke test: paraphrase a generic English question with the default settings.
text = 'What are the best places to see in New York?'
paraphrase(text)

['What are some must-see places in New York?',
 'Can you suggest some must-see spots in New York?',
 'Where should one go to experience the best NYC has to offer?',
 'Which places should I visit in New York?',
 'What are the top destinations to explore in New York?']

Training parameters
- epochs = 5
- batch_size = 64
- max_length = 128
- lr = 5e-5
- batches_qty = 196465
- betas = (0.9, 0.999)
- eps = 1e-08

¿Funciona para hipótesis de contratos en NLI?

In [None]:
# Check how the paraphraser handles a contract-style hypothesis
# (reverse-engineering clause).
text = "Receiving Party shall not reverse engineer any objects which embody Disclosing Party's Confidential Information."
paraphrase(text)

['Any objects that contain Confidential Information of Disclosing Party shall not be reverse-engineered by the Receiving Parties.',
 'The Receiving Party is prohibited from reverse engineering any objects that contain Confidential Information of Disclosing Parties.',
 'It is forbidden for the Receiving Party to reverse engineer any objects that contain Confidential Information of Disclosing Parties.',
 'Reversing of object that contains Confidential Information of Disclosing Party is prohibited for Receivers.',
 "Disclosing Party's Confidential Information cannot be reverse-engineered by the Receiving Partie."]

¡Pasa el test!

In [None]:
# Second contract-style hypothesis: destroying or returning Confidential
# Information on termination.
text = "Receiving Party shall destroy or return some Confidential Information upon the termination of Agreement"
paraphrase(text)

['Upon the termination of the Agreement, the Receiving Party is required to destroy or return some Confidential Information.',
 'Confidential Information must be destroyed or returned by the Receiving Party upon the termination of the Agreement.',
 'The Receiving Party is obligated to destroy or return Confidential Information upon the termination of the Agreement.',
 'At the conclusion of the Agreement, the Receiving Party is obligated to destroy or return some Confidential Information.',
 'When the Agreement is over, the Receiving Party must destroy or return some Confidential Information.']

¡Pasa el test!

In [None]:
# Third contract-style hypothesis.
# NOTE(review): several of the paraphrases returned for this input do not
# preserve the meaning of the sentence (e.g. they change who discloses what to
# whom), so generated hypotheses should be reviewed before they are paired
# with the original NLI labels.
text = "Receiving Party shall not disclose the fact that Agreement was agreed or negotiated."
paraphrase(text)

['The Receiving Party is prohibited from revealing the fact that the Agreement was either negotiated or agreed upon.',
 'It is the responsibility of the Receiving Party to maintain confidentiality while negotiating or agreeing to the Agreement.',
 'The receiving Party is prohibited from revealing whether the Agreement was agreed upon or negotiated.',
 "No information about the agreement's negotiation or agreement is disclosed to the Receiving Party.",
 'Unless otherwise agreed upon or négociated, no information is disclosed by the Receiving Party.']

In [None]:
import pandas as pd
from tqdm import tqdm

# Load the training CSV; the frame has the columns doc_id, text, hypothesis,
# label and spans.
data = pd.read_csv('train.csv')
data

Unnamed: 0,doc_id,text,hypothesis,label,spans
0,34,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Receiving Party shall not reverse engineer any...,NotMentioned,[]
1,34,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Receiving Party shall destroy or return some C...,Entailment,"[39, 40]"
2,34,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Agreement shall not grant Receiving Party any ...,Entailment,[38]
3,34,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Receiving Party shall not disclose the fact th...,Entailment,[51]
4,34,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Confidential Information shall only include te...,NotMentioned,[]
...,...,...,...,...,...
7186,624,Exhibit (d)(3)\nNON-DISCLOSURE AGREEMENT\nThis...,Receiving Party may create a copy of some Conf...,NotMentioned,[]
7187,624,Exhibit (d)(3)\nNON-DISCLOSURE AGREEMENT\nThis...,Receiving Party shall notify Disclosing Party ...,Entailment,[30]
7188,624,Exhibit (d)(3)\nNON-DISCLOSURE AGREEMENT\nThis...,Receiving Party may acquire information simila...,Entailment,"[98, 101]"
7189,624,Exhibit (d)(3)\nNON-DISCLOSURE AGREEMENT\nThis...,Receiving Party may share some Confidential In...,Entailment,"[24, 25, 104, 105, 106]"


In [None]:
# List the column names of the loaded frame.
data.columns

Index(['doc_id', 'text', 'hypothesis', 'label', 'spans'], dtype='object')

In [None]:
from tqdm import tqdm

def paraphrase_batch_in_chunks(
    questions,
    batch_size=10,
    num_beams=2,
    num_beam_groups=2,
    num_return_sequences=2,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=128
):
    """
    Paraphrase a sequence of sentences, processing them in batches.

    Each sentence is prefixed with ``paraphrase: ``, tokenized together with
    the other sentences of its batch (padded, truncated to ``max_length``
    tokens) and passed to ``model.generate`` on ``device``.

    :param questions: Sliceable sequence of hypotheses (strings) to paraphrase.
    :param batch_size: Number of hypotheses processed per batch; must be >= 1.
    :param num_beams: Number of beams for beam search.
    :param num_beam_groups: Number of beam groups.
    :param num_return_sequences: Number of paraphrases returned per sentence.
    :param repetition_penalty: Repetition penalty forwarded to ``generate``.
    :param diversity_penalty: Diversity penalty forwarded to ``generate``.
    :param no_repeat_ngram_size: Size of n-grams that may not be repeated.
    :param temperature: Forwarded to ``generate`` unchanged.
        NOTE(review): sampling is not enabled in this call, so this value may
        have no effect -- confirm against the transformers generation docs.
    :param max_length: Used both as the tokenizer truncation length and as the
        ``max_length`` passed to ``generate``.
    :return: One list per input sentence, in input order, each holding
        ``num_return_sequences`` decoded paraphrases.
    :raises ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently return no results.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    all_paraphrased = []

    for i in tqdm(range(0, len(questions), batch_size), desc="Procesando batches"):
        batch = questions[i:i + batch_size]

        encoded = tokenizer(
            [f'paraphrase: {q}' for q in batch],
            return_tensors="pt", padding=True, truncation=True,
            max_length=max_length
        )

        # Sentences in a batch are padded to a common length, so pass the
        # attention mask explicitly instead of relying on generate() to infer
        # which positions are padding.
        outputs = model.generate(
            encoded.input_ids.to(device),
            attention_mask=encoded.attention_mask.to(device),
            temperature=temperature, repetition_penalty=repetition_penalty,
            num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
            num_beams=num_beams, num_beam_groups=num_beam_groups,
            max_length=max_length, diversity_penalty=diversity_penalty
        )

        results = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        # generate() returns the sequences of each input contiguously, so regroup
        # the flat list into one chunk of num_return_sequences per input.
        paraphrased_list = [
            results[j * num_return_sequences:(j + 1) * num_return_sequences]
            for j in range(len(batch))
        ]
        all_paraphrased.extend(paraphrased_list)

    return all_paraphrased

In [None]:
# Paraphrase every hypothesis of the training frame. This is the expensive
# cell of the notebook (see the progress bar output for the run time).
hypotheses = data["hypothesis"].tolist()

paraphrased_hypotheses = paraphrase_batch_in_chunks(
    hypotheses,
    batch_size=3,
    num_beams=3,
    num_beam_groups=3,
    num_return_sequences=1,
    max_length=64
)

# The results are written back row by row further down, so there must be
# exactly one list of paraphrases per input hypothesis.
assert len(paraphrased_hypotheses) == len(hypotheses)


Procesando batches: 100%|██████████| 2397/2397 [36:14<00:00,  1.10it/s]


In [None]:
# Keep only the first paraphrase generated for each hypothesis.
new_hypotheses1 = [candidates[0] for candidates in paraphrased_hypotheses]

In [None]:
# Attach the selected paraphrases as a new column next to the original hypotheses.
data["paraphrased hypothesis"] = new_hypotheses1

In [None]:
# Inspect the frame now that it carries the paraphrased hypotheses.
data

Unnamed: 0,doc_id,text,hypothesis,label,spans,paraphrased hypothesis
0,34,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Receiving Party shall not reverse engineer any...,NotMentioned,[],Any objects that contain Confidential Informat...
1,34,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Receiving Party shall destroy or return some C...,Entailment,"[39, 40]",Confidential Information must be destroyed or ...
2,34,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Agreement shall not grant Receiving Party any ...,Entailment,[38],Confidential Information is not granted to the...
3,34,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Receiving Party shall not disclose the fact th...,Entailment,[51],The Receiving Party is prohibited from reveali...
4,34,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Confidential Information shall only include te...,NotMentioned,[],Technical Specifications are the only parts th...
...,...,...,...,...,...,...
7186,624,Exhibit (d)(3)\nNON-DISCLOSURE AGREEMENT\nThis...,Receiving Party may create a copy of some Conf...,NotMentioned,[],Confidential Information may be copied by the ...
7187,624,Exhibit (d)(3)\nNON-DISCLOSURE AGREEMENT\nThis...,Receiving Party shall notify Disclosing Party ...,Entailment,[30],"If the Receiving Party is required by law, reg..."
7188,624,Exhibit (d)(3)\nNON-DISCLOSURE AGREEMENT\nThis...,Receiving Party may acquire information simila...,Entailment,"[98, 101]",Information that is confidential may be obtain...
7189,624,Exhibit (d)(3)\nNON-DISCLOSURE AGREEMENT\nThis...,Receiving Party may share some Confidential In...,Entailment,"[24, 25, 104, 105, 106]",Certain employees of Receiving Party may be gi...


In [None]:
# Save the frame with both original and paraphrased hypotheses as CSV, without the index.
data.to_csv('train_paraphrased.csv', index=False)

In [None]:
# Save the same frame as an Excel workbook, without the index.
data.to_excel('train_paraphrased.xlsx', index=False)

In [None]:
# Original examples: the columns of the input file, without the paraphrase column.
data1 = data[["doc_id","text", "hypothesis", "label", "spans"]]

In [None]:
# Augmented examples: same rows, but with the paraphrased hypothesis instead of the original one.
data2 = data[["doc_id","text", "paraphrased hypothesis", "label", "spans"]]

In [None]:
# Inspect the frame holding the original hypotheses.
data1

Unnamed: 0,doc_id,text,hypothesis,label,spans
0,34,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Receiving Party shall not reverse engineer any...,NotMentioned,[]
1,34,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Receiving Party shall destroy or return some C...,Entailment,"[39, 40]"
2,34,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Agreement shall not grant Receiving Party any ...,Entailment,[38]
3,34,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Receiving Party shall not disclose the fact th...,Entailment,[51]
4,34,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Confidential Information shall only include te...,NotMentioned,[]
...,...,...,...,...,...
7186,624,Exhibit (d)(3)\nNON-DISCLOSURE AGREEMENT\nThis...,Receiving Party may create a copy of some Conf...,NotMentioned,[]
7187,624,Exhibit (d)(3)\nNON-DISCLOSURE AGREEMENT\nThis...,Receiving Party shall notify Disclosing Party ...,Entailment,[30]
7188,624,Exhibit (d)(3)\nNON-DISCLOSURE AGREEMENT\nThis...,Receiving Party may acquire information simila...,Entailment,"[98, 101]"
7189,624,Exhibit (d)(3)\nNON-DISCLOSURE AGREEMENT\nThis...,Receiving Party may share some Confidential In...,Entailment,"[24, 25, 104, 105, 106]"


In [None]:
# Expose the paraphrases under the standard "hypothesis" column name.
# assign() builds a new frame, which avoids writing into a frame that was
# derived from `data` by column selection (chained assignment /
# SettingWithCopyWarning).
data2 = data2.assign(hypothesis=data2["paraphrased hypothesis"])

In [None]:
# Drop the now-redundant paraphrase column by rebinding to a new frame rather
# than mutating a derived frame in place.
data2 = data2.drop("paraphrased hypothesis", axis=1)

In [None]:
# Put the columns in the same order as in data1.
data2 = data2[["doc_id","text", "hypothesis", "label", "spans"]]

In [None]:
# Inspect the frame holding the paraphrased hypotheses.
data2

Unnamed: 0,doc_id,text,hypothesis,label,spans
0,34,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Any objects that contain Confidential Informat...,NotMentioned,[]
1,34,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Confidential Information must be destroyed or ...,Entailment,"[39, 40]"
2,34,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Confidential Information is not granted to the...,Entailment,[38]
3,34,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,The Receiving Party is prohibited from reveali...,Entailment,[51]
4,34,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Technical Specifications are the only parts th...,NotMentioned,[]
...,...,...,...,...,...
7186,624,Exhibit (d)(3)\nNON-DISCLOSURE AGREEMENT\nThis...,Confidential Information may be copied by the ...,NotMentioned,[]
7187,624,Exhibit (d)(3)\nNON-DISCLOSURE AGREEMENT\nThis...,"If the Receiving Party is required by law, reg...",Entailment,[30]
7188,624,Exhibit (d)(3)\nNON-DISCLOSURE AGREEMENT\nThis...,Information that is confidential may be obtain...,Entailment,"[98, 101]"
7189,624,Exhibit (d)(3)\nNON-DISCLOSURE AGREEMENT\nThis...,Certain employees of Receiving Party may be gi...,Entailment,"[24, 25, 104, 105, 106]"


In [None]:
# Stack the original and the paraphrased examples into one training set.
# ignore_index=True gives the result a fresh 0..n-1 index; without it every
# index label would appear twice (once per source frame).
new_data = pd.concat([data1, data2], ignore_index=True)
new_data.to_csv('train_data_augmentation.csv', index=False)

In [None]:
# Save the augmented training set as an Excel workbook, without the index.
new_data.to_excel('train_data_augmentation.xlsx', index=False)

In [None]:
# Inspect the combined (original + paraphrased) training set.
new_data

Unnamed: 0,doc_id,text,hypothesis,label,spans
0,34,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Receiving Party shall not reverse engineer any...,NotMentioned,[]
1,34,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Receiving Party shall destroy or return some C...,Entailment,"[39, 40]"
2,34,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Agreement shall not grant Receiving Party any ...,Entailment,[38]
3,34,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Receiving Party shall not disclose the fact th...,Entailment,[51]
4,34,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Confidential Information shall only include te...,NotMentioned,[]
...,...,...,...,...,...
7186,624,Exhibit (d)(3)\nNON-DISCLOSURE AGREEMENT\nThis...,Confidential Information may be copied by the ...,NotMentioned,[]
7187,624,Exhibit (d)(3)\nNON-DISCLOSURE AGREEMENT\nThis...,"If the Receiving Party is required by law, reg...",Entailment,[30]
7188,624,Exhibit (d)(3)\nNON-DISCLOSURE AGREEMENT\nThis...,Information that is confidential may be obtain...,Entailment,"[98, 101]"
7189,624,Exhibit (d)(3)\nNON-DISCLOSURE AGREEMENT\nThis...,Certain employees of Receiving Party may be gi...,Entailment,"[24, 25, 104, 105, 106]"
