<a href="https://colab.research.google.com/github/Inzamam1234/NLP_ToolKit/blob/main/NLP_Toolkit_Paraphasing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets



In [None]:
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, TrainerCallback

In [None]:
# Load the PAWS dataset
dataset = load_dataset("paws", "labeled_final")


def preprocess_pows(dataset, label=1):
  df = pd.DataFrame(dataset)
  df = df[df['label']==label]

  df['input_text'] = "paraphrase :" + df['sentence1']
  df['target_text'] = df['sentence2']

  return df[['input_text','target_text']]

train_df = preprocess_pows(dataset['train']).sample(3000, random_state=42)
test_df = preprocess_pows(dataset['test']).sample(300, random_state=42)
validation_df = preprocess_pows(dataset['validation']).sample(300, random_state=42)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
validation_dataset = Dataset.from_pandas(validation_df)

In [None]:
# Initialize tokenizer and model
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

max_length = max([len(tokenizer.encode(text)) for text in train_dataset['input_text']])

# Tokenization function
def tokenize_function(examples):
    inputs = tokenizer(examples['input_text'], max_length=max_length, truncation=True, padding="max_length")
    targets = tokenizer(examples['target_text'], max_length=max_length, truncation=True, padding="max_length")
    inputs['labels'] = targets['input_ids']
    return inputs

# Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
validation_dataset = validation_dataset.map(tokenize_function, batched=True)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [None]:
train_dataset[0]

{'input_text': 'paraphrase :William died in 1859 and Elizabeth died the following year .',
 'target_text': 'In 1859 , William and Elizabeth died the following year .',
 '__index_level_0__': 28667,
 'input_ids': [3856,
  27111,
  3,
  10,
  518,
  1092,
  23,
  265,
  3977,
  16,
  507,
  3390,
  11,
  9066,
  3977,
  8,
  826,
  215,
  3,
  5,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


In [None]:
results_dir = "/content/results"
model_dir = "/content/saved_t5_model"

In [None]:
training_args = TrainingArguments(
    output_dir= results_dir,  # Directory where the results (logs, model outputs) will be saved
    overwrite_output_dir=True,  # Overwrite the output directory if it exists
    eval_strategy="epoch",  # Evaluate the model at the end of each epoch
    eval_steps=500,  # Evaluate every 500 steps (this might not be necessary if evaluation is at epoch level)
    logging_dir='./logs',  # Directory to save logs
    logging_steps=100,  # Log every 100 steps
    learning_rate=3e-5,  # Learning rate for optimizer
    per_device_train_batch_size=8,  # Batch size for training
    per_device_eval_batch_size=8,  # Batch size for evaluation
    num_train_epochs=5,  # Number of epochs to train the model
    weight_decay=0.01,  # L2 regularization to prevent overfitting
    save_strategy="no",  # Disable saving checkpoints during training to save memory
    fp16=True,  # Use mixed precision training to speed up training and reduce memory usage
    report_to="none"  # Disable reporting to avoid unwanted logging to external platforms
)

# Initialize Trainer
trainer = Trainer(
    model=model,  # The model to be trained
    args=training_args,  # The training arguments defined above
    train_dataset=train_dataset,  # The training dataset
    eval_dataset=validation_dataset,  # The validation dataset
)

# Start training
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.2578,0.227999
2,0.2278,0.215651
3,0.2,0.211094
4,0.1921,0.209937
5,0.1867,0.209199


TrainOutput(global_step=1875, training_loss=0.30667708536783855, metrics={'train_runtime': 373.4652, 'train_samples_per_second': 40.164, 'train_steps_per_second': 5.021, 'total_flos': 1355882803200000.0, 'train_loss': 0.30667708536783855, 'epoch': 5.0})

In [None]:
# Save the final model
trainer.save_model(model_dir)
tokenizer.save_pretrained(model_dir)

('/content/saved_t5_model/tokenizer_config.json',
 '/content/saved_t5_model/special_tokens_map.json',
 '/content/saved_t5_model/spiece.model',
 '/content/saved_t5_model/added_tokens.json')

In [None]:
import os
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the model and tokenizer from the saved directory
model = T5ForConditionalGeneration.from_pretrained(model_dir)
tokenizer = T5Tokenizer.from_pretrained(model_dir)

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the model and tokenizer from the saved directory
model = T5ForConditionalGeneration.from_pretrained(model_dir)
tokenizer = T5Tokenizer.from_pretrained(model_dir)

# Set the device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


# Preprocessing function for inference
def preprocess_input(sentence):
    return "paraphrase: " + sentence

# Generate paraphrases with corrected num_beams and num_return_sequences
def generate_paraphrase(input_text, model, tokenizer, max_length=max_length, num_beams=5, num_return_sequences=4, top_k=100, top_p=0.9, temperature=1.0):
    # Preprocess input
    input_text = preprocess_input(input_text)

    # Tokenize input
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_length, padding="max_length")

    # Move inputs to the same device as the model
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Generate paraphrases
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length + 20,  # Increase max_length for more room in paraphrases
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
        top_k=top_k,              # Use top-k sampling for diversity
        top_p=top_p,              # Use top-p sampling for nucleus-based sampling
        temperature=temperature,  # Encourage more exploratory generation
        do_sample=True,           # Enable sampling for top-k and top-p
        early_stopping=True
    )

    # Decode generated outputs
    paraphrased_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return paraphrased_texts

# Example sentence
input_sentence = "The quick brown fox jumps over the lazy dog."

# Generate paraphrases
paraphrased_sentences = generate_paraphrase(
    input_sentence, model, tokenizer,max_length, num_return_sequences=4
)

# Display results
print(f"Original: {input_sentence}")
for i, paraphrase in enumerate(paraphrased_sentences, 1):
    print(f"Paraphrase {i}: {paraphrase}")

Original: The quick brown fox jumps over the lazy dog.
Paraphrase 1: The quick brown fox jumps over the lazy dog.
Paraphrase 2: The fast brown fox jumps over the lazy dog.
Paraphrase 3: The quick brown fox leaps over the lazy dog.
Paraphrase 4: The quick brown fox jumps over the lazy dog .


In [None]:
# Example sentence
input_sentence = "She enjoys reading books on rainy afternoons."

# Generate paraphrases
paraphrased_sentences = generate_paraphrase(
    input_sentence, model, tokenizer, num_return_sequences=4
)

# Display results
print(f"Original: {input_sentence}")
for i, paraphrase in enumerate(paraphrased_sentences, 1):
    print(f"Paraphrase {i}: {paraphrase}")

Original: She enjoys reading books on rainy afternoons.
Paraphrase 1: She enjoys reading books on rainy afternoons.
Paraphrase 2: She enjoys reading books on rainy afternoons .
Paraphrase 3: On rainy afternoons, she enjoys reading books.
Paraphrase 4: On rainy afternoons she enjoys reading books.
