# Installing and importing the libraries

In [5]:
pip install torch transformers spacy

Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [7]:
import nltk

# The sentence/word tokenizers used below need the Punkt models. Newer NLTK
# releases look up the 'punkt_tab' resource instead of the pickled 'punkt'
# one, so fetch both: the install above is unpinned and either may be needed.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Pre-processing step using Tokenizer

In [8]:
from nltk.tokenize import sent_tokenize, word_tokenize

text = "This is a sample text. It contains multiple sentences. We want to tokenize it."

# First pass: break the paragraph into individual sentences.
sentences = sent_tokenize(text)

# Second pass: break every sentence into its word and punctuation tokens.
tokenized_sentences = list(map(word_tokenize, sentences))

print(sentences)
print(tokenized_sentences)

['This is a sample text.', 'It contains multiple sentences.', 'We want to tokenize it.']
[['This', 'is', 'a', 'sample', 'text', '.'], ['It', 'contains', 'multiple', 'sentences', '.'], ['We', 'want', 'to', 'tokenize', 'it', '.']]


# Pre-processing step using *Stemming*

In [9]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Text used to demonstrate stemming
text = "This is a sample text. It contains multiple sentences. We want to tokenize it."

# Flat list of word/punctuation tokens for the whole text
words = word_tokenize(text)

# Stemmer instance shared by every token
porter_stemmer = PorterStemmer()

# Reduce each token to its Porter stem
stemmed_words = list(map(porter_stemmer.stem, words))

print(stemmed_words)


['thi', 'is', 'a', 'sampl', 'text', '.', 'it', 'contain', 'multipl', 'sentenc', '.', 'we', 'want', 'to', 'token', 'it', '.']


# Pre-processing step: converting text to token IDs

In [10]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the paraphrasing checkpoint used in this notebook
tokenizer = AutoTokenizer.from_pretrained(
    "humarin/chatgpt_paraphraser_on_T5_base"
)

def preprocess_input(text, max_length=128):
    """Turn raw text into the token-ID tensor fed to the paraphraser.

    The text is prefixed with ``paraphrase: `` and passed through the
    module-level ``tokenizer``.

    Args:
        text: Text to encode.
        max_length: Maximum length handed to the tokenizer; truncation is
            enabled, so longer inputs are cut to this length.

    Returns:
        The ``input_ids`` of the tokenizer output, requested as PyTorch
        tensors (``return_tensors="pt"``).
    """
    encoded = tokenizer(
        f'paraphrase: {text}',
        return_tensors="pt",
        padding="longest",
        max_length=max_length,
        truncation=True,
    )
    return encoded.input_ids

# Short sentence used to try out the pre-processing helper
input_text = "Please paraphrase this sentence."

# Encode it with preprocess_input and show the resulting token IDs
input_ids = preprocess_input(input_text)
print(input_ids)

tensor([[ 3856, 27111,    10,   863,  3856, 27111,    48,  7142,     5,     1]])


In [11]:
# Longer, two-sentence input for the same pre-processing helper
input_text = "Generating text is the task of producing new text. These models can, for example, fill in incomplete text or paraphrase."

# Encode it with preprocess_input and show the resulting token IDs
input_ids = preprocess_input(input_text)
print(input_ids)

tensor([[ 3856, 27111,    10,  5945,    49,  1014,  1499,    19,     8,  2491,
            13,  5874,   126,  1499,     5,   506,  2250,    54,     6,    21,
           677,     6,    14,    16, 19840,  1499,    42,  3856, 27111,     5,
             1]])


# Loading the pre-trained model

In [12]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Everything in this notebook runs on the CPU
device = "cpu"

# Tokenizer belonging to the paraphrasing checkpoint
tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")

# Load the pre-trained seq2seq weights, then place the model on the chosen device
model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
model = model.to(device)


# SVO Arrangement

In [13]:
def organize_output(paraphrase_output):
    """Rebuild a paraphrase as a single-spaced sentence from three positional chunks.

    The text is split on whitespace and cut at fixed positions: the first
    three tokens, the next three, and everything after. These chunks are only
    positional stand-ins for subject / verb / object; no grammatical parsing
    is performed.

    Args:
        paraphrase_output: Paraphrased sentence as a string.

    Returns:
        The chunks joined by single spaces. Chunks that come out empty (inputs
        of six tokens or fewer) are skipped, so the result never carries
        trailing spaces; an empty or whitespace-only input yields ``""``.
    """
    tokens = paraphrase_output.split()

    subject = " ".join(tokens[:3])
    verb = " ".join(tokens[3:6])
    obj = " ".join(tokens[6:])

    # Join only the non-empty chunks: formatting all three slots unconditionally
    # left one or two dangling spaces on short sentences.
    return " ".join(chunk for chunk in (subject, verb, obj) if chunk)

## Define the Paraphrase function

In [14]:
def paraphrase(
    question,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=5,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=128
):
    """Generate several paraphrases of ``question`` with the loaded seq2seq model.

    The text is prefixed with ``paraphrase: ``, encoded with the module-level
    ``tokenizer``, passed to ``model.generate`` and decoded back to strings.
    Each decoded string is then run through ``organize_output``.

    Args:
        question: Text to paraphrase.
        num_beams: Forwarded to ``model.generate``.
        num_beam_groups: Forwarded to ``model.generate``.
        num_return_sequences: Forwarded to ``model.generate``.
        repetition_penalty: Forwarded to ``model.generate``.
        diversity_penalty: Forwarded to ``model.generate``.
        no_repeat_ngram_size: Forwarded to ``model.generate``.
        temperature: Forwarded to ``model.generate``.
            NOTE(review): sampling is not enabled here (no ``do_sample=True``),
            so this value may be ignored or trigger a warning - confirm against
            the installed transformers version.
        max_length: Used both as the tokenizer's truncation length for the
            input and as ``max_length`` for generation.

    Returns:
        A list of strings, one per sequence returned by ``model.generate``.
    """
    input_ids = tokenizer(
        f'paraphrase: {question}',
        return_tensors="pt", padding="longest",
        max_length=max_length,
        truncation=True,
    ).input_ids

    # Keep the inputs on the same device as the model so the function keeps
    # working if the model is placed somewhere other than the CPU.
    input_ids = input_ids.to(model.device)

    outputs = model.generate(
        input_ids, temperature=temperature, repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams, num_beam_groups=num_beam_groups,
        max_length=max_length, diversity_penalty=diversity_penalty
    )

    paraphrases = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Post-process every candidate; the loop variable is deliberately not named
    # `paraphrase` so it does not shadow this function's own name.
    structured_sentences = [organize_output(candidate) for candidate in paraphrases]

    return structured_sentences

# Examples: paraphrasing input text

In [15]:
# Paraphrase a simple statement and show every returned candidate
text = 'Shivam is doing Data science.'
structured_sentences = paraphrase(text)
print(structured_sentences)



['Shivam is engaged in Data Science. ', 'The work being done by Shivam is in the field of Data Science.', 'A person named Shivam is engaged in Data Science.', 'Data science is being performed by Shivam.', 'Shivam focuses on data science. ']


In [16]:
# Paraphrase a short input that has no final punctuation
text = 'Ram eat mango'
structured_sentences = paraphrase(text)
print(structured_sentences)



['Ram indulges in mango consumption. ', "The consumption of mangoes is Ram's preference.", 'Ram ingests mangoes.  ', "Mangos are a part of Ram's diet.", "In Ram's diet, mango is a must-eat."]


In [18]:
# Paraphrase a first-person sentence (the input keeps its trailing space)
text = 'I have done this work '
structured_sentences = paraphrase(text)
print(structured_sentences)

['This work has been completed by me.', 'I have carried out this task. ', 'It is my job that I have completed.', 'The task that I am currently doing has been completed by me.', 'I\'ve completed this job." ']
