In [45]:
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import torch

import random
import re

from parrot import Parrot
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

# IMDB data set exploration

In [9]:
imdb = pd.read_csv("IMDB Dataset.csv")
imdb

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [41]:
# clean reviews
imdb['review'] = imdb['review'].apply(lambda x: re.sub(r'<br\s*/?>', '', x, flags=re.IGNORECASE))

In [14]:
imdb['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [42]:
pos = imdb[imdb['sentiment'] == "positive"]
neg = imdb[imdb['sentiment'] == "negative"]

# Paraphrasing models

In [27]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")

model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")

def paraphrase(
    question,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=5,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=128
):
    input_ids = tokenizer(
        f'paraphrase: {question}',
        return_tensors="pt", padding="longest",
        max_length=max_length,
        truncation=True,
    ).input_ids
    
    outputs = model.generate(
        input_ids, temperature=temperature, repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams, num_beam_groups=num_beam_groups,
        max_length=max_length, diversity_penalty=diversity_penalty
    )

    res = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return res


In [3]:
parrot = Parrot(model_tag="prithivida/parrot_paraphraser_on_T5")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]



Downloading (…)lve/main/config.json:   0%|          | 0.00/913 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/476 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)2b9e5/.gitattributes:   0%|          | 0.00/736 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)3c1ed2b9e5/README.md:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

Downloading (…)1ed2b9e5/config.json:   0%|          | 0.00/686 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading (…)c1ed2b9e5/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)2b9e5/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading (…)c1ed2b9e5/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)ed2b9e5/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [30]:
phrase = "this model is not performing up to my expectations"

print("Parrot output:")
para_phrases = parrot.augment(input_phrase=phrase, use_gpu=False) # returns (string, len(string))
for para_phrase in para_phrases:
    print(para_phrase[0])
    
print("--------------------------------------------------------")

print("Humarin's paraphraser output:")
para_phrases2 = paraphrase(phrase)
for para_phrase in para_phrases2:
    print(para_phrase)


Parrot output:
this model doesn't meet my expectations
this model does not meet my expectations
this model is not at my level of expectation
this model is not a match for my expectations
this model is not on par with my expectations
this model does not live up to my expectations
this model doesn't perform to my expectations
--------------------------------------------------------
Humarin's paraphraser output:
This model is not meeting my expectations.
I am not satisfied with the performance of this model.
The quality of this model is not satisfactory.
Although this model is good, I am not entirely impressed with its performance.
My impressions of this model are not up to par.


In [56]:
phrase = " A welcome relief from baseball movies that try too hard to be mythic , this one is a sweet and modest and ultimately winning story ."

print("Parrot output:")
para_phrases = parrot.augment(input_phrase=phrase, use_gpu=False,max_length=len(phrase)) # returns (string, len(string))
for para_phrase in para_phrases:
    print(para_phrase[0])
    
print("--------------------------------------------------------")

print("Humarin's paraphraser output:")
para_phrases2 = paraphrase(phrase)
for para_phrase in para_phrases2:
    print(para_phrase)

Parrot output:
a welcome relief from baseball movies that try too hard to be mythic this one is a sweet and modest and ultimately winning story
--------------------------------------------------------
Humarin's paraphraser output:
Unlike baseball movies that strive to be overhyped, this story is both humble and ultimately successful.
The sweet, modest and ultimately triumphant storyline of this baseball movie is a welcome change from those who try to steal the show.
This baseball movie is a welcome change from the overly ambitious and overblown tale of triumphant team members, as it's genuinely sweet and modest.
It's a welcome change from baseball movies that strive to be mythical, as it'll end up being genuinely sweet, modest, and ultimately successful.
In a time when baseball movies strive to be mythical, this film offers reassurance and an ultimately successful story.


In [43]:
phrase = imdb["review"].values[random.randint(0,50000)]
print(f'review: {phrase}')
print("--------------------------------------------------------")

print("Parrot output:")
para_phrases = parrot.augment(input_phrase=phrase, use_gpu=False) # returns (string, len(string))
for para_phrase in para_phrases:
    print(para_phrase[0])
    
print("--------------------------------------------------------")

print("Humarin's paraphraser output:")
para_phrases2 = paraphrase(phrase)
for para_phrase in para_phrases2:
    print(para_phrase)

review: The John Van Druten Broadway hit is brought to the screen with a maximum of star power in this romantic fantasy about a modern-day witch who beguiles a successful Manhattan publisher. James Stewart may get top billing, but it is Kim Novak who steals the show as one of the most alluring witches ever to cast a spell on the movie screen. The lead pairing is, in fact, one of the movie's few weaknesses: the gray-haired Stewart seems a bit old for the role, and while it is easy to see why he falls hard for Novak, it's a little harder to understand what she finds attractive about him, as they seem mismatched in temperment and outlook. (It is one of the story's amusing conceits that witches and warlocks are portrayed as Greenwich Village beatniks and bohemians.) Curiously, the Stewart-Novak pairing would generate a lot more heat in "Vertigo", released the same year as this film, but then "Vertigo" had a compelling suspense story, and the benefit of Alfred Hitchcock's direction.The film



The John Van Druten Broadway hit is brought to the screen with a maximum of star power in this romantic fantasy about a modern-day witch who beguiles a successful Manhattan publisher. James Stewart may get top billing, but it is Kim Novak who steals the show as one of the most alluring witches ever to cast a spell on the movie screen. The lead pairing is, in fact, one of the movie's few weaknesses: the gray-haired Stewart seems a bit old for the role, and while it is easy to see why he falls hard for Novak, it's a little harder to understand what she finds attractive about him, as they seem mismatched in temperment and outlook. (It is one of the story's amusing conceits that witches and warlocks are portrayed as Greenwich Village beatniks and bohemians.) Curiously, the Stewart-Novak pairing would generate a lot more heat in "Vertigo", released the same year as this film, but then "Vertigo" had a compelling suspense story, and the benefit of Alfred Hitchcock's direction.The film's comic



The modern-day witch who trickles down on a wealthy Manhattan publisher is the star of this romantic fantasy, with Kim Novak being cast as one of the most alluring witches to ever grace the screen. However, Stewart's appearance doesn't suit his taste at all, making him an unappealing halftime role in the film.
In this romantic fantasy about a modern witch who befriends – and overpowers the successful Manhattan publisher - John Van Druten's Broadway hit, Kim Novak is one of the most attractive witches ever to cast on the movie screen. The lead actor, James Stewart, looks rather outcast for his role in the film, which may not have been an issue at all.
The movie features a modern-day witch who aids aspiring publisher in their romantic fantasy, inspired by the Broadway hit John Van Druten. While James Stewart is the most popular character, Kim Novak is one of the more alluring witches to ever grace the screen. Unfortunately, Stewart's youthful appearance makes her less suitable for the le

*Parrot does not work for multi sentence strings 

# Validating paraphrasing models

### Sentence wise

In [49]:
# Using Nltk.vader's Sentiment intensity Analyser
sia = SIA()

In [48]:
phrase = 'This is an outstanding movie with a great cast. The plot is equally great'
base_score = sia.polarity_scores(phrase)['compound']
print(f'base score: {base_score}')

print("--------------------------------------------------------")

print("Humarin's paraphraser output:")
para_phrases2 = paraphrase(phrase)
for para_phrase in para_phrases2:
    para_score = sia.polarity_scores(para_phrase)['compound']
    print(f'para score: {para_score}')
    print(f'difference: {abs(para_score - base_score)}')
    
    

base score: 0.9217
--------------------------------------------------------
Humarin's paraphraser output:
para score: 0.8074
difference: 0.11429999999999996
para score: 0.9081
difference: 0.013599999999999945
para score: 0.8074
difference: 0.11429999999999996
para score: 0.8126
difference: 0.10909999999999997
para score: 0.6124
difference: 0.3092999999999999


In [51]:
phrase = 'This is an outstanding movie with a great cast. The plot is equally great'
base_score = sia.polarity_scores(phrase)['compound']
print(f'base score: {base_score}')

print("--------------------------------------------------------")

print("Parrot output:")
para_phrases = parrot.augment(input_phrase=phrase, use_gpu=False) # returns (string, len(string))
for para_phrase in para_phrases:
    para_score = sia.polarity_scores(para_phrase[0])['compound']
    print(f'para score: {para_score}')
    print(f'difference: {abs(para_score - base_score)}')


base score: 0.9217
--------------------------------------------------------
Parrot output:
para score: 0.8934
difference: 0.028299999999999992
para score: 0.8934
difference: 0.028299999999999992
para score: 0.9217
difference: 0.0
para score: 0.9136
difference: 0.008099999999999996
para score: 0.9001
difference: 0.021599999999999953
para score: 0.9217
difference: 0.0
para score: 0.9001
difference: 0.021599999999999953
para score: 0.9217
difference: 0.0


In [57]:
phrase = imdb["review"].values[random.randint(0,50000)]
base_score = sia.polarity_scores(phrase)['compound']
print(f'base score: {base_score}')

print("--------------------------------------------------------")

print("Humarin's paraphraser output:")
para_phrases2 = paraphrase(phrase)
for para_phrase in para_phrases2:
    para_score = sia.polarity_scores(para_phrase)['compound']
    print(f'para score: {para_score}')
    print(f'difference: {abs(para_score - base_score)}')

base score: 0.9273
--------------------------------------------------------
Humarin's paraphraser output:
para score: 0.945
difference: 0.017699999999999938
para score: 0.9225
difference: 0.0048000000000000265
para score: 0.6557
difference: 0.27160000000000006
para score: 0.8968
difference: 0.03049999999999997
para score: 0.8622
difference: 0.06510000000000005


Sentence wise paraphrasing works better for parrot, if sentence is actually paraphrased