### Setup

In [None]:
# Install the packages this notebook relies on (-q keeps pip's output short).
!pip install -q transformers
# NOTE(review): sentencepiece is presumably needed by the Pegasus tokenizer loaded later — confirm.
!pip install -q sentencepiece
!pip install -q numpy requests nlpaug
# The Parrot paraphraser is installed directly from its GitHub repository.
!pip install -q git+https://github.com/PrithivirajDamodaran/Parrot_Paraphraser.git

[K     |████████████████████████████████| 3.8 MB 4.2 MB/s 
[K     |████████████████████████████████| 67 kB 3.3 MB/s 
[K     |████████████████████████████████| 596 kB 50.4 MB/s 
[K     |████████████████████████████████| 6.5 MB 38.3 MB/s 
[K     |████████████████████████████████| 895 kB 47.4 MB/s 
[K     |████████████████████████████████| 1.2 MB 4.3 MB/s 
[K     |████████████████████████████████| 410 kB 4.1 MB/s 
[K     |████████████████████████████████| 50 kB 2.6 MB/s 
[K     |████████████████████████████████| 79 kB 3.9 MB/s 
[?25h  Building wheel for parrot (setup.py) ... [?25l[?25hdone
  Building wheel for python-Levenshtein (setup.py) ... [?25l[?25hdone
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive so files stored there are reachable from this runtime.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd drive/MyDrive/Colab Notebooks/siap/datasets

/content/drive/MyDrive/Colab Notebooks/siap/datasets


In [None]:
from nltk import sent_tokenize, word_tokenize
import nltk

# Download the 'punkt' package (presumably the data sent_tokenize relies on — confirm).
# sent_tokenize is used further down to split each review into sentences.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import pandas as pd
# Load the reviews to paraphrase. Later cells read the 'Review ID', 'Rating'
# and 'Review Text' columns of this frame.
df = pd.read_csv('train_clean_with_emoticons.csv')

In [None]:
from numpy import inf
def find_new_elements(old_lst, new_lst):
    """Return the items of ``new_lst`` that do not occur in ``old_lst``.

    Order and duplicates of ``new_lst`` are preserved.
    """
    fresh = []
    for item in new_lst:
        if item in old_lst:
            continue
        fresh.append(item)
    return fresh

def intersection(old_lst, new_lst):
    """Return the items of ``new_lst`` that also occur in ``old_lst``.

    Order and duplicates of ``new_lst`` are preserved.
    """
    shared = []
    for item in new_lst:
        if item not in old_lst:
            continue
        shared.append(item)
    return shared

def normalize_parot_output(output):
  """Keep only the first element of every item in ``output``.

  Args:
    output: Iterable of indexable items (e.g. tuples), or ``None``.
      NOTE(review): Parrot's ``augment`` reportedly returns ``None`` when it
      finds no paraphrase — confirm against the installed version.

  Returns:
    A list holding ``item[0]`` for each item; an empty list when ``output``
    is ``None`` or empty.
  """
  # Guard against a missing result so callers always get a list back.
  if not output:
    return []
  return [out[0] for out in output]

def find_best_paraphrase(original, paraphrases):
  """Pick the paraphrase that introduces the fewest words absent from ``original``.

  Words are obtained by lower-casing and splitting on single spaces. When
  several paraphrases tie, the earliest one is returned.

  Args:
    original: The source sentence.
    paraphrases: Candidate paraphrase strings.

  Returns:
    The selected paraphrase, or ``None`` when ``paraphrases`` is empty.
  """
  if not paraphrases:
    return None
  if len(paraphrases) == 1:
    return paraphrases[0]

  # Tokenise the original once; it does not change between candidates.
  original_words = set(original.lower().split(' '))

  def count_new_words(paraphrase):
    """Number of words in ``paraphrase`` that the original does not contain."""
    return sum(1 for word in paraphrase.lower().split(' ') if word not in original_words)

  # The previous loop stored the running minimum under a different name, so the
  # comparison always ran against infinity and the last candidate always won.
  # NOTE(review): the old variable name ("min_intersection") hints that the
  # least-overlapping paraphrase may have been intended; the metric and the
  # direction of the comparison are kept as written — confirm the intent.
  return min(paraphrases, key=count_new_words)

def append_if_removed(str1, str2, characters):
  """Restore a trailing character that ``str2`` lost relative to ``str1``.

  If ``str1`` ends with one of ``characters`` and ``str2`` does not end with
  any of them, the last character of ``str1`` is appended to ``str2``.

  Args:
    str1: Reference string (e.g. the original sentence).
    str2: String to repair (e.g. the paraphrase).
    characters: Collection of single characters to look for at the end.

  Returns:
    A ``(str1, str2)`` tuple; ``str1`` is always returned unchanged.
  """
  # Empty strings have no last character; indexing them would raise IndexError.
  if not str1 or not str2:
    return str1, str2

  if str1[-1] in characters and str2[-1] not in characters:
    str2 += str1[-1]
  return str1, str2

In [None]:
import torch
from transformers import PegasusForConditionalGeneration, AutoTokenizer

# Identifier of the pretrained Pegasus paraphrasing checkpoint to load.
model_name = 'tuner007/pegasus_paraphrase'
# Run on the GPU when CUDA is available, otherwise on the CPU.
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Load the tokenizer and model for that checkpoint; the model is moved to the chosen device.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

Downloading:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

In [None]:
def get_response(input_text, num_return_sequences, num_beams, max_length=60):
  """Generate paraphrases of ``input_text`` with the module-level Pegasus model.

  Relies on the ``tokenizer``, ``model`` and ``torch_device`` globals defined
  in the model-loading cell.

  Args:
    input_text: Text to paraphrase; it is tokenized as a one-item batch.
    num_return_sequences: Number of generated sequences to return.
    num_beams: Beam width passed to ``model.generate``.
    max_length: Token limit used both to truncate the input and to cap the
      generated output. Defaults to 60, the value that used to be hard-coded.

  Returns:
    List of decoded strings with special tokens removed.
  """
  encoded = tokenizer(
      [input_text],
      truncation=True,
      padding='longest',
      max_length=max_length,
      return_tensors="pt",
  ).to(torch_device)
  # NOTE(review): temperature normally only influences sampling-based decoding;
  # with do_sample left at its default it may have no effect here — confirm
  # against the transformers version in use before relying on it.
  generated = model.generate(
      **encoded,
      max_length=max_length,
      num_beams=num_beams,
      num_return_sequences=num_return_sequences,
      temperature=1.5,
  )
  return tokenizer.batch_decode(generated, skip_special_tokens=True)

In [None]:
from tqdm import tqdm
import pickle
from IPython.display import clear_output

RESULTS_PATH = 'paraphrases_dict.pkl'


def save_results(results, path=RESULTS_PATH):
  """Write the paraphrase dictionary to ``path`` with pickle."""
  with open(path, 'wb') as f:
    pickle.dump(results, f)


# Resume from an earlier run when a checkpoint exists; otherwise start empty
# so the very first run does not crash on the missing file.
# NOTE: pickle.load can execute arbitrary code — only load a checkpoint that
# this notebook wrote itself.
try:
  with open(RESULTS_PATH, 'rb') as f:
    results = pickle.load(f)
except FileNotFoundError:
  results = {}

num_beams = 10
num_return_sequences = 5
iteration_counter = 0
num_rows = len(df)
for index, row in df.iterrows():
  iteration_counter += 1
  # Skip reviews that already have paraphrases and reviews rated 5.
  if row['Review ID'] in results or int(row['Rating']) == 5:
    continue

  # Paraphrase sentence by sentence; the i-th paraphrase of every sentence is
  # concatenated into the i-th paraphrased review.
  sentences = sent_tokenize(row['Review Text'])
  final_reviews = [''] * num_return_sequences
  for sentence in sentences:
    paraphrases = get_response(sentence, num_return_sequences, num_beams)

    for i, paraphrase in enumerate(paraphrases):
      # Re-attach sentence-final punctuation that the model dropped.
      sentence, paraphrase = append_if_removed(sentence, paraphrase, ['.', '!', '?'])
      final_reviews[i] += paraphrase + ' '

  results[row['Review ID']] = [review.strip() for review in final_reviews]

  # Checkpoint every 10 stored reviews so an interrupted run can resume.
  if len(results) % 10 == 0:
    save_results(results)

    clear_output(wait=True)
    print('Progress >>>', round(iteration_counter / num_rows * 100), '%')

# Final flush: without it, reviews processed after the last multiple-of-10
# checkpoint would never reach the file.
save_results(results)
clear_output(wait=True)
print('Progress >>>', 100, '%')

Progress >>> 100 %
