### Setup

In [None]:
!pip install -q transformers
!pip install -q sentencepiece
!pip install -q numpy requests nlpaug
!pip install -q git+https://github.com/PrithivirajDamodaran/Parrot_Paraphraser.git

In [None]:
import torch
from tqdm import tqdm
import pickle
from transformers import PegasusForConditionalGeneration, AutoTokenizer
from numpy import inf
import pandas as pd
import pickle
from nltk import sent_tokenize, word_tokenize
import nltk
# sent_tokenize needs the Punkt sentence-boundary models. Older NLTK releases
# read them from the 'punkt' package, recent releases from 'punkt_tab', so
# fetch both; a name unknown to the installed version is only reported by
# the downloader and does not stop the notebook.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset and checkpoint files stored there can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd drive/MyDrive/siap/datasets

/content/drive/MyDrive/siap/datasets


In [16]:
# Base name (without extension) of the CSV to augment; the file is read from
# the current working directory, and the same name is reused when saving.
dataset_name = 'train_clean_with_emoticons'
df = pd.read_csv(f'{dataset_name}.csv')

In [None]:
def find_new_elements(old_lst, new_lst):
    """Return the items of `new_lst` that do not occur in `old_lst`.

    The order of `new_lst` is kept and repeated items are reported once per
    occurrence.
    """
    fresh_items = []
    for candidate in new_lst:
        if candidate in old_lst:
            continue
        fresh_items.append(candidate)
    return fresh_items

def intersection(old_lst, new_lst):
    """Return the items of `new_lst` that also occur in `old_lst`.

    The order of `new_lst` is kept and repeated items are reported once per
    occurrence.
    """
    return list(filter(lambda item: item in old_lst, new_lst))

def find_best_paraphrase(original, paraphrases):
  """Pick the candidate that introduces the fewest words absent from `original`.

  Both texts are lower-cased and split on single spaces; a candidate's score
  is the number of its words (counting repeats) that do not appear in
  `original`. The lowest score wins and ties go to the earliest candidate.

  Args:
    original: the source sentence.
    paraphrases: candidate paraphrases of `original`.

  Returns:
    The selected candidate, or None when `paraphrases` is empty.
  """
  # NOTE(review): the criterion (fewest new words, i.e. the candidate closest
  # to the original) is kept as written; confirm it is the intended notion of
  # "best" rather than the most different candidate.
  original_words = set(original.lower().split(' '))

  best_paraphrase = None
  fewest_new_words = float('inf')
  for paraphrase in paraphrases:
    new_word_count = sum(
        1 for word in paraphrase.lower().split(' ') if word not in original_words
    )
    if new_word_count < fewest_new_words:
      # Track the running minimum so later, worse candidates cannot win.
      fewest_new_words = new_word_count
      best_paraphrase = paraphrase

  return best_paraphrase

def append_if_removed(str1, str2, character):
  """Re-attach a trailing `character` to `str2` when `str1` ends with it.

  Used to restore terminal punctuation that the original sentence had but
  the paraphrase lost.

  Args:
    str1: reference text (returned unchanged).
    str2: text to repair.
    character: the suffix to look for, e.g. '.', '!' or '?'.

  Returns:
    A `(str1, str2)` tuple where `str2` has `character` appended if `str1`
    ends with it and `str2` does not.
  """
  # endswith (unlike indexing with [-1]) is safe when either string is empty.
  if str1.endswith(character) and not str2.endswith(character):
    str2 += character
  return str1, str2

In [None]:
# Identifier of the pretrained Pegasus paraphrasing checkpoint passed to
# from_pretrained below.
model_name = 'tuner007/pegasus_paraphrase'
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Load the tokenizer and the model weights, then move the model to the
# selected device. get_response reads these three globals.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

Downloading:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

In [None]:
def get_response(input_text, num_return_sequences, num_beams):
  """Generate paraphrase candidates for one piece of text.

  Uses the notebook-level `tokenizer`, `model` and `torch_device`. The input
  is truncated to 60 tokens and generation is capped at a length of 60.

  Args:
    input_text: the text to paraphrase.
    num_return_sequences: how many candidates to return.
    num_beams: beam width for generation.

  Returns:
    A list of decoded candidate strings with special tokens removed.
  """
  encoded = tokenizer(
      [input_text],
      truncation=True,
      padding='longest',
      max_length=60,
      return_tensors="pt",
  ).to(torch_device)

  # NOTE(review): temperature is passed without enabling sampling; it
  # presumably has no effect on plain beam search - confirm.
  generated_ids = model.generate(
      **encoded,
      max_length=60,
      num_beams=num_beams,
      num_return_sequences=num_return_sequences,
      temperature=1.5,
  )

  return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

In [None]:
# Paraphrase every review sentence by sentence, checkpointing to disk so an
# interrupted Colab session can resume where it stopped.
checkpoint_path = 'paraphrases.pkl'
checkpoint_every = 10

# Resume from an earlier run of this cell, or start from scratch when no
# checkpoint exists yet.
# NOTE: pickle.load can execute code embedded in the file; only load a
# checkpoint that this notebook wrote itself.
try:
  with open(checkpoint_path, 'rb') as f:
    results = pickle.load(f)
except FileNotFoundError:
  results = []

num_beams = 10
num_return_sequences = 5
res_len = len(results)

reviews = df['Review Text']
# results[k] is the paraphrase of the k-th review, so the first res_len rows
# are already done; `initial`/`total` keep the progress bar showing the
# position within the whole dataset.
for review in tqdm(reviews.iloc[res_len:], total=len(reviews), initial=res_len):
  paraphrase = ''
  for sentence in sent_tokenize(review):
    paraphrases = get_response(sentence, num_return_sequences, num_beams)
    best_paraphrase = find_best_paraphrase(sentence, paraphrases)

    # Restore terminal punctuation that the original sentence had but the
    # chosen paraphrase lost.
    for mark in ('.', '!', '?'):
      sentence, best_paraphrase = append_if_removed(sentence, best_paraphrase, mark)

    # Sentences are joined with a trailing space after each one.
    paraphrase += best_paraphrase + ' '

  results.append(paraphrase)
  if len(results) % checkpoint_every == 0:
    with open(checkpoint_path, 'wb') as f:
      pickle.dump(results, f)

# Persist the tail of the run as well: the periodic save above only fires
# when the number of results is a multiple of checkpoint_every.
with open(checkpoint_path, 'wb') as f:
  pickle.dump(results, f)

In [5]:
# Load the finished list of paraphrases.
# NOTE(review): no cell visible here writes 'paraphrases_final.pkl';
# presumably it is a copy of the completed 'paraphrases.pkl' checkpoint -
# confirm how it is produced.
# NOTE: pickle.load can execute code embedded in the file; only load files
# from a trusted source.
with open('paraphrases_final.pkl', 'rb') as f:
  results = pickle.load(f)

In [17]:
# Attach the paraphrases as a new column; values are matched to rows by
# position, so `results` must hold exactly one entry per row of df.
df['Augmented review text'] = results

In [None]:
# Spot-check a few random rows: original review, its paraphrase, then a
# separator line.
rows = 3
preview = df.sample(rows)
text_pairs = zip(preview['Review Text'], preview['Augmented review text'])
for original_text, augmented_text in text_pairs:
    print(original_text)
    print(augmented_text)
    print('='*200)

In [18]:
# Write the frame, including the new 'Augmented review text' column, back to
# f'{dataset_name}.csv' without the index. This overwrites the CSV the
# dataset was loaded from.
df.to_csv(f'{dataset_name}.csv', index=False)