# Installing

In [None]:
#!pip install t5

Collecting t5
  Downloading t5-0.9.4-py2.py3-none-any.whl (164 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m164.5/164.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting immutabledict (from t5)
  Downloading immutabledict-4.1.0-py3-none-any.whl (4.5 kB)
Collecting mesh-tensorflow[transformer]>=0.1.13 (from t5)
  Downloading mesh_tensorflow-0.1.21-py3-none-any.whl (385 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m385.2/385.2 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting rouge-score>=0.1.2 (from t5)
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sacrebleu (from t5)
  Downloading sacrebleu-2.4.0-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.3/106.3 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from t5)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64

In [None]:
#!pip install pyterrier

In [None]:
#!pip install python-terrier

# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import pyterrier as pt
# Initialise PyTerrier only once per kernel so re-running this cell is harmless.
# The terrier-prf package is requested at boot time; it is not referenced in the
# cells visible here — presumably it is needed by later retrieval code (confirm).
if not pt.started():
  pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

import torch
from transformers import T5ForConditionalGeneration,T5Tokenizer

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8



# Connect To Google Drive And Load Data

In [None]:
# Colab only: mount Google Drive at /content/drive so the project data
# under MyDrive can be read and written by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!ls '/content/drive/MyDrive/project_data'

scifact


In [None]:
!ls '/content/drive/MyDrive/project_data/scifact'


corpus.jsonl	    queries.jsonl  test.source_rl  train.source     train.target_rl  val.target
process_scifact.py  test.csv	   test.target	   train.source_rl  val.source	     val.target_rl
qrels		    test.source    train.csv	   train.target     val.source_rl


In [None]:
# Root folder on the mounted Drive that holds the datasets; later cells
# append '/scifact/...' to it when reading and writing files.
data_directory = '/content/drive/MyDrive/project_data'

# Load Train and Test Data

In [None]:
# Training split: tab-separated file, every column read as a string.
df_train = pd.read_csv(f'{data_directory}/scifact/train.csv', sep='\t', dtype=str)

# Keep just the query id and query text, then dump them without header or
# index so PyTerrier can load the file as single-line topics.
df_train2 = df_train.loc[:, ['qid', 'query']]
df_train2.to_csv('my_train_queries.csv', sep='\t', header=False, index=False)

# NOTE(review): the queries printed further down are lower-cased and have no
# punctuation, so read_topics presumably tokenises the text here — confirm.
train_query = pt.io.read_topics('my_train_queries.csv', format='singleline')
train_source = train_query['query']

# Rephrasing

In [None]:
# Seeding helper adapted from:
# https://github.com/ramsrigouthamg/Paraphrase-any-question-with-T5-Text-To-Text-Transfer-Transformer-
def set_seed(seed):
  """Seed PyTorch's random number generators.

  Args:
    seed: integer seed passed to torch.manual_seed and, when CUDA is
      available, to torch.cuda.manual_seed_all.
  """
  torch.manual_seed(seed)
  if not torch.cuda.is_available():
    return
  torch.cuda.manual_seed_all(seed)


# Fix the seed once, before any sampling-based generation is run.
set_seed(15)

In [None]:
# Paraphrasing model: the 'ramsrigouthamg/t5_paraphraser' T5 checkpoint,
# paired with the stock 't5-base' tokenizer.
model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_paraphraser')
tokenizer = T5Tokenizer.from_pretrained('t5-base')

# Use the GPU when one is available, otherwise fall back to the CPU, and
# move the model there so generation inputs can be placed on the same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print ("device ",device)
model = model.to(device)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


device  cuda


# Rephrasing - Train

In [None]:
# Generate sampled paraphrases for every training query.
# all_train_outputs[i] ends up holding the distinct paraphrases of query i
# (at most 5; fewer when samples repeat or merely echo the query).

# Token budget shared by the encoded prompt and the generated sequence.
max_len = 256

all_train_outputs = []
for i in range(len(train_source)):
  original_query = train_source.iloc[i]
  text =  "paraphrase: " + original_query + " </s>"

  # One sequence per call, so no padding is needed; the previous
  # pad_to_max_length=True (deprecated) padded every query to the tokenizer's
  # maximum length. Truncation keeps over-long prompts within max_len.
  encoding = tokenizer(text, truncation=True, max_length=max_len, return_tensors="pt")
  input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

  # Top-k / nucleus sampling: top_k = 120, top_p = 0.98, 5 candidates per query.
  # Results depend on the seed set earlier in the notebook.
  beam_outputs = model.generate(
      input_ids=input_ids, attention_mask=attention_masks,
      do_sample=True,
      max_length=max_len,
      top_k=120,
      top_p=0.98,
      early_stopping=True,
      num_return_sequences=5
  )

  # Keep only candidates that differ from the query (case-insensitively)
  # and have not been produced already for this query.
  final_outputs =[]
  for beam_output in beam_outputs:
      sent = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
      if sent.lower() != original_query.lower() and sent not in final_outputs:
          final_outputs.append(sent)
  all_train_outputs.append(final_outputs)

  # Show the paraphrases of the first query as a sanity check.
  if i == 0:
    print ("\nOriginal Question ::")
    print (original_query)
    print ("\n")
    print ("Paraphrased Questions :: ")
    # Use a separate counter: reusing `i` here would clobber the query index
    # that the progress check below relies on.
    for rank, final_output in enumerate(final_outputs):
        print("{}: {}".format(rank, final_output))

  # Progress report every 50 queries.
  if i % 50 == 0:
    print(i,  " out of ", len(train_source))




Original Question ::
0 dimensional biomaterials lack inductive properties


Paraphrased Questions :: 
0: 0 dimensional biomaterials lack inductive properties.
1: 0 dimensional biomaterials lack inductive properties. What is the probability that they do not have inductive properties?
2: 0-dimensional biomaterials lack inductive properties. Inductive properties that are interesting to a scientist are the same as one-dimensional biomaterials.
3: A physical description of a nanowafer based on a 0-dimensional biomaterial does not appear to have inductive properties.
50  out of  919
100  out of  919
150  out of  919
200  out of  919
250  out of  919
300  out of  919
350  out of  919
400  out of  919
450  out of  919
500  out of  919
550  out of  919
600  out of  919
650  out of  919
700  out of  919
750  out of  919
800  out of  919
850  out of  919
900  out of  919


In [None]:
def _pick_paraphrase(paraphrases, rank, fallback):
  """Choose the paraphrase to use for one query at a given rank.

  Args:
    paraphrases: list of distinct paraphrases generated for the query.
    rank: zero-based position wanted (0 = first paraphrase).
    fallback: text to use when the query has no paraphrase at all.

  Returns:
    paraphrases[rank] when it exists, otherwise the first paraphrase,
    otherwise `fallback`.
  """
  if rank < len(paraphrases):
    return paraphrases[rank]
  if paraphrases:
    return paraphrases[0]
  # No distinct paraphrase survived filtering; indexing [0] here would raise
  # IndexError, so reuse the original query text instead.
  return fallback


def _paraphrases_at_rank(rank):
  """Return one paraphrase per training query for the given rank."""
  return [
      _pick_paraphrase(paraphrases, rank, original)
      for paraphrases, original in zip(all_train_outputs, train_source)
  ]


# One list per paraphrase rank, each aligned with the training queries.
train_rewritten_text1 = _paraphrases_at_rank(0)
train_rewritten_text2 = _paraphrases_at_rank(1)
train_rewritten_text3 = _paraphrases_at_rank(2)
train_rewritten_text4 = _paraphrases_at_rank(3)
train_rewritten_text5 = _paraphrases_at_rank(4)

# Number of queries for which fewer than two distinct paraphrases were generated.
counter = sum(1 for paraphrases in all_train_outputs if len(paraphrases) < 2)

print(counter)

1


# Manipulate rephrased queries - Train

In [None]:
def _query_frame(texts):
  """Build a (qid, query) frame pairing the training qids with `texts`."""
  frame = pd.DataFrame()
  frame['qid'] = train_query['qid']
  frame['query'] = texts
  return frame


# Original queries, followed by one frame per paraphrase rank.
train_new_queries = _query_frame(train_source)

train_new_queries1 = _query_frame(train_rewritten_text1)
train_new_queries2 = _query_frame(train_rewritten_text2)
train_new_queries3 = _query_frame(train_rewritten_text3)
train_new_queries4 = _query_frame(train_rewritten_text4)
train_new_queries5 = _query_frame(train_rewritten_text5)

In [None]:
# Characters deleted from every query text.
# NOTE(review): presumably these are removed because the retrieval query
# parser cannot handle them — confirm.
_QUERY_CHARS_TO_DROP = "%?'():/!*"


def _strip_query_chars(queries):
  """Return `queries` with every character of _QUERY_CHARS_TO_DROP removed.

  Args:
    queries: pandas Series of query strings.

  Returns:
    A new Series; the input is not modified.
  """
  cleaned = queries
  for char in _QUERY_CHARS_TO_DROP:
    # '?', '(', ')' and '*' are regex metacharacters, so request a literal
    # match explicitly instead of relying on the pandas default for `regex`.
    cleaned = cleaned.str.replace(char, '', regex=False)
  return cleaned


# Apply the same clean-up to the original queries and to all five paraphrase sets.
for _frame in (train_new_queries, train_new_queries1, train_new_queries2,
               train_new_queries3, train_new_queries4, train_new_queries5):
  _frame['query'] = _strip_query_chars(_frame['query'])

  train_new_queries['query'] = train_new_queries['query'].str.replace('?','')
  train_new_queries['query'] = train_new_queries['query'].str.replace('(','')
  train_new_queries['query'] = train_new_queries['query'].str.replace(')','')
  train_new_queries['query'] = train_new_queries['query'].str.replace('*','')
  train_new_queries1['query'] = train_new_queries1['query'].str.replace('?','')
  train_new_queries1['query'] = train_new_queries1['query'].str.replace('(','')
  train_new_queries1['query'] = train_new_queries1['query'].str.replace(')','')
  train_new_queries1['query'] = train_new_queries1['query'].str.replace('*','')
  train_new_queries2['query'] = train_new_queries2['query'].str.replace('?','')
  train_new_queries2['query'] = train_new_queries2['query'].str.replace('(','')
  train_new_queries2['query'] = train_new_queries2['query'].str.replace(')','')
  train_new_queries2['query'] = train_new_queries2['query'].str.replace('*','')
  train_new_queries3['query'] = train_new_querie

In [None]:
# Expanded query: the original text followed by its five paraphrases,
# joined with single spaces in rank order.
expanded_query = train_new_queries['query']
for paraphrase_frame in (train_new_queries1, train_new_queries2, train_new_queries3,
                         train_new_queries4, train_new_queries5):
  expanded_query = expanded_query + " " + paraphrase_frame['query']

train_new_queries6 = pd.DataFrame()
train_new_queries6['qid'] = train_query['qid']
train_new_queries6['query'] = expanded_query

# Save data files - Train

In [None]:
import os

# Output folder for the rephrased queries. Create it if it is missing:
# to_csv does not create parent directories and would fail on a fresh Drive.
rephrase_directory = data_directory + '/scifact/rephrase'
os.makedirs(rephrase_directory, exist_ok=True)

# Write my_train_query1.csv ... my_train_query6.csv (tab-separated, with the
# index column and header row kept, as the read-back cell uses index_col=0).
frames_to_save = [train_new_queries1, train_new_queries2, train_new_queries3,
                  train_new_queries4, train_new_queries5, train_new_queries6]
for file_number, frame in enumerate(frames_to_save, start=1):
  frame.to_csv(rephrase_directory + '/my_train_query{}.csv'.format(file_number), sep='\t', index = True, header = True)

In [None]:
# Sanity check: read the combined-query file back (first column is the saved
# index, all values as strings) and display it.
ntq2 = pd.read_csv(data_directory + '/scifact/rephrase/my_train_query6.csv', sep='\t', index_col=0, dtype=str)
ntq2

Unnamed: 0,qid,query
0,0,0 dimensional biomaterials lack inductive prop...
1,2,1 in 5 million in uk have abnormal prp positiv...
2,4,1 1 of colorectal cancer patients are diagnose...
3,6,10 of sudden infant death syndrome sids deaths...
4,9,32 of liver transplantation programs required ...
...,...,...
914,1404,sirna knockdown of a20 slows tumor progression...
915,1405,taa1 tar1 tar2 triple mutants in arabidopsis d...
916,1406,sheet opening occurs during pleurotolysin pore...
917,1407,1 ketel is able to bind microtubules If I have...
