In [1]:
#https://huggingface.co/docs/transformers/model_doc/t5
!pip install t5



In [2]:
import numpy as np
import pandas as pd

In [3]:
#documents
docs = pd.read_csv('./toy_data/docs.csv', dtype=str)

#queries
queries = pd.read_csv('./toy_data/queries.csv', dtype=str)

#qrels
qrels = pd.read_csv('./toy_data/qrels.csv', dtype=str)
qrels = qrels.astype({'label': 'int32'})


#prints
print(docs.shape)
print(docs.head())

print(queries.shape)
print(queries.head())

print(qrels.shape)
print(qrels.head())

(2453, 2)
     docno                                               text
0   935016  he emigrated to france with his family in 1956...
1  2360440  after being ambushed by the germans in novembe...
2   347765  she was the second ship named for captain alex...
3  1969335  world war ii was a global war that was under w...
4  1576938  the ship was ordered on 2 april 1942 laid down...
(9, 2)
       qid                 query
0  1015979    president of chile
1     2674    computer animation
2   340095  2020 summer olympics
3  1502917         train station
4     2574       chinese cuisine
(2454, 4)
       qid    docno  label iteration
0  1015979  1015979      2         0
1  1015979  2226456      1         0
2  1015979  1514612      1         0
3  1015979  1119171      1         0
4  1015979  1053174      1         0


# T5 Implementation for Query Rewriting

In [4]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Below can be used to summarize documents and use summarization as queries to improve training.

In [5]:
#tokenizer = T5Tokenizer.from_pretrained("t5-base")
#model = T5ForConditionalGeneration.from_pretrained("t5-base", return_dict=True)

In [6]:
#task_prefix = "summarize: "
# use different length sentences to test batching
#sentences = ["The house is wonderful.", "I like to work in NYC."]
#sentences = []
#for i in range(100):
#  sentences.append(docs['text'].loc[i])

#inputs = tokenizer([task_prefix + sentence for sentence in sentences], return_tensors="pt", padding=True)

In [7]:
#output_sequences = model.generate(
#     input_ids=inputs["input_ids"],
#     attention_mask=inputs["attention_mask"],
#     do_sample=False,  # disable sampling to test if batching affects output
# )

#print(tokenizer.batch_decode(output_sequences, skip_special_tokens=True))

# Rewrite Queries - google/flan-t5-base

In [8]:
#https://codepal.ai/code-generator/query/DpMCp564/ai-content-rewriter
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
rewritten_texts = []
for i in range(len(queries)):
  # Tokenize the input text
  inputs = tokenizer.encode(queries['query'].iloc[i], return_tensors="pt")

  # Generate the rewritten text
  outputs = model.generate(inputs, max_length=512, num_beams=4, early_stopping=True)
  rewritten_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
  rewritten_texts.append(rewritten_text)

In [10]:
queries['query']

0      president of chile
1      computer animation
2    2020 summer olympics
3           train station
4         chinese cuisine
5            world war ii
6                painting
7                   house
8         mexican cuisine
Name: query, dtype: object

In [11]:
rewritten_texts

['president of chile',
 'computer animation',
 '2020 summer olympics',
 'train station',
 'chinese cuisine',
 'World War II',
 'painting of a woman',
 'house for sale house for sale',
 'mexican cuisine']

In [12]:
queries['rewritten_queries'] = rewritten_texts

In [13]:
queries

Unnamed: 0,qid,query,rewritten_queries
0,1015979,president of chile,president of chile
1,2674,computer animation,computer animation
2,340095,2020 summer olympics,2020 summer olympics
3,1502917,train station,train station
4,2574,chinese cuisine,chinese cuisine
5,14082,world war ii,World War II
6,1250390,painting,painting of a woman
7,5597,house,house for sale house for sale
8,8438,mexican cuisine,mexican cuisine


In [14]:
queries['query'] = queries['query'] + ' ' + rewritten_texts

In [15]:
queries

Unnamed: 0,qid,query,rewritten_queries
0,1015979,president of chile president of chile,president of chile
1,2674,computer animation computer animation,computer animation
2,340095,2020 summer olympics 2020 summer olympics,2020 summer olympics
3,1502917,train station train station,train station
4,2574,chinese cuisine chinese cuisine,chinese cuisine
5,14082,world war ii World War II,World War II
6,1250390,painting painting of a woman,painting of a woman
7,5597,house house for sale house for sale,house for sale house for sale
8,8438,mexican cuisine mexican cuisine,mexican cuisine


Evaluation

In [16]:
!pip install pyterrier

Collecting pyterrier
  Downloading pyterrier-0.1.5-py2.py3-none-any.whl (22 kB)
Installing collected packages: pyterrier
Successfully installed pyterrier-0.1.5


In [17]:
!pip install python-terrier

Collecting python-terrier
  Downloading python-terrier-0.10.0.tar.gz (107 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.6/107.6 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting wget (from python-terrier)
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyjnius>=1.4.2 (from python-terrier)
  Downloading pyjnius-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting matchpy (from python-terrier)
  Downloading matchpy-0.5.5-py3-none-any.whl (69 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.6/69.6 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting deprecated (from python-terrier)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Collecting chest (from pyth

In [18]:
import pyterrier as pt
if not pt.started():
  pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

terrier-assemblies 5.8 jar-with-dependencies not found, downloading to /root/.pyterrier...
Done
terrier-python-helper 0.0.8 jar not found, downloading to /root/.pyterrier...
Done
terrier-prf -SNAPSHOT jar not found, downloading to /root/.pyterrier...
Done


PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8



In [19]:
indexer = pt.DFIndexer("./indexes/default", overwrite=True, blocks=True)
index_ref = indexer.index(docs["text"], docs["docno"])
index = pt.IndexFactory.of(index_ref)

In [20]:
tf = pt.BatchRetrieve(index, wmodel="Tf")
tf_idf = pt.BatchRetrieve(index, wmodel="TF_IDF")
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

In [21]:
qrels = qrels.astype({'label': 'int32'})
final_res = pt.Experiment(
                        retr_systems = [tf, tf_idf, bm25],
                        names =  ["TF", "TF-IDF", "BM25"],
                        topics = queries,
                        qrels = qrels,
                        eval_metrics = ["map", "ndcg", "ndcg_cut_10", "P_10", "recall_10"])

In [22]:
final_res

Unnamed: 0,name,map,ndcg,ndcg_cut_10,P_10,recall_10
0,TF,0.618924,0.793804,0.859541,0.811111,0.408212
1,TF-IDF,0.634148,0.794164,0.824113,0.766667,0.386933
2,BM25,0.638495,0.79642,0.819127,0.766667,0.386933


In [23]:
new_queries = pd.DataFrame()
new_queries['qid'] = queries['qid']
new_queries['query'] = rewritten_texts
new_queries

Unnamed: 0,qid,query
0,1015979,president of chile
1,2674,computer animation
2,340095,2020 summer olympics
3,1502917,train station
4,2574,chinese cuisine
5,14082,World War II
6,1250390,painting of a woman
7,5597,house for sale house for sale
8,8438,mexican cuisine


In [24]:
qrels = qrels.astype({'label': 'int32'})
final_res = pt.Experiment(
                        retr_systems = [tf, tf_idf, bm25],
                        names =  ["TF", "TF-IDF", "BM25"],
                        topics = new_queries,
                        qrels = qrels,
                        eval_metrics = ["map", "ndcg", "ndcg_cut_10", "P_10", "recall_10"])

In [25]:
final_res

Unnamed: 0,name,map,ndcg,ndcg_cut_10,P_10,recall_10
0,TF,0.625442,0.798296,0.864339,0.811111,0.408212
1,TF-IDF,0.607258,0.783524,0.792257,0.733333,0.354813
2,BM25,0.612774,0.785749,0.787561,0.722222,0.354438


# Rewrite Queries - castorini/t5-base-canard

In [26]:
#https://huggingface.co/castorini/t5-base-canard
model = T5ForConditionalGeneration.from_pretrained("castorini/t5-base-canard")
tokenizer = T5Tokenizer.from_pretrained("castorini/t5-base-canard")

config.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

In [27]:
rewritten_texts = []
for i in range(len(queries)):
  # Tokenize the input text
  inputs = tokenizer.encode(queries['query'].iloc[i], return_tensors="pt")

  # Generate the rewritten text
  outputs = model.generate(inputs, max_length=512, num_beams=4, early_stopping=True)
  rewritten_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
  rewritten_texts.append(rewritten_text)

In [28]:
queries

Unnamed: 0,qid,query,rewritten_queries
0,1015979,president of chile president of chile,president of chile
1,2674,computer animation computer animation,computer animation
2,340095,2020 summer olympics 2020 summer olympics,2020 summer olympics
3,1502917,train station train station,train station
4,2574,chinese cuisine chinese cuisine,chinese cuisine
5,14082,world war ii World War II,World War II
6,1250390,painting painting of a woman,painting of a woman
7,5597,house house for sale house for sale,house for sale house for sale
8,8438,mexican cuisine mexican cuisine,mexican cuisine


In [29]:
rewritten_texts

['president of chile president of chile',
 'computer animation computer animation computer animation computer animation',
 '2020 summer olympics 2020 summer olympics 2020 summer olympics',
 'train station train station train station train station',
 'chinese cuisine chinese cuisine',
 'World War II ended in World War II',
 'painting painting painting of a woman.',
 'house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for sale house for',
 'mexican cuisine mexican cuisine mexican cuisine']