In [1]:
#!pip install --upgrade transformers==2.9.0

In [2]:
#!pip install --upgrade pytorch_lightning==0.7.5

In [3]:
#!pip install sentencepiece

In [4]:
#!pip install t5

In [None]:
#!pip install pyterrier

In [None]:
#!pip install python-terrier

# Import Libraries

In [7]:
import numpy as np
import pandas as pd
import pyterrier as pt
if not pt.started():
  pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

import torch
from transformers import T5ForConditionalGeneration,T5Tokenizer

terrier-assemblies 5.8 jar-with-dependencies not found, downloading to /root/.pyterrier...
Done
terrier-python-helper 0.0.8 jar not found, downloading to /root/.pyterrier...
Done
terrier-prf -SNAPSHOT jar not found, downloading to /root/.pyterrier...
Done


PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8



## Connect To Google Drive And Load Data

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
!ls '/content/drive/MyDrive/project_data'

scifact


In [10]:
!ls '/content/drive/MyDrive/project_data/scifact'

corpus.jsonl	    queries.jsonl  test.source_rl  train.source     train.target_rl  val.target
process_scifact.py  test.csv	   test.target	   train.source_rl  val.source	     val.target_rl
qrels		    test.source    train.csv	   train.target     val.source_rl


In [11]:
data_directory = '/content/drive/MyDrive/project_data'

# Load Meta-Data

In [12]:
corpus = pd.read_json(data_directory + '/scifact/corpus.jsonl', lines=True, dtype=str)

corpus

Unnamed: 0,_id,title,text,metadata
0,4983,Microstructural development of human newborn c...,Alterations of the architecture of cerebral wh...,{}
1,5836,Induction of myelodysplasia by myeloid-derived...,Myelodysplastic syndromes (MDS) are age-depend...,{}
2,7912,"BC1 RNA, the transcript from a master gene for...",ID elements are short interspersed elements (S...,{}
3,18670,The DNA Methylome of Human Peripheral Blood Mo...,DNA methylation plays an important role in bio...,{}
4,19238,The human myelin basic protein gene is include...,Two human Golli (for gene expressed in the oli...,{}
...,...,...,...,...
5178,195689316,Body-mass index and cause-specific mortality i...,BACKGROUND The main associations of body-mass ...,{}
5179,195689757,Targeting metabolic remodeling in glioblastoma...,A key aberrant biological difference between t...,{}
5180,196664003,Signaling architectures that transmit unidirec...,A signaling pathway transmits information from...,{}
5181,198133135,"Association between pre-diabetes, type 2 diabe...",AIMS Trabecular bone score (TBS) is a surrogat...,{}


# Load Test data and qrels from .csv files

In [13]:
#test data
df_test = pd.read_csv(data_directory + '/scifact/test.csv', sep='\t', dtype=str)
print(df_test.shape)
print(df_test['query'].apply(len).mean())
print(df_test['text'].apply(len).mean())
df_test

(339, 3)
91.43952802359883
1520.1592920353983


Unnamed: 0,qid,query,text
0,1,0-dimensional biomaterials show inductive prop...,Nanotechnologies are emerging platforms that c...
1,3,"1,000 genomes project enables mapping of genet...",Genome-wide association studies (GWAS) have no...
2,5,1/2000 in UK have abnormal PrP positivity.,OBJECTIVES To carry out a further survey of ar...
3,13,5% of perinatal mortality is due to low birth ...,CONTEXT One key target of the United Nations M...
4,36,A deficiency of vitamin B12 increases blood le...,BACKGROUND Homocysteine is a risk factor for c...
...,...,...,...
334,1379,Women with a higher birth weight are more like...,"INTRODUCTION Various perinatal factors, includ..."
335,1382,aPKCz causes tumour enhancement by affecting g...,Tumor cells have high-energetic and anabolic n...
336,1385,cSMAC formation enhances weak ligand signalling.,T cell activation is predicated on the interac...
337,1389,mTORC2 regulates intracellular cysteine levels...,Mutations in cancer reprogram amino acid metab...


In [14]:
#test_data
df_test = pd.read_csv(data_directory + '/scifact/test.csv', sep='\t', dtype=str)
df_test2 = df_test[['qid', 'query']]
df_test2.to_csv('my_test_queries.csv', sep = '\t', index=False, header=False)
test_query = pt.io.read_topics('my_test_queries.csv', format='singleline')
test_query

Unnamed: 0,qid,query
0,1,0 dimensional biomaterials show inductive prop...
1,3,1 000 genomes project enables mapping of genet...
2,5,1 2000 in uk have abnormal prp positivity
3,13,5 of perinatal mortality is due to low birth w...
4,36,a deficiency of vitamin b12 increases blood le...
...,...,...
334,1379,women with a higher birth weight are more like...
335,1382,apkcz causes tumour enhancement by affecting g...
336,1385,csmac formation enhances weak ligand signalling
337,1389,mtorc2 regulates intracellular cysteine levels...


In [15]:
#test qrels
df_test_qrels = pd.read_csv(data_directory + '/scifact/qrels/test.tsv', sep='\t', dtype=str)

df_test_qrels

Unnamed: 0,query-id,corpus-id,score
0,1,31715818,1
1,3,14717500,1
2,5,13734012,1
3,13,1606628,1
4,36,5152028,1
...,...,...,...
334,1379,17450673,1
335,1382,17755060,1
336,1385,306006,1
337,1389,23895668,1


In [16]:
#source data
test_source = test_query['query']
test_source

#target data
test_target = df_test['text']

test_target

0      Nanotechnologies are emerging platforms that c...
1      Genome-wide association studies (GWAS) have no...
2      OBJECTIVES To carry out a further survey of ar...
3      CONTEXT One key target of the United Nations M...
4      BACKGROUND Homocysteine is a risk factor for c...
                             ...                        
334    INTRODUCTION Various perinatal factors, includ...
335    Tumor cells have high-energetic and anabolic n...
336    T cell activation is predicated on the interac...
337    Mutations in cancer reprogram amino acid metab...
338    Monitoring cancer and aging in vivo remains ex...
Name: text, Length: 339, dtype: object

# T5 - Query Rephrasing

In [17]:
#INSPIRED BY!
#https://github.com/ramsrigouthamg/Paraphrase-any-question-with-T5-Text-To-Text-Transfer-Transformer-
def set_seed(seed):
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

set_seed(15)


In [18]:
model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_paraphraser')
tokenizer = T5Tokenizer.from_pretrained('t5-base')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print ("device ",device)
model = model.to(device)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


device  cuda


In [19]:
all_outputs = []
for i in range(len(test_source)):
  text =  "paraphrase: " + test_source.iloc[i] + " </s>"


  max_len = 256

  encoding = tokenizer.encode_plus(text,pad_to_max_length=True, return_tensors="pt")
  input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)


  # set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
  beam_outputs = model.generate(
      input_ids=input_ids, attention_mask=attention_masks,
      do_sample=True,
      max_length=256,
      top_k=120,
      top_p=0.98,
      early_stopping=True,
      num_return_sequences=5
  )

  final_outputs =[]
  for beam_output in beam_outputs:
      sent = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
      if sent.lower() != test_source.iloc[i].lower() and sent not in final_outputs:
          final_outputs.append(sent)
  all_outputs.append(final_outputs)

  if i == 0:
    print ("\nOriginal Question ::")
    print (test_source.iloc[i])
    print ("\n")
    print ("Paraphrased Questions :: ")
    for i, final_output in enumerate(final_outputs):
        print("{}: {}".format(i, final_output))

  if i % 10 == 0:
    print(i,  " out of ", len(test_source))




Original Question ::
0 dimensional biomaterials show inductive properties


Paraphrased Questions :: 
0: 0 dimensional biomaterials show inductive properties, having inductive elasticity and inductive induction properties.
1: 0 dimensional biomaterials show inductive properties, avoiding loss of thermal force, induction properties, and reducing heat resistance.
2: 0 dimensional biomaterials have inductive properties.
3: What do 0 dimensional biomaterials show inductive properties?
4: Inductive properties of nanomaterials: 0 dimensional biomaterials have inductive properties.
10  out of  339
20  out of  339
30  out of  339
40  out of  339
50  out of  339
60  out of  339
70  out of  339
80  out of  339
90  out of  339
100  out of  339
110  out of  339
120  out of  339
130  out of  339
140  out of  339
150  out of  339
160  out of  339
170  out of  339
180  out of  339
190  out of  339
200  out of  339
210  out of  339
220  out of  339
230  out of  339
240  out of  339
250  out of  339
2

In [20]:
rewritten_text1 = []
rewritten_text2 = []
rewritten_text3 = []
rewritten_text4 = []
rewritten_text5 = []

counter = 0

for i in range(len(all_outputs)):
  rewritten_text1.append(all_outputs[i][0])

for i in range(len(all_outputs)):
  if len(all_outputs[i]) < 2:
    counter += 1
    rewritten_text2.append(all_outputs[i][0])
  else:
    rewritten_text2.append(all_outputs[i][1])

for i in range(len(all_outputs)):
  if len(all_outputs[i]) < 3:
    rewritten_text3.append(all_outputs[i][0])
  else:
    rewritten_text3.append(all_outputs[i][2])

for i in range(len(all_outputs)):
  if len(all_outputs[i]) < 4:
    rewritten_text4.append(all_outputs[i][0])
  else:
    rewritten_text4.append(all_outputs[i][3])

for i in range(len(all_outputs)):
  if len(all_outputs[i]) < 5:
    rewritten_text5.append(all_outputs[i][0])
  else:
    rewritten_text5.append(all_outputs[i][4])

print(counter)

1


# Evaluation using BM25

In [21]:
corpus = corpus.rename(columns={'_id': 'docno'})
corpus

Unnamed: 0,docno,title,text,metadata
0,4983,Microstructural development of human newborn c...,Alterations of the architecture of cerebral wh...,{}
1,5836,Induction of myelodysplasia by myeloid-derived...,Myelodysplastic syndromes (MDS) are age-depend...,{}
2,7912,"BC1 RNA, the transcript from a master gene for...",ID elements are short interspersed elements (S...,{}
3,18670,The DNA Methylome of Human Peripheral Blood Mo...,DNA methylation plays an important role in bio...,{}
4,19238,The human myelin basic protein gene is include...,Two human Golli (for gene expressed in the oli...,{}
...,...,...,...,...
5178,195689316,Body-mass index and cause-specific mortality i...,BACKGROUND The main associations of body-mass ...,{}
5179,195689757,Targeting metabolic remodeling in glioblastoma...,A key aberrant biological difference between t...,{}
5180,196664003,Signaling architectures that transmit unidirec...,A signaling pathway transmits information from...,{}
5181,198133135,"Association between pre-diabetes, type 2 diabe...",AIMS Trabecular bone score (TBS) is a surrogat...,{}


In [22]:
indexer = pt.DFIndexer("./indexes_scifact/both", overwrite=True, blocks=True,verbose=True, stemmer='porter', stopwords='terrier', tokenizer = 'english')
index_ref = indexer.index(corpus["text"], corpus["docno"])
index = pt.IndexFactory.of(index_ref)

  0%|          | 0/5183 [00:00<?, ?documents/s]

In [23]:
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

In [50]:
new_queries = pd.DataFrame()
new_queries['qid'] = test_query['qid']
new_queries['query'] = test_source

new_queries1 = pd.DataFrame()
new_queries1['qid'] = test_query['qid']
new_queries1['query'] = rewritten_text1

new_queries2 = pd.DataFrame()
new_queries2['qid'] = test_query['qid']
new_queries2['query'] = rewritten_text2

new_queries3 = pd.DataFrame()
new_queries3['qid'] = test_query['qid']
new_queries3['query'] = rewritten_text3

new_queries4 = pd.DataFrame()
new_queries4['qid'] = test_query['qid']
new_queries4['query'] = rewritten_text4

new_queries5 = pd.DataFrame()
new_queries5['qid'] = test_query['qid']
new_queries5['query'] = rewritten_text5

In [51]:
new_queries['query'] = new_queries['query'].str.replace('%','')
new_queries['query'] = new_queries['query'].str.replace('?','')
new_queries['query'] = new_queries['query'].str.replace('\'','')
new_queries['query'] = new_queries['query'].str.replace('(','')
new_queries['query'] = new_queries['query'].str.replace(')','')
new_queries['query'] = new_queries['query'].str.replace(':','')
new_queries['query'] = new_queries['query'].str.replace('/','')
new_queries['query'] = new_queries['query'].str.replace('!','')
new_queries['query'] = new_queries['query'].str.replace('*','')

new_queries1['query'] = new_queries1['query'].str.replace('%','')
new_queries1['query'] = new_queries1['query'].str.replace('?','')
new_queries1['query'] = new_queries1['query'].str.replace('\'','')
new_queries1['query'] = new_queries1['query'].str.replace('(','')
new_queries1['query'] = new_queries1['query'].str.replace(')','')
new_queries1['query'] = new_queries1['query'].str.replace(':','')
new_queries1['query'] = new_queries1['query'].str.replace('/','')
new_queries1['query'] = new_queries1['query'].str.replace('!','')
new_queries1['query'] = new_queries1['query'].str.replace('*','')

new_queries2['query'] = new_queries2['query'].str.replace('%','')
new_queries2['query'] = new_queries2['query'].str.replace('?','')
new_queries2['query'] = new_queries2['query'].str.replace('!','')
new_queries2['query'] = new_queries2['query'].str.replace('\'','')
new_queries2['query'] = new_queries2['query'].str.replace('(','')
new_queries2['query'] = new_queries2['query'].str.replace(')','')
new_queries2['query'] = new_queries2['query'].str.replace(':','')
new_queries2['query'] = new_queries2['query'].str.replace('/','')
new_queries2['query'] = new_queries2['query'].str.replace('*','')

new_queries3['query'] = new_queries3['query'].str.replace('%','')
new_queries3['query'] = new_queries3['query'].str.replace('?','')
new_queries3['query'] = new_queries3['query'].str.replace('!','')
new_queries3['query'] = new_queries3['query'].str.replace('\'','')
new_queries3['query'] = new_queries3['query'].str.replace('(','')
new_queries3['query'] = new_queries3['query'].str.replace(')','')
new_queries3['query'] = new_queries3['query'].str.replace(':','')
new_queries3['query'] = new_queries3['query'].str.replace('/','')
new_queries3['query'] = new_queries3['query'].str.replace('*','')

new_queries4['query'] = new_queries4['query'].str.replace('%','')
new_queries4['query'] = new_queries4['query'].str.replace('?','')
new_queries4['query'] = new_queries4['query'].str.replace('\'','')
new_queries4['query'] = new_queries4['query'].str.replace('(','')
new_queries4['query'] = new_queries4['query'].str.replace(')','')
new_queries4['query'] = new_queries4['query'].str.replace(':','')
new_queries4['query'] = new_queries4['query'].str.replace('/','')
new_queries4['query'] = new_queries4['query'].str.replace('!','')
new_queries4['query'] = new_queries4['query'].str.replace('*','')

new_queries5['query'] = new_queries5['query'].str.replace('%','')
new_queries5['query'] = new_queries5['query'].str.replace('?','')
new_queries5['query'] = new_queries5['query'].str.replace('\'','')
new_queries5['query'] = new_queries5['query'].str.replace('(','')
new_queries5['query'] = new_queries5['query'].str.replace(')','')
new_queries5['query'] = new_queries5['query'].str.replace(':','')
new_queries5['query'] = new_queries5['query'].str.replace('/','')
new_queries5['query'] = new_queries5['query'].str.replace('!','')
new_queries5['query'] = new_queries5['query'].str.replace('*','')

  new_queries['query'] = new_queries['query'].str.replace('?','')
  new_queries['query'] = new_queries['query'].str.replace('(','')
  new_queries['query'] = new_queries['query'].str.replace(')','')
  new_queries['query'] = new_queries['query'].str.replace('*','')
  new_queries1['query'] = new_queries1['query'].str.replace('?','')
  new_queries1['query'] = new_queries1['query'].str.replace('(','')
  new_queries1['query'] = new_queries1['query'].str.replace(')','')
  new_queries1['query'] = new_queries1['query'].str.replace('*','')
  new_queries2['query'] = new_queries2['query'].str.replace('?','')
  new_queries2['query'] = new_queries2['query'].str.replace('(','')
  new_queries2['query'] = new_queries2['query'].str.replace(')','')
  new_queries2['query'] = new_queries2['query'].str.replace('*','')
  new_queries3['query'] = new_queries3['query'].str.replace('?','')
  new_queries3['query'] = new_queries3['query'].str.replace('(','')
  new_queries3['query'] = new_queries3['query'].str.repl

In [47]:
new_queries6 = pd.DataFrame()
new_queries6['qid'] = test_query['qid']
new_queries6['query'] = new_queries['query'] + " " + new_queries1['query'] + " " + new_queries2['query'] + " " + new_queries3['query'] + " " + new_queries4['query'] + " " + new_queries5['query']

In [48]:
test_qrels = df_test_qrels.rename(columns={"query-id": "qid", "corpus-id" : "docno", "score": "label"})
test_qrels['iteration'] = 0
qrels = test_qrels.astype({'label': 'int32'})
qrels

Unnamed: 0,qid,docno,label,iteration
0,1,31715818,1,0
1,3,14717500,1,0
2,5,13734012,1,0
3,13,1606628,1,0
4,36,5152028,1,0
...,...,...,...,...
334,1379,17450673,1,0
335,1382,17755060,1,0
336,1385,306006,1,0
337,1389,23895668,1,0


In [52]:
final_res_ndcg = pt.Experiment(
                        retr_systems = [bm25],
                        names =  ["BM25"],
                        topics = new_queries,
                        qrels = qrels,
                        eval_metrics = ["ndcg_cut_10", "ndcg_cut_100", "ndcg_cut_1000"])

final_res_map = pt.Experiment(
                        retr_systems = [bm25],
                        names =  ["BM25"],
                        topics = new_queries,
                        qrels = qrels,
                        eval_metrics = ["map_cut_10", "map_cut_100", "map_cut_1000"])

final_res_precision = pt.Experiment(
                        retr_systems = [bm25],
                        names =  ["BM25"],
                        topics = new_queries,
                        qrels = qrels,
                        eval_metrics = ["P_10", "P_100", "P_1000"])

final_res_recall = pt.Experiment(
                        retr_systems = [bm25],
                        names =  ["BM25"],
                        topics = new_queries,
                        qrels = qrels,
                        eval_metrics = ["recall_10", "recall_100", "recall_1000"])

print(final_res_ndcg)
print(final_res_map)
print(final_res_precision)
print(final_res_recall)

   name  ndcg_cut_10  ndcg_cut_100  ndcg_cut_1000
0  BM25     0.670751      0.694243       0.700678
   name  map_cut_10  map_cut_100  map_cut_1000
0  BM25    0.618102     0.623728      0.624016
   name      P_10     P_100  P_1000
0  BM25  0.090667  0.010433  0.0011
   name  recall_10  recall_100  recall_1000
0  BM25   0.817111    0.921556         0.97


In [53]:
final_res_ndcg1 = pt.Experiment(
                        retr_systems = [bm25],
                        names =  ["BM25"],
                        topics = new_queries1,
                        qrels = qrels,
                        eval_metrics = ["ndcg_cut_10", "ndcg_cut_100", "ndcg_cut_1000"])

final_res_map1 = pt.Experiment(
                        retr_systems = [bm25],
                        names =  ["BM25"],
                        topics = new_queries1,
                        qrels = qrels,
                        eval_metrics = ["map_cut_10", "map_cut_100", "map_cut_1000"])

final_res_precision1 = pt.Experiment(
                        retr_systems = [bm25],
                        names =  ["BM25"],
                        topics = new_queries1,
                        qrels = qrels,
                        eval_metrics = ["P_10", "P_100", "P_1000"])

final_res_recall1 = pt.Experiment(
                        retr_systems = [bm25],
                        names =  ["BM25"],
                        topics = new_queries1,
                        qrels = qrels,
                        eval_metrics = ["recall_10", "recall_100", "recall_1000"])

print(final_res_ndcg1)
print(final_res_map1)
print(final_res_precision1)
print(final_res_recall1)

   name  ndcg_cut_10  ndcg_cut_100  ndcg_cut_1000
0  BM25     0.591551       0.62278        0.63343
   name  map_cut_10  map_cut_100  map_cut_1000
0  BM25    0.542715     0.549689      0.550099
   name      P_10     P_100    P_1000
0  BM25  0.080667  0.009833  0.001077
   name  recall_10  recall_100  recall_1000
0  BM25   0.723222    0.863222     0.946667


In [29]:
final_res_ndcg2 = pt.Experiment(
                        retr_systems = [bm25],
                        names =  ["BM25"],
                        topics = new_queries2,
                        qrels = qrels,
                        eval_metrics = ["ndcg_cut_10", "ndcg_cut_100", "ndcg_cut_1000"])

final_res_map2 = pt.Experiment(
                        retr_systems = [bm25],
                        names =  ["BM25"],
                        topics = new_queries2,
                        qrels = qrels,
                        eval_metrics = ["map_cut_10", "map_cut_100", "map_cut_1000"])

final_res_precision2 = pt.Experiment(
                        retr_systems = [bm25],
                        names =  ["BM25"],
                        topics = new_queries2,
                        qrels = qrels,
                        eval_metrics = ["P_10", "P_100", "P_1000"])

final_res_recall2 = pt.Experiment(
                        retr_systems = [bm25],
                        names =  ["BM25"],
                        topics = new_queries2,
                        qrels = qrels,
                        eval_metrics = ["recall_10", "recall_100", "recall_1000"])

print(final_res_ndcg2)
print(final_res_map2)
print(final_res_precision2)
print(final_res_recall2)

   name  ndcg_cut_10  ndcg_cut_100  ndcg_cut_1000
0  BM25     0.578522      0.603028       0.615622
   name  map_cut_10  map_cut_100  map_cut_1000
0  BM25     0.52816     0.533637      0.534126
   name      P_10   P_100    P_1000
0  BM25  0.078667  0.0094  0.001057
   name  recall_10  recall_100  recall_1000
0  BM25   0.720389    0.830222     0.926667


In [30]:
final_res_ndcg3 = pt.Experiment(
                        retr_systems = [bm25],
                        names =  ["BM25"],
                        topics = new_queries3,
                        qrels = qrels,
                        eval_metrics = ["ndcg_cut_10", "ndcg_cut_100", "ndcg_cut_1000"])

final_res_map3 = pt.Experiment(
                        retr_systems = [bm25],
                        names =  ["BM25"],
                        topics = new_queries3,
                        qrels = qrels,
                        eval_metrics = ["map_cut_10", "map_cut_100", "map_cut_1000"])

final_res_precision3 = pt.Experiment(
                        retr_systems = [bm25],
                        names =  ["BM25"],
                        topics = new_queries3,
                        qrels = qrels,
                        eval_metrics = ["P_10", "P_100", "P_1000"])

final_res_recall3 = pt.Experiment(
                        retr_systems = [bm25],
                        names =  ["BM25"],
                        topics = new_queries3,
                        qrels = qrels,
                        eval_metrics = ["recall_10", "recall_100", "recall_1000"])

print(final_res_ndcg3)
print(final_res_map3)
print(final_res_precision3)
print(final_res_recall3)

   name  ndcg_cut_10  ndcg_cut_100  ndcg_cut_1000
0  BM25     0.579082      0.613901       0.625229
   name  map_cut_10  map_cut_100  map_cut_1000
0  BM25    0.529841     0.537826      0.538224
   name   P_10   P_100   P_1000
0  BM25  0.079  0.0099  0.00109
   name  recall_10  recall_100  recall_1000
0  BM25   0.717556       0.871     0.961667


In [31]:
final_res_ndcg4 = pt.Experiment(
                        retr_systems = [bm25],
                        names =  ["BM25"],
                        topics = new_queries4,
                        qrels = qrels,
                        eval_metrics = ["ndcg_cut_10", "ndcg_cut_100", "ndcg_cut_1000"])

final_res_map4 = pt.Experiment(
                        retr_systems = [bm25],
                        names =  ["BM25"],
                        topics = new_queries4,
                        qrels = qrels,
                        eval_metrics = ["map_cut_10", "map_cut_100", "map_cut_1000"])

final_res_precision4 = pt.Experiment(
                        retr_systems = [bm25],
                        names =  ["BM25"],
                        topics = new_queries4,
                        qrels = qrels,
                        eval_metrics = ["P_10", "P_100", "P_1000"])

final_res_recall4 = pt.Experiment(
                        retr_systems = [bm25],
                        names =  ["BM25"],
                        topics = new_queries4,
                        qrels = qrels,
                        eval_metrics = ["recall_10", "recall_100", "recall_1000"])

print(final_res_ndcg4)
print(final_res_map4)
print(final_res_precision4)
print(final_res_recall4)

   name  ndcg_cut_10  ndcg_cut_100  ndcg_cut_1000
0  BM25     0.573299      0.604686       0.617388
   name  map_cut_10  map_cut_100  map_cut_1000
0  BM25    0.529361     0.536378      0.536873
   name      P_10   P_100   P_1000
0  BM25  0.077333  0.0095  0.00106
   name  recall_10  recall_100  recall_1000
0  BM25   0.695333    0.835889     0.934333


In [32]:
final_res_ndcg5 = pt.Experiment(
                        retr_systems = [bm25],
                        names =  ["BM25"],
                        topics = new_queries5,
                        qrels = qrels,
                        eval_metrics = ["ndcg_cut_10", "ndcg_cut_100", "ndcg_cut_1000"])

final_res_map5 = pt.Experiment(
                        retr_systems = [bm25],
                        names =  ["BM25"],
                        topics = new_queries5,
                        qrels = qrels,
                        eval_metrics = ["map_cut_10", "map_cut_100", "map_cut_1000"])

final_res_precision5 = pt.Experiment(
                        retr_systems = [bm25],
                        names =  ["BM25"],
                        topics = new_queries5,
                        qrels = qrels,
                        eval_metrics = ["P_10", "P_100", "P_1000"])

final_res_recall5 = pt.Experiment(
                        retr_systems = [bm25],
                        names =  ["BM25"],
                        topics = new_queries5,
                        qrels = qrels,
                        eval_metrics = ["recall_10", "recall_100", "recall_1000"])

print(final_res_ndcg5)
print(final_res_map5)
print(final_res_precision5)
print(final_res_recall5)

   name  ndcg_cut_10  ndcg_cut_100  ndcg_cut_1000
0  BM25     0.547305      0.581519        0.59295
   name  map_cut_10  map_cut_100  map_cut_1000
0  BM25    0.497979     0.506437      0.506863
   name      P_10   P_100   P_1000
0  BM25  0.075333  0.0094  0.00105
   name  recall_10  recall_100  recall_1000
0  BM25   0.688944    0.834667     0.924333


In [49]:
final_res_ndcg6 = pt.Experiment(
                        retr_systems = [bm25],
                        names =  ["BM25"],
                        topics = new_queries6,
                        qrels = qrels,
                        eval_metrics = ["ndcg_cut_10", "ndcg_cut_100", "ndcg_cut_1000"])

final_res_map6 = pt.Experiment(
                        retr_systems = [bm25],
                        names =  ["BM25"],
                        topics = new_queries6,
                        qrels = qrels,
                        eval_metrics = ["map_cut_10", "map_cut_100", "map_cut_1000"])

final_res_precision6 = pt.Experiment(
                        retr_systems = [bm25],
                        names =  ["BM25"],
                        topics = new_queries6,
                        qrels = qrels,
                        eval_metrics = ["P_10", "P_100", "P_1000"])

final_res_recall6 = pt.Experiment(
                        retr_systems = [bm25],
                        names =  ["BM25"],
                        topics = new_queries6,
                        qrels = qrels,
                        eval_metrics = ["recall_10", "recall_100", "recall_1000"])

print(final_res_ndcg6)
print(final_res_map6)
print(final_res_precision6)
print(final_res_recall6)

   name  ndcg_cut_10  ndcg_cut_100  ndcg_cut_1000
0  BM25     0.600484      0.622392       0.629937
   name  map_cut_10  map_cut_100  map_cut_1000
0  BM25    0.548784     0.553886      0.554209
   name      P_10     P_100   P_1000
0  BM25  0.083333  0.009633  0.00103
   name  recall_10  recall_100  recall_1000
0  BM25   0.744333    0.843667          0.9
