In [1]:
from tira.third_party_integrations import ir_datasets, ensure_pyterrier_is_loaded, persist_and_normalize_run
import pyterrier as pt
import pandas as pd

# Start PyTerrier once per kernel. The boot package is put on the JVM classpath;
# presumably it provides the StanfordLemmatizer class used further down
# (NOTE(review): confirm against the package contents).
if not pt.started():
    pt.init(boot_packages=['mam10eks:custom-terrier-token-processing:0.0.1'])

# Import jnius only after PyTerrier is started, but do it unconditionally:
# when this cell is re-run (or PyTerrier was started elsewhere) the guard above
# is skipped, and `autoclass` must still be bound for later cells.
from jnius import autoclass

ensure_pyterrier_is_loaded()

# TIRA dataset identifiers for the two splits. Only the validation id is
# referenced by the cells visible in this notebook.
training_dataset = 'ir-lab-jena-leipzig-wise-2023/training-20231104-training'
validation_dataset = 'ir-lab-jena-leipzig-wise-2023/validation-20231104-training'

terrier-assemblies 5.8 jar-with-dependencies not found, downloading to /root/.pyterrier...
100% [......................................................................] 104292653 / 104292653Done
terrier-python-helper 0.0.8 jar not found, downloading to /root/.pyterrier...
100% [..............................................................................] 37524 / 37524Done


PyTerrier 0.9.2 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [8]:
def create_index(documents):
    """Index ``documents`` into ``/tmp/index`` and return the opened index.

    Args:
        documents: iterable of objects exposing ``doc_id`` and ``text``.

    Returns:
        The index obtained from ``pt.IndexFactory.of`` for the freshly written
        index reference.

    No ``stemmer`` argument is passed, so the indexer's default term
    processing applies. Any existing index at ``/tmp/index`` is overwritten.
    """
    def _as_records():
        # Stream one dict per document so the collection is never materialised.
        for doc in documents:
            yield {'docno': doc.doc_id, 'text': doc.text}

    index_builder = pt.IterDictIndexer(
        "/tmp/index",
        overwrite=True,
        meta={'docno': 100, 'text': 20480},
        verbose=True,
    )
    return pt.IndexFactory.of(index_builder.index(_as_records()))

In [2]:
def create_index_lemma(documents, index_path="/tmp/index-lemma"):
    """Index ``documents`` with the ``StanfordLemmatizer`` term pipeline.

    Args:
        documents: iterable of objects exposing ``doc_id`` and ``text``.
        index_path: directory the index is written to. It defaults to a
            location separate from the ``/tmp/index`` directory used by
            ``create_index``: both functions index with ``overwrite=True``, so
            sharing one directory would replace the files of whichever index
            was built first while it may still be in use.

    Returns:
        The index obtained from ``pt.IndexFactory.of`` for the freshly written
        index reference.
    """
    indexer = pt.IterDictIndexer(
        index_path,
        overwrite=True,
        stemmer='StanfordLemmatizer',
        meta={'docno': 100, 'text': 20480},
        verbose=True,
    )
    # Stream one dict per document so the collection is never materialised.
    index_ref = indexer.index({'docno': doc.doc_id, 'text': doc.text} for doc in documents)
    return pt.IndexFactory.of(index_ref)

In [5]:
# Load the validation split: the dataset handle, its topics, and its relevance judgments.
dataset = ir_datasets.load(validation_dataset)

topics_path = ir_datasets.topics_file(validation_dataset)
queries = pt.io.read_topics(topics_path, format='trecxml')

# Rename the query-id column of the qrels to "qid", the name the topics frame uses.
qrels_raw = pd.DataFrame(dataset.qrels_iter())
qrels = qrels_raw.rename(columns={"query_id": "qid"})

queries.head(3)

Load ir_dataset "ir-lab-jena-leipzig-wise-2023/validation-20231104-training" from tira.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.


Unnamed: 0,qid,query
0,q072224,purchase money
1,q072226,purchase used car
2,q072232,buy gold silver


In [11]:
# Baseline: BM25 over the index built by create_index, with explicit b / k_1 controls.
configuration = {
    "bm25.b": 0.75,
    "bm25.k_1": 1.2,
}
index = create_index(dataset.docs_iter())
bm25Orig = pt.BatchRetrieve(
    index,
    wmodel="BM25",
    controls=configuration,
    verbose=True,
)

No settings given in /root/.tira/.tira-settings.json. I will use defaults.


In [26]:
# Evaluate the baseline retriever on the validation topics with nDCG@5.
pt.Experiment(
    retr_systems=[bm25Orig],
    topics=pd.DataFrame(queries),
    qrels=qrels,
    eval_metrics=['ndcg_cut_5'],
)

BR(BM25): 100%|████████████████████████████████████████████████████████████████████████| 882/882 [07:10<00:00,  2.05q/s]


Unnamed: 0,name,ndcg_cut_5
0,BR(BM25),0.145222


In [3]:
def lemmatize(t):
    """Return the result of ``StanfordLemmatizer.stem`` for the token ``t``.

    A new ``org.terrier.terms.StanfordLemmatizer`` instance is created on every
    call. Requires ``autoclass`` (from jnius) to be available in the notebook
    namespace.
    """
    lemmatizer_cls = autoclass("org.terrier.terms.StanfordLemmatizer")
    return lemmatizer_cls().stem(t)

# Spot-check the lemmatizer on a few words. The printed label and the
# lemmatized input come from the same variable, so the two cannot drift apart
# (the previous version printed the label "corpora" while lemmatizing "corpus").
for word in ('are', 'producer', 'produces', 'corpora'):
    print(word, '=>', lemmatize(word))

are => be
producer => producer
produces => produce
corpora => corpus


In [7]:
# Lemmatized variant: same BM25 controls, run over the index built by create_index_lemma.
configuration = {
    "bm25.b": 0.75,
    "bm25.k_1": 1.2,
}
lemma_index = create_index_lemma(dataset.docs_iter())
bm25lemma = pt.BatchRetrieve(
    lemma_index,
    wmodel="BM25",
    controls=configuration,
    verbose=True,
)

In [8]:
# Evaluate the lemmatized-index retriever on the same topics and qrels with nDCG@5.
pt.Experiment(
    retr_systems=[bm25lemma],
    topics=pd.DataFrame(queries),
    qrels=qrels,
    eval_metrics=['ndcg_cut_5'],
)

BR(BM25): 100%|████████████████████████████████████████████████████████████████████████| 882/882 [15:32<00:00,  1.06s/q]


Unnamed: 0,name,ndcg_cut_5
0,BR(BM25),0.142147
