# Information Retrieval project
**Authors:** Arduini L., Menchini L., Namaki Ghaneh D., Petruzzella C.

# 1. Preprocessing

In [None]:
!pip install ir_datasets
!pip install nltk
!pip install ir_measures

In [138]:
import ir_datasets

# Load the document collection. The Vaswani collection is the one actually used;
# the MS MARCO passage corpus is kept as a commented-out alternative.
# dataset = ir_datasets.load("msmarco-passage")
dataset = ir_datasets.load("vaswani")

In [None]:
# print one randomly selected document from the dataset

import random

# Stays False if the loop finishes without any document being selected
document_printed = False

# Walk the collection; each document is selected with probability 0.01
# and the first selected one is printed, after which the scan stops.
for doc in dataset.docs_iter():
    if random.random() >= 0.01:  # Adjust the probability as needed
        continue
    print(doc.text)
    document_printed = True
    break

In [140]:
import re
import string
import nltk

# Make sure the NLTK stopword corpus is available locally
nltk.download("stopwords", quiet=True)

# Compile regex patterns once globally
# Matches a "." that is neither preceded by a word character nor followed by a digit
ACRONYM_REGEX = re.compile(r"(?<!\w)\.(?!\d)")
# Translation table that deletes every character listed in string.punctuation
PUNCTUATION_TRANS = str.maketrans("", "", string.punctuation)

# Preload stopwords set
STOPWORDS = set(nltk.corpus.stopwords.words('english'))

# Initialize stemmer
STEMMER = nltk.stem.PorterStemmer()

# Maps typographic quotes and dashes to their ASCII forms. Built once here
# instead of on every preprocess() call.
_QUOTE_DASH_TRANS = str.maketrans("‘’´“”–-", "'''\"\"--")


def preprocess(s):
    """Turn raw text into a list of stemmed, stopword-free tokens.

    Steps, in order: lowercase, replace "&" with " and ", normalize
    typographic quotes/dashes to ASCII, apply ACRONYM_REGEX, delete ASCII
    punctuation, split on whitespace, drop stopwords, stem.

    Args:
        s: the text of a document or query.

    Returns:
        The list of stemmed tokens, in their original order.
    """
    s = s.lower().replace("&", " and ")

    # Normalizing to ASCII first lets the punctuation table below remove
    # typographic quotes/dashes as well.
    s = s.translate(_QUOTE_DASH_TRANS)

    # Drops dots that are not preceded by a word character and not followed by
    # a digit.
    # NOTE(review): the punctuation strip below deletes every remaining ".",
    # decimals included, so numbers such as "3.14" end up as "314".
    s = ACRONYM_REGEX.sub("", s)

    s = s.translate(PUNCTUATION_TRANS)

    # str.split() already trims and collapses whitespace, so no separate
    # normalization pass is needed before tokenizing.
    return [STEMMER.stem(token) for token in s.split() if token not in STOPWORDS]


In [141]:
import time

def profile(f):
    """Decorator that prints how long each call to *f* takes.

    After every call a line of the form ``"<name> (<elapsed> ms)"`` is printed,
    where the elapsed time has three decimals. The wrapped function's return
    value is passed through unchanged.

    Args:
        f: the callable to time.

    Returns:
        A wrapper with the same signature, name and docstring as *f*.
    """
    import functools

    # wraps() keeps __name__/__doc__ of the original, so decorated functions
    # stay recognizable in tracebacks and help().
    @functools.wraps(f)
    def f_timer(*args, **kwargs):
        # perf_counter is monotonic, so the interval cannot be skewed by
        # system clock adjustments.
        start = time.perf_counter()
        result = f(*args, **kwargs)
        ms = (time.perf_counter() - start) * 1000
        print(f"{f.__name__} ({ms:.3f} ms)")
        return result
    return f_timer

In [142]:
from collections import Counter
from tqdm.auto import tqdm

@profile
def build_index(dataset):
    """Build an in-memory inverted index over every document of *dataset*.

    Args:
        dataset: object exposing ``docs_iter()`` (documents with ``doc_id`` and
            ``text``) and ``docs_count()``.

    Returns:
        A 4-tuple ``(lexicon, inverted_file, doc_index, stats)``:
        - lexicon: token -> ``[termid, document frequency, collection frequency]``
        - inverted_file: ``{'docids': {termid: [docid, ...]},
          'freqs': {termid: [tf, ...]}}`` with parallel lists per term
        - doc_index: list of ``(doc_id as str, document length in tokens)``,
          positioned by internal docid
        - stats: ``{'num_docs', 'num_terms', 'num_tokens'}``
    """
    lexicon = {}
    doc_index = []
    inv_d, inv_f = {}, {}

    num_docs = 0
    total_dl = 0
    for docid, doc in tqdm(enumerate(dataset.docs_iter()), desc='Indexing', total=dataset.docs_count()):
        tokens = preprocess(doc.text)
        for token, tf in Counter(tokens).items():
            entry = lexicon.get(token)
            if entry is None:
                # Term ids are assigned densely in order of first appearance.
                termid = len(lexicon)
                entry = lexicon[token] = [termid, 0, 0]
                inv_d[termid], inv_f[termid] = [], []
            token_id = entry[0]
            # Documents are visited in increasing docid order, so each
            # posting list stays sorted by docid.
            inv_d[token_id].append(docid)
            inv_f[token_id].append(tf)
            entry[1] += 1
            entry[2] += tf
        doclen = len(tokens)
        doc_index.append((str(doc.doc_id), doclen))
        total_dl += doclen
        num_docs += 1

    stats = {
        # Use the explicit counter: the loop variable is undefined when the
        # collection is empty.
        'num_docs': num_docs,
        'num_terms': len(lexicon),
        'num_tokens': total_dl,
    }
    return lexicon, {'docids': inv_d, 'freqs': inv_f}, doc_index, stats

Build the index, then compress and save its components (or load them from disk if they were already saved)

In [None]:
import gzip
import pickle
import os

#
# Reuse the saved index when all four component files are present in the
# current directory; otherwise build the index and save the components.
#

# Order matters: it matches the (lex, inv, doc, stats) unpacking below.
files_to_check = ['lexicon.pickle.gz', 'inverted_file.pickle.gz', 'document_index.pickle.gz', 'stats.pickle.gz']


def _load_gz_pickle(path):
    """Return the object stored in the gzip-compressed pickle at *path*."""
    with gzip.open(path, 'rb') as f:
        return pickle.load(f)


def _dump_gz_pickle(obj, path):
    """Write *obj* to *path* as a gzip-compressed pickle."""
    with gzip.open(path, 'wb') as f:
        pickle.dump(obj, f)


if all(map(os.path.exists, files_to_check)):
    print("All files already exist.")
    lex, inv, doc, stats = (_load_gz_pickle(path) for path in files_to_check)
else:
    lex, inv, doc, stats = build_index(dataset)
    for component, path in zip((lex, inv, doc, stats), files_to_check):
        _dump_gz_pickle(component, path)

In [144]:
import math

from bisect import bisect_left


class InvertedIndex:
    """Read-only view over the index components produced by the indexing step.

    Attributes are plain references to the objects passed to the constructor:
    ``lexicon`` (token -> list whose first item is the termid), ``inv``
    (``{'docids': {termid: [...]}, 'freqs': {termid: [...]}}``), ``doc``
    (sequence indexed by docid whose item ``[1]`` is the document length) and
    ``stat`` / ``stats`` (collection statistics dict).
    """

    class PostingListIterator:
        """Cursor over one term's posting list (parallel docid / frequency lists)."""

        def __init__(self, docids, freqs, doc):
            self.docids = docids
            self.freqs = freqs
            self.pos = 0
            self.doc = doc

        def docid(self):
            """Return the current docid, or ``math.inf`` once the list is exhausted."""
            if self.is_end_list():
                return math.inf
            return self.docids[self.pos]

        def score(self):
            """Return tf / document length for the current posting (``math.inf`` at the end)."""
            if self.is_end_list():
                return math.inf
            return self.freqs[self.pos]/self.doc[self.docid()][1]

        def next(self, target=None):
            """Advance the cursor.

            Without *target* the cursor moves one posting forward. With
            *target* it moves to the first posting whose docid is >= *target*
            (or to the end of the list); it never moves backwards.
            """
            # `is None` rather than truthiness: docid 0 is a valid target.
            if target is None:
                if not self.is_end_list():
                    self.pos += 1
            elif target > self.docid():
                # The docid list is sorted ascending, so a binary search from
                # the current position lands on the first docid >= target
                # even when the target itself is absent.
                self.pos = bisect_left(self.docids, target, self.pos)

        def is_end_list(self):
            """Return True once the cursor has moved past the last posting."""
            return self.pos == len(self.docids)

        def len(self):
            """Return the number of postings in the list."""
            return len(self.docids)

    def __init__(self, lex, inv, doc, stats):
        self.lexicon = lex
        self.inv = inv
        self.doc = doc
        # `stat` is the name used by existing callers; `stats` is kept as an
        # alias so both spellings refer to the same dict.
        self.stat = stats
        self.stats = stats

    def num_docs(self):
        """Return the number of indexed documents."""
        return self.stat['num_docs']

    def get_posting(self, termid):
        """Return a fresh posting-list cursor for *termid*."""
        return InvertedIndex.PostingListIterator(self.inv['docids'][termid], self.inv['freqs'][termid], self.doc)

    def get_termids(self, tokens):
        """Map tokens to termids, silently skipping tokens missing from the lexicon."""
        return [self.lexicon[token][0] for token in tokens if token in self.lexicon]

    def get_postings(self, termids):
        """Return one fresh posting-list cursor per termid, in the given order."""
        return [self.get_posting(termid) for termid in termids]

In [145]:
# Wrap the loaded index components in the InvertedIndex used by query processing
inv_index = InvertedIndex(lex, inv, doc, stats)

# 3. Query processing

Decompress and load the index components


In [146]:
import gzip
import pickle

def _read_gz_pickle(path):
    """Return the object stored in the gzip-compressed pickle at *path*."""
    with gzip.open(path, 'rb') as f:
        return pickle.load(f)


# Reload the four index components from disk
lex = _read_gz_pickle('lexicon.pickle.gz')
inv = _read_gz_pickle('inverted_file.pickle.gz')
doc = _read_gz_pickle('document_index.pickle.gz')
stats = _read_gz_pickle('stats.pickle.gz')

In [147]:
# Source of the queries (and, later, the qrels).
# NOTE(review): the variable is named after TREC DL 2020, but the Vaswani
# collection is what is actually loaded here.
# trec_dl_2020 = ir_datasets.load("msmarco-passage/trec-dl-2020")
trec_dl_2020 = ir_datasets.load("vaswani")

### Ranked Retrieval

In [148]:
import heapq

class TopQueue:
    """Bounded min-heap keeping the *k* highest-scoring ``(score, docid)`` pairs.

    ``threshold`` is the score a candidate must strictly exceed to be
    admitted. Once the queue holds *k* items it is raised to the lowest score
    currently in the queue.
    """

    def __init__(self, k=10, threshold=0.0):
        self.queue = []
        self.k = k
        self.threshold = threshold

    def size(self):
        """Return the number of items currently held."""
        return len(self.queue)

    def would_enter(self, score):
        """Return True if *score* is strictly above the current threshold."""
        return score > self.threshold

    def clear(self, new_threshold=None):
        """Empty the queue; optionally reset the threshold to *new_threshold*."""
        self.queue = []
        # `is not None` so that an explicit 0.0 resets the threshold too.
        if new_threshold is not None:
            self.threshold = new_threshold

    def __repr__(self):
        return f'<{self.size()} items, th={self.threshold} {self.queue}>'

    def insert(self, docid, score):
        """Offer ``(score, docid)`` to the queue.

        Returns:
            True if the pair was admitted, False if its score did not exceed
            the threshold (or the queue has no capacity).
        """
        # A queue with k <= 0 can never hold anything; without this guard
        # heapreplace would be called on an empty heap.
        if self.k <= 0 or not score > self.threshold:
            return False
        if self.size() >= self.k:
            # Full: evict the current minimum in the same heap operation.
            heapq.heapreplace(self.queue, (score, docid))
        else:
            heapq.heappush(self.queue, (score, docid))
        if self.size() >= self.k:
            self.threshold = max(self.threshold, self.queue[0][0])
        return True

BM25 scoring function

In [149]:
# Collection statistics consumed by bm25(): document count and average document length
N = inv_index.stat['num_docs']
avg_dl = inv_index.stat['num_tokens'] / N

# BM25 for a term
def bm25(tf, df, dl, k1=1.5, b=0.75, num_docs=None, avg_doc_len=None):
    """Return the BM25 contribution of one term to one document's score.

    Args:
        tf: frequency of the term in the document.
        df: number of documents containing the term.
        dl: length of the document in tokens.
        k1: term-frequency saturation parameter.
        b: document-length normalization parameter.
        num_docs: collection size; defaults to the module-level ``N``.
        avg_doc_len: average document length; defaults to the module-level
            ``avg_dl``.

    Returns:
        ``idf * tf_component`` as a float.
    """
    # Falling back to the globals keeps existing three-argument calls working,
    # while explicit values make the function usable with any collection.
    if num_docs is None:
        num_docs = N
    if avg_doc_len is None:
        avg_doc_len = avg_dl

    # The "1 +" inside the log keeps the idf positive for df <= num_docs.
    idf = math.log(1 + (num_docs - df + 0.5) / (df + 0.5))
    length_norm = 1 - b + b * (dl / avg_doc_len)
    term_frequency_component = (tf * (k1 + 1)) / (tf + k1 * length_norm)
    return idf * term_frequency_component

Document-at-a-time (DAAT) retrieval with BM25

In [150]:
import math
from collections import defaultdict

# Document lengths keyed by internal docid (the position in the document index).
# Posting lists carry these integer docids, whereas the first field of each
# document-index entry is the collection's own doc_id string. Keying by that
# string would make every lookup by posting docid miss.
doc_lengths = {
    internal_docid: doc_len
    for internal_docid, (_, doc_len) in enumerate(inv_index.doc)
}

def min_docid(postings):
    """Return the smallest current docid among the non-exhausted posting lists.

    Returns ``math.inf`` when *postings* is empty or every list is exhausted.
    """
    live_docids = (plist.docid() for plist in postings if not plist.is_end_list())
    return min(live_docids, default=math.inf)

def daat_bm25(postings, k=10):
    """Score documents one at a time with BM25 and return the top *k*.

    All posting lists are traversed in parallel. At each step the smallest
    current docid is scored by summing the BM25 contribution of every list
    positioned on it. The posting-list cursors are consumed in the process.

    Args:
        postings: posting-list cursors, one per query term.
        k: number of results to keep.

    Returns:
        ``(score, docid)`` pairs sorted in descending order.
    """
    heap = TopQueue(k)
    candidate = min_docid(postings)

    while candidate != math.inf:
        doc_score = 0
        upcoming = math.inf

        for plist in postings:
            if plist.docid() == candidate:
                # This term occurs in the candidate: add its contribution and
                # move this cursor past the candidate.
                doc_score += bm25(plist.freqs[plist.pos], plist.len(), doc_lengths[candidate])
                plist.next()
            if plist.is_end_list():
                continue
            # Track the smallest docid still ahead across all lists.
            upcoming = min(upcoming, plist.docid())

        heap.insert(candidate, doc_score)
        candidate = upcoming

    ranked = list(heap.queue)
    ranked.sort(reverse=True)
    return ranked

Term-at-a-time (TAAT) retrieval with BM25

In [151]:
def taat_bm25(postings, k=10):
    """Score documents one term at a time with BM25 and return the top *k*.

    Each posting list is scanned in full, adding its BM25 contributions to a
    per-document accumulator. The accumulators are then pushed through a
    top-k queue. The posting-list cursors are consumed in the process.

    Args:
        postings: posting-list cursors, one per query term.
        k: number of results to keep.

    Returns:
        ``(score, docid)`` pairs sorted in descending order.
    """
    accumulators = defaultdict(float)

    for plist in postings:
        # Document frequency is constant for the whole list.
        df = plist.len()
        while not plist.is_end_list():
            docid = plist.docid()
            accumulators[docid] += bm25(plist.freqs[plist.pos], df, doc_lengths[docid])
            plist.next()

    heap = TopQueue(k)
    for docid, total in accumulators.items():
        heap.insert(docid, total)

    ranked = list(heap.queue)
    ranked.sort(reverse=True)
    return ranked

In [152]:
@profile
def query_processing(queries_iter, fn):
    """Run every query through the retrieval function *fn*.

    Args:
        queries_iter: iterable of queries exposing ``query_id`` and ``text``.
        fn: callable taking a list of posting-list cursors and returning the
            scored results for one query.

    Returns:
        A list with one ``{'query_id': ..., 'scores': ...}`` dict per query,
        in iteration order.
    """
    def _answer(query):
        # Queries go through the same preprocessing as documents so their
        # tokens can be looked up in the lexicon.
        terms = preprocess(query.text)
        cursors = inv_index.get_postings(inv_index.get_termids(terms))
        return {'query_id': query.query_id, 'scores': fn(cursors)}

    return [_answer(q) for q in queries_iter]

In [None]:
# Run every query with document-at-a-time BM25 and print the per-query results
print(query_processing(trec_dl_2020.queries_iter(), daat_bm25))

In [None]:
# Run every query with term-at-a-time BM25; `results` is what the run-file
# cell in the evaluation section iterates over.
results = query_processing(trec_dl_2020.queries_iter(), taat_bm25)
print(results)

# 4. Evaluation

In [None]:
# Show the first three documents. The loop variable is deliberately not called
# `doc`: that name holds the document index loaded earlier in the notebook, and
# rebinding it would break any later re-run of the cells that use it.
for sample_doc in dataset.docs_iter()[:3]:
    print(sample_doc)

In [None]:
import itertools

# Keep the query/qrels source consistent with the collection that was indexed
# and queried above (Vaswani). Loading TREC DL 2020 here would make the qrels
# written below belong to a different collection than the run file.
# trec_dl_2020 = ir_datasets.load("msmarco-passage/trec-dl-2020")
trec_dl_2020 = ir_datasets.load("vaswani")
# islice works on any iterator, so it does not depend on queries_iter()
# supporting slice syntax.
for query in itertools.islice(trec_dl_2020.queries_iter(), 3):
    print(query) # namedtuple<query_id, text>

In [None]:
import itertools

# Peek at the first three relevance judgements without materializing the
# whole qrels list just to discard the rest.
for qrel in itertools.islice(trec_dl_2020.qrels_iter(), 3):
    print(qrel)

In [None]:
# Generate run file: one "<query_id> Q0 <doc_id> <rank> <score> <tag>" line per result
trec_run_list = []
for doc_scores in results:
    query_id = doc_scores['query_id']

    for rank, (score, docid) in enumerate(doc_scores['scores'], start=1):
        # The ranker returns internal docids (positions in the document
        # index). The qrels use the collection's own doc_id, stored as the
        # first field of each document-index entry, so translate before
        # writing; otherwise run and qrels would not refer to the same ids.
        doc_id = inv_index.doc[docid][0]
        trec_run_list.append(f"{query_id} Q0 {doc_id} {rank} {score} GOODFELLAS")

with open("trec_eval_run_file.txt", "w") as f:
    f.writelines(f"{line}\n" for line in trec_run_list)

In [165]:
# Render every relevance judgement as a trec_eval qrels line:
# "<query_id> 0 <doc_id> <relevance>"
qrels_file = [
    f"{qrel.query_id} 0 {qrel.doc_id} {qrel.relevance}"
    for qrel in trec_dl_2020.qrels_iter()
]

In [166]:
# Write the qrels lines to disk, one per line
with open("trec_eval_qrels_file.txt", "w") as f:
    f.writelines(f"{line}\n" for line in qrels_file)

In [167]:
import ir_measures
# Read back the qrels and run files written by the previous cells
qrels = ir_measures.read_trec_qrels('trec_eval_qrels_file.txt')
run = ir_measures.read_trec_run('trec_eval_run_file.txt')

In [None]:
# Measures to report: P@10, R@1000, AP and nDCG@10, aggregated over all queries
measures = ir_measures.P@10, ir_measures.R@1000, ir_measures.AP, ir_measures.nDCG@10
# NOTE(review): this rebinds `results`, which the run-file cell iterates as the
# per-query ranking list; re-running that cell after this one will fail.
# Consider a distinct name here.
results = ir_measures.calc_aggregate(measures, qrels, run)
print(results)