# Information Retrieval project 
**Authors:** Arduini L., Menchini L., Namaki Ghaneh D., Petruzzella C.

# 1. Preprocessing

In [1]:
!pip install ir_datasets
!pip install nltk
!pip install ir_measures



In [2]:
import ir_datasets

# Load the MS MARCO dataset ("msmarco-passage" id in ir_datasets).
# `dataset.docs_iter()` is what the cells below use to walk the documents.
dataset = ir_datasets.load("msmarco-passage")

In [3]:
# Print one randomly chosen document from the dataset (not necessarily the first one)

import random

# Walk the collection and stop at the first document that passes a 1% coin flip.
# No seed is set, so a different document may be printed on every run; if no
# document passes the coin flip, nothing is printed.
for candidate in dataset.docs_iter():
    if random.random() < 0.01:  # Adjust the probability as needed
        print(candidate.text)
        break

For example, the pressure of electromagnetic radiation on an object derives from the transfer of photon momentum per unit time and unit area to that object, since pressure is force per unit area and force is the change in momentum per unit time.


In [4]:
import re
import string
import nltk

# Fetch the NLTK "stopwords" corpus that STOPWORDS below is built from
nltk.download("stopwords", quiet=True)

# Compile regex patterns once globally
# ACRONYM_REGEX matches a "." that is NOT preceded by a word character and NOT followed by a digit
ACRONYM_REGEX = re.compile(r"(?<!\w)\.(?!\d)")
# Translation table that deletes every character listed in string.punctuation
PUNCTUATION_TRANS = str.maketrans("", "", string.punctuation)

# Preload stopwords set
STOPWORDS = set(nltk.corpus.stopwords.words('english'))

# Initialize stemmer
STEMMER = nltk.stem.PorterStemmer()

# Maps typographic quotes/accents and the en dash to their ASCII equivalents.
# Built once here instead of on every preprocess() call.
QUOTE_DASH_TRANS = str.maketrans("‘’´“”–-", "'''\"\"--")


def preprocess(s):
    """Turn a raw text string into a list of stemmed, stopword-free tokens.

    Steps, in order: lowercase, expand "&" to " and ", map typographic
    quotes/dashes to ASCII, strip punctuation, split on whitespace, drop
    tokens found in STOPWORDS, and stem the remainder with STEMMER.

    Args:
        s: the text to process (document or query).

    Returns:
        A list of token strings; empty when nothing survives the filtering.
    """
    # lowercasing and ampersand expansion
    s = s.lower().replace("&", " and ")

    # normalize quotes and dashes so the ASCII punctuation strip below catches them
    s = s.translate(QUOTE_DASH_TRANS)

    # NOTE(review): ACRONYM_REGEX only removes dots, and PUNCTUATION_TRANS below
    # deletes *every* "." anyway, so decimals are NOT preserved ("3.14" -> "314")
    # and this substitution does not affect the final tokens. Kept as-is so the
    # output stays compatible with any index already built; revisit if decimals
    # are meant to survive.
    s = ACRONYM_REGEX.sub("", s)

    # remove punctuation (characters are deleted, not replaced by a space)
    s = s.translate(PUNCTUATION_TRANS)

    # str.split() with no argument already discards leading, trailing and
    # repeated whitespace, so no separate space-normalisation pass is needed.
    # Stopwords are filtered on the unstemmed token, then the survivors are stemmed.
    return [STEMMER.stem(t) for t in s.split() if t not in STOPWORDS]


In [5]:
import functools
import time

def profile(f):
    """Decorator that prints the wall-clock duration of every call to ``f``.

    The wrapped function's return value is passed through unchanged. After
    each successful call a line of the form ``<name> (<ms> ms)`` is printed;
    if ``f`` raises, the exception propagates and nothing is printed.

    Args:
        f: the callable to time.

    Returns:
        A wrapper with the same call signature, name and docstring as ``f``.
    """
    @functools.wraps(f)  # keep f's __name__/__doc__ on the wrapper
    def f_timer(*args, **kwargs):
        # perf_counter is monotonic, so the measurement cannot go negative
        # or jump when the system clock is adjusted.
        start = time.perf_counter()
        result = f(*args, **kwargs)
        ms = (time.perf_counter() - start) * 1000
        print(f"{f.__name__} ({ms:.3f} ms)")
        return result
    return f_timer

In [6]:
from collections import Counter
from tqdm.auto import tqdm

@profile
def build_index(dataset):
    """Build an in-memory inverted index over every document in ``dataset``.

    Documents are numbered by their position in ``dataset.docs_iter()``
    (0-based); that position is the docid stored in the posting lists.

    Args:
        dataset: object exposing ``docs_iter()`` (yielding items with
            ``.doc_id`` and ``.text``) and ``docs_count()``.

    Returns:
        A 4-tuple ``(lexicon, inverted, doc_index, stats)`` where

        * ``lexicon`` maps token -> ``[termid, document frequency,
          collection frequency]``;
        * ``inverted`` is ``{'docids': {termid: [docid, ...]},
          'freqs': {termid: [tf, ...]}}`` with the two lists aligned
          position by position;
        * ``doc_index`` is a list of ``(original doc_id as str, length in
          tokens)``, indexed by docid;
        * ``stats`` holds ``num_docs``, ``num_terms`` and ``num_tokens``.
    """
    lexicon = {}
    doc_index = []
    inv_d, inv_f = {}, {}
    termid = 0

    num_docs = 0
    total_dl = 0
    for docid, doc in tqdm(enumerate(dataset.docs_iter()), desc='Indexing', total=dataset.docs_count()):
        tokens = preprocess(doc.text)
        token_tf = Counter(tokens)
        for token, tf in token_tf.items():
            if token not in lexicon:
                # First sighting of this term: assign the next termid and
                # start empty posting lists for it.
                lexicon[token] = [termid, 0, 0]
                inv_d[termid], inv_f[termid] = [], []
                termid += 1
            entry = lexicon[token]
            token_id = entry[0]
            # docids are appended in iteration order, so each posting list
            # is sorted by docid.
            inv_d[token_id].append(docid)
            inv_f[token_id].append(tf)
            entry[1] += 1   # one more document contains the term
            entry[2] += tf  # total occurrences across the collection
        doclen = len(tokens)
        doc_index.append((str(doc.doc_id), doclen))
        total_dl += doclen
        num_docs += 1

    stats = {
        # Use the running counter rather than the loop variable: `docid` is
        # never bound when the collection is empty.
        'num_docs': num_docs,
        'num_terms': len(lexicon),
        'num_tokens': total_dl,
    }
    return lexicon, {'docids': inv_d, 'freqs': inv_f}, doc_index, stats

  from .autonotebook import tqdm as notebook_tqdm


Compress and save the index components (or load them if the compressed files already exist)

In [8]:
import gzip
import pickle
import os

#
# Check whether all cache files exist in the current directory; if they do,
# load the index from them, otherwise build the index and save the files.
#

# Order matches the tuple returned by build_index: lexicon, inverted file,
# document index, stats.
files_to_check = ['lexicon.pickle.gz', 'inverted_file.pickle.gz', 'document_index.pickle.gz', 'stats.pickle.gz']


def _load_pickle_gz(path):
    """Return the object stored in the gzip-compressed pickle at ``path``."""
    # SECURITY: pickle.load executes arbitrary code from the file; only load
    # cache files this notebook produced itself.
    with gzip.open(path, 'rb') as f:
        return pickle.load(f)


def _dump_pickle_gz(obj, path):
    """Write ``obj`` to ``path`` as a gzip-compressed pickle."""
    # Write to a temporary name and rename afterwards, so an interrupted dump
    # never leaves a truncated file that a later run would accept as a cache.
    tmp_path = path + '.tmp'
    with gzip.open(tmp_path, 'wb') as f:
        pickle.dump(obj, f)
    os.replace(tmp_path, path)


if all(os.path.exists(file) for file in files_to_check):
    print("All files already exist.")
    lex, inv, doc, stats = (_load_pickle_gz(path) for path in files_to_check)
else:
    lex, inv, doc, stats = build_index(dataset)
    for component, path in zip((lex, inv, doc, stats), files_to_check):
        _dump_pickle_gz(component, path)

NameError: name 'lex' is not defined

Decompress and load the index components


# 3. Query processing

# 4. Evaluation

In [10]:
# Show the first three documents. The loop variable is deliberately not called
# `doc`: that name is bound to the document index by the cache cell, and
# reusing it here would silently overwrite the index.
for sample_doc in dataset.docs_iter()[:3]:
    print(sample_doc)

GenericDoc(doc_id='1', text='compact memories have flexible capacities  a digital data storage\nsystem with capacity up to bits and random and or sequential access\nis described\n')
GenericDoc(doc_id='2', text='an electronic analogue computer for solving systems of linear equations\nmathematical derivation of the operating principle and stability\nconditions for a computer consisting of amplifiers\n')
GenericDoc(doc_id='3', text='electronic coordinate transformer  circuit details are given for\nthe construction of an electronic calculating unit which enables\nthe polar coordinates of a vector modulus and cosine or sine of the\nargument to be derived from those of a rectangular system of axes\n')


In [11]:
from itertools import islice

trec_dl_2020 = ir_datasets.load("msmarco-passage/trec-dl-2020")
# islice works on any iterator, so this does not depend on queries_iter()
# returning an object that supports slicing.
for query in islice(trec_dl_2020.queries_iter(), 3):
    print(query) # namedtuple<query_id, text>

GenericQuery(query_id='1030303', text='who is aziz hashim')
GenericQuery(query_id='1037496', text='who is rep scalise?')
GenericQuery(query_id='1043135', text='who killed nicholas ii of russia')


In [12]:
from itertools import islice

# Take the first three relevance judgments lazily instead of materialising
# the whole qrels list just to show three entries.
for qrel in islice(trec_dl_2020.qrels_iter(), 3):
    print(qrel)

TrecQrel(query_id='23849', doc_id='1020327', relevance=2, iteration='0')
TrecQrel(query_id='23849', doc_id='1034183', relevance=3, iteration='0')
TrecQrel(query_id='23849', doc_id='1120730', relevance=0, iteration='0')


In [13]:
# Generate run file
# Each line is "<query_id> Q0 <doc_id> <rank> <score> GOODFELLAS"; rank is the
# 1-based position of the pair inside doc_scores.
# NOTE(review): assumes each doc_scores is already sorted best-first — confirm
# against the cell that produces `results`.
trec_run_list = []
for query_id, doc_scores in results.items():
    for rank, (doc_id, score) in enumerate(doc_scores, start=1):
        trec_run_list.append(f"{query_id} Q0 {doc_id} {rank} {score} GOODFELLAS")

with open("trec_eval_run_file.txt", "w") as f:
    f.writelines(run_line + "\n" for run_line in trec_run_list)

NameError: name 'results' is not defined

In [None]:
# Create format for Trec_Eval
# One line per judgment: "<query_id> 0 <doc_id> <relevance>"
qrels_file = [
    f"{judgment.query_id} 0 {judgment.doc_id} {judgment.relevance}"
    for judgment in trec_dl_2020.qrels_iter()
]

In [None]:
# Write the qrels lines to disk, one per line
with open("trec_eval_qrels_file.txt", "w") as f:
    f.writelines(f"{entry}\n" for entry in qrels_file)

In [None]:
import ir_measures
# Read back the qrels and run files written by the earlier cells
# ('trec_eval_qrels_file.txt' and 'trec_eval_run_file.txt').
qrels = ir_measures.read_trec_qrels('trec_eval_qrels_file.txt')
run = ir_measures.read_trec_run('trec_eval_run_file.txt')

In [None]:
# Aggregate effectiveness of the run over all judged queries.
# NOTE(review): the qrels are graded (0-3 in the sample shown above). For the
# binary measures (P, R, AP) the TREC DL passage task presumably counts only
# relevance >= 2 as relevant, i.e. P(rel=2)@10 etc. — confirm against the
# dataset documentation before reporting numbers.
measures = ir_measures.P@10, ir_measures.R@1000, ir_measures.AP, ir_measures.nDCG@10
# Stored under its own name: `results` holds the per-query retrieval output
# consumed by the run-file cell, and must not be overwritten here.
eval_results = ir_measures.calc_aggregate(measures, qrels, run)
print(eval_results)