# Information Retrieval project 
**Authors:** Arduini L., Menchini L., Namaki Ghaneh D., Petruzzella C.

# 1. Preprocessing

In [1]:
!pip install ir_datasets
!pip install nltk


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import ir_datasets

# Load the MS MARCO dataset
dataset = ir_datasets.load("msmarco-passage")

In [3]:
# print the first document in the dataset

import random

# Initialize a flag to check if a document has been printed
document_printed = False

# Iterate over the documents in the dataset
for doc in dataset.docs_iter():
    if not document_printed:
        if random.random() < 0.01:  # Adjust the probability as needed
            print(doc.text)
            document_printed = True
            break

The Rise of Industrial America, 1877-1900. When in 1873 Mark Twain and Charles Dudley Warner entitled their co-authored novel The Gilded Age, they gave the late nineteenth century its popular name. The term reflected the combination of outward wealth and dazzle with inner corruption and poverty.


In [4]:
import re
import string
import nltk

nltk.download("stopwords", quiet=True)

# Compile regex patterns once globally
ACRONYM_REGEX = re.compile(r"(?<!\w)\.(?!\d)")
PUNCTUATION_TRANS = str.maketrans("", "", string.punctuation)

# Preload stopwords set
STOPWORDS = set(nltk.corpus.stopwords.words('english'))

# Initialize stemmer
STEMMER = nltk.stem.PorterStemmer()

def preprocess(s):
    # lowercasing
    s = s.lower()
    
    # replace ampersand
    s = s.replace("&", " and ")
    
    # normalize quotes and dashes
    s = s.translate(str.maketrans("‘’´“”–-", "'''\"\"--"))
    
    # remove unnecessary dots in acronyms (but not decimals)
    s = ACRONYM_REGEX.sub("", s)
    
    # remove punctuation
    s = s.translate(PUNCTUATION_TRANS)
    
    # strip and remove extra spaces
    s = " ".join(s.split())
    
    # tokenize
    tokens = s.split()
    
    # remove stopwords
    tokens = [t for t in tokens if t not in STOPWORDS]
    
    # stemming
    tokens = [STEMMER.stem(t) for t in tokens]
    
    return tokens


In [5]:
import time

def profile(f):
    def f_timer(*args, **kwargs):
        start = time.time()
        result = f(*args, **kwargs)
        end = time.time()
        ms = (end - start) * 1000
        print(f"{f.__name__} ({ms:.3f} ms)")
        return result
    return f_timer

In [6]:

from collections import Counter, defaultdict
from tqdm import tqdm

@profile
def build_index(dataset):
    lexicon = {}
    doc_index = []
    
    # Uniamo inv_d e inv_f in un unico dizionario usando defaultdict
    inverted_index = defaultdict(lambda: {'docids': [], 'freqs': []})
    
    termid = 0
    total_dl = 0
    num_docs = 0
    
    # Iteriamo sui documenti del dataset
    for docid, doc in tqdm(enumerate(dataset.docs_iter()), desc='Indexing', total=dataset.docs_count()):
        tokens = preprocess(doc.text)
        token_tf = Counter(tokens)  # Frequenze dei termini nel documento
        doclen = len(tokens)
        total_dl += doclen
        num_docs += 1
        
        # Aggiorniamo l'indice invertito e il lexicon
        for token, tf in token_tf.items():
            if token not in lexicon:
                lexicon[token] = [termid, 0, 0]  # [termid, df, tf]
                termid += 1

            token_id = lexicon[token][0]
            lexicon[token][1] += 1  # Incrementiamo df
            lexicon[token][2] += tf  # Incrementiamo tf

            inverted_index[token_id]['docids'].append(docid)
            inverted_index[token_id]['freqs'].append(tf)

        # Inseriamo il documento nell'indice
        doc_index.append((str(doc.doc_id), doclen))
    
    # Statistiche finali
    stats = {
        'num_docs': num_docs,
        'num_terms': len(lexicon),
        'num_tokens': total_dl,
    }
    
    return lexicon, inverted_index, doc_index, stats


In [7]:
lex, inv, doc, stats = build_index(dataset)

Indexing: 100%|██████████| 8841823/8841823 [1:19:50<00:00, 1845.83it/s] 

build_index (4790183.575 ms)





Compress and save the index components

In [11]:
import gzip
import pickle

with gzip.open('lexicon.pickle.gz', 'wb') as f:
  pickle.dump(lex, f)
with gzip.open('inverted_file.pickle.gz', 'wb') as f:
  pickle.dump(inv, f)
with gzip.open('document_index.pickle.gz', 'wb') as f:
  pickle.dump(doc, f)
with gzip.open('stats.pickle.gz', 'wb') as f:
  pickle.dump(stats, f)

AttributeError: Can't pickle local object 'build_index.<locals>.<lambda>'

Decompress and load the index components


In [None]:
with gzip.open('lexicon.pickle.gz', 'rb') as f:
  lex = pickle.load(f)
with gzip.open('inverted_file.pickle.gz', 'rb') as f:
  inv = pickle.load(f)
with gzip.open('document_index.pickle.gz', 'rb') as f:
  doc = pickle.load(f)
with gzip.open('stats.pickle.gz', 'rb') as f:
  stats = pickle.load(f)

# 3. Query processing

# 4. Evaluation