# HW2 - WET PART

### Imports and Configuration

In [137]:
import logging
import re
import subprocess
import sys
import warnings
from pathlib import Path

from pyserini.search.lucene import LuceneSearcher
from pyserini.analysis import get_lucene_analyzer
from pyserini.index.lucene import IndexReader
from pyserini.search import get_topics_with_reader

# Silence Python warnings and raise the root logger threshold to ERROR
warnings.filterwarnings('ignore')
logging.getLogger().setLevel(logging.ERROR)

# Every path below hangs off the directory the notebook is run from
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / "data"
INDEXES_DIR = BASE_DIR / "indexes"

# Part A: collection directory and the index built from it
PART_A_DIR = DATA_DIR / "WET_PART_A"
PART_A_INDEX = INDEXES_DIR / "part_a_index"

# Part B: AP collection plus the query / stopword / relevance-judgment files
AP_COLL_PATH = DATA_DIR / "AP_Coll"
PART_B_DIR = DATA_DIR / "WET_PART_B"
QUERIES_PATH = PART_B_DIR.joinpath("queries.txt")
STOPWORDS_PATH = PART_B_DIR.joinpath("StopWords.txt")
QRELS_PATH = PART_B_DIR.joinpath("qrels_AP")

# Part B: one index per stemming configuration
INDEX_STEMMED = INDEXES_DIR / "ap_stemmed"
INDEX_UNSTEMMED = INDEXES_DIR / "ap_unstemmed"

# Part B: ranked lists (.trec) and trec_eval reports (.txt)
RESULTS_DIR = BASE_DIR / "results"
RESULTS_STEMMED = RESULTS_DIR.joinpath("stemmed.trec")
RESULTS_UNSTEMMED = RESULTS_DIR.joinpath("unstemmed.trec")
EVAL_STEMMED = RESULTS_DIR.joinpath("stemmed_eval.txt")
EVAL_UNSTEMMED = RESULTS_DIR.joinpath("unstemmed_eval.txt")

# Make sure every output directory exists before anything writes into it
for directory in (PART_A_INDEX, INDEX_STEMMED, INDEX_UNSTEMMED, RESULTS_DIR):
    directory.mkdir(parents=True, exist_ok=True)

print("Setup complete")

Setup complete


## Part A: Index docs.txt and test queries
Build the index over the Part A collection with `--keepStopwords --stemmer none` (stopwords kept, no stemming), then run the test queries against it.

In [138]:
def build_index(collection_path, index_path, stemmer='none', stopwords_path=None, keep_stopwords=False):
    """Build a Lucene index by running ``pyserini.index.lucene`` in a subprocess.

    The indexer is launched with ``sys.executable`` (the interpreter running this
    notebook) rather than whatever ``python`` resolves to on PATH, so it runs in
    the same environment the notebook imported pyserini from.

    Args:
        collection_path: Directory passed to the indexer as ``--input``; it is
            read as a ``TrecCollection``.
        index_path: ``pathlib.Path`` of the output index directory. If it already
            exists and contains at least one entry, nothing is built.
        stemmer: Value passed to ``--stemmer`` (this notebook uses ``'none'`` and
            ``'krovetz'``).
        stopwords_path: Optional stopword file passed as ``--stopwords``. Ignored
            when ``keep_stopwords`` is true.
        keep_stopwords: When true, pass ``--keepStopwords``; takes precedence
            over ``stopwords_path``.

    Returns:
        None. Progress and any indexer failure (its stderr) are printed.
    """
    # A non-empty index directory is treated as an already-built index.
    if index_path.exists() and any(index_path.iterdir()):
        print(f"Index already exists: {index_path}")
        return

    print(f"Building index at {index_path}...")
    cmd = [
        sys.executable, "-m", "pyserini.index.lucene",
        "--collection", "TrecCollection",
        "--input", str(collection_path),
        "--index", str(index_path),
        "--stemmer", stemmer,
        "--storePositions",
        "--storeDocvectors",
        "--storeRaw",
        "--optimize"
    ]

    # The two stopword options are mutually exclusive; keeping stopwords wins.
    if keep_stopwords:
        cmd.append("--keepStopwords")
    elif stopwords_path:
        cmd.extend(["--stopwords", str(stopwords_path)])

    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode == 0:
        print("Index built successfully")
    else:
        print(f"Build failed: {result.stderr}")

# Part A index: no stemming, stopwords kept
build_index(
    collection_path=PART_A_DIR,
    index_path=PART_A_INDEX,
    stemmer='none',
    keep_stopwords=True,
)

Building index at /home/galnoy/git-projects/MSC-Text-Retrieval-and-Search-Engines/HW2/indexes/part_a_index...
Index built successfully
Index built successfully


### Check index statistics

In [139]:
# Open the Part A index and fetch its collection-level statistics
reader = IndexReader(str(PART_A_INDEX))
stats = reader.stats()

# Header followed by one aligned, thousands-separated row per statistic
report_rows = ["\nPart A Index Statistics:"]
report_rows.extend(f"{name:25s}: {count:,}" for name, count in stats.items())
print("\n".join(report_rows))


Part A Index Statistics:
total_terms              : 212
documents                : 4
non_empty_documents      : 4
unique_terms             : 140


### Configure searcher

In [140]:
# Open a searcher over the Part A index; the query cells below reuse `searcher`.
searcher = LuceneSearcher(str(PART_A_INDEX))
# Query-side analysis mirrors how this notebook built the Part A index
# (stemmer 'none', stopwords kept). NOTE(review): stopwords=False presumably
# disables stopword removal on the query side - confirm against pyserini docs.
analyzer = get_lucene_analyzer(stemmer='none', stopwords=False)
searcher.set_analyzer(analyzer)
# Rank with BM25 using k1=0.9 and b=0.4.
searcher.set_bm25(k1=0.9, b=0.4)

print("Searcher configured for Part A queries")

Searcher configured for Part A queries


### Question 1: Query "corporation"

**1a. How many documents did you retrieve?**

In [141]:
# Ask for up to 4 hits, which is the size of the Part A collection
query = 'corporation'
hits = searcher.search(query, k=4)

print(f"Query: '{query}'", f"Number of documents retrieved: {len(hits)}\n", sep="\n")
for rank, hit in enumerate(hits, 1):
    print(f"{rank}. {hit.docid} - Score: {hit.score:.4f}")

Query: 'corporation'
Number of documents retrieved: 1

1. D2 - Score: 0.6747


**1b. How many documents did you expect to retrieve?**

In [142]:
# Load the raw Part A collection text
docs_path = PART_A_DIR / "docs.txt"
content = docs_path.read_text()

# Split the collection into its <DOC> ... </DOC> records
docs = re.findall(r'<DOC>.*?</DOC>', content, re.DOTALL)

print("Documents containing 'corporation' (or variants):\n")
expected_count = 0
for doc in docs:
    docno = re.search(r'<DOCNO>(.*?)</DOCNO>', doc).group(1)
    text = re.search(r'<TEXT>(.*?)</TEXT>', doc, re.DOTALL).group(1)

    # Substring match, so inflected forms such as "corporations" count too
    if 'corporation' not in text.lower():
        continue
    expected_count += 1

    # Show the first '.'-delimited fragment that mentions the term; one always
    # exists here because the term itself contains no '.'
    first_mention = next(part for part in text.split('.') if 'corporation' in part.lower())
    print(f"{docno}: {first_mention.strip()}")

print(f"\nExpected: {expected_count} documents")
print(f"Retrieved: {len(hits)} documents")

missing = expected_count - len(hits)
if missing > 0:
    print(f"\nMissing {missing} document(s)!")
    print("Reason: D3 contains 'corporations' (plural), but index has no stemming.")
    print("Without stemming, 'corporation' ≠ 'corporations'")

Documents containing 'corporation' (or variants):

D2: IBM corporation started with the IBM 1410, a member of the  IBM 1400 series, was a variable wordlength decimal computer that was announced by IBM on September 12 1960 and marketed as a midrange "Business Computer"
D3: At the same time, Burroughs was very much a competitor in which these two corporations tried to supply a complete answer for its customers

Expected: 2 documents
Retrieved: 1 documents

Missing 1 document(s)!
Reason: D3 contains 'corporations' (plural), but index has no stemming.
Without stemming, 'corporation' ≠ 'corporations'


### Question 2: Query to return D1 first (max 2 words)

In [143]:
# Two-word query built from the terms that distinguish D1 (the Nobel Prize document)
query = 'Nobel Prize'
hits = searcher.search(query, k=10)

print(f"Query: '{query}'\n")
for rank, hit in enumerate(hits, 1):
    print(f"{rank}. {hit.docid} - Score: {hit.score:.4f}")

for explanation_line in (
    "\nExplanation:",
    "'Nobel Prize' appears 4 times in D1 and nowhere else in the collection.",
    "BM25 rewards terms with high frequency in a document and low frequency across the collection.",
    "Therefore, D1 gets the highest score and ranks first.",
):
    print(explanation_line)

Query: 'Nobel Prize'

1. D1 - Score: 1.9240

Explanation:
'Nobel Prize' appears 4 times in D1 and nowhere else in the collection.
BM25 rewards terms with high frequency in a document and low frequency across the collection.
Therefore, D1 gets the highest score and ranks first.


### Question 3: Is D4 relevant to "Michael Jackson"?

**3a. Run query and analyze relevance:**

In [144]:
query = 'Michael Jackson'
hits = searcher.search(query, k=10)

print(f"Query: '{query}'\n")
for rank, hit in enumerate(hits, 1):
    print(f"{rank}. {hit.docid} - Score: {hit.score:.4f}")

# Print D4's raw text so the ranking can be judged against what the document says
banner = "=" * 70
print("\n" + banner)
print("D4 Content:")
print(banner)
d4_match = re.search(r'<DOCNO>D4</DOCNO>.*?<TEXT>(.*?)</TEXT>', content, re.DOTALL)
if d4_match is not None:
    print(d4_match.group(1).strip())

print("\n" + banner)
print("Answer: NO, D4 is NOT relevant to 'Michael Jackson'")
print(banner)
for reason_line in (
    "Reason: D4 is about Lady Gaga. Michael Jackson is only mentioned once",
    "as someone she listened to as a child - a brief reference, not the main topic.",
):
    print(reason_line)

Query: 'Michael Jackson'

1. D4 - Score: 1.1867

D4 Content:
Biography Stefania Gabriella Germanotta (AKA Lady GaGa) was born in March 1986, in New York, to an Italian family. She attended Convent of the Sacred Heart school and, as a little girl, she remembers singing into a plastic tape recorder to the likes of   Cyndi Lauper and Michael Jackson. 
By age 4, Lady Gaga had taught herself to play the piano by ear, and when she was a teenager she penned her first song.

Answer: NO, D4 is NOT relevant to 'Michael Jackson'
Reason: D4 is about Lady Gaga. Michael Jackson is only mentioned once
as someone she listened to as a child - a brief reference, not the main topic.


**3b. Query for which D4 IS relevant (max 2 words):**

In [145]:
# Two-word query naming D4's actual subject
query = 'Lady Gaga'
hits = searcher.search(query, k=10)

print(f"Query: '{query}'\n")
for rank, hit in enumerate(hits, 1):
    print(f"{rank}. {hit.docid} - Score: {hit.score:.4f}")

for explanation_line in (
    "\nExplanation:",
    "'Lady Gaga' is the main topic of D4, appearing 3 times throughout the document.",
    "This query correctly identifies D4 as highly relevant with a much higher score.",
):
    print(explanation_line)

Query: 'Lady Gaga'

1. D4 - Score: 1.5899

Explanation:
'Lady Gaga' is the main topic of D4, appearing 3 times throughout the document.
This query correctly identifies D4 as highly relevant with a much higher score.


## Part B: Build 2 indexes and compare retrieval effectiveness
1. **Index 1**: WITH stopwords removal AND WITH Krovetz stemming
2. **Index 2**: WITH stopwords removal AND WITHOUT stemming
3. Compare MAP, P@5, P@10 for both indexes

### Index 1: WITH Krovetz stemming + WITH stopwords removal

In [146]:
# Index 1: Krovetz stemming, stopwords taken from the Part B stopword file
build_index(
    collection_path=AP_COLL_PATH,
    index_path=INDEX_STEMMED,
    stemmer='krovetz',
    stopwords_path=STOPWORDS_PATH,
)

Building index at /home/galnoy/git-projects/MSC-Text-Retrieval-and-Search-Engines/HW2/indexes/ap_stemmed...
Index built successfully
Index built successfully


### Index 2: WITHOUT stemming + WITH stopwords removal

In [147]:
# Index 2: no stemming, stopwords taken from the Part B stopword file
build_index(
    collection_path=AP_COLL_PATH,
    index_path=INDEX_UNSTEMMED,
    stemmer='none',
    stopwords_path=STOPWORDS_PATH,
)

Building index at /home/galnoy/git-projects/MSC-Text-Retrieval-and-Search-Engines/HW2/indexes/ap_unstemmed...
Index built successfully
Index built successfully


### Load Queries and Perform Retrieval

In [148]:
# Read the tab-separated topic file through Anserini's integer-id topic reader
topics = get_topics_with_reader('io.anserini.search.topicreader.TsvIntTopicReader', str(QUERIES_PATH))

# Map query id -> title; two-digit ids get a leading zero (e.g. 88 -> '088'),
# every other id is used as-is
queries = {
    (f"0{topic_id}" if len(str(topic_id)) == 2 else str(topic_id)): topic['title']
    for topic_id, topic in topics.items()
}

assert len(queries) == 150, "Expected 150 queries"

print(f"Loaded {len(queries)} queries")
sample_queries = list(queries.items())[:3]
print(f"Sample: {sample_queries}")

Loaded 150 queries
Sample: [('088', 'crude  oil  price  trends'), ('089', 'downstream  investments  opec  member  states'), ('190', 'instances  fraud  involving  computer')]


### Configure search and save

In [149]:
def search_and_save(index_path, queries, output_path, stemmer='none', k=1000):
    """Run every query against an index with BM25 and write a TREC run file.

    Args:
        index_path: Location of the Lucene index to search.
        queries: Mapping of query id -> query text; ids are processed in sorted order.
        output_path: File that receives one line per hit, formatted as
            ``<qid> Q0 <docid> <rank> <score> pyserini``.
        stemmer: Stemmer name handed to the query analyzer.
        k: Maximum number of hits requested per query.
    """
    searcher = LuceneSearcher(str(index_path))
    searcher.set_analyzer(get_lucene_analyzer(stemmer=stemmer, stopwords=False))
    searcher.set_bm25(k1=0.9, b=0.4)

    with open(output_path, 'w') as run_file:
        for qid in sorted(queries):
            ranked_hits = searcher.search(queries[qid], k=k)
            # Ranks are 1-based; scores are written with four decimals
            run_file.writelines(
                f"{qid} Q0 {hit.docid} {position} {hit.score:.4f} pyserini\n"
                for position, hit in enumerate(ranked_hits, start=1)
            )

    print(f"Results saved to {output_path}")

### Search Index 1 (stemmed)

In [150]:
# Queries are analyzed with the same stemmer the stemmed index was built with
search_and_save(
    index_path=INDEX_STEMMED,
    queries=queries,
    output_path=RESULTS_STEMMED,
    stemmer='krovetz',
)

Results saved to /home/galnoy/git-projects/MSC-Text-Retrieval-and-Search-Engines/HW2/results/stemmed.trec


### Search Index 2 (unstemmed)

In [151]:
# Queries are left unstemmed to match the unstemmed index
search_and_save(
    index_path=INDEX_UNSTEMMED,
    queries=queries,
    output_path=RESULTS_UNSTEMMED,
    stemmer='none',
)

Results saved to /home/galnoy/git-projects/MSC-Text-Retrieval-and-Search-Engines/HW2/results/unstemmed.trec


## Evaluate effectiveness of the 2 retrieved lists

### Run trec_eval using pyserini

In [152]:
def run_trec_eval(qrels_path, results_path, output_path):
    """Run ``pyserini.eval.trec_eval -q`` in a subprocess and save its output.

    The evaluator is launched with ``sys.executable`` (the interpreter running
    this notebook) rather than whatever ``python`` resolves to on PATH, so it
    runs in the same environment the notebook imported pyserini from.

    Args:
        qrels_path: Relevance-judgment file passed to trec_eval.
        results_path: TREC-format run file to evaluate.
        output_path: File that receives trec_eval's full stdout on success.

    Returns:
        trec_eval's stdout as a string on success; ``None`` when the process
        exits with a non-zero code (its stderr is printed and nothing is written).
    """
    cmd = [
        sys.executable, "-m", "pyserini.eval.trec_eval",
        "-q",
        str(qrels_path),
        str(results_path)
    ]

    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode != 0:
        print(f"Error running trec_eval: {result.stderr}")
        return None

    # Keep the complete report on disk; callers parse the returned text.
    with open(output_path, 'w') as f:
        f.write(result.stdout)
    print(f"Evaluation saved to {output_path}")
    return result.stdout

In [153]:
# Evaluate the run produced from the stemmed index
print("Evaluating stemmed index...")
eval_output_stemmed = run_trec_eval(
    qrels_path=QRELS_PATH,
    results_path=RESULTS_STEMMED,
    output_path=EVAL_STEMMED,
)

# Evaluate the run produced from the unstemmed index
print("\nEvaluating unstemmed index...")
eval_output_unstemmed = run_trec_eval(
    qrels_path=QRELS_PATH,
    results_path=RESULTS_UNSTEMMED,
    output_path=EVAL_UNSTEMMED,
)

Evaluating stemmed index...
Evaluation saved to /home/galnoy/git-projects/MSC-Text-Retrieval-and-Search-Engines/HW2/results/stemmed_eval.txt

Evaluating unstemmed index...
Evaluation saved to /home/galnoy/git-projects/MSC-Text-Retrieval-and-Search-Engines/HW2/results/stemmed_eval.txt

Evaluating unstemmed index...
Evaluation saved to /home/galnoy/git-projects/MSC-Text-Retrieval-and-Search-Engines/HW2/results/unstemmed_eval.txt
Evaluation saved to /home/galnoy/git-projects/MSC-Text-Retrieval-and-Search-Engines/HW2/results/unstemmed_eval.txt


### Parse and display comparison table

In [154]:
def parse_trec_eval(output):
    """Extract the collection-wide MAP, P@5 and P@10 from trec_eval output.

    Each metric line is read as whitespace-separated fields
    ``<measure> <query id or 'all'> <value>``. Splitting on any whitespace means
    both space-padded and tab-only-separated output are handled, and the
    measure / scope fields are compared exactly instead of by prefix or substring.

    Args:
        output: Full trec_eval stdout as a single string.

    Returns:
        Dict with whichever of the keys ``'MAP'``, ``'P@5'``, ``'P@10'`` were
        found on ``all`` lines, mapped to floats. Empty if none were found.
    """
    # trec_eval measure name -> key used in the notebook's tables
    wanted = {'map': 'MAP', 'P_5': 'P@5', 'P_10': 'P@10'}

    metrics = {}
    for line in output.splitlines():
        fields = line.split()
        if len(fields) < 3:
            continue
        measure, scope = fields[0], fields[1]
        # Only the aggregate row counts; per-query rows carry a query id here.
        if scope == 'all' and measure in wanted:
            metrics[wanted[measure]] = float(fields[-1])
    return metrics

# Always bind both metric dicts: an empty dict means "no metrics", so code that
# tests these names afterwards gets a falsy value instead of a NameError when
# trec_eval failed.
metrics_stemmed = parse_trec_eval(eval_output_stemmed) if eval_output_stemmed else {}
metrics_unstemmed = parse_trec_eval(eval_output_unstemmed) if eval_output_unstemmed else {}

# The table needs all three metrics from both runs; checking up front avoids a
# KeyError halfway through printing.
REPORTED_METRICS = ('MAP', 'P@5', 'P@10')
have_all_metrics = all(
    name in metrics_stemmed and name in metrics_unstemmed
    for name in REPORTED_METRICS
)

if have_all_metrics:
    print("\n" + "="*70)
    print("EVALUATION RESULTS")
    print("="*70)
    print(f"{'Configuration':<35} {'MAP':>10} {'P@5':>10} {'P@10':>10}")
    print("-"*70)
    print(f"{'WITH Krovetz + WITH Stopwords':<35} {metrics_stemmed['MAP']:>10.4f} {metrics_stemmed['P@5']:>10.4f} {metrics_stemmed['P@10']:>10.4f}")
    print(f"{'WITHOUT Stemming + WITH Stopwords':<35} {metrics_unstemmed['MAP']:>10.4f} {metrics_unstemmed['P@5']:>10.4f} {metrics_unstemmed['P@10']:>10.4f}")
    print("="*70)
else:
    print("Error: Could not run trec_eval. Check that pyserini is installed correctly.")


EVALUATION RESULTS
Configuration                              MAP        P@5       P@10
----------------------------------------------------------------------
WITH Krovetz + WITH Stopwords           0.2144     0.4121     0.3913
WITHOUT Stemming + WITH Stopwords       0.1896     0.4094     0.3758


### Determine best configuration

In [155]:
if metrics_stemmed and metrics_unstemmed:
    map_stemmed = metrics_stemmed['MAP']
    map_unstemmed = metrics_unstemmed['MAP']

    # Stemming wins only on a strictly higher MAP; a tie is reported as
    # "WITHOUT stemming" with an improvement of 0.0000.
    stemming_wins = map_stemmed > map_unstemmed
    winner = "WITH Krovetz stemming" if stemming_wins else "WITHOUT stemming"
    diff = abs(map_stemmed - map_unstemmed)

    print(f"\nBest configuration by MAP: {winner}")
    print(f"MAP improvement: {diff:.4f}")
    print("\nExplanation:")
    # The explanation follows the measured outcome, so it can never contradict
    # the winner printed above.
    if stemming_wins:
        print("Krovetz stemming reduces words to their root forms, improving recall by matching")
        print("different morphological variants (e.g., 'running', 'runs', 'ran' → 'run').")
        print("This typically improves retrieval effectiveness when queries and documents use")
        print("different word forms for the same concepts.")
    else:
        print("Krovetz stemming did not improve MAP on this collection. Conflating word forms")
        print("can also match documents that use an unrelated sense of the same stem, which")
        print("costs precision and can offset the recall gained from matching variants.")
else:
    print("Metrics not available.")


Best configuration by MAP: WITH Krovetz stemming
MAP improvement: 0.0248

Explanation:
Krovetz stemming reduces words to their root forms, improving recall by matching
different morphological variants (e.g., 'running', 'runs', 'ran' → 'run').
This typically improves retrieval effectiveness when queries and documents use
different word forms for the same concepts.
