# HW2 - WET PART

## Part A: Index docs.txt and test queries
Build the index with `--keepStopwords --stemmer none` (stopwords kept, no stemming) and test it with specific queries.

## Part B: Build 2 indexes and compare retrieval effectiveness
1. **Index 1**: WITH stopwords removal AND WITH Krovetz stemming
2. **Index 2**: WITH stopwords removal AND WITHOUT stemming
3. Compare MAP, P@5, P@10 for both indexes

### Imports and Configuration

In [22]:
import os
import subprocess
import sys
from pathlib import Path

import pandas as pd
import pytrec_eval
from pyserini.analysis import Analyzer, get_lucene_analyzer
from pyserini.index.lucene import IndexReader
from pyserini.search import get_topics_with_reader
from pyserini.search.lucene import LuceneSearcher

# Filesystem layout: every path below is resolved against the current working directory.
BASE_DIR = Path.cwd()
_DATA_ROOT = BASE_DIR / "data"
_INDEX_ROOT = BASE_DIR / "indexes"

# Part A: input directory and the index built from it
PART_A_DIR = _DATA_ROOT / "WET_PART_A"
PART_A_INDEX = _INDEX_ROOT / "part_a_index"

# Part B inputs: collection directory plus the query, stopword and qrels files
AP_COLL_PATH = _DATA_ROOT / "AP_Coll"
PART_B_DIR = _DATA_ROOT / "WET_PART_B"
QUERIES_PATH = PART_B_DIR / "queries.txt"
STOPWORDS_PATH = PART_B_DIR / "StopWords.txt"
QRELS_PATH = PART_B_DIR / "qrels_AP"

# Part B index locations (one with stemming, one without)
INDEX_STEMMED = _INDEX_ROOT / "ap_index_stemmed"
INDEX_UNSTEMMED = _INDEX_ROOT / "ap_index_unstemmed"

# Output files: one run file per index and the metric comparison table
RESULTS_DIR = BASE_DIR / "results"
RESULTS_STEMMED = RESULTS_DIR / "results_stemmed.txt"
RESULTS_UNSTEMMED = RESULTS_DIR / "results_unstemmed.txt"
EVAL_PATH = RESULTS_DIR / "evaluation_comparison.txt"

# Make sure every output directory exists (missing parents are created as well)
for _directory in (PART_A_INDEX, INDEX_STEMMED, INDEX_UNSTEMMED, RESULTS_DIR):
    _directory.mkdir(parents=True, exist_ok=True)

print("Setup complete")

Collection: /home/galnoy/git-projects/MSC-Text-Retrieval-and-Search-Engines/HW2/data/AP_Coll
Index: /home/galnoy/git-projects/MSC-Text-Retrieval-and-Search-Engines/HW2/indexes/ap_index
Results: /home/galnoy/git-projects/MSC-Text-Retrieval-and-Search-Engines/HW2/results


### Build Part A Index

In [23]:
# Build the Part A index with --keepStopwords --stemmer none.
# The build is skipped when the index directory already contains at least one entry.
if PART_A_INDEX.exists() and any(PART_A_INDEX.iterdir()):
    print(f"Part A index exists: {PART_A_INDEX}")
else:
    print("Building Part A index...")
    cmd = [
        # Run the indexer with the interpreter that executes this notebook; a bare
        # "python" is looked up on PATH and may be a different interpreter that
        # does not have pyserini installed.
        sys.executable, "-m", "pyserini.index.lucene",
        "--collection", "TrecCollection",
        "--input", str(PART_A_DIR),
        "--index", str(PART_A_INDEX),
        "--keepStopwords",
        "--stemmer", "none",
        "--storePositions",
        "--storeDocvectors",
        "--storeRaw",
        "--optimize"
    ]
    # Output is captured so that stderr can be shown when the build fails.
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode == 0:
        print(f"Part A index built: {PART_A_INDEX}")
    else:
        print("Failed!")
        print(result.stderr)

Building index...
Index built: /home/galnoy/git-projects/MSC-Text-Retrieval-and-Search-Engines/HW2/indexes/ap_index
Index built: /home/galnoy/git-projects/MSC-Text-Retrieval-and-Search-Engines/HW2/indexes/ap_index


### Check Part A Index Statistics

In [24]:
# Open the Part A index and print each of its statistics on its own line.
part_a_reader = IndexReader(str(PART_A_INDEX))
part_a_stats = part_a_reader.stats()

print("Part A Index Statistics:")
for stat_name in part_a_stats:
    stat_value = part_a_stats[stat_name]
    # Name left-aligned in 25 columns, value with thousands separators
    print(f"{stat_name:25s}: {stat_value:,}")

Index Statistics:
total_terms              : 110,570,409
documents                : 242,892
non_empty_documents      : 242,892
unique_terms             : 397,835


### Test Part A: Query 'corporation'

In [None]:
# Initialize the Part A searcher with an analyzer meant to match the index
# settings (no stemmer, stopwords kept) and BM25 parameters k1=0.9, b=0.4.
searcher_a = LuceneSearcher(str(PART_A_INDEX))
analyzer_a = get_lucene_analyzer(stemmer='none', stopwords=False)
searcher_a.set_analyzer(analyzer_a)
searcher_a.set_bm25(k1=0.9, b=0.4)

# Query: corporation (at most 4 hits are requested)
query = 'corporation'
hits = searcher_a.search(query, k=4)

print(f"Query: '{query}'")
print(f"Retrieved {len(hits)} documents:")
# Iterate over the hits directly instead of indexing through range(len(hits)).
for hit in hits:
    print(f"Doc {hit.docid}, Score: {hit.score:.4f}")

## Part B: Build 2 Indexes and Compare

1. **Index 1**: WITH stopwords removal AND WITH Krovetz stemming
2. **Index 2**: WITH stopwords removal AND WITHOUT stemming

Then retrieve and evaluate both to compare MAP, P@5, P@10.

### Build Index 1: WITH Stopwords Removal + WITH Krovetz Stemming

In [6]:
# Build Index 1: Krovetz stemmer plus the stopword list stored at STOPWORDS_PATH.
# The build is skipped when the index directory already contains at least one entry.
if INDEX_STEMMED.exists() and any(INDEX_STEMMED.iterdir()):
    print(f"Stemmed index exists: {INDEX_STEMMED}")
else:
    print("Building Index 1 (WITH stopwords removal + WITH Krovetz stemming)...")
    cmd = [
        # Run the indexer with the interpreter that executes this notebook; a bare
        # "python" is looked up on PATH and may be a different interpreter that
        # does not have pyserini installed.
        sys.executable, "-m", "pyserini.index.lucene",
        "--collection", "TrecCollection",
        "--input", str(AP_COLL_PATH),
        "--index", str(INDEX_STEMMED),
        "--stemmer", "krovetz",
        "--stopwords", str(STOPWORDS_PATH),
        "--storePositions",
        "--storeDocvectors",
        "--storeRaw",
        "--optimize"
    ]
    # Output is captured so that stderr can be shown when the build fails.
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode == 0:
        print(f"Index 1 built: {INDEX_STEMMED}")
    else:
        print("Failed!")
        print(result.stderr)

✓ Loaded 418 stopwords
Sample stopwords: ['whole', 'to', 'something', 'per', 'again', 'hardly', 'hereafter', 'already', 'within', 'whereinto']


### Build Index 2: WITH Stopwords Removal + WITHOUT Stemming

In [None]:
# Build Index 2: no stemmer, same stopword list (STOPWORDS_PATH) as Index 1.
# The build is skipped when the index directory already contains at least one entry.
if INDEX_UNSTEMMED.exists() and any(INDEX_UNSTEMMED.iterdir()):
    print(f"Unstemmed index exists: {INDEX_UNSTEMMED}")
else:
    print("Building Index 2 (WITH stopwords removal + WITHOUT stemming)...")
    cmd = [
        # Run the indexer with the interpreter that executes this notebook; a bare
        # "python" is looked up on PATH and may be a different interpreter that
        # does not have pyserini installed.
        sys.executable, "-m", "pyserini.index.lucene",
        "--collection", "TrecCollection",
        "--input", str(AP_COLL_PATH),
        "--index", str(INDEX_UNSTEMMED),
        "--stemmer", "none",
        "--stopwords", str(STOPWORDS_PATH),
        "--storePositions",
        "--storeDocvectors",
        "--storeRaw",
        "--optimize"
    ]
    # Output is captured so that stderr can be shown when the build fails.
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode == 0:
        print(f"Index 2 built: {INDEX_UNSTEMMED}")
    else:
        print("Failed!")
        print(result.stderr)

In [7]:
# Load queries using pyserini
topics = get_topics_with_reader('io.anserini.search.topicreader.TsvIntTopicReader', str(QUERIES_PATH))

# Normalise query IDs to zero-padded strings of at least 3 characters
# (e.g. 51 -> '051'). zfill pads an ID of any shorter length, not only
# two-character ones, and leaves longer IDs unchanged.
# NOTE(review): presumably this matches the ID format used in the qrels file - confirm.
queries = {str(topic_id).zfill(3): topic['title'] for topic_id, topic in topics.items()}

print(f"Loaded {len(queries)} queries")
# Sanity check: the query file is expected to hold exactly 150 queries.
assert len(queries) == 150, 'missing queries'
print(f"Sample: {list(queries.items())[:3]}")

✓ Loaded 150 queries

Sample queries:
  051: airbus  subsidies
  052: south  african  sanctions
  053: leveraged  buyouts
  054: satellite  launch  contracts
  055: insider  trading


### Retrieval from Index 1 (Stemmed)

In [8]:
# Searcher over the stemmed index: Krovetz analyzer, BM25 with k1=0.9 and b=0.4
searcher_stemmed = LuceneSearcher(str(INDEX_STEMMED))
analyzer_stemmed = get_lucene_analyzer(stemmer='krovetz', stopwords=False)
searcher_stemmed.set_analyzer(analyzer_stemmed)
searcher_stemmed.set_bm25(k1=0.9, b=0.4)

# For every query keep up to 1000 hits as (docid, 1-based rank, score) tuples
results_stemmed = {}
for query_id, query_text in queries.items():
    ranked_hits = searcher_stemmed.search(query_text, k=1000)
    results_stemmed[query_id] = [
        (hit.docid, position, hit.score)
        for position, hit in enumerate(ranked_hits, start=1)
    ]

print(f"Retrieved from Index 1 (stemmed): {len(results_stemmed)} queries")

Query 051 Processing Example:
Original: airbus  subsidies
Tokens after stemming: ['airbus', 'subsidy']
After stopword removal: ['airbus', 'subsidy']
Final query: airbus subsidy


### Save Results from Index 1

In [9]:
# Write the Index 1 run with query IDs in sorted order, one line per hit:
# "<query id> Q0 <docid> <rank> <score> pyserini". Ranks are renumbered from 1.
sorted_results_stemmed = dict(sorted(results_stemmed.items()))
with open(RESULTS_STEMMED, 'w') as run_file:
    run_file.writelines(
        f"{query_id} Q0 {docid} {position} {score:.4f} pyserini\n"
        for query_id, ranked in sorted_results_stemmed.items()
        for position, (docid, _stored_rank, score) in enumerate(ranked, start=1)
    )

print(f"Saved Index 1 results to: {RESULTS_STEMMED}")

Initializing BM25 searcher...
✓ Searcher initialized with BM25
Total documents in index: 242,892


### Retrieval from Index 2 (Unstemmed)

In [None]:
# Searcher over the unstemmed index: non-stemming analyzer, BM25 with k1=0.9 and b=0.4
searcher_unstemmed = LuceneSearcher(str(INDEX_UNSTEMMED))
analyzer_unstemmed = get_lucene_analyzer(stemmer='none', stopwords=False)
searcher_unstemmed.set_analyzer(analyzer_unstemmed)
searcher_unstemmed.set_bm25(k1=0.9, b=0.4)

# For every query keep up to 1000 hits as (docid, 1-based rank, score) tuples
results_unstemmed = {}
for query_id, query_text in queries.items():
    ranked_hits = searcher_unstemmed.search(query_text, k=1000)
    results_unstemmed[query_id] = [
        (hit.docid, position, hit.score)
        for position, hit in enumerate(ranked_hits, start=1)
    ]

print(f"Retrieved from Index 2 (unstemmed): {len(results_unstemmed)} queries")

### Save Results from Index 2

In [None]:
# Write the Index 2 run with query IDs in sorted order, one line per hit:
# "<query id> Q0 <docid> <rank> <score> pyserini". Ranks are renumbered from 1.
sorted_results_unstemmed = dict(sorted(results_unstemmed.items()))
with open(RESULTS_UNSTEMMED, 'w') as run_file:
    run_file.writelines(
        f"{query_id} Q0 {docid} {position} {score:.4f} pyserini\n"
        for query_id, ranked in sorted_results_unstemmed.items()
        for position, (docid, _stored_rank, score) in enumerate(ranked, start=1)
    )

print(f"Saved Index 2 results to: {RESULTS_UNSTEMMED}")

### Evaluation - Compare Both Indexes

In [12]:
# Load qrels
def load_qrels(filepath):
    """Read relevance judgments from a whitespace-separated qrels file.

    Every line with at least four fields is read as ``qid <ignored> docid rel``
    (extra fields are ignored); lines with fewer fields are skipped.

    Args:
        filepath: Path of the qrels file to read.

    Returns:
        Dict mapping qid -> {docid: rel}, with ``rel`` converted to ``int``.
        When a (qid, docid) pair occurs more than once, the last line wins.

    Raises:
        ValueError: If the fourth field of a line is not an integer.
    """
    judgments = {}
    with open(filepath, 'r') as qrels_file:
        for raw_line in qrels_file:
            fields = raw_line.strip().split()
            if len(fields) < 4:
                continue
            query_id, docid, relevance = fields[0], fields[2], int(fields[3])
            judgments.setdefault(query_id, {})[docid] = relevance
    return judgments

# Parse the judgments file and report how many queries and judgments it contains
qrels = load_qrels(QRELS_PATH)
judgment_count = sum(map(len, qrels.values()))
print(f"Loaded qrels: {len(qrels)} queries, {judgment_count:,} judgments")

✓ Loaded qrels for 149 queries
✓ Total relevance judgments: 45,386


In [13]:
# Convert results to pytrec_eval format
def convert_to_eval_format(results_dict):
    """Reshape retrieval results into a nested topic -> docid -> score mapping.

    Args:
        results_dict: Mapping of topic id -> iterable of (docid, rank, score)
            triples.

    Returns:
        Dict mapping topic id -> {docid: float(score)}. The rank is dropped;
        if a docid repeats within a topic, the last score is kept.
    """
    return {
        topic_id: {docid: float(score) for docid, _rank, score in ranked}
        for topic_id, ranked in results_dict.items()
    }

# Reshape both runs (stemmed first, then unstemmed) for the evaluator
eval_format_stemmed, eval_format_unstemmed = map(
    convert_to_eval_format, (results_stemmed, results_unstemmed)
)

print("Converted results to evaluation format")

✓ Loaded results for 147 queries


In [14]:
# Evaluate both indexes
def _mean_metric(per_query_scores, measure):
    """Return the mean of `measure` over all evaluated queries.

    Args:
        per_query_scores: Mapping of query id -> {measure name: value}, as
            returned by ``evaluator.evaluate``.
        measure: Name of the measure to average (e.g. 'map').

    Raises:
        ValueError: If no query was evaluated. Without this check the mean
            would fail with an uninformative ZeroDivisionError.
    """
    if not per_query_scores:
        raise ValueError(
            "No queries were evaluated; check that the run and the qrels use the same query IDs"
        )
    total = sum(scores[measure] for scores in per_query_scores.values())
    return total / len(per_query_scores)


evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'P_5', 'P_10'})

# Index 1 (stemmed): per-query scores, then the mean of each measure
eval_stemmed = evaluator.evaluate(eval_format_stemmed)
map_stemmed = _mean_metric(eval_stemmed, 'map')
p5_stemmed = _mean_metric(eval_stemmed, 'P_5')
p10_stemmed = _mean_metric(eval_stemmed, 'P_10')

# Index 2 (unstemmed): same measures, same averaging
eval_unstemmed = evaluator.evaluate(eval_format_unstemmed)
map_unstemmed = _mean_metric(eval_unstemmed, 'map')
p5_unstemmed = _mean_metric(eval_unstemmed, 'P_5')
p10_unstemmed = _mean_metric(eval_unstemmed, 'P_10')

# Display results table
print("\n" + "="*70)
print(" Stopword | Krovetz  |   MAP   |   P@5   |  P@10  ")
print(" Removal  | Stemmer  |         |         |        ")
print("-"*70)
print(f"   With   |   With   | {map_stemmed:.4f}  | {p5_stemmed:.4f}  | {p10_stemmed:.4f} ")
print(f"   With   | Without  | {map_unstemmed:.4f}  | {p5_unstemmed:.4f}  | {p10_unstemmed:.4f} ")
print("="*70)

# Save the same comparison table to EVAL_PATH
with open(EVAL_PATH, 'w') as f:
    f.write("Stopword Removal | Krovetz Stemmer | MAP     | P@5    | P@10\n")
    f.write("-"*65 + "\n")
    f.write(f"With             | With            | {map_stemmed:.4f}  | {p5_stemmed:.4f} | {p10_stemmed:.4f}\n")
    f.write(f"With             | Without         | {map_unstemmed:.4f}  | {p5_unstemmed:.4f} | {p10_unstemmed:.4f}\n")

print(f"\nSaved comparison to: {EVAL_PATH}")

Evaluating with pytrec_eval...

EVALUATION RESULTS
MAP (Mean Average Precision): 0.1162
P@5  (Precision at 5):        0.2164
P@10 (Precision at 10):       0.2068

✓ Evaluation results saved to: /home/galnoy/git-projects/MSC-Text-Retrieval-and-Search-Engines/HW2/results/evaluation_results.txt


### Which Index Performed Better?

In [15]:
# Report which index has the higher MAP. The explanation is chosen to match the
# outcome, so the text never credits stemming when the unstemmed index wins,
# and equal MAP values are reported as a tie rather than as a win for Index 2.
if map_stemmed > map_unstemmed:
    winner = "Index 1 (WITH Krovetz stemming)"
    difference = map_stemmed - map_unstemmed
    explanation = (
        "Krovetz stemming conflates word variants to their root form, improving recall",
        "by matching different morphological forms of the same concept (e.g., 'running',",
        "'runs', 'ran' all map to 'run'). This helps when queries use different forms",
        "than the documents, improving the matching effectiveness.",
    )
elif map_stemmed < map_unstemmed:
    winner = "Index 2 (WITHOUT stemming)"
    difference = map_unstemmed - map_stemmed
    explanation = (
        "On this collection Krovetz stemming lowered MAP. Conflating word variants can",
        "match documents that share a stem with a query term but use it in a different",
        "sense, which pushes non-relevant documents up the ranking.",
    )
else:
    winner = "Tie (both indexes have the same MAP)"
    difference = map_unstemmed - map_stemmed
    explanation = ("Stemming made no difference to MAP on this query set.",)

print(f"\nBest MAP: {winner}")
print(f"MAP difference: {difference:.4f}")
print("\nExplanation:")
for explanation_line in explanation:
    print(explanation_line)


Top 10 Queries by MAP:
Query ID                                Query Text      MAP  P@5  P@10
     052                 south  african  sanctions 0.846048  1.0   1.0
     058                             rail  strikes 0.690898  1.0   1.0
     170 consequences  implantation  silicone  gel 0.687500  0.4   0.2
     132                         stealth  aircraft 0.676020  1.0   1.0
     057                                       mci 0.643268  0.6   0.7
     161                                acid  rain 0.531303  0.2   0.4
     061       israeli  role  iran  contra  affair 0.529885  1.0   1.0
     056  prime  lending  rate  moves  predictions 0.514333  1.0   1.0
     163          vietnam  veterans  agent  orange 0.501623  0.2   0.4
     099                      iran  contra  affair 0.474251  0.8   0.8


Bottom 10 Queries by MAP:
Query ID                                    Query Text  MAP  P@5  P@10
     063                          machine  translation  0.0  0.0   0.0
     066                 