# HW2 - WET PART

## Part B: Build 2 indexes and compare retrieval effectiveness
1. **Index 1**: WITH stopwords removal AND WITH Krovetz stemming
2. **Index 2**: WITH stopwords removal AND WITHOUT stemming
3. Compare MAP, P@5, P@10 for both indexes

### Imports and Configuration

In [2]:
import subprocess
from pathlib import Path
import pytrec_eval
import warnings
import logging
from pyserini.search.lucene import LuceneSearcher
from pyserini.analysis import get_lucene_analyzer
from pyserini.index.lucene import IndexReader
from pyserini.search import get_topics_with_reader

# Silence Python warnings and limit the root logger to ERROR and above
warnings.filterwarnings('ignore')
logging.root.setLevel(logging.ERROR)

# Every path below is resolved relative to the current working directory
BASE_DIR = Path.cwd()

# Part A: input collection directory and the index built from it
PART_A_DIR = BASE_DIR.joinpath("data", "WET_PART_A")
PART_A_INDEX = BASE_DIR.joinpath("indexes", "part_a_index")

# Part B: collection, queries, stopword list and relevance judgments
AP_COLL_PATH = BASE_DIR.joinpath("data", "AP_Coll")
PART_B_DIR = BASE_DIR.joinpath("data", "WET_PART_B")
QUERIES_PATH = PART_B_DIR.joinpath("queries.txt")
STOPWORDS_PATH = PART_B_DIR.joinpath("StopWords.txt")
QRELS_PATH = PART_B_DIR.joinpath("qrels_AP")

# Part B: one index per stemming configuration
INDEX_STEMMED = BASE_DIR.joinpath("indexes", "ap_stemmed")
INDEX_UNSTEMMED = BASE_DIR.joinpath("indexes", "ap_unstemmed")

# Part B: one TREC run file per index
RESULTS_DIR = BASE_DIR.joinpath("results")
RESULTS_STEMMED = RESULTS_DIR.joinpath("stemmed.txt")
RESULTS_UNSTEMMED = RESULTS_DIR.joinpath("unstemmed.txt")

# Make sure every output directory exists before any cell writes to it
for path in (PART_A_INDEX, INDEX_STEMMED, INDEX_UNSTEMMED, RESULTS_DIR):
    path.mkdir(parents=True, exist_ok=True)

print("Setup complete")

  from .autonotebook import tqdm as notebook_tqdm
[0;93m2025-11-20 23:03:12.370834361 [W:onnxruntime:Default, device_discovery.cc:164 DiscoverDevicesForPlatform] GPU device discovery failed: device_discovery.cc:89 ReadFileContents Failed to open file: "/sys/class/drm/card0/device/vendor"[m
[0;93m2025-11-20 23:03:12.370834361 [W:onnxruntime:Default, device_discovery.cc:164 DiscoverDevicesForPlatform] GPU device discovery failed: device_discovery.cc:89 ReadFileContents Failed to open file: "/sys/class/drm/card0/device/vendor"[m


Setup complete


## Part A: Index docs.txt and test queries
Build index with `--keepStopwords --stemmer none` and test with specific queries.

In [8]:
def build_index(collection_path, index_path, stemmer='none', stopwords_path=None, keep_stopwords=False):
    """Build a Lucene index by running the ``pyserini.index.lucene`` CLI.

    The build is skipped when ``index_path`` is an existing, non-empty
    directory. The outcome is reported with ``print``; a failed build is not
    raised as an exception.

    Args:
        collection_path: Collection location passed to ``--input``
            (``str`` or ``Path``). The collection is read as ``TrecCollection``.
        index_path: Directory the index is written to (``str`` or ``Path``).
        stemmer: Value passed to ``--stemmer``.
        stopwords_path: Optional stopword file passed via ``--stopwords``.
            Ignored when ``keep_stopwords`` is true.
        keep_stopwords: When true, pass ``--keepStopwords`` instead.

    Returns:
        None.
    """
    import sys  # local import: only needed to locate the running interpreter

    # Accept plain strings as well as Path objects.
    index_path = Path(index_path)

    # is_dir() (rather than exists()) keeps iterdir() from raising when the
    # path happens to be a regular file.
    if index_path.is_dir() and any(index_path.iterdir()):
        print(f"Index already exists: {index_path}")
        return

    print(f"Building index at {index_path}...")
    cmd = [
        # Use the interpreter running this code so the subprocess sees the
        # same environment; a bare "python" may resolve to a different one.
        sys.executable or "python", "-m", "pyserini.index.lucene",
        "--collection", "TrecCollection",
        "--input", str(collection_path),
        "--index", str(index_path),
        "--stemmer", stemmer,
        "--storePositions",
        "--storeDocvectors",
        "--storeRaw",
        "--optimize",
    ]

    # The two stopword options are mutually exclusive; keep_stopwords wins.
    if keep_stopwords:
        cmd.append("--keepStopwords")
    elif stopwords_path:
        cmd.extend(["--stopwords", str(stopwords_path)])

    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode == 0:
        print("Index built successfully")
    else:
        print(f"Build failed: {result.stderr}")

# Part A index: stopwords kept, stemming disabled
build_index(
    collection_path=PART_A_DIR,
    index_path=PART_A_INDEX,
    stemmer='none',
    keep_stopwords=True,
)

Building index at /home/galnoy/git-projects/MSC-Text-Retrieval-and-Search-Engines/HW2/indexes/part_a_index...
Index built successfully
Index built successfully


### Check index statistics

In [1]:
# Open the Part A index and fetch the statistics it reports
reader = IndexReader(str(PART_A_INDEX))
stats = reader.stats()

# One aligned "name: value" line per statistic, values with thousands separators
print("\nPart A Index Statistics:")
for stat_name in stats:
    print(f"{stat_name:25s}: {stats[stat_name]:,}")

NameError: name 'IndexReader' is not defined

---

## Part B: Build Two Indexes

In [None]:
# Index 1: Krovetz stemming, stopwords removed using the Part B stopword file
build_index(
    collection_path=AP_COLL_PATH,
    index_path=INDEX_STEMMED,
    stemmer='krovetz',
    stopwords_path=STOPWORDS_PATH,
)

Build Index 1 for Part B using the AP collection WITH Krovetz stemming and WITH stopwords removal.

In [None]:
# Index 2: no stemming, stopwords removed using the Part B stopword file
build_index(
    collection_path=AP_COLL_PATH,
    index_path=INDEX_UNSTEMMED,
    stemmer='none',
    stopwords_path=STOPWORDS_PATH,
)

Build Index 2 for Part B using the AP collection WITHOUT stemming but still WITH stopwords removal.

## Part B: Load Queries and Perform Retrieval

In [None]:
# Load the queries; each topic is a mapping whose 'title' entry is the query text
topics = get_topics_with_reader('io.anserini.search.topicreader.TsvIntTopicReader', str(QUERIES_PATH))

# Zero-pad every topic id to at least three characters (5 -> "005",
# 51 -> "051", 200 -> "200"). zfill handles ids of any length, whereas
# prepending a single "0" is only correct for two-digit ids.
queries = {}
for topic_id, topic in topics.items():
    qid = str(topic_id).zfill(3)
    queries[qid] = topic['title']

print(f"Loaded {len(queries)} queries")
print(f"Sample: {list(queries.items())[:3]}")

In [None]:
def search_and_save(index_path, queries, output_path, stemmer='none', k=1000, run_name="run"):
    """Run every query against a Lucene index with BM25 and write a TREC run file.

    Args:
        index_path: Location of the Lucene index to search.
        queries: Mapping of query id -> query text.
        output_path: Run file to (over)write (``str`` or ``Path``). Missing
            parent directories are created.
        stemmer: Stemmer name given to the query-time analyzer; the analyzer
            is created with ``stopwords=False``.
        k: Maximum number of hits requested per query.
        run_name: Tag written in the last column of every line.

    Returns:
        The number of queries in ``queries``.
    """
    searcher = LuceneSearcher(str(index_path))
    analyzer = get_lucene_analyzer(stemmer=stemmer, stopwords=False)
    searcher.set_analyzer(analyzer)
    searcher.set_bm25(k1=0.9, b=0.4)

    # Do not rely on the caller having created the results directory.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # Explicit encoding so the run file does not depend on the platform default.
    with open(output_path, 'w', encoding='utf-8') as f:
        for qid in sorted(queries):
            hits = searcher.search(queries[qid], k=k)
            # Six decimals: evaluation tools order documents by score, so
            # coarser rounding can create ties that change the evaluated order.
            f.writelines(
                f"{qid} Q0 {hit.docid} {rank} {hit.score:.6f} {run_name}\n"
                for rank, hit in enumerate(hits, start=1)
            )

    print(f"Results saved to {output_path}")
    return len(queries)

Helper function to search an index and save results in TREC format with BM25 parameters k1=0.9 and b=0.4.

Load the 150 queries from queries.txt (this describes the query-loading cell above) and left-pad the query IDs with zeros to three digits (e.g. `51` becomes `051`).

In [None]:
# Query Index 1 (stemmed) with a Krovetz query analyzer and write its run file
search_and_save(
    index_path=INDEX_STEMMED,
    queries=queries,
    output_path=RESULTS_STEMMED,
    stemmer='krovetz',
    run_name='stemmed',
)

Retrieve top 1000 documents for each query from Index 1 (stemmed) and save results in TREC format.

In [None]:
# Query Index 2 (unstemmed) without query stemming and write its run file
search_and_save(
    index_path=INDEX_UNSTEMMED,
    queries=queries,
    output_path=RESULTS_UNSTEMMED,
    stemmer='none',
    run_name='unstemmed',
)

Retrieve top 1000 documents for each query from Index 2 (unstemmed) and save results in TREC format.

## Part B: Evaluation and Comparison

In [None]:
def load_qrels(qrels_path):
    """Read a qrels file into the nested mapping expected by pytrec_eval.

    Every line is split on whitespace; lines with fewer than four fields are
    skipped. Field 0 is the query id, field 2 the document id and field 3 the
    integer relevance label.

    Returns:
        ``{query_id: {doc_id: relevance}}``.
    """
    judgments = {}
    with open(qrels_path, 'r') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split()
            if len(fields) < 4:
                continue
            query_id, doc_id = fields[0], fields[2]
            judgments.setdefault(query_id, {})[doc_id] = int(fields[3])
    return judgments

def load_results(results_path):
    """Read a TREC run file into the nested mapping expected by pytrec_eval.

    Every line is split on whitespace; lines with fewer than six fields are
    skipped. Field 0 is the query id, field 2 the document id and field 4 the
    score. Fields are picked by position, so a line with more than six tokens
    (for example a run tag containing a space) is still parsed.

    Returns:
        ``{query_id: {doc_id: score}}`` with scores as floats.

    Raises:
        ValueError: If a score field cannot be converted to ``float``.
    """
    results = {}
    with open(results_path, 'r') as f:
        for line in f:
            parts = line.split()
            if len(parts) < 6:
                continue
            qid, docid, score = parts[0], parts[2], parts[4]
            results.setdefault(qid, {})[docid] = float(score)
    return results

def evaluate_with_pytrec_eval(qrels, results):
    """Compute mean MAP, P@5 and P@10 over the queries pytrec_eval evaluates.

    Args:
        qrels: ``{query_id: {doc_id: relevance}}`` relevance judgments.
        results: ``{query_id: {doc_id: score}}`` retrieval scores.

    Returns:
        A dict with keys ``'MAP'``, ``'P@5'`` and ``'P@10'``. Each value is
        the arithmetic mean of the per-query scores, or ``0.0`` when no query
        was evaluated.
    """
    # pytrec_eval measure name -> key used in the returned summary
    measure_labels = {'map': 'MAP', 'P_5': 'P@5', 'P_10': 'P@10'}

    evaluator = pytrec_eval.RelevanceEvaluator(qrels, set(measure_labels))
    per_query = evaluator.evaluate(results)

    summary = {}
    for measure, label in measure_labels.items():
        values = [row[measure] for row in per_query.values()]
        summary[label] = sum(values) / len(values) if values else 0.0
    return summary

Helper functions to load qrels and results, then evaluate using pytrec_eval to calculate MAP, P@5, and P@10 metrics.

In [None]:
# Read the relevance judgments and both run files from disk
qrels = load_qrels(QRELS_PATH)
results_stemmed, results_unstemmed = (
    load_results(run_file) for run_file in (RESULTS_STEMMED, RESULTS_UNSTEMMED)
)

print(f"Loaded {len(qrels)} queries with relevance judgments")

# Score each run against the same judgments
print("\nEvaluating stemmed index...")
metrics_stemmed = evaluate_with_pytrec_eval(qrels=qrels, results=results_stemmed)

print("Evaluating unstemmed index...")
metrics_unstemmed = evaluate_with_pytrec_eval(qrels=qrels, results=results_unstemmed)

Load qrels and results files, then use pytrec_eval to calculate MAP, P@5, and P@10 metrics for both index configurations.

In [None]:
# Display comparison table
if metrics_stemmed and metrics_unstemmed:
    # Both indexes are built WITH stopwords removal; only the stemmer differs.
    table_rows = [
        ("WITH Krovetz + WITH stopwords removal", metrics_stemmed),
        ("WITHOUT stemming + WITH stopwords removal", metrics_unstemmed),
    ]

    # Size the first column from the longest label so no row overflows it
    # and the metric columns stay aligned.
    label_width = max(len("Configuration"), *(len(label) for label, _ in table_rows))
    table_width = label_width + 3 * 11  # three metric columns: 1 space + 10 chars each

    print("\n" + "=" * table_width)
    print("EVALUATION RESULTS")
    print("=" * table_width)
    print(f"{'Configuration':<{label_width}} {'MAP':>10} {'P@5':>10} {'P@10':>10}")
    print("-" * table_width)
    for label, metrics in table_rows:
        print(f"{label:<{label_width}} {metrics['MAP']:>10.4f} {metrics['P@5']:>10.4f} {metrics['P@10']:>10.4f}")
    print("=" * table_width)
else:
    # The metrics are produced by pytrec_eval (not the trec_eval binary).
    print("Error: Could not calculate metrics. Make sure pytrec_eval is installed.")

Display a comparison table showing MAP, P@5, and P@10 for both index configurations.

In [None]:
# Determine best configuration
if metrics_stemmed and metrics_unstemmed:
    map_stemmed = metrics_stemmed['MAP']
    map_unstemmed = metrics_unstemmed['MAP']
    diff = abs(map_stemmed - map_unstemmed)

    # Treat equal MAP as a tie instead of declaring the unstemmed index the winner.
    if map_stemmed > map_unstemmed:
        winner = "WITH Krovetz stemming"
    elif map_unstemmed > map_stemmed:
        winner = "WITHOUT stemming"
    else:
        winner = None

    if winner is None:
        print("\nBest configuration by MAP: tie (both configurations have the same MAP)")
    else:
        print(f"\nBest configuration by MAP: {winner}")
        print(f"MAP improvement: {diff:.4f}")

    # Only claim that stemming helped when the numbers actually show it.
    print("\nExplanation:")
    if winner == "WITH Krovetz stemming":
        print("Krovetz stemming reduces words to their root forms, improving recall by matching")
        # \u2192 is the right arrow; the escape keeps the source ASCII-safe.
        print("different morphological variants (e.g., 'running', 'runs', 'ran' \u2192 'run').")
        print("This typically improves retrieval effectiveness when queries and documents use")
        print("different word forms for the same concepts.")
    elif winner == "WITHOUT stemming":
        print("Krovetz stemming did not improve MAP on this run. Conflating morphological")
        print("variants can also match terms with unrelated meanings, which may hurt precision.")
    else:
        print("Krovetz stemming made no difference to MAP on this run.")

Determine which configuration performed better based on MAP and provide an explanation of the results.