In [9]:
import json
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from bs4 import BeautifulSoup
import re

# Pick the best available accelerator: CUDA first, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: mps


## Configuration

In [10]:
# Parameters
# Chunk size budget. The chunkers count whitespace-separated words, and the
# experiment cell passes this value as `window` / `max_chunk_size`.
# NOTE(review): the loaded model reports max_seq_length = 512; if that limit is
# measured in tokens, 512-word chunks may be truncated when encoded — confirm.
WINDOW = 512
# Number of words shared by consecutive sliding-window chunks (102 / 512 is about 20%).
OVERLAP = 102
# Adjacent sentences whose cosine similarity falls below this value start a new
# chunk in the semantic-similarity chunker.
SEMANTIC_THRESHOLD = 0.5

# Dataset
# JSON-lines file read one record per line by the evaluation function
# (path is relative to the notebook's working directory).
DATASET_PATH = "data/nq_filtered_medium.jsonl"

# Model
# Model identifier handed to SentenceTransformer when the model is loaded.
MODEL_NAME = "avsolatorio/GIST-Embedding-v0"

# Methods to compare
# NOTE(review): this list is not referenced by any other cell visible here; the
# experiment cell iterates over its own strategies_map instead.
CHUNKING_STRATEGIES = ["sliding_window", "html_aware", "semantic_similarity"]

## Load Model

In [11]:
# Instantiate the sentence-embedding model on the device selected above.
print(f"Loading model: {MODEL_NAME}")
model = SentenceTransformer(MODEL_NAME, device=device)

# Report success together with the model's maximum input length.
print(
    "Model loaded successfully!",
    f"Max sequence length: {model.max_seq_length}",
    sep="\n",
)

Loading model: avsolatorio/GIST-Embedding-v0
Model loaded successfully!
Max sequence length: 512


## Chunking Strategies

In [12]:
def sliding_window_chunk(text, window=512, overlap=102):
    """Split text into fixed-size, overlapping chunks of whitespace-separated words.

    Args:
        text: Input string; it is split on whitespace.
        window: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by consecutive chunks. Must be smaller
            than ``window`` so the window actually advances.

    Returns:
        List of chunk strings (words re-joined with single spaces). Empty input
        yields an empty list.

    Raises:
        ValueError: If ``window`` is not positive or ``overlap >= window``.
    """
    if window <= 0:
        raise ValueError(f"window must be positive, got {window}")
    step = window - overlap
    if step <= 0:
        # A non-positive step would never advance and loop forever.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than window ({window})"
        )

    words = text.split()
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + window]))
        # Once a window reaches the end of the text, any further window would
        # only repeat a suffix of this chunk, so stop here.
        if start + window >= len(words):
            break

    return chunks


def html_aware_chunk(html_text, max_chunk_size=512):
    """Chunk an HTML document along its structural elements.

    The document's text nodes are walked once in document order and grouped by
    their nearest structural ancestor (heading, paragraph, list item, table
    cell or div). Each group of words is packed into the current chunk until
    ``max_chunk_size`` words would be exceeded. After an h1-h3 heading the
    current chunk is closed if it already holds more than half the budget.

    Assigning every text node to exactly one structural element avoids
    emitting the same text once for a container and again for each structural
    element nested inside it. Inline fragments are kept as separate words
    rather than being glued together.

    Args:
        html_text: Raw HTML string.
        max_chunk_size: Maximum number of whitespace-separated words per chunk.
            Must be at least 1.

    Returns:
        List of chunk strings. Text outside every structural tag is ignored
        unless no chunk could be built at all, in which case the document's
        plain text is split into ``max_chunk_size``-word pieces.

    Raises:
        ValueError: If ``max_chunk_size`` is smaller than 1.
    """
    if max_chunk_size < 1:
        raise ValueError(f"max_chunk_size must be >= 1, got {max_chunk_size}")

    soup = BeautifulSoup(html_text, 'html.parser')
    structural_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li', 'td', 'div']
    section_headings = ('h1', 'h2', 'h3')

    chunks = []
    current_chunk = []  # words of the chunk being built

    def flush():
        """Close the chunk under construction, if it has any words."""
        if current_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk.clear()

    def add_segment(tag_name, words):
        """Pack one structural element's words into the running chunk(s)."""
        if not words:
            return
        # An element larger than the budget is cut into budget-sized pieces so
        # no chunk ever exceeds max_chunk_size words.
        for start in range(0, len(words), max_chunk_size):
            piece = words[start:start + max_chunk_size]
            if current_chunk and len(current_chunk) + len(piece) > max_chunk_size:
                flush()
            current_chunk.extend(piece)
        if tag_name in section_headings and len(current_chunk) > max_chunk_size * 0.5:
            flush()

    owner = None      # structural element owning the words collected so far
    owner_words = []
    for node in soup.strings:
        parent = node.find_parent(structural_tags)
        if parent is None:
            continue  # text that sits outside every structural element
        if parent is not owner:
            if owner is not None:
                add_segment(owner.name, owner_words)
            owner, owner_words = parent, []
        owner_words.extend(node.split())
    if owner is not None:
        add_segment(owner.name, owner_words)

    flush()

    # Fallback: no structural text found, so chunk the tag-free plain text.
    if not chunks:
        words = soup.get_text(" ", strip=True).split()
        for i in range(0, len(words), max_chunk_size):
            chunks.append(" ".join(words[i:i + max_chunk_size]))

    return chunks


def semantic_similarity_chunk(text, model, threshold=0.5, max_chunk_size=512):
    """Group consecutive sentences into chunks by embedding similarity.

    The text is split after ``.``, ``!`` or ``?`` followed by whitespace. A new
    chunk starts whenever the cosine similarity between a sentence and the next
    one drops below ``threshold``, or when adding the next sentence would push
    the chunk past ``max_chunk_size`` whitespace-separated words.

    Args:
        text: Input string.
        model: Object exposing ``encode(sentences, convert_to_tensor=True,
            show_progress_bar=False)`` and returning one embedding row per
            sentence.
        threshold: Minimum cosine similarity for two adjacent sentences to stay
            in the same chunk.
        max_chunk_size: Word budget per chunk. A single sentence longer than
            the budget still becomes one chunk on its own.

    Returns:
        List of chunk strings. Empty or whitespace-only input yields an empty
        list instead of a single empty chunk.
    """
    sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]

    # Nothing to compare: zero sentences -> no chunks, one sentence -> one chunk.
    if len(sentences) <= 1:
        return sentences

    sentence_embeddings = model.encode(sentences, convert_to_tensor=True, show_progress_bar=False)

    # Similarity of every sentence with its successor, computed in one call
    # rather than one cos_sim call per pair.
    similarities = torch.nn.functional.cosine_similarity(
        sentence_embeddings[:-1], sentence_embeddings[1:], dim=1
    ).tolist()

    chunks = []
    current_chunk = [sentences[0]]
    current_word_count = len(sentences[0].split())

    for next_sentence, sim in zip(sentences[1:], similarities):
        next_word_count = len(next_sentence.split())

        if sim < threshold or current_word_count + next_word_count > max_chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = [next_sentence]
            current_word_count = next_word_count
        else:
            current_chunk.append(next_sentence)
            current_word_count += next_word_count

    # current_chunk always holds at least one sentence at this point.
    chunks.append(" ".join(current_chunk))

    return chunks

## Helper Functions

In [13]:
def find_gold_chunk_by_content(chunks, document_tokens, start_token, end_token):
    """Locate the chunk that contains the gold answer text.

    The gold text is rebuilt by joining the ``'token'`` field of
    ``document_tokens[start_token:end_token]`` (end exclusive, clipped to the
    document length) with spaces. Matching is case-insensitive.

    Args:
        chunks: List of chunk strings.
        document_tokens: Sequence of dicts, each with a ``'token'`` entry.
        start_token: Index of the first gold token.
        end_token: Index one past the last gold token.

    Returns:
        Index of the first chunk containing the gold text verbatim. Failing
        that, the index of the chunk sharing the most distinct words with the
        gold text, provided it covers at least half of them. ``None`` when the
        span is invalid or empty, or when no chunk matches well enough.
    """
    if start_token < 0 or end_token < 0 or start_token >= len(document_tokens):
        return None

    stop = min(end_token, len(document_tokens))
    gold_text = " ".join(tok['token'] for tok in document_tokens[start_token:stop]).lower()

    # An empty gold text is a substring of every chunk and would wrongly match
    # chunk 0, so treat an empty span as "no gold answer".
    if not gold_text.strip():
        return None

    lowered_chunks = [chunk.lower() for chunk in chunks]

    # Exact match
    for idx, chunk in enumerate(lowered_chunks):
        if gold_text in chunk:
            return idx

    # Fuzzy match: pick the chunk with the largest word overlap.
    gold_words = set(gold_text.split())
    best_idx = None
    best_overlap = 0

    for idx, chunk in enumerate(lowered_chunks):
        overlap = len(gold_words & set(chunk.split()))
        if overlap > best_overlap:
            best_overlap = overlap
            best_idx = idx

    if best_idx is not None and best_overlap >= len(gold_words) * 0.5:
        return best_idx

    return None


def compute_metrics(rank_list):
    """Compute Recall@10 and MRR from 1-based ranks of the gold chunk.

    Args:
        rank_list: Iterable of ranks (1 = gold chunk ranked first).

    Returns:
        Tuple ``(recall10, mrr)`` of Python floats: the fraction of ranks that
        are <= 10 and the mean reciprocal rank. An empty ``rank_list`` yields
        ``(0.0, 0.0)`` rather than NaN.
    """
    ranks = np.asarray(list(rank_list), dtype=float)
    if ranks.size == 0:
        # np.mean of an empty array is NaN and emits a RuntimeWarning.
        return 0.0, 0.0

    recall10 = float(np.mean(ranks <= 10))
    mrr = float(np.mean(1.0 / ranks))
    return recall10, mrr

## Evaluation Function

In [14]:
def evaluate_method(model, dataset_path, chunking_strategy, method_name, **chunk_params):
    """
    Evaluate a chunking strategy with naive encoding.

    For every record of the JSON-lines dataset the document HTML is chunked,
    the chunk containing the gold answer is located, every chunk is embedded
    independently, and the gold chunk's rank under cosine similarity with the
    question embedding is recorded.

    A sample is skipped (and counted) when it has no annotation, its gold span
    is negative, chunking or encoding raises, chunking returns nothing, or the
    gold answer cannot be located in any chunk. Cheap checks run before the
    embeddings are computed so that unusable samples cost no model time. A
    one-line summary of skip reasons is printed when anything was skipped.

    Args:
        model: SentenceTransformer model
        dataset_path: Path to dataset
        chunking_strategy: Function that returns chunk texts. It is called as
            ``chunking_strategy(html, model, **chunk_params)`` when
            ``method_name`` contains "semantic", otherwise as
            ``chunking_strategy(html, **chunk_params)``.
        method_name: Name for logging
        **chunk_params: Parameters for chunking function

    Returns:
        Dictionary with metrics: "method", "recall@10", "mrr",
        "total_samples" (ranked samples) and "skipped".
    """
    rank_list = []
    skip_reasons = {}   # reason -> number of samples skipped for it
    first_error = None  # first exception seen, kept for the summary line

    def record_skip(reason):
        skip_reasons[reason] = skip_reasons.get(reason, 0) + 1

    with open(dataset_path, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc=f"Evaluating {method_name}"):
            if not line.strip():
                continue  # blank line (e.g. trailing newline), not a sample

            item = json.loads(line)

            question = item["question_text"]
            html_text = item["document_html"]
            doc_tokens = item["document_tokens"]

            # Resolve the gold span first: it needs neither chunks nor embeddings.
            annotations = item["annotations"]
            if not annotations:
                record_skip("no annotation")
                continue
            ann = annotations[0]
            if ann["short_answers"]:
                gold_start = ann["short_answers"][0]["start_token"]
                gold_end = ann["short_answers"][0]["end_token"]
            else:
                gold_start = ann["long_answer"]["start_token"]
                gold_end = ann["long_answer"]["end_token"]

            if gold_start < 0 or gold_end < 0:
                record_skip("no gold span")
                continue

            # Apply chunking strategy
            try:
                if "semantic" in method_name:
                    chunks = chunking_strategy(html_text, model, **chunk_params)
                else:
                    chunks = chunking_strategy(html_text, **chunk_params)
            except Exception as e:
                # Best effort: keep evaluating, but remember what went wrong.
                if first_error is None:
                    first_error = f"chunking: {e!r}"
                record_skip("chunking failed")
                continue

            if not chunks:
                record_skip("no chunks")
                continue

            gold_chunk = find_gold_chunk_by_content(chunks, doc_tokens, gold_start, gold_end)
            if gold_chunk is None or gold_chunk >= len(chunks):
                record_skip("gold not found in chunks")
                continue

            # Encode chunks (naive encoding: each chunk independently)
            try:
                chunk_embeddings = model.encode(chunks, convert_to_tensor=True, show_progress_bar=False)
            except Exception as e:
                if first_error is None:
                    first_error = f"encoding: {e!r}"
                record_skip("encoding failed")
                continue

            # Encode query
            query_embedding = model.encode(question, convert_to_tensor=True, show_progress_bar=False)

            # Similarity ranking
            scores = util.cos_sim(query_embedding, chunk_embeddings)[0]
            ranking = scores.argsort(descending=True).cpu().numpy()

            # 1-based position of the gold chunk in the ranking
            gold_rank = int(np.where(ranking == gold_chunk)[0][0]) + 1
            rank_list.append(gold_rank)

    skipped = sum(skip_reasons.values())
    if skipped:
        summary = ", ".join(f"{reason}={count}" for reason, count in skip_reasons.items())
        print(f"[{method_name}] skipped {skipped} sample(s): {summary}")
        if first_error is not None:
            print(f"[{method_name}] first error: {first_error}")

    # Compute metrics
    if rank_list:
        recall10, mrr = compute_metrics(rank_list)
    else:
        recall10, mrr = 0.0, 0.0

    return {
        "method": method_name,
        "recall@10": recall10,
        "mrr": mrr,
        "total_samples": len(rank_list),
        "skipped": skipped
    }

## Run All Experiments

Compare the three chunking strategies: Sliding Window, HTML-Aware, and Semantic Similarity.

In [None]:
# Run every chunking strategy through the same evaluation and collect metrics.
results = {}
rule = "=" * 70

print(rule)
print("Running Chunking Strategy Comparison")
print(rule)

# Strategy name -> (chunking function, keyword arguments for that function)
strategies_map = {
    "sliding_window": (sliding_window_chunk, {"window": WINDOW, "overlap": OVERLAP}),
    "html_aware": (html_aware_chunk, {"max_chunk_size": WINDOW}),
    "semantic_similarity": (semantic_similarity_chunk, {"threshold": SEMANTIC_THRESHOLD, "max_chunk_size": WINDOW})
}

for strategy_name in strategies_map:
    strategy_func, params = strategies_map[strategy_name]

    print("\n" + rule)
    print(f"Chunking Strategy: {strategy_name.upper()}")
    print(rule)

    outcome = evaluate_method(
        model=model,
        dataset_path=DATASET_PATH,
        chunking_strategy=strategy_func,
        method_name=strategy_name,
        **params
    )
    results[strategy_name] = outcome
    print(f"  Recall@10: {outcome['recall@10']:.4f}, MRR: {outcome['mrr']:.4f}")

print("\n" + rule)
print("All experiments completed!")
print(rule)

Running Chunking Strategy Comparison

Chunking Strategy: SLIDING_WINDOW


Evaluating sliding_window: 25it [00:09,  2.57it/s]

## Results Analysis

In [None]:
# Create results DataFrame: one row per strategy, one column per metric.
# from_dict(orient="index") keeps each column's own dtype, whereas transposing
# a mixed-type frame would turn every column into dtype object.
df_results = pd.DataFrame.from_dict(results, orient="index")

print("\n=== Chunking Strategy Comparison Results ===")
print(df_results.to_string())

# Save to CSV
df_results.to_csv("chunking_comparison_results.csv")
print("\nResults saved to chunking_comparison_results.csv")

## Comparative Analysis

In [None]:
print("\n" + "="*70)
print("PERFORMANCE COMPARISON")
print("="*70)

# Sort by Recall@10 (best first)
sorted_by_recall = sorted(results.items(), key=lambda x: x[1]['recall@10'], reverse=True)

print("\nRanking by Recall@10:")
for i, (strategy, metrics) in enumerate(sorted_by_recall, 1):
    print(f"  {i}. {strategy.upper().replace('_', ' ')}: Recall@10={metrics['recall@10']:.4f}, MRR={metrics['mrr']:.4f}")

# Best vs Worst comparison
best_strategy, best_metrics = sorted_by_recall[0]
worst_strategy, worst_metrics = sorted_by_recall[-1]

# Relative gain of the best over the worst strategy, in percent. It is reported
# as 0 when the worst score is 0, because a relative gain is undefined there.
recall_improvement = ((best_metrics['recall@10'] - worst_metrics['recall@10']) / worst_metrics['recall@10'] * 100) if worst_metrics['recall@10'] > 0 else 0
mrr_improvement = ((best_metrics['mrr'] - worst_metrics['mrr']) / worst_metrics['mrr'] * 100) if worst_metrics['mrr'] > 0 else 0

# The emoji below replace mis-encoded byte sequences in the output strings.
print(f"\n🏆 Best Strategy: {best_strategy.upper().replace('_', ' ')}")
print(f"   Recall@10: {best_metrics['recall@10']:.4f}")
print(f"   MRR: {best_metrics['mrr']:.4f}")

print("\n📊 Improvement over worst strategy:")
print(f"   Recall@10: {recall_improvement:+.2f}%")
print(f"   MRR: {mrr_improvement:+.2f}%")

## Visualizations

In [None]:
# Gather the metrics in a fixed display order (labels and result keys line up).
strategies = ["Sliding Window", "HTML-Aware", "Semantic Similarity"]
strategy_keys = ["sliding_window", "html_aware", "semantic_similarity"]
recalls = [results[key]['recall@10'] for key in strategy_keys]
mrrs = [results[key]['mrr'] for key in strategy_keys]

# One figure with two side-by-side bar charts
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
x = np.arange(len(strategies))
width = 0.6

# One colour per strategy, shared by both panels
colors = ['#1f77b4', '#ff7f0e', '#2ca02c']

# (axis, values, y-label, title, y-limits, vertical offset of the value labels)
panels = [
    (axes[0], recalls, 'Recall@10', 'Recall@10 Comparison Across Chunking Strategies', [0, 1], 0.02),
    (axes[1], mrrs, 'MRR', 'MRR Comparison Across Chunking Strategies', [0, max(mrrs) * 1.2], 0.01),
]

panel_bars = []
for ax, values, ylabel, title, ylim, label_offset in panels:
    bars = ax.bar(x, values, width, color=colors)

    ax.set_ylabel(ylabel, fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels(strategies)
    ax.set_ylim(ylim)
    ax.grid(axis='y', alpha=0.3)

    # Write each bar's value just above it
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height + label_offset,
                f'{height:.3f}', ha='center', va='bottom', fontsize=10, fontweight='bold')

    panel_bars.append(bars)

bars1, bars2 = panel_bars

plt.tight_layout()
plt.savefig('chunking_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nVisualization saved to chunking_comparison.png")

## Summary and Conclusions

In [None]:
print("\n" + "="*70)
print("KEY FINDINGS")
print("="*70)

# Find best method: the (name, metrics) pair with the highest Recall@10
best_method = max(results.items(), key=lambda x: x[1]['recall@10'])
# The emoji below replaces a mis-encoded byte sequence in the output string.
print(f"\n🏆 Best Chunking Strategy: {best_method[0].upper().replace('_', ' ')}")
print(f"   Recall@10: {best_method[1]['recall@10']:.4f}")
print(f"   MRR: {best_method[1]['mrr']:.4f}")
print(f"   Samples Evaluated: {best_method[1]['total_samples']}")

print("\n" + "="*70)
print("CONCLUSION")
print("="*70)
print("\nThis experiment compares three chunking strategies for document retrieval:")
print("\n1. Sliding Window: Fixed-size chunks with overlap")
print("   - Simple and fast")
print("   - May split semantic units")
print("\n2. HTML-Aware: Respects HTML structure (headers, paragraphs)")
print("   - Preserves document structure")
print("   - Good for HTML documents")
print("\n3. Semantic Similarity: Groups sentences by semantic similarity")
print("   - Preserves semantic coherence")
print("   - More computationally expensive")

print(f"\nBest performing strategy: {best_method[0].upper().replace('_', ' ')}")
print("\nAll chunks are encoded using naive encoding (each chunk independently).")
print("\n" + "="*70)