# Semantic Search Experiments

In [1]:
import pandas as pd
import torch
from src.pipeline import SemanticSearchPipeline

# Report the CUDA device torch will use, or the literal "CPU" when CUDA is unavailable.
device_label = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"
print(f'GPU: {device_label}')

  from .autonotebook import tqdm as notebook_tqdm


GPU: NVIDIA GeForce RTX 4060 Laptop GPU


In [None]:
# Baseline run: MiniLM embeddings, flat index, no reranker, basic text strategy.
baseline_config = dict(
    embedding_model='sentence-transformers/all-MiniLM-L6-v2',
    index_type='flat',
    use_reranker=False,
    text_strategy='basic',
)
baseline = SemanticSearchPipeline(baseline_config)

# Quick check with sample_frac=0.1 (a 10% sample) rather than the full data.
metrics = baseline.run_full_pipeline('small', sample_frac=0.1)
baseline_mrr = metrics["MRR"]
print(f'Baseline MRR: {baseline_mrr:.4f}')

In [None]:
# Reranker run (expect +15% MRR): same embedding model and text strategy as the
# baseline cell, with the reranker switched on.
# NOTE(review): index_type is 'ivf' here while the baseline cell uses 'flat', so the
# MRR difference against the baseline reflects both changes - confirm this is intended.
reranker_config = dict(
    embedding_model='sentence-transformers/all-MiniLM-L6-v2',
    index_type='ivf',
    use_reranker=True,
    text_strategy='basic',
)
with_reranker = SemanticSearchPipeline(reranker_config)

# Same quick evaluation settings as the other experiment cells (sample_frac=0.1).
metrics = with_reranker.run_full_pipeline('small', sample_frac=0.1)
reranker_mrr = metrics["MRR"]
print(f'With Reranker MRR: {reranker_mrr:.4f}')

In [None]:
# Enhanced text preparation run: identical to the reranker configuration
# (ivf index, reranker on) except that text_strategy is 'enhanced'.
enhanced_config = dict(
    embedding_model='sentence-transformers/all-MiniLM-L6-v2',
    index_type='ivf',
    use_reranker=True,
    text_strategy='enhanced',
)
enhanced = SemanticSearchPipeline(enhanced_config)

# Same quick evaluation settings as the other experiment cells (sample_frac=0.1).
metrics = enhanced.run_full_pipeline('small', sample_frac=0.1)
enhanced_mrr = metrics["MRR"]
print(f'Enhanced Text MRR: {enhanced_mrr:.4f}')

In [None]:
# Run the optimization experiments from src.experiments with sample_frac=0.1.
from src.experiments import run_optimization_experiments

results = run_optimization_experiments(sample_frac=0.1)

# Tabulate the results; the last expression displays the summary columns.
# `df` is reused by the plotting cell below.
df = pd.DataFrame(results)
summary_columns = ['name', 'MRR', 'Hits@10']
df[summary_columns]

## Visualize Results

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot MRR and Hits@10 for every row of `df` (one bar per experiment) against
# fixed reference lines.
#
# Reference levels, named here instead of being inlined as magic numbers.
# NOTE(review): these are hard-coded rather than read from the baseline run -
# confirm they were measured with the same sample_frac as the rows in `df`.
BASELINE_MRR = 0.488
TARGET_MRR = 0.60
BASELINE_HITS_AT_10 = 0.714

# Bars are placed at integer positions and labelled with the experiment name,
# so the "Configuration" axis identifies which bar belongs to which run.
bar_positions = list(range(len(df)))
bar_labels = df['name'].astype(str).tolist()

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# MRR comparison
ax1 = axes[0]
ax1.bar(bar_positions, df['MRR'], color='green', alpha=0.7)
ax1.axhline(y=BASELINE_MRR, color='red', linestyle='--', label='Baseline')
ax1.axhline(y=TARGET_MRR, color='blue', linestyle='--', label='Target')
ax1.set_xticks(bar_positions)
ax1.set_xticklabels(bar_labels, rotation=45, ha='right')
ax1.set_xlabel('Configuration')
ax1.set_ylabel('MRR')
ax1.set_title('MRR Improvements')
ax1.legend()

# Hits@10 comparison
ax2 = axes[1]
ax2.bar(bar_positions, df['Hits@10'], color='blue', alpha=0.7)
ax2.axhline(y=BASELINE_HITS_AT_10, color='red', linestyle='--', label='Baseline')
ax2.set_xticks(bar_positions)
ax2.set_xticklabels(bar_labels, rotation=45, ha='right')
ax2.set_xlabel('Configuration')
ax2.set_ylabel('Hits@10')
ax2.set_title('Hits@10 Improvements')
ax2.legend()

# tight_layout keeps the rotated tick labels from overlapping or being clipped.
plt.tight_layout()
plt.show()