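# Semantic Search Experiments

Compares the MRR of several `SemanticSearchPipeline` configurations (index type, reranker on/off, text-preparation strategy) built on `all-MiniLM-L6-v2` embeddings, each evaluated with `run_full_pipeline('small', sample_frac=0.1)`.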

In [3]:
import sys
# Must run before the `src.*` imports below: puts the parent directory on the import path.
sys.path.append('..')

import pandas as pd
import torch

from src.experiments import run_optimization_experiments
from src.pipeline import SemanticSearchPipeline

# Report the name of CUDA device 0, or "CPU" when torch sees no CUDA device.
if torch.cuda.is_available():
    device_label = torch.cuda.get_device_name(0)
else:
    device_label = "CPU"
print(f'GPU: {device_label}')

  from .autonotebook import tqdm as notebook_tqdm


GPU: NVIDIA GeForce RTX 4060 Laptop GPU


In [4]:
# Baseline configuration: MiniLM embeddings, flat index, no reranker,
# basic text preparation.
baseline_config = {
    'embedding_model': 'sentence-transformers/all-MiniLM-L6-v2',
    'index_type': 'flat',
    'use_reranker': False,
    'text_strategy': 'basic',
}
baseline = SemanticSearchPipeline(baseline_config)

# Quick check on a 10% sample (sample_frac=0.1) rather than the full data.
metrics = baseline.run_full_pipeline('small', sample_frac=0.1)
print(f'Baseline MRR: {metrics["MRR"]:.4f}')

📦 Found 2 cached embeddings (1.38 GB)


Processing (basic): 100%|██████████| 57734/57734 [00:02<00:00, 24612.72it/s]
  with torch.cuda.amp.autocast(dtype=torch.float16):
Batches: 100%|██████████| 151/151 [00:26<00:00,  5.66it/s]


Baseline MRR: 0.3529


In [5]:
# Reranker run.
# NOTE(review): besides turning the reranker on, this config uses
# index_type 'ivf' where the baseline cell uses 'flat', so the MRR gap
# versus the baseline reflects both changes, not the reranker alone.
reranker_config = {
    'embedding_model': 'sentence-transformers/all-MiniLM-L6-v2',
    'index_type': 'ivf',
    'use_reranker': True,
    'text_strategy': 'basic',
}
with_reranker = SemanticSearchPipeline(reranker_config)

# Same 'small' / sample_frac=0.1 arguments as the baseline run.
metrics = with_reranker.run_full_pipeline('small', sample_frac=0.1)
print(f'With Reranker MRR: {metrics["MRR"]:.4f}')

📦 Found 3 cached embeddings (1.46 GB)


Processing (basic): 100%|██████████| 57734/57734 [00:02<00:00, 20768.42it/s]
  with torch.cuda.amp.autocast(dtype=torch.float16):


With Reranker MRR: 0.4282


In [6]:
# Enhanced text preparation: identical to the reranker cell's config
# (ivf index, reranker on) except that text_strategy is 'enhanced'.
enhanced_config = {
    'embedding_model': 'sentence-transformers/all-MiniLM-L6-v2',
    'index_type': 'ivf',
    'use_reranker': True,
    'text_strategy': 'enhanced',
}
enhanced = SemanticSearchPipeline(enhanced_config)

# Same 'small' / sample_frac=0.1 arguments as the earlier runs.
metrics = enhanced.run_full_pipeline('small', sample_frac=0.1)
print(f'Enhanced Text MRR: {metrics["MRR"]:.4f}')

📦 Found 3 cached embeddings (1.46 GB)


Processing (enhanced): 100%|██████████| 57734/57734 [00:03<00:00, 16093.93it/s]
  with torch.cuda.amp.autocast(dtype=torch.float16):
Batches: 100%|██████████| 151/151 [00:41<00:00,  3.67it/s]


Enhanced Text MRR: 0.4352


In [None]:
# Run the optimization experiments with the same sample_frac=0.1 used by
# the cells above. `run_optimization_experiments` is imported in the
# notebook's top import cell so all dependencies are visible in one place.
results = run_optimization_experiments(sample_frac=0.1)

# Display results, restricted to the experiment name and headline metrics.
# Presumably `results` is a list of per-experiment records — confirm
# against src.experiments.
summary_columns = ['name', 'Hits@1', 'Hits@5', 'Hits@10', 'MRR']
df = pd.DataFrame(results)
df[summary_columns]


OPTIMIZATION EXPERIMENTS
Baseline MRR: 0.4887

1. Testing basic text + reranker...
📦 Found 4 cached embeddings (1.54 GB)


Processing (basic): 100%|██████████| 57734/57734 [00:02<00:00, 23850.42it/s]
  with torch.cuda.amp.autocast(dtype=torch.float16):


Result: MRR = 0.4282

2. Testing enhanced text + reranker...
📦 Found 4 cached embeddings (1.54 GB)


Processing (enhanced): 100%|██████████| 57734/57734 [00:04<00:00, 12266.17it/s]
  with torch.cuda.amp.autocast(dtype=torch.float16):
