In [1]:
import sys
import os
import numpy as np
from sentence_transformers import SentenceTransformer

# Add the project root to Python path and change to parent directory
project_root = '/home/ubuntu/ilab_stuff/fine-tuned-hybrid-rag'
os.chdir(project_root)
sys.path.append(project_root)

from retrieval.hybrid_retriever import load_retrieval_components, hybrid_retrieve

print("Imports successful")
print(f"Current working directory: {os.getcwd()}")

  from .autonotebook import tqdm as notebook_tqdm


Imports successful
Current working directory: /home/ubuntu/ilab_stuff/fine-tuned-hybrid-rag


In [2]:
# Load base components (uses base model and embeddings)
print("Loading base retrieval components...")
base_components = load_retrieval_components()

print(f"Loaded {len(base_components['documents'])} documents")
print(f"Dense embeddings shape: {base_components['dense_embeddings'].shape}")
print(f"TF-IDF vocabulary size: {len(base_components['tfidf_vectorizer'].vocabulary_)}")
print(f"Base model: {base_components['model_name']}")
print(f"Device: {base_components['device']}")

Loading base retrieval components...
Loading hybrid retrieval components...
Loading dense embeddings from embeddings/dense.npy...
Loaded dense embeddings shape: (889289, 384)
Loading TF-IDF vectorizer from embeddings/tfidf_vectorizer.pkl...
TF-IDF vocabulary size: 10000
Loading documents from data/processed_docs.jsonl...


Loading documents: 889289it [00:27, 31927.15it/s]


Loaded 889289 documents
Loading sentence transformer model: sentence-transformers/all-MiniLM-L6-v2
Using device: cuda
Loading learned combination weights...
Learned weights loaded: dense=0.435, sparse=0.314, boost=0.251
All components loaded successfully!
Loaded 889289 documents
Dense embeddings shape: (889289, 384)
TF-IDF vocabulary size: 10000
Base model: sentence-transformers/all-MiniLM-L6-v2
Device: cuda


In [6]:
# Example query - using metadata-rich format that matches evaluation
query = "what is machine learning"

print(f"Query: '{query}'")
print(f"Searching with base model...")

# Retrieve with base model
base_results = hybrid_retrieve(
    query=query,
    components=base_components,
    top_k=5
)

print(f"\n=== BASE MODEL RESULTS (Top 5) ===")
for i, doc in enumerate(base_results):
    print(f"\n{i+1}. Title: {doc['metadata']['title'][:80]}...")
    print(f"   Final Score: {doc['scores']['final_score']:.4f}")
    print(f"   Dense: {doc['scores']['dense_score']:.4f} | Sparse: {doc['scores']['sparse_score']:.4f} | Boost: {doc['scores']['boost_score']:.4f}")
    if 'year' in doc['metadata']:
        print(f"   Year: {doc['metadata']['year']} | Venue: {doc['metadata'].get('venue', 'N/A')}")

Query: 'what is machine learning'
Searching with base model...
Retrieving top-5 documents for query: 'what is machine learning...'
Encoding query...
Computing dense similarities (top-1000)...
Computing sparse similarities...
Applying metadata boosting...
Combining scores with learned weights...
   Learned weights: dense=0.435, sparse=0.314, boost=0.251
Retrieved 5 documents

=== BASE MODEL RESULTS (Top 5) ===

1. Title: Will Skynet Need a Librarian? A Literature Review of Machine Learning–Based Data...
   Final Score: 0.5306
   Dense: 0.6086 | Sparse: 0.8464 | Boost: 1.0000
   Year: 2018 | Venue: 

2. Title: The papers of this issue on machine learning: editorial...
   Final Score: 0.5024
   Dense: 0.6380 | Sparse: 0.7160 | Boost: 1.0000
   Year: 1987 | Venue: Comput. Intell.

3. Title: Role of Intelligent Machines learning for the Successful Implementation of Busin...
   Final Score: 0.4967
   Dense: 0.6116 | Sparse: 0.7342 | Boost: 1.0000
   Year: 2019 | Venue: 

4. Title: Machine Le

In [4]:
# Load fine-tuned components
print("Loading fine-tuned components...")

# Start with base components
ft_components = load_retrieval_components()

# Replace with fine-tuned model and embeddings
config = ft_components['config']
ft_model_path = config['finetune']['output_path']
ft_embeddings_path = config['embeddings']['dense_finetuned_path']

print(f"Loading fine-tuned model from: {ft_model_path}")
print(f"Loading fine-tuned embeddings from: {ft_embeddings_path}")

# Load fine-tuned model
ft_components['sentence_model'] = SentenceTransformer(ft_model_path)

# Load fine-tuned embeddings
ft_components['dense_embeddings'] = np.load(ft_embeddings_path)

print(f"Fine-tuned model loaded")
print(f"Fine-tuned embeddings shape: {ft_components['dense_embeddings'].shape}")
print(f"Ready for improved retrieval!")

Loading fine-tuned components...
Loading hybrid retrieval components...
Loading dense embeddings from embeddings/dense.npy...
Loaded dense embeddings shape: (889289, 384)
Loading TF-IDF vectorizer from embeddings/tfidf_vectorizer.pkl...
TF-IDF vocabulary size: 10000
Loading documents from data/processed_docs.jsonl...


Loading documents: 889289it [00:39, 22634.83it/s]


Loaded 889289 documents
Loading sentence transformer model: sentence-transformers/all-MiniLM-L6-v2
Using device: cuda
Loading learned combination weights...
Learned weights loaded: dense=0.435, sparse=0.314, boost=0.251
All components loaded successfully!
Loading fine-tuned model from: finetune/model/
Loading fine-tuned embeddings from: embeddings/dense_finetuned.npy
Fine-tuned model loaded
Fine-tuned embeddings shape: (889289, 384)
Ready for improved retrieval!


In [7]:
# Same query with fine-tuned model
print(f"Query: '{query}'")
print(f"Searching with fine-tuned model...")

ft_results = hybrid_retrieve(
    query=query,
    components=ft_components,
    top_k=100
)

print(f"\n=== FINE-TUNED MODEL RESULTS (Top 5) ===")
for i, doc in enumerate(ft_results):
    print(f"\n{i+1}. Title: {doc['metadata']['title'][:80]}...")
    print(f"   Final Score: {doc['scores']['final_score']:.4f}")
    print(f"   Dense: {doc['scores']['dense_score']:.4f} | Sparse: {doc['scores']['sparse_score']:.4f} | Boost: {doc['scores']['boost_score']:.4f}")
    if 'year' in doc['metadata']:
        print(f"   Year: {doc['metadata']['year']} | Venue: {doc['metadata'].get('venue', 'N/A')}")

Query: 'what is machine learning'
Searching with fine-tuned model...
Retrieving top-100 documents for query: 'what is machine learning...'
Encoding query...
Computing dense similarities (top-1000)...
Computing sparse similarities...
Applying metadata boosting...
Combining scores with learned weights...
   Learned weights: dense=0.435, sparse=0.314, boost=0.251
Retrieved 100 documents

=== FINE-TUNED MODEL RESULTS (Top 5) ===

1. Title: A study on machine learning web service...
   Final Score: 0.5668
   Dense: 0.7463 | Sparse: 0.7711 | Boost: 1.0000
   Year: 2017 | Venue: 2017 International Conference on Information and Communication Technology Convergence (ICTC)

2. Title: Machine learning in the optimization of robotics in the operative field...
   Final Score: 0.5656
   Dense: 0.7770 | Sparse: 0.7246 | Boost: 1.0000
   Year: 2020 | Venue: Current opinion in urology

3. Title: Machine Learning for IoT...
   Final Score: 0.5610
   Dense: 0.7744 | Sparse: 0.7134 | Boost: 1.0000
   Year

In [8]:
# Example 1: Hard filters (exclude unwanted documents)
hard_filters = {
    "min_year": 2005,  # Only papers from 2018 onwards
    "excluded_venues": ["workshop"]  # Exclude workshop papers
}

print("=== QUERY WITH HARD FILTERS ===")
print(f"Filters: Only papers from 2018+, exclude workshops")

filtered_results = hybrid_retrieve(
    query=query,
    components=ft_components,
    user_filters=hard_filters,
    top_k=5
)

print(f"\nResults with hard filters:")
for i, doc in enumerate(filtered_results):
    year = doc['metadata'].get('year', 'N/A')
    venue = doc['metadata'].get('venue', 'N/A')
    print(f"{i+1}. {doc['metadata']['title'][:60]}...")
    print(f"   Year: {year} | Venue: {venue} | Score: {doc['scores']['final_score']:.4f}")

=== QUERY WITH HARD FILTERS ===
Filters: Only papers from 2018+, exclude workshops
Retrieving top-5 documents for query: 'what is machine learning...'
Encoding query...
Computing dense similarities (top-1000)...
Applying hard filters...
802 candidates after filtering
Computing sparse similarities...
Applying metadata boosting...
Combining scores with learned weights...
   Learned weights: dense=0.435, sparse=0.314, boost=0.251
Retrieved 5 documents

Results with hard filters:
1. A study on machine learning web service...
   Year: 2017 | Venue: 2017 International Conference on Information and Communication Technology Convergence (ICTC) | Score: 0.5668
2. Machine learning in the optimization of robotics in the oper...
   Year: 2020 | Venue: Current opinion in urology | Score: 0.5656
3. Machine Learning for IoT...
   Year: 2020 | Venue:  | Score: 0.5610
4. Machine Learning Techniques for Wireless-Powered Ambient Bac...
   Year: 2020 | Venue: Convergence of Artificial Intelligence and the 

In [9]:
# Example 2: Soft boosting (prefer certain documents)
boost_filters = {
    "venue": "NIPS",  # Boost NIPS papers
    "field": "machine learning",  # Boost ML papers
    "year_after": 2019  # Boost recent papers
}

print("\n=== QUERY WITH SOFT BOOSTING ===")
print(f"Boosting: NIPS papers, ML field, recent papers (2019+)")

boosted_results = hybrid_retrieve(
    query=query,
    components=ft_components,
    user_filters=boost_filters,
    top_k=5
)

print(f"\nResults with boosting:")
for i, doc in enumerate(boosted_results):
    year = doc['metadata'].get('year', 'N/A')
    venue = doc['metadata'].get('venue', 'N/A')
    fields = doc['metadata'].get('fieldsOfStudy', [])
    print(f"{i+1}. {doc['metadata']['title'][:60]}...")
    print(f"   Year: {year} | Venue: {venue}")
    print(f"   Fields: {fields[:2]}...")  # Show first 2 fields
    print(f"   Score: {doc['scores']['final_score']:.4f} (Boost: {doc['scores']['boost_score']:.4f})")


=== QUERY WITH SOFT BOOSTING ===
Boosting: NIPS papers, ML field, recent papers (2019+)
Retrieving top-5 documents for query: 'what is machine learning...'
Encoding query...
Computing dense similarities (top-1000)...
Computing sparse similarities...
Applying metadata boosting...
Combining scores with learned weights...
   Learned weights: dense=0.435, sparse=0.314, boost=0.251
Retrieved 5 documents

Results with boosting:
1. Machine learning in the optimization of robotics in the oper...
   Year: 2020 | Venue: Current opinion in urology
   Fields: ['Medicine']...
   Score: 0.5756 (Boost: 1.0200)
2. Machine Learning for IoT...
   Year: 2020 | Venue: 
   Fields: ['Computer Science']...
   Score: 0.5710 (Boost: 1.0200)
3. A study on machine learning web service...
   Year: 2017 | Venue: 2017 International Conference on Information and Communication Technology Convergence (ICTC)
   Fields: ['Computer Science']...
   Score: 0.5668 (Boost: 1.0000)
4. Machine Learning Techniques for Wireless