In [2]:
import sys
import os
import numpy as np
from sentence_transformers import SentenceTransformer

# Add the project root to Python path and change to parent directory
project_root = '/home/ubuntu/ilab_stuff/fine-tuned-hybrid-rag'
os.chdir(project_root)
sys.path.append(project_root)

from retrieval.hybrid_retriever import load_retrieval_components, hybrid_retrieve

print("Imports successful")
print(f"Current working directory: {os.getcwd()}")

  from .autonotebook import tqdm as notebook_tqdm


Imports successful
Current working directory: /home/ubuntu/ilab_stuff/fine-tuned-hybrid-rag


In [3]:
# Load base components (uses base model and embeddings)
print("Loading base retrieval components...")
base_components = load_retrieval_components()

print(f"Loaded {len(base_components['documents'])} documents")
print(f"Dense embeddings shape: {base_components['dense_embeddings'].shape}")
print(f"TF-IDF vocabulary size: {len(base_components['tfidf_vectorizer'].vocabulary_)}")
print(f"Base model: {base_components['model_name']}")
print(f"Device: {base_components['device']}")

Loading base retrieval components...
Loading hybrid retrieval components...
Loading dense embeddings from embeddings/dense.npy...


Loaded dense embeddings shape: (889289, 384)
Loading TF-IDF vectorizer from embeddings/tfidf_vectorizer.pkl...
TF-IDF vocabulary size: 10000
Loading documents from data/processed_docs.jsonl...


Loading documents: 889289it [00:29, 30606.67it/s]


Loaded 889289 documents
Loading sentence transformer model: sentence-transformers/all-MiniLM-L6-v2
Using device: cuda
Loading learned combination weights...
Learned weights loaded: dense=0.469, sparse=0.339, boost=0.192
All components loaded successfully!
Loaded 889289 documents
Dense embeddings shape: (889289, 384)
TF-IDF vocabulary size: 10000
Base model: sentence-transformers/all-MiniLM-L6-v2
Device: cuda


In [13]:
# Example query - using metadata-rich format that matches evaluation
query = "what are the roles of the computers"

print(f"Query: '{query}'")
print(f"Searching with base model...")

# Retrieve with base model
base_results = hybrid_retrieve(
    query=query,
    components=base_components,
    top_k=5
)

print(f"\n=== BASE MODEL RESULTS (Top 5) ===")
for i, doc in enumerate(base_results):
    print(f"\n{i+1}. Title: {doc['metadata']['title'][:80]}...")
    print(f"   Final Score: {doc['scores']['final_score']:.4f}")
    print(f"   Dense: {doc['scores']['dense_score']:.4f} | Sparse: {doc['scores']['sparse_score']:.4f} | Boost: {doc['scores']['boost_score']:.4f}")
    if 'year' in doc['metadata']:
        print(f"   Year: {doc['metadata']['year']} | Venue: {doc['metadata'].get('venue', 'N/A')}")

Query: 'what are the roles of the computers'
Searching with base model...
Retrieving top-5 documents for query: 'what are the roles of the computers...'
Encoding query...
Computing dense similarities (top-1000)...


Computing sparse similarities...
Applying metadata boosting...
Combining scores with learned weights...
   Learned weights: dense=0.469, sparse=0.339, boost=0.192
Retrieved 5 documents

=== BASE MODEL RESULTS (Top 5) ===

1. Title: Computer's Role in Crimes...
   Final Score: 0.5686
   Dense: 0.5948 | Sparse: 0.5830 | Boost: 1.0000
   Year: 2008 | Venue: 

2. Title: Small digital computers to assist large digital computers...
   Final Score: 0.5057
   Dense: 0.4595 | Sparse: 0.6146 | Boost: 1.0000
   Year: 1954 | Venue: AIEE-IRE '54 (Eastern)

3. Title: Computers: Tools for Knowledge Workers...
   Final Score: 0.4883
   Dense: 0.5442 | Sparse: 0.4544 | Boost: 1.0000
   Year: 1993 | Venue: 

4. Title: Technology and innovation with notebook computers...
   Final Score: 0.4740
   Dense: 0.4897 | Sparse: 0.4945 | Boost: 1.0000
   Year: 1997 | Venue: 

5. Title: Effect of Technology on Near Term Computer Structures...
   Final Score: 0.4639
   Dense: 0.5823 | Sparse: 0.3415 | Boost: 1.0000

In [14]:
# Load fine-tuned components
print("Loading fine-tuned components...")

# Start with base components
ft_components = load_retrieval_components()

# Replace with fine-tuned model and embeddings
config = ft_components['config']
ft_model_path = config['finetune']['output_path']
ft_embeddings_path = config['embeddings']['dense_finetuned_path']

print(f"Loading fine-tuned model from: {ft_model_path}")
print(f"Loading fine-tuned embeddings from: {ft_embeddings_path}")

# Load fine-tuned model
ft_components['sentence_model'] = SentenceTransformer(ft_model_path)

# Load fine-tuned embeddings
ft_components['dense_embeddings'] = np.load(ft_embeddings_path)

print(f"Fine-tuned model loaded")
print(f"Fine-tuned embeddings shape: {ft_components['dense_embeddings'].shape}")
print(f"Ready for improved retrieval!")

Loading fine-tuned components...
Loading hybrid retrieval components...
Loading dense embeddings from embeddings/dense.npy...
Loaded dense embeddings shape: (889289, 384)
Loading TF-IDF vectorizer from embeddings/tfidf_vectorizer.pkl...
TF-IDF vocabulary size: 10000
Loading documents from data/processed_docs.jsonl...


Loading documents: 889289it [00:25, 35250.64it/s]


Loaded 889289 documents
Loading sentence transformer model: sentence-transformers/all-MiniLM-L6-v2
Using device: cuda
Loading learned combination weights...
Learned weights loaded: dense=0.469, sparse=0.339, boost=0.192
All components loaded successfully!
Loading fine-tuned model from: finetune/model/
Loading fine-tuned embeddings from: embeddings/dense_finetuned.npy
Fine-tuned model loaded
Fine-tuned embeddings shape: (889289, 384)
Ready for improved retrieval!


In [15]:
# Same query with fine-tuned model
query = "what are the roles of the computers"
print(f"Query: '{query}'")
print(f"Searching with fine-tuned model...")

ft_results = hybrid_retrieve(
    query=query,
    components=ft_components,
    top_k=100
)

print(f"\n=== FINE-TUNED MODEL RESULTS (Top 5) ===")
for i, doc in enumerate(ft_results):
    print(f"\n{i+1}. Title: {doc['metadata']['title'][:80]}...")
    print(f"   Final Score: {doc['scores']['final_score']:.4f}")
    print(f"   Dense: {doc['scores']['dense_score']:.4f} | Sparse: {doc['scores']['sparse_score']:.4f} | Boost: {doc['scores']['boost_score']:.4f}")
    if 'year' in doc['metadata']:
        print(f"   Year: {doc['metadata']['year']} | Venue: {doc['metadata'].get('venue', 'N/A')}")

Query: 'what are the roles of the computers'
Searching with fine-tuned model...
Retrieving top-100 documents for query: 'what are the roles of the computers...'
Encoding query...
Computing dense similarities (top-1000)...
Computing sparse similarities...
Applying metadata boosting...
Combining scores with learned weights...
   Learned weights: dense=0.469, sparse=0.339, boost=0.192
Retrieved 100 documents

=== FINE-TUNED MODEL RESULTS (Top 5) ===

1. Title: Computer's Role in Crimes...
   Final Score: 0.6966
   Dense: 0.8235 | Sparse: 0.5830 | Boost: 1.0000
   Year: 2008 | Venue: 

2. Title: THE ROLE OF COMPUTER IN DESKTOP PUBLISHING...
   Final Score: 0.5699
   Dense: 0.7601 | Sparse: 0.3576 | Boost: 1.0000
   Year: 2010 | Venue: 

3. Title: The Role of the Computer in the School as Perceived by Computer Using Teachers a...
   Final Score: 0.4959
   Dense: 0.7672 | Sparse: 0.1647 | Boost: 1.0000
   Year: 1996 | Venue: 

4. Title: Roles as a Coordination Construct...
   Final Score: 0.

In [16]:
# Example 1: Hard filters (exclude unwanted documents)
hard_filters = {
    "min_year": 2005,  # Only papers from 2018 onwards
    "excluded_venues": ["workshop"]  # Exclude workshop papers
}

print("=== QUERY WITH HARD FILTERS ===")
print(f"Filters: Only papers from 2018+, exclude workshops")

filtered_results = hybrid_retrieve(
    query=query,
    components=ft_components,
    user_filters=hard_filters,
    top_k=5
)

print(f"\nResults with hard filters:")
for i, doc in enumerate(filtered_results):
    year = doc['metadata'].get('year', 'N/A')
    venue = doc['metadata'].get('venue', 'N/A')
    print(f"{i+1}. {doc['metadata']['title'][:60]}...")
    print(f"   Year: {year} | Venue: {venue} | Score: {doc['scores']['final_score']:.4f}")

=== QUERY WITH HARD FILTERS ===
Filters: Only papers from 2018+, exclude workshops
Retrieving top-5 documents for query: 'what are the roles of the computers...'
Encoding query...
Computing dense similarities (top-1000)...
Applying hard filters...
700 candidates after filtering
Computing sparse similarities...
Applying metadata boosting...
Combining scores with learned weights...
   Learned weights: dense=0.469, sparse=0.339, boost=0.192
Retrieved 5 documents

Results with hard filters:
1. Computer's Role in Crimes...
   Year: 2008 | Venue:  | Score: 0.6966
2. THE ROLE OF COMPUTER IN DESKTOP PUBLISHING...
   Year: 2010 | Venue:  | Score: 0.5699
3. Roles as a Coordination Construct...
   Year: 2006 | Venue:  | Score: 0.4930
4. Roles in a Software Project...
   Year: 2009 | Venue: EuroPLoP | Score: 0.4635
5. The role of computer software in presenting information....
   Year: 2009 | Venue: Nursing management | Score: 0.4332


In [8]:
# Example 2: Soft boosting (prefer certain documents)
boost_filters = {
    "venue": "NIPS",  # Boost NIPS papers
    "field": "machine learning",  # Boost ML papers
    "year_after": 2019  # Boost recent papers
}

print("\n=== QUERY WITH SOFT BOOSTING ===")
print(f"Boosting: NIPS papers, ML field, recent papers (2019+)")

boosted_results = hybrid_retrieve(
    query=query,
    components=ft_components,
    user_filters=boost_filters,
    top_k=5
)

print(f"\nResults with boosting:")
for i, doc in enumerate(boosted_results):
    year = doc['metadata'].get('year', 'N/A')
    venue = doc['metadata'].get('venue', 'N/A')
    fields = doc['metadata'].get('fieldsOfStudy', [])
    print(f"{i+1}. {doc['metadata']['title'][:60]}...")
    print(f"   Year: {year} | Venue: {venue}")
    print(f"   Fields: {fields[:2]}...")  # Show first 2 fields
    print(f"   Score: {doc['scores']['final_score']:.4f} (Boost: {doc['scores']['boost_score']:.4f})")


=== QUERY WITH SOFT BOOSTING ===
Boosting: NIPS papers, ML field, recent papers (2019+)
Retrieving top-5 documents for query: 'what is machine learning...'
Encoding query...
Computing dense similarities (top-1000)...
Computing sparse similarities...
Applying metadata boosting...
Combining scores with learned weights...
   Learned weights: dense=0.469, sparse=0.339, boost=0.192
Retrieved 5 documents

Results with boosting:
1. Machine learning in the optimization of robotics in the oper...
   Year: 2020 | Venue: Current opinion in urology
   Fields: ['Medicine']...
   Score: 0.7369 (Boost: 1.0200)
2. Machine Learning for IoT...
   Year: 2020 | Venue: 
   Fields: ['Computer Science']...
   Score: 0.7310 (Boost: 1.0200)
3. A study on machine learning web service...
   Year: 2017 | Venue: 2017 International Conference on Information and Communication Technology Convergence (ICTC)
   Fields: ['Computer Science']...
   Score: 0.7294 (Boost: 1.0000)
4. Machine Learning Techniques for Wireless

In [10]:
# Analyze scoring components without metadata boosting 

def analyze_scores(results, model_name):
    print(f"\n=== SCORE ANALYSIS: {model_name} ===")
    
    dense_scores = [doc['scores']['dense_score'] for doc in results]
    sparse_scores = [doc['scores']['sparse_score'] for doc in results]
    boost_scores = [doc['scores']['boost_score'] for doc in results]
    final_scores = [doc['scores']['final_score'] for doc in results]
    
    print(f"Dense scores  - Min: {min(dense_scores):.4f}, Max: {max(dense_scores):.4f}, Avg: {np.mean(dense_scores):.4f}")
    print(f"Sparse scores - Min: {min(sparse_scores):.4f}, Max: {max(sparse_scores):.4f}, Avg: {np.mean(sparse_scores):.4f}")
    print(f"Boost scores  - Min: {min(boost_scores):.4f}, Max: {max(boost_scores):.4f}, Avg: {np.mean(boost_scores):.4f}")
    print(f"Final scores  - Min: {min(final_scores):.4f}, Max: {max(final_scores):.4f}, Avg: {np.mean(final_scores):.4f}")
    
    # Show scoring weights
    config = ft_components['config']
    print(f"\nScoring weights: Dense={config['scoring']['dense_weight']}, Sparse={config['scoring']['sparse_weight']}, Boost={config['scoring']['boost_weight']}")

# Analyze both models
analyze_scores(base_results, "BASE MODEL")
analyze_scores(ft_results, "FINE-TUNED MODEL")


=== SCORE ANALYSIS: BASE MODEL ===
Dense scores  - Min: 0.5843, Max: 0.6380, Avg: 0.6129
Sparse scores - Min: 0.7134, Max: 0.8464, Avg: 0.7524
Boost scores  - Min: 1.0000, Max: 1.0000, Avg: 1.0000
Final scores  - Min: 0.6312, Max: 0.6828, Avg: 0.6472


KeyError: 'scoring'