In [None]:
#!/usr/bin/env python3
"""
Simple TF-IDF Search Demo
"""

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from utils import get_doc_info

print("üîç TF-IDF Search Demo")
print("=" * 50)

# Load documents from techcorp-docs
docs, doc_paths = get_doc_info()

# Create TF-IDF matrix
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(docs)

# Example searches
queries = ["remote work policy", "health insurance benefits", "pet policy dogs"]

for query in queries:
    print(f"üîé Searching for: '{query}'")
    
    # Transform query to TF-IDF
    query_vector = vectorizer.transform([query])
    
    # Calculate similarities
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    
    # Get top results
    top_indices = similarities.argsort()[-3:][::-1]
    
    print("Results:")
    for i, idx in enumerate(top_indices, 1):
        # Show only document path and score
        doc_name = doc_paths[idx].split('/')[-1]  # Just the filename
        print(f"  {i}. Score: {similarities[idx]:.4f} - {doc_name}")
    print()

print("‚úÖ TF-IDF search completed!")

BM25 search

In [None]:
#!/usr/bin/env python3
"""
Simple BM25 Search Demo
"""

from rank_bm25 import BM25Okapi
import re
from utils import get_doc_info

print("üîç BM25 Search Demo")
print("=" * 50)

# Load documents from techcorp-docs
docs, doc_paths = get_doc_info()
print(f"üìö Loaded {len(docs)} documents\n")

# Tokenize documents
tokenized_docs = [re.sub(r'[^a-zA-Z\s]', '', doc.lower()).split() for doc in docs]

# Create BM25 index
bm25 = BM25Okapi(tokenized_docs)

# Example searches
queries = ["remote work policy", "health insurance benefits", "pet policy dogs"]

for query in queries:
    print(f"üîé Searching for: '{query}'")
    
    # Tokenize query
    tokenized_query = re.sub(r'[^a-zA-Z\s]', '', query.lower()).split()
    
    # Get BM25 scores
    scores = bm25.get_scores(tokenized_query)
    
    # Get top results
    top_indices = scores.argsort()[-3:][::-1]
    
    print("Results:")
    for i, idx in enumerate(top_indices, 1):
        # Show only document path and score
        doc_name = doc_paths[idx].split('/')[-1]  # Just the filename
        print(f"  {i}. Score: {scores[idx]:.4f} - {doc_name}")
    print()

print("‚úÖ BM25 search completed!")

Hybrid search

In [None]:
#!/usr/bin/env python3
"""
Hybrid Search Implementation
Combines TF-IDF and BM25 scores with different weights
"""

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
import re
from utils import get_doc_info

def hybrid_search(query, docs, tfidf_weight=0.3, bm25_weight=0.7):
    """Score every document against ``query`` with TF-IDF, BM25 and a weighted blend.

    A fresh TF-IDF model and a fresh BM25 index are built from ``docs`` on
    every call.

    Args:
        query: Search string.
        docs: Document texts to index and score.
        tfidf_weight: Multiplier applied to the TF-IDF cosine similarities.
        bm25_weight: Multiplier applied to the max-normalised BM25 scores.

    Returns:
        ``(tfidf_scores, bm25_scores, hybrid_scores)``. The BM25 scores
        returned are the normalised ones (when normalisation was applied).
    """

    def to_tokens(text):
        # Lowercase, drop everything except ASCII letters/whitespace, split.
        return re.sub(r'[^a-zA-Z\s]', '', text.lower()).split()

    # TF-IDF side: cosine similarity between the query and each document
    model = TfidfVectorizer()
    doc_matrix = model.fit_transform(docs)
    tfidf_scores = cosine_similarity(model.transform([query]), doc_matrix).flatten()

    # BM25 side: documents and query share the same tokenisation
    bm25_index = BM25Okapi([to_tokens(doc) for doc in docs])
    bm25_scores = bm25_index.get_scores(to_tokens(query))

    # Scale BM25 by its maximum so the best document scores 1.0; skipped when
    # the maximum is not positive (avoids dividing by zero).
    peak = bm25_scores.max()
    if peak > 0:
        bm25_scores = bm25_scores / peak

    # Weighted sum of the two score vectors
    hybrid_scores = tfidf_weight * tfidf_scores + bm25_weight * bm25_scores
    return tfidf_scores, bm25_scores, hybrid_scores

def main():
    """Run one query through hybrid_search under several weightings and print the top hits."""
    print("üîç Hybrid Search Demo")
    print("=" * 50)

    # Document texts and their paths
    docs, doc_paths = get_doc_info()

    query = "remote work policy"
    print(f"üîé Testing query: '{query}'")
    print("=" * 50)

    # Each entry: (TF-IDF weight, BM25 weight, label shown in the output)
    weight_combinations = (
        (0.5, 0.5, "Equal weights"),
        (0.3, 0.7, "BM25 favored"),
        (0.7, 0.3, "TF-IDF favored"),
    )

    for tfidf_w, bm25_w, description in weight_combinations:
        print(f"\nüìä {description} (TF-IDF: {tfidf_w}, BM25: {bm25_w})")
        print("-" * 40)

        # Only the blended scores are needed for ranking here
        _, _, blended = hybrid_search(query, docs, tfidf_w, bm25_w)

        # argsort is ascending: reverse it and keep the best three
        best_first = blended.argsort()[::-1][:3]

        print("Top 3 results:")
        for rank, doc_idx in enumerate(best_first, start=1):
            # Show only the file name and the blended score
            filename = doc_paths[doc_idx].rsplit('/', 1)[-1]
            print(f"  {rank}. Score: {blended[doc_idx]:.4f} - {filename}")

    print("\n‚úÖ Hybrid search analysis completed!")

# Run the hybrid search demo only when executed as a script, not on import.
if __name__ == "__main__":
    main()


Compare search methods

In [None]:
#!/usr/bin/env python3
"""
Compare Search Methods
Demonstrates the differences between grep, TF-IDF, and BM25
"""

import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
from utils import get_doc_info

def grep_search(query, documents):
    """Rank documents by case-insensitive occurrences of ``query`` as a substring.

    The whole query is matched as one literal phrase (not word by word), and
    occurrences are counted without overlap, as ``str.count`` does.

    Args:
        query: Phrase to look for. An empty or whitespace-only query matches
            nothing.
        documents: Iterable of document strings.

    Returns:
        List of ``(document_index, match_count)`` tuples for the documents
        that contain the phrase at least once, sorted by count in descending
        order. Documents with equal counts keep their original relative order.
    """
    # "" is "in" every string and "abc".count("") == 4, so without this guard
    # a blank query would report every document with a meaningless count.
    if not query.strip():
        return []

    needle = query.lower()

    # Lowercase each document once; a positive count is equivalent to the
    # containment check, so no separate `in` test is needed.
    counted = ((idx, doc.lower().count(needle)) for idx, doc in enumerate(documents))
    results = [(idx, hits) for idx, hits in counted if hits]

    # list.sort is stable, so ties stay in document order.
    results.sort(key=lambda item: item[1], reverse=True)
    return results

def tfidf_search(query, documents):
    """Rank all documents by TF-IDF cosine similarity to ``query`` (scikit-learn).

    A new vectorizer is fitted on ``documents`` on every call.

    Returns:
        List of ``(document_index, similarity)`` pairs, one per document,
        ordered from highest to lowest similarity.
    """
    model = TfidfVectorizer()
    doc_matrix = model.fit_transform(documents)
    scores = cosine_similarity(model.transform([query]), doc_matrix).flatten()

    # Pair each document index with its score, then order best-first
    # (sorted is stable, so equal scores stay in document order).
    indexed_scores = zip(range(len(documents)), scores)
    return sorted(indexed_scores, key=lambda pair: pair[1], reverse=True)

def bm25_search(query, documents):
    """Rank all documents by BM25 score for ``query`` (rank_bm25's BM25Okapi).

    A new BM25 index is built from ``documents`` on every call.

    Returns:
        List of ``(document_index, score)`` pairs, one per document, ordered
        from highest to lowest score.
    """

    def to_tokens(text):
        # Lowercase, drop everything except ASCII letters/whitespace, split.
        return re.sub(r'[^a-zA-Z\s]', '', text.lower()).split()

    index = BM25Okapi([to_tokens(doc) for doc in documents])
    scores = index.get_scores(to_tokens(query))

    # Pair each document index with its score, then order best-first
    # (sorted is stable, so equal scores stay in document order).
    indexed_scores = zip(range(len(documents)), scores)
    return sorted(indexed_scores, key=lambda pair: pair[1], reverse=True)

def main():
    """Run one query through grep, TF-IDF and BM25 and print each method's top three."""
    print("üîç Search Methods Comparison")
    print("=" * 60)

    # Document texts; the paths are not used in this comparison
    docs, doc_paths = get_doc_info()
    print()

    query = "remote work policy"
    print(f"üîé Testing query: '{query}'")
    print("=" * 60)

    # 1) Literal phrase matching: reports occurrence counts
    print("\n1Ô∏è‚É£ GREP SEARCH (Exact keyword matching):")
    grep_hits = grep_search(query, docs)[:3]
    for position, (idx, count) in enumerate(grep_hits, start=1):
        preview = docs[idx][:80]
        print(f"  {position}. Doc {idx+1}: {count} matches - {preview}...")

    # 2) TF-IDF cosine similarity
    print("\n2Ô∏è‚É£ TF-IDF SEARCH (Term frequency-inverse document frequency):")
    tfidf_hits = tfidf_search(query, docs)[:3]
    for position, (idx, score) in enumerate(tfidf_hits, start=1):
        preview = docs[idx][:80]
        print(f"  {position}. Doc {idx+1}: Score {score:.4f} - {preview}...")

    # 3) BM25 ranking
    print("\n3Ô∏è‚É£ BM25 SEARCH (Okapi BM25 with document length normalization):")
    bm25_hits = bm25_search(query, docs)[:3]
    for position, (idx, score) in enumerate(bm25_hits, start=1):
        preview = docs[idx][:80]
        print(f"  {position}. Doc {idx+1}: Score {score:.4f} - {preview}...")

    print("\n‚úÖ Search methods comparison completed!")
    print("\nüí° Key Insights:")
    print("- Grep: Simple exact matching, good for specific terms")
    print("- TF-IDF: Balances term frequency with document rarity")
    print("- BM25: Advanced ranking with document length normalization")

# Run the search-method comparison only when executed as a script, not on import.
if __name__ == "__main__":
    main()