# Combining BGE-M3's Signals

Experiments from the blog post: testing hybrid scoring that combines the dense, sparse, and ColBERT scores with weights [0.4, 0.2, 0.4], respectively.

**Data source:** [SemanticSonarDB](https://github.com/MrJoeSack/sqlserver-sample-databases/tree/master/semantic-sonar-db)

In [None]:
# Install dependencies (run once)
# pip install FlagEmbedding pyodbc numpy

In [None]:
# Setup
import warnings
warnings.filterwarnings('ignore')

import pyodbc
from FlagEmbedding import BGEM3FlagModel

def get_conn():
    """Open a new pyodbc connection to the local SemanticSonarDB database.

    Connects to ``localhost`` through ODBC Driver 18 for SQL Server using a
    trusted connection, with ``TrustServerCertificate=yes``. The caller is
    responsible for closing the returned connection.
    """
    connection_string = (
        "DRIVER={ODBC Driver 18 for SQL Server};"
        "SERVER=localhost;"
        "DATABASE=SemanticSonarDB;"
        "TrustServerCertificate=yes;"
        "Trusted_Connection=yes;"
    )
    return pyodbc.connect(connection_string)

# Load the BGE-M3 checkpoint once; the cells below reuse this global `model`
# for encoding documents/queries and for ColBERT scoring.
print("Loading BGE-M3...")
# NOTE(review): use_fp16=False presumably keeps full-precision weights —
# confirm against the FlagEmbedding documentation.
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=False)
print("Done.")

In [None]:
# Load sample documents
# Fetch the first 500 listings (by property_id) that have a description.
# `docs` is a list of (property_id, listing_description) tuples used by the
# encoding and ranking cells below.
conn = get_conn()
try:
    cursor = conn.cursor()
    cursor.execute("""
    SELECT TOP 500 property_id, listing_description
    FROM PropertyListings
    WHERE listing_description IS NOT NULL
    ORDER BY property_id
""")
    docs = [(row[0], row[1]) for row in cursor.fetchall()]
finally:
    # Always release the connection, even if the query or fetch fails.
    conn.close()

print(f"Loaded {len(docs)} documents")

In [None]:
# Encode all documents with all three modes
# Encode every listing description in one call, requesting all three BGE-M3
# outputs. The ranking code later reads the 'dense_vecs', 'lexical_weights'
# and 'colbert_vecs' entries of `doc_output`, indexed in the same order as `docs`.
print("Encoding...")
doc_output = model.encode(
    [listing_text for _property_id, listing_text in docs],
    return_dense=True,
    return_sparse=True,
    return_colbert_vecs=True,
)
print("Done.")

In [None]:
# Scoring functions

def dense_score(q_emb, d_emb):
    """Return the matrix product of the query and document embeddings as a float.

    Computes ``q_emb @ d_emb.T`` and converts the result to a Python float.
    Assumes both arguments are 1-D vectors of equal length (TODO confirm);
    whether this equals cosine similarity depends on the embeddings being
    normalized, which this function does not check.
    """
    similarity = q_emb @ d_emb.T
    return float(similarity)

def sparse_score(q_sparse, d_sparse):
    """Score lexical overlap between two token -> weight mappings.

    For every token that appears in both mappings, the smaller of the two
    weights is added to the total. Tokens present on only one side contribute
    nothing, so the result is 0.0 when there is no overlap.
    """
    total = 0.0
    for term, query_weight in q_sparse.items():
        if term not in d_sparse:
            continue
        total += min(query_weight, d_sparse[term])
    return total

def colbert_score(q_vecs, d_vecs):
    """Score a query/document pair from their ColBERT multi-vector outputs.

    Thin wrapper that delegates to ``colbert_score`` on the globally loaded
    ``model`` and returns its result unchanged (no conversion to a Python
    float is performed here).
    """
    score = model.colbert_score(q_vecs, d_vecs)
    return score

def hybrid_score(dense, sparse, colbert, weights=(0.4, 0.2, 0.4)):
    """Combine the three per-mode scores into one weighted sum.

    ``weights`` is indexed as (dense, sparse, colbert). The default
    [0.4, 0.2, 0.4] is the weighting the paper recommends.
    """
    dense_term = weights[0] * dense
    sparse_term = weights[1] * sparse
    colbert_term = weights[2] * colbert
    return dense_term + sparse_term + colbert_term

---
## Finding Hybrid Discoveries

Documents that the hybrid score ranks in its top 5 but that no single mode (dense, sparse, or ColBERT) places in its own top 5.

In [None]:
def _min_max_normalize(values):
    """Rescale values to [0, 1]; constant input maps to all 0.5, empty input to []."""
    if not values:
        return []
    lo, hi = min(values), max(values)
    if hi > lo:
        return [(v - lo) / (hi - lo) for v in values]
    # No spread to scale by: give every document the same neutral score.
    return [0.5] * len(values)


def rank_query(query, top_n=5):
    """Rank the loaded documents for ``query`` under every scoring mode.

    The query is encoded with the global ``model`` and scored against each
    entry of the global ``docs`` / ``doc_output`` using dense, sparse and
    ColBERT scoring. Each mode's scores are min-max normalized across the
    documents before being combined with ``hybrid_score``.

    Args:
        query: Query text to encode.
        top_n: Number of top-ranked rows kept per mode, and the cutoff used
            when looking for hybrid discoveries.

    Returns:
        A dict with keys 'dense', 'sparse', 'colbert' and 'hybrid', each a
        list of the ``top_n`` best rows for that ranking, and 'discoveries',
        the rows in the hybrid top ``top_n`` that appear in none of the three
        single-mode top ``top_n`` lists. Every row is a tuple
        ``(property_id, text, dense_norm, sparse_norm, colbert_norm, hybrid)``.
        When ``docs`` is empty, all five lists are empty.
    """
    q_out = model.encode(
        [query],
        return_dense=True,
        return_sparse=True,
        return_colbert_vecs=True
    )
    q_dense = q_out['dense_vecs'][0]
    q_sparse = q_out['lexical_weights'][0]
    q_colbert = q_out['colbert_vecs'][0]

    # Raw (un-normalized) scores for every document under each mode.
    raw = []
    for i, (pid, text) in enumerate(docs):
        raw.append((
            pid,
            text,
            dense_score(q_dense, doc_output['dense_vecs'][i]),
            sparse_score(q_sparse, doc_output['lexical_weights'][i]),
            colbert_score(q_colbert, doc_output['colbert_vecs'][i]),
        ))

    # Put the three modes on a common [0, 1] scale so the weights are comparable.
    dense_norm = _min_max_normalize([row[2] for row in raw])
    sparse_norm = _min_max_normalize([row[3] for row in raw])
    colbert_norm = _min_max_normalize([row[4] for row in raw])

    normalized = [
        (row[0], row[1], dn, sn, cn, hybrid_score(dn, sn, cn))
        for row, dn, sn, cn in zip(raw, dense_norm, sparse_norm, colbert_norm)
    ]

    # sorted() is stable, so ties keep their original document order.
    by_dense = sorted(normalized, key=lambda x: x[2], reverse=True)
    by_sparse = sorted(normalized, key=lambda x: x[3], reverse=True)
    by_colbert = sorted(normalized, key=lambda x: x[4], reverse=True)
    by_hybrid = sorted(normalized, key=lambda x: x[5], reverse=True)

    # Property ids that at least one single mode already places in its top_n.
    single_mode_top = (
        {x[0] for x in by_dense[:top_n]}
        | {x[0] for x in by_sparse[:top_n]}
        | {x[0] for x in by_colbert[:top_n]}
    )
    discoveries = [
        item for item in by_hybrid[:top_n] if item[0] not in single_mode_top
    ]

    return {
        'dense': by_dense[:top_n],
        'sparse': by_sparse[:top_n],
        'colbert': by_colbert[:top_n],
        'hybrid': by_hybrid[:top_n],
        'discoveries': discoveries
    }

In [None]:
# Primary example from blog post
query = "spacious layout great for entertaining guests"
results = rank_query(query, top_n=5)

# One line per ranking mode, listing the property ids of its top 5.
print(f'Query: "{query}"\n')
print("\n".join(
    f"{label}{[row[0] for row in results[mode]]}"
    for label, mode in (
        ("Dense top 5:   ", 'dense'),
        ("Sparse top 5:  ", 'sparse'),
        ("ColBERT top 5: ", 'colbert'),
        ("Hybrid top 5:  ", 'hybrid'),
    )
))

# Show details only when the hybrid ranking surfaced something new.
if results['discoveries']:
    print(f"\nHybrid discoveries (not in any mode's top 5): {len(results['discoveries'])}")
    for pid, text, dn, sn, cn, h in results['discoveries']:
        print(
            f"\n  Property {pid}:\n"
            f"    Dense={dn:.3f}, Sparse={sn:.3f}, ColBERT={cn:.3f}, Hybrid={h:.3f}\n"
            f"    {text[:200]}..."
        )

---
## Testing Multiple Queries

In [None]:
queries = [
    "quiet neighborhood with character and charm",
    "spacious layout great for entertaining guests",
    "well-maintained property in desirable location",
    "modern downtown condo with parking",
    "cozy home perfect for young family",
]

for query in queries:
    results = rank_query(query, top_n=5)

    # Queries without any hybrid discovery produce no output.
    if not results['discoveries']:
        continue

    print(f"\n{query}")
    print(f"  Discoveries: {len(results['discoveries'])}")
    for pid, text, dn, sn, cn, h in results['discoveries']:
        print(f"    {pid}: hybrid={h:.3f} (d={dn:.3f}, s={sn:.3f}, c={cn:.3f})")

---
## When Hybrid Fails

In [None]:
# Query where hybrid picks the wrong answer
query = "fixer-upper with potential good bones"
results = rank_query(query, top_n=3)

print(f'Query: "{query}"\n')
print("Top 3 by hybrid:")
# Print each hybrid-ranked property with the first 150 characters of its text.
for pid, text, dn, sn, cn, h in results['hybrid']:
    print(f"\n  {pid}: hybrid={h:.3f}\n    {text[:150]}...")