In [None]:
!pip install pymongo
!pip install sentence_transformers



In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
from datetime import datetime
import time
import json
import gc
import torch
import pandas as pd

In [None]:
from sentence_transformers import SentenceTransformer

# Load the embedding model once at kernel scope so that every call to
# embed_query() reuses this instance instead of reloading the weights.
embedding_model = SentenceTransformer(
    "nomic-ai/nomic-embed-text-v1.5",
    # NOTE(review): trust_remote_code=True presumably allows code shipped with
    # the model repository to run locally -- confirm this is acceptable here.
    trust_remote_code=True
)

def embed_query(query_text, prefix="classification: "):
    """Embed one query string with the shared ``embedding_model``.

    Args:
        query_text: Raw query text to embed.
        prefix: Task prefix prepended to the text before encoding. Defaults to
            ``"classification: "``, the value this notebook has always used, so
            existing callers are unaffected. Other prefixes (for example
            ``"search_query: "``) can be passed to compare task modes.
            NOTE(review): the prefix presumably has to be compatible with the
            one used when the stored document embeddings were created --
            confirm before switching it.

    Returns:
        The embedding converted to a plain Python list via ``.tolist()``. The
        encoder is asked for normalized embeddings (``normalize_embeddings=True``).
    """
    return embedding_model.encode(
        prefix + query_text,
        normalize_embeddings=True
    ).tolist()




In [None]:
import os

import pymongo

# Connection settings are read from the environment so real credentials never
# have to be typed into (or saved with) the notebook. The fallbacks are the
# original placeholder values, so nothing changes when the variables are unset.
db_username = os.environ.get("MONGODB_USERNAME", "username")
db_password = os.environ.get("MONGODB_PASSWORD", "password")
# The original used an f-string without placeholders; a plain string is equivalent.
URI = os.environ.get("MONGODB_URI", "url")

client = pymongo.MongoClient(URI)

In [None]:
# Resolve the database and collection handles that the remaining cells use.
db = client.get_database("project3")
collection = db.get_collection("wikipedia_holdings")
print("Connected to MongoDB.")

Connected to MongoDB.


In [None]:
# Step 1 of building the production summary text: select the documents that
# already have summary material points but no "prod_text" field yet, and fetch
# only the summary fields needed to assemble that text.

todo_filter = dict([
    ("SUMMARY_material_points", {"$exists": True}),
    ("prod_text", {"$exists": False}),
])

projection = dict.fromkeys(
    (
        "SUMMARY_business_description",
        "SUMMARY_material_points",
        "SUMMARY_investment_industry",
        "SUMMARY_investment_exposure",
    ),
    1,
)

cursor = collection.find(filter=todo_filter, projection=projection)

In [None]:
def _field_to_text(value):
    """Render one summary field as plain text suitable for text indexing.

    ``None`` becomes an empty string and lists/tuples are flattened by joining
    their items with spaces; any other value goes through ``str()`` exactly as
    before. This avoids writing the literal strings ``"None"`` / ``"[]"`` or a
    Python list repr (brackets and quotes) into the indexed text.
    """
    if value is None:
        return ""
    if isinstance(value, (list, tuple)):
        return " ".join(_field_to_text(item) for item in value)
    return str(value)


def build_prod_text(doc):
    """Concatenate the four summary fields of ``doc`` into one space-joined string.

    Field order matches the original cell: business description, material
    points, investment industry, investment exposure. Missing fields contribute
    an empty string.
    """
    summary_fields = (
        "SUMMARY_business_description",
        "SUMMARY_material_points",
        "SUMMARY_investment_industry",
        "SUMMARY_investment_exposure",
    )
    return " ".join(_field_to_text(doc.get(field)) for field in summary_fields)


# One UpdateOne per pending document; the cursor is consumed by this loop.
updates = []
for doc in cursor:
    updates.append(
        pymongo.UpdateOne(
            {"_id": doc["_id"]},
            {"$set": {"prod_text": build_prod_text(doc)}}
        )
    )

if updates:
    collection.bulk_write(updates)
    print(f"Summary text of {len(updates)} documents updated successfully.")
else:
    print("No documents needed updating.")

No documents needed updating.


Now that the production embeddings from the previous part are stored in MongoDB, we can begin writing the search code.

In [None]:
# Index 1 — Basic Sparse Search Index (lrcm_sparse)
# Create this index manually in MongoDB Atlas
#
# The definition below maps only the "prod_text" field (as a string) and turns
# dynamic mapping off. It names no analyzer; the baseline evaluation cell later
# in this notebook labels this configuration "lucene.standard".
# The string is kept as the cell's value purely for reference -- nothing here
# creates the index.

"""
{
  "mappings": {
    "dynamic": false,
    "fields": {
      "prod_text": {
        "type": "string"
      }
    }
  }
}
"""


'\n{\n  "mappings": {\n    "dynamic": false,\n    "fields": {\n      "prod_text": {\n        "type": "string"\n      }\n    }\n  }\n}\n'

In [None]:
# Index 2 — English Analyzer Sparse (lrcm_sparse_english)
# Create this index manually in MongoDB Atlas
#
# Same "prod_text"-only mapping as the basic sparse index, but with
# "lucene.english" set as both the index-time analyzer ("analyzer") and the
# query-time analyzer ("searchAnalyzer").
# The string is kept as the cell's value purely for reference -- nothing here
# creates the index.
"""
{
  "analyzer": "lucene.english",
  "searchAnalyzer": "lucene.english",
  "mappings": {
    "dynamic": false,
    "fields": {
      "prod_text": {
        "type": "string"
      }
    }
  }
}
"""


'\n{\n  "analyzer": "lucene.english",\n  "searchAnalyzer": "lucene.english",\n  "mappings": {\n    "dynamic": false,\n    "fields": {\n      "prod_text": {\n        "type": "string"\n      }\n    }\n  }\n}\n'

In [None]:
# Index 3 — Dense Vector Search (lrcm_dense)
# Create this index manually in MongoDB Atlas
#
# Vector index over "production_embedding.embedding" -- the same path that
# search_dense() passes to $vectorSearch -- with cosine similarity.
# NOTE(review): numDimensions (768) presumably has to equal the length of the
# vectors returned by embed_query(); confirm against the embedding model.
# The string is kept as the cell's value purely for reference -- nothing here
# creates the index.

"""
{
  "fields": [
    {
      "path": "production_embedding.embedding",
      "numDimensions": 768,
      "similarity": "cosine",
      "type": "vector"
    }
  ]
}
"""


'\n{\n  "fields": [\n    {\n      "path": "production_embedding.embedding",\n      "numDimensions": 768,\n      "similarity": "cosine",\n      "type": "vector"\n    }\n  ]\n}\n'

### Sparse Search Implementation

In [None]:
# Ad-hoc look at the basic sparse index: run one text query, keep the top five
# hits, and count how often each query term literally occurs in every hit.
query_text = "quantum computer"

search_stage = {
    "$search": {
        "index": "lrcm_sparse",
        "text": {"query": query_text, "path": "prod_text"}
    }
}
project_stage = {
    "$project": {
        "_id": 1,
        "ticker": 1,
        "name": 1,
        "prod_text": 1,
        "score": {"$meta": "searchScore"}
    }
}
text_search_pipeline = [search_stage, project_stage, {"$limit": 5}]

# Run the query and load the hits into a frame
res = pd.DataFrame(collection.aggregate(text_search_pipeline))

# Per-hit diagnostics: case-insensitive term counts and raw text length
lowered_text = res["prod_text"].str.lower()
res["quantum_count"] = lowered_text.str.count("quantum")
res["computer_count"] = lowered_text.str.count("computer")
res["doc_length"] = res["prod_text"].str.len()

# Show everything except the (long) prod_text column
display_columns = [
    "_id",
    "ticker",
    "name",
    "score",
    "quantum_count",
    "computer_count",
    "doc_length",
]
res[display_columns]

Unnamed: 0,_id,ticker,name,score,quantum_count,computer_count,doc_length
0,690c2f0530de78ca9390ab7d,JBHT,JB HUNT TRANSPORT SERVICES INC,3.100106,1,0,937
1,690c2f0530de78ca9390ab2c,CDW,CDW CORP,2.587149,0,1,704
2,690c2f0530de78ca9390abb8,U,UNITY SOFTWARE INC,2.576286,0,2,1993
3,690c2f0530de78ca9390a98e,GOOG,ALPHABET INC CLASS C,2.296149,1,0,1800
4,690c2f0530de78ca9390ab90,ZBRA,ZEBRA TECHNOLOGIES CORP CLASS A,2.208602,0,1,1262


In [None]:
def search_sparse(collection, query_text, k=10, index_name="lrcm_sparse_english"):
    """Run an Atlas Search text query (described as BM25 in this notebook) on ``prod_text``.

    Args:
        collection: Object exposing ``aggregate(pipeline)``.
        query_text: Text to search for.
        k: Maximum number of hits to return.
        index_name: Name of the Atlas Search index to query.

    Returns:
        pandas.DataFrame built from the hits, projected to ``ticker``, ``name``
        and the search ``score`` (``_id`` is excluded).
    """
    search_stage = {
        "$search": {
            "index": index_name,
            "text": {"query": query_text, "path": ["prod_text"]},
        }
    }
    limit_stage = {"$limit": k}
    project_stage = {
        "$project": {
            "_id": 0,
            "ticker": 1,
            "name": 1,
            "score": {"$meta": "searchScore"},
        }
    }

    hits = collection.aggregate([search_stage, limit_stage, project_stage])
    return pd.DataFrame(list(hits))

### Dense Search Implementation

In [None]:
def search_dense(collection, query_text, k=10, index_name="lrcm_dense", num_candidates=None):
    """Execute a semantic vector search with ``$vectorSearch``.

    The query is embedded with ``embed_query`` and matched against the vectors
    stored under ``production_embedding.embedding``.

    Args:
        collection: Object exposing ``aggregate(pipeline)``.
        query_text: Text to embed and search for.
        k: Number of results to return (the stage's ``limit``).
        index_name: Name of the vector search index to query.
        num_candidates: Value for the stage's ``numCandidates``. When ``None``
            (the default) it is ``k + 50``, the value this function always used.

    Returns:
        pandas.DataFrame built from the hits, projected to ``ticker``, ``name``
        and the vector search ``score``. ``_id`` is not excluded by the
        projection, unlike in ``search_sparse``.
    """
    if num_candidates is None:
        num_candidates = k + 50

    embedding = embed_query(query_text)

    pipeline = [
        {
            "$vectorSearch": {
                "index": index_name,
                "path": "production_embedding.embedding",
                "queryVector": embedding,
                "numCandidates": num_candidates,
                "limit": k
            }
        },
        {
            "$project": {
                "ticker": 1,
                "name": 1,
                "score": {"$meta": "vectorSearchScore"}
            }
        }
    ]

    results = list(collection.aggregate(pipeline))
    return pd.DataFrame(results)


### Hybrid Search: The Best of Both Worlds

In [None]:
def search_hybrid_manual(collection, query_text, k=10, rrf_k=60, candidate_k=50):
    """Fuse sparse and dense rankings with Reciprocal Rank Fusion (RRF).

    Each retriever is asked for ``candidate_k`` results. A ticker at zero-based
    position ``rank`` in a list contributes ``1 / (rrf_k + rank + 1)``, and the
    contributions from both lists are summed per ticker. Rows whose ticker is
    missing are dropped before ranks are assigned.

    Args:
        collection: Collection handle forwarded to ``search_sparse`` and
            ``search_dense``.
        query_text: Query forwarded to both retrievers.
        k: Number of fused results to return.
        rrf_k: RRF smoothing constant (default 60, the previous fixed value).
        candidate_k: Results requested from each retriever (default 50, the
            previous fixed value).

    Returns:
        pandas.DataFrame with columns ``ticker`` and ``score``, sorted by fused
        score in descending order; empty (with those columns) if neither
        retriever returned a ticker.
    """
    ranked_lists = (
        search_sparse(collection, query_text, k=candidate_k),
        search_dense(collection, query_text, k=candidate_k),
    )

    rrf_scores = {}
    for results in ranked_lists:
        # A retriever with no hits yields a frame without a "ticker" column;
        # skip it instead of failing with KeyError.
        if "ticker" not in results.columns:
            continue
        for rank, ticker in enumerate(results["ticker"].dropna()):
            key = str(ticker)
            rrf_scores[key] = rrf_scores.get(key, 0) + 1.0 / (rrf_k + rank + 1)

    sorted_items = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)

    return pd.DataFrame(sorted_items[:k], columns=["ticker", "score"])


### PART-4.4: Evaluation Framework

In [None]:
# Hand-written evaluation set for the search experiments.
# Each entry is a dict with three keys:
#   'query'    - the text sent to the search functions,
#   'expected' - list of ticker symbols treated as relevant for that query,
#   'theme'    - short label used in the evaluation reports.
# The "# --- ... ---" comments mark the start of a group of related entries.
# Not every expected ticker necessarily exists in the collection; the coverage
# cell that follows counts how many matching documents there are per query.
evaluation_set = [
    {
    # --- AI & Computing Infrastructure ---
        'query': 'artificial intelligence hardware acceleration',
        'expected': ['NVDA', 'AMD', 'INTC', 'QCOM', 'MRVL', 'TSM', 'AVGO', 'ASML', 'MU', 'XLNX', 'AMAT', 'LRCX'],
        'theme': 'AI Hardware'
    },
    {
        'query':'hyperscale cloud infrastructure',
        'expected': ['AMZN', 'MSFT', 'GOOG', 'GOOGL', 'ORCL', 'IBM', 'DELL', 'HPE', 'EQIX', 'DLR', 'AMT', 'CCI', 'SBAC'],
        'theme': 'Cloud Infrastructure'
    },
    {
        'query':'business intelligence automation platforms',
        'expected': ['CRM', 'NOW', 'SNOW', 'MDB', 'PLTR', 'ADBE', 'SAP', 'WDAY', 'DDOG', 'AI', 'PATH', 'OKTA'],
        'theme': 'Enterprise AI Software'
    },
    {
        'query':'thermal management data centers',
        'expected': ['VRT', 'JCI', 'TT', 'CARR', 'MODG', 'NVENT', 'SMCI', 'CWT'],
        'theme': 'Data Center Cooling'
    },
    {
    # --- Clean Energy & Power ---
        'query':'photovoltaic energy generation',
        'expected': ['ENPH', 'SEDG', 'FSLR', 'RUN', 'SPWR', 'CSIQ', 'ARRY', 'NOVA', 'MAXN', 'JKS', 'DQ', 'SOL'],
        'theme': 'Solar Energy'
    },
    {
        'query':'fission reactor electricity utilities',
        'expected': ['CEG', 'VST', 'ETR', 'D', 'SO', 'DUK', 'NEE', 'AEP', 'EXC', 'PEG', 'FE', 'ES'],
        'theme': 'Nuclear Power'
    },
    {
        'query':'electrical grid stabilization technology',
        'expected': ['TSLA', 'FLNC', 'PLUG', 'ENPH', 'ALB', 'STEM', 'EOSE', 'GWH', 'FREY', 'BE', 'CHPT', 'BLNK'],
        'theme': 'Energy Storage'
    },
    {
        'query':'offshore renewable power generation',
        'expected': ['GEV', 'NEE', 'AES', 'BEP', 'CWEN', 'AY', 'TPIC', 'SHLS'],
        'theme': 'Wind Energy'
    },
    {
    # --- Electric Vehicles & Autonomous ---
        'query':'battery powered passenger vehicles',
        'expected': ['TSLA', 'RIVN', 'LCID', 'NIO', 'GM', 'F', 'LI', 'XPEV', 'STLA', 'VFS', 'PTRA', 'GOEV', 'ARVL'],
        'theme': 'Electric Vehicles'
    },
    {
        'query':'self driving sensor technology',
        'expected': ['TSLA', 'GM', 'GOOGL', 'INTC', 'MBLY', 'LAZR', 'AEVA', 'OUST', 'INVZ', 'LIDR', 'AUR', 'VLDR'],
        'theme': 'Autonomous Driving'
    },
    {
    # --- Fintech & Digital Payments ---
        'query':'electronic transaction processing',
        'expected': ['V', 'MA', 'PYPL', 'SQ', 'ADYE', 'GPN', 'FIS', 'FISV', 'FOUR', 'TOST', 'PAY', 'PAYO', 'DLO'],
        'theme': 'Digital Payments'
    },
    {
        'query':'installment lending platforms',
        'expected': ['AFRM', 'SQ', 'PYPL', 'SOFI', 'UPST', 'MQ', 'LC', 'BILL', 'ZIP', 'SEZL'],
        'theme': 'BNPL'
    },
    {
        'query':'digital asset trading platforms',
        'expected': ['COIN', 'HOOD', 'SOFI', 'PYPL', 'SQ', 'IBKR', 'SCHW', 'VIRT'],
        'theme': 'Crypto Trading'
    },
    {
    # --- Cybersecurity ---
        'query':'enterprise threat prevention systems',
        'expected': ['CRWD', 'PANW', 'ZS', 'FTNT', 'S', 'CYBR', 'CHKP', 'TENB', 'RPD', 'QLYS', 'VRNS', 'FEYE'],
        'theme': 'Cybersecurity'
    },
    {
        'query':'identity verification access management',
        'expected': ['ZS', 'OKTA', 'CRWD', 'PANW', 'NET', 'CYBR', 'PING', 'TENB', 'DUO', 'SAIL'],
        'theme': 'Zero Trust'
    },
    {
    # NOTE(review): this entry has no group comment of its own; it looks like a
    # separate (health-related) group rather than part of Cybersecurity -- confirm.

        'query':'remote patient care technology',
        'expected': ['TDOC', 'AMWL', 'DOCS', 'HIMS', 'ONEM', 'GDRX', 'OSCR', 'CVS', 'UNH'],
        'theme': 'Digital Health'
    },
    {
    # --- Quantum & Advanced Computing ---
        'query':'superposition based computing',
        'expected': ['IBM', 'GOOGL', 'MSFT', 'IONQ', 'RGTI', 'QTUM', 'HON', 'HPE', 'QBTS'],
        'theme': 'Quantum Computing'
    },
    {
        'query':'parallel processing supercomputers',
        'expected': ['NVDA', 'AMD', 'INTC', 'HPE', 'DELL', 'CRAY', 'SMCI', 'PSTG', 'NTAP'],
        'theme': 'HPC'
    },
    {
    # --- Robotics & Automation ---
        'query': 'industrial process automation',
        'expected': ['ROK', 'ABB', 'EMR', 'TER', 'CGNX', 'ISRG', 'MKSI', 'NOVT', 'ZBRA', 'ADSK', 'PTC'],
        'theme': 'Robotics & Automation'
    },
    {
        'query':'fulfillment center optimization',
        'expected': ['AMZN', 'AIOT', 'TGT', 'WMT', 'HD', 'FAST', 'GXO', 'ODFL'],
        'theme': 'Warehouse Automation'
    },
    {
    # --- Metaverse & Gaming ---
        'query':'immersive digital environments',
        'expected': ['META', 'AAPL', 'RBLX', 'U', 'MSFT', 'SONY', 'SNAP', 'MTTR', 'VUZI', 'IMMR', 'TTWO', 'EA', 'ATVI'],
        'theme': 'Metaverse & Gaming'
    },
    {
        'query':'competitive gaming platforms',
        'expected': ['TTWO', 'EA', 'ATVI', 'NTDOY', 'RBLX', 'U', 'DKNG', 'PENN', 'GMBL', 'SLGG'],
        'theme': 'Gaming & Esports'
    },
    {
    # --- Traditional Sectors (Control Group) ---
        'query':'hospitality accommodation services',
        'expected': ['MAR', 'HLT', 'IHG', 'H', 'WH', 'CHH', 'PLYA', 'RHP', 'APLE'],
        'theme': 'Hotels'
    },
    {
        'query':'commercial passenger aviation',
        'expected': ['DAL', 'UAL', 'AAL', 'LUV', 'ALK', 'JBLU', 'SAVE', 'HA', 'ULCC'],
        'theme': 'Airlines'
    },
    {
        'query':'discount wholesale retail operations',
        'expected': ['WMT', 'COST', 'TGT', 'BJ', 'DG', 'DLTR', 'KR', 'ACI', 'SFM'],
        'theme': 'Big Box Retail'
    },
    {
        'query': 'quick service dining franchises',
        'expected': ['MCD', 'YUM', 'QSR', 'DPZ', 'CMG', 'SBUX', 'WEN', 'JACK', 'SHAK', 'WING'],
        'theme': 'Fast Food'
    },
    {
        'query':'residential construction supplies retail',
        'expected': ['HD', 'LOW', 'FND', 'TSCO', 'WSM', 'BBY', 'LL', 'BLDR'],
        'theme': 'Home Improvement'
    },
    {
    # --- Specific Tech Niches ---
        'query':'wireless network infrastructure',
        'expected': ['AMT', 'CCI', 'SBAC', 'VZ', 'T', 'TMUS', 'QCOM', 'NOK', 'ERIC', 'COMM', 'CIEN'],
        'theme': '5G & Edge Computing'
    },
    {
        'query':'chip fabrication equipment',
        'expected': ['ASML', 'AMAT', 'LRCX', 'KLAC', 'TER', 'ENTG', 'ONTO', 'ACLS', 'NVMI', 'UCTT'],
        'theme': 'Semiconductor Equipment'
    },
    {
        'query':'data storage solutions',
        'expected': ['MU', 'WDC', 'STX', 'NAND', 'INTC', 'SK', 'SMCI', 'PSTG', 'NTAP'],
        'theme': 'Memory & Storage'
    },
    {
    # --- Abstract/Conceptual Queries (True Semantic Test) ---

        'query':'machine learning infrastructure stack',
        'expected': ['NVDA', 'GOOGL', 'MSFT', 'AMZN', 'META', 'PLTR', 'SNOW', 'MDB'],
        'theme': 'AI Infrastructure Stack'
    },
    {
        'query':'precision medicine technology',
        'expected': ['ILMN', 'TMO', 'DHR', 'A', 'VRTX', 'REGN', 'CRSP', 'BEAM'],
        'theme': 'Precision Medicine'
    }
]

# Size of the evaluation set and number of distinct tickers it mentions.
unique_expected_tickers = {
    ticker
    for entry in evaluation_set
    for ticker in entry['expected']
}

print(f"Total evaluation queries: {len(evaluation_set)}")
print(f"Total unique tickers referenced: {len(unique_expected_tickers)}")


Total evaluation queries: 32
Total unique tickers referenced: 268


In [None]:
# Coverage check: for every evaluation query, count the collection documents
# whose ticker is one of that query's expected tickers.
for item in evaluation_set:
    ticker_filter = {'ticker': {'$in': item['expected']}}
    count = collection.count_documents(ticker_filter)
    print(item['query'], count)


artificial intelligence hardware acceleration 9
hyperscale cloud infrastructure 12
business intelligence automation platforms 10
thermal management data centers 5
photovoltaic energy generation 2
fission reactor electricity utilities 12
electrical grid stabilization technology 3
offshore renewable power generation 4
battery powered passenger vehicles 5
self driving sensor technology 4
electronic transaction processing 7
installment lending platforms 4
digital asset trading platforms 7
enterprise threat prevention systems 5
identity verification access management 6
remote patient care technology 3
superposition based computing 4
parallel processing supercomputers 8
industrial process automation 9
fulfillment center optimization 7
immersive digital environments 7
competitive gaming platforms 6
hospitality accommodation services 5
commercial passenger aviation 5
discount wholesale retail operations 9
quick service dining franchises 8
residential construction supplies retail 7
wireless net

In [None]:
def get_doc_id(row):
    """Pick an identifier for a result row.

    The keys ``ticker``, ``_id`` and ``name`` are tried in that order via
    ``row.get`` and the first truthy value is returned; if none is truthy the
    empty string is returned.
    """
    for key in ("ticker", "_id", "name"):
        candidate = row.get(key)
        if candidate:
            return candidate
    return ""


In [None]:
def calculate_metrics(results_df, expected_tickers, k=10):
    """Compute Precision@K and Reciprocal Rank@K for one ranked result list.

    Args:
        results_df: DataFrame of search results in rank order; tickers are
            read from its ``ticker`` column.
        expected_tickers: Iterable of tickers considered relevant.
        k: Cut-off rank; must be a positive integer.

    Returns:
        dict with
        ``p_at_k``  - relevant tickers among the first ``k`` rows divided by
                      ``k`` (the divisor is ``k`` even if fewer rows exist), and
        ``rr_at_k`` - ``1 / rank`` of the first relevant ticker within the
                      first ``k`` rows, or ``0.0`` if there is none.
        Both values are ``0.0`` when the frame is empty or has no ``ticker``
        column.

    Raises:
        ValueError: If ``k`` is not positive. Previously ``k=0`` surfaced as a
            ZeroDivisionError and a negative ``k`` silently produced
            meaningless values.
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # Results without any ticker field produce a frame lacking the column.
    if results_df.empty or "ticker" not in results_df.columns:
        return {"p_at_k": 0.0, "rr_at_k": 0.0}

    expected = set(expected_tickers)  # constant-time membership checks
    top_k = results_df.head(k)["ticker"].tolist()
    hits = [ticker in expected for ticker in top_k]

    # Precision@K
    precision = sum(hits) / k

    # Reciprocal Rank@K (ranks are 1-based)
    first_hit_rank = next((rank for rank, hit in enumerate(hits, 1) if hit), None)
    rr = 1.0 / first_hit_rank if first_hit_rank is not None else 0.0

    return {
        "p_at_k": precision,
        "rr_at_k": rr
    }


In [None]:
# Dense-only Evaluation
# Runs every evaluation query through search_dense (top 20) and records
# Precision@10, Precision@20 and RR@10 per query.
dense_eval_results = []

print("\nRunning Dense Search Evaluation")

for item in evaluation_set:
    query_text = item["query"]
    theme = item["theme"]
    expected = item["expected"]

    df_dense = search_dense(
        collection,
        query_text,
        k=20,
        index_name="lrcm_dense"
    )

    # NOTE: queries that return nothing are left out of the report (and thus
    # out of the means), which is also what the other evaluation cells do.
    if df_dense.empty:
        continue

    # Compute each cut-off once instead of re-running it per metric.
    metrics_at_10 = calculate_metrics(df_dense, expected, k=10)
    metrics_at_20 = calculate_metrics(df_dense, expected, k=20)

    dense_eval_results.append({
        "theme": theme,
        "query": query_text,
        "p_at_10": metrics_at_10["p_at_k"],
        "p_at_20": metrics_at_20["p_at_k"],
        "rr_at_10": metrics_at_10["rr_at_k"]
    })

if dense_eval_results:
    dense_report_df = pd.DataFrame(dense_eval_results)

    print("\nDense Search (classification prefix) Evaluation Report")
    print(dense_report_df.to_string(index=False))

    print("\nOverall Dense Search Performance")
    print(f"Mean Precision@10: {dense_report_df['p_at_10'].mean():.4f}")
    print(f"Mean Precision@20: {dense_report_df['p_at_20'].mean():.4f}")
    print(f"Mean RR@10: {dense_report_df['rr_at_10'].mean():.4f}")

    # Save CSV -- kept inside the guard so an empty run neither raises
    # NameError nor exports a stale frame from an earlier execution.
    dense_report_df.to_csv(
        "dense_search_evaluation_results.csv",
        index=False
    )


Running Dense Search Evaluation

Dense Search (classification prefix) Evaluation Report
                  theme                                         query  p_at_10  p_at_20  rr_at_10
            AI Hardware artificial intelligence hardware acceleration      0.2     0.20  1.000000
   Cloud Infrastructure               hyperscale cloud infrastructure      0.3     0.15  0.166667
 Enterprise AI Software    business intelligence automation platforms      0.1     0.20  0.500000
    Data Center Cooling               thermal management data centers      0.2     0.20  0.333333
           Solar Energy                photovoltaic energy generation      0.2     0.10  0.500000
          Nuclear Power         fission reactor electricity utilities      0.3     0.30  0.250000
         Energy Storage      electrical grid stabilization technology      0.0     0.05  0.000000
            Wind Energy           offshore renewable power generation      0.3     0.20  0.500000
      Electric Vehicles      

In [None]:
# Baseline Sparse-only Evaluation (lucene.standard)
# Every evaluation query is run against the "lrcm_sparse" index (top 20) and
# scored with Precision@10, Precision@20 and RR@10.

print("\nRunning Baseline Sparse Search Evaluation (lucene.standard)")

baseline_sparse_eval_results = []
for item in evaluation_set:
    df_sparse_baseline = search_sparse(
        collection, item["query"], k=20, index_name="lrcm_sparse"
    )

    # Queries without any hit are left out of the report.
    if df_sparse_baseline.empty:
        continue

    at_10 = calculate_metrics(df_sparse_baseline, item["expected"], k=10)
    at_20 = calculate_metrics(df_sparse_baseline, item["expected"], k=20)

    baseline_sparse_eval_results.append({
        "theme": item["theme"],
        "query": item["query"],
        "p_at_10": at_10["p_at_k"],
        "p_at_20": at_20["p_at_k"],
        "rr_at_10": at_10["rr_at_k"],
    })

if baseline_sparse_eval_results:
    baseline_sparse_report_df = pd.DataFrame(baseline_sparse_eval_results)

    # Per-query table
    print("\nBaseline Sparse (lucene.standard) Evaluation Report")
    print(baseline_sparse_report_df.to_string(index=False))

    # Means over all reported queries
    print("\nOverall Baseline Sparse Search Performance")
    print(f"Mean Precision@10: {baseline_sparse_report_df['p_at_10'].mean():.4f}")
    print(f"Mean Precision@20: {baseline_sparse_report_df['p_at_20'].mean():.4f}")
    print(f"Mean RR@10: {baseline_sparse_report_df['rr_at_10'].mean():.4f}")

    # Persist the per-query table
    baseline_sparse_report_df.to_csv(
        "baseline_sparse_search_evaluation_results.csv", index=False
    )



Running Baseline Sparse Search Evaluation (lucene.standard)

Baseline Sparse (lucene.standard) Evaluation Report
                  theme                                         query  p_at_10  p_at_20  rr_at_10
            AI Hardware artificial intelligence hardware acceleration      0.1     0.10  0.500000
   Cloud Infrastructure               hyperscale cloud infrastructure      0.4     0.20  0.200000
 Enterprise AI Software    business intelligence automation platforms      0.1     0.10  0.200000
    Data Center Cooling               thermal management data centers      0.1     0.10  0.111111
           Solar Energy                photovoltaic energy generation      0.0     0.00  0.000000
          Nuclear Power         fission reactor electricity utilities      0.4     0.20  0.250000
         Energy Storage      electrical grid stabilization technology      0.1     0.10  0.100000
            Wind Energy           offshore renewable power generation      0.3     0.15  1.000000
    

In [None]:
# Sparse Evaluation
# Every evaluation query is run against the "lrcm_sparse_english" index
# (top 20) and scored with Precision@10, Precision@20 and RR@10.

print("\nRunning Sparse Search Evaluation (BM25 English)")

sparse_eval_results = []
for item in evaluation_set:
    df_sparse = search_sparse(
        collection, item["query"], k=20, index_name="lrcm_sparse_english"
    )

    # Queries without any hit are left out of the report.
    if df_sparse.empty:
        continue

    at_10 = calculate_metrics(df_sparse, item["expected"], k=10)
    at_20 = calculate_metrics(df_sparse, item["expected"], k=20)

    sparse_eval_results.append({
        "theme": item["theme"],
        "query": item["query"],
        "p_at_10": at_10["p_at_k"],
        "p_at_20": at_20["p_at_k"],
        "rr_at_10": at_10["rr_at_k"],
    })

if sparse_eval_results:
    sparse_report_df = pd.DataFrame(sparse_eval_results)

    # Per-query table
    print("\nSparse Search (BM25 English) Evaluation Report")
    print(sparse_report_df.to_string(index=False))

    # Means over all reported queries
    print("\nOverall Sparse Search Performance")
    print(f"Mean Precision@10: {sparse_report_df['p_at_10'].mean():.4f}")
    print(f"Mean Precision@20: {sparse_report_df['p_at_20'].mean():.4f}")
    print(f"Mean RR@10: {sparse_report_df['rr_at_10'].mean():.4f}")

    # Persist the per-query table
    sparse_report_df.to_csv("sparse_search_evaluation_results.csv", index=False)



Running Sparse Search Evaluation (BM25 English)

Sparse Search (BM25 English) Evaluation Report
                  theme                                         query  p_at_10  p_at_20  rr_at_10
            AI Hardware artificial intelligence hardware acceleration      0.2     0.15  0.333333
   Cloud Infrastructure               hyperscale cloud infrastructure      0.4     0.20  0.200000
 Enterprise AI Software    business intelligence automation platforms      0.2     0.15  0.500000
    Data Center Cooling               thermal management data centers      0.0     0.10  0.000000
           Solar Energy                photovoltaic energy generation      0.0     0.05  0.000000
          Nuclear Power         fission reactor electricity utilities      0.1     0.25  0.200000
         Energy Storage      electrical grid stabilization technology      0.2     0.10  0.200000
            Wind Energy           offshore renewable power generation      0.3     0.15  0.500000
      Electric Vehicl

In [None]:
# Hybrid Evaluation
# Every evaluation query is run through search_hybrid_manual (top 20 fused
# results) and scored with Precision@10, Precision@20 and RR@10.

print("\nRunning Hybrid RRF Search Evaluation")

hybrid_eval_results = []
for item in evaluation_set:
    df_hybrid = search_hybrid_manual(collection, item["query"], k=20)

    # Queries without any fused result are left out of the report.
    if df_hybrid.empty:
        continue

    at_10 = calculate_metrics(df_hybrid, item["expected"], k=10)
    at_20 = calculate_metrics(df_hybrid, item["expected"], k=20)

    hybrid_eval_results.append({
        "theme": item["theme"],
        "query": item["query"],
        "p_at_10": at_10["p_at_k"],
        "p_at_20": at_20["p_at_k"],
        "rr_at_10": at_10["rr_at_k"],
    })

if hybrid_eval_results:
    hybrid_report_df = pd.DataFrame(hybrid_eval_results)

    # Per-query table
    print("\nHybrid RRF Evaluation Report")
    print(hybrid_report_df.to_string(index=False))

    # Means over all reported queries
    print("\nOverall Hybrid Search Performance")
    print(f"Mean Precision@10: {hybrid_report_df['p_at_10'].mean():.4f}")
    print(f"Mean Precision@20: {hybrid_report_df['p_at_20'].mean():.4f}")
    print(f"Mean RR@10: {hybrid_report_df['rr_at_10'].mean():.4f}")

    # Persist the per-query table
    hybrid_report_df.to_csv("hybrid_search_evaluation_results.csv", index=False)



Running Hybrid RRF Search Evaluation

Hybrid RRF Evaluation Report
                  theme                                         query  p_at_10  p_at_20  rr_at_10
            AI Hardware artificial intelligence hardware acceleration      0.3     0.20  1.000000
   Cloud Infrastructure               hyperscale cloud infrastructure      0.2     0.25  0.200000
 Enterprise AI Software    business intelligence automation platforms      0.2     0.20  1.000000
    Data Center Cooling               thermal management data centers      0.2     0.15  0.166667
           Solar Energy                photovoltaic energy generation      0.1     0.05  0.166667
          Nuclear Power         fission reactor electricity utilities      0.2     0.35  0.125000
         Energy Storage      electrical grid stabilization technology      0.1     0.05  0.250000
            Wind Energy           offshore renewable power generation      0.4     0.20  0.500000
      Electric Vehicles            battery powered

In [None]:
# FINAL RESULTS SCOREBOARD
# One row per experiment, each summarised by the mean of its per-query
# Precision@10 and RR@10 values.
scoreboard_sources = [
    ("1: Baseline Sparse (lrcm_sparse)", baseline_sparse_report_df),
    ("2: English Sparse (lrcm_sparse_english)", sparse_report_df),
    ("3: Dense Search (Semantic)", dense_report_df),
    ("4: Hybrid RRF (Combined)", hybrid_report_df),
]

experiment_results = [
    {
        "Experiment": label,
        "Precision@10": report_df["p_at_10"].mean(),
        "RR@10": report_df["rr_at_10"].mean(),
    }
    for label, report_df in scoreboard_sources
]

final_scoreboard_df = pd.DataFrame(experiment_results)

print("\nFINAL RESULTS SCOREBOARD\n")
print(final_scoreboard_df.to_string(index=False))

# Persist the scoreboard next to the per-experiment CSV files
final_scoreboard_df.to_csv("search_evaluation_scoreboard.csv", index=False)


FINAL RESULTS SCOREBOARD

                             Experiment  Precision@10    RR@10
       1: Baseline Sparse (lrcm_sparse)      0.168750 0.343874
2: English Sparse (lrcm_sparse_english)      0.178125 0.393403
             3: Dense Search (Semantic)      0.275000 0.583420
               4: Hybrid RRF (Combined)      0.259375 0.547693


In [None]:
# Spot check: fetch one document that has a prod_text field and show it.
sample_filter = {"prod_text": {"$exists": True}}
sample_projection = {"prod_text": 1}
collection.find_one(sample_filter, sample_projection)


{'_id': ObjectId('690c2f0530de78ca9390a9cd'),
 'prod_text': ' **Strong Historical Financial Performance:** Accenture has delivered approximately 370% total returns (including dividends) between 2015 and 2024, significantly outperforming the S&P 500, indicating robust revenue growth and financial strength.\n**Aggressive Growth Strategy via M&A:**  Since 2013, Accenture has acquired over 200 companies and established strategic partnerships (e.g., The Alan Turing Institute), demonstrating a consistent focus on expansion and capability enhancement.\n**Recent Contract Underperformance:** A 2024 management alert highlights a past contract issue (terminated in 2019) where Accenture failed to meet promised deliverables, raising concerns about project execution and client relationships.\n**D&I Policy Shift & Potential Government Contract Impact:** Accenture significantly altered its Diversity, Equity, and Inclusion (DEI) policies in 2025, discontinuing global representation goals and demographi