In [55]:
# Enable autoreload of imported modules
%load_ext autoreload
%autoreload 2
import sys
import os
# Add parent of 'physbot' to the path
sys.path.append(os.path.abspath(".."))
    
import json
import logging
from tqdm import tqdm
from pathlib import Path
from elasticsearch import Elasticsearch
from openai import OpenAI
from dotenv import load_dotenv
from physbot.path_utils import get_project_root

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [56]:
# Load environment variables from the project root
# For Jupyter: assume this notebook is in /PhysBot/notebooks
notebook_dir = Path.cwd()
project_root = get_project_root()
load_dotenv(dotenv_path=project_root / ".env")

# OpenAI API Key
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Elasticsearch settings
ES_HOST = "http://localhost:9200"
ES_INDEX = "physbot_units"

# Paths
JSON_DIR = project_root / "data" / "json"

In [57]:
# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

# Initialize Elasticsearch and OpenAI clients
es = Elasticsearch(ES_HOST)
openai_client = OpenAI(api_key=OPENAI_API_KEY)

In [58]:
def create_physbot_index(es_client: Elasticsearch, index_name: str = "physbot_units"):
    """
    Creates an Elasticsearch index for PhysBot with OpenAI-compatible embedding and rich metadata.
    """
    index_body = {
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0
        },
        "mappings": {
            "properties": {
                "unit": {"type": "keyword"},
                "section": {"type": "keyword"},
                "content": {"type": "text"},
                "embedding": {
                    "type": "dense_vector",
                    "dims": 1536,
                    "index": True,
                    "similarity": "cosine"
                },
                "source": {"type": "keyword"},
                "chunk_index": {"type": "integer"}
            }
        }
    }

    # Delete old index if it exists
    if es_client.indices.exists(index=index_name):
        print(f"🗑️ Deleting existing index '{index_name}'...")
        es_client.indices.delete(index=index_name)
        print("✅ Deleted.")
    
    # Create fresh index with correct settings
    print(f"🆕 Creating index '{index_name}'...")
    es_client.indices.create(index=index_name, body=index_body)
    print("✅ Index created successfully.")

def embed_text(text):
    """Get a 1536-dim OpenAI embedding for the given text with validation."""
    response = openai_client.embeddings.create(
        model="text-embedding-ada-002",
        input=text
    )
    embedding = response.data[0].embedding

    # Validation
    if not isinstance(embedding, list) or not all(isinstance(x, (float, int)) for x in embedding):
        raise ValueError("Embedding is not a flat list of floats.")
    if len(embedding) != 1536:
        raise ValueError(f"Expected 1536-dim embedding, got {len(embedding)}.")

    return embedding

def validate_doc(doc):
    # Check embedding format
    embedding = doc.get("embedding", [])
    if not isinstance(embedding, list):
        raise ValueError("Embedding must be a list.")
    if len(embedding) != 1536:
        raise ValueError(f"Embedding must be 1536 dimensions, got {len(embedding)}")
    if not all(isinstance(x, (float, int)) for x in embedding):
        raise ValueError("Embedding contains non-numeric values.")

    # Check required fields
    for key in ["unit", "section", "content", "source", "chunk_index"]:
        if key not in doc:
            raise ValueError(f"Missing required field: {key}")
        if key in ["unit", "section", "content", "source"] and not isinstance(doc[key], str):
            raise ValueError(f"Field {key} must be a string.")
        if key == "chunk_index" and not isinstance(doc[key], int):
            raise ValueError("chunk_index must be an integer.")

    return True

def index_unit_file(json_path):
    """Index all text chunks from a single unit JSON file."""
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    unit = data.get("unit", "Unknown")
    text_chunks = data.get("text_chunks", [])

    for i, chunk in enumerate(tqdm(text_chunks, desc=f"Indexing {json_path.name}")):
        content = chunk.get("content", "").strip()
        if not content:
            continue

        try:
            embedding = embed_text(content)
        except Exception as e:
            logging.error(f"❌ Embedding failed for chunk {i} in {json_path.name}: {str(e)}")
            continue

        doc = {
            "unit": unit,
            "section": chunk.get("section", "Unknown"),
            "content": content,
            "embedding": embedding,
            "source": json_path.name,
            "chunk_index": i
        }

        try:
            validate_doc(doc)
            es.index(index=ES_INDEX, document=doc)
        except Exception as validation_error:
            logging.error(f"❌ Failed to index chunk {i} from {json_path.name}: {validation_error}")

In [65]:
create_physbot_index(es)

2025-05-04 12:46:55,377 [INFO] HEAD http://localhost:9200/physbot_units [status:200 duration:0.013s]
2025-05-04 12:46:55,471 [INFO] DELETE http://localhost:9200/physbot_units [status:200 duration:0.092s]
  es_client.indices.create(index=index_name, body=index_body)


🗑️ Deleting existing index 'physbot_units'...
✅ Deleted.
🆕 Creating index 'physbot_units'...


2025-05-04 12:46:55,757 [INFO] PUT http://localhost:9200/physbot_units [status:200 duration:0.256s]


✅ Index created successfully.


In [66]:
json_files = sorted(JSON_DIR.glob("unit_*.json"))
for json_file in json_files:
    try:
        index_unit_file(json_file)
    except Exception as e:
        logging.error(f"❌ Failed to index {json_file.name}: {str(e)}")

Indexing unit_101_output.json:   0%|                                                      | 0/19 [00:00<?, ?it/s]2025-05-04 12:47:20,246 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-05-04 12:47:20,273 [INFO] POST http://localhost:9200/physbot_units/_doc [status:201 duration:0.021s]
Indexing unit_101_output.json:   5%|██▍                                           | 1/19 [00:00<00:10,  1.74it/s]2025-05-04 12:47:20,693 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-05-04 12:47:20,716 [INFO] POST http://localhost:9200/physbot_units/_doc [status:201 duration:0.017s]
Indexing unit_101_output.json:  11%|████▊                                         | 2/19 [00:01<00:08,  2.01it/s]2025-05-04 12:47:21,049 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-05-04 12:47:21,075 [INFO] POST http://localhost:9200/physbot_units/_doc [status:201 duration:0.020s]
Indexing unit_101_output.jso

In [68]:
from collections import defaultdict

# Aggregation query to count docs grouped by 'source' (e.g., unit_101_output.json)
query = {
    "size": 0,
    "aggs": {
        "by_source": {
            "terms": {
                "field": "source",
                "size": 1000  # Adjust as needed if you have many files
            }
        }
    }
}

response = es.search(index=ES_INDEX, body=query)
buckets = response.get("aggregations", {}).get("by_source", {}).get("buckets", [])

print("\n📊 Document Count per Unit File:\n")
for bucket in sorted(buckets, key=lambda b: b["key"]):
    print(f"{bucket['key']:<30} → {bucket['doc_count']} chunks")

  response = es.search(index=ES_INDEX, body=query)
2025-05-04 13:24:29,269 [INFO] POST http://localhost:9200/physbot_units/_search [status:200 duration:0.305s]



📊 Document Count per Unit File:

unit_101_output.json           → 19 chunks
unit_102_output.json           → 17 chunks
unit_103_output.json           → 16 chunks
unit_104_output.json           → 12 chunks
unit_105_output.json           → 19 chunks
unit_106_output.json           → 13 chunks
unit_107_output.json           → 12 chunks
unit_108_output.json           → 17 chunks
unit_109_output.json           → 16 chunks
unit_110_output.json           → 18 chunks
unit_111_output.json           → 8 chunks
unit_112_output.json           → 17 chunks
unit_113_output.json           → 7 chunks
unit_114_output.json           → 12 chunks
unit_115_output.json           → 10 chunks
unit_116_output.json           → 9 chunks
unit_117_output.json           → 10 chunks
unit_118_output.json           → 8 chunks
unit_119_output.json           → 13 chunks
unit_120_output.json           → 14 chunks
unit_121_output.json           → 8 chunks
unit_122_output.json           → 14 chunks
unit_123_output.json     