In [1]:
import json
import os
import numpy as np
import faiss
from dotenv import load_dotenv
import openai
import tiktoken

# Load environment variables (OPENAI_API_KEY may come from a local .env file)
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

# Instantiate OpenAI client
client = openai.OpenAI(api_key=api_key)

# Ensure output folder exists
os.makedirs("rag_data", exist_ok=True)

# Load preprocessed chunks.
# Read explicitly as UTF-8 so non-ASCII characters decode the same way on
# every platform instead of depending on the locale's default encoding.
with open("rag_data/chunks.json", "r", encoding="utf-8") as f:
    chunks = json.load(f)

texts = [chunk["text"] for chunk in chunks]

# Tokenizer setup for ada-002
encoding = tiktoken.encoding_for_model("text-embedding-ada-002")
MAX_TOKENS_PER_BATCH = 8191  # token budget for one embeddings request
MAX_TOKENS_PER_CHUNK = 8191  # token budget for a single input string
SAFETY_MARGIN = 50  # leave headroom

# Number of tokens a string occupies under the module-level tokenizer
def count_tokens(text):
    """Return the length of `text` after encoding it with `encoding`."""
    token_ids = encoding.encode(text)
    return len(token_ids)

# Split a long text into smaller parts by sentence
def split_into_chunks(text, max_tokens):
    """Greedily pack the sentences of `text` into pieces under a token budget.

    Sentences are delimited by ". ". Consecutive sentences are accumulated
    until adding the next one would reach `max_tokens - SAFETY_MARGIN`
    tokens, at which point the accumulated piece is emitted and a new one is
    started.

    Args:
        text: The string to split.
        max_tokens: Nominal token limit; SAFETY_MARGIN is subtracted from it.

    Returns:
        A list of non-empty, whitespace-stripped strings. A single sentence
        that exceeds the budget on its own is returned as its own piece
        (still oversized), so callers must be prepared to truncate it.
        Empty or whitespace-only input yields an empty list.
    """
    limit = max_tokens - SAFETY_MARGIN
    sentences = text.split('. ')
    last_position = len(sentences) - 1

    pieces = []
    current = ""
    for position, sentence in enumerate(sentences):
        # Restore the delimiter that split() consumed, but never after the
        # final sentence: the source text had no ". " there, so adding one
        # would invent a period (or produce "..").
        segment = sentence if position == last_position else sentence + '. '

        # Only flush when something has been accumulated; otherwise an
        # oversized first sentence would emit an empty piece.
        if current and count_tokens(current + segment) >= limit:
            finished = current.strip()
            if finished:
                pieces.append(finished)
            current = segment
        else:
            current += segment

    remainder = current.strip()
    if remainder:
        pieces.append(remainder)
    return pieces

# Send one batch to the embeddings endpoint and return its vectors in input order
def _embed_batch(batch, model):
    response = client.embeddings.create(input=batch, model=model)
    # Each returned item carries the position of its input; sort on it so the
    # vectors line up with `batch` regardless of the order of `response.data`.
    ordered = sorted(response.data, key=lambda item: item.index)
    return [item.embedding for item in ordered]


# Embedding with token-aware batching and long chunk splitting
def get_openai_embeddings(texts, model="text-embedding-ada-002"):
    """Embed `texts` with OpenAI, splitting long inputs and batching by tokens.

    Each text longer than `MAX_TOKENS_PER_CHUNK - SAFETY_MARGIN` tokens is
    split with `split_into_chunks`; any piece that is still too long is
    truncated to that limit. Pieces are grouped into requests whose combined
    token count stays within `MAX_TOKENS_PER_BATCH - SAFETY_MARGIN`.
    Empty or non-string pieces are skipped with a warning.

    Args:
        texts: Iterable of strings to embed.
        model: Name of the embedding model passed to the API.

    Returns:
        A tuple `(embeddings, total_tokens)`: the list of embedding vectors,
        one per piece actually sent and in sending order, and the number of
        tokens in those pieces (counted after any truncation).
    """
    chunk_limit = MAX_TOKENS_PER_CHUNK - SAFETY_MARGIN
    batch_limit = MAX_TOKENS_PER_BATCH - SAFETY_MARGIN

    embeddings = []
    batch = []
    batch_token_count = 0
    total_tokens = 0

    for text in texts:
        if count_tokens(text) > chunk_limit:
            sub_chunks = split_into_chunks(text, MAX_TOKENS_PER_CHUNK)
        else:
            sub_chunks = [text]

        for chunk in sub_chunks:
            if not isinstance(chunk, str) or not chunk.strip():
                print("⚠️ Skipped an invalid or empty chunk.")
                continue

            tokens = count_tokens(chunk)
            if tokens > chunk_limit:
                print("⚠️ Chunk still too large after splitting. Truncating.")
                encoded = encoding.encode(chunk)
                chunk = encoding.decode(encoded[:chunk_limit])
                # Re-count: decoding and re-encoding a cut token sequence
                # does not always give back exactly `chunk_limit` tokens.
                tokens = count_tokens(chunk)

            # Count only what is actually sent (i.e. after truncation).
            total_tokens += tokens

            # Flush when the next piece would overflow the batch budget.
            # `batch` must be non-empty: the API rejects an empty input list.
            if batch and batch_token_count + tokens > batch_limit:
                embeddings.extend(_embed_batch(batch, model))
                batch = []
                batch_token_count = 0

            batch.append(chunk)
            batch_token_count += tokens

    if batch:
        embeddings.extend(_embed_batch(batch, model))

    estimated_cost = total_tokens / 1000 * 0.0001
    print(f"\n📊 Total tokens embedded: {total_tokens}")
    print(f"💰 Estimated cost: ${estimated_cost:.4f}")

    return embeddings, total_tokens

print("🔄 Generating embeddings with OpenAI using token-aware batching and splitting long chunks...")
embeddings, total_tokens = get_openai_embeddings(texts)
# Build the matrix as float32 directly (the dtype FAISS expects) rather than
# materialising a float64 copy first.
embedding_matrix = np.asarray(embeddings, dtype="float32")
if embedding_matrix.ndim != 2 or embedding_matrix.shape[0] == 0:
    raise ValueError("No embeddings were generated; cannot build a FAISS index.")

# Build metadata and track per-document token usage.
# This replays the same split / skip / truncate steps used while embedding so
# that row i of the index corresponds to entry i of the metadata list.
faiss_metadata = []
embedding_index = 0
token_log_by_doc = {}

for chunk in chunks:
    text = chunk["text"]
    source_file = chunk["metadata"].get("source_file", "unknown")
    token_log_by_doc.setdefault(source_file, 0)

    if count_tokens(text) > MAX_TOKENS_PER_CHUNK - SAFETY_MARGIN:
        sub_chunks = split_into_chunks(text, MAX_TOKENS_PER_CHUNK)
    else:
        sub_chunks = [text]

    for sub_chunk in sub_chunks:
        if not isinstance(sub_chunk, str) or not sub_chunk.strip():
            continue
        if count_tokens(sub_chunk) > MAX_TOKENS_PER_CHUNK - SAFETY_MARGIN:
            encoded = encoding.encode(sub_chunk)
            sub_chunk = encoding.decode(encoded[:MAX_TOKENS_PER_CHUNK - SAFETY_MARGIN])
        token_log_by_doc[source_file] += count_tokens(sub_chunk)
        faiss_metadata.append({
            "text": sub_chunk,
            "section_header": chunk["section_header"],
            "metadata": chunk["metadata"]
        })
        embedding_index += 1

# Refuse to write anything if the two passes disagree: a mismatch would make
# every lookup after the first divergence return the wrong text.
if embedding_index != embedding_matrix.shape[0]:
    raise RuntimeError(
        f"Metadata/embedding mismatch: {embedding_index} metadata entries "
        f"for {embedding_matrix.shape[0]} embeddings."
    )

# Create FAISS index
dimension = embedding_matrix.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embedding_matrix)
faiss.write_index(index, "rag_data/faiss.index")
print("✅ FAISS index saved as rag_data/faiss.index")

# Save metadata
with open("rag_data/faiss_metadata.json", "w", encoding="utf-8") as f:
    json.dump(faiss_metadata, f, indent=2)
print("✅ Metadata saved as rag_data/faiss_metadata.json")

# Print token usage per document
print("\n📄 Token usage by document:")
doc_costs = {}
for doc, tokens in token_log_by_doc.items():
    cost = tokens / 1000 * 0.0001
    doc_costs[doc] = {"tokens": tokens, "cost": round(cost, 4)}
    print(f"- {doc}: {tokens} tokens ≈ ${cost:.4f}")

# Save cost summary
with open("rag_data/embedding_cost_summary.json", "w", encoding="utf-8") as f:
    json.dump({
        "total_tokens": total_tokens,
        "estimated_total_cost": round(total_tokens / 1000 * 0.0001, 4),
        "per_document": doc_costs
    }, f, indent=2)
print("💾 Cost summary saved as rag_data/embedding_cost_summary.json")

🔄 Generating embeddings with OpenAI using token-aware batching and splitting long chunks...

📊 Total tokens embedded: 2232358
💰 Estimated cost: $0.2232
✅ FAISS index saved as rag_data/faiss.index
✅ Metadata saved as rag_data/faiss_metadata.json

📄 Token usage by document:
- 2022 Hospice Final Rule.xml: 163622 tokens ≈ $0.0164
- 2022 Hospice Proposed Rule.xml: 144402 tokens ≈ $0.0144
- 2023 Hospice Final Rule.xml: 43751 tokens ≈ $0.0044
- 2023 Hospice Proposed Rule.xml: 51436 tokens ≈ $0.0051
- 2023 SNF Final Rule.xml: 465514 tokens ≈ $0.0466
- 2023 SNF Proposed Rule.xml: 332939 tokens ≈ $0.0333
- 2024 Hospice Final Rule.xml: 51436 tokens ≈ $0.0051
- 2024 SNF Final Rule.xml: 301541 tokens ≈ $0.0302
- 2025 Hospice Final Rule.xml: 96572 tokens ≈ $0.0097
- 2025 Hospice Proposed Rule.xml: 64766 tokens ≈ $0.0065
- 2025 SNF Final Rule.xml: 301541 tokens ≈ $0.0302
- 2025 SNF Proposed Rule.xml: 214838 tokens ≈ $0.0215
💾 Cost summary saved as rag_data/embedding_cost_summary.json
