In [13]:
import os, sys
from pathlib import Path

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError("OPENAI_API_KEY is not set. Set it before running this notebook.")

CHROMA_PATH = "chroma"
TXT_DIR = Path("parsed/legislation")
COLLECTION_NAME = "legislation_policy"  # rebuilding this name with OpenAI 3-large

if not TXT_DIR.exists():
    raise RuntimeError(f"TXT_DIR not found: {TXT_DIR.resolve()} — convert your PDFs first.")


In [14]:
import hashlib
from typing import List
from tqdm import tqdm

def sha1(s: str) -> str:
    import hashlib
    return hashlib.sha1(s.encode("utf-8")).hexdigest()

def chunk_text(text: str, max_chars=1000, overlap=150) -> List[str]:
    """Paragraph-aware chunking suited for legislation (precise, short chunks)."""
    paras = [p.strip() for p in text.split("\n\n") if p.strip()]
    chunks, buf = [], ""
    for p in paras:
        if len(buf) + len(p) + 2 <= max_chars:
            buf = f"{buf}\n\n{p}" if buf else p
        else:
            if buf:
                chunks.append(buf)
            # handle very long single paragraphs
            while len(p) > max_chars:
                head = p[:max_chars]
                chunks.append(head)
                p = p[max(0, max_chars - overlap):]
            buf = p
    if buf:
        chunks.append(buf)

    # add small overlap
    with_overlap = []
    for i, c in enumerate(chunks):
        if i == 0 or overlap <= 0:
            with_overlap.append(c)
        else:
            with_overlap.append(chunks[i-1][-overlap:] + c)
    return with_overlap


In [16]:
import chromadb
from chromadb.utils import embedding_functions

# Attach OpenAI 3-large embedding function (3072-dim)
ef_openai_large = embedding_functions.OpenAIEmbeddingFunction(
    api_key=OPENAI_API_KEY,
    model_name="text-embedding-3-large"  # highest quality; 3072-dim
)

client = chromadb.PersistentClient(path=CHROMA_PATH)

# If a collection with same name exists (possibly wrong dimension), drop it cleanly.
try:
    client.delete_collection(COLLECTION_NAME)
    print(f"Dropped existing collection '{COLLECTION_NAME}'.")
except Exception:
    pass  # ok if it doesn't exist

coll = client.get_or_create_collection(
    name=COLLECTION_NAME,
    metadata={"corpus_type":"A","jurisdiction":"BC","source":"legislation/policy","embedding_model":"text-embedding-3-large"},
    embedding_function=ef_openai_large
)
print("Ready:", coll.name)


Dropped existing collection 'legislation_policy'.
Ready: legislation_policy


In [17]:
txt_files = sorted(TXT_DIR.glob("*.txt"))
if not txt_files:
    raise SystemExit(f"No .txt files found in {TXT_DIR.resolve()}")

total_chunks = 0
for f in tqdm(txt_files, desc="Ingesting legislation"):
    text = f.read_text(encoding="utf-8").strip()
    if not text:
        print(f"[WARN] Empty file skipped: {f.name}")
        continue

    chunks = chunk_text(text, max_chars=1000, overlap=150)
    base_id = sha1(f"{f.name}:{len(text)}")

    ids = [f"{base_id}_{i:04d}" for i in range(len(chunks))]
    metadatas = [{
        "filename": f.name,
        "filepath": str(f),
        "corpus_type": "A",
        "section": "legislation",
        "chunk_index": i
    } for i in range(len(chunks))]

    # Batch to avoid payload/timeouts; embeddings are computed by the collection’s ef
    BATCH = 64
    for i in range(0, len(chunks), BATCH):
        coll.add(
            documents=chunks[i:i+BATCH],
            metadatas=metadatas[i:i+BATCH],
            ids=ids[i:i+BATCH]
        )
    total_chunks += len(chunks)

print(f"Done. Added {total_chunks} chunks. Collection count ≈ {coll.count()}.")


Ingesting legislation: 100%|██████████| 4/4 [00:04<00:00,  1.16s/it]

Done. Added 103 chunks. Collection count ≈ 103.





In [18]:
# Critical: query using the SAME embedding function by calling coll.query (it already has ef attached)
q = "reasonable management action bullying harassment definition repeated or one serious incident"
res = coll.query(query_texts=[q], n_results=5)

for i in range(len(res["ids"][0])):
    meta = res["metadatas"][0][i]
    doc = res["documents"][0][i].replace("\n"," ")[:300]
    print(f"- {meta['filename']} [chunk {meta['chunk_index']}]")
    print(f"  {doc}\n")


- worker-fact-sheet-workplace-bullying-harassment-pdf-en.txt [chunk 2]
  s rumours  * Calling someone derogatory names  * Any other inappropriate conduct or comment that would cause someone to feel humiliated or intimidated* Offering constructive feedback, guidance, or advice about work-related behaviour  * Reasonable action taken by an employer or supervisor relating to

- Define discrimination, bullying and harassment - Province of British Columbia.txt [chunk 2]
  ng to managing the workforce. The conduct may be written, verbal,  physical, online, or electronic, a gesture or display, or any combination of these.Examples of bullying and harassment  2 of 7  2025-10-25, 6:45 p.m.  Define discrimination, bullying and harassment - Province of British C...  https:/

- english-workplace-bullying-and-harassment-2014_online.txt [chunk 4]
  get of the workplace bullying and harassment may be one person or a group.  It is not workplace bullying or harassment when an employer or supervisor:*  ex