In [1]:
# Install the LangChain Hugging Face / community integrations, ChromaDB and tqdm.
!pip install langchain-huggingface langchain-community chromadb tqdm

Collecting langchain-huggingface
  Downloading langchain_huggingface-0.3.1-py3-none-any.whl.metadata (996 bytes)
Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting chromadb
  Downloading chromadb-1.0.16-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp311-cp311-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloa

In [2]:
# Install langchain_experimental; the indexing script imports SemanticChunker from it.
!pip install langchain_experimental

Collecting langchain_experimental
  Downloading langchain_experimental-0.3.4-py3-none-any.whl.metadata (1.7 kB)
Downloading langchain_experimental-0.3.4-py3-none-any.whl (209 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.2/209.2 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain_experimental
Successfully installed langchain_experimental-0.3.4


In [3]:
# Remove the preinstalled torch, torchvision, sentence-transformers and transformers
# so that the next cell can install fresh builds.
!pip uninstall -y torch torchvision sentence-transformers transformers

Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Successfully uninstalled torch-2.6.0+cu124
Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Successfully uninstalled torchvision-0.21.0+cu124
Found existing installation: sentence-transformers 5.1.0
Uninstalling sentence-transformers-5.1.0:
  Successfully uninstalled sentence-transformers-5.1.0
Found existing installation: transformers 4.55.0
Uninstalling transformers-4.55.0:
  Successfully uninstalled transformers-4.55.0


In [4]:
# Reinstall torch/torchvision from the PyTorch CUDA 11.8 wheel index, then
# sentence-transformers and transformers from the default package index.
!pip install torch torchvision --index-url https://download.pytorch.org/whl/cu118
!pip install sentence-transformers transformers

Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch
  Downloading https://download.pytorch.org/whl/cu118/torch-2.7.1%2Bcu118-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (28 kB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.22.1%2Bcu118-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading https://download.pytorch.org/whl/sympy-1.13.3-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.8.89 (from torch)
  Downloading https://download.pytorch.org/whl/cu118/nvidia_cuda_nvrtc_cu11-11.8.89-py3-none-manylinux1_x86_64.whl (23.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.2/23.2 MB[0m [31m88.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu11==11.8.89 (from torch)
  Downloading https://download.pytorch.org/whl/cu118/nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux1_x86_64.whl (875 kB)
[2K     [90

In [None]:
# save as create_vector_db.py (replace your old file)
import os
import hashlib
import json
import shutil
import logging
import time
from pathlib import Path
from typing import List, Tuple
from tqdm import tqdm
import torch
from langchain.schema import Document
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from google.colab import files  # keep for Colab upload/download

# ----------------- CONFIG -----------------
CONFIG = dict(
    # one embedding model per language code
    embedding_models=dict(en="sentence-transformers/all-mpnet-base-v2"),
    # on-disk location of the Chroma DB; the consuming app must point at the same path
    persist_directory="/content/vector_db",
    collections=dict(
        en=dict(
            name="ai_matters",
            # default input; main() overwrites this with the uploaded file's path
            input_file="/content/ai_matters_data.jsonl",
        ),
    ),
    # settings forwarded to SemanticChunker
    chunking=dict(
        breakpoint_threshold_type="percentile",
        breakpoint_threshold_amount=80,
    ),
    # chunks per add call; a failing batch is retried one document at a time
    batch_size=32,
    # use the GPU when torch can see one, otherwise the CPU
    device="cuda" if torch.cuda.is_available() else "cpu",
    # must match the encode_kwargs used when embedding queries
    normalize_embeddings=True,
)

# ----------------- Logging -----------------
# Module-level logger plus root configuration: INFO and above,
# each record rendered as "<timestamp> - <level> - <message>".
logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# ----------------- Helpers -----------------
def upload_file(dest_dir: str = "/content"):
    """Ask the user to upload a file through the Colab widget and store it on disk.

    Every uploaded file is written into ``dest_dir``; the path of the first
    uploaded file is returned and used as the indexing input.

    Args:
        dest_dir: Directory the uploaded bytes are written to. Defaults to
            ``/content``, the location this script has always used.

    Returns:
        Path (str) of the first uploaded file inside ``dest_dir``.

    Raises:
        ValueError: If the upload dialog returned no files (e.g. it was cancelled).
    """
    logger.info("Please upload your ai_matters_data.jsonl file")
    uploaded = files.upload()
    # An empty result used to surface as an opaque IndexError further down.
    if not uploaded:
        raise ValueError("No file was uploaded.")

    for filename, content in uploaded.items():
        with open(os.path.join(dest_dir, filename), "wb") as f:
            f.write(content)

    first_name = next(iter(uploaded))
    if len(uploaded) > 1:
        # Only one input file is consumed downstream; make that visible.
        logger.warning(f"{len(uploaded)} files uploaded; only {first_name} will be used as input")
    logger.info(f"File {first_name} uploaded successfully")
    return os.path.join(dest_dir, first_name)

def make_source_id(url: str, title: str) -> str:
    """Derive a short, stable identifier for a source from its URL and title.

    Falsy inputs (``None`` or ``""``) are treated as empty strings. The id is
    the first 14 hex characters of the SHA-1 digest of ``"<url>|<title>"``
    encoded as UTF-8.
    """
    key = "|".join((url or "", title or ""))
    digest = hashlib.sha1(key.encode("utf-8"))
    return digest.hexdigest()[:14]

# Flattening helper: tolerates a splitter that hands back nested lists.
def flatten_documents(maybe_nested):
    """Return a flat list of the items in ``maybe_nested``.

    Nested ``list`` instances are expanded depth-first in their original
    order, ``None`` entries are dropped, and every other item (including
    tuples) is kept as-is.
    """
    def _walk(items):
        for entry in items:
            if isinstance(entry, list):
                yield from _walk(entry)
            elif entry is not None:
                yield entry

    return list(_walk(maybe_nested))

# ----------------- Load Documents -----------------
def load_documents(file_path: str, language: str) -> Tuple[List[Document], int]:
    """Read a JSONL file and turn each usable record into a ``Document``.

    Each non-empty line is parsed as JSON. Lines that are not valid JSON, or
    that parse to something other than a JSON object, are skipped with a
    warning. Records with neither ``content`` nor ``title`` are skipped
    silently. The page content is ``"<title>\\n<content>"`` when a title is
    present, otherwise just the content.

    Args:
        file_path: Path of the JSONL file to read.
        language: Language code stored in each document's metadata and shown
            in the progress bar.

    Returns:
        ``(documents, count)`` where ``count == len(documents)``.

    Raises:
        Exception: Any error opening/reading the file (or building a document)
            is logged and re-raised unchanged.
    """
    def _text(record: dict, key: str) -> str:
        # ``or ""`` maps None/empty values to ""; str() keeps a stray
        # non-string value (e.g. a numeric title) from crashing the whole load.
        return str(record.get(key) or "").strip()

    documents = []
    try:
        # utf-8-sig reads plain UTF-8 too, but also swallows a leading BOM that
        # would otherwise make the first line fail to parse.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            for line in tqdm(f, desc=f"Loading {language} documents"):
                line = line.strip()
                if not line:
                    continue
                try:
                    rec = json.loads(line)
                except json.JSONDecodeError:
                    logger.warning("Skipping invalid json line")
                    continue
                # Valid JSON that is not an object (list, string, number...) has
                # no fields to read; skip it instead of aborting the load.
                if not isinstance(rec, dict):
                    logger.warning("Skipping json line that is not an object")
                    continue
                content = _text(rec, "content")
                title = _text(rec, "title")
                url = _text(rec, "url")
                if not content and not title:
                    continue
                combined = f"{title}\n{content}" if title else content
                source_id = make_source_id(url, title)
                metadata = {
                    "url": url,
                    "title": title,
                    # last path component of the URL without its suffix
                    "source": Path(url).stem if url else "",
                    "language": language,
                    "domain": "ai-matters.eu",
                    "source_id": source_id
                }
                documents.append(Document(page_content=combined, metadata=metadata))
    except Exception as e:
        logger.exception(f"Failed to load documents: {e}")
        raise
    return documents, len(documents)

# ----------------- Create Collection -----------------
def _prepare_chunks(raw_chunks) -> list:
    """Flatten splitter output, drop blank chunks and make sure each has a ``source_id``."""
    prepared = []
    for chunk in flatten_documents(raw_chunks):
        if not (chunk.page_content or "").strip():
            continue
        metadata = chunk.metadata or {}
        if not metadata.get("source_id"):
            metadata["source_id"] = make_source_id(metadata.get("url", ""), metadata.get("title", ""))
        chunk.metadata = metadata
        prepared.append(chunk)
    return prepared


def _add_chunks_in_batches(vectorstore, chunks: list, batch_size: int):
    """Add chunks to the vectorstore per batch, retrying per document when a batch fails.

    Returns ``(added_count, failed_chunk_ids, chunks_index)`` where
    ``chunks_index`` maps every assigned chunk id to its source id, title, url
    and a 300-character snippet.
    """
    added_count = 0
    failed_chunks = []
    chunks_index = {}

    for start in tqdm(range(0, len(chunks), batch_size), desc="Adding batches"):
        batch = chunks[start : start + batch_size]
        batch_ids = []
        # The running position across all chunks keeps ids unique per chunk.
        for position, c in enumerate(batch, start=start):
            src = c.metadata.get("source_id") or make_source_id(c.metadata.get("url", ""), c.metadata.get("title", ""))
            chunk_id = f"{src}_chunk_{position}"
            c.metadata["chunk_id"] = chunk_id
            batch_ids.append(chunk_id)
            chunks_index[chunk_id] = {
                "source_id": src,
                "title": c.metadata.get("title"),
                "url": c.metadata.get("url"),
                "snippet": c.page_content[:300]
            }

        try:
            vectorstore.add_documents(batch, ids=batch_ids)
            added_count += len(batch)
            logger.info(f"Added batch {start}-{start+len(batch)-1} ({len(batch)})")
        except Exception as e:
            logger.exception(f"Batch add failed for items {start}-{start+len(batch)-1}. Falling back to per-doc add. Error: {e}")
            # Retry one document at a time so a single bad chunk does not sink the batch.
            for doc_id, doc in zip(batch_ids, batch):
                try:
                    vectorstore.add_documents([doc], ids=[doc_id])
                    added_count += 1
                except Exception as e_doc:
                    logger.exception(f"Failed to add document id={doc_id}: {e_doc}")
                    failed_chunks.append(doc_id)

    return added_count, failed_chunks, chunks_index


def _write_index_files(persist_directory: str, collection_name: str, sources_index: dict, chunks_index: dict) -> None:
    """Best-effort dump of the source and chunk indexes as JSON next to the DB."""
    try:
        src_path = Path(persist_directory) / f"{collection_name}_sources_index.json"
        chunks_path = Path(persist_directory) / f"{collection_name}_chunks_index.json"
        with open(src_path, "w", encoding="utf-8") as fh:
            json.dump(sources_index, fh, indent=2, ensure_ascii=False)
        with open(chunks_path, "w", encoding="utf-8") as fh:
            json.dump(chunks_index, fh, indent=2, ensure_ascii=False)
        logger.info(f"Wrote sources_index -> {src_path} and chunks_index -> {chunks_path}")
    except Exception as e:
        logger.exception(f"Failed to write index files: {e}")


def _verify_sample_ids(client, collection_name: str, sample_ids: list) -> None:
    """Log collection counts and check that ``sample_ids`` exist in the collection.

    "Verification passed" is only logged when the lookup actually ran; if the
    collection cannot be found or the lookup fails, a warning is logged instead.
    """
    collections = client.list_collections()
    counts = {c.name: c.count() for c in collections}
    logger.info(f"Chroma collections & counts: {counts}")

    try:
        col = client.get_collection(collection_name)
    except Exception:
        # fallback: pick the collection out of the listing
        col = next((c for c in collections if c.name == collection_name), None)

    if col is None:
        logger.warning("Unable to find collection object to verify chunk ids (client API mismatch)")
        return
    if not sample_ids:
        logger.info("No chunk ids available to verify.")
        return

    try:
        # ids are always part of a get() result; "ids" is not a valid include
        # value, so request no extra fields.
        found = col.get(ids=sample_ids, include=[])
    except Exception as eget:
        logger.warning("Could not verify sample ids in collection due to client API differences: %s", eget)
        return

    found_ids = found.get("ids", []) if isinstance(found, dict) else getattr(found, "ids", [])
    present = set(found_ids or [])
    missing_sample = [sid for sid in sample_ids if sid not in present]
    if missing_sample:
        logger.warning(f"Some sample chunk ids were not found in Chroma collection (count={len(missing_sample)}). Examples: {missing_sample[:5]}")
    else:
        logger.info("Sample chunk id verification passed (no missing ids in sample).")


def create_collection(language: str, config: dict):
    """Build the Chroma collection for ``language`` from its configured JSONL file.

    Steps: load the documents, split them with ``SemanticChunker``, clean the
    chunks, add them to Chroma (batch first, per-document on batch failure),
    persist, write JSON index files for offline inspection, and verify that a
    sample of chunk ids is present in the collection.

    Args:
        language: Key into ``config["collections"]`` and ``config["embedding_models"]``.
        config: Settings dict with the keys used below (``collections``,
            ``embedding_models``, ``device``, ``normalize_embeddings``,
            ``chunking``, ``persist_directory`` and optionally ``batch_size``).

    Raises:
        ValueError: If no documents could be loaded.
        Exception: Errors from loading documents or persisting are re-raised;
            index-file and verification failures are only logged.
    """
    collection_name = config["collections"][language]["name"]
    input_file = config["collections"][language]["input_file"]
    docs, doc_count = load_documents(input_file, language)
    if not docs:
        raise ValueError("No documents loaded.")
    logger.info(f"Loaded {doc_count} original documents for language={language}")

    # small index of original sources
    sources_index = {
        d.metadata["source_id"]: {
            "title": d.metadata.get("title", ""),
            "url": d.metadata.get("url", ""),
            "language": d.metadata.get("language", "")
        } for d in docs
    }

    # init embeddings – KEEP encode_kwargs consistent with your query-time use
    embeddings = HuggingFaceEmbeddings(
        model_name=config["embedding_models"][language],
        model_kwargs={"device": config["device"]},
        encode_kwargs={"normalize_embeddings": config["normalize_embeddings"]}
    )

    text_splitter = SemanticChunker(
        embeddings,
        breakpoint_threshold_type=config["chunking"]["breakpoint_threshold_type"],
        breakpoint_threshold_amount=config["chunking"]["breakpoint_threshold_amount"]
    )

    logger.info("Splitting documents into semantic chunks (may take time)...")
    chunks = _prepare_chunks(text_splitter.split_documents(docs))
    total_chunks = len(chunks)
    logger.info(f"Chunking complete: {total_chunks} chunks produced (after cleaning)")

    # sample a few chunks for sanity (only visible at DEBUG level)
    for s, chunk in enumerate(chunks[:3]):
        snippet = chunk.page_content[:200].replace("\n", " ")
        logger.debug(f"Sample chunk {s}: {snippet}... | metadata keys: {list(chunk.metadata.keys())}")

    Path(config["persist_directory"]).mkdir(parents=True, exist_ok=True)

    vectorstore = Chroma(
        persist_directory=config["persist_directory"],
        collection_name=collection_name,
        embedding_function=embeddings
    )

    batch_size = max(1, int(config.get("batch_size", 32)))
    logger.info(f"Adding {total_chunks} chunks to Chroma in batches of {batch_size}...")
    added_count, failed_chunks, chunks_index = _add_chunks_in_batches(vectorstore, chunks, batch_size)

    # Explicit persist only where the vectorstore still offers it; newer Chroma
    # integrations write to persist_directory automatically and may drop the method.
    persist = getattr(vectorstore, "persist", None)
    if callable(persist):
        try:
            persist()
        except Exception as e:
            logger.exception(f"Error persisting vectorstore: {e}")
            raise

    logger.info(f"Persisted vectorstore to {config['persist_directory']}. added={added_count}, failed={len(failed_chunks)}")

    _write_index_files(config["persist_directory"], collection_name, sources_index, chunks_index)

    # Verify through the client of the vectorstore we just filled. Building a
    # second Chroma wrapper without a collection name would register an extra
    # default collection in the fresh DB.
    try:
        _verify_sample_ids(vectorstore._client, collection_name, list(chunks_index.keys())[:20])
    except Exception as e:
        logger.exception(f"Verification via client failed: {e}")

    # Final summary
    logger.info(f"Indexing complete. attempted_chunks={total_chunks}, added={added_count}, failed_chunks={len(failed_chunks)}")
    if failed_chunks:
        logger.warning(f"The following chunk ids failed to add (you can retry individually): {failed_chunks[:20]}")

# ----------------- Secondary utilities -----------------
def verify_collections(config: dict):
    """Log the name and vector count of every collection in the persisted DB.

    Args:
        config: Settings dict; only ``persist_directory`` is read.
    """
    logger.info("Current collections:")
    # NOTE(review): a Chroma wrapper built without collection_name may register
    # its default collection in the DB as a side effect — confirm.
    store = Chroma(persist_directory=config["persist_directory"])
    for entry in store._client.list_collections():
        logger.info(f"- {entry.name} ({entry.count()} vectors)")

def zip_and_download(config: dict):
    """Zip the persisted DB directory and hand the archive to the Colab download helper.

    The archive is written next to the DB directory, named after it (with any
    suffix removed) plus ``.zip``.

    Args:
        config: Settings dict; only ``persist_directory`` is read.

    Raises:
        Exception: Any failure while archiving or downloading is logged and re-raised.
    """
    try:
        archive_base = str(Path(config["persist_directory"]).with_suffix(""))
        archive_file = f"{archive_base}.zip"
        logger.info(f"Creating ZIP archive at {archive_base}.zip...")
        shutil.make_archive(archive_base, "zip", config["persist_directory"])
        logger.info("Downloading vector database...")
        files.download(archive_file)
        logger.info("Download completed successfully!")
    except Exception as err:
        logger.exception(f"Failed to create/download ZIP: {err}")
        raise

# ----------------- Main -----------------
def main():
    """Run the full pipeline: upload input, rebuild the DB, verify it and download it.

    The upload prompt is always shown; the uploaded file's path replaces the
    configured English input file. Any existing DB directory is deleted first
    so the collection is built from scratch.
    """
    logger.info("Starting vector database creation process for AI Matters")

    # ask for the input file and point the English collection at it
    input_path = upload_file()
    CONFIG["collections"]["en"]["input_file"] = input_path

    # start from an empty DB directory
    db_dir = CONFIG["persist_directory"]
    if os.path.exists(db_dir):
        logger.warning(f"Deleting existing DB at {db_dir} to create fresh DB")
        shutil.rmtree(db_dir)

    started_at = time.time()
    create_collection("en", CONFIG)
    elapsed = time.time() - started_at
    logger.info(f"Indexing finished in {elapsed:.2f}s")

    verify_collections(CONFIG)
    zip_and_download(CONFIG)
    logger.info("Process completed successfully!")


if __name__ == "__main__":
    main()


Saving ai_matters_data.jsonl to ai_matters_data.jsonl


Loading en documents: 415it [00:00, 5189.61it/s]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Chunking en documents: 100%|██████████| 415/415 [00:46<00:00,  8.87it/s]
  vectorstore = Chroma(
Adding to ai_matters: 100%|██████████| 33/33 [00:40<00:00,  1.24s/it]
  vectorstore.persist()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>