# MiniLM Embedding Builder for AI Tutor

This Colab-ready notebook mirrors the ingestion pipeline used in the `ai_tutor` project, but overrides the project's configured embedding settings so that chunks are embedded with the `sentence-transformers/all-MiniLM-L6-v2` model.

It guides you through:
- installing dependencies required for parsing/chunking,
- loading the project settings and overriding the embedding provider,
- uploading custom study materials,
- parsing, chunking, and embedding them with the same utilities the app uses, and
- exporting a JSONL chunk index plus vector store files ready for retrieval.

Run the cells in order, from top to bottom; each cell relies on names defined by the cells before it.


In [None]:
# Install runtime dependencies (safe to skip if already available)
!pip install -q sentence-transformers pymupdf pandas pyarrow tqdm


In [None]:
# Configure project paths and settings
import sys
from pathlib import Path

import pandas as pd

try:
    from google.colab import files as colab_files  # type: ignore
except ImportError:
    colab_files = None

# Locate the project checkout: the nearest directory, starting at the current
# working directory and walking up through its ancestors, that contains
# ``src/ai_tutor``. When none is found the working directory itself is used.
NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = next(
    (
        ancestor
        for ancestor in (NOTEBOOK_DIR, *NOTEBOOK_DIR.parents)
        if (ancestor / "src" / "ai_tutor").exists()
    ),
    NOTEBOOK_DIR,
)

# Put ``<project>/src`` at the front of the import path (once) so the
# ``ai_tutor`` imports below resolve against this checkout.
SRC_ROOT = PROJECT_ROOT / "src"
if sys.path.count(str(SRC_ROOT)) == 0:
    sys.path.insert(0, str(SRC_ROOT))

from ai_tutor.config.loader import load_settings
from ai_tutor.config.schema import EmbeddingConfig
from ai_tutor.data_models import Chunk, Document, DocumentMetadata
from ai_tutor.ingestion.chunker import chunk_document
from ai_tutor.ingestion.embeddings import EmbeddingClient
from ai_tutor.ingestion.parsers import parse_path
from ai_tutor.retrieval.simple_store import SimpleVectorStore
from ai_tutor.storage import ChunkJsonlStore

# Working locations for uploaded sources and generated artifacts, all rooted
# at the notebook's working directory.
UPLOAD_DIR = NOTEBOOK_DIR / "source_documents"
OUTPUT_DIR = NOTEBOOK_DIR / "notebook_outputs"
VECTOR_STORE_DIR = OUTPUT_DIR / "vector_store"
CHUNKS_PATH = OUTPUT_DIR / "chunks.jsonl"

# Create the folders up front; folders that already exist are left as they are.
for folder in (UPLOAD_DIR, OUTPUT_DIR, VECTOR_STORE_DIR):
    folder.mkdir(parents=True, exist_ok=True)

# Load the project settings, then swap in a MiniLM embedding configuration.
# Only the embedding section is replaced; the batch size is carried over from
# the loaded settings.
settings = load_settings()
minilm_config = EmbeddingConfig(
    provider="sentence-transformers",
    model="sentence-transformers/all-MiniLM-L6-v2",
    normalize=True,
    batch_size=settings.embeddings.batch_size,
)
settings.embeddings = minilm_config

# Echo the effective configuration so it can be checked before running the
# heavier cells.
print(
    "\n".join(
        [
            f"Project root: {PROJECT_ROOT}",
            f"Chunk size / overlap: {settings.chunking.chunk_size} / {settings.chunking.chunk_overlap}",
            f"Embedding provider: {settings.embeddings.provider}",
            f"Embedding model: {settings.embeddings.model}",
            f"Upload directory: {UPLOAD_DIR.resolve()}",
            f"Output directory: {OUTPUT_DIR.resolve()}",
        ]
    )
)


In [None]:
# Helpers for parsing documents and creating chunks
from typing import List

def parse_documents(paths: List[Path]) -> List[Document]:
    """Parse each path into a ``Document``, falling back to plain text on failure.

    Every path is first handed to ``parse_path``. If that raises, the file is
    re-read as UTF-8 text (undecodable bytes are dropped) and wrapped in a
    ``Document`` whose metadata is derived from the file name and marked with
    ``"parser": "fallback"``.

    Paths that are not regular files (directories, missing paths) and files
    whose fallback read fails are reported and skipped, so one bad entry does
    not abort the whole batch.

    Args:
        paths: Files to load.

    Returns:
        The successfully loaded documents, in input order. May be shorter
        than ``paths`` when entries were skipped.
    """
    documents: List[Document] = []
    for path in paths:
        # Neither the parser nor the text fallback can handle a directory or
        # a missing path; skip it here instead of failing inside the fallback.
        if not path.is_file():
            print(f"Skipping {path.name}: not a regular file.")
            continue
        try:
            document = parse_path(path)
        except Exception as err:  # broad on purpose: any parser failure triggers the fallback
            try:
                text = path.read_text(encoding="utf-8", errors="ignore")
            except OSError as read_err:
                print(f"Skipping {path.name}: {err} (fallback read failed: {read_err})")
                continue
            metadata = DocumentMetadata(
                doc_id=path.stem,
                title=path.stem.replace("_", " ").title(),
                source_path=path,
                extra={"format": path.suffix.lower() or "txt", "parser": "fallback"},
            )
            document = Document(metadata=metadata, text=text)
            print(f"Fallback parser used for {path.name}: {err}")
        documents.append(document)
    print(f"Loaded {len(documents)} document(s).")
    return documents

def chunk_documents(documents: List[Document]) -> List[Chunk]:
    """Split every document into chunks and return them as one flat list.

    Each document is passed to ``chunk_document`` together with
    ``settings.chunking``; the resulting chunks are concatenated in document
    order and the total is printed.

    Args:
        documents: Documents to split.

    Returns:
        All chunks from all documents, in input order.
    """
    chunks: List[Chunk] = [
        piece
        for document in documents
        for piece in chunk_document(document, settings.chunking)
    ]
    print(f"Created {len(chunks)} chunk(s).")
    return chunks


In [None]:
# Upload source documents (Colab) or gather from the upload directory
source_paths = []
if colab_files is not None:
    # Each uploaded item is a (filename, file contents) pair; write it into
    # the upload directory so the parsers can read it from disk.
    uploaded = colab_files.upload()
    for filename, data in uploaded.items():
        # Keep only the base name so the file always lands inside UPLOAD_DIR.
        path = UPLOAD_DIR / Path(filename).name
        path.write_bytes(data)
        source_paths.append(path)
    print(f"Saved {len(source_paths)} file(s) to {UPLOAD_DIR}.")
else:
    print("google.colab not available; expecting files to already exist in the upload directory.")
    # Only visible regular files are source documents: sub-directories and
    # dotfiles (e.g. .ipynb_checkpoints, .DS_Store) are not study material
    # and cannot be parsed.
    source_paths = sorted(
        entry
        for entry in UPLOAD_DIR.glob("*")
        if entry.is_file() and not entry.name.startswith(".")
    )
# Last expression of the cell, so the notebook displays the collected paths.
source_paths


In [None]:
# Parse and chunk uploaded documents
documents = parse_documents(source_paths)
if not documents:
    raise ValueError("No documents loaded; please upload at least one supported file.")

chunks = chunk_documents(documents)
if not chunks:
    raise ValueError("Chunking produced no data; adjust the source material or chunk settings.")


def _preview_row(chunk):
    """Summarise one chunk as a table row: ids, source file, page, size, text preview."""
    row = {
        "chunk_id": chunk.metadata.chunk_id,
        "doc_id": chunk.metadata.doc_id,
        "source": chunk.metadata.source_path.name,
        "page": chunk.metadata.page,
        "token_count": chunk.token_count,
    }
    # Show at most 120 characters; an ellipsis marks text that was cut short.
    ellipsis = "..." if len(chunk.text) > 120 else ""
    row["text_preview"] = chunk.text[:120] + ellipsis
    return row


# Preview of the first five chunks; as the last expression of the cell it is
# rendered as a table by the notebook.
pd.DataFrame([_preview_row(chunk) for chunk in chunks[:5]])


In [None]:
# Build MiniLM embeddings and persist outputs
embedder = EmbeddingClient(settings.embeddings)
# Materialise the result so its length can be checked before pairing.
embeddings = list(embedder.embed_documents(chunk.text for chunk in chunks))

# zip() would silently stop at the shorter sequence, leaving some chunks
# without a vector; fail loudly before anything is written to disk.
if len(embeddings) != len(chunks):
    raise ValueError(
        f"Embedding count mismatch: got {len(embeddings)} embedding(s) for {len(chunks)} chunk(s)."
    )

for chunk, embedding in zip(chunks, embeddings):
    chunk.embedding = embedding

# Write the chunk index (JSONL) to CHUNKS_PATH.
chunk_store = ChunkJsonlStore(CHUNKS_PATH)
chunk_store.upsert(chunks)

# Add the embedded chunks to the vector store and persist it under VECTOR_STORE_DIR.
vector_store = SimpleVectorStore(VECTOR_STORE_DIR)
vector_store.add(chunks)
vector_store.persist()

print(f"Saved chunk index → {CHUNKS_PATH.resolve()}")
print(f"Saved vector store → {VECTOR_STORE_DIR.resolve()}")

# Sanity check: embedding dimensionality of the first five chunks, rendered
# as a table because it is the last expression of the cell.
pd.DataFrame(
    [
        {
            "chunk_id": chunk.metadata.chunk_id,
            "embedding_dim": len(chunk.embedding) if chunk.embedding is not None else None,
        }
        for chunk in chunks[:5]
    ]
)


In [None]:
# Bundle outputs for optional download
import shutil

# Zip the contents of OUTPUT_DIR. The archive is named after the output
# directory with ".zip" appended, so it is created next to the directory
# rather than inside it.
archive_path = shutil.make_archive(
    base_name=OUTPUT_DIR.as_posix(),
    format="zip",
    root_dir=OUTPUT_DIR,
)
print(f"Created archive: {archive_path}")

# Offer the archive as a download only when the Colab helper was imported.
if colab_files is not None:
    colab_files.download(archive_path)
