# MiniLM Embedding Builder for AI Tutor

This Colab-ready notebook mirrors the ingestion pipeline used in the `ai_tutor` project while keeping the `sentence-transformers/all-MiniLM-L6-v2` embedding model.

It guides you through:
- installing dependencies required for parsing/chunking,
- loading the project settings and overriding the embedding provider,
- uploading custom study materials,
- parsing, chunking, and embedding them with the same utilities the app uses, and
- exporting a JSONL chunk index plus vector store files ready for retrieval.

Run each cell sequentially.


In [1]:
!git clone https://github.com/HenryNVP/ai-tutor.git

Cloning into 'ai-tutor'...
remote: Enumerating objects: 90, done.[K
remote: Counting objects: 100% (90/90), done.[K
remote: Compressing objects: 100% (68/68), done.[K
remote: Total 90 (delta 15), reused 90 (delta 15), pack-reused 0 (from 0)[K
Receiving objects: 100% (90/90), 41.96 KiB | 20.98 MiB/s, done.
Resolving deltas: 100% (15/15), done.


In [13]:
%cd ai-tutor

/content/ai-tutor


In [None]:
# Install runtime dependencies
!pip install -q sentence-transformers pymupdf pandas pyarrow tqdm


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/24.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/24.1 MB[0m [31m253.0 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━[0m [32m18.6/24.1 MB[0m [31m289.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m24.1/24.1 MB[0m [31m291.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m120.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Configure project paths and settings
import sys
from pathlib import Path

import pandas as pd

try:
    from google.colab import files as colab_files  # type: ignore
except ImportError:
    colab_files = None

NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = NOTEBOOK_DIR / "ai-tutor"
SRC_ROOT = PROJECT_ROOT / "src"
if str(SRC_ROOT) not in sys.path:
    sys.path.insert(0, str(SRC_ROOT))

from ai_tutor.config.loader import load_settings
from ai_tutor.config.schema import EmbeddingConfig
from ai_tutor.data_models import Chunk, Document, DocumentMetadata
from ai_tutor.ingestion.chunker import chunk_document
from ai_tutor.ingestion.embeddings import EmbeddingClient
from ai_tutor.ingestion.parsers import parse_path
from ai_tutor.retrieval.simple_store import SimpleVectorStore
from ai_tutor.storage import ChunkJsonlStore

UPLOAD_DIR = NOTEBOOK_DIR / "source_documents"
OUTPUT_DIR = NOTEBOOK_DIR / "notebook_outputs"
VECTOR_STORE_DIR = OUTPUT_DIR / "vector_store"
CHUNKS_PATH = OUTPUT_DIR / "chunks.jsonl"

for directory in [UPLOAD_DIR, OUTPUT_DIR, VECTOR_STORE_DIR]:
    directory.mkdir(parents=True, exist_ok=True)

settings = load_settings()
minilm_config = EmbeddingConfig(
    model="BAAI/bge-base-en",
    provider="sentence-transformers",
    batch_size=settings.embeddings.batch_size,
    normalize=True,
)
settings.embeddings = minilm_config

print(f"Project root: {PROJECT_ROOT}")
print(f"Chunk size / overlap: {settings.chunking.chunk_size} / {settings.chunking.chunk_overlap}")
print(f"Embedding provider: {settings.embeddings.provider}")
print(f"Embedding model: {settings.embeddings.model}")
print(f"Upload directory: {UPLOAD_DIR.resolve()}")
print(f"Output directory: {OUTPUT_DIR.resolve()}")

Project root: /content/ai-tutor/ai-tutor
Chunk size / overlap: 900 / 120
Embedding provider: sentence-transformers
Embedding model: BAAI/bge-base-en
Upload directory: /content/ai-tutor/source_documents
Output directory: /content/ai-tutor/notebook_outputs


In [15]:
# Helpers for parsing documents and creating chunks
from typing import List

def parse_documents(paths: List[Path]) -> List[Document]:
    documents: List[Document] = []
    for path in paths:
        try:
            document = parse_path(path)
        except Exception as err:
            text = path.read_text(encoding="utf-8", errors="ignore")
            metadata = DocumentMetadata(
                doc_id=path.stem,
                title=path.stem.replace("_", " ").title(),
                source_path=path,
                extra={"format": path.suffix.lower() or "txt", "parser": "fallback"},
            )
            document = Document(metadata=metadata, text=text)
            print(f"Fallback parser used for {path.name}: {err}")
        documents.append(document)
    print(f"Loaded {len(documents)} document(s).")
    return documents

def chunk_documents(documents: List[Document]) -> List[Chunk]:
    chunks: List[Chunk] = []
    for document in documents:
        chunks.extend(chunk_document(document, settings.chunking))
    print(f"Created {len(chunks)} chunk(s).")
    return chunks


In [16]:
# Upload source documents (Colab) or gather from the upload directory
source_paths = []
if colab_files is not None:
    uploaded = colab_files.upload()
    for filename, data in uploaded.items():
        path = UPLOAD_DIR / filename
        path.write_bytes(data)
        source_paths.append(path)
    print(f"Saved {len(source_paths)} file(s) to {UPLOAD_DIR}.")
else:
    print("google.colab not available; expecting files to already exist in the upload directory.")
    source_paths = sorted(UPLOAD_DIR.glob('*'))
source_paths


Saving collegephysicstbqvol3_2014_01_13color.pdf to collegephysicstbqvol3_2014_01_13color.pdf
Saving collegephysicstbqvol12014_01_05d.pdf to collegephysicstbqvol12014_01_05d.pdf
Saving collegephysicsvol22014_08_07b.pdf to collegephysicsvol22014_08_07b.pdf
Saved 3 file(s) to /content/ai-tutor/source_documents.


[PosixPath('/content/ai-tutor/source_documents/collegephysicstbqvol3_2014_01_13color.pdf'),
 PosixPath('/content/ai-tutor/source_documents/collegephysicstbqvol12014_01_05d.pdf'),
 PosixPath('/content/ai-tutor/source_documents/collegephysicsvol22014_08_07b.pdf')]

In [25]:
# Parse and chunk uploaded documents
documents = parse_documents(source_paths)
if not documents:
    raise ValueError("No documents loaded; please upload at least one supported file.")

chunks = chunk_documents(documents)
if not chunks:
    raise ValueError("Chunking produced no data; adjust the source material or chunk settings.")

pd.DataFrame(
    [
        {
            "chunk_id": chunk.metadata.chunk_id,
            "doc_id": chunk.metadata.doc_id,
            "source": chunk.metadata.source_path.name,
            "page": chunk.metadata.page,
            "token_count": chunk.token_count,
            "text_preview": chunk.text[:120] + ("..." if len(chunk.text) > 120 else ""),
        }
        for chunk in chunks[:5]
    ]
)


Loaded 3 document(s).
Created 1008 chunk(s).


Unnamed: 0,chunk_id,doc_id,source,page,token_count,text_preview
0,collegephysicstbqvol3_2014_01_13color-0-94d8967d,collegephysicstbqvol3_2014_01_13color,collegephysicstbqvol3_2014_01_13color.pdf,p.1,900,College Physics Textbook Equity Edition Volume...
1,collegephysicstbqvol3_2014_01_13color-1-989d1149,collegephysicstbqvol3_2014_01_13color,collegephysicstbqvol3_2014_01_13color.pdf,p.2,900,"lives. In developing countries, it focuses on ..."
2,collegephysicstbqvol3_2014_01_13color-2-aaadfe25,collegephysicstbqvol3_2014_01_13color,collegephysicstbqvol3_2014_01_13color.pdf,p.3,900,. . . . . . . . . . . . . . . . . . . . . . . ...
3,collegephysicstbqvol3_2014_01_13color-3-08bd1562,collegephysicstbqvol3_2014_01_13color,collegephysicstbqvol3_2014_01_13color.pdf,p.4,900,. . . . . . . . . . 95 Projectile Motion . . ....
4,collegephysicstbqvol3_2014_01_13color-4-a0c8a0bc,collegephysicstbqvol3_2014_01_13color,collegephysicstbqvol3_2014_01_13color.pdf,p.6,900,. . . . . . . . . . . . . . . . . . . . . . . ...


In [26]:
# Build MiniLM embeddings and persist outputs
embedder = EmbeddingClient(settings.embeddings)
embeddings = embedder.embed_documents(chunk.text for chunk in chunks)

for chunk, embedding in zip(chunks, embeddings):
    chunk.embedding = embedding

chunk_store = ChunkJsonlStore(CHUNKS_PATH)
chunk_store.upsert(chunks)

vector_store = SimpleVectorStore(VECTOR_STORE_DIR)
vector_store.add(chunks)
vector_store.persist()

print(f"Saved chunk index → {CHUNKS_PATH.resolve()}")
print(f"Saved vector store → {VECTOR_STORE_DIR.resolve()}")

pd.DataFrame(
    [
        {
            "chunk_id": chunk.metadata.chunk_id,
            "embedding_dim": len(chunk.embedding) if chunk.embedding is not None else None,
        }
        for chunk in chunks[:5]
    ]
)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/719 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Saved chunk index → /content/ai-tutor/notebook_outputs/chunks.jsonl
Saved vector store → /content/ai-tutor/notebook_outputs/vector_store


Unnamed: 0,chunk_id,embedding_dim
0,collegephysicstbqvol3_2014_01_13color-0-94d8967d,768
1,collegephysicstbqvol3_2014_01_13color-1-989d1149,768
2,collegephysicstbqvol3_2014_01_13color-2-aaadfe25,768
3,collegephysicstbqvol3_2014_01_13color-3-08bd1562,768
4,collegephysicstbqvol3_2014_01_13color-4-a0c8a0bc,768


In [28]:
# Bundle outputs for optional download
import shutil

archive_path = shutil.make_archive(OUTPUT_DIR.as_posix(), "zip", root_dir=OUTPUT_DIR)
print(f"Created archive: {archive_path}")
if colab_files is not None:
    colab_files.download(archive_path)


Created archive: /content/ai-tutor/notebook_outputs.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>