# AI Tutor - Textbook Ingestion (Google Colab)

This notebook uses the AI Tutor's ingestion system to process textbooks.

**Features:**
- ✅ Uses TutorSystem ingestion pipeline
- ✅ Upload your own PDFs or use PDFs already present in `data/raw`
- ✅ Creates embeddings with sentence-transformers

In [None]:
import os
from pathlib import Path


print("🔄 Cloning AI Tutor repository...")
!git clone https://github.com/HenryNVP/ai-tutor.git
%cd ai-tutor

PROJECT_ROOT = Path.cwd()
print(f"Project root: {PROJECT_ROOT}")

Cloning into 'ai-tutor'...
remote: Enumerating objects: 90, done.[K
remote: Counting objects: 100% (90/90), done.[K
remote: Compressing objects: 100% (68/68), done.[K
remote: Total 90 (delta 15), reused 90 (delta 15), pack-reused 0 (from 0)[K
Receiving objects: 100% (90/90), 41.96 KiB | 20.98 MiB/s, done.
Resolving deltas: 100% (15/15), done.


In [None]:
!pip install -r requirements.txt

# Add src to Python path
import sys
SRC_ROOT = PROJECT_ROOT / "src"
if str(SRC_ROOT) not in sys.path:
    sys.path.insert(0, str(SRC_ROOT))

/content/ai-tutor


In [None]:
# Create the TutorSystem via TutorSystem.from_config() and echo a few of its settings.
from ai_tutor.system import TutorSystem
import os

# Optional: supply an API key through the environment.
# Do not commit a real key to the notebook.
# os.environ["OPENAI_API_KEY"] = "your_key_here"

print("🚀 Initializing TutorSystem...")
system = TutorSystem.from_config()

print("✓ TutorSystem initialized")
print(f"  Embedding model: {system.embedder.config.model}")
_settings = system.settings
print(f"  Chunk size: {_settings.chunking.chunk_size}")
print(f"  Vector store: {_settings.paths.vector_store_dir}")


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/24.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/24.1 MB[0m [31m253.0 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━[0m [32m18.6/24.1 MB[0m [31m289.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m24.1/24.1 MB[0m [31m291.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m120.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Upload your PDFs (or use existing ones in data/raw)
from pathlib import Path

# Directory that the ingestion cell reads from; PROJECT_ROOT comes from the clone cell.
raw_data_dir = PROJECT_ROOT / "data" / "raw"
raw_data_dir.mkdir(parents=True, exist_ok=True)

# The upload widget only exists on Google Colab. Elsewhere, fall back to
# whatever is already in data/raw instead of failing on the import.
try:
    from google.colab import files
except ImportError:
    files = None
    print("ℹ️ google.colab is not available; skipping upload and using files already in data/raw.")

uploaded = {}
if files is not None:
    print("📤 Upload your PDF textbooks:")
    uploaded = files.upload()

# Save uploaded files to data/raw
for filename, data in uploaded.items():
    # Keep only the final path component so an upload name can never
    # write outside raw_data_dir.
    safe_name = Path(filename).name
    if not safe_name:
        print(f"  ✗ Skipped upload with unusable name: {filename!r}")
        continue
    filepath = raw_data_dir / safe_name
    filepath.write_bytes(data)
    print(f"  ✓ Saved: {safe_name}")

# List all PDF files
pdf_files = sorted(raw_data_dir.glob("*.pdf"))
print(f"\n📚 Found {len(pdf_files)} PDF file(s) ready for ingestion:\n")
for i, pdf in enumerate(pdf_files, 1):
    size_mb = pdf.stat().st_size / (1024 * 1024)
    print(f"  {i}. {pdf.name} ({size_mb:.1f} MB)")

if not pdf_files:
    # Surface the empty case now rather than leaving it to the ingestion cell.
    print(f"  ⚠️ No PDFs in {raw_data_dir}. Upload at least one file before running ingestion.")

Project root: /content/ai-tutor/ai-tutor
Chunk size / overlap: 900 / 120
Embedding provider: sentence-transformers
Embedding model: BAAI/bge-base-en
Upload directory: /content/ai-tutor/source_documents
Output directory: /content/ai-tutor/notebook_outputs


In [None]:
# Run ingestion over raw_data_dir and summarise the outcome.
# According to this notebook's description, ingestion will:
#   1. Parse PDFs
#   2. Chunk the text
#   3. Generate embeddings
#   4. Store in vector store

print("🔄 Starting ingestion (this may take a few minutes)...\n")

result = system.ingest_directory(raw_data_dir)

# Summary banner
_rule = "=" * 60
print("\n" + _rule)
print("✅ INGESTION COMPLETE")
print(_rule)
print(f"📄 Documents processed: {len(result.documents)}")
print(f"📝 Chunks created: {len(result.chunks)}")
print(f"⏭️  Files skipped: {len(result.skipped)}")

# List the skipped entries, one per line, only when there are any.
if result.skipped:
    print("\nSkipped files:")
    print("\n".join(f"  - {skip}" for skip in result.skipped))


In [None]:
# Verify ingestion
# Checks that the expected artefacts exist under PROJECT_ROOT/data and only
# reports success when every check passed.
print("📊 Verification:\n")

# One boolean per check, so the final verdict reflects what was actually found.
checks = []

# Check chunks file (counted as one chunk per line)
chunks_file = PROJECT_ROOT / "data" / "processed" / "chunks.jsonl"
if chunks_file.exists():
    with open(chunks_file, 'r', encoding="utf-8") as f:
        chunk_count = sum(1 for _ in f)
    print(f"✓ Chunks stored: {chunk_count}")
    # An empty chunks file means nothing was ingested.
    checks.append(chunk_count > 0)
else:
    print("✗ No chunks file found")
    checks.append(False)

# Check vector store
vector_dir = PROJECT_ROOT / "data" / "vector_store"
embeddings_file = vector_dir / "embeddings.npy"
metadata_file = vector_dir / "metadata.json"

if embeddings_file.exists():
    import numpy as np
    embeddings = np.load(embeddings_file)
    print(f"✓ Embeddings: {embeddings.shape[0]} vectors of dim {embeddings.shape[1]}")
    checks.append(embeddings.shape[0] > 0)
else:
    print("✗ No embeddings file found")
    checks.append(False)

if metadata_file.exists():
    print("✓ Metadata file exists")
    checks.append(True)
else:
    print("✗ No metadata file found")
    checks.append(False)

if all(checks):
    print("\n✅ Ingestion successful!")
else:
    print("\n❌ Ingestion verification failed - see the items above.")
