# AI Tutor - Textbook Ingestion (Google Colab)

This notebook uses the AI Tutor's ingestion system to process textbooks.

**Features:**
- ✅ Uses TutorSystem ingestion pipeline
- ✅ Upload your own PDFs or use sample textbooks
- ✅ Creates embeddings with sentence-transformers
- ✅ Works with `chroma_data/chroma.sqlite3` database
- ✅ Save/load database from Google Drive


In [None]:
print("üîÑ Cloning AI Tutor repository...")
!git clone -b mcp_server https://github.com/HenryNVP/ai-tutor.git
%cd ai-tutor

!pip installl -r requirements.txt

In [None]:
import os
import sys
from pathlib import Path

# The clone cell already leaves the kernel inside the checkout, so only change
# directory when we are not there yet (e.g. after a runtime restart). This keeps
# the cell re-runnable and never tries to enter a nested ai-tutor/ai-tutor.
if Path.cwd().name != "ai-tutor":
    if Path("ai-tutor").is_dir():
        os.chdir("ai-tutor")
    else:
        print("WARNING: 'ai-tutor' checkout not found in the current directory; "
              "run the clone cell first.")

# Everything below is resolved relative to the repository checkout.
PROJECT_ROOT = Path.cwd()
print(f"Project root: {PROJECT_ROOT}")

# Make the project's src/ directory importable (inserted once, at the front).
SRC_ROOT = PROJECT_ROOT / "src"
if str(SRC_ROOT) not in sys.path:
    sys.path.insert(0, str(SRC_ROOT))


In [None]:
# Initialize TutorSystem with chroma_data directory
from ai_tutor.system import TutorSystem
from ai_tutor.config.loader import load_settings
from pathlib import Path
import os
from google.colab import userdata

# Model name exported through the environment
# (presumably read by the project's OpenAI integration - confirm).
os.environ["OPENAI_DEFAULT_MODEL"]="gpt-4o-mini"

# Optional: set the API key from Colab secrets. userdata.get() raises when the
# secret does not exist or notebook access was not granted, so handle that
# explicitly instead of aborting the whole cell.
try:
    openai_api_key = userdata.get("OPENAI_API_KEY")
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    openai_api_key = None

if openai_api_key:
    os.environ["OPENAI_API_KEY"] = openai_api_key
else:
    print("  OPENAI_API_KEY secret not available; continuing without it")

print("üöÄ Loading settings...")
settings = load_settings()

# Override vector_store_dir to use chroma_data instead of data/vector_store
settings.paths.vector_store_dir = Path("chroma_data")
print(f"  Vector store directory: {settings.paths.vector_store_dir}")

# Create chroma_data directory if it doesn't exist
chroma_data_dir = PROJECT_ROOT / "chroma_data"
chroma_data_dir.mkdir(exist_ok=True)
print(f"  Created/verified chroma_data directory: {chroma_data_dir}")

print("\nüöÄ Initializing TutorSystem...")
system = TutorSystem(settings=settings)

# Echo the effective configuration so a wrong setting is visible before ingesting.
print(f"\n‚úì TutorSystem initialized")
print(f"  Embedding model: {system.embedder.config.model}")
print(f"  Chunk size: {system.settings.chunking.chunk_size}")
print(f"  Vector store: {system.settings.paths.vector_store_dir}")
print(f"  Database location: {chroma_data_dir}/chroma.sqlite3")


In [None]:
from google.colab import drive

# Folder inside Google Drive that holds the raw input data
# (the copy cell reads *.pdf files from here).
drive_folder_path = '/content/drive/MyDrive/ai-tutor/raw'

# Attach Google Drive to the Colab filesystem.
drive.mount('/content/drive')

In [None]:
# Copy the textbook PDFs from Google Drive into the project's data/raw folder
from pathlib import Path
from google.colab import files
import shutil

# Local staging directory for the raw PDFs.
raw_data_dir = PROJECT_ROOT / "data" / "raw"
raw_data_dir.mkdir(parents=True, exist_ok=True)

# Copy PDF files from Google Drive to data/raw
print(f"Copying PDF files from '{drive_folder_path}' to '{raw_data_dir}'...")
drive_source_dir = Path(drive_folder_path)
if drive_source_dir.is_dir():
    for pdf_file in drive_source_dir.glob("*.pdf"):
        shutil.copy(pdf_file, raw_data_dir / pdf_file.name)
        print(f"‚úÖ {pdf_file.name}")
else:
    # Globbing a missing folder yields nothing, which would otherwise look
    # like "no PDFs" rather than "Drive not mounted / wrong path".
    print(f"WARNING: Drive folder not found: '{drive_folder_path}'. "
          "Mount Google Drive and check the path; nothing was copied.")

# List all PDF files
pdf_files = sorted(raw_data_dir.glob("*.pdf"))
print(f"\nüìö Found {len(pdf_files)} PDF file(s) ready for ingestion:\n")
for i, pdf in enumerate(pdf_files, 1):
    size_mb = pdf.stat().st_size / (1024 * 1024)
    print(f"  {i}. {pdf.name} ({size_mb:.1f} MB)")

In [None]:
# Run the ingestion pipeline over every file in data/raw.
# According to the original notes, the pipeline will:
#   1. Parse PDFs
#   2. Chunk the text
#   3. Generate embeddings
#   4. Store in vector store

print("üîÑ Starting ingestion (this may take a few minutes)...\n")

result = system.ingest_directory(raw_data_dir)

# Summary banner followed by the per-category counts.
banner = "=" * 60
print("\n" + banner)
print("‚úÖ INGESTION COMPLETE")
print(banner)

summary_lines = (
    f"üìÑ Documents processed: {len(result.documents)}",
    f"üìù Chunks created: {len(result.chunks)}",
    f"‚è≠Ô∏è  Files skipped: {len(result.skipped)}",
)
for summary_line in summary_lines:
    print(summary_line)

# Only show the "skipped" section when something was actually skipped.
if result.skipped:
    print("\nSkipped files:")
    for skip in result.skipped:
        print(f"  - {skip}")


In [None]:
# Verify ingestion: check the chunks file, the Chroma SQLite database and any
# collection directories, and only report success when the required artifacts
# (chunks file and database) are actually present.
import sqlite3
from contextlib import closing

print("üìä Verification:\n")

# Check chunks file
chunks_file = PROJECT_ROOT / "data" / "processed" / "chunks.jsonl"
chunks_found = chunks_file.exists()
if chunks_found:
    # One JSON record per line, so the line count is the chunk count.
    with open(chunks_file, 'r', encoding="utf-8") as f:
        chunk_count = sum(1 for _ in f)
    print(f"‚úì Chunks stored: {chunk_count}")
else:
    print("‚úó No chunks file found")

# Check Chroma database
chroma_db_path = PROJECT_ROOT / "chroma_data" / "chroma.sqlite3"
db_found = chroma_db_path.exists()
if db_found:
    # closing() guarantees the connection is released even if the query fails.
    with closing(sqlite3.connect(chroma_db_path)) as conn:
        cursor = conn.cursor()
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        tables = cursor.fetchall()
    print(f"‚úì Chroma database exists: {chroma_db_path}")
    print(f"  Tables: {len(tables)}")
else:
    print("‚úó No Chroma database found")

# Check for collection directories (informational only)
chroma_data_dir = PROJECT_ROOT / "chroma_data"
if chroma_data_dir.is_dir():
    collection_dirs = [d for d in chroma_data_dir.iterdir() if d.is_dir() and d.name != 'chroma_example']
else:
    # iterdir() would raise on a missing directory; treat it as "none yet".
    collection_dirs = []
if collection_dirs:
    print(f"‚úì Collection directories: {len(collection_dirs)}")
else:
    print("‚ÑπÔ∏è  No collection directories yet (will be created on first use)")

# Previously success was reported unconditionally, even after failed checks.
if chunks_found and db_found:
    print("\n‚úÖ Ingestion successful!")
else:
    print("\nIngestion could not be verified: see the failed checks above.")


In [None]:
# Zip the ingestion outputs and offer each archive as a browser download.
# Imported here as well so the cell works when run on its own.
import shutil
from google.colab import files

folders_to_download = [
    PROJECT_ROOT / "data" / "processed",
    PROJECT_ROOT / "data" / "vector_store",
    # The setup cell redirects the vector store to chroma_data, so include it;
    # data/vector_store is kept for runs that use the default location.
    PROJECT_ROOT / "chroma_data",
]

for folder_to_download in folders_to_download:
    if folder_to_download.exists():
        zip_filename = f"{folder_to_download.name}.zip"
        # Create a zip archive of the folder. make_archive appends ".zip" to the
        # base name itself and returns the path of the archive it wrote.
        print(f"Compressing '{folder_to_download}' into '{zip_filename}'...")
        archive_path = shutil.make_archive(folder_to_download.name, 'zip', folder_to_download)

        # Download the zip file
        print(f"Downloading '{zip_filename}'...")
        files.download(archive_path)
        print(f"‚úÖ Download initiated for {zip_filename}")
    else:
        print(f"‚ùóÔ∏è Folder not found: {folder_to_download}. Skipping download for this folder.")

print("\nFinished processing download requests.")

In [None]:
# Persist the Chroma database (and any collection directories) to Google Drive
chroma_db_path = PROJECT_ROOT / 'chroma_data' / 'chroma.sqlite3'

# Destination folder on Drive; created up front so the copies below can land.
drive_db_dir = Path('/content/drive/MyDrive/ai-tutor/chroma_data')
drive_db_dir.mkdir(parents=True, exist_ok=True)
drive_db_path = drive_db_dir / 'chroma.sqlite3'

if not chroma_db_path.exists():
    print("‚ö†Ô∏è  No database found to save")
else:
    print(f"üíæ Saving database to Drive...")
    shutil.copy(chroma_db_path, drive_db_path)
    size_mb = chroma_db_path.stat().st_size / (1024 * 1024)
    print(f"‚úÖ Database saved to Drive")
    print(f"   Size: {size_mb:.2f} MB")
    print(f"   Location: {drive_db_path}")

    # Mirror every sub-directory of chroma_data except the bundled example.
    chroma_data_dir = PROJECT_ROOT / 'chroma_data'
    for item in chroma_data_dir.iterdir():
        if not item.is_dir() or item.name == 'chroma_example':
            continue
        drive_collection_dir = drive_db_dir / item.name
        # Replace any previous copy wholesale so no stale files linger.
        if drive_collection_dir.exists():
            shutil.rmtree(drive_collection_dir)
        shutil.copytree(item, drive_collection_dir)
        print(f"   ‚úÖ Saved collection directory: {item.name}")
