In [None]:
print('Setup complete.')

# Lab 9: Note Catalog System

## Learning Objectives
- Build a knowledge management system with AI assistance
- Implement note indexing and search functionality
- Create semantic search with embeddings
- Design a command-line interface for note management

## Lab Overview
Build an AI-powered note catalog system that can:
1. **Import and Index Notes** - Process markdown files and extract metadata
2. **Semantic Search** - Find relevant notes using natural language queries
3. **Auto-Tagging** - Generate relevant tags for notes automatically
4. **Knowledge Graph** *(optional extension)* - Build connections between related notes

## Exit Ticket
- [ ] Functional note import system
- [ ] Working semantic search
- [ ] AI-powered auto-tagging
- [ ] CLI interface for note operations

In [None]:
# Install required packages
!pip install asksageclient pip_system_certs tiktoken pandas numpy scikit-learn click rich

In [None]:
# ================================
# 🔐 Cell 1 — Load secrets (Colab) + pricing + token utils
# ================================
import os, time, csv
from typing import Optional, Dict
import tiktoken

from google.colab import userdata

def _load_secret(name: str, *, required: bool = True) -> Optional[str]:
    """Fetch a Colab secret by name.

    ``userdata.get`` raises ``SecretNotFoundError`` for a secret that was
    never created, so an optional secret must be fetched defensively rather
    than relying on a falsy return value.

    Args:
        name: Name of the secret in the Colab secrets panel.
        required: When true, a missing or empty secret is an error.

    Returns:
        The secret value, or ``None`` when an optional secret does not exist.

    Raises:
        ValueError: If a required secret is missing or empty.
    """
    try:
        value = userdata.get(name)
    except userdata.SecretNotFoundError:
        value = None
    if required and not value:
        # Explicit raise instead of `assert`, which is stripped under `python -O`.
        raise ValueError(f"{name} not provided.")
    return value


ASKSAGE_API_KEY = _load_secret("ASKSAGE_API_KEY")
# The base URL is optional: when absent the default endpoint is used.
ASKSAGE_BASE_URL = _load_secret("ASKSAGE_BASE_URL", required=False)
ASKSAGE_EMAIL = _load_secret("ASKSAGE_EMAIL")

print("✓ Secrets loaded")
print("  • EMAIL:", ASKSAGE_EMAIL)
print("  • BASE URL:", ASKSAGE_BASE_URL or "(default)")

# Pricing (USD per 1,000,000 tokens)
PRICES_PER_M = {
    "gpt-5": {"input_per_m": 1.25, "output_per_m": 10.00},
    "gpt-5-mini": {"input_per_m": 0.25, "output_per_m": 2.00},
}

# Tokenizer
enc = tiktoken.get_encoding("o200k_base")

def count_tokens(text: str) -> int:
    """Return how many tokens the module-level encoder produces for *text*.

    Falsy input (``None`` or an empty string) is encoded as ``""``.
    """
    token_ids = enc.encode(text if text else "")
    return len(token_ids)

def cost_usd(model: str, input_tokens: int, output_tokens: int) -> float:
    """Estimate the USD cost of one call from its token counts.

    Rates come from ``PRICES_PER_M`` and are expressed per 1,000,000 tokens.

    Raises:
        ValueError: If *model* has no entry in ``PRICES_PER_M``.
    """
    rates = PRICES_PER_M.get(model)
    if rates is None:
        raise ValueError(f"Unknown model: {model}")

    input_cost = (input_tokens / 1_000_000) * rates["input_per_m"]
    output_cost = (output_tokens / 1_000_000) * rates["output_per_m"]
    return input_cost + output_cost

In [None]:
# ================================
# 🔧 Cell 2 — Import bootcamp_common and set up AskSage client
# ================================
import sys
# Relative entry: it is resolved against the current working directory, so the
# import below only succeeds when the notebook runs from its expected folder.
sys.path.append('../../../')  # Adjust path to reach bootcamp_common

from bootcamp_common.ask_sage import AskSageClient

# Initialize AskSage client with the credentials loaded in Cell 1.
# NOTE(review): ASKSAGE_EMAIL is loaded and validated in Cell 1 but is not
# passed here — confirm whether AskSageClient needs it.
client = AskSageClient(
    api_key=ASKSAGE_API_KEY,
    base_url=ASKSAGE_BASE_URL
)

print("✓ AskSage client initialized")

In [None]:
import os
import json
import hashlib
from pathlib import Path
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, asdict
from datetime import datetime

import openai
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import click
from rich.console import Console
from rich.table import Table
from rich.panel import Panel

console = Console()
print("📚 Note Catalog System loading...")

## Note Data Model

In [None]:
@dataclass
class Note:
    """A single markdown note plus the metadata stored for it in the catalog."""
    id: str            # first 8 hex characters of the MD5 of the file path
    title: str         # first level-1 heading, else the file name without suffix
    content: str       # full text of the file
    filepath: str
    tags: List[str]
    created_at: str    # ISO-8601 string, local time
    modified_at: str   # ISO-8601 string, local time
    word_count: int    # number of whitespace-separated tokens in content
    embedding: Optional[List[float]] = None

    @classmethod
    def from_file(cls, filepath: Path) -> 'Note':
        """Create a Note from a markdown file.

        The title is the first non-empty ``# `` heading that is not inside a
        fenced code block; if there is none, the file stem is used. Tags are
        left empty and the embedding unset.

        Args:
            filepath: Path of the UTF-8 encoded markdown file.

        Returns:
            A new ``Note`` describing the file.

        Raises:
            OSError: If the file cannot be read or stat'ed.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        content = filepath.read_text(encoding='utf-8')

        title = filepath.stem
        in_fence = False
        for line in content.split('\n'):
            # Lines such as "# install deps" inside a ``` or ~~~ block are code
            # comments, not headings, so fenced regions are skipped.
            if line.lstrip().startswith(('```', '~~~')):
                in_fence = not in_fence
                continue
            if in_fence or not line.startswith('# '):
                continue
            heading = line[2:].strip()
            if heading:  # ignore an empty "# " heading and keep looking
                title = heading
                break

        # Generate ID from filepath (same path string -> same ID)
        note_id = hashlib.md5(str(filepath).encode()).hexdigest()[:8]

        # File timestamps. st_ctime is the metadata-change time on Unix rather
        # than the creation time, so prefer st_birthtime where the platform
        # exposes it and fall back to st_ctime otherwise.
        stat = filepath.stat()
        created_ts = getattr(stat, 'st_birthtime', stat.st_ctime)
        created_at = datetime.fromtimestamp(created_ts).isoformat()
        modified_at = datetime.fromtimestamp(stat.st_mtime).isoformat()

        return cls(
            id=note_id,
            title=title,
            content=content,
            filepath=str(filepath),
            tags=[],  # Will be populated by AI
            created_at=created_at,
            modified_at=modified_at,
            word_count=len(content.split())
        )

print("📋 Note model defined")

## Note Catalog Manager

In [None]:
class NoteCatalog:
    """Manages a catalog of notes with AI-powered features.

    Notes are held in memory in ``self.notes`` keyed by ``Note.id`` and are
    persisted as JSON at ``catalog_path``.
    """

    def __init__(self, catalog_path: str = "note_catalog.json"):
        """Set up the API client and load any catalog already on disk.

        Args:
            catalog_path: Location of the JSON catalog file.
        """
        self.catalog_path = catalog_path
        self.notes: Dict[str, Note] = {}
        self.setup_client()
        self.load_catalog()

    def setup_client(self):
        """Set up the OpenAI client, falling back to mock mode.

        Sets ``self.has_api`` to tell callers whether ``self.client`` is
        usable. ``self.client`` is always assigned (``None`` in mock mode) so
        that reading the attribute never raises ``AttributeError``.
        """
        self.client = None
        self.has_api = False

        if not os.getenv('OPENAI_API_KEY'):
            console.print("💡 No API key found, using mock features")
            return

        try:
            self.client = openai.OpenAI()
        except Exception as e:
            # Deliberately broad: any client construction failure just
            # switches the catalog to mock mode.
            console.print(f"⚠️ Using mock responses: {e}")
            return

        self.has_api = True
        console.print("✅ OpenAI client configured")

    def load_catalog(self):
        """Load the existing catalog from disk; a missing file is not an error."""
        catalog_file = Path(self.catalog_path)
        if not catalog_file.exists():
            return

        # Explicit UTF-8 so the file is read the same way regardless of the
        # platform's default encoding.
        with open(catalog_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        for note_data in data.get('notes', []):
            note = Note(**note_data)
            self.notes[note.id] = note
        console.print(f"📚 Loaded {len(self.notes)} notes from catalog")

    def save_catalog(self):
        """Save the catalog to disk, overwriting the file at ``catalog_path``."""
        data = {
            'notes': [asdict(note) for note in self.notes.values()],
            'updated_at': datetime.now().isoformat()
        }
        with open(self.catalog_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)
        console.print(f"💾 Saved catalog with {len(self.notes)} notes")

    # TODO: Implement the import_notes method
    def import_notes(self, directory: str) -> int:
        """Import markdown notes from directory.

        Args:
            directory: Folder to scan for markdown files.

        Returns:
            The number of notes imported; 0 when the directory does not exist.
        """
        imported = 0
        notes_dir = Path(directory)

        if not notes_dir.exists():
            console.print(f"❌ Directory not found: {directory}")
            return 0

        # TODO: Find all .md files in directory
        # TODO: Create Note objects from files
        # TODO: Generate embeddings and tags for each note
        # TODO: Add notes to catalog

        # HINT: Use Path.glob("**/*.md") to find markdown files
        # HINT: Call generate_tags() and generate_embedding() for each note

        # Placeholder: returns the counter so the declared int return type
        # holds until the exercise is completed.
        return imported  # Replace with your implementation

# Initialize catalog
# Uses the default catalog path "note_catalog.json"; constructing the catalog
# also sets up the API client and loads any catalog file that already exists.
catalog = NoteCatalog()
print("📚 Note catalog ready!")

## AI-Powered Features

In [None]:
class AIFeatures:
    """AI-powered note analysis features.

    Each feature has an offline fallback that is used when the catalog has no
    API client (``catalog.has_api`` is false) or when the API call fails.
    """

    MOCK_EMBEDDING_DIM = 1536  # length of the vector returned by the mock embedding

    def __init__(self, catalog: "NoteCatalog"):
        """Bind the features to the catalog whose client and notes they use."""
        self.catalog = catalog

    def generate_tags(self, note: "Note") -> List[str]:
        """Generate relevant tags for a note using AI.

        Args:
            note: Note whose title and first 500 characters of content are
                sent to the model.

        Returns:
            A non-empty list of tags. When the API is unavailable, fails, or
            returns no usable tags, keyword-based mock tags are returned
            (``['general']`` if no keyword matches).
        """
        import re  # local import: the notebook's import cell does not load `re`

        prompt = f"""Analyze this note and generate 3-5 relevant tags.
Return only the tags as a comma-separated list.

Title: {note.title}
Content: {note.content[:500]}...

Tags:"""

        if self.catalog.has_api:
            try:
                response = self.catalog.client.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=50,
                    temperature=0.3
                )
                # The message content may be None; treat that as "no tags".
                tags_text = response.choices[0].message.content or ""
                # Drop empty entries produced by blank replies or stray commas.
                tags = [tag.strip() for tag in tags_text.split(',') if tag.strip()]
                if tags:
                    return tags
            except Exception as e:
                console.print(f"⚠️ Tag generation failed: {e}")

        # Mock tags based on content
        content_lower = note.content.lower()
        mock_tags = []
        if 'python' in content_lower:
            mock_tags.append('python')
        # Match "ai" as a whole word only; a plain substring test also fires
        # on words such as "maintain" or "email".
        if re.search(r'\bai\b', content_lower) or 'machine learning' in content_lower:
            mock_tags.append('ai')
        if 'data' in content_lower:
            mock_tags.append('data')
        if 'web' in content_lower:
            mock_tags.append('web')
        return mock_tags or ['general']

    def generate_embedding(self, text: str) -> List[float]:
        """Generate a text embedding for semantic search.

        Args:
            text: Text to embed; only the first 8000 characters are sent to
                the API. ``None`` is treated as an empty string.

        Returns:
            The API embedding, or a mock vector of ``MOCK_EMBEDDING_DIM``
            floats in [0, 1) when the API is unavailable or fails. The mock
            vector is derived from the text, so identical text always yields
            the identical vector.
        """
        text = text or ""

        if self.catalog.has_api:
            try:
                response = self.catalog.client.embeddings.create(
                    model="text-embedding-ada-002",
                    input=text[:8000]  # Limit text length
                )
                return response.data[0].embedding
            except Exception as e:
                console.print(f"⚠️ Embedding generation failed: {e}")

        # Mock embedding: seed the generator from a hash of the text so that
        # results are reproducible across calls and sessions.
        digest = hashlib.sha256(text.encode('utf-8', errors='replace')).digest()
        rng = np.random.default_rng(int.from_bytes(digest[:8], 'big'))
        return rng.random(self.MOCK_EMBEDDING_DIM).tolist()

    # TODO: Implement semantic search
    def semantic_search(self, query: str, top_k: int = 5) -> List[tuple]:
        """Search notes using semantic similarity.

        Args:
            query: Natural-language search text.
            top_k: Maximum number of results to return.

        Returns:
            A list of result tuples. The notebook's test cell unpacks each
            result as ``(score, note)``.
        """

        # TODO: Generate embedding for query
        # TODO: Calculate similarity with all note embeddings
        # TODO: Return top-k most similar notes with scores

        # HINT: Use cosine_similarity from sklearn
        # HINT: Filter out notes without embeddings

        # Placeholder: an empty result keeps the declared list return type
        # until the exercise is completed.
        return []  # Replace with your implementation

# Initialize AI features
# Bound to the `catalog` instance created above, so it uses that catalog's
# API client and mock-mode flag.
ai_features = AIFeatures(catalog)
print("🤖 AI features ready!")

## Demo: Create Sample Notes

In [None]:
# Create sample notes directory
# exist_ok=True makes re-running this cell safe when the folder already exists.
sample_dir = Path("sample_notes")
sample_dir.mkdir(exist_ok=True)

# Sample notes content
# Maps each file name to the markdown text written to it; every note starts
# with a level-1 heading.
sample_notes = {
    "python_basics.md": """# Python Basics

Python is a high-level programming language known for its simplicity and readability.

## Key Features
- Easy to learn syntax
- Extensive standard library
- Large ecosystem of third-party packages
- Great for data science, web development, and automation

## Getting Started
```python
print("Hello, World!")
```""",
    
    "machine_learning_intro.md": """# Introduction to Machine Learning

Machine learning is a subset of AI that enables computers to learn without explicit programming.

## Types of ML
1. **Supervised Learning** - Learning with labeled data
2. **Unsupervised Learning** - Finding patterns in unlabeled data
3. **Reinforcement Learning** - Learning through interaction

## Popular Libraries
- scikit-learn
- TensorFlow
- PyTorch""",
    
    "web_development.md": """# Web Development Notes

Web development involves creating applications that run on the internet.

## Frontend Technologies
- HTML/CSS for structure and styling
- JavaScript for interactivity
- React, Vue, Angular for frameworks

## Backend Technologies
- Python (Django, Flask)
- Node.js
- Database integration"""
}

# Write sample files
# write_text replaces the content of a file that already exists, so each run
# resets the samples to the text above.
for filename, content in sample_notes.items():
    filepath = sample_dir / filename
    filepath.write_text(content, encoding='utf-8')

console.print(f"📝 Created {len(sample_notes)} sample notes in {sample_dir}")

## Task 1: Implement Note Import

Complete the `import_notes` method in the `NoteCatalog` class above.

In [None]:
# TODO: Test your import_notes implementation
# imported_count = catalog.import_notes("sample_notes")
# console.print(f"✅ Imported {imported_count} notes")
# catalog.save_catalog()

pass  # Replace with your test code

## Task 2: Implement Semantic Search

Complete the `semantic_search` method in the `AIFeatures` class above.

In [None]:
# TODO: Test your semantic search implementation
# search_results = ai_features.semantic_search("python programming")
# 
# table = Table(title="Search Results")
# table.add_column("Score", style="green")
# table.add_column("Title", style="blue")
# table.add_column("Tags", style="yellow")
# 
# for score, note in search_results:
#     table.add_row(f"{score:.3f}", note.title, ", ".join(note.tags))
# 
# console.print(table)

pass  # Replace with your test code

## Task 3: Build CLI Interface

In [None]:
# TODO: Implement a CLI interface using Click
# Commands to implement:
# - import: Import notes from directory
# - search: Search notes by query
# - list: List all notes
# - show: Show specific note content
# - tags: List all tags

# Root command group; subcommands are attached to it with @cli.command().
# NOTE: Click displays this function's docstring as the group's --help text,
# so its wording is user-facing.
@click.group()
def cli():
    """Note Catalog CLI"""
    pass

# TODO: Add CLI commands here
# Example:
# @cli.command()
# @click.argument('directory')
# def import_cmd(directory):
#     """Import notes from directory"""
#     pass

print("🖥️ CLI framework ready (implement commands above)")

## Extension Ideas

🚀 **Advanced Features to Implement:**

1. **Knowledge Graph**: Build connections between related notes
2. **Auto-Summary**: Generate summaries of long notes
3. **Duplicate Detection**: Find similar or duplicate notes
4. **Export Features**: Export to various formats (PDF, HTML)
5. **Version Control**: Track note changes over time
6. **Web Interface**: Build a web UI with Streamlit or Flask
7. **Integration**: Connect with existing note apps (Obsidian, Notion)

## Deliverable Checklist

- [ ] Functional note import system that processes markdown files
- [ ] AI-powered tag generation for automatic categorization
- [ ] Semantic search using embeddings and similarity matching
- [ ] Command-line interface for note operations
- [ ] Proper error handling and user feedback
- [ ] Data persistence with JSON catalog storage

**Bonus Points:**
- [ ] Performance optimization for large note collections
- [ ] Advanced search filters (date, tags, file type)
- [ ] Export functionality for search results
- [ ] Integration with external APIs or services