In [None]:
!pip install --quiet "livekit-agents[openai,silero]"
!pip install --quiet livekit-api
!pip install --quiet llama-index
!pip install --quiet aiohttp
!pip install --quiet nest-asyncio

In [None]:
# WORKING NOTEBOOK + RAG

import logging
import os
from pathlib import Path

# =============================================================================
# SET YOUR API KEYS HERE (same as working version)
# =============================================================================
# os.environ['OPENAI_API_KEY'] = 'your-openai-api-key-here'
# os.environ['ELEVEN_API_KEY'] = 'your-elevenlabs-api-key-here'  # Needed by elevenlabs.TTS(), which the Assistant below uses
# os.environ['LIVEKIT_URL'] = 'wss://your-project.livekit.cloud'
# os.environ['LIVEKIT_API_KEY'] = 'your-livekit-api-key'
# os.environ['LIVEKIT_API_SECRET'] = 'your-livekit-api-secret'


# =============================================================================
# QUICK RAG SETUP (works with YOUR uploaded files)
# =============================================================================
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex

# Directory that holds the text files indexed for RAG.
DATA_DIR = Path("/content/rag_data")
# parents=True so this also works when /content does not exist yet
# (e.g. when the notebook is run outside Colab).
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Tell the user how to get their own files into DATA_DIR.
print(
    "📁 UPLOAD YOUR OWN FILES:",
    "1. Click the folder icon on the left sidebar",
    "2. Upload your .txt files to Colab",
    "3. Run this code to move them:",
    "",
    "# Copy your uploaded files to the RAG directory",
    "!cp /content/*.txt /content/rag_data/",
    "",
    "4. Then run the main code below",
    "=" * 50,
    sep="\n",
)

def setup_quick_rag():
    """Build the module-level ``rag_index`` from the files in ``DATA_DIR``.

    Looks for ``*.txt`` files in ``DATA_DIR``. When none are present, three
    sample documents (AI basics, Python, vector databases) are written there
    as a fallback. The directory is then loaded with ``SimpleDirectoryReader``
    and indexed with ``VectorStoreIndex``; the resulting index is stored in
    the global ``rag_index``.

    Progress messages and a short preview of every loaded document are
    printed. Returns ``None``.
    """
    global rag_index

    print("📁 Looking for your uploaded text files...")

    # Check for uploaded files first
    uploaded_files = list(DATA_DIR.glob("*.txt"))

    if uploaded_files:
        print(f"✅ Found {len(uploaded_files)} uploaded files:")
        for uploaded in uploaded_files:
            print(f"   - {uploaded.name}")
    else:
        print("📝 No uploaded files found. Creating sample documents...")
        print("💡 To use your own files:")
        print("   1. Upload .txt files to Colab")
        print("   2. Move them to /content/rag_data/")
        print("   3. Restart this cell")
        print("")

        # Create sample documents as fallback
        sample_docs = [
            ("ai_basics.txt", """
            Artificial Intelligence (AI) is computer science focused on creating intelligent machines.
            Machine Learning is a subset of AI that learns from data without explicit programming.
            Deep Learning uses neural networks with multiple layers for complex pattern recognition.
            Natural Language Processing (NLP) enables computers to understand human language.
            Computer Vision allows machines to interpret visual information from images and videos.
            The three main types of machine learning are supervised learning, unsupervised learning, and reinforcement learning.
            """),

            ("python_info.txt", """
            Python is a high-level programming language known for simplicity and readability.
            Popular Python libraries include NumPy for numerical computing, Pandas for data analysis,
            Matplotlib for visualization, Scikit-learn for machine learning, and TensorFlow for deep learning.
            Python is used in web development, data science, automation, and artificial intelligence.
            Python was created by Guido van Rossum and first released in 1991.
            """),

            ("vector_db_info.txt", """
            Vector databases store high-dimensional vectors and enable similarity search.
            Embeddings are numerical representations of data as vectors in high-dimensional space.
            Popular vector databases include Pinecone, Weaviate, Chroma, and FAISS.
            Vector databases are essential for RAG systems, recommendation engines, and semantic search.
            Similarity search in vector databases typically uses cosine similarity or Euclidean distance.
            """),
        ]

        # Write sample documents (never overwrite a file that already exists).
        # The encoding is explicit so the result does not depend on the
        # platform's default locale.
        for filename, content in sample_docs:
            doc_path = DATA_DIR / filename
            if not doc_path.exists():
                doc_path.write_text(content, encoding="utf-8")

    # Create RAG index from all files in directory
    print("🧠 Building RAG index from text files...")
    documents = SimpleDirectoryReader(str(DATA_DIR)).load_data()
    rag_index = VectorStoreIndex.from_documents(documents)
    print(f"✅ RAG ready with {len(documents)} documents")

    # Show what's in the knowledge base
    print("📚 Knowledge base contains:")
    for doc in documents:
        # First 100 characters, flattened onto one line, as a preview
        preview = doc.text[:100].replace('\n', ' ').strip()
        print(f"   - {preview}...")


# Initialize RAG
# `rag_index` is the module-level slot that setup_quick_rag() fills in (it
# declares the name `global`) and that search_knowledge() reads on each query.
rag_index = None
# Build the index right away, when this cell is executed.
setup_quick_rag()

# =============================================================================
# LIVEKIT SETUP
# =============================================================================
# Named logger for this notebook's agent; emits INFO and above.
logger = logging.getLogger("dlai-agent")
logger.setLevel(logging.INFO)

from livekit import agents
from livekit.agents import Agent, AgentSession, JobContext, WorkerOptions, jupyter, llm
from livekit.plugins import (
    openai,
    elevenlabs,
    silero,
)

# =============================================================================
# RAG FUNCTION
# =============================================================================
@llm.function_tool
async def search_knowledge(query: str) -> str:
    """REQUIRED: Search the knowledge base for information. Must be used for ALL questions."""
    # NOTE(review): the docstring above is kept verbatim on purpose - the
    # function_tool decorator presumably exposes it to the LLM as the tool
    # description, so it is effectively runtime text. Confirm before editing.
    #
    # Contract: always returns a string starting with one of three prefixes
    # that the Assistant's instructions refer to:
    #   KNOWLEDGE_BASE_RESULT: <answer>   - the query engine produced an answer
    #   NO_KNOWLEDGE_FOUND: ...           - the answer looked empty/apologetic
    #   SEARCH_ERROR: ...                 - the index is missing or the query raised
    # It never raises; failures are reported through the return value.

    # Report an unbuilt index explicitly instead of surfacing an opaque
    # "'NoneType' object has no attribute ..." message.
    if rag_index is None:
        return (
            "SEARCH_ERROR: I couldn't search the knowledge base: "
            "the index has not been built yet."
        )

    try:
        print(f"🔍 RAG Search: {query}")
        query_engine = rag_index.as_query_engine(
            use_async=True,
            similarity_top_k=3,  # Get top 3 most relevant chunks
        )
        response = await query_engine.aquery(query)
        result = str(response)

        # Heuristic check for a meaningless answer: very short text, or an
        # apology from the model.
        if len(result.strip()) < 20 or "sorry" in result.lower():
            return "NO_KNOWLEDGE_FOUND: I don't have information about this topic in my knowledge base."

        print(f"📝 RAG Answer: {result[:100]}...")
        return f"KNOWLEDGE_BASE_RESULT: {result}"
    except Exception as e:
        # Tool boundary: keep the broad catch so the agent keeps talking, but
        # record the traceback instead of silently swallowing it.
        logger.exception("RAG search failed for query %r", query)
        return f"SEARCH_ERROR: I couldn't search the knowledge base: {str(e)}"

# =============================================================================
# ASSISTANT CLASS
# =============================================================================
class Assistant(Agent):
    """Voice agent that is instructed to answer only via ``search_knowledge``.

    Wires together an OpenAI LLM (``gpt-4o``), OpenAI speech-to-text,
    ElevenLabs text-to-speech and Silero voice-activity detection, and
    registers ``search_knowledge`` as the agent's only tool.
    """

    def __init__(self) -> None:
        # Components are created in the same order as before: LLM, STT, TTS, VAD.
        language_model = openai.LLM(model="gpt-4o")
        speech_to_text = openai.STT()
        # To pin a specific voice instead of the plugin default, e.g.:
        #   elevenlabs.TTS(voice_id="CwhRBWXzGAHq8TQ4Fs17")
        text_to_speech = elevenlabs.TTS()
        voice_detector = silero.VAD.load()

        super().__init__(
            tools=[search_knowledge],  # the RAG lookup tool
            vad=voice_detector,
            tts=text_to_speech,
            llm=language_model,
            stt=speech_to_text,
            instructions="""
                CRITICAL INSTRUCTIONS - FOLLOW EXACTLY:

                1. You MUST use the search_knowledge function for EVERY question or request.
                2. You can ONLY answer based on information returned by search_knowledge.
                3. If search_knowledge returns "NO_KNOWLEDGE_FOUND", say "I don't have that information in my knowledge base."
                4. If search_knowledge returns "SEARCH_ERROR", say "I'm having trouble accessing my knowledge base."
                5. NEVER use your built-in knowledge or training data to answer questions.
                6. ALWAYS start your response by calling search_knowledge first.
                7. If the search result doesn't fully answer the question, say "My knowledge base has limited information on this topic" and only share what was found.

                You are a voice assistant that ONLY knows what's in the knowledge base.
                Keep responses conversational and concise for voice interaction.
            """,
        )

# =============================================================================
# ENTRYPOINT
# =============================================================================
async def entrypoint(ctx: JobContext):
    """Worker entrypoint: connect the job, then run an ``Assistant`` in its room."""
    await ctx.connect()

    # A fresh session with default settings; the Assistant supplies the
    # STT / LLM / TTS / VAD components itself.
    voice_session = AgentSession()
    await voice_session.start(room=ctx.room, agent=Assistant())

# =============================================================================
# RUN SETUP
# =============================================================================
# Usage banner: one entry per output line, emitted in a single print call.
print(
    "🚀 Starting your working notebook + RAG...",
    "🧠 Now with knowledge about AI, Python, and vector databases!",
    "",
    "📝 Instructions:",
    "- Unmute the microphone symbol on the left",
    "- You can ignore the 'Start Audio' button",
    "- Start by speaking a long phrase like 'hello, how are you today'",
    "",
    "🎤 Try asking (should work - in knowledge base):",
    "- 'What is machine learning?'",
    "- 'Tell me about Python programming'",
    "- 'How do vector databases work?'",
    "- 'Who created Python?'",
    "",
    "🚫 Try asking (should say 'no knowledge'):",
    "- 'What's the weather today?'",
    "- 'Tell me about JavaScript'",
    "- 'Who is the CEO of our company?'",
    "",
    "✅ The agent will ONLY answer from YOUR uploaded files (or samples if none uploaded)!",
    sep="\n",
)

# Launch the agent worker inside the notebook, using `entrypoint` as the job
# handler and the given URL as the join-token endpoint.
jupyter.run_app(
    WorkerOptions(entrypoint_fnc=entrypoint),
    jupyter_url="https://jupyter-api-livekit.vercel.app/api/join-token",
)
