In [1]:

# Install required packages
# %pip install -q ipywidgets ipython  # IPython.display is a module inside the ipython package, not a separate install

import asyncio
import numpy as np
import tempfile
import wave
import os
import time
from typing import Optional, AsyncGenerator

# Import IPython widgets for interactive UI
import ipywidgets as widgets
from IPython.display import display, Audio, clear_output

# Import our local adapters
from local_tts_adapter import LocalTTSAdapter
from local_stt_adapter import LocalSTTAdapter
from local_llm_adapter import MockLocalLLMAdapter, LocalLLMAdapter
from livekit.agents.llm import ChatContext, ChatMessage

print("✅ All dependencies imported successfully!")


  from pkg_resources import resource_filename


✅ All dependencies imported successfully!


In [2]:

class VoiceAgent:
    """Voice agent for voice-to-voice interaction.

    Ties together a speech-to-text adapter, an LLM adapter and a
    text-to-speech adapter, and keeps the running conversation in a
    ``ChatContext`` so that later turns are sent along with earlier ones.

    Args:
        use_ollama: When true, ``initialize`` builds a ``LocalLLMAdapter``
            pointed at ``http://localhost:11434``; otherwise a
            ``MockLocalLLMAdapter`` is used.
        ollama_model: Model name handed to ``LocalLLMAdapter``.
    """

    # Kept in one place so __init__ and clear_chat_history cannot drift apart.
    SYSTEM_PROMPT = "You are a helpful local AI assistant. Keep responses concise and friendly."

    def __init__(self, use_ollama=False, ollama_model="llama3.2"):
        # Adapters stay None until initialize() has run.
        self.tts = None
        self.stt = None
        self.llm = None
        self.use_ollama = use_ollama
        self.ollama_model = ollama_model

        # Chat context, seeded with the system prompt only
        self.chat_context = ChatContext(messages=self._initial_messages())

        self.initialized = False

    @classmethod
    def _initial_messages(cls):
        """Return a fresh message list containing only the system prompt."""
        return [ChatMessage(role="system", content=cls.SYSTEM_PROMPT)]

    @staticmethod
    async def _collect(stream):
        """Drain an async iterator and return its items as a list."""
        return [item async for item in stream]

    @staticmethod
    def _remove_quietly(path):
        """Delete ``path`` on a best-effort basis.

        Only ``OSError`` is ignored (missing file, permissions, ...); a bare
        ``except`` here would also swallow ``KeyboardInterrupt`` and task
        cancellation.
        """
        try:
            os.unlink(path)
        except OSError:
            pass

    async def initialize(self):
        """Initialize all components (TTS, STT, then the LLM adapter)."""
        print("🚀 Initializing Voice Agent...")

        # Initialize TTS
        print("🔊 Loading TTS...")
        self.tts = LocalTTSAdapter()
        await self.tts.initialize()

        # Initialize STT
        print("🎤 Loading STT...")
        self.stt = LocalSTTAdapter()
        await self.stt.initialize()

        # Initialize LLM
        print("🤖 Loading LLM...")
        if self.use_ollama:
            self.llm = LocalLLMAdapter(
                model_name=self.ollama_model,
                base_url="http://localhost:11434"
            )
        else:
            self.llm = MockLocalLLMAdapter()

        # Only flagged once every step above has succeeded.
        self.initialized = True
        print("✅ All components initialized!")

    async def _respond(self, user_text):
        """Run one conversational turn for ``user_text`` and speak the reply.

        Appends the user message to the chat context, joins the chunks
        yielded by ``self.llm.chat`` into the reply, appends the reply to the
        context, then joins the chunks yielded by ``self.tts.synthesize`` and
        displays them as an autoplaying audio element.

        Returns:
            The assistant's reply text.
        """
        # Add to chat context
        self.chat_context.messages.append(
            ChatMessage(role="user", content=user_text)
        )

        # Generate response
        print("🤖 Generating response...")
        response = ''.join(await self._collect(self.llm.chat(self.chat_context)))
        print(f"🤖 Assistant: {response}")

        # Add assistant response to context
        self.chat_context.messages.append(
            ChatMessage(role="assistant", content=response)
        )

        # Generate speech
        print("🔊 Generating speech...")
        audio_chunks = await self._collect(self.tts.synthesize(response))

        # Combine audio chunks and play
        if audio_chunks:
            # NOTE(review): assumes the joined chunks form a byte stream that
            # IPython's Audio can play as-is - confirm against LocalTTSAdapter.
            combined_audio = b''.join(audio_chunks)
            display(Audio(data=combined_audio, autoplay=True))
            print("🔊 Playing response...")

        return response

    async def process_voice_input(self):
        """Record audio, transcribe it, and answer by voice.

        Initializes the agent first if that has not happened yet.

        Returns:
            The assistant's reply text, or a fixed "couldn't hear anything"
            message when the transcription comes back empty.
        """
        if not self.initialized:
            await self.initialize()

        # Create audio recorder widget
        # NOTE(review): ipywidgets' Audio looks like a playback widget rather
        # than a recorder, and the notebook reports this path as disabled for
        # widget compatibility reasons - confirm before relying on it.
        print("🎤 Click the microphone button below and speak:")
        audio_recorder = widgets.Audio(
            value=None,
            format='wav',
            sample_rate=16000,
            autoplay=False
        )

        display(audio_recorder)

        # Wait for user to record. This must yield to the event loop: a
        # blocking time.sleep() here would stall the very loop that has to
        # deliver the widget's value update.
        while audio_recorder.value is None:
            await asyncio.sleep(0.1)

        audio_data = audio_recorder.value
        print("✅ Audio recorded successfully!")

        # Save audio to temporary file. delete=False because the STT adapter
        # is handed a path; the finally block below removes the file.
        temp_file = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        temp_path = temp_file.name

        try:
            # Written inside the try so a failed write still gets cleaned up.
            with temp_file:
                temp_file.write(audio_data)

            # Transcribe audio
            print("🎤 Transcribing audio...")
            transcription = await self.stt.transcribe_audio_file(temp_path)

            if not transcription or not transcription.strip():
                return "I couldn't hear anything. Please try again."

            print(f"👤 You said: {transcription}")

            return await self._respond(transcription)

        finally:
            # Clean up temporary file
            self._remove_quietly(temp_path)

    async def process_text_input(self, text: str):
        """Answer a typed message by voice.

        Initializes the agent first if that has not happened yet.

        Args:
            text: The user's message.

        Returns:
            The assistant's reply text.
        """
        if not self.initialized:
            await self.initialize()

        print(f"👤 You: {text}")

        return await self._respond(text)

    def get_chat_history(self):
        """Get chat history (the live message list, system prompt included)."""
        return self.chat_context.messages

    def clear_chat_history(self):
        """Clear chat history, leaving only the system prompt."""
        self.chat_context.messages = self._initial_messages()
        print("🗑️ Chat history cleared!")


In [5]:

# Create voice agent instance
agent = VoiceAgent(
    use_ollama=False,  # Set to True if you have Ollama running
    ollama_model="llama3.2"
)

# Initialize the agent
await agent.initialize()

print("🎉 Voice agent ready for interaction!")


🚀 Initializing Voice Agent...
🔊 Loading TTS...
✅ Local TTS model loaded: tts_models/en/vctk/vits
🎤 Loading STT...
✅ Local STT model loaded on GPU: small
🤖 Loading LLM...
✅ All components initialized!
🎉 Voice agent ready for interaction!


In [6]:
# Voice capture is skipped for now because of audio widget issues, so this
# cell only announces that and points the reader at the text-input cell.
print(
    "🎤 Starting voice-to-voice interaction...",
    "Click the microphone button below and speak:",
    "⚠️ Voice input temporarily disabled due to audio widget compatibility issues",
    "💡 Please use the text input in the next cell instead",
    sep="\n",
)


🎤 Starting voice-to-voice interaction...
Click the microphone button below and speak:
⚠️ Voice input temporarily disabled due to audio widget compatibility issues
💡 Please use the text input in the next cell instead


In [7]:

# Text interaction - run this cell to test text input
# Sends one fixed greeting through the agent: the reply is printed, spoken via
# the TTS adapter, and returned.
print("💬 Text-to-voice interaction...")

# Left as the cell's bare final expression so the returned reply text is also
# echoed as the cell result.
await agent.process_text_input("Hello, how are you today?")

💬 Text-to-voice interaction...
👤 You: Hello, how are you today?
🤖 Generating response...
🤖 Assistant: Hello! I'm your local AI assistant. How can I help you today?
🔊 Generating speech...


🔊 Playing response...


"Hello! I'm your local AI assistant. How can I help you today?"

In [None]:

# Build the controls for the interactive interface.

# Everything the handlers print or display is captured here.
output_area = widgets.Output()

# Free-text entry for the typed path.
text_input = widgets.Text(
    layout=widgets.Layout(width='300px'),
    placeholder="Type your message...",
)

# One button per action; each gets its own Layout instance.
voice_button = widgets.Button(
    layout=widgets.Layout(width='150px', height='40px'),
    button_style='success',
    description="🎤 Voice Input",
)

text_button = widgets.Button(
    layout=widgets.Layout(width='100px', height='40px'),
    button_style='info',
    description="💬 Send",
)

clear_button = widgets.Button(
    layout=widgets.Layout(width='100px', height='40px'),
    button_style='warning',
    description="🗑️ Clear",
)

# Function to handle voice input
async def on_voice_click(b):
    """Handle a click on the voice button.

    Clears ``output_area`` and runs ``agent.process_voice_input()`` inside it,
    so whatever that call prints or displays is captured by the output widget.

    Args:
        b: The clicked button, as passed by ipywidgets; unused.
    """
    with output_area:
        # wait=True postpones the actual clearing until new output arrives.
        clear_output(wait=True)
        await agent.process_voice_input()

# Function to handle text input
async def on_text_click(b):
    """Send the text box contents to the agent and then empty the box.

    Whitespace-only input is ignored. Otherwise ``output_area`` is cleared and
    ``agent.process_text_input`` runs inside it with the stripped message.

    Args:
        b: The clicked button, as passed by ipywidgets; unused.
    """
    message = text_input.value.strip()
    if not message:
        return

    with output_area:
        clear_output(wait=True)
        await agent.process_text_input(message)

    # Reset the box once the turn has been handled.
    text_input.value = ""

# Function to clear chat
def on_clear_click(b):
    """Reset the conversation and confirm it inside ``output_area``.

    ``agent.clear_chat_history()`` prints its own confirmation message, so it
    is called inside the output context. Calling it outside and then printing
    the same text again emitted the message twice, once where the output
    widget does not capture it.

    Args:
        b: The clicked button, as passed by ipywidgets; unused.
    """
    with output_area:
        clear_output(wait=True)
        agent.clear_chat_history()

# Connect button events.
# The event loop keeps only weak references to tasks, so a task whose result
# is discarded can be garbage-collected before it finishes. Hold a strong
# reference for as long as each handler is running.
_pending_handlers = set()


def _schedule(handler):
    """Wrap async ``handler`` as a synchronous ``on_click`` callback."""
    def _on_click(b):
        task = asyncio.create_task(handler(b))
        _pending_handlers.add(task)
        # Drop the reference again once the handler has completed.
        task.add_done_callback(_pending_handlers.discard)
    return _on_click


voice_button.on_click(_schedule(on_voice_click))
text_button.on_click(_schedule(on_text_click))
clear_button.on_click(on_clear_click)

# Display the interface
print("🎮 Interactive Voice Agent Interface")
print("=" * 50)

# Controls on one row, captured output underneath.
button_row = widgets.HBox([voice_button, text_input, text_button, clear_button])
interface = widgets.VBox([button_row, output_area])

display(interface)

🎮 Interactive Voice Agent Interface


VBox(children=(HBox(children=(Button(button_style='success', description='🎤 Voice Input', layout=Layout(height…

In [9]:

# View chat history
print("📝 Chat History:")
print("=" * 30)

# The system prompt is configuration rather than conversation, so skip it.
# No enumerate() here: the index was never used.
for message in agent.get_chat_history():
    if message.role == "system":
        continue
    print(f"{message.role.upper()}: {message.content}")
    print("-" * 30)


📝 Chat History:
USER: Hello, how are you today?
------------------------------
ASSISTANT: Hello! I'm your local AI assistant. How can I help you today?
------------------------------


In [11]:

# Check system status
print("🔍 System Status:")
print("=" * 30)

# Check GPU. torch is optional, so it is imported here rather than in the
# import cell; only the import itself is guarded, so an ImportError raised
# by the CUDA queries is not misreported as "PyTorch not installed".
try:
    import torch
except ImportError:
    print("⚠️ PyTorch not installed")
else:
    if torch.cuda.is_available():
        print(f"✅ GPU available: {torch.cuda.get_device_name(0)}")
    else:
        print("⚠️ GPU not available, using CPU")

# Check components: an adapter counts as initialized once it is set (truthy).
print(f"\n🔊 TTS: {'✅ Initialized' if agent.tts else '❌ Not initialized'}")
print(f"🎤 STT: {'✅ Initialized' if agent.stt else '❌ Not initialized'}")
print(f"🤖 LLM: {'✅ Initialized' if agent.llm else '❌ Not initialized'}")

# Only claim readiness when every component above is actually present.
if agent.tts and agent.stt and agent.llm:
    print("\n🎉 Voice agent is ready for voice-to-voice interaction!")
else:
    print("\n⚠️ Voice agent is not fully initialized - run `await agent.initialize()` first.")

🔍 System Status:
✅ GPU available: NVIDIA GeForce RTX 3050 Laptop GPU

🔊 TTS: ✅ Initialized
🎤 STT: ✅ Initialized
🤖 LLM: ✅ Initialized

🎉 Voice agent is ready for voice-to-voice interaction!


In [14]:

# Alternative: Manual voice interaction
# The call itself stays commented out; this cell only prints the instructions.
print(
    "🎤 Manual Voice Interaction",
    "Uncomment the line below to start voice interaction:",
    sep="\n",
)

# await agent.process_voice_input()

🎤 Manual Voice Interaction
Uncomment the line below to start voice interaction:


In [15]:

# Configuration options
# Printed as one call with sep="\n"; the entries that start with "\n" produce
# the blank line before each section.
print(
    "⚙️ Configuration Options:",
    "=" * 30,
    # Switching the LLM backend
    "To use Ollama instead of Mock LLM:",
    "1. Install Ollama: curl -fsSL https://ollama.ai/install.sh | sh",
    "2. Start Ollama: ollama serve",
    "3. Pull a model: ollama pull llama3.2",
    "4. Set use_ollama=True in the agent initialization",
    # Picking a different TTS voice
    "\nTo change TTS voice:",
    "Modify the LocalTTSAdapter initialization:",
    "tts = LocalTTSAdapter(speaker_id='p227')  # Different voice",
    # Picking a different STT model size
    "\nTo change STT model size:",
    "Modify the LocalSTTAdapter initialization:",
    "stt = LocalSTTAdapter(model_size='medium')  # Larger model",
    "\n🎉 Your local voice agent is ready!",
    sep="\n",
)

⚙️ Configuration Options:
To use Ollama instead of Mock LLM:
1. Install Ollama: curl -fsSL https://ollama.ai/install.sh | sh
2. Start Ollama: ollama serve
3. Pull a model: ollama pull llama3.2
4. Set use_ollama=True in the agent initialization

To change TTS voice:
Modify the LocalTTSAdapter initialization:
tts = LocalTTSAdapter(speaker_id='p227')  # Different voice

To change STT model size:
Modify the LocalSTTAdapter initialization:
stt = LocalSTTAdapter(model_size='medium')  # Larger model

🎉 Your local voice agent is ready!
