In [11]:

# Install required packages
# !pip install -q ipywidgets IPython.display

import asyncio
import numpy as np
import tempfile
import wave
import os
import time
from typing import Optional, AsyncGenerator

# Import IPython widgets for interactive UI
import ipywidgets as widgets
from IPython.display import display, Audio, clear_output

# Import our local adapters
from local_tts_adapter import LocalTTSAdapter
from local_stt_adapter import LocalSTTAdapter
from local_llm_adapter import MockLocalLLMAdapter, LocalLLMAdapter
from livekit.agents.llm import ChatContext, ChatMessage

print("✅ All dependencies imported successfully!")


✅ All dependencies imported successfully!


In [12]:
class VoiceAgent:
    """Voice agent for voice-to-voice interaction"""
    
    def __init__(self, use_ollama=False, ollama_model="llama3.2"):
        self.tts = None
        self.stt = None
        self.llm = None
        self.use_ollama = use_ollama
        self.ollama_model = ollama_model
        
        # Chat context
        self.chat_context = ChatContext(messages=[
            ChatMessage(
                role="system",
                content="You are a helpful local AI assistant. Keep responses concise and friendly."
            )
        ])
        
        self.initialized = False
        
    async def initialize(self):
        """Initialize all components"""
        print("🚀 Initializing Voice Agent...")
        
        # Initialize TTS
        print("🔊 Loading TTS...")
        self.tts = LocalTTSAdapter()
        await self.tts.initialize()
        
        # Initialize STT
        print("🎤 Loading STT...")
        self.stt = LocalSTTAdapter()
        await self.stt.initialize()
        
        # Initialize LLM
        print("🤖 Loading LLM...")
        if self.use_ollama:
            self.llm = LocalLLMAdapter(
                model_name=self.ollama_model,
                base_url="http://localhost:11434"
            )
        else:
            self.llm = MockLocalLLMAdapter()
        
        self.initialized = True
        print("✅ All components initialized!")
        
    async def process_voice_input(self):
        """Process voice input and return response"""
        if not self.initialized:
            await self.initialize()
        
        # Create audio recorder widget
        print("🎤 Click the microphone button below and speak:")
        audio_recorder = widgets.Audio(
            format='wav',
            sample_rate=16000,
            autoplay=False
        )
        
        display(audio_recorder)
        
        # Wait for user to record
        while audio_recorder.value is None:
            await asyncio.sleep(0.1)
            
        audio_data = audio_recorder.value
        print("✅ Audio recorded successfully!")
        
        # Save audio to temporary file with proper wav format
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
            temp_path = temp_file.name
            
            # Convert audio data to proper WAV format
            try:
                with wave.open(temp_path, 'wb') as wf:
                    wf.setnchannels(1)  # Mono
                    wf.setsampwidth(2)  # 16-bit
                    wf.setframerate(16000)  # 16kHz
                    wf.writeframes(audio_data)
            except Exception as e:
                print(f"❌ Error saving audio: {str(e)}")
                return "There was an issue with the audio recording. Please try again."
        
        try:
            # Transcribe audio
            print("🎤 Transcribing audio...")
            try:
                transcription = await self.stt.transcribe_audio_file(temp_path)
                
                if not transcription or transcription.strip() == "":
                    print("⚠️ No speech detected in audio")
                    return "I couldn't hear anything. Please try again."
                
                print(f"👤 You said: {transcription}")
            except Exception as e:
                print(f"❌ Transcription failed: {str(e)}")
                return "I couldn't hear anything. Please try again."
            
            # Add to chat context
            self.chat_context.messages.append(
                ChatMessage(role="user", content=transcription)
            )
            
            # Generate response
            print("🤖 Generating response...")
            response_chunks = []
            async for chunk in self.llm.chat(self.chat_context):
                response_chunks.append(chunk)
            
            response = ''.join(response_chunks)
            print(f"🤖 Assistant: {response}")
            
            # Add assistant response to context
            self.chat_context.messages.append(
                ChatMessage(role="assistant", content=response)
            )
            
            # Generate speech
            print("🔊 Generating speech...")
            audio_chunks = []
            async for chunk in self.tts.synthesize(response):
                audio_chunks.append(chunk)
            
            # Combine audio chunks and play
            if audio_chunks:
                combined_audio = b''.join(audio_chunks)
                audio_widget = Audio(data=combined_audio, autoplay=True)
                display(audio_widget)
                print("🔊 Playing response...")
            
            return response
            
        finally:
            # Clean up temporary file
            try:
                os.unlink(temp_path)
            except:
                pass
    
    async def process_text_input(self, text: str):
        """Process text input and return response"""
        if not self.initialized:
            await self.initialize()
        
        print(f"👤 You: {text}")
        
        # Add to chat context
        self.chat_context.messages.append(
            ChatMessage(role="user", content=text)
        )
        
        # Generate response
        print("🤖 Generating response...")
        response_chunks = []
        async for chunk in self.llm.chat(self.chat_context):
            response_chunks.append(chunk)
        
        response = ''.join(response_chunks)
        print(f"🤖 Assistant: {response}")
        
        # Add assistant response to context
        self.chat_context.messages.append(
            ChatMessage(role="assistant", content=response)
        )
        
        # Generate speech
        print("🔊 Generating speech...")
        try:
            audio_chunks = []
            chunk_count = 0
            async for chunk in self.tts.synthesize(response):
                audio_chunks.append(chunk)
                chunk_count += 1
                
            print(f"📊 Received {chunk_count} audio chunks")
            
            # Combine audio chunks and play
            if audio_chunks:
                combined_audio = b''.join(audio_chunks)
                print(f"📊 Audio size: {len(combined_audio)} bytes")
                
                # Enhanced audio player with explicit sample rate
                audio_widget = Audio(
                    data=combined_audio, 
                    rate=self.tts.get_sample_rate(), 
                    autoplay=True
                )
                display(audio_widget)
                print("🔊 Playing response...")
            else:
                print("⚠️ No audio chunks were generated")
                
        except Exception as e:
            print(f"❌ Error generating or playing speech: {str(e)}")
        
        return response
    
    def get_chat_history(self):
        """Get chat history"""
        return self.chat_context.messages
    
    def clear_chat_history(self):
        """Clear chat history"""
        self.chat_context.messages = [
            ChatMessage(
                role="system",
                content="You are a helpful local AI assistant. Keep responses concise and friendly."
            )
        ]
        print("🗑️ Chat history cleared!")


In [13]:

# Create voice agent instance
agent = VoiceAgent(
    use_ollama=False,  # Set to True if you have Ollama running
    ollama_model="llama3.2"
)

# Initialize the agent
await agent.initialize()

print("🎉 Voice agent ready for interaction!")


🚀 Initializing Voice Agent...
🔊 Loading TTS...
✅ Local TTS model loaded: tts_models/en/vctk/vits
🎤 Loading STT...
✅ Local STT model loaded on GPU: small
🤖 Loading LLM...
✅ All components initialized!
🎉 Voice agent ready for interaction!


In [14]:
print("🎤 Starting voice-to-voice interaction...")
print("Click the microphone button below and speak:")

await agent.process_voice_input()


🎤 Starting voice-to-voice interaction...
Click the microphone button below and speak:
🎤 Click the microphone button below and speak:


Audio(value=b'', autoplay='False', format='wav')

✅ Audio recorded successfully!
🎤 Transcribing audio...
✅ Transcription completed: ''
⚠️ No speech detected in audio


"I couldn't hear anything. Please try again."

In [15]:

# Text interaction - run this cell to test text input
print("💬 Text-to-voice interaction...")

await agent.process_text_input("Hello, how are you today?")

💬 Text-to-voice interaction...
👤 You: Hello, how are you today?
🤖 Generating response...
🤖 Assistant: Hello! I'm your local AI assistant. How can I help you today?
🔊 Generating speech...
📊 Received 56 audio chunks
📊 Audio size: 226912 bytes


🔊 Playing response...


"Hello! I'm your local AI assistant. How can I help you today?"

In [16]:

# Create interactive interface
voice_button = widgets.Button(
    description="🎤 Voice Input",
    button_style='success',
    layout=widgets.Layout(width='150px', height='40px')
)

text_input = widgets.Text(
    placeholder="Type your message...",
    layout=widgets.Layout(width='300px')
)

text_button = widgets.Button(
    description="💬 Send",
    button_style='info',
    layout=widgets.Layout(width='100px', height='40px')
)

clear_button = widgets.Button(
    description="🗑️ Clear",
    button_style='warning',
    layout=widgets.Layout(width='100px', height='40px')
)

output_area = widgets.Output()

# Function to handle voice input
async def on_voice_click(b):
    with output_area:
        clear_output(wait=True)
        await agent.process_voice_input()

# Function to handle text input
async def on_text_click(b):
    text = text_input.value.strip()
    if text:
        with output_area:
            clear_output(wait=True)
            await agent.process_text_input(text)
        text_input.value = ""

# Function to clear chat
def on_clear_click(b):
    agent.clear_chat_history()
    with output_area:
        clear_output(wait=True)
        print("🗑️ Chat history cleared!")

# Connect button events
voice_button.on_click(lambda b: asyncio.create_task(on_voice_click(b)))
text_button.on_click(lambda b: asyncio.create_task(on_text_click(b)))
clear_button.on_click(on_clear_click)

# Display the interface
print("🎮 Interactive Voice Agent Interface")
print("=" * 50)

button_row = widgets.HBox([voice_button, text_input, text_button, clear_button])
interface = widgets.VBox([button_row, output_area])

display(interface)

🎮 Interactive Voice Agent Interface


VBox(children=(HBox(children=(Button(button_style='success', description='🎤 Voice Input', layout=Layout(height…

In [17]:

# View chat history
print("📝 Chat History:")
print("=" * 30)

for i, message in enumerate(agent.get_chat_history()):
    if message.role == "system":
        continue
    print(f"{message.role.upper()}: {message.content}")
    print("-" * 30)


📝 Chat History:
USER: Hello, how are you today?
------------------------------
ASSISTANT: Hello! I'm your local AI assistant. How can I help you today?
------------------------------


In [18]:

# Check system status
print("🔍 System Status:")
print("=" * 30)

# Check GPU
try:
    import torch
    if torch.cuda.is_available():
        print(f"✅ GPU available: {torch.cuda.get_device_name(0)}")
    else:
        print("⚠️ GPU not available, using CPU")
except ImportError:
    print("⚠️ PyTorch not installed")

# Check components
print(f"\n🔊 TTS: {'✅ Initialized' if agent.tts else '❌ Not initialized'}")
print(f"🎤 STT: {'✅ Initialized' if agent.stt else '❌ Not initialized'}")
print(f"🤖 LLM: {'✅ Initialized' if agent.llm else '❌ Not initialized'}")

print("\n🎉 Voice agent is ready for voice-to-voice interaction!")

🔍 System Status:
✅ GPU available: NVIDIA GeForce RTX 3050 Laptop GPU

🔊 TTS: ✅ Initialized
🎤 STT: ✅ Initialized
🤖 LLM: ✅ Initialized

🎉 Voice agent is ready for voice-to-voice interaction!


In [19]:

# Alternative: Manual voice interaction
print("🎤 Manual Voice Interaction")
print("Uncomment the line below to start voice interaction:")

# await agent.process_voice_input()

🎤 Manual Voice Interaction
Uncomment the line below to start voice interaction:


In [20]:

# Configuration options
print("⚙️ Configuration Options:")
print("=" * 30)

print("To use Ollama instead of Mock LLM:")
print("1. Install Ollama: curl -fsSL https://ollama.ai/install.sh | sh")
print("2. Start Ollama: ollama serve")
print("3. Pull a model: ollama pull llama3.2")
print("4. Set use_ollama=True in the agent initialization")

print("\nTo change TTS voice:")
print("Modify the LocalTTSAdapter initialization:")
print("tts = LocalTTSAdapter(speaker_id='p227')  # Different voice")

print("\nTo change STT model size:")
print("Modify the LocalSTTAdapter initialization:")
print("stt = LocalSTTAdapter(model_size='medium')  # Larger model")

print("\n🎉 Your local voice agent is ready!") 

⚙️ Configuration Options:
To use Ollama instead of Mock LLM:
1. Install Ollama: curl -fsSL https://ollama.ai/install.sh | sh
2. Start Ollama: ollama serve
3. Pull a model: ollama pull llama3.2
4. Set use_ollama=True in the agent initialization

To change TTS voice:
Modify the LocalTTSAdapter initialization:
tts = LocalTTSAdapter(speaker_id='p227')  # Different voice

To change STT model size:
Modify the LocalSTTAdapter initialization:
stt = LocalSTTAdapter(model_size='medium')  # Larger model

🎉 Your local voice agent is ready!


In [21]:
# Simple audio test - Run this cell to verify your browser can play audio
from IPython.display import Audio
import numpy as np

# Create a simple audio test (1 second beep at 440Hz)
sample_rate = 22050
t = np.linspace(0, 1, sample_rate, False)
audio = np.sin(440 * 2 * np.pi * t) * 0.5  # 440 Hz sine wave at half volume

print("🔊 Testing basic audio playback...")
print("You should hear a 1-second beep sound")
display(Audio(data=audio, rate=sample_rate, autoplay=True))

🔊 Testing basic audio playback...
You should hear a 1-second beep sound


In [22]:
# Diagnostic: Test audio playback directly
print("🔍 Audio Playback Diagnostic")
print("=" * 30)

# Create a simple test audio
def create_test_audio():
    import numpy as np
    from IPython.display import Audio
    
    # Create a simple sine wave
    sample_rate = 22050
    seconds = 2
    t = np.linspace(0, seconds, int(seconds * sample_rate), False)
    note = np.sin(440 * 2 * np.pi * t)  # 440 Hz sine wave
    audio = note * 0.5  # Reduce volume
    
    # Convert to 16-bit PCM
    audio_bytes = (audio * 32767).astype(np.int16).tobytes()
    
    # Play audio
    print("▶️ Playing test audio...")
    display(Audio(data=audio_bytes, rate=sample_rate, autoplay=True))
    
    return audio_bytes

# Test TTS directly
async def test_tts():
    print("🔊 Testing TTS directly...")
    tts = LocalTTSAdapter()
    await tts.initialize()
    
    text = "This is a test of the text to speech system."
    print(f"💬 Text: '{text}'")
    
    audio_chunks = []
    async for chunk in tts.synthesize(text):
        audio_chunks.append(chunk)
    
    # Combine audio chunks and play
    if audio_chunks:
        combined_audio = b''.join(audio_chunks)
        print(f"📊 Audio size: {len(combined_audio)} bytes")
        audio_widget = Audio(data=combined_audio, rate=tts.get_sample_rate(), autoplay=True)
        display(audio_widget)
        print("🔊 Playing TTS audio...")
        return True
    else:
        print("❌ No audio generated")
        return False

# Run tests
test_audio = create_test_audio()
print("\nRunning TTS test...")
await test_tts()

🔍 Audio Playback Diagnostic
▶️ Playing test audio...



Running TTS test...
🔊 Testing TTS directly...
✅ Local TTS model loaded: tts_models/en/vctk/vits
💬 Text: 'This is a test of the text to speech system.'
📊 Audio size: 139296 bytes


🔊 Playing TTS audio...


True

In [None]:
# Comprehensive TTS and STT Diagnostic and Fix
print("🔧 TTS and STT Diagnostic Tool")
print("=" * 50)

async def diagnose_and_fix_tts_stt():
    results = {
        "tts_install": False,
        "stt_install": False,
        "tts_model": False,
        "stt_model": False,
        "tts_audio": False,
        "stt_audio": False
    }
    
    # Test 1: Check package installations
    print("\n📦 Checking package installations...")
    
    try:
        from TTS.api import TTS
        print("✅ TTS package is installed")
        results["tts_install"] = True
    except ImportError:
        print("❌ TTS package is not installed")
        print("   Try: pip install TTS")
    
    try:
        from faster_whisper import WhisperModel
        print("✅ faster-whisper package is installed")
        results["stt_install"] = True
    except ImportError:
        print("❌ faster-whisper package is not installed")
        print("   Try: pip install faster-whisper")
    
    # Test 2: Check if models can be loaded
    print("\n🧠 Checking model initialization...")
    
    if results["tts_install"]:
        try:
            tts_adapter = LocalTTSAdapter()
            await tts_adapter.initialize()
            print("✅ TTS model initialized successfully")
            results["tts_model"] = True
        except Exception as e:
            print(f"❌ TTS model initialization failed: {str(e)}")
            print("   This might be due to missing model files or network issues")
    
    if results["stt_install"]:
        try:
            stt_adapter = LocalSTTAdapter()
            await stt_adapter.initialize()
            print("✅ STT model initialized successfully")
            results["stt_model"] = True
        except Exception as e:
            print(f"❌ STT model initialization failed: {str(e)}")
            print("   This might be due to missing model files or hardware issues")
    
    # Test 3: Check TTS functionality
    print("\n🔊 Testing TTS functionality...")
    
    if results["tts_model"]:
        try:
            text = "This is a test of the text to speech system."
            audio_chunks = []
            async for chunk in tts_adapter.synthesize(text):
                audio_chunks.append(chunk)
            
            if audio_chunks and len(audio_chunks) > 0:
                combined_audio = b''.join(audio_chunks)
                if len(combined_audio) > 0:
                    print(f"✅ TTS generated audio successfully ({len(combined_audio)} bytes)")
                    audio_widget = Audio(data=combined_audio, rate=tts_adapter.get_sample_rate(), autoplay=True)
                    display(audio_widget)
                    results["tts_audio"] = True
                else:
                    print("❌ TTS generated empty audio")
            else:
                print("❌ TTS did not generate any audio chunks")
        except Exception as e:
            print(f"❌ TTS synthesis failed: {str(e)}")
    
    # Test 4: Create a test audio file for STT
    print("\n🎤 Testing STT functionality...")
    
    if results["stt_model"]:
        try:
            # Create a test WAV file with a sine wave
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
                temp_path = temp_file.name
                
                sample_rate = 16000
                duration = 2  # seconds
                t = np.linspace(0, duration, int(duration * sample_rate), False)
                audio = np.sin(440 * 2 * np.pi * t) * 0.5
                audio_int16 = (audio * 32767).astype(np.int16)
                
                with wave.open(temp_path, 'wb') as wf:
                    wf.setnchannels(1)
                    wf.setsampwidth(2)
                    wf.setframerate(sample_rate)
                    wf.writeframes(audio_int16.tobytes())
                
                print(f"✅ Created test audio file: {temp_path}")
                
                # Try to transcribe it (will likely be empty since it's just a tone)
                print("⏳ Testing STT with test audio...")
                try:
                    result = await stt_adapter.transcribe_audio_file(temp_path)
                    print(f"✅ STT processed the audio file (result: '{result}')")
                    results["stt_audio"] = True
                    # Don't expect actual transcription since it's just a tone
                except Exception as e:
                    print(f"❌ STT transcription failed: {str(e)}")
                
                # Clean up
                os.unlink(temp_path)
                
        except Exception as e:
            print(f"❌ STT test failed: {str(e)}")
    
    # Summary and recommendations
    print("\n📝 Diagnostic Summary:")
    print("=" * 50)
    
    all_passed = all(results.values())
    
    if all_passed:
        print("✅ All tests passed! TTS and STT should be working properly.")
        print("   If you're still having issues, check your browser audio settings and try a different browser.")
    else:
        print("❌ Some tests failed. Here are the recommendations:")
        
        if not results["tts_install"] or not results["stt_install"]:
            print("\n📦 Package Installation Issues:")
            print("   Run the following to reinstall required packages:")
            print("   pip install -r requirements.txt")
            print("   # Or specifically:")
            print("   pip install TTS faster-whisper torch torchaudio")
            
        if not results["tts_model"] or not results["stt_model"]:
            print("\n🧠 Model Initialization Issues:")
            print("   - Check your internet connection (models download on first use)")
            print("   - Check disk space and permissions")
            print("   - Try smaller models: LocalTTSAdapter(model_name='tts_models/en/ljspeech/tacotron2-DDC')")
            print("   - Try smaller STT models: LocalSTTAdapter(model_size='tiny')")
            
        if not results["tts_audio"]:
            print("\n🔊 TTS Audio Generation Issues:")
            print("   - Check if your TTS model was properly initialized")
            print("   - Try a different model or voice")
            print("   - Check for PyTorch/CUDA compatibility issues")
            
        if not results["stt_audio"]:
            print("\n🎤 STT Processing Issues:")
            print("   - Check microphone permissions in browser")
            print("   - Try a different microphone")
            print("   - Check audio format compatibility")
    
    return results

# Run the diagnostic
await diagnose_and_fix_tts_stt()

# Installation Guide for TTS and STT

If the diagnostic tool identified issues with your TTS or STT components, here are some installation commands you can try. Uncomment and run these commands as needed:

## 1. Fix TTS Installation

```python
# Install TTS and dependencies
# !pip install TTS==0.14.0 torch torchaudio numpy

# Force reinstall if needed
# !pip install --force-reinstall TTS==0.14.0
```

## 2. Fix STT Installation 

```python
# Install faster-whisper and dependencies
# !pip install faster-whisper==0.7.1 torch torchaudio

# If you have GPU issues, try CPU-only
# !pip install --force-reinstall faster-whisper==0.7.1 --no-deps
# !pip install transformers>=4.19.0 optimum>=1.5.0 numpy
```

## 3. Install System Dependencies (run in terminal)

```bash
# Ubuntu/Debian
# sudo apt-get update
# sudo apt-get install -y libsndfile1 ffmpeg portaudio19-dev python3-pyaudio
```

## 4. Test with Simplified Code

Run the cell below to test a minimal implementation of TTS and STT functionality.

In [None]:
# Simplified TTS and STT Test
print("🧪 Minimal TTS and STT Test")
print("=" * 30)

# Simplified TTS test
async def simple_tts_test():
    print("🔊 Testing TTS directly...")
    try:
        from TTS.api import TTS
        import numpy as np
        from IPython.display import Audio
        
        # Initialize TTS with a simpler model
        model_name = "tts_models/en/ljspeech/tacotron2-DDC"  # Smaller model
        print(f"Loading model: {model_name}")
        tts = TTS(model_name=model_name, progress_bar=True)
        
        # Generate speech
        text = "This is a test of text to speech."
        print(f"Generating speech for: '{text}'")
        wav = tts.tts(text)
        
        if wav is not None and len(wav) > 0:
            # Normalize and convert to int16
            wav_array = np.array(wav, dtype=np.float32)
            if np.max(np.abs(wav_array)) > 0:
                wav_array = wav_array / np.max(np.abs(wav_array)) * 0.95
            audio_bytes = (wav_array * 32767).astype(np.int16).tobytes()
            
            # Play audio
            audio_widget = Audio(data=audio_bytes, rate=22050, autoplay=True)
            display(audio_widget)
            print("✅ TTS successful!")
            return True
        else:
            print("❌ TTS generated empty audio")
            return False
    except Exception as e:
        print(f"❌ TTS test failed: {str(e)}")
        return False

# Simplified STT test
async def simple_stt_test():
    print("\n🎤 Testing STT directly...")
    try:
        from faster_whisper import WhisperModel
        import wave
        import tempfile
        import numpy as np
        import os
        
        # Create a test WAV file with speech-like waveform
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
            temp_path = temp_file.name
            
            # Generate a more speech-like test tone (frequency modulated)
            sample_rate = 16000
            duration = 3  # seconds
            t = np.linspace(0, duration, int(duration * sample_rate), False)
            
            # Create a frequency modulated tone (to make it more speech-like)
            carrier = 800  # base frequency
            mod_freq = 2.5  # modulation frequency
            mod_index = 500  # modulation index
            
            # FM modulation formula
            audio = np.sin(2 * np.pi * (carrier * t + mod_index * np.sin(2 * np.pi * mod_freq * t)))
            audio = audio * 0.5  # reduce volume
            
            # Convert to int16
            audio_int16 = (audio * 32767).astype(np.int16)
            
            # Write to WAV
            with wave.open(temp_path, 'wb') as wf:
                wf.setnchannels(1)
                wf.setsampwidth(2)
                wf.setframerate(sample_rate)
                wf.writeframes(audio_int16.tobytes())
            
            print(f"Created test audio file: {temp_path}")
            
            # Initialize Whisper with smallest model
            print("Loading STT model (tiny)...")
            model = WhisperModel("tiny", device="cpu", compute_type="int8")
            
            # Transcribe
            print("Transcribing test audio...")
            segments, info = model.transcribe(temp_path, beam_size=1)
            
            # Get result
            segments_list = list(segments)
            if segments_list:
                result = " ".join([segment.text for segment in segments_list])
                print(f"Transcription result: '{result}'")
                print("✅ STT processing successful!")
                success = True
            else:
                print("⚠️ No transcription produced (this is expected for test tones)")
                print("✅ STT processing completed without errors")
                success = True
            
            # Clean up
            os.unlink(temp_path)
            return success
            
    except Exception as e:
        print(f"❌ STT test failed: {str(e)}")
        return False

# Run the tests
print("Running TTS test...")
tts_success = await simple_tts_test()

print("\nRunning STT test...")
stt_success = await simple_stt_test()

if tts_success and stt_success:
    print("\n🎉 Basic functionality tests passed!")
    print("If your agent still doesn't work, check the diagnostic tool for more detailed analysis.")
else:
    print("\n⚠️ Some tests failed. Please check the installation guide and error messages above.")

In [None]:
# Quick TTS and STT Test
print("🚀 Running Quick TTS and STT Test")
print("=" * 40)

# Test basic package imports
print("📦 Testing package imports...")

try:
    from TTS.api import TTS
    print("✅ TTS package imported successfully")
except Exception as e:
    print(f"❌ TTS import error: {str(e)}")

try:
    from faster_whisper import WhisperModel
    print("✅ faster-whisper package imported successfully")
except Exception as e:
    print(f"❌ faster-whisper import error: {str(e)}")

# Test audio libraries
try:
    import numpy as np
    from IPython.display import Audio
    
    # Create a simple audio test (1 second beep)
    sample_rate = 22050
    t = np.linspace(0, 1, sample_rate, False)
    audio = np.sin(440 * 2 * np.pi * t) * 0.5
    
    # Play audio
    print("\n🔊 Testing audio playback...")
    display(Audio(data=audio, rate=sample_rate, autoplay=True))
    print("If you heard a beep, audio playback is working!")
except Exception as e:
    print(f"❌ Audio test error: {str(e)}")

# Print system info
print("\n💻 System Information:")
import sys
print(f"Python version: {sys.version}")

try:
    import torch
    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"CUDA device: {torch.cuda.get_device_name(0)}")
except Exception as e:
    print(f"❌ PyTorch info error: {str(e)}")

print("\n✅ Quick test completed. If all checks passed, try running the more comprehensive tests.")

In [None]:
# Fix TTS and STT Installation Issues
print("🔧 Fix TTS and STT Installation")
print("=" * 40)
print("This cell will reinstall the required packages for TTS and STT")
print("Uncomment and run the appropriate fix based on your needs:")

# Uncomment the line below to reinstall TTS package
# !pip install --force-reinstall TTS==0.14.0 torch torchaudio numpy

# Uncomment the line below to reinstall faster-whisper package
# !pip install --force-reinstall faster-whisper==0.7.1 torch torchaudio

# Uncomment the lines below to install missing system dependencies 
# import os
# !apt-get update
# !apt-get install -y libsndfile1 ffmpeg portaudio19-dev python3-pyaudio

print("\n🔄 After running the installations, restart your notebook kernel and run the cells again")

In [None]:
# Simplified Voice Agent with Smaller Models
print("🤖 Simplified Voice Agent (Lightweight Version)")
print("=" * 50)

import asyncio
import numpy as np
import tempfile
import wave
import os
from IPython.display import Audio, display

class SimpleVoiceAgent:
    """Lightweight voice agent using smaller models"""
    
    def __init__(self):
        self.tts_model = None
        self.stt_model = None
        self.sample_rate = 22050
        
    async def initialize(self):
        """Initialize TTS and STT with smaller models"""
        print("🚀 Initializing simplified voice agent...")
        
        # Initialize TTS with smaller model
        print("🔊 Loading TTS (small model)...")
        try:
            from TTS.api import TTS
            self.tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", 
                                progress_bar=True, 
                                gpu=False)
            print("✅ TTS loaded successfully")
        except Exception as e:
            print(f"❌ TTS initialization failed: {str(e)}")
        
        # Initialize STT with tiny model
        print("🎤 Loading STT (tiny model)...")
        try:
            from faster_whisper import WhisperModel
            self.stt_model = WhisperModel("tiny", device="cpu", compute_type="int8")
            print("✅ STT loaded successfully")
        except Exception as e:
            print(f"❌ STT initialization failed: {str(e)}")
    
    async def text_to_speech(self, text):
        """Convert text to speech"""
        if self.tts_model is None:
            print("❌ TTS model not initialized")
            return False
            
        print(f"🔊 Converting to speech: '{text}'")
        try:
            # Generate speech
            wav = self.tts_model.tts(text)
            
            if wav is not None and len(wav) > 0:
                # Normalize and convert to int16
                wav_array = np.array(wav, dtype=np.float32)
                if np.max(np.abs(wav_array)) > 0:
                    wav_array = wav_array / np.max(np.abs(wav_array)) * 0.95
                audio_bytes = (wav_array * 32767).astype(np.int16).tobytes()
                
                # Play audio
                audio_widget = Audio(data=audio_bytes, rate=self.sample_rate, autoplay=True)
                display(audio_widget)
                print("✅ Speech generated successfully")
                return True
            else:
                print("❌ TTS generated empty audio")
                return False
        except Exception as e:
            print(f"❌ TTS synthesis failed: {str(e)}")
            return False
            
    async def speech_to_text(self, audio_file_path=None):
        """Convert speech to text"""
        if self.stt_model is None:
            print("❌ STT model not initialized")
            return None
            
        if audio_file_path is None:
            print("❌ No audio file provided")
            return None
            
        print(f"🎤 Transcribing audio: {audio_file_path}")
        try:
            # Transcribe audio
            segments, info = self.stt_model.transcribe(audio_file_path, beam_size=1)
            
            # Extract text
            segments_list = list(segments)
            if segments_list:
                result = " ".join([segment.text for segment in segments_list])
                print(f"✅ Transcription: '{result}'")
                return result
            else:
                print("❌ No transcription produced")
                return None
        except Exception as e:
            print(f"❌ STT transcription failed: {str(e)}")
            return None

# Create and run simple tests
async def test_simple_agent():
    agent = SimpleVoiceAgent()
    await agent.initialize()
    
    print("\n🔊 Testing TTS...")
    await agent.text_to_speech("Hello, this is a simple test of the voice agent.")
    
    print("\n🎤 Testing STT...")
    # Create a test WAV file
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
        temp_path = temp_file.name
        
        # Generate a simple sine wave
        sample_rate = 16000
        duration = 2  # seconds
        t = np.linspace(0, duration, int(duration * sample_rate), False)
        audio = np.sin(440 * 2 * np.pi * t) * 0.5
        audio_int16 = (audio * 32767).astype(np.int16)
        
        with wave.open(temp_path, 'wb') as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(sample_rate)
            wf.writeframes(audio_int16.tobytes())
        
        # Test STT with the generated file
        await agent.speech_to_text(temp_path)
        
        # Clean up
        os.unlink(temp_path)

# Run the test
print("Running simple voice agent test...")
await test_simple_agent()

In [None]:
# Standalone Test Script
print("🧪 Testing Voice Agent Components Independently")
print("=" * 50)

import asyncio
from local_tts_adapter import LocalTTSAdapter
from local_stt_adapter import LocalSTTAdapter
from local_llm_adapter import MockLocalLLMAdapter

async def test_components():
    # Test TTS adapter
    print("\n🔊 Testing TTS Adapter...")
    tts_adapter = LocalTTSAdapter()
    await tts_adapter.initialize()
    
    test_text = "This is a test of the text to speech system."
    print(f"Converting text: '{test_text}'")
    
    audio_chunks = []
    async for chunk in tts_adapter.synthesize(test_text):
        audio_chunks.append(chunk)
    
    if audio_chunks:
        combined_audio = b''.join(audio_chunks)
        print(f"Generated {len(audio_chunks)} audio chunks, total size: {len(combined_audio)} bytes")
        audio_widget = Audio(data=combined_audio, rate=tts_adapter.get_sample_rate(), autoplay=True)
        display(audio_widget)
        print("✅ TTS test passed")
    else:
        print("❌ TTS generated no audio")
    
    # Test STT adapter
    print("\n🎤 Testing STT Adapter...")
    stt_adapter = LocalSTTAdapter()
    await stt_adapter.initialize()
    
    # Create a test audio file
    import wave
    import tempfile
    import numpy as np
    import os
    
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
        temp_path = temp_file.name
        
        # Generate a simple sine wave
        sample_rate = 16000
        duration = 2  # seconds
        t = np.linspace(0, duration, int(duration * sample_rate), False)
        audio = np.sin(440 * 2 * np.pi * t) * 0.5
        audio_int16 = (audio * 32767).astype(np.int16)
        
        with wave.open(temp_path, 'wb') as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(sample_rate)
            wf.writeframes(audio_int16.tobytes())
        
        print(f"Created test audio file: {temp_path}")
        
        # Test transcription
        try:
            result = await stt_adapter.transcribe_audio_file(temp_path)
            print(f"Transcription result: '{result}'")
            print("✅ STT test completed")
        except Exception as e:
            print(f"❌ STT test failed: {str(e)}")
        
        # Clean up
        os.unlink(temp_path)
    
    # Test LLM adapter
    print("\n🤖 Testing LLM Adapter...")
    from livekit.agents.llm import ChatContext, ChatMessage
    
    llm_adapter = MockLocalLLMAdapter()
    
    chat_context = ChatContext(
        messages=[
            ChatMessage(role="system", content="You are a helpful assistant."),
            ChatMessage(role="user", content="Hello, how are you?")
        ]
    )
    
    print("Generating response...")
    response_chunks = []
    async for chunk in llm_adapter.chat(chat_context):
        response_chunks.append(chunk)
    
    response = ''.join(response_chunks)
    print(f"LLM response: '{response}'")
    print("✅ LLM test passed")

# Run the test
await test_components()