In [1]:
# Install the notebook's third-party dependencies into the kernel's environment.
# NOTE(review): versions are unpinned, and the captured output of this cell shows
# pip reporting a conflict between google-generativeai's dependency
# google-ai-generativelanguage 0.6.15 and an installed langchain-google-genai 3.0.0
# (which requires >=0.7.0) — consider a dedicated environment and pinned versions.
%pip install google-generativeai requests sounddevice scipy SpeechRecognition python-dotenv

Collecting sounddevice
  Downloading sounddevice-0.5.3-py3-none-win_amd64.whl.metadata (1.6 kB)
Collecting SpeechRecognition
  Downloading speechrecognition-3.14.3-py3-none-any.whl.metadata (30 kB)
Collecting google-ai-generativelanguage==0.6.15 (from google-generativeai)
  Using cached google_ai_generativelanguage-0.6.15-py3-none-any.whl.metadata (5.7 kB)
Collecting standard-aifc (from SpeechRecognition)
  Downloading standard_aifc-3.13.0-py3-none-any.whl.metadata (969 bytes)
Collecting standard-chunk (from standard-aifc->SpeechRecognition)
  Downloading standard_chunk-3.13.0-py3-none-any.whl.metadata (860 bytes)
Using cached google_ai_generativelanguage-0.6.15-py3-none-any.whl (1.3 MB)
Downloading sounddevice-0.5.3-py3-none-win_amd64.whl (364 kB)
Downloading speechrecognition-3.14.3-py3-none-any.whl (32.9 MB)
   ---------------------------------------- 0.0/32.9 MB ? eta -:--:--
    --------------------------------------- 0.8/32.9 MB 9.4 MB/s eta 0:00:04
   -- ------------------------

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-google-genai 3.0.0 requires google-ai-generativelanguage<1.0.0,>=0.7.0, but you have google-ai-generativelanguage 0.6.15 which is incompatible.


In [None]:
import os
import platform
import subprocess
import sounddevice as sd
from scipy.io.wavfile import write
import speech_recognition as sr
import requests
import google.generativeai as genai
from dotenv import load_dotenv

# === Load .env Variables ===
# Pull key/value pairs from a local .env file (if any) into the environment.
load_dotenv()

# API credentials; os.getenv returns None when a variable is not defined.
GOOGLE_API_KEY = os.getenv("GEMINI_API_KEY")
ELEVEN_API_KEY = os.getenv("ELEVENLABS_API_KEY")

# Warn early about missing credentials instead of letting them surface later as
# opaque authentication errors. This only warns (no raise) so the cell still
# defines every name below.
if not GOOGLE_API_KEY:
    print("⚠️ GEMINI_API_KEY is not set; Gemini requests will fail.")
if not ELEVEN_API_KEY:
    print("⚠️ ELEVENLABS_API_KEY is not set; ElevenLabs requests will fail.")

SAMPLERATE = 16000  # sample rate passed to the recorder and the WAV writer
DURATION = 6        # length of each microphone recording, in seconds
counter = 1         # suffix for the per-turn audio file names; advanced by run_bot
voice_id = None     # ElevenLabs voice id; filled in on first use by run_bot

# === Setup Gemini ===
genai.configure(api_key=GOOGLE_API_KEY)

# ✅ Play audio file (cross-platform)
def play_audio_file(filepath):
    """Play an audio file using a platform-specific mechanism.

    Windows opens the file through ``os.startfile``, macOS runs ``afplay`` and
    Linux runs ``xdg-open``. Problems are reported on stdout instead of being
    raised, so a playback failure never aborts the caller.

    Args:
        filepath: Path of the audio file to play.

    Returns:
        None.
    """
    # Nothing to play if the file was never written (e.g. speech synthesis
    # failed); bail out before launching an external player on a bad path.
    if not os.path.isfile(filepath):
        print(f"❌ Audio file not found: {filepath}")
        return

    system = platform.system()
    player_commands = {
        "Darwin": ["afplay", filepath],  # macOS
        "Linux": ["xdg-open", filepath],
    }
    try:
        if system == "Windows":
            os.startfile(filepath)
        elif system in player_commands:
            command = player_commands[system]
            exit_code = subprocess.call(command)
            # subprocess.call does not raise on failure; surface a bad exit code.
            if exit_code != 0:
                print(f"❌ Error playing audio: {command[0]} exited with code {exit_code}")
        else:
            print("⚠️ Unsupported OS for audio playback.")
    except Exception as e:
        print(f"❌ Error playing audio: {e}")

# === Get Default ElevenLabs Voice
def get_default_voice(timeout=10):
    """Return the id of the first voice listed by the ElevenLabs API.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``voice_id`` of the first entry in the response's ``voices`` list.

    Raises:
        SystemExit: If the request fails, the API does not answer with status
            200, or the response contains no usable voice.
    """
    try:
        response = requests.get(
            "https://api.elevenlabs.io/v1/voices",
            headers={"xi-api-key": ELEVEN_API_KEY},
            timeout=timeout,  # without a timeout a stalled connection hangs forever
        )
    except requests.RequestException as e:
        print(f"❌ Could not fetch voices: {e}")
        raise SystemExit(1) from e

    if response.status_code == 200:
        try:
            voices = response.json()["voices"]
            return voices[0]["voice_id"]
        except (ValueError, KeyError, IndexError, TypeError):
            # Malformed JSON, missing keys, or an account with no voices.
            print("❌ Could not fetch voices: unexpected or empty response.")
    else:
        print("❌ Could not fetch voices.")

    # Raise SystemExit directly instead of calling exit(): that helper is
    # provided for interactive sessions and is not meant for program code.
    raise SystemExit(1)

# === Record Audio from Mic
def record_audio(filename):
    """Capture a fixed-length mono clip from the microphone and save it as WAV.

    The clip lasts DURATION seconds at SAMPLERATE Hz and is stored as 16-bit
    integer samples.

    Args:
        filename: Destination path of the WAV file.
    """
    print("🎤 Speak now...")
    frame_count = int(DURATION * SAMPLERATE)
    recording = sd.rec(
        frame_count,
        samplerate=SAMPLERATE,
        channels=1,
        dtype='int16',
    )
    # Block until the recording started above has finished.
    sd.wait()
    write(filename, SAMPLERATE, recording)
    print(f"✅ Audio saved to: {filename}")

# === Transcribe Audio
def transcribe_audio(filename):
    """Turn a recorded audio file into text via Google speech recognition.

    Args:
        filename: Path of the audio file to transcribe.

    Returns:
        The recognized text, or an empty string when recognition fails (the
        error is printed rather than raised).
    """
    engine = sr.Recognizer()

    # Load the whole file into an audio object before sending it off.
    with sr.AudioFile(filename) as wav_source:
        captured = engine.record(wav_source)

    transcript = ""
    try:
        transcript = engine.recognize_google(captured)
        print(f"🗣️ You asked: {transcript}")
    except Exception as err:
        print(f"❌ Could not transcribe: {err}")
        transcript = ""
    return transcript

# === Gemini Reply
def get_gemini_reply(prompt, model_name="models/gemini-1.5-flash-latest"):
    """Ask a Gemini model for a reply to ``prompt`` and return it as text.

    Args:
        prompt: The user's question.
        model_name: Gemini model identifier to use. Defaults to the model this
            notebook was written against; pass another name if that model is
            unavailable.

    Returns:
        The model's reply text. If the response carries no usable text, a short
        fallback sentence is returned so the caller still has something to speak.
    """
    print("🤖 Generating reply using Gemini Flash...")
    model = genai.GenerativeModel(model_name=model_name)
    response = model.generate_content(prompt)
    try:
        # The `.text` accessor raises ValueError when the response has no valid
        # text part (for example when the reply was blocked).
        answer = response.text
    except ValueError as e:
        print(f"❌ Gemini returned no usable text: {e}")
        answer = "Sorry, I could not generate a reply to that."
    print(f"🤖 AI says: {answer}")
    return answer

# === Text to Speech (ElevenLabs)
def text_to_speech(text, filename, voice_id, model_id="eleven_monolingual_v1", timeout=30):
    """Synthesize ``text`` with ElevenLabs and write the audio to ``filename``.

    Args:
        text: The text to speak.
        filename: Destination path for the returned audio bytes.
        voice_id: ElevenLabs voice identifier used in the request URL.
        model_id: ElevenLabs model identifier sent in the request payload.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        True if the audio file was written, False otherwise. Failures are
        printed rather than raised, and no file is written on failure.
    """
    # Skip the request entirely when there is nothing to say.
    if not text or not text.strip():
        print("❌ ElevenLabs Error: no text to synthesize.")
        return False

    url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
    headers = {
        "xi-api-key": ELEVEN_API_KEY,
        "Content-Type": "application/json"
    }
    payload = {
        "text": text,
        "model_id": model_id,
        "voice_settings": {"stability": 0.7, "similarity_boost": 0.7}
    }

    try:
        # Without a timeout a stalled connection would block the assistant forever.
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    except requests.RequestException as e:
        print(f"❌ ElevenLabs Error: {e}")
        return False

    if response.status_code != 200:
        print(f"❌ ElevenLabs Error: {response.text}")
        return False

    with open(filename, "wb") as f:
        f.write(response.content)
    print(f"✅ Voice reply saved as: {filename}")
    return True

# === Voice Assistant Loop
def run_bot():
    """Run the interactive voice assistant loop until the user exits.

    Each turn records the microphone, transcribes the recording, asks Gemini
    for a reply, synthesizes the reply with ElevenLabs and plays it. The
    module-level ``voice_id`` is resolved on first use and ``counter`` is
    advanced after every answered question so each turn gets its own files.
    """
    global counter, voice_id
    print("\n🎙️ Voice-to-Voice Assistant (Gemini Flash + ElevenLabs)")

    if voice_id is None:
        voice_id = get_default_voice()

    while True:
        try:
            cmd = input("\n▶️ Press Enter to ask or type 'exit': ")
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt at the prompt means "stop".
            cmd = "exit"
        # Ignore surrounding whitespace so inputs like "exit " still quit.
        if cmd.strip().lower() == "exit":
            print("👋 Exiting the assistant.")
            break

        user_audio = f"user_audio_{counter}.wav"
        ai_audio = f"ai_reply_{counter}.mp3"

        record_audio(user_audio)
        question = transcribe_audio(user_audio)
        if not question.strip():
            continue

        answer = get_gemini_reply(question)

        # counter restarts at 1 in every fresh session, so a reply file with
        # this name may be left over from an earlier run. Remove it so a failed
        # synthesis cannot cause an old answer to be played.
        if os.path.exists(ai_audio):
            try:
                os.remove(ai_audio)
            except OSError as e:
                print(f"⚠️ Could not remove old reply file {ai_audio}: {e}")

        text_to_speech(answer, ai_audio, voice_id)

        # Only play when synthesis actually produced a file.
        if os.path.exists(ai_audio):
            play_audio_file(ai_audio)
        else:
            print("⚠️ No audio reply was produced; skipping playback.")

        counter += 1

# === Run the Assistant ===
# Start the interactive assistant only when this code is executed as the main
# module, not when it is imported from elsewhere.
if __name__ == "__main__":
    run_bot()