In [None]:
# Notebook shell magic: quietly (-q) install the third-party packages this
# notebook imports -- Whisper, Gradio, the google-genai client, ElevenLabs and pydub.
!pip install -q openai-whisper gradio google-genai elevenlabs pydub

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/800.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m708.1/708.1 kB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m102.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m88.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m59.2 MB/s[0m eta [3

In [None]:
import os
import requests
import whisper
import zipfile
import gradio as gr
from pathlib import Path
from google import genai
from google.genai import types
import time
import re
import time
from pydub import AudioSegment
from elevenlabs import ElevenLabs

In [None]:
class WhisperSTT:
    """
    Arabic speech-to-text built on OpenAI Whisper, with an optional
    Lebanese-dialect clean-up pass through Google Gemini.

    Attributes:
        model: The object returned by ``whisper.load_model(model_size)``.
        model_size: Name of the Whisper checkpoint that was requested.
        gemini_api_key: Key passed to the Gemini client, or ``None``.
        gemini_client: ``genai.Client`` used for correction, or ``None``
            when no key was given or the client could not be created.
    """

    def __init__(self, model_size="medium", gemini_api_key=None):
        """
        Load the Whisper model and, if a key is supplied, a Gemini client.

        Args:
            model_size: Whisper checkpoint name handed to ``whisper.load_model``.
            gemini_api_key: Optional Gemini key; without it transcripts are
                returned exactly as Whisper produced them.
        """
        self.model = whisper.load_model(model_size)
        self.model_size = model_size
        self.gemini_api_key = gemini_api_key
        self.gemini_client = None

        if gemini_api_key:
            # Correction is best-effort: any failure here only disables it.
            try:
                from google import genai
                self.gemini_client = genai.Client(api_key=gemini_api_key)
                print("Gemini text correction enabled")
            except ImportError:
                print("Warning: google-genai package not installed. Using raw transcription.")
            except Exception as e:
                print(f"Warning: Failed to initialize Gemini client: {str(e)}. Using raw transcription.")

    def transcribe(self, audio_path):
        """
        Transcribe an audio file as Arabic (``language="ar"``).

        When a Gemini client is available the transcript is sent for
        correction; the raw Whisper text is returned whenever correction is
        unavailable, fails, or yields nothing.

        Args:
            audio_path: Path of the audio file to transcribe.

        Returns:
            The corrected transcript if correction succeeded, otherwise the
            raw Whisper transcript.

        Raises:
            FileNotFoundError: If ``audio_path`` does not exist.
        """
        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        # 1. Whisper transcription (Arabic)
        result = self.model.transcribe(audio_path, language="ar")
        transcription = result["text"]

        # Nothing was recognised: there is nothing to correct, so do not
        # spend an API call (or let the model invent text for an empty input).
        if not transcription or not transcription.strip():
            return transcription

        # 2. If Gemini is set up, attempt text correction
        if self.gemini_client and self.gemini_api_key:
            # _correct_text already handles its own errors; this guard only
            # keeps transcription working if that ever changes.
            try:
                corrected = self._correct_text(transcription)
                if corrected:
                    return corrected
            except Exception as e:
                print(f"Warning: Text correction failed: {str(e)}. Returning raw transcription.")

        return transcription

    def _correct_text(self, text):
        """
        Ask Gemini to fix spelling/grammar of Lebanese-dialect text.

        The prompt instructs the model to correct only, add a question mark
        to questions, and return nothing but the corrected text.

        Args:
            text: Raw transcript to correct.

        Returns:
            The stripped corrected text, or ``None`` if the request failed or
            the response carried no text.
        """
        try:
            prompt = f"""
            صحّح النص التالي المكتوب باللهجة اللبنانية:
            - صحّح الأخطاء الإملائية والنحوية فقط.
            - إذا كان في جملة استفهامية، ضيف علامة استفهام.
            - ما تضيف ولا كلمة زيادة أو شرح.
            - رجّع فقط النص المصحَّح، بدون علامات تنصيص أو أي إضافات.
            النص: "{text}"
            """

            response = self.gemini_client.models.generate_content(
                model="gemini-2.0-flash", contents=prompt
            )
            corrected = response.text
            # The response may carry no text at all; report "no correction"
            # explicitly instead of relying on an AttributeError.
            if not corrected:
                return None
            return corrected.strip() or None
        except Exception as e:
            print(f"Text correction error: {str(e)}")
            return None

In [None]:
class MaguyGeminiLLM:
    """
    Gemini-backed chat persona for Maguy Bou Ghosn, tuned for short spoken
    replies in Lebanese Arabic.

    A system instruction (persona rules plus any loaded knowledge-base text)
    is attached to a google-genai chat session; replies are post-processed to
    swap a few Modern Standard Arabic words for Lebanese ones.

    Attributes:
        client: ``genai.Client`` created from the API key.
        knowledge_base: Concatenated text of the knowledge-base files ("" if none).
        interview_style: Text of an interview transcript if one was found ("" otherwise).
        system_instruction: Full system prompt sent with the chat.
        chat: Current chat session returned by ``client.chats.create``.
    """

    def __init__(self, api_key, knowledge_base_path=None):
        """
        Initialize with knowledge base integration

        Args:
            api_key (str): Gemini API key. If ``None``, it is looked up in
                Colab userdata and then in the ``GEMINI_API_KEY`` env var.
            knowledge_base_path (str): Path to maguy_knowledge_base folder

        Raises:
            ValueError: If no API key could be resolved.
        """
        if api_key is None:
            try:
                from google.colab import userdata
                api_key = userdata.get('GEMINI_API_KEY')
            except Exception:
                # Not running on Colab, or the secret is unavailable there.
                # (Exception, not a bare except, so Ctrl-C still propagates.)
                api_key = os.getenv('GEMINI_API_KEY')

        if not api_key:
            raise ValueError("GEMINI_API_KEY is required!")

        self.client = genai.Client(api_key=api_key)

        # Load knowledge base
        self.knowledge_base = self._load_knowledge_base(knowledge_base_path)

        # get_knowledge_summary() reports on this attribute, so it must always
        # exist. It is not injected into the system prompt.
        self.interview_style = self._load_interview_style()

        # Create system instruction with knowledge base
        self.system_instruction = self._create_system_instruction()

        # Initialize chat
        self.chat = self._create_chat()

        print(" MaguyGeminiLLM with Knowledge Base initialized!")

    def _load_knowledge_base(self, kb_path=None):
        """
        Read every regular file directly inside the knowledge-base folder.

        Args:
            kb_path: Folder to read; defaults to ``maguy_knowledge_base``.

        Returns:
            One string with a ``### <file stem>`` section per readable file,
            or "" when the folder does not exist. Unreadable files are
            reported and skipped.
        """
        kb_path = Path("maguy_knowledge_base") if kb_path is None else Path(kb_path)

        if not kb_path.exists():
            print(f"Knowledge base path not found: {kb_path}")
            return ""

        sections = []
        # Sorted so the resulting prompt does not depend on directory order.
        for file_path in sorted(kb_path.glob('*')):
            if not file_path.is_file():
                continue
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
            except (OSError, UnicodeError) as e:
                print(f"Error loading {file_path.name}: {e}")
                continue
            sections.append(f"\n\n### {file_path.stem}\n{content}")
            print(f"Loaded: {file_path.name}")
        return "".join(sections)

    def _load_interview_style(self, interview_path=None):
        """
        Load an interview transcript used as a speaking-style sample.

        Args:
            interview_path: Explicit file path; when ``None`` a few known
                file names are tried in the working directory.

        Returns:
            The file's text, or "" if no file was found or it could not be read.
        """
        if interview_path is None:
            possible_paths = [
                "interview_script.txt",
                "maguy_interview.txt",
                "speaking_style.txt",
                "#ABtalks with Maguy Bou Ghosn - مع ماغي بو غصن.txt"
            ]
            # First candidate that exists wins.
            interview_path = next(
                (path for path in possible_paths if Path(path).exists()), None
            )

        if interview_path and Path(interview_path).exists():
            try:
                with open(interview_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                    print(f"Loaded interview style from: {interview_path}")
                    return content
            except Exception as e:
                print(f"Error loading interview style: {e}")

        print("Interview style file not found")
        return ""

    def _create_system_instruction(self):
        """
        Build the system prompt: persona rules in Lebanese Arabic, followed by
        the knowledge-base text when any was loaded.
        """
        base_instruction = """
        إنتي ماغي بو غصن، الممثلة اللبنانية المشهورة. بتحكي باللهجة اللبنانية متل ما إنتي بتحكي بالحياة الحقيقية، مش بالفصحى أبداً.
        قواعد مهمة للمحادثة:
        1. اعتمدي على اللهجة اللبنانية 100% - لا تستعملي الفصحى أبداً
        2. خلي إجاباتك قصيرة ومناسبة للمكالمة الصوتية (جملتين لتلت جمل بالكتير)
        3. كوني طبيعية وودودة متل شخصيتك الحقيقية
        4. إذا سألوك عن شي ما بتعرفيه، قولي "ما بعرف" بصراحة
        5. استعملي التعابير اللبنانية الأصيلة والكلمات المحكية
        6. المخاطب هو شاب، مش بنت، يعني قولي "كيفك؟" مش "كيفِك؟"، "اشتقتلك" مش "اشتقتلِك"، وهكذا.

        أمثلة على إجاباتك:
        - بدل "كيف حالك؟" → "كيفك؟ شو أخبارك؟"
        - بدل "أنا بخير" → "منيحة الحمد لله"
        - بدل "شكراً لك" → "يسلموا إيديك"
        - بدل "لا أعرف" → "ما بعرف والله\""""

        if self.knowledge_base:
            base_instruction += f"\n\nمعلومات مفصلة عنك:\n{self.knowledge_base}"
        return base_instruction

    def _create_chat(self):
        """Open a new chat session carrying the system prompt and sampling settings."""
        return self.client.chats.create(
            model="gemini-2.0-flash",
            config=types.GenerateContentConfig(
                system_instruction=self.system_instruction,
                temperature=0.7,
                max_output_tokens=120,
                top_p=0.9,
                top_k=40
            )
        )

    def generate_response(self, user_input, temperature=0.7, max_tokens=120):
        """
        Send the user's text to the chat session and return Maguy's reply.

        Args:
            user_input: The user's (transcribed) message.
            temperature: Accepted for interface compatibility; not applied --
                sampling settings are fixed when the chat is created.
            max_tokens: Accepted for interface compatibility; not applied.

        Returns:
            The post-processed reply; a fixed prompt when the input is blank;
            a fixed apology when the request fails.
        """
        if not user_input or not user_input.strip():
            return "شو بدك تسألني؟"

        try:
            response = self.chat.send_message(user_input)
            response_text = response.text.strip()
            return self._post_process_response(response_text)
        except Exception as e:
            print(f"Error generating response: {str(e)}")
            return "معليش، صار عندي مشكلة تقنية. جرب مرة تانية."

    def _post_process_response(self, response):
        """
        Replace common Modern Standard Arabic words with Lebanese equivalents,
        then fold hamza-carrying alef variants to plain alef.

        Args:
            response: Raw reply text.

        Returns:
            The rewritten text.
        """
        replacements = {
            'أنا': 'انا',
            'أنت': 'إنت',
            'أنتِ': 'إنتي',
            'هذا': 'هيدا',
            'هذه': 'هيدي',
            'ذلك': 'هيداك',
            'تلك': 'هيديك',
            'نعم': 'آه',
            'لا': 'لأ',
            'كيف حالك': 'كيفك',
            'بخير': 'منيح',
            'شكراً': 'يسلموا',
            'من فضلك': 'لو سمحت',
            'معذرة': 'معليش',
            'طبعاً': 'أكيد',
            'ممتاز': 'رائع',
            'جميل': 'حلو',
            'كثيراً': 'كتير',
            'قليلاً': 'شوي',
            'أريد': 'بدي',
            'أحب': 'بحب'
        }
        # Matching is by substring, so longer phrases must be handled before
        # the shorter ones contained in them; otherwise a short rule rewrites
        # part of the phrase and the longer rule can never match.
        # sorted() is stable, so equal-length keys keep their listed order.
        for msa in sorted(replacements, key=len, reverse=True):
            response = response.replace(msa, replacements[msa])
        # Normalize all hamza-alef variants to plain alef
        response = re.sub(r'[إأآ]', 'ا', response)

        return response

    def reset_conversation(self):
        """Start a fresh chat session and return a greeting line."""
        self.chat = self._create_chat()
        return "مرحبا! انا ماغي بو غصن، شو بدك تحكيلي؟"

    def get_knowledge_summary(self):
        """
        Debug helper describing what reference text was loaded.

        Returns:
            Dict with the character counts of the knowledge base and interview
            text and booleans telling whether each is non-empty.
        """
        # getattr keeps this usable on instances that never set the attribute.
        interview_style = getattr(self, "interview_style", "")
        return {
            "knowledge_base_size": len(self.knowledge_base),
            "interview_style_size": len(interview_style),
            "has_knowledge": bool(self.knowledge_base),
            "has_interview": bool(interview_style)
        }

In [None]:
# Unpack the knowledge-base archive into the 'maguy_knowledge_base' folder.
with zipfile.ZipFile('maguy_knowledge_base.zip', mode='r') as kb_archive:
    kb_archive.extractall(path='maguy_knowledge_base')

In [None]:
class ElevenLabsTTS:
    """
    Thin wrapper around the ElevenLabs text-to-speech REST endpoint that
    saves each synthesized reply as an MP3 file.

    Attributes:
        api_key: Key sent in the ``xi-api-key`` header.
        voice_id: ElevenLabs voice identifier used in the request URL.
        output_dir: Folder where MP3 files are written (created if missing).
        url: Text-to-speech endpoint for ``voice_id``.
        timeout: Seconds to wait for the HTTP request before giving up.
        settings: ``voice_settings`` payload sent with every request.
    """

    def __init__(
        self,
        api_key: str,
        voice_id: str,
        output_dir: str = "outputs",
        timeout: float = 60.0,
    ):
        """
        Args:
            api_key: ElevenLabs API key.
            voice_id: Voice to synthesize with.
            output_dir: Folder for generated MP3 files.
            timeout: Timeout in seconds passed to ``requests.post``.

        Raises:
            ValueError: If ``api_key`` or ``voice_id`` is empty.
        """
        if not api_key:
            raise ValueError("ELEVENLABS_API_KEY is required")
        if not voice_id:
            # An empty id would build a malformed endpoint URL.
            raise ValueError("ELEVENLABS_VOICE_ID is required")
        self.api_key = api_key
        self.voice_id = voice_id
        self.output_dir = output_dir
        self.timeout = timeout
        os.makedirs(self.output_dir, exist_ok=True)
        # Non-streaming endpoint for full quality
        self.url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"

        # Voice settings sent with each request.
        # NOTE(review): said to mirror the custom voice's web-UI settings -- confirm.
        self.settings = {
            "stability": 1.0,
            "similarity_boost": 1.0,
            "use_speaker_boost": True,
            "style": 0.0,
            "speed": 0.90
        }

    def synthesize(self, text: str) -> str:
        """
        Convert ``text`` to speech and save it as an MP3 file.

        Args:
            text: Text to speak.

        Returns:
            Path of the written MP3 file inside ``output_dir``.

        Raises:
            ValueError: If ``text`` is empty or only whitespace.
            requests.HTTPError: If the API answers with an error status.
            requests.RequestException: On connection problems or timeout.
        """
        if not text or not text.strip():
            # Fail fast instead of spending an API call on a request that
            # cannot produce audio.
            raise ValueError("Cannot synthesize empty text")

        headers = {
            "xi-api-key": self.api_key,
            "Content-Type": "application/json",
        }
        payload = {
            "text": text,
            # model_id is left unset so the API uses its default model
            "voice_settings": self.settings
        }

        filename = f"tts_{int(time.time() * 1000)}.mp3"
        path = os.path.join(self.output_dir, filename)

        # stream=True only streams the HTTP body to disk in chunks. The
        # context manager releases the connection even if writing fails, and
        # the timeout keeps a stalled request from blocking the UI forever.
        with requests.post(
            self.url,
            json=payload,
            headers=headers,
            stream=True,
            timeout=self.timeout,
        ) as resp:
            resp.raise_for_status()
            with open(path, "wb") as f:
                for chunk in resp.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

        return path

In [None]:
# Credentials are read from Colab userdata or environment variables.
# Never hard-code API keys in the notebook: any key that has ever been
# committed or shared must be treated as leaked and rotated.
def _load_secret(name):
    """Return secret ``name`` from Colab userdata if available, else from the environment."""
    try:
        from google.colab import userdata
        value = userdata.get(name)
        if value:
            return value
    except Exception:
        # Not on Colab, or the secret is missing / access was not granted:
        # fall through to the environment variable.
        pass
    return os.getenv(name)


GEMINI_API_KEY = _load_secret("GEMINI_API_KEY")
ELEVENLABS_API_KEY = _load_secret("ELEVENLABS_API_KEY")
# The voice id is an identifier, not a credential; it can be overridden via env.
ELEVENLABS_VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID", "vzCewNMrmxS35lNSDJ5T")

# Create Whisper STT (with Gemini correction when a key is available)
stt = WhisperSTT(model_size="medium", gemini_api_key=GEMINI_API_KEY)

# Create Maguy Gemini LLM (raises ValueError if no Gemini key can be found)
llm = MaguyGeminiLLM(
    api_key=GEMINI_API_KEY,
    knowledge_base_path="maguy_knowledge_base"
)

# Create ElevenLabs TTS (raises ValueError if the key is missing)
tts = ElevenLabsTTS(api_key=ELEVENLABS_API_KEY, voice_id=ELEVENLABS_VOICE_ID)

100%|██████████████████████████████████████| 1.42G/1.42G [00:12<00:00, 118MiB/s]


Gemini text correction enabled
 MaguyGeminiLLM with Knowledge Base initialized!


In [None]:
def audio_chat(audio_path, chat_history):
    """
    Run one voice turn: transcribe, generate a reply, synthesize it.

    Steps:
        1. Transcribe the user's audio with ``stt``.
        2. Generate Maguy's text reply with ``llm``.
        3. Synthesize the reply to audio with ``tts``.
    Each message is appended to the history (role/content dicts) as soon as
    it exists, so a failure in a later step does not hide what already
    happened in the conversation.

    Args:
        audio_path: Path of the recorded/uploaded audio, or ``None``.
        chat_history: Existing list of message dicts, or ``None``.

    Returns:
        ``(history, reply_audio_path)``. The audio path is ``None`` when no
        audio was supplied or a step failed; on failure an apology containing
        the error is appended as an assistant message.
    """
    print("▶️ audio_chat called")
    print(f"    audio_path = {audio_path!r}")

    # Work on a copy so the caller's list is never mutated in place.
    history = list(chat_history or [])

    if audio_path is None:
        print("    No audio received.")
        return history, None

    try:
        # 1. Transcription
        start = time.time()
        user_text = stt.transcribe(audio_path)
        end = time.time()
        print(f"    Transcription took {end-start:.1f}s, result={user_text!r}")
        history.append({"role": "user", "content": user_text})

        # 2. Generate Maguy reply (text). The LLM chat session records this
        # exchange, so the visible history records it too.
        maguy_text = llm.generate_response(user_text)
        history.append({"role": "assistant", "content": maguy_text})

        # 3. Synthesize via ElevenLabs
        maguy_audio_path = tts.synthesize(maguy_text)

        return history, maguy_audio_path

    except Exception as e:
        # UI boundary: report the problem in the chat instead of crashing.
        error_msg = f"معليش، صار عندي مشكلة تقنية: {e}"
        history.append({"role": "assistant", "content": error_msg})
        return history, None

In [None]:
# Stylesheet for the app: gold-on-black palette, centered title/subtitle,
# bordered side panels, gold buttons and a footer line.
_APP_CSS = """
    /* 1. Import both Tajawal and Montserrat */
    @import url('https://fonts.googleapis.com/css2?family=Tajawal:wght@400;600&family=Montserrat:wght@500;700&display=swap');

    /* 2. Define your gold & black palette */
    :root {
      --gold:  #D4AF37; /* classic metallic gold */
      --black: #000000; /* deep black */
      --white: #FFFFFF; /* for contrast */
      --text:  #FFFFFF; /* white text for readability */
    }

    /* 3. Global body & text */
    body {
      background-color: var(--black);
      color: var(--white);
    }

    /* 4. Remove default margins/padding on Gradio HTML wrappers */
    div[class*="html"] {
      margin: 0 !important;
      padding: 0 !important;
    }

    /* 5. Title & subtitle styling */
    #title {
      font-family: 'Montserrat', sans-serif !important;
      font-weight: 700;
      font-size: 2rem;
      color: var(--gold);
      margin: 0 !important;
      padding: 0 !important;
      text-align: center;
    }
    #subtitle {
      font-family: 'Montserrat', sans-serif !important;
      font-weight: 500;
      font-size: 1rem;
      color: var(--gold);
      margin: 0 !important;
      padding: 0 0 1em 0 !important;
      text-align: center;
    }

    /* 6. Panels with subtle gold tint */
    #left_col, #right_col {
      background-color: rgba(212, 175, 55, 0.05);
      border: 1px solid var(--gold);
      border-radius: 12px;
      padding: 15px;
      box-shadow: 1px 1px 5px rgba(0,0,0,0.3);
      margin-bottom: 0;
    }

    /* 7. Buttons in gold */
    .gradio-container .gr-button {
      background-color: var(--gold) !important;
      color: var(--black) !important;
      border-radius: 8px !important;
      transition: filter .1s ease-in-out;
    }
    .gradio-container .gr-button:hover {
      filter: brightness(1.1);
    }

    /* 8. Footer styling */
    #footer {
      text-align: center;
      color: var(--gold);
      font-family: 'Montserrat', sans-serif;
      font-size: 1.25rem;
      margin-top: 0;
      padding-top: 5px;
      border-top: 1px solid rgba(255,255,255,0.2);
    }
    """

# Static markup for the page header and footer.
_TITLE_HTML = "<h1 id='title'>🎤 Maguy Bou Ghosn Voice Chat</h1>"
_SUBTITLE_HTML = "<p id='subtitle'>Speak in Lebanese Arabic and hear Maguy reply like in a real phone call</p>"
_FOOTER_HTML = "<div id='footer'>Mohamad Ali Oussayli &nbsp;&amp;&nbsp; Maryam Saghir &nbsp;|&nbsp; Lebanese University</div>"

# Page layout: header, a two-column row (voice controls | chat history),
# footer, and the button wiring that runs one voice turn.
with gr.Blocks(title="Interactive Voice Call with Maguy Abou Ghosn", css=_APP_CSS) as demo:

    # Title and subtitle
    gr.HTML(_TITLE_HTML)
    gr.HTML(_SUBTITLE_HTML)

    with gr.Group():
        with gr.Row():
            # Narrow column: record/upload, send, and playback of the reply
            with gr.Column(scale=1, elem_id="left_col"):
                audio_in = gr.Audio(label="🎙️ Record Your Voice", sources=["microphone", "upload"], type="filepath")
                submit_btn = gr.Button("Send to Maguy")
                audio_out = gr.Audio(label="🔊 Maguy's Voice Reply", type="filepath")

            # Wide column: running transcript of the conversation
            with gr.Column(scale=2, elem_id="right_col"):
                chatbot = gr.Chatbot(label="💬 Chat History", height=443, type="messages")

    # Credits
    gr.HTML(_FOOTER_HTML)

    # Pressing the button feeds the recording and current history to
    # audio_chat, which returns the updated history and the reply audio.
    submit_btn.click(fn=audio_chat, inputs=[audio_in, chatbot], outputs=[chatbot, audio_out])

# debug=True keeps the cell running so errors and logs stay visible.
demo.launch(debug=True)

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://7dc23cbb76d12d8134.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


▶️ audio_chat called
    audio_path = '/tmp/gradio/67eef486da6c508c8f6bf427e21e542a022548552d465cc744a2405028d49e17/audio.wav'
    Transcription took 5.1s, result='صباحو ماجي، كيفك اليوم؟'
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://7dc23cbb76d12d8134.gradio.live


