<a href="https://colab.research.google.com/github/Maxxx-VS/The-Founder/blob/master/50_4_Speech_synthesis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
# Install dependencies with compatible versions.
# The preinstalled numpy/torch/torchaudio are removed first so that the
# pinned versions below are the ones that end up installed.
!pip uninstall -y numpy torch torchaudio
!pip install numpy==1.23.5
# torch/torchaudio are resolved with the cu118 PyTorch wheel index as an extra source.
!pip install torch==2.0.1 torchaudio==2.0.2 --extra-index-url https://download.pytorch.org/whl/cu118
# NOTE(review): sounddevice presumably needs the PortAudio system library and
# a local input device on the machine running this cell — confirm on the target runtime.
!pip install transformers==4.30.2 speechbrain==0.5.15 gradio==3.41.2 sounddevice==0.4.6
# Coqui TTS is installed from its Git repository at the v0.13.0 tag.
!pip install git+https://github.com/coqui-ai/TTS.git@v0.13.0
!pip install pydub

import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"  # Resolves potential library conflicts

import gradio as gr
import torch
import torchaudio
import numpy as np
import sounddevice as sd
from queue import Queue
import time
import warnings
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
from TTS.api import TTS
from speechbrain.pretrained import EncoderClassifier
from pydub import AudioSegment
from pydub.playback import play

warnings.filterwarnings("ignore")

class VoiceTranslator:
    """Record speech, transcribe it, translate it and speak the translation.

    The pipeline is: microphone capture (``sounddevice``) -> speech
    recognition (Whisper via ``transformers.pipeline``) -> text translation
    (Helsinki-NLP opus-mt models) -> speaker embedding (SpeechBrain ECAPA)
    -> speech synthesis (Coqui TTS).

    Every stage catches its own exceptions, prints a diagnostic and returns
    an "empty" value (``""``, ``None`` or an empty array) instead of raising,
    so the UI callback always gets a result.
    """

    def __init__(self):
        """Pick the compute device, set audio defaults and load all models."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"🚀 Initializing on {self.device}...")

        # Audio settings
        self.sample_rate = 16000
        self.is_recording = False
        # Not consumed by any method of this class.
        self.audio_queue = Queue()

        # Initialize models
        self.init_models()

    def init_models(self):
        """Initialize all required models.

        Populates ``self.asr``, ``self.translators``, ``self.voice_encoder``
        and ``self.tts``.

        Raises:
            Exception: whatever the underlying loaders raise; the error is
                printed and then re-raised.
        """
        try:
            print("🔊 Loading speech recognition model...")
            self.asr = pipeline(
                "automatic-speech-recognition",
                model="openai/whisper-medium",
                device=self.device
            )

            print("🌍 Loading translation models...")
            # Keyed by (source language, target language).
            self.translators = {
                ('en', 'ru'): self.load_translator("Helsinki-NLP/opus-mt-en-ru"),
                ('ru', 'en'): self.load_translator("Helsinki-NLP/opus-mt-ru-en"),
                ('de', 'en'): self.load_translator("Helsinki-NLP/opus-mt-de-en"),
                ('en', 'de'): self.load_translator("Helsinki-NLP/opus-mt-en-de"),
                ('de', 'ru'): self.load_translator("Helsinki-NLP/opus-mt-de-ru"),
                ('ru', 'de'): self.load_translator("Helsinki-NLP/opus-mt-ru-de")
            }

            print("🎙️ Loading voice encoder...")
            self.voice_encoder = EncoderClassifier.from_hparams(
                source="speechbrain/spkrec-ecapa-voxceleb",
                run_opts={"device": self.device},
                savedir="tmp_voice_model"
            )

            print("🗣️ Loading TTS with voice cloning...")
            # NOTE(review): confirm that the pinned TTS release supports
            # `.to(device)` on the API object (older releases may expect a
            # `gpu=` constructor argument instead).
            self.tts = TTS(
                model_name="tts_models/multilingual/multi-dataset/your_tts",
                progress_bar=False
            ).to(self.device)

            print("✅ All models loaded successfully!")

        except Exception as e:
            print(f"❌ Initialization failed: {str(e)}")
            raise

    def load_translator(self, model_name):
        """Load a seq2seq translation model and its tokenizer.

        Args:
            model_name: Hugging Face model identifier.

        Returns:
            dict with keys ``'model'`` (moved to ``self.device``) and
            ``'tokenizer'``.
        """
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(self.device)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        return {'model': model, 'tokenizer': tokenizer}

    @staticmethod
    def _prepare_waveform(audio):
        """Return *audio* as peak-normalized mono float32, or None if unusable.

        Multi-dimensional input is averaged over axis 1 (the channel axis of
        the ``(frames, channels)`` arrays produced by ``record_audio``).
        ``None`` is returned for empty, all-zero or non-finite input so that
        callers never divide by a zero peak.
        """
        samples = np.asarray(audio, dtype=np.float32)
        if samples.ndim > 1:
            samples = samples.mean(axis=1)
        if samples.size == 0:
            return None
        peak = float(np.max(np.abs(samples)))
        if not np.isfinite(peak) or peak == 0.0:
            return None
        return samples / peak

    def record_audio(self, duration=5):
        """Record audio from microphone.

        Recording stops after *duration* seconds or as soon as
        ``stop_recording`` clears ``self.is_recording``.

        Args:
            duration: maximum recording length in seconds.

        Returns:
            The concatenated float32 chunks delivered by the input stream,
            or an empty array if nothing was captured or an error occurred.
        """
        print(f"⏺️ Recording {duration}s of audio...")
        self.is_recording = True
        chunks = []

        # The third parameter is deliberately not called `time`, so the
        # `time` module name is never shadowed.
        def callback(indata, frames, time_info, status):
            if self.is_recording:
                chunks.append(indata.copy())

        try:
            with sd.InputStream(
                callback=callback,
                channels=1,
                samplerate=self.sample_rate,
                dtype='float32'
            ):
                started_at = time.time()
                while self.is_recording and (time.time() - started_at < duration):
                    time.sleep(0.1)

            return np.concatenate(chunks) if chunks else np.zeros(0)
        except Exception as e:
            print(f"❌ Recording error: {str(e)}")
            return np.zeros(0)
        finally:
            # Leave the flag in a consistent state whether we timed out,
            # were stopped, or failed.
            self.is_recording = False

    def recognize_speech(self, audio, language=None):
        """Convert speech to text.

        Args:
            audio: recorded samples; multi-channel input is mixed to mono.
            language: optional language code (e.g. ``"en"``). When given it
                is passed to the ASR model as the token ``<|code|>``; when
                omitted no language is forced and the model is left to
                detect it.

        Returns:
            The recognized text, or ``""`` for empty/silent audio or on error.
        """
        if len(audio) == 0:
            return ""

        print("🔍 Recognizing speech...")
        try:
            waveform = self._prepare_waveform(audio)
            if waveform is None:
                # Silent input: nothing to transcribe, and normalizing it
                # would only produce NaNs.
                return ""

            asr_kwargs = {}
            if language:
                asr_kwargs["generate_kwargs"] = {"language": f"<|{language}|>"}

            # Recognize
            result = self.asr(waveform, **asr_kwargs)
            text = result["text"]
            print(f"💬 Recognized: {text}")
            return text
        except Exception as e:
            print(f"❌ Recognition error: {str(e)}")
            return ""

    def translate_text(self, text, source_lang, target_lang):
        """Translate text between languages.

        Args:
            text: text to translate.
            source_lang: source language code.
            target_lang: target language code.

        Returns:
            *text* unchanged when it is empty or both languages are equal;
            the translation on success; a bracketed marker string when the
            language pair has no loaded model or translation fails.
        """
        if not text or source_lang == target_lang:
            return text

        print(f"🌐 Translating {source_lang} → {target_lang}...")
        try:
            translator = self.translators.get((source_lang, target_lang))
            if not translator:
                return f"[UNSUPPORTED TRANSLATION: {source_lang}→{target_lang}]"

            # Truncate so over-long input cannot exceed the model's maximum length.
            inputs = translator['tokenizer'](
                text, return_tensors="pt", truncation=True
            ).to(self.device)
            translated_ids = translator['model'].generate(**inputs)
            translated_text = translator['tokenizer'].decode(translated_ids[0], skip_special_tokens=True)
            print(f"✅ Translated: {translated_text}")
            return translated_text
        except Exception as e:
            print(f"❌ Translation error: {str(e)}")
            return "[TRANSLATION ERROR]"

    def extract_voice_features(self, audio):
        """Extract speaker embeddings for voice cloning.

        Args:
            audio: recorded samples; multi-channel input is mixed to mono.

        Returns:
            The encoder output with its leading batch axis removed, as a
            NumPy array, or ``None`` for empty/silent audio or on error.
        """
        if len(audio) == 0:
            return None

        print("🎤 Extracting voice features...")
        try:
            waveform = self._prepare_waveform(audio)
            if waveform is None:
                return None

            # Mono conversion happens before the batch axis is added, so the
            # encoder receives a (1, samples) tensor.
            audio_tensor = torch.from_numpy(waveform).unsqueeze(0)

            with torch.no_grad():
                embedding = self.voice_encoder.encode_batch(audio_tensor.to(self.device))

            return embedding.squeeze(0).cpu().numpy()
        except Exception as e:
            print(f"❌ Voice feature extraction error: {str(e)}")
            return None

    def synthesize_speech(self, text, voice_features, target_lang):
        """Convert text to speech with voice cloning.

        Args:
            text: text to speak.
            voice_features: speaker embedding from ``extract_voice_features``.
            target_lang: language code; unknown codes fall back to ``'en'``.

        Returns:
            The synthesized waveform as a NumPy array, or ``None`` when
            *text* is empty, *voice_features* is ``None`` or synthesis fails.
        """
        if not text or voice_features is None:
            return None

        print("🔊 Synthesizing speech...")
        try:
            lang_code = {'en': 'en', 'ru': 'ru', 'de': 'de'}.get(target_lang, 'en')
            speaker_embedding = torch.from_numpy(voice_features).unsqueeze(0).to(self.device)

            # NOTE(review): confirm that the pinned TTS release exposes
            # `tts_with_embeddings` and that the loaded model accepts these
            # language codes; any failure here is caught below and reported
            # as "Synthesis error".
            wav = self.tts.tts_with_embeddings(
                text=text,
                speaker_embedding=speaker_embedding,
                language=lang_code
            )

            return np.array(wav)
        except Exception as e:
            print(f"❌ Synthesis error: {str(e)}")
            return None

    def process_audio(self, source_lang, target_lang, duration):
        """Complete processing pipeline.

        Args:
            source_lang: language spoken in the recording.
            target_lang: language to translate into.
            duration: recording length in seconds.

        Returns:
            Tuple ``(recognized_text, translated_text, audio)`` where *audio*
            is ``(sample_rate, waveform)`` or ``None``. On early exit the
            first element carries a status/error message instead.
        """
        try:
            # 1. Record audio
            audio = self.record_audio(duration)
            if len(audio) == 0:
                return "No audio recorded", "", None

            # 2. Recognize speech (the selected source language is forced)
            text = self.recognize_speech(audio, language=source_lang)
            if not text.strip():
                return "No speech detected", "", None

            # 3. Translate text
            translated_text = self.translate_text(text, source_lang, target_lang)

            # 4. Extract voice features
            voice_features = self.extract_voice_features(audio)

            # 5. Synthesize translated speech
            translated_audio = self.synthesize_speech(translated_text, voice_features, target_lang)

            if translated_audio is None:
                return text, translated_text, None

            # NOTE(review): the recording rate is reported as the playback
            # rate — confirm the TTS model outputs at the same rate.
            return text, translated_text, (self.sample_rate, translated_audio)

        except Exception as e:
            print(f"❌ Processing error: {str(e)}")
            return f"Error: {str(e)}", "", None

    def stop_recording(self):
        """Stop the recording process and return a status message."""
        self.is_recording = False
        return "⏹️ Recording stopped"

def create_interface():
    """Build the Gradio UI around a freshly constructed VoiceTranslator.

    The left column holds the language selectors, the duration slider and
    the two action buttons; the right column shows the recognized text, the
    translation and the synthesized audio.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """
    engine = VoiceTranslator()
    language_codes = ("en", "ru", "de")

    with gr.Blocks(title="Real-Time Voice Translator") as app:
        gr.Markdown("""
        # 🎙️ Real-Time Voice Translator
        *Supports English ↔ Russian ↔ German with voice preservation*
        """)

        with gr.Row():
            # Controls
            with gr.Column():
                src_choice = gr.Dropdown(
                    list(language_codes), label="Source Language", value="en"
                )
                dst_choice = gr.Dropdown(
                    list(language_codes), label="Target Language", value="ru"
                )
                seconds = gr.Slider(
                    1, 10, value=5, step=1, label="Recording Duration (seconds)"
                )

                with gr.Row():
                    start_button = gr.Button("🎤 Record & Translate", variant="primary")
                    halt_button = gr.Button("⏹ Stop")

            # Results
            with gr.Column():
                original_box = gr.Textbox(label="Original Text")
                translation_box = gr.Textbox(label="Translation")
                speech_player = gr.Audio(label="Translated Speech", type="numpy")

        # Run the whole record -> translate -> speak pipeline on click.
        pipeline_inputs = [src_choice, dst_choice, seconds]
        pipeline_outputs = [original_box, translation_box, speech_player]
        start_button.click(
            fn=engine.process_audio,
            inputs=pipeline_inputs,
            outputs=pipeline_outputs,
        )

        # The stop message is shown in the "Original Text" box.
        halt_button.click(fn=engine.stop_recording, outputs=original_box)

    return app

# Launch the interface.
# create_interface() constructs a VoiceTranslator, so every model is loaded
# here before the UI is served. `demo` stays a module-level name.
print("🚀 Starting application...")
demo = create_interface()
# share=True asks Gradio for a share link in addition to the local server.
demo.launch(share=True)