<a href="https://colab.research.google.com/github/Gowthamtj17/Gowthamtj17-SeamlessSpeech-Multilingual-Speech-Text-Translator/blob/main/SeamlessSpeech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import torch
import numpy as np
import soundfile as sf
import librosa
import gradio as gr
from transformers import AutoProcessor, SeamlessM4Tv2Model


In [None]:
# Hugging Face checkpoint to load; point this at another checkpoint if needed.
MODEL_ID = "facebook/seamless-m4t-v2-large"
# Run on the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Download (or reuse the cached) processor and weights for MODEL_ID, then move
# the model to the selected device.
print("Loading processor and model (this may take a while)...")
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = SeamlessM4Tv2Model.from_pretrained(MODEL_ID)
model = model.to(device)
model.eval()  # put the module in evaluation mode; this notebook only runs inference
print(f"Loaded model on {device}")

Loading processor and model (this may take a while)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.17M [00:00<?, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.24G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Instantiating a decoder SeamlessM4Tv2Attention without passing `layer_idx` is not recommended and will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json: 0.00B [00:00, ?B/s]

Loaded model on cpu


In [None]:
# -----------------------------
# Utility Functions
# -----------------------------
def downsample_to_16k(src_path, dst_path="tmp_16k.wav"):
    """Write a 16 kHz copy of an audio file and return the copy's path.

    The file is loaded with librosa at its native rate (``sr=None``) and
    resampled only when that rate is not already 16000 Hz.

    Args:
        src_path: Path of the audio file to read.
        dst_path: Where the 16 kHz audio is written (overwritten on each call).

    Returns:
        ``dst_path``.
    """
    target_sr = 16000
    samples, native_sr = librosa.load(src_path, sr=None)
    needs_resampling = native_sr != target_sr
    if needs_resampling:
        samples = librosa.resample(samples, orig_sr=native_sr, target_sr=target_sr)
    sf.write(dst_path, samples, target_sr)
    return dst_path


def waveform_to_file(waveform, sample_rate, out_path):
    """Save a waveform to ``out_path`` as float32 audio and return the path.

    Args:
        waveform: Array-like of samples. Input with more than one dimension
            is collapsed by averaging over axis 0 before writing.
        sample_rate: Sample rate passed to ``sf.write``.
        out_path: Destination file path.

    Returns:
        ``out_path``.
    """
    samples = np.asarray(waveform).astype(np.float32)
    # Anything beyond 1-D is averaged along its first axis.
    mono = samples.mean(axis=0) if samples.ndim > 1 else samples
    sf.write(out_path, mono, sample_rate)
    return out_path


def decode_text(tokens):
    """Turn generated token ids into a string using the global ``processor``.

    Args:
        tokens: Token ids as a ``torch.Tensor``, a NumPy array, or a plain
            (nested) sequence of ints. For 2-D input only the first row is
            decoded.

    Returns:
        The decoded text with special tokens skipped. If decoding fails, the
        ``str()`` of the ids is returned instead so the caller still gets
        something to display (best-effort fallback).
    """
    if isinstance(tokens, torch.Tensor):
        tokens = tokens.detach().cpu().numpy()

    try:
        # np.asarray lets plain Python lists take the same path as arrays.
        ids = np.asarray(tokens)
        if ids.ndim == 2:
            ids = ids[0]
        tokens = ids  # the fallback below reports the row we tried to decode
        return processor.decode(ids.tolist(), skip_special_tokens=True)
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt / SystemExit
        # must propagate so a running cell can still be interrupted.
        return str(tokens)

In [None]:
# -----------------------------
# Model Task Functions
# -----------------------------
def do_asr(audio_file, src_lang=None):
    """Transcribe speech from an audio file to text.

    Args:
        audio_file: Path of the audio file to transcribe.
        src_lang: Optional language code of the speech (e.g. ``"eng"``). When
            given it is passed to ``model.generate`` as ``tgt_lang`` so the
            transcript is produced in that language. When omitted,
            ``generate`` is called without ``tgt_lang`` exactly as before,
            which leaves the output language up to the model.

    Returns:
        The decoded transcript string.
    """
    tmp = downsample_to_16k(audio_file)
    audio, _ = sf.read(tmp)
    inp = processor(audio=audio, sampling_rate=16000, return_tensors="pt").to(device)

    gen_kwargs = {"generate_speech": False}
    if src_lang:
        # Transcription is text generation with target language == spoken language.
        gen_kwargs["tgt_lang"] = src_lang

    with torch.no_grad():
        out = model.generate(**inp, **gen_kwargs)
    return decode_text(out[0])


def do_s2tt(audio_file, tgt_lang):
    """Translate speech in an audio file into text in ``tgt_lang``.

    Args:
        audio_file: Path of the audio file to translate.
        tgt_lang: Language code passed to ``model.generate`` as ``tgt_lang``.

    Returns:
        The decoded translation string.
    """
    resampled_path = downsample_to_16k(audio_file)
    samples, _ = sf.read(resampled_path)

    features = processor(audio=samples, sampling_rate=16000, return_tensors="pt")
    features = features.to(device)

    with torch.no_grad():
        generated = model.generate(**features, tgt_lang=tgt_lang, generate_speech=False)

    token_ids = generated[0]
    return decode_text(token_ids)


def do_s2st(audio_file, tgt_lang, out_file="s2st.wav"):
    """Translate speech in an audio file into speech in ``tgt_lang``.

    Args:
        audio_file: Path of the audio file to translate.
        tgt_lang: Language code passed to ``model.generate`` as ``tgt_lang``.
        out_file: Path the generated audio is written to (overwritten on
            each call).

    Returns:
        ``out_file``.
    """
    tmp = downsample_to_16k(audio_file)
    audio, _ = sf.read(tmp)
    inp = processor(audio=audio, sampling_rate=16000, return_tensors="pt").to(device)
    with torch.no_grad():
        out_audio = model.generate(**inp, tgt_lang=tgt_lang)
    wav = out_audio[0].cpu().numpy().squeeze()

    # Write the file at the rate the model says it generates audio at, rather
    # than a magic constant; 16000 is kept as the fallback when the config
    # does not expose a rate.
    out_sr = getattr(getattr(model, "config", None), "sampling_rate", None) or 16000
    waveform_to_file(wav, out_sr, out_file)
    return out_file


def do_t2st(text, src_lang, tgt_lang, out_file="t2st.wav"):
    """Translate text from ``src_lang`` into speech in ``tgt_lang``.

    Args:
        text: The input text.
        src_lang: Language code of ``text``, passed to the processor.
        tgt_lang: Language code passed to ``model.generate`` as ``tgt_lang``.
        out_file: Path the generated audio is written to (overwritten on
            each call).

    Returns:
        ``out_file``.
    """
    inp = processor(text=[text], src_lang=src_lang, return_tensors="pt").to(device)

    with torch.no_grad():
        out_audio = model.generate(**inp, tgt_lang=tgt_lang)
    wav = out_audio[0].cpu().numpy().squeeze()

    # Write the file at the rate the model says it generates audio at, rather
    # than a magic constant; 16000 is kept as the fallback when the config
    # does not expose a rate.
    out_sr = getattr(getattr(model, "config", None), "sampling_rate", None) or 16000
    waveform_to_file(wav, out_sr, out_file)
    return out_file



def do_t2tt(text, tgt_lang, src_lang=None):
    """Translate text into ``tgt_lang``.

    Args:
        text: The input text.
        tgt_lang: Language code passed to ``model.generate`` as ``tgt_lang``.
        src_lang: Optional language code of ``text``. When given it is passed
            to the processor, matching what ``do_t2st`` does; when omitted
            the processor is called without ``src_lang`` exactly as before
            and uses whatever source language it defaults to.

    Returns:
        The decoded translation string.
    """
    proc_kwargs = {"text": [text], "return_tensors": "pt"}
    if src_lang:
        proc_kwargs["src_lang"] = src_lang
    inp = processor(**proc_kwargs).to(device)

    with torch.no_grad():
        out_ids = model.generate(**inp, tgt_lang=tgt_lang, generate_speech=False)
    return decode_text(out_ids[0])

In [None]:
# -----------------------------
# Language Choices
# -----------------------------
# (display label, language code) pairs offered in the UI dropdowns. The code
# is what gets passed to the processor / model as src_lang / tgt_lang, so it
# must be one the SeamlessM4T checkpoint knows. SeamlessM4T lists Mandarin
# Chinese as "cmn" and Modern Standard Arabic as "arb"; the macrolanguage
# codes "zho" and "ara" are not in its language table.
LANG_CHOICES = [
    ("English", "eng"),
    ("Hindi", "hin"),
    ("Tamil", "tam"),
    ("Telugu", "tel"),
    ("Malayalam", "mal"),
    ("Spanish", "spa"),
    ("French", "fra"),
    ("German", "deu"),
    ("Chinese", "cmn"),
    ("Japanese", "jpn"),
    ("Arabic", "arb"),
    ("Russian", "rus"),
]

In [None]:
# -----------------------------
# Dispatcher
# -----------------------------
def run_task(task, audio, text, src, tgt):
    """Run the selected task and return values for the three UI outputs.

    Args:
        task: One of ``"ASR"``, ``"S2TT"``, ``"S2ST"``, ``"T2ST"``, ``"T2TT"``.
        audio: Path of the input audio file (used by the speech-input tasks).
        text: Input text (used by the text-input tasks).
        src: Source language code (used by ``"T2ST"``).
        tgt: Target language code.

    Returns:
        A ``(output_text, output_audio_path_or_None, log_message)`` tuple.
        Failures never raise: they are reported as
        ``("Error: <message>", None, "<message>")``.
    """
    try:
        # Fail early with a readable message instead of letting a missing
        # input surface as an obscure error from deep inside the pipeline.
        if task in ("ASR", "S2TT", "S2ST") and not audio:
            raise ValueError("No audio provided: record or upload a clip for this task.")
        if task in ("T2ST", "T2TT") and not (text or "").strip():
            raise ValueError("No text provided: enter some text for this task.")

        if task == "ASR":
            return do_asr(audio), None, "ASR Completed"

        if task == "S2TT":
            return do_s2tt(audio, tgt), None, "S2TT Completed"

        if task == "S2ST":
            out = do_s2st(audio, tgt)
            return "Generated audio", out, "S2ST Completed"

        if task == "T2ST":
            out = do_t2st(text, src, tgt)
            return "Generated audio", out, "T2ST Completed"

        if task == "T2TT":
            return do_t2tt(text, tgt), None, "T2TT Completed"

        return "Unknown task", None, "Error"
    except Exception as e:
        return f"Error: {e}", None, str(e)



In [None]:

# -----------------------------
# UI — Clean & Compatible
# -----------------------------
# Which inputs each task shows. These drive both the initial layout and the
# updates on task change, so the page already matches the default task when it
# first loads (the change handler alone only runs after the user switches task).
AUDIO_TASKS = ("ASR", "S2TT", "S2ST")
TEXT_TASKS = ("T2ST", "T2TT")
SRC_LANG_TASKS = ("T2ST",)
DEFAULT_TASK = "S2TT"

with gr.Blocks(title="SeamlessSpeech— Multilingual Speech & Text Translator") as demo:
    gr.Markdown(
        """
        # SeamlessSpeech— Multilingual Speech & Text Translator
        **Speech ↔ Text • Multilingual • Translation & Generation**
        """
    )

    with gr.Row():
        # Left column: task selection and inputs.
        with gr.Column(scale=1):
            task = gr.Radio(
                ["ASR", "S2TT", "S2ST", "T2ST", "T2TT"],
                value=DEFAULT_TASK,
                label="Choose Task",
            )

            audio_in = gr.Audio(
                label="Audio Input",
                type="filepath",
                visible=DEFAULT_TASK in AUDIO_TASKS,
            )
            text_in = gr.Textbox(
                label="Text Input",
                lines=4,
                placeholder="Enter text here...",
                visible=DEFAULT_TASK in TEXT_TASKS,
            )

            with gr.Row():
                src_lang = gr.Dropdown(
                    LANG_CHOICES,
                    value="eng",
                    label="Source Language",
                    visible=DEFAULT_TASK in SRC_LANG_TASKS,
                )
                tgt_lang = gr.Dropdown(LANG_CHOICES, value="eng", label="Target Language")

            run = gr.Button("Run", variant="primary")

        # Right column: outputs filled in by run_task.
        with gr.Column(scale=1):
            out_text = gr.Textbox(label="Output Text", lines=6)
            out_audio = gr.Audio(label="Output Audio")
            log = gr.Textbox(label="Log", lines=3)

    def ui_visibility(t):
        """Return visibility updates for (audio_in, text_in, src_lang) for task ``t``."""
        return (
            gr.update(visible=t in AUDIO_TASKS),
            gr.update(visible=t in TEXT_TASKS),
            gr.update(visible=t in SRC_LANG_TASKS),
        )

    task.change(ui_visibility, task, [audio_in, text_in, src_lang])

    run.click(
        run_task,
        inputs=[task, audio_in, text_in, src_lang, tgt_lang],
        outputs=[out_text, out_audio, log],
    )


print("Launching Gradio app…")
demo.launch()


Launching Gradio app…
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5059cc67ea76fd5c17.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


