- This notebook tests Meta's new SeamlessM4T v2 model.


### Supported Languages

https://huggingface.co/facebook/seamless-m4t-v2-large#supported-languages

In [None]:
!pip install transformers==4.35.2
!pip install datasets==2.15.0 sentencepiece==0.1.99

!sudo apt install libsndfile1
!pip install torchaudio==2.1.0 fairseq2==0.2.0

In [None]:
!git clone https://github.com/facebookresearch/seamless_communication
!cd seamless_communication && pip install .

In [None]:
import os
import pathlib
import numpy as np
import torch
import torchaudio
from huggingface_hub import snapshot_download
from seamless_communication.inference import Translator
from IPython.display import Audio

In [None]:
# Sample rate (Hz) that input audio is resampled to and that generated audio is reported at.
AUDIO_SAMPLE_RATE = 16000.0
# Hugging Face Hub repository the model snapshot is downloaded from.
REPO_ID = "facebook/seamless-m4t-v2-large"
# Maximum input clip length in seconds; longer audio is truncated to this length.
MAX_INPUT_AUDIO_LENGTH = 60
# Local directory that receives the downloaded snapshot.
CHECKPOINTS_PATH = pathlib.Path("/content/models")

# Use the first CUDA GPU in half precision when one is available,
# otherwise fall back to the CPU in full precision.
device, dtype = (
    (torch.device("cuda:0"), torch.float16)
    if torch.cuda.is_available()
    else (torch.device("cpu"), torch.float32)
)

In [None]:
def preprocess_audio(input_audio: str) -> None:
    """Resample an audio file to ``AUDIO_SAMPLE_RATE`` and cap its length, in place.

    WARNING: the processed audio is written back to ``input_audio``, so the
    original recording at that path is overwritten.

    Args:
        input_audio: Path of the audio file; it is read, processed and rewritten.
    """
    # torchaudio documents its sample-rate arguments as integers, so convert the
    # float constant once and use the integer for both resampling and saving.
    target_rate = int(AUDIO_SAMPLE_RATE)

    waveform, original_rate = torchaudio.load(input_audio)
    waveform = torchaudio.functional.resample(
        waveform, orig_freq=original_rate, new_freq=target_rate
    )

    # Largest number of samples kept; axis 1 is sliced as the time axis.
    max_samples = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE)
    if waveform.shape[1] > max_samples:
        waveform = waveform[:, :max_samples]
        print(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")

    torchaudio.save(input_audio, waveform, sample_rate=target_rate)

def run_s2st(
    input_audio: str, source_language: str, target_language: str
) -> tuple[tuple[int, np.ndarray] | None, str]:
    preprocess_audio(input_audio)
    source_language_code = source_language
    target_language_code = target_language
    out_texts, out_audios = translator.predict(
        input=input_audio,
        task_str="S2ST",
        src_lang=source_language_code,
        tgt_lang=target_language_code,
    )
    out_text = str(out_texts[0])
    out_wav = out_audios.audio_wavs[0].cpu().detach().numpy()
    return (int(AUDIO_SAMPLE_RATE), out_wav), out_text


def run_s2tt(input_audio: str, source_language: str, target_language: str) -> str:
    """Speech-to-text translation (task ``"S2TT"``) of an audio file.

    The file is first normalised by ``preprocess_audio`` (which rewrites it in
    place) and then passed to the global ``translator``.

    Args:
        input_audio: Path of the audio file to translate.
        source_language: Language code forwarded as ``src_lang``.
        target_language: Language code forwarded as ``tgt_lang``.

    Returns:
        The first translated text, converted to ``str``.
    """
    preprocess_audio(input_audio)
    # Only the text half of the prediction is needed for this task.
    texts = translator.predict(
        input=input_audio,
        task_str="S2TT",
        src_lang=source_language,
        tgt_lang=target_language,
    )[0]
    return str(texts[0])


def run_t2st(input_text: str, source_language: str, target_language: str) -> tuple[tuple[int, np.ndarray] | None, str]:
    source_language_code = source_language
    target_language_code = target_language
    out_texts, out_audios = translator.predict(
        input=input_text,
        task_str="T2ST",
        src_lang=source_language_code,
        tgt_lang=target_language_code,
    )
    out_text = str(out_texts[0])
    out_wav = out_audios.audio_wavs[0].cpu().detach().numpy()
    return (int(AUDIO_SAMPLE_RATE), out_wav), out_text


def run_t2tt(input_text: str, source_language: str, target_language: str) -> str:
    """Text-to-text translation (task ``"T2TT"``) of a text string.

    Args:
        input_text: Text to translate.
        source_language: Language code forwarded as ``src_lang``.
        target_language: Language code forwarded as ``tgt_lang``.

    Returns:
        The first translated text, converted to ``str``.
    """
    # Only the text half of the prediction is needed for this task.
    texts = translator.predict(
        input=input_text,
        task_str="T2TT",
        src_lang=source_language,
        tgt_lang=target_language,
    )[0]
    return str(texts[0])


def run_asr(input_audio: str, target_language: str) -> str:
    """Transcribe an audio file (task ``"ASR"``).

    The file is first normalised by ``preprocess_audio`` (which rewrites it in
    place) and then passed to the global ``translator``.

    Args:
        input_audio: Path of the audio file to transcribe.
        target_language: Language code forwarded as both ``src_lang`` and
            ``tgt_lang``.

    Returns:
        The first transcribed text, converted to ``str``.
    """
    preprocess_audio(input_audio)
    # Source and target are deliberately the same language for transcription.
    texts = translator.predict(
        input=input_audio,
        task_str="ASR",
        src_lang=target_language,
        tgt_lang=target_language,
    )[0]
    return str(texts[0])

In [None]:
# Download the REPO_ID model snapshot from the Hugging Face Hub into CHECKPOINTS_PATH.
# The existence check lets re-runs skip the download once that path is present.
# NOTE(review): the check only tests that the path exists, not what it contains —
# confirm an interrupted download cannot leave a partial directory that is then skipped.
# NOTE(review): no other visible cell reads CHECKPOINTS_PATH (the Translator is built
# from card names only) — confirm these downloaded files are the ones actually loaded.
if not CHECKPOINTS_PATH.exists():
    snapshot_download(repo_id=REPO_ID, repo_type="model", local_dir=CHECKPOINTS_PATH)

In [None]:
# Build the shared translator; every run_* helper reaches it through the global
# name `translator`, so this cell must run before any of them is called.
# `device` and `dtype` come from the configuration cell (CUDA + float16 when a GPU
# is available, otherwise CPU + float32).
# NOTE(review): apply_mintox=True presumably enables MinTox toxicity mitigation —
# confirm against the seamless_communication documentation.
translator = Translator(
    model_name_or_card="seamlessM4T_v2_large",
    vocoder_name_or_card="vocoder_v2",
    device=device,
    dtype=dtype,
    apply_mintox=True,
)

### Let's Try

In [None]:
# Listen to the input sample.
# No `rate` is passed: for a file path IPython embeds the file's own encoded audio,
# and `rate` only applies when raw sample arrays are given.
Audio("/content/voices/r1.wav")

In [None]:
# Speech-to-speech: translate the sample from "arz" (Egyptian Arabic) to "eng" (English).
# run_s2st returns ((sample_rate, waveform), text); note that it rewrites the
# input file in place through preprocess_audio.
output_audio, output_text = run_s2st("/content/voices/r1.wav", "arz", "eng")
print(output_text)
# output_audio[1][0] takes the first entry along the waveform's leading axis —
# presumably a single channel/batch item; confirm the array shape.
Audio(output_audio[1][0], rate=output_audio[0])

I bought a book from you yesterday, but so far it hasn't reached me.


In [None]:
# Speech-to-text translation: same sample, "arz" speech to "eng" text.
output_text = run_s2tt("/content/voices/r1.wav", "arz", "eng")
print(output_text)

I bought a book from you yesterday, but so far it hasn't reached me.


In [None]:
# Text-to-text translation: an "arz" sentence translated to "eng".
output_text = run_t2tt("هو آخر الشهر حييجي امتى عشان الواحد ياخد المرتب", "arz", "eng")
print(output_text)

It's the end of the month. When does one get paid?


In [None]:
# Speech recognition: transcribe the sample in its own language ("arz").
output_text = run_asr("/content/voices/r1.wav", "arz")
print(output_text)

اشتريت منكم كتاب أول امبارح لكن لحد دلوقتي موصلنيش.
