In [1]:
import torch
from transformers import SeamlessM4TProcessor, SeamlessM4TModel
from datasets import load_dataset



In [2]:
# Run on the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda', index=0)

In [3]:
# Checkpoint identifier passed to from_pretrained() for both the processor and the model.
version = "facebook/hf-seamless-m4t-medium"
# Language codes: src_lang is given to the processor, tgt_lang to model.generate().
src_lang = "eng"
tgt_lang = "cmn"
# English sentence used as the text input, and a reference translation to compare
# against the model's text output.
example_english_phrase = "UN Chief Says There Is No Military Solution in Syria"
expected_translation_cmn = "联合国秘书长表示叙利亚问题没有军事解决方案"

# dataset

https://github.com/facebookresearch/ImageBind#usage

On Windows, you may need to install `librosa` and `soundfile` to read and write audio files. (Thanks @congyue1977)

`pip install soundfile librosa`

In [4]:
# Load the validation split of the dummy LibriSpeech set and order its rows by the "id" column.
dataset = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy",
    "clean",
    split="validation",
).sort("id")
dataset

Dataset({
    features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
    num_rows: 73
})

In [5]:
# Read the sampling rate declared by the dataset's "audio" feature rather than assuming one.
sampling_rate = dataset.features["audio"].sampling_rate
sampling_rate

16000

In [6]:
# Display the first example to see which fields each row carries.
dataset[0]

{'file': 'C:/Users/Administrator/.cache/huggingface/datasets/downloads/extracted/b49df5cb4e26d70a35c542fbe0eadc8bfee0f971809886d2131859668faeba1c/dev_clean/1272/128104\\1272-128104-0000.flac',
 'audio': {'path': 'C:/Users/Administrator/.cache/huggingface/datasets/downloads/extracted/b49df5cb4e26d70a35c542fbe0eadc8bfee0f971809886d2131859668faeba1c/dev_clean/1272/128104\\1272-128104-0000.flac',
  'array': array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00042725, 0.00057983,
         0.0010376 ]),
  'sampling_rate': 16000},
 'text': 'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL',
 'speaker_id': 1272,
 'chapter_id': 128104,
 'id': '1272-128104-0000'}

In [7]:
# Collect the raw waveform arrays of the first two examples as a batch of audio inputs.
first_two_audio = dataset[:2]["audio"]
input_array = [sample["array"] for sample in first_two_audio]

In [8]:
# Collect the transcripts of the same first two examples, for later comparison.
input_text = list(dataset[:2]["text"])

# SeamlessM4TProcessor

In [9]:
# Load the processor that matches the checkpoint; it is later called with either
# `audios=` or `text=` to build model inputs.
# NOTE(review): src_lang is forwarded to from_pretrained(); presumably it sets the
# tokenizer's default source language — confirm against the transformers docs.
processor = SeamlessM4TProcessor.from_pretrained(version, src_lang=src_lang)
processor

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


SeamlessM4TProcessor:
- feature_extractor: SeamlessM4TFeatureExtractor {
  "feature_extractor_type": "SeamlessM4TFeatureExtractor",
  "feature_size": 80,
  "language_code": [
    "__ace__",
    "__ace_Latn__",
    "__acm__",
    "__acq__",
    "__aeb__",
    "__afr__",
    "__ajp__",
    "__aka__",
    "__amh__",
    "__apc__",
    "__arb__",
    "__ars__",
    "__ary__",
    "__arz__",
    "__asm__",
    "__ast__",
    "__awa__",
    "__ayr__",
    "__azb__",
    "__azj__",
    "__bak__",
    "__bam__",
    "__ban__",
    "__bel__",
    "__bem__",
    "__ben__",
    "__bho__",
    "__bjn__",
    "__bjn_Latn__",
    "__bod__",
    "__bos__",
    "__bug__",
    "__bul__",
    "__cat__",
    "__ceb__",
    "__ces__",
    "__cjk__",
    "__ckb__",
    "__crh__",
    "__cym__",
    "__dan__",
    "__deu__",
    "__dik__",
    "__dyu__",
    "__dzo__",
    "__ell__",
    "__eng__",
    "__epo__",
    "__est__",
    "__eus__",
    "__ewe__",
    "__fao__",
    "__pes__",
    "__fij__",
    "

## processor

In [10]:
# Turn the raw waveforms into model inputs. The sampling rate is taken from the
# `sampling_rate` variable read from the dataset instead of a hard-coded 16000,
# so it cannot drift from the audio that is actually loaded.
audio_inputs = processor(
    audios=input_array,
    sampling_rate=sampling_rate,
    return_tensors="pt",
).to(device, torch.float16)  # same device and half-precision dtype the model is loaded with

# Show which tensors the processor produced and their shapes.
print(audio_inputs.keys())
print(audio_inputs["input_features"].shape)
print(audio_inputs["attention_mask"].shape)

dict_keys(['input_features', 'attention_mask'])
torch.Size([2, 292, 160])
torch.Size([2, 292])


In [11]:
# Tokenize the English sentence. Only the device is passed to .to(): the tensors
# produced here (token ids and attention mask) are integers and must stay integers,
# and a tokenizer-style batch's .to() accepts a device only, so also passing a
# dtype can raise a TypeError.
text_inputs = processor(
    text=example_english_phrase,
    padding=True,                # padding strategy: one of True, 'longest', 'max_length', 'do_not_pad'
    return_attention_mask=True,  # also return the attention mask
    return_tensors="pt",
).to(device)

# Show which tensors the processor produced and their contents.
print(text_inputs.keys())
print(text_inputs["input_ids"])
print(text_inputs["attention_mask"])

dict_keys(['input_ids', 'attention_mask'])
tensor([[256047,  16297, 134408,   8165, 248066,  14734,    950,   1135, 105721,
           3573,     83,  27352,    108,  49486,      3,      0]],
       device='cuda:0')
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]], device='cuda:0')


# SeamlessM4TModel

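The original SeamlessM4T transformer model, which can be used for every available task: speech-to-speech translation (S2ST), speech-to-text translation (S2TT), text-to-text translation (T2TT), and text-to-speech translation (T2ST).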

In [12]:
# Load the checkpoint weights in half precision, then move the module to the chosen device.
# NOTE(review): float16 is requested regardless of `device`; if the CPU fallback is
# ever taken, half precision may be slow or unsupported — confirm before relying on it.
model: SeamlessM4TModel = SeamlessM4TModel.from_pretrained(
    version,
    torch_dtype=torch.float16,
)
model = model.to(device)
# Switch to inference mode; eval() returns the module, so the cell displays its architecture.
model.eval()

SeamlessM4TModel(
  (shared): Embedding(256206, 1024, padding_idx=0)
  (text_encoder): SeamlessM4TEncoder(
    (embed_tokens): Embedding(256206, 1024, padding_idx=0)
    (embed_positions): SeamlessM4TSinusoidalPositionalEmbedding()
    (layers): ModuleList(
      (0-11): 12 x SeamlessM4TEncoderLayer(
        (self_attn): SeamlessM4TAttention(
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (ffn): SeamlessM4TFeedForwardNetwork(
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (d

## text2speech

In [13]:
# Text input -> speech output. Gradients are not needed, so run under inference_mode.
with torch.inference_mode():
    # Specify the target language.
    outputs = model.generate(**text_inputs, tgt_lang=tgt_lang)
outputs

(tensor([[ 1.9245e-04,  1.1210e-04,  3.2409e-05,  ..., -4.2189e-05,
          -6.8308e-05, -3.4201e-05]], device='cuda:0'),
 tensor(68800, device='cuda:0'))

In [14]:
# First element of the generate() result: the generated waveform tensor.
waveform = outputs[0]
waveform.shape

torch.Size([1, 68800])

In [15]:
# Second element of the generate() result: the length of the generated waveform.
waveform_lengths = outputs[1]
waveform_lengths

tensor(68800, device='cuda:0')

## speech2speech

In [16]:
# Speech input (the two-example audio batch) -> speech output.
with torch.inference_mode():
    # Specify the target language.
    outputs = model.generate(**audio_inputs, tgt_lang=tgt_lang)
outputs

(tensor([[-4.4274e-04, -5.2015e-04, -5.3410e-04,  ..., -4.1043e-05,
          -6.6654e-05, -5.0480e-05],
         [ 1.7635e-04,  2.2193e-04,  2.5847e-04,  ..., -7.6679e-03,
          -6.2004e-03, -6.5573e-03]], device='cuda:0'),
 tensor([90240, 77440], device='cuda:0'))

In [17]:
# First element of the generate() result: one generated waveform row per input example.
waveform = outputs[0]
waveform.shape

torch.Size([2, 90240])

In [18]:
# Second element of the generate() result: one length per generated waveform.
waveform_lengths = outputs[1]
waveform_lengths

tensor([90240, 77440], device='cuda:0')

## text2text

In [19]:
# Text input -> text output: generate_speech=False makes generate() return token ids
# instead of a waveform.
with torch.inference_mode():
    # Specify the target language.
    outputs = model.generate(**text_inputs, tgt_lang=tgt_lang, generate_speech=False)
outputs

GreedySearchEncoderDecoderOutput(sequences=tensor([[     3, 256200, 248059, 250844, 251583, 250102,  24042, 249057, 253641,
          87933,  23938, 231527, 120580,  61805,      3]], device='cuda:0'), scores=(tensor([[2.0243, 3.8037, 2.0156,  ..., 2.0702, 1.8899, 1.7301]],
       device='cuda:0'), tensor([[ 2.8007, 11.3387,  2.7928,  ...,  2.6987,  2.2983,  1.2382]],
       device='cuda:0'), tensor([[ 2.4083, 10.7458,  2.3955,  ...,  1.9864,  1.7501,  1.4787]],
       device='cuda:0'), tensor([[2.9782, 9.8788, 2.9707,  ..., 3.1583, 3.0495, 2.3948]],
       device='cuda:0'), tensor([[3.3733, 9.4048, 3.3634,  ..., 3.3496, 2.9956, 2.5184]],
       device='cuda:0'), tensor([[2.5210, 8.9454, 2.5155,  ..., 2.5643, 2.3843, 2.5242]],
       device='cuda:0'), tensor([[2.3521, 5.9110, 2.3473,  ..., 2.2244, 2.1500, 2.2896]],
       device='cuda:0'), tensor([[2.0188, 7.5920, 2.0157,  ..., 2.4107, 2.3485, 3.0948]],
       device='cuda:0'), tensor([[2.0560, 8.5258, 2.0454,  ..., 1.8322, 1.7285, 1.59

In [20]:
# Index 0 of the generation output is its `sequences` field: the generated token ids.
token_ids = outputs[0]
token_ids

tensor([[     3, 256200, 248059, 250844, 251583, 250102,  24042, 249057, 253641,
          87933,  23938, 231527, 120580,  61805,      3]], device='cuda:0')

In [21]:
# Decode the generated token ids back into text, dropping special tokens.
processor.batch_decode(token_ids, skip_special_tokens=True)

['首席说联合国在叙利亚没有军事解决方案']

In [22]:
# Show the reference translation for comparison with the decoded model output.
expected_translation_cmn

'联合国秘书长表示叙利亚问题没有军事解决方案'

## speech2text

In [23]:
# Speech input (the two-example audio batch) -> text output: generate_speech=False
# makes generate() return token ids instead of a waveform.
with torch.inference_mode():
    # Specify the target language.
    outputs = model.generate(**audio_inputs, tgt_lang=tgt_lang, generate_speech=False)
outputs

GreedySearchEncoderDecoderOutput(sequences=tensor([[     3, 256200, 248059, 255967, 251021, 249743,  72579, 249221, 249054,
         251447, 253622, 251916, 248506,  71895, 248079,  14994, 251563, 249714,
         253368,  89642,  19763,  82017, 248075,      3],
        [     3, 256200,  49046,  36475,  72579, 255967, 251021, 249743, 248506,
          48081, 250323,  19763, 249173, 250158, 249113, 252579,      3,      0,
              0,      0,      0,      0,      0,      0]], device='cuda:0'), scores=(tensor([[2.1133, 5.8568, 2.1060,  ..., 1.8466, 1.7780, 1.9377],
        [1.6804, 4.2907, 1.6828,  ..., 1.8228, 1.6466, 1.5474]],
       device='cuda:0'), tensor([[ 2.2555, 11.2914,  2.2486,  ...,  2.4986,  2.2985,  2.4363],
        [ 1.5327,  5.7040,  1.5291,  ...,  1.1883,  1.0787,  1.2731]],
       device='cuda:0'), tensor([[ 2.1127,  7.5353,  2.1156,  ...,  2.2168,  2.3183,  2.4251],
        [ 2.2816, 10.7246,  2.2739,  ...,  2.1025,  2.0131,  1.9780]],
       device='cuda:0'), tens

In [24]:
# Index 0 of the generation output is its `sequences` field: one row of token ids per input example.
token_ids = outputs[0]
token_ids

tensor([[     3, 256200, 248059, 255967, 251021, 249743,  72579, 249221, 249054,
         251447, 253622, 251916, 248506,  71895, 248079,  14994, 251563, 249714,
         253368,  89642,  19763,  82017, 248075,      3],
        [     3, 256200,  49046,  36475,  72579, 255967, 251021, 249743, 248506,
          48081, 250323,  19763, 249173, 250158, 249113, 252579,      3,      0,
              0,      0,      0,      0,      0,      0]], device='cuda:0')

In [25]:
# Decode each row of generated token ids back into text, dropping special tokens.
processor.batch_decode(token_ids, skip_special_tokens=True)

['奎尔特先生是中产阶级的使徒,我们很高兴欢迎他的福音.', '也不是先生奎尔特的方式比他的事更有趣']

In [26]:
# Show the original English transcripts for comparison with the decoded model output.
input_text
# Reference Chinese translations of the two transcripts in input_text:
# 奎尔特先生是中产阶级的使徒，我们很高兴欢迎他的福音
# 奎尔特先生的态度也不比他的事情更有趣

['MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL',
 "NOR IS MISTER QUILTER'S MANNER LESS INTERESTING THAN HIS MATTER"]