In [1]:
import torch
from transformers import Wav2Vec2ForCTC, AutoProcessor
from datasets import load_dataset

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [3]:
version = "facebook/mms-1b-all"
target_lang = "eng"

# dataset

https://github.com/facebookresearch/ImageBind#usage

For windows users, you might need to install librosa and soundfile for reading/writing audio files. (Thanks @congyue1977)

`pip install soundfile librosa`

In [4]:
dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
dataset = dataset.sort("id")
dataset

Dataset({
    features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
    num_rows: 73
})

In [5]:
sampling_rate = dataset.features["audio"].sampling_rate
sampling_rate

16000

In [6]:
dataset[0]

{'file': 'C:/Users/Administrator/.cache/huggingface/datasets/downloads/extracted/b49df5cb4e26d70a35c542fbe0eadc8bfee0f971809886d2131859668faeba1c/dev_clean/1272/128104\\1272-128104-0000.flac',
 'audio': {'path': 'C:/Users/Administrator/.cache/huggingface/datasets/downloads/extracted/b49df5cb4e26d70a35c542fbe0eadc8bfee0f971809886d2131859668faeba1c/dev_clean/1272/128104\\1272-128104-0000.flac',
  'array': array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00042725, 0.00057983,
         0.0010376 ]),
  'sampling_rate': 16000},
 'text': 'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL',
 'speaker_id': 1272,
 'chapter_id': 128104,
 'id': '1272-128104-0000'}

In [7]:
# get multi array
[d["array"] for d in dataset[:2]["audio"]]

[array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00042725, 0.00057983,
        0.0010376 ]),
 array([-1.52587891e-04, -9.15527344e-05, -1.83105469e-04, ...,
         9.76562500e-04,  9.46044922e-04, -4.88281250e-04])]

In [8]:
# get multi text
[d for d in dataset[:2]["text"]]

['MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL',
 "NOR IS MISTER QUILTER'S MANNER LESS INTERESTING THAN HIS MATTER"]

# AutoProcessor

In [12]:
processor: AutoProcessor = AutoProcessor.from_pretrained(version)
processor

Wav2Vec2Processor:
- feature_extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0,
  "processor_class": "Wav2Vec2Processor",
  "return_attention_mask": true,
  "sampling_rate": 16000
}

- tokenizer: Wav2Vec2CTCTokenizer(name_or_path='facebook/mms-1b-all', vocab_size=154, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True)

## processor

In [13]:
inputs = processor(
    [d["array"] for d in dataset[:2]["audio"]],
    sampling_rate=sampling_rate,
    padding=True,
    return_tensors="pt"
).to(device, torch.float16)
inputs

{'input_values': tensor([[ 0.0386,  0.0337,  0.0322,  ...,  0.0070,  0.0095,  0.0169],
        [-0.0015, -0.0008, -0.0019,  ...,  0.0000,  0.0000,  0.0000]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0', dtype=torch.int32)}

In [14]:
inputs["input_values"].shape

torch.Size([2, 93680])

In [15]:
processor.decode(5)

'e'

In [16]:
processor.batch_decode(range(40))

['<pad>',
 '<s>',
 '</s>',
 '<unk>',
 '',
 'e',
 't',
 'a',
 'o',
 'i',
 'n',
 's',
 'r',
 'h',
 'l',
 'd',
 'c',
 'u',
 'm',
 'f',
 'p',
 'g',
 'y',
 'w',
 'b',
 'v',
 'k',
 '0',
 'x',
 'j',
 '1',
 "'",
 '-',
 'z',
 'q',
 '2',
 '9',
 '.',
 '5',
 '4']

## suppror language

In [17]:
processor.tokenizer.vocab.keys()

dict_keys(['abi', 'abk', 'abp', 'aca', 'acd', 'ace', 'acf', 'ach', 'acn', 'acr', 'acu', 'ade', 'adh', 'adj', 'adx', 'aeu', 'afr', 'agd', 'agg', 'agn', 'agr', 'agu', 'agx', 'aha', 'ahk', 'aia', 'aka', 'akb', 'ake', 'akp', 'alj', 'alp', 'alt', 'alz', 'ame', 'amf', 'amh', 'ami', 'amk', 'ann', 'any', 'aoz', 'apb', 'apr', 'ara', 'arl', 'asa', 'asg', 'asm', 'ast', 'ata', 'atb', 'atg', 'ati', 'atq', 'ava', 'avn', 'avu', 'awa', 'awb', 'ayo', 'ayr', 'ayz', 'azb', 'azg', 'azj-script_cyrillic', 'azj-script_latin', 'azz', 'bak', 'bam', 'ban', 'bao', 'bas', 'bav', 'bba', 'bbb', 'bbc', 'bbo', 'bcc-script_arabic', 'bcc-script_latin', 'bcl', 'bcw', 'bdg', 'bdh', 'bdq', 'bdu', 'bdv', 'beh', 'bel', 'bem', 'ben', 'bep', 'bex', 'bfa', 'bfo', 'bfy', 'bfz', 'bgc', 'bgq', 'bgr', 'bgt', 'bgw', 'bha', 'bht', 'bhz', 'bib', 'bim', 'bis', 'biv', 'bjr', 'bjv', 'bjw', 'bjz', 'bkd', 'bkv', 'blh', 'blt', 'blx', 'blz', 'bmq', 'bmr', 'bmu', 'bmv', 'bng', 'bno', 'bnp', 'boa', 'bod', 'boj', 'bom', 'bor', 'bos', 'bov', 'b

# Wav2Vec2ForCTC


In [18]:
model: Wav2Vec2ForCTC = Wav2Vec2ForCTC.from_pretrained(version, torch_dtype=torch.float16, target_lang=target_lang, ignore_mismatched_sizes=True).to(device)
model

Downloading model.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projec

In [19]:
model.eval()
with torch.inference_mode():
    outputs = model(**inputs)
outputs

CausalLMOutput(loss=None, logits=tensor([[[ 10.0347, -18.0810, -20.7676,  ..., -17.9896, -18.2610, -20.0932],
         [  9.6300, -18.6121, -21.5910,  ..., -18.5579, -18.5911, -20.5323],
         [  9.8797, -19.1469, -21.6907,  ..., -18.8769, -18.7257, -20.6271],
         ...,
         [  9.8885, -17.8844, -20.5762,  ..., -17.6687, -18.0048, -19.4892],
         [  9.8782, -17.9860, -19.9908,  ..., -16.9794, -17.8562, -19.0027],
         [  4.1092, -15.5776, -15.4139,  ..., -14.8518, -14.8594, -16.3123]],

        [[ 10.4863, -17.4048, -20.1359,  ..., -17.4654, -17.6441, -19.5235],
         [ 10.2937, -17.8056, -21.0499,  ..., -18.1527, -17.7918, -19.8415],
         [ 10.3531, -17.8547, -21.0130,  ..., -18.1862, -17.8032, -19.8252],
         ...,
         [  8.3843, -16.2233, -16.4010,  ..., -15.4796, -15.4063, -17.1707],
         [  8.4696, -16.2308, -16.4429,  ..., -15.4954, -15.4338, -17.1817],
         [  8.5553, -16.2380, -16.4870,  ..., -15.5120, -15.4621, -17.1953]]],
       devi

In [20]:
outputs.logits.shape

torch.Size([2, 292, 154])

In [None]:
ids = torch.argmax(outputs.logits, dim=-1)

In [None]:
processor.batch_decode(ids, skip_special_tokens=True)

In [24]:
ids = torch.argmax(outputs.logits, dim=-1)
print(ids)
ids.shape

tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 18,  0,  0,  0,  0,  0,  0,
          0,  0, 12,  0,  4,  4,  0, 34,  0, 17,  0,  9,  0,  0,  0, 14,  0,  0,
          6,  0,  5,  0,  0, 12,  0,  0,  0,  0,  4,  4,  9,  0, 11,  4,  4,  4,
          4,  6, 13,  5,  4,  4,  4,  0,  7,  0,  0,  0, 20,  0,  0,  0,  8,  0,
          0,  0, 11,  0,  0,  6,  0,  0,  0, 14,  0,  5,  0,  0,  0,  0,  0,  4,
          4,  8, 19,  4,  4,  4,  4,  6, 13,  5,  4,  4, 18,  0,  9, 15,  0,  0,
         15,  0, 14,  5,  0,  0,  4,  0, 16,  0, 14,  0,  0,  7,  0,  0,  0, 11,
          0,  0,  0, 11,  0,  0,  0,  5,  0,  0,  0,  0,  0, 11,  0,  0,  0,  0,
          0,  0,  0,  4,  0,  7,  0, 10, 15,  0,  4,  4,  4,  0, 23,  5,  0,  4,
          4,  7, 12,  0,  5,  4,  4,  4,  4, 21,  0, 14,  0,  0,  7,  0,  0,  0,
          0, 15,  0,  0,  0,  0,  4,  4,  6,  8,  4,  4,  4,  4,  4, 23,  0,  5,
          0,  0, 14,  0,  0,

torch.Size([2, 292])

In [25]:
processor.batch_decode(ids, skip_special_tokens=True)

['mr quilter is the apostle of the midle clases and we are glad to welcome his gospel',
 "nor is mr quilter's maner les interesting than his mater"]

In [26]:
[d for d in dataset[:2]["text"]]

['MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL',
 "NOR IS MISTER QUILTER'S MANNER LESS INTERESTING THAN HIS MATTER"]