In [18]:
import torch
from transformers import Data2VecAudioForCTC, AutoProcessor, AutoFeatureExtractor
from datasets import load_dataset
from PIL import Image
import requests

In [19]:
# Pick the compute device: the first CUDA GPU when PyTorch reports one is
# available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [20]:
# Checkpoint identifier handed to `from_pretrained` for both the processor and
# the model later in this notebook.
version = "facebook/data2vec-audio-base-960h"

# dataset

https://github.com/facebookresearch/ImageBind#usage

For Windows users: you might need to install `librosa` and `soundfile` to read and write audio files. (Thanks @congyue1977)

`pip install soundfile librosa`

In [21]:
# Load the validation split of the "clean" configuration of the LibriSpeech
# demo dataset and sort its rows by the "id" column in one chained expression.
dataset = load_dataset(
    "hf-internal-testing/librispeech_asr_demo",
    "clean",
    split="validation",
).sort("id")
dataset

Dataset({
    features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
    num_rows: 73
})

In [22]:
# Read the sampling rate declared on the dataset's "audio" feature; it is
# passed to the processor later so it knows the rate of the raw samples.
audio_feature = dataset.features["audio"]
sampling_rate = audio_feature.sampling_rate
sampling_rate

16000

In [23]:
# Inspect the first example: indexing with an integer returns one row with all
# of its columns (file, audio, text, speaker_id, chapter_id, id).
dataset[0]

{'file': 'C:/Users/Administrator/.cache/huggingface/datasets/downloads/extracted/b49df5cb4e26d70a35c542fbe0eadc8bfee0f971809886d2131859668faeba1c/dev_clean/1272/128104\\1272-128104-0000.flac',
 'audio': {'path': 'C:/Users/Administrator/.cache/huggingface/datasets/downloads/extracted/b49df5cb4e26d70a35c542fbe0eadc8bfee0f971809886d2131859668faeba1c/dev_clean/1272/128104\\1272-128104-0000.flac',
  'array': array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00042725, 0.00057983,
         0.0010376 ]),
  'sampling_rate': 16000},
 'text': 'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL',
 'speaker_id': 1272,
 'chapter_id': 128104,
 'id': '1272-128104-0000'}

In [24]:
# Fetch the waveforms of several rows at once: take the "audio" column of the
# first two rows, then pull the raw sample array out of each entry.
first_two_audio = dataset[:2]["audio"]
[sample["array"] for sample in first_two_audio]

[array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00042725, 0.00057983,
        0.0010376 ]),
 array([-1.52587891e-04, -9.15527344e-05, -1.83105469e-04, ...,
         9.76562500e-04,  9.46044922e-04, -4.88281250e-04])]

In [25]:
# Fetch the transcripts of several rows at once: the "text" column of the
# first two rows, materialised as a list for display.
list(dataset[:2]["text"])

['MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL',
 "NOR IS MISTER QUILTER'S MANNER LESS INTERESTING THAN HIS MATTER"]

# AutoProcessor

In [26]:
# Load the preprocessing pipeline published with the checkpoint named by
# `version`. The repr printed below shows the concrete processor class that
# AutoProcessor resolved to, with its feature extractor and tokenizer.
processor: AutoProcessor = AutoProcessor.from_pretrained(version)
processor

Wav2Vec2Processor:
- feature_extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "Wav2Vec2Processor",
  "return_attention_mask": true,
  "sampling_rate": 16000
}

- tokenizer: Wav2Vec2CTCTokenizer(name_or_path='facebook/data2vec-audio-base-960h', vocab_size=32, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True)

## processor

In [27]:
# Turn the first example's raw waveform into model inputs, step by step:
# 1) take the sample array of row 0,
# 2) run it through the processor, asking for PyTorch tensors,
# 3) move the result to the selected device, requesting float16 (the same
#    dtype the model is loaded with further down).
waveform = dataset[0]["audio"]["array"]
inputs = processor(waveform, sampling_rate=sampling_rate, return_tensors="pt")
inputs = inputs.to(device, torch.float16)
inputs

{'input_values': tensor([[0.0386, 0.0337, 0.0322,  ..., 0.0070, 0.0095, 0.0169]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0', dtype=torch.int32)}

In [28]:
# Shape of the processed waveform tensor. A single clip was passed to the
# processor, so the leading dimension is 1 (presumably the batch axis).
inputs["input_values"].shape

torch.Size([1, 93680])

In [29]:
# Decode a single token id to see which vocabulary entry it maps to.
processor.decode(5)

'E'

In [30]:
# Decode ids 0-39 individually to list the vocabulary. The tokenizer reports
# vocab_size=32, so the ids past the end of the vocabulary come back as
# '<unk>' in the output below.
processor.batch_decode(range(40))

['<pad>',
 '<s>',
 '</s>',
 '<unk>',
 '',
 'E',
 'T',
 'A',
 'O',
 'N',
 'I',
 'H',
 'S',
 'R',
 'D',
 'L',
 'U',
 'M',
 'W',
 'C',
 'F',
 'G',
 'Y',
 'P',
 'B',
 'V',
 'K',
 "'",
 'X',
 'J',
 'Q',
 'Z',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>']

# Data2VecAudioForCTC: speech-to-text (语音转文字)

Data2VecAudio Model with a language modeling head on top for Connectionist Temporal Classification (CTC).

In [31]:
# Load the CTC checkpoint with float16 weights, then move the module to the
# selected device. The bare `model` at the end prints the module tree.
model: Data2VecAudioForCTC = Data2VecAudioForCTC.from_pretrained(
    version,
    torch_dtype=torch.float16,
)
model = model.to(device)
model

Data2VecAudioForCTC(
  (data2vec_audio): Data2VecAudioModel(
    (feature_extractor): Data2VecAudioFeatureEncoder(
      (conv_layers): ModuleList(
        (0): Data2VecAudioConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Data2VecAudioConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Data2VecAudioConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Data2VecAudioFeatureProjection(
      (layer_norm): LayerNorm((512,),

In [32]:
# Run one forward pass over the prepared inputs.
# Put the module in evaluation mode before inference.
model.eval()
# inference_mode() turns off autograd tracking for everything inside the block.
with torch.inference_mode():
    # `inputs` is unpacked as keyword arguments (input_values, attention_mask).
    outputs = model(**inputs)
outputs

CausalLMOutput(loss=None, logits=tensor([[[ 16.9211, -40.4852, -40.2714,  ...,  -8.6162, -11.0335,  -8.5654],
         [ 16.9211, -40.4851, -40.2714,  ...,  -8.6162, -11.0334,  -8.5654],
         [ 16.9211, -40.4851, -40.2713,  ...,  -8.6162, -11.0334,  -8.5654],
         ...,
         [ -2.1565, -25.2310, -25.1217,  ...,  -6.0423,  -5.5824,  -6.2172],
         [ -2.1373, -25.2431, -25.1347,  ...,  -6.0379,  -5.5736,  -6.2268],
         [ -2.0901, -25.2736, -25.1665,  ...,  -6.0373,  -5.5776,  -6.2142]]],
       device='cuda:0'), hidden_states=None, attentions=None)

In [33]:
# Show the config's id2label mapping.
# NOTE(review): the output is only the generic LABEL_0/LABEL_1 placeholder, not
# the character vocabulary used for decoding; the id-to-character mapping is the
# one listed by processor.batch_decode(range(40)) earlier in this notebook.
model.config.id2label

{0: 'LABEL_0', 1: 'LABEL_1'}

In [34]:
# Keep just the logits tensor from the model output; it is what gets decoded
# into token ids in the next step.
logits = outputs.logits
logits

tensor([[[ 16.9211, -40.4852, -40.2714,  ...,  -8.6162, -11.0335,  -8.5654],
         [ 16.9211, -40.4851, -40.2714,  ...,  -8.6162, -11.0334,  -8.5654],
         [ 16.9211, -40.4851, -40.2713,  ...,  -8.6162, -11.0334,  -8.5654],
         ...,
         [ -2.1565, -25.2310, -25.1217,  ...,  -6.0423,  -5.5824,  -6.2172],
         [ -2.1373, -25.2431, -25.1347,  ...,  -6.0379,  -5.5736,  -6.2268],
         [ -2.0901, -25.2736, -25.1665,  ...,  -6.0373,  -5.5776,  -6.2142]]],
       device='cuda:0')

In [35]:
# Greedy decoding step: at every position keep the index of the largest score
# along the last axis of the logits.
predicted_ids = logits.argmax(dim=-1)
predicted_ids

tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 17,  0, 10,  0, 12,  0,  6,  0,
          0,  5, 13,  0,  4,  4,  4, 30, 16,  0,  0, 10,  0, 15, 15,  0,  0,  6,
          0,  0,  5,  0, 13, 13,  0,  0,  4,  4,  0, 10,  0, 12, 12,  0,  4,  4,
          6, 11,  0,  5,  4,  4,  4,  0,  7,  0,  0, 23, 23,  0,  0,  0,  0,  8,
          0, 12,  0,  0,  6,  0,  0,  0, 15, 15,  5,  5,  0,  0,  0,  0,  4,  4,
          0,  8, 20,  0,  4,  4,  4,  6, 11,  5,  4,  4, 17,  0, 10, 14,  0,  0,
         14,  0, 15,  5,  0,  0,  4,  4, 19, 15, 15,  0,  0,  0,  0,  7,  0, 12,
          0,  0, 12,  0,  0,  0,  5,  0,  0,  0, 12,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  4,  4,  7,  9, 14, 14,  4,  4,  4,  4, 18,  0,  5,  5,  0,  4,
          0,  7,  0, 13,  5,  4,  4, 21,  0, 15,  0,  0,  0,  0,  7,  7,  0,  0,
         14, 14,  0,  0,  4,  4,  4,  6,  0,  8,  0,  4,  4,  0, 18,  0,  0,  5,
         15, 15,  0,  0, 19,

In [36]:
# Turn the predicted id sequence back into text. The ids above contain runs of
# repeated values and many 0s (id 0 decodes to '<pad>'); the decoded output
# below is a clean transcript with one string per sequence in the batch.
processor.batch_decode(predicted_ids)

['MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL']

In [37]:
# Reference transcript of the same example, for comparison with the decoded
# prediction above.
dataset[0]["text"]

'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'