In [1]:
import torch
from transformers import Data2VecAudioModel, AutoProcessor, AutoFeatureExtractor
from datasets import load_dataset
from PIL import Image
import requests

In [2]:
# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda', index=0)

In [3]:
# Checkpoint identifier passed to every `from_pretrained` call in this notebook
# (processor, feature extractor and model).
version = "facebook/data2vec-audio-base-960h"

# dataset

https://github.com/facebookresearch/ImageBind#usage

For Windows users: you may need to install `librosa` and `soundfile` to read and write audio files. (Thanks @congyue1977)

`pip install soundfile librosa`

In [None]:
# Load the "clean" validation split of the LibriSpeech demo set and order its
# rows by the "id" column so that indexing below is deterministic.
dataset = load_dataset(
    "hf-internal-testing/librispeech_asr_demo",
    "clean",
    split="validation",
).sort("id")
dataset

Dataset({
    features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
    num_rows: 73
})

In [None]:
# Sampling rate declared by the dataset's "audio" feature; it is passed to the
# processor / feature extractor calls further down.
sampling_rate = dataset.features["audio"].sampling_rate
sampling_rate

16000

In [None]:
# Inspect the first example: file path, decoded audio, transcript and ids.
dataset[0]

{'file': 'C:/Users/Administrator/.cache/huggingface/datasets/downloads/extracted/b49df5cb4e26d70a35c542fbe0eadc8bfee0f971809886d2131859668faeba1c/dev_clean/1272/128104\\1272-128104-0000.flac',
 'audio': {'path': 'C:/Users/Administrator/.cache/huggingface/datasets/downloads/extracted/b49df5cb4e26d70a35c542fbe0eadc8bfee0f971809886d2131859668faeba1c/dev_clean/1272/128104\\1272-128104-0000.flac',
  'array': array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00042725, 0.00057983,
         0.0010376 ]),
  'sampling_rate': 16000},
 'text': 'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL',
 'speaker_id': 1272,
 'chapter_id': 128104,
 'id': '1272-128104-0000'}

In [None]:
# Collect the raw waveform arrays of the first two examples.
[audio["array"] for audio in dataset[:2]["audio"]]

[array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00042725, 0.00057983,
        0.0010376 ]),
 array([-1.52587891e-04, -9.15527344e-05, -1.83105469e-04, ...,
         9.76562500e-04,  9.46044922e-04, -4.88281250e-04])]

In [None]:
# Collect the transcripts of the first two examples as a plain list.
list(dataset[:2]["text"])

['MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL',
 "NOR IS MISTER QUILTER'S MANNER LESS INTERESTING THAN HIS MATTER"]

# AutoProcessor

In [6]:
# Load the processor registered for this checkpoint. Per the repr shown below,
# it resolves to a Wav2Vec2Processor bundling a feature extractor and a CTC tokenizer.
processor: AutoProcessor = AutoProcessor.from_pretrained(version)
processor

Wav2Vec2Processor:
- feature_extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "Wav2Vec2Processor",
  "return_attention_mask": true,
  "sampling_rate": 16000
}

- tokenizer: Wav2Vec2CTCTokenizer(name_or_path='facebook/data2vec-audio-base-960h', vocab_size=32, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True)

## processor

In [7]:
# Turn the first waveform into a PyTorch batch, then move it to `device` and
# request float16 so it matches the half-precision model loaded later.
# NOTE(review): the recorded output below shows no float16 dtype, so it may
# predate the cast — re-run to confirm.
inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt").to(device, torch.float16)
inputs

{'input_values': tensor([[0.0386, 0.0337, 0.0322,  ..., 0.0070, 0.0095, 0.0169]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0', dtype=torch.int32)}

In [8]:
# Shape of the processed waveform batch (recorded output: [1, 93680]).
inputs["input_values"].shape

torch.Size([1, 93680])

In [28]:
# Decode a single token id to text (recorded output: id 5 -> 'E').
processor.decode(5)

'E'

In [27]:
# Decode ids 0..39 one at a time to list the vocabulary. In the recorded output,
# ids 32 and above come back as '<unk>', consistent with vocab_size=32.
processor.batch_decode(range(40))

['<pad>',
 '<s>',
 '</s>',
 '<unk>',
 '',
 'E',
 'T',
 'A',
 'O',
 'N',
 'I',
 'H',
 'S',
 'R',
 'D',
 'L',
 'U',
 'M',
 'W',
 'C',
 'F',
 'G',
 'Y',
 'P',
 'B',
 'V',
 'K',
 "'",
 'X',
 'J',
 'Q',
 'Z',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>']

# AutoFeatureExtractor (same audio preprocessing as AutoProcessor)

In [9]:
# Load only the audio feature extractor for the same checkpoint. Per the repr
# shown below, it resolves to a Wav2Vec2FeatureExtractor.
feature_extractor: AutoFeatureExtractor = AutoFeatureExtractor.from_pretrained(version)
feature_extractor

Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "Wav2Vec2Processor",
  "return_attention_mask": true,
  "sampling_rate": 16000
}

In [10]:
# Same preprocessing as the processor call, using the standalone feature
# extractor. Only the device is changed here; no dtype cast is requested.
features = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt").to(device)
features

{'input_values': tensor([[0.0386, 0.0337, 0.0322,  ..., 0.0070, 0.0095, 0.0169]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0', dtype=torch.int32)}

In [11]:
# Shape of the feature-extractor output (recorded output: [1, 93680]).
features["input_values"].shape

torch.Size([1, 93680])

In [12]:
# Check that the processor and the standalone feature extractor produce the same
# values. `inputs` was cast to float16 while `features` kept the extractor's
# default dtype, so cast `features` to the dtype of `inputs` first; otherwise type
# promotion compares float16-rounded values against unrounded ones and the
# exact equality fails.
torch.all(inputs["input_values"] == features["input_values"].to(inputs["input_values"].dtype))

tensor(True, device='cuda:0')

# Data2VecAudioModel(Encoder+Decoder)

The bare Data2VecAudio Model transformer outputting raw hidden-states without any specific head on top.

In [13]:
# Load the bare Data2VecAudio model (no task head) with float16 weights and
# move it to `device`.
model: Data2VecAudioModel = Data2VecAudioModel.from_pretrained(version, torch_dtype=torch.float16).to(device)
model

Data2VecAudioModel(
  (feature_extractor): Data2VecAudioFeatureEncoder(
    (conv_layers): ModuleList(
      (0): Data2VecAudioConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
      (1-4): 4 x Data2VecAudioConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
      (5-6): 2 x Data2VecAudioConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): Data2VecAudioFeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_

In [14]:
# Switch the model to evaluation mode before running it.
model.eval()
# Run the forward pass under inference mode so no autograd state is recorded.
with torch.inference_mode():
    # `inputs` is unpacked into keyword arguments (input_values, attention_mask).
    outputs = model(**inputs)
outputs

Wav2Vec2BaseModelOutput(last_hidden_state=tensor([[[-0.9334,  0.3089, -0.6611,  ..., -0.1616,  0.2516,  0.3117],
         [-0.9334,  0.3089, -0.6611,  ..., -0.1616,  0.2516,  0.3117],
         [-0.9334,  0.3089, -0.6611,  ..., -0.1616,  0.2516,  0.3117],
         ...,
         [-1.0996, -0.1713, -0.2078,  ..., -0.2466, -0.0577, -0.0303],
         [-1.1000, -0.1701, -0.2100,  ..., -0.2475, -0.0610, -0.0314],
         [-1.1015, -0.1651, -0.2113,  ..., -0.2483, -0.0623, -0.0311]]],
       device='cuda:0'), extract_features=tensor([[[-0.0135, -0.0045, -0.0422,  ...,  0.0849,  0.1696,  0.0117],
         [ 0.0067,  0.0077, -0.0392,  ...,  0.0547,  0.1741,  0.0390],
         [ 0.0057, -0.0101, -0.0360,  ...,  0.0109,  0.0764,  0.0809],
         ...,
         [-0.0278, -0.0017, -0.0413,  ...,  0.0036,  0.0574,  0.0296],
         [-0.0313,  0.0114, -0.0358,  ..., -0.0166,  0.0744,  0.0723],
         [ 0.0044,  0.0196, -0.0367,  ...,  0.0035,  0.0609,  0.0199]]],
       device='cuda:0'), hidden_

In [None]:
# Label mapping stored in the model config. The recorded output shows only
# generic LABEL_0 / LABEL_1 placeholders.
# NOTE(review): presumably the library defaults, since the bare model has no
# classification head — confirm.
model.config.id2label

{0: 'LABEL_0', 1: 'LABEL_1'}

In [15]:
# Show the shape and then the values of the final hidden states, one per line.
print(outputs.last_hidden_state.shape, outputs.last_hidden_state, sep="\n")

torch.Size([1, 292, 768])
tensor([[[-0.9334,  0.3089, -0.6611,  ..., -0.1616,  0.2516,  0.3117],
         [-0.9334,  0.3089, -0.6611,  ..., -0.1616,  0.2516,  0.3117],
         [-0.9334,  0.3089, -0.6611,  ..., -0.1616,  0.2516,  0.3117],
         ...,
         [-1.0996, -0.1713, -0.2078,  ..., -0.2466, -0.0577, -0.0303],
         [-1.1000, -0.1701, -0.2100,  ..., -0.2475, -0.0610, -0.0314],
         [-1.1015, -0.1651, -0.2113,  ..., -0.2483, -0.0623, -0.0311]]],
       device='cuda:0')


In [16]:
# Presumably the output of the (convolutional feature) encoder — TODO confirm.
# The recorded last dimension (512) matches the conv layers' channel count.
print(outputs.extract_features.shape)
print(outputs.extract_features)

torch.Size([1, 292, 512])
tensor([[[-0.0135, -0.0045, -0.0422,  ...,  0.0849,  0.1696,  0.0117],
         [ 0.0067,  0.0077, -0.0392,  ...,  0.0547,  0.1741,  0.0390],
         [ 0.0057, -0.0101, -0.0360,  ...,  0.0109,  0.0764,  0.0809],
         ...,
         [-0.0278, -0.0017, -0.0413,  ...,  0.0036,  0.0574,  0.0296],
         [-0.0313,  0.0114, -0.0358,  ..., -0.0166,  0.0744,  0.0723],
         [ 0.0044,  0.0196, -0.0367,  ...,  0.0035,  0.0609,  0.0199]]],
       device='cuda:0')


In [17]:
# NOTE(review): the forward call above does not pass output_hidden_states=True,
# so this is likely None and the cell displays nothing — confirm, and request
# hidden states in the forward call if per-layer outputs are wanted.
outputs.hidden_states