In [1]:
import torch

import flash
from flash.audio import SpeechRecognition, SpeechRecognitionData
from flash.core.data.utils import download_data


In [2]:
from datasets import load_dataset

dataset = load_dataset("timit_asr", data_dir="../data/timit")
dataset


Using custom data configuration default-data_dir=..%2Fdata%2Ftimit
Found cached dataset timit_asr (/home/nm_rostislav/.cache/huggingface/datasets/timit_asr/default-data_dir=..%2Fdata%2Ftimit/0.0.0/43f9448dd5db58e95ee48a277f466481b151f112ea53e27f8173784da9254fb2)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'text', 'phonetic_detail', 'word_detail', 'dialect_region', 'sentence_type', 'speaker_id', 'id'],
        num_rows: 4620
    })
    test: Dataset({
        features: ['file', 'audio', 'text', 'phonetic_detail', 'word_detail', 'dialect_region', 'sentence_type', 'speaker_id', 'id'],
        num_rows: 1680
    })
})

In [3]:
def preprocess_fn(file, text):
    return {"input": file, "target": text}


train_dataset = dataset["train"]
train_dataset_processed = train_dataset.map(
    preprocess_fn,
    input_columns=["file", "text"],
    remove_columns=[
        "file",
        "audio",
        "phonetic_detail",
        "phonetic_detail",
        "word_detail",
        "dialect_region",
        "sentence_type",
        "speaker_id",
        "id",
    ],
)


Loading cached processed dataset at /home/nm_rostislav/.cache/huggingface/datasets/timit_asr/default-data_dir=..%2Fdata%2Ftimit/0.0.0/43f9448dd5db58e95ee48a277f466481b151f112ea53e27f8173784da9254fb2/cache-6c6f006ba0646cc7.arrow


In [39]:
train_dataset_processed


Dataset({
    features: ['text', 'input', 'target'],
    num_rows: 4620
})

In [4]:
datamodule = SpeechRecognitionData.from_json(
    train_dataset=train_dataset_processed,
    batch_size=16,
)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
datamodule.train_dataset[0]


{'text': 'She had your dark suit in greasy wash water all year.',
 'input': array([ 3.0517578e-05, -3.0517578e-05,  6.1035156e-05, ...,
        -3.0517578e-05, -1.5258789e-04, -2.4414062e-04], dtype=float32),
 'target': 'She had your dark suit in greasy wash water all year.',
 'metadata': {'sampling_rate': 16000}}

In [6]:
# 2. Build the task
model = SpeechRecognition(backbone="facebook/wav2vec2-base-960h", learning_rate=1e-3)


Using 'facebook/wav2vec2-base-960h' provided by Hugging Face/transformers (https://github.com/huggingface/transformers) and PyTorch/fairseq (https://github.com/pytorch/fairseq).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
next(iter(datamodule.train_dataset))["target"]

'She had your dark suit in greasy wash water all year.'

In [44]:
model._output_transform._tokenizer.batch_decode(model._collate_fn([next(iter(datamodule.train_dataset))])["labels"])

['S<unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>']

In [8]:
# 3. Create the trainer and finetune the model
trainer = flash.Trainer(max_epochs=10, accelerator="gpu", devices=[2], precision="bf16")
trainer.finetune(model, datamodule=datamodule)


Using bfloat16 Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]

  | Name          | Type           | Params
-------------------------------------------------
0 | model         | Wav2Vec2ForCTC | 94.4 M
1 | train_metrics | ModuleDict     | 0     
2 | val_metrics   | ModuleDict     | 0     
3 | test_metrics  | ModuleDict     | 0     
-------------------------------------------------
94.4 M    Trainable params
0         Non-trainable params
94.4 M    Total params
377.585   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


In [9]:
trainer.predict(model, datamodule.train_dataloader())

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]


Predicting: 288it [00:00, ?it/s]

[['<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>'],
 ['<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>'],
 ['<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>'],
 ['<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>'],
 ['<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>'],
 ['<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<u