In [None]:
!pip install datasets>=1.18.3
!pip install transformers==4.11.3
!pip install librosa
!pip install jiwer

zsh:1: 1.18.3 not found
Collecting transformers==4.11.3
  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m48.9 kB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0mm
Collecting sacremoses (from transformers==4.11.3)
  Obtaining dependency information for sacremoses from https://files.pythonhosted.org/packages/0b/f0/89ee2bc9da434bd78464f288fdb346bc2932f2ee80a90b2a4bbbac262c74/sacremoses-0.1.1-py3-none-any.whl.metadata
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting tokenizers<0.11,>=0.10.1 (from transformers==4.11.3)
  Downloading tokenizers-0.10.3.tar.gz (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.7/212.7 kB[0m [31m66.3 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [

In [None]:
import sys
import os

In [None]:
import numpy as np
from transformers import (
    Wav2Vec2Processor,
    TrainingArguments,
    Wav2Vec2ForCTC,
    HubertForCTC,
    Wav2Vec2ConformerForCTC,
)
from transformers.trainer import Trainer
from datasets import Dataset, load_metric
import torch
from dataclasses import dataclass
from typing import Dict, List, Union


In [None]:
@dataclass
class DataCollatorCTCWithPadding:
    """
    Collate a list of CTC training examples into a single padded batch.

    Audio inputs and label ids are padded by two separate ``processor.pad``
    calls; label positions whose attention mask is not 1 are then overwritten
    with -100.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor whose ``pad`` method is applied to inputs and labels.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Padding strategy forwarded unchanged to both ``processor.pad`` calls:
            * :obj:`True` or :obj:`'longest'`: pad to the longest sequence in the batch (no padding when only a
              single sequence is provided).
            * :obj:`'max_length'`: pad to the length given by :obj:`max_length`, or to the maximum acceptable
              input length for the model when that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: no padding (the batch may contain sequences of different
              lengths).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(
            self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        # Inputs and labels have different lengths and go through different
        # padding calls, so separate them into two lists of single-key dicts.
        audio_items = []
        for example in features:
            audio_items.append({"input_values": example["input_values"]})
        label_items = []
        for example in features:
            label_items.append({"input_ids": example["labels"]})

        pad_options = {"padding": self.padding, "return_tensors": "pt"}
        batch = self.processor.pad(audio_items, **pad_options)
        labels_batch = self.processor.pad(labels=label_items, **pad_options)

        # Padded label positions (attention mask != 1) become -100 so that the
        # loss ignores them.
        token_ids = labels_batch["input_ids"]
        is_padding = labels_batch.attention_mask.ne(1)
        batch["labels"] = token_ids.masked_fill(is_padding, -100)

        return batch



In [None]:
# Word-error-rate metric, loaded once at notebook level; compute_metrics calls
# wer_metric.compute(...) on the decoded predictions and references.
wer_metric = load_metric("wer")

In [None]:
def compute_metrics(pred):
    """
    Compute the word error rate for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (logits; the arg-max over the last
            axis is taken as the predicted token id) and ``label_ids``
            (reference token ids in which -100 marks positions to ignore).

    Returns:
        dict: ``{"wer": <value returned by wer_metric.compute>}``.

    Side effects:
        Prints every decoded prediction/reference pair. ``pred.label_ids`` is
        NOT modified: the -100 markers are replaced on a copy, so the caller's
        label array keeps its ignore markers.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)
    processor = init_wav2vec_processor(TOKENIZER_PATH)

    # -100 is not a valid token id, so swap it for the pad token before decoding.
    # np.where builds a new array instead of writing into pred.label_ids.
    label_ids = np.where(
        pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids
    )

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    for predicted, reference in zip(pred_str, label_str):
        print(predicted, "\n", reference, "\n---------------------------")

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}



In [None]:
def train_wav2vec(
        train_dataset, train_dataset_length, test_dataset, test_dataset_length
):
    """
    Fine-tune a CTC model loaded from ``BASE_MODEL`` and save it under ``OUTPUT_PATH``.

    Hyper-parameters are read from the environment: ``LEARNING_RATE``
    (default 1e-4), ``BATCH_SIZE`` (default 4), ``GRADIENT_ACCUMULATION_STEPS``
    (default 1) and ``NUM_TRAIN_EPOCHS`` (default 1).

    Args:
        train_dataset: Dataset handed to the ``Trainer`` for training.
        train_dataset_length: Number of training examples; used to derive
            ``max_steps`` and the evaluation/checkpoint interval.
        test_dataset: Dataset handed to the ``Trainer`` for evaluation.
        test_dataset_length: Accepted for interface compatibility; not used.

    Side effects:
        Creates ``OUTPUT_PATH``, saves the processor into the output directory,
        runs training, and writes the final model to
        ``<output_dir>/last_checkpoint``.
    """
    processor = init_wav2vec_processor(TOKENIZER_PATH)
    data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
    learning_rate = float(os.getenv("LEARNING_RATE", 1e-4))
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    batch_size = int(os.getenv("BATCH_SIZE", 4))
    gradient_accumulation_steps = int(os.getenv("GRADIENT_ACCUMULATION_STEPS", 1))
    num_train_epochs = int(os.getenv("NUM_TRAIN_EPOCHS", 1))

    # Optimizer steps per pass over the training data.
    # NOTE(review): the trailing "/ 2" presumably accounts for two devices - confirm.
    steps_per_epoch = (
        train_dataset_length / batch_size / gradient_accumulation_steps / 2
    )
    # Evaluate and checkpoint on the same schedule: about 30 times per epoch,
    # but never more often than every 30 steps.
    eval_and_save_steps = max(int(steps_per_epoch / 30), 30)

    # os.cpu_count() may return None when the CPU count cannot be determined;
    # fall back to 1 instead of failing on `None * 2`.
    dataloader_workers = 0 if os.name == "nt" else (os.cpu_count() or 1) * 2

    training_args = TrainingArguments(
        output_dir=OUTPUT_PATH,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=2,
        eval_steps=eval_and_save_steps,
        save_steps=eval_and_save_steps,
        max_steps=int(steps_per_epoch * num_train_epochs),
        gradient_accumulation_steps=gradient_accumulation_steps,
        evaluation_strategy="steps",
        num_train_epochs=num_train_epochs,
        gradient_checkpointing=True,
        fp16=True,
        logging_steps=1,
        learning_rate=learning_rate,
        warmup_steps=0,
        save_total_limit=1,
        push_to_hub=False,
        report_to=[
            "tensorboard",
        ],
        # metric_for_best_model="wer",
        greater_is_better=False,
        dataloader_num_workers=dataloader_workers,
        remove_unused_columns=False,
        label_names=["labels"],
        ignore_data_skip=True,
        sharded_ddp=["simple"] if os.name != "nt" else [],
    )
    # Alternative architectures tried with the same arguments:
    # model = Wav2Vec2ForCTC.from_pretrained(
    # model = HubertForCTC.from_pretrained(
    model = Wav2Vec2ConformerForCTC.from_pretrained(
        BASE_MODEL,
        attention_dropout=0.0,
        hidden_dropout=0.0,
        feat_proj_dropout=0.0,
        mask_time_prob=0.05,
        layerdrop=0.0,
        ctc_loss_reduction="mean",
        pad_token_id=processor.tokenizer.pad_token_id,
        vocab_size=len(processor.tokenizer),
        # output_hidden_size=768,
    )
    model.freeze_feature_encoder()
    trainer = Trainer(
        model=model,
        data_collator=data_collator,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=processor.feature_extractor,
    )

    # Store the processor next to the checkpoints.
    processor.save_pretrained(training_args.output_dir)
    # Resuming is currently disabled; training always starts fresh.
    last_checkpoint = None
    # last_checkpoint = get_last_checkpoint(args.output_dir)
    print("Starting to train", last_checkpoint)
    trainer.train(
        resume_from_checkpoint=last_checkpoint,
    )
    final_path = os.path.join(training_args.output_dir, "last_checkpoint")
    trainer.save_model(final_path)
    

In [None]:
if __name__ == "__main__":
    # Echo the configuration constants (defined outside this cell) before any work starts.
    print("OUTPUT_PATH", OUTPUT_PATH)
    print("BASE_MODEL", BASE_MODEL)
    print("TOKENIZER_PATH", TOKENIZER_PATH)
    print("CLIPS_PATH", CLIPS_PATH)
    print("TRAIN_DATASET_PATH", TRAIN_DATASET_PATH)
    print("TEST_DATASET_PATH", TEST_DATASET_PATH)
    print("VALIDATE_DATASET_PATH", VALIDATE_DATASET_PATH)
    print("CACHE_DIR", CACHE_DIR)

    # Load the three JSON splits. The training size is captured right after
    # loading, before the split is passed through streamify_dataset.
    train_dataset = Dataset.from_json(TRAIN_DATASET_PATH, cache_dir=CACHE_DIR)
    train_dataset_length = len(train_dataset)
    test_dataset = Dataset.from_json(TEST_DATASET_PATH, cache_dir=CACHE_DIR)
    validate_dataset = Dataset.from_json(VALIDATE_DATASET_PATH, cache_dir=CACHE_DIR)

    # Only the audio path and its transcript are kept from each split.
    kept_columns = ("path", "original_sentence")
    validate_dataset = validate_dataset.select_columns(list(kept_columns))
    train_dataset = train_dataset.select_columns(list(kept_columns))
    test_dataset = test_dataset.select_columns(list(kept_columns))

    # NOTE(review): presumably converts the training split to a streaming
    # dataset - confirm against streamify_dataset's definition.
    train_dataset = streamify_dataset(train_dataset)

    os.makedirs(OUTPUT_PATH, exist_ok=True)
    # Vocabulary generation (wav2vec_vocab) is disabled here; the processor is
    # initialised from TOKENIZER_PATH and its return value is not kept.
    init_wav2vec_processor(TOKENIZER_PATH)

    # Options common to the validation and test preprocessing runs.
    shared_map_options = {
        "num_proc": os.cpu_count(),
        "disable_nullable": True,
        "load_from_cache_file": False,
        "writer_batch_size": 500,
    }
    validate_dataset = validate_dataset.map(
        generate_sentence_data,
        cache_file_name=os.path.join(CACHE_DIR, "t-wav2vec2-validate.arrow"),
        remove_columns=validate_dataset.column_names,
        **shared_map_options,
    )
    test_dataset = test_dataset.map(
        generate_sentence_data,
        cache_file_name=os.path.join(CACHE_DIR, "t-wav2vec2-test.arrow"),
        remove_columns=test_dataset.column_names,
        **shared_map_options,
    )

    # The training split is mapped with default options only, then shuffled
    # with a fixed seed.
    train_dataset = train_dataset.map(generate_sentence_data).shuffle(seed=24)

    # The validation split is what gets evaluated during training.
    # NOTE(review): the processed test split is never passed on - confirm this
    # is intentional.
    train_wav2vec(
        train_dataset=train_dataset,
        train_dataset_length=train_dataset_length,
        test_dataset=validate_dataset,
        test_dataset_length=len(validate_dataset),
    )
    