<a href="https://colab.research.google.com/github/Losiyu/test/blob/master/wav2vec_base_All.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Fine-tuned model produced by this notebook: https://huggingface.co/Siyong/wav2vec-base-All

In [None]:
!pip install -q huggingface_hub torchaudio librosa 
!pip install -q datasets==1.18.3
!pip install -q transformers==4.17.0
!pip install -q jiwer
!pip install -q git-lfs

In [None]:
import numpy as np
import re
import datasets
from datasets import load_dataset

import os

# Name of the output directory / Hub repository for the fine-tuned model.
MODEL_REPO_NAME = "wav2vec-base-All"
# Pretrained checkpoint that fine-tuning starts from.
PRETRAIN_MODEL = "facebook/wav2vec2-base"

# SECURITY: a Hugging Face access token used to be hard-coded on this line.
# Anything committed with the notebook must be treated as leaked, so revoke
# that token in the Hub account settings. The token is now read from the
# HF_TOKEN environment variable and never stored in the notebook itself.
# TOKEN is None when the variable is unset.
# NOTE(review): private datasets/repos will then need HF_TOKEN to be exported
# before running the cells below - confirm against the repos' visibility.
TOKEN = os.environ.get("HF_TOKEN")

# Dataset

In [None]:
from datasets import Features, Value, Sequence

# Load both speech corpora from the Hub with the same access token.
millad, customer = (
    load_dataset(repo_id, use_auth_token=TOKEN)
    for repo_id in ("Siyong/speech_millad", "Siyong/speech_customer")
)

# Merge the two corpora split by split (millad rows first, then customer rows).
train, test = (
    datasets.concatenate_datasets([millad[split], customer[split]])
    for split in ("train", "test")
)

data = datasets.DatasetDict({"train": train, "test": test})
data

Using custom data configuration Siyong--speech_millad-caff1d6bacb1a568
Reusing dataset parquet (/root/.cache/huggingface/datasets/parquet/Siyong--speech_millad-caff1d6bacb1a568/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/2 [00:00<?, ?it/s]

Using custom data configuration Siyong--speech_customer-b77751ac3ca1b97d
Reusing dataset parquet (/root/.cache/huggingface/datasets/parquet/Siyong--speech_customer-b77751ac3ca1b97d/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'audio'],
        num_rows: 1197
    })
    test: Dataset({
        features: ['sentence', 'audio'],
        num_rows: 300
    })
})

In [None]:
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display randomly chosen, distinct rows of ``dataset`` as an HTML table.

    Args:
        dataset: Sized object that, when indexed with a list of row positions,
            returns something ``pd.DataFrame`` can be built from.
        num_examples: Number of distinct rows to display.

    Raises:
        AssertionError: If ``num_examples`` is larger than ``len(dataset)``.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement in a single call, replacing the
    # previous retry-until-unique loop (which slowed down as num_examples
    # approached len(dataset)).
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))
  

In [None]:
# Preview transcripts only: the audio column is dropped before display.
transcripts_only = data['train'].remove_columns(["audio"])
show_random_elements(transcripts_only, num_examples=10)

Unnamed: 0,sentence
0,what kind of sandwich
1,so it's gonna be um
2,one of my ten out of tens
3,sounds good
4,alright wrap it up
5,i'm gonna add a little now and a little later
6,no
7,how are you doing
8,ok
9,yeah


In [None]:
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor

# The tokenizer files are fetched from this model's own Hub repository.
hub_repo_id = "Siyong/" + MODEL_REPO_NAME
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(hub_repo_id, use_auth_token=TOKEN)

# Feature extractor for mono 16 kHz input, normalised, without attention masks.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False,
)

# The processor bundles both so one object handles audio and transcripts.
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

loading file https://huggingface.co/Siyong/wav2vec-base-All/resolve/main/vocab.json from cache at /root/.cache/huggingface/transformers/20e7e007dd9f6da1a0434bb521a20f32bcc60988356c14ad7dce3e43f6d4a249.ea2d7184a3223c68a5c56c3a82617f8e9cfa9556f93ca72bf21f440ef8d5b392
loading file https://huggingface.co/Siyong/wav2vec-base-All/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/cf385d3e01a8772d2af3023cb47e41ee69151b16e5cdbd9b94df259bed924b81.59710b1a6a5501d31e746b6e464f5c44de3e55a58f80634196025936683a68a9
loading file https://huggingface.co/Siyong/wav2vec-base-All/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/Siyong/wav2vec-base-All/resolve/main/special_tokens_map.json from cache at /root/.cache/huggingface/transformers/fb7e34c0f6a13e05f1d648308e96df762ce932becdce3163ababf4c6def1adf0.a21d51735cf8667bcd610f057e88548d5d6a381401f6b4501a8bc6c1a9dc8498
Adding <s> to the vocabulary
Adding </s> to the vocabulary
Special t

# Prepare dataset

In [None]:
import IPython.display as ipd

# Listen to one training clip and print its transcript as a sanity check.
sample = data['train'][8]
array = np.array(sample['audio']['array'])
print('sentence:', sample['sentence'])

# Play back at the clip's own sampling rate instead of assuming 16 kHz, so the
# audio is not sped up or slowed down if a clip was recorded at another rate.
ipd.Audio(array, autoplay=True, rate=sample['audio']['sampling_rate'])

sentence: little bit of lettuce as well 


In [None]:
def prepare_dataset(batch):
  """Turn one raw example into model inputs and CTC labels.

  Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries) and
  ``batch["sentence"]``, and adds three keys to the same mapping:
  ``"input_values"``, ``"input_length"`` and ``"labels"``.

  Returns:
    The same ``batch`` mapping, updated in place.
  """
  waveform = batch["audio"]["array"]
  rate = batch["audio"]["sampling_rate"]

  # The processor returns a batch of one; keep only its single entry.
  extracted = processor(waveform, sampling_rate=rate)
  batch["input_values"] = extracted.input_values[0]
  batch["input_length"] = len(batch["input_values"])

  # Inside this context the processor call is applied to the transcript.
  with processor.as_target_processor():
    encoded_text = processor(batch["sentence"])
  batch["labels"] = encoded_text.input_ids

  return batch

# Apply prepare_dataset to every example in both splits using 4 worker
# processes, dropping the original columns so only the new ones remain.
data = data.map(
    prepare_dataset,
    remove_columns=data.column_names["train"],
    num_proc=4,
)
data

DatasetDict({
    train: Dataset({
        features: ['input_values', 'input_length', 'labels'],
        num_rows: 1197
    })
    test: Dataset({
        features: ['input_values', 'input_length', 'labels'],
        num_rows: 300
    })
})

# Trainer

In [None]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Collate a list of examples into one padded batch for CTC training.

    Audio inputs and label ids are padded separately, and padded label
    positions are set to -100.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data.
        padding (:obj:`bool` or :obj:`str`, `optional`, defaults to :obj:`True`):
            Padding strategy forwarded to ``processor.pad`` for both inputs and labels:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
              single sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
              different lengths).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Inputs and labels have different lengths and need different padding
        # methods, so collect them into two separate lists.
        audio_inputs = []
        label_inputs = []
        for feature in features:
            audio_inputs.append({"input_values": feature["input_values"]})
            label_inputs.append({"input_ids": feature["labels"]})

        pad_options = {"padding": self.padding, "return_tensors": "pt"}

        batch = self.processor.pad(audio_inputs, **pad_options)
        with self.processor.as_target_processor():
            padded_labels = self.processor.pad(label_inputs, **pad_options)

        # Positions whose attention mask is not 1 are padding; mark them with
        # -100 so the loss ignores them.
        is_padding = padded_labels.attention_mask.ne(1)
        batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)

        return batch

In [None]:
# Collator that pads each batch to its longest input and longest label sequence.
data_collator = DataCollatorCTCWithPadding(
    processor=processor,
    padding=True,
)

In [None]:
from datasets import load_metric
# Word error rate and character error rate, used for evaluation and the final test report.
wer_metric, cer_metric = (load_metric(metric_name) for metric_name in ("wer", "cer"))

def compute_metrics(pred):
    """Compute word and character error rates for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (logits; the arg-max over the
            last axis gives the predicted token ids) and ``label_ids``
            (reference token ids, with -100 at padded positions).

    Returns:
        dict with the keys ``"wer"`` and ``"cer"``.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # -100 marks padded label positions; swap in the tokenizer's pad id so the
    # labels can be decoded. np.where builds a new array, so pred.label_ids is
    # left untouched for the caller (the previous version overwrote it in place).
    label_ids = np.where(pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids)

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer, "cer": cer}

In [None]:
from transformers import Wav2Vec2ForCTC

# Configuration values passed on top of the pretrained checkpoint. The pad token
# id and vocabulary size are taken from the tokenizer so the two stay in sync.
ctc_overrides = dict(
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
)

model = Wav2Vec2ForCTC.from_pretrained(PRETRAIN_MODEL, **ctc_overrides)

loading configuration file https://huggingface.co/facebook/wav2vec2-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/c7746642f045322fd01afa31271dd490e677ea11999e68660a92619ec7c892b4.ce1f96bfaf3d7475cb8187b9668c7f19437ade45fb9ceb78d2b06a2cec198015
  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "
Model config Wav2Vec2Config {
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForPreTraining"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": false,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],


In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  # --- output / checkpoints ---
  output_dir=MODEL_REPO_NAME,
  save_steps=500,
  save_total_limit=2,
  # --- optimisation ---
  num_train_epochs=120,
  per_device_train_batch_size=8,
  learning_rate=1e-4,
  weight_decay=0.005,
  warmup_steps=1000,
  group_by_length=True,
  fp16=True,
  gradient_checkpointing=True,
  # --- evaluation / logging cadence, in optimisation steps ---
  evaluation_strategy="steps",
  eval_steps=500,
  logging_steps=5000,
  # --- Hub ---
  # push_to_hub=True,
  hub_token=TOKEN,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
from transformers import Trainer

# Wire the model, data, collator and metrics together. The feature extractor is
# handed over as `tokenizer`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    eval_dataset=data["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

Using amp half precision backend


In [None]:
# Run fine-tuning with the settings in `training_args`; the value returned by
# train() is the last expression, so it is shown as this cell's output.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1197
  Num Epochs = 120
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 18000


Step,Training Loss,Validation Loss,Wer,Cer
500,No log,4.065407,1.0,0.982268
1000,No log,3.453181,1.0,0.982268
1500,No log,3.070707,0.999169,0.978055
2000,No log,2.733543,1.001663,0.902739
2500,No log,2.58961,1.068994,0.730162
3000,No log,2.331495,1.068994,0.667662
3500,No log,2.221694,1.014963,0.596559
4000,No log,2.380241,1.054863,0.594803
4500,No log,2.220776,0.997506,0.568118
5000,2.422400,2.268734,0.98005,0.553722


The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 300
  Batch size = 8
Saving model checkpoint to wav2vec-base-All/checkpoint-500
Configuration saved in wav2vec-base-All/checkpoint-500/config.json
Model weights saved in wav2vec-base-All/checkpoint-500/pytorch_model.bin
Feature extractor saved in wav2vec-base-All/checkpoint-500/preprocessor_config.json
The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 300
  Batch size = 8
Saving model checkpoint to wav2vec-base-All/checkpoint-1000
Configuration saved in wav2ve

TrainOutput(global_step=18000, training_loss=1.0596862148708768, metrics={'train_runtime': 4254.6224, 'train_samples_per_second': 33.761, 'train_steps_per_second': 4.231, 'total_flos': 1.5520820809485903e+18, 'train_loss': 1.0596862148708768, 'epoch': 120.0})

In [None]:
# WARNING: this deletes the local output directory, which is also where the
# training checkpoints were written (output_dir=MODEL_REPO_NAME).
# NOTE(review): presumably done so push_to_hub() can clone the Hub repository
# into an empty directory before saving and uploading the model - confirm.
!rm -rf {MODEL_REPO_NAME}
trainer.push_to_hub()

Cloning https://huggingface.co/Siyong/wav2vec-base-All into local empty directory.


Download file pytorch_model.bin:   0%|          | 1.58k/360M [00:00<?, ?B/s]

Download file training_args.bin: 100%|##########| 2.98k/2.98k [00:00<?, ?B/s]

Clean file training_args.bin:  34%|###3      | 1.00k/2.98k [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/360M [00:00<?, ?B/s]

Saving model checkpoint to wav2vec-base-All
Configuration saved in wav2vec-base-All/config.json
Model weights saved in wav2vec-base-All/pytorch_model.bin
Feature extractor saved in wav2vec-base-All/preprocessor_config.json


Upload file pytorch_model.bin:   0%|          | 3.34k/360M [00:00<?, ?B/s]

Upload file training_args.bin: 100%|##########| 2.98k/2.98k [00:00<?, ?B/s]

To https://huggingface.co/Siyong/wav2vec-base-All
   ea73acf..14acd39  main -> main

Dropping the following result as it does not have all the necessary fields:
{}
To https://huggingface.co/Siyong/wav2vec-base-All
   14acd39..096663f  main -> main



'https://huggingface.co/Siyong/wav2vec-base-All/commit/14acd39c1f2b38178fa3825363ec4011d80e0a5a'

# Evaluation

In [None]:
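# Optional: reload the fine-tuned model and processor from the Hub instead of
# reusing the in-memory objects from training (uncomment to use).
# model = Wav2Vec2ForCTC.from_pretrained("Siyong/" + MODEL_REPO_NAME).cuda()
# processor = Wav2Vec2Processor.from_pretrained("Siyong/" + MODEL_REPO_NAME)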

In [None]:
def map_to_result(batch):
  """Transcribe one prepared example and attach prediction and reference text.

  Reads ``batch["input_values"]`` and ``batch["labels"]`` and adds two keys to
  the same mapping: ``"pred_str"`` (greedy arg-max decoding of the model's
  logits) and ``"text"`` (the labels decoded without grouping tokens).

  Returns:
    The same ``batch`` mapping, updated in place.
  """
  with torch.no_grad():
    # Build the input on whichever device the model lives on rather than
    # assuming CUDA, so evaluation also works on a CPU-only runtime.
    input_values = torch.tensor(batch["input_values"], device=model.device).unsqueeze(0)
    logits = model(input_values).logits

  pred_ids = torch.argmax(logits, dim=-1)
  # The batch holds a single example, so take the first decoded string.
  batch["pred_str"] = processor.batch_decode(pred_ids)[0]
  batch["text"] = processor.decode(batch["labels"], group_tokens=False)

  return batch

In [None]:
# Transcribe every test example, keeping only the columns map_to_result adds.
result = data["test"].map(
    map_to_result,
    remove_columns=data.column_names["test"],
)

Loading cached processed dataset at /root/.cache/huggingface/datasets/parquet/Siyong--speech_millad-caff1d6bacb1a568/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901/cache-1c80317fa3b1799d.arrow


In [None]:
# Score the decoded predictions against the reference transcripts.
predicted_text = result["pred_str"]
reference_text = result["text"]

wer_score = wer_metric.compute(predictions=predicted_text, references=reference_text)
print("Test WER: {:.3f}".format(wer_score))

cer_score = cer_metric.compute(predictions=predicted_text, references=reference_text)
print("Test CER: {:.3f}".format(cer_score))

Test WER: 0.901
Test CER: 0.552


In [None]:
# Show a random sample of predictions ("pred_str") next to their reference
# transcripts ("text"); num_examples is left at its default of 10.
show_random_elements(result)

Unnamed: 0,pred_str,text
0,a manicwe per l e prs an tat l to ther chese on at tor co,i'm gonna cook it for a little bit first and then i will throw the cheese on after okay
1,p,just a little more
2,col,cool
3,yek,yep
4,anything els,anything else
5,o te you,good to hear
6,a ce,excellent
7,oat,okay
8,a wiet on thi srigh,ah we don't i'm sorry
9,i yust ft nas on eour tor stako bma,i just put an s on yours for steak
