<a href="https://colab.research.google.com/github/HamdanXI/nlp_adventure/blob/main/wav2vec2ctc-base-lj-speech-DifferentStructure.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets>=1.18.3
!pip install transformers==4.11.3
!pip install librosa
!pip install jiwer
!pip install transformers[torch]
!apt install git-lfs

In [None]:
from huggingface_hub import notebook_login

# Show the Hugging Face Hub login widget.
# NOTE(review): presumably required by the push_to_hub calls later in this notebook — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Data Preprocessing (disabled)
#
# The triple-quoted string below is never executed. It holds the script that:
#   - loads "lj_speech" and drops the 'id' and 'normalized_text' columns,
#   - takes rows 0-4619 of the 'train' split as train and rows 4620-6299 as test,
#   - strips the punctuation listed in chars_to_ignore_regex and lower-cases the text,
#   - pushes the result to the Hub as "lj_speech_DifferentStructure".
# NOTE(review): the variable is named `timit` although the data is LJ Speech —
# presumably a leftover from a TIMIT tutorial.
# NOTE(review): chars_to_ignore_regex uses backslash escapes inside a non-raw
# string; if this code is revived, make it a raw string — TODO confirm.

'''
from datasets import load_dataset, load_metric
from datasets import Dataset, DatasetDict

timit = load_dataset("lj_speech")

timit = timit.remove_columns(['id', 'normalized_text'])

train_dataset = timit['train'].select(indices=list(range(4620)))
test_dataset = timit['train'].select(indices=list(range(4620, 4620 + 1680)))

new_dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

timit = new_dataset

import re
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"]'

def remove_special_characters(batch):
    batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower()
    return batch

timit = timit.map(remove_special_characters)

timit.push_to_hub("lj_speech_DifferentStructure")

'''

In [None]:
import json

import numpy as np
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2CTCTokenizer

# Load the dataset from the Hub repo "HamdanXI/lj_speech_DifferentStructure".
# NOTE(review): the variable is still called `timit`, but the repo name indicates LJ Speech data.
timit = load_dataset("HamdanXI/lj_speech_DifferentStructure")

# Strip characters that should not end up in the CTC vocabulary
def remove_vocab(text):
    """Return `text` without digits, accented letters and stray symbols.

    Every character in the unwanted list is deleted (not replaced by a
    space), then leading and trailing whitespace is trimmed.
    """
    unwanted_chars = ['1', 'é', '”', 'è', 'â', '6', 'à', '3', '&', ')', '£', '8', '7', '0', '“', 'ê', '’', '2', '5', 'ü', '9', '4', '(']
    # A translation table mapping each unwanted character to None deletes it.
    deletion_table = str.maketrans(dict.fromkeys(unwanted_chars))
    cleaned = text.translate(deletion_table)
    return cleaned.strip()

# Clean the transcripts of both splits with remove_vocab.
for split_name in ("train", "test"):
    timit[split_name] = timit[split_name].map(lambda x: {"text": remove_vocab(x["text"])})

In [None]:
def extract_all_chars(batch):
  """Collect the distinct characters used by a batch of transcripts.

  The entries of batch["text"] are joined with single spaces. The result
  holds the joined string under "all_text" and the list of its unique
  characters under "vocab", each wrapped in a one-element list.
  """
  joined_text = " ".join(batch["text"])
  unique_chars = [*set(joined_text)]
  return {"vocab": [unique_chars], "all_text": [joined_text]}

# Gather the character inventory of each split in a single batch.
vocabs = timit.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=timit.column_names["train"])

# Sort the merged character set so token ids are reproducible. Iteration order
# of a set of strings can change between interpreter runs, which would assign
# different ids to the same characters and make a re-generated vocab.json
# incompatible with a model trained on an earlier one.
vocab_list = sorted(set(vocabs["train"]["vocab"][0]) | set(vocabs["test"]["vocab"][0]))

# Map each character to its integer id.
vocab_dict = {char: index for index, char in enumerate(vocab_list)}
vocab_dict

Map:   0%|          | 0/4620 [00:00<?, ? examples/s]

Map:   0%|          | 0/1680 [00:00<?, ? examples/s]

{'o': 0,
 's': 1,
 'z': 2,
 'r': 3,
 'l': 4,
 'g': 5,
 'd': 6,
 'y': 7,
 'i': 8,
 'b': 9,
 'u': 10,
 'k': 11,
 't': 12,
 'n': 13,
 'c': 14,
 'a': 15,
 "'": 16,
 'j': 17,
 ' ': 18,
 'h': 19,
 'm': 20,
 'p': 21,
 'x': 22,
 'v': 23,
 'f': 24,
 'w': 25,
 'q': 26,
 'e': 27}

In [None]:
# Use "|" as the word delimiter in place of the space character, keeping its id.
# Guarded so the cell can be re-run: on a second run " " is already gone and an
# unconditional lookup would raise KeyError.
if " " in vocab_dict:
    vocab_dict["|"] = vocab_dict.pop(" ")

# Append the unknown and padding tokens at the end of the id range, once only,
# so a re-run does not assign them new ids.
for special_token in ("[UNK]", "[PAD]"):
    if special_token not in vocab_dict:
        vocab_dict[special_token] = len(vocab_dict)

print(len(vocab_dict))

30


In [None]:
# Target Hub repository; also used as the Trainer output directory further down.
repo_name = "wav2vec2ctc-base-lj-speech-DifferentStructure"

# Serialise the character-to-id mapping to disk so the tokenizer can load it.
serialized_vocab = json.dumps(vocab_dict)
with open('vocab.json', 'w') as vocab_file:
    vocab_file.write(serialized_vocab)

# Build the CTC tokenizer from the vocabulary file just written.
tokenizer = Wav2Vec2CTCTokenizer(
    "./vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

tokenizer.push_to_hub(repo_name)

CommitInfo(commit_url='https://huggingface.co/HamdanXI/wav2vec2ctc-base-lj-speech-DifferentStructure/commit/58e9352b056b540cc4f083b5bdb88ce60087aa4f', commit_message='Upload tokenizer', commit_description='', oid='58e9352b056b540cc4f083b5bdb88ce60087aa4f', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from datasets import Audio
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor

# facebook/wav2vec2-base was pretrained on 16 kHz audio. Declaring 22050 Hz on
# the feature extractor only silences the sampling-rate check; the model would
# still receive audio at the wrong rate. Resample the audio column to 16 kHz
# and configure the feature extractor to match.
TARGET_SAMPLING_RATE = 16_000
timit = timit.cast_column("audio", Audio(sampling_rate=TARGET_SAMPLING_RATE))

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=TARGET_SAMPLING_RATE, padding_value=0.0, do_normalize=True, return_attention_mask=False)

# Bundle the audio feature extractor and the text tokenizer into one processor.
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [None]:
def prepare_dataset(batch):
    """Add model inputs and CTC labels to a single example.

    Reads batch["audio"] ("array" and "sampling_rate") and batch["text"],
    stores the first processed audio entry under "input_values" and the
    tokenised transcript ids under "labels", and returns the same `batch`
    object.
    """
    clip = batch["audio"]
    audio_features = processor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_values"] = audio_features.input_values[0]

    # Inside this context the processor is used to encode the transcript.
    with processor.as_target_processor():
        encoded_text = processor(batch["text"])
        batch["labels"] = encoded_text.input_ids

    return batch

In [None]:
# Run prepare_dataset over every example and drop the columns listed for the
# train split, so only the keys prepare_dataset adds remain.
# NOTE(review): `timit` is overwritten, so re-running this cell presumably fails
# because the "audio"/"text" columns are already gone — re-run from the load cell.
timit = timit.map(prepare_dataset, remove_columns=timit.column_names["train"], num_proc=1)

Map:   0%|          | 0/4620 [00:00<?, ? examples/s]



Map:   0%|          | 0/1680 [00:00<?, ? examples/s]

In [None]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """Pad a list of examples into one batch of tensors for CTC training.

    Audio inputs and label ids are padded separately through the processor,
    each with its own max-length and pad-to-multiple settings. Padded label
    positions are replaced by -100.
    """

    # Processor whose pad() method is used for both inputs and labels.
    processor: Wav2Vec2Processor
    # Padding strategy forwarded to processor.pad().
    padding: Union[bool, str] = True
    # Optional maximum length for the audio inputs.
    max_length: Optional[int] = None
    # Optional maximum length for the label sequences.
    max_length_labels: Optional[int] = None
    # Optional multiple to pad the audio inputs to.
    pad_to_multiple_of: Optional[int] = None
    # Optional multiple to pad the label sequences to.
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate `features` (dicts with "input_values" and "labels") into a padded batch."""
        # Inputs and labels have different lengths, so they are padded separately.
        audio_items = []
        label_items = []
        for example in features:
            audio_items.append({"input_values": example["input_values"]})
            label_items.append({"input_ids": example["labels"]})

        batch = self.processor.pad(
            audio_items,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        with self.processor.as_target_processor():
            padded_labels = self.processor.pad(
                label_items,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # Positions whose attention mask is not 1 are padding; mark them with -100
        # (presumably so the loss ignores them).
        is_padding = padded_labels.attention_mask.ne(1)
        batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)
        return batch

In [None]:
# Collator instance passed to the Trainer; padding=True is forwarded to processor.pad().
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
# Word-error-rate metric consumed by compute_metrics.
# NOTE(review): the recorded output shows a warning pointing at this line —
# presumably load_metric is deprecated in newer `datasets` releases; confirm before upgrading.
wer_metric = load_metric("wer")

  wer_metric = load_metric("wer")


In [None]:
def compute_metrics(pred):
    """Compute the word error rate for a set of predictions.

    Args:
        pred: object exposing `predictions` (logits) and `label_ids`, where
            label positions equal to -100 mark padding.

    Returns:
        A dict {"wer": value} as returned by `wer_metric.compute`.
    """
    # Highest-scoring token id along the last axis of the logits.
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Replace the -100 padding marker with the tokenizer's pad id on a copy,
    # so the caller's `pred.label_ids` array is left untouched.
    label_ids = np.where(
        pred.label_ids == -100,
        processor.tokenizer.pad_token_id,
        pred.label_ids,
    )

    pred_str = processor.batch_decode(pred_ids)
    # Reference labels are decoded without grouping repeated tokens.
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

In [None]:
from transformers import Wav2Vec2ForCTC

# Load the pretrained encoder and attach a CTC head for fine-tuning.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base",
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    # Size the CTC output layer to the tokenizer instead of relying on the
    # checkpoint's default vocabulary size. The lm_head is newly initialised
    # for this checkpoint anyway, and a vocabulary larger than the default
    # would otherwise produce label ids outside the output range.
    vocab_size=len(processor.tokenizer),
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'lm_head.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# NOTE(review): presumably stops the model's feature-extractor layers from being
# updated during fine-tuning; newer transformers releases may expose this under
# a different name — confirm before upgrading.
model.freeze_feature_extractor()



In [None]:
from transformers import TrainingArguments

# The recorded run of this notebook finished after 290 optimizer steps in total.
# Intervals of 500 steps therefore never trigger an evaluation, log entry or
# checkpoint, and a 1000-step warmup never completes. The intervals are scaled
# to the actual run length and the warmup is expressed as a fraction of it.
training_args = TrainingArguments(
  output_dir=repo_name,
  group_by_length=True,
  per_device_train_batch_size=32,
  evaluation_strategy="steps",
  num_train_epochs=2,
  fp16=True,
  gradient_checkpointing=True,
  save_steps=50,
  eval_steps=50,
  logging_steps=50,
  learning_rate=1e-4,
  weight_decay=0.005,
  warmup_ratio=0.1,  # warm up over the first 10% of training steps
  save_total_limit=2,
)

In [None]:
from transformers import Trainer

# Wire the model, collator, metric function and both dataset splits into a Trainer.
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=timit["train"],
    eval_dataset=timit["test"],
    # The feature extractor (not the text tokenizer) is passed in the tokenizer slot.
    # NOTE(review): presumably so it is saved alongside the model — confirm.
    tokenizer=processor.feature_extractor,
)

In [None]:
# Start fine-tuning; the recorded run ended at global_step=290 after 2.0 epochs.
trainer.train()



Step,Training Loss,Validation Loss


TrainOutput(global_step=290, training_loss=5.054187853582974, metrics={'train_runtime': 1337.7541, 'train_samples_per_second': 6.907, 'train_steps_per_second': 0.217, 'total_flos': 7.768117983992141e+17, 'train_loss': 5.054187853582974, 'epoch': 2.0})

In [None]:
# Upload the training output to the Hugging Face Hub; the recorded run uploaded
# pytorch_model.bin and training_args.bin.
trainer.push_to_hub()

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.54k [00:00<?, ?B/s]

'https://huggingface.co/HamdanXI/wav2vec2ctc-base-lj-speech-DifferentStructure/tree/main/'