In [1]:
import pandas as pd
import os
import re
import tqdm
import librosa
from datasets import Dataset, load_metric
import json
import IPython.display as ipd
import numpy as np
import random
import torch
from src.data_util import DataCollatorCTCWithPadding
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2CTCTokenizer, Wav2Vec2ForCTC
from transformers import TrainingArguments, Trainer

root_dir = '/data/users/akrishnan/multilingual_asr/corpora/l2arctic'
speaker_info = pd.read_csv(f'{root_dir}/speaker_info.csv',sep='|')
speaker_info.columns = ['speaker','gender','accent','wav_files','annotations']
#SUBSAMPLING MODEL
speaker_info = speaker_info[speaker_info["speaker"].isin(['YBAA','ZHAA','LXC','BWC','HQTV','THV','HJK','YKWK'])]


speaker_files = {}
arctic_data = pd.DataFrame(['path','sentence','name','gender','accent','speaker'])
data_list=[]
for speaker in tqdm.tqdm(speaker_info.itertuples(index=False),total=len(speaker_info)):
    wav_transcript_dict = {}
    for wav in tqdm.tqdm_notebook(os.listdir(f'{root_dir}/{speaker.speaker}/wav/')):
        #Getting full paths for the wav transcript pairs
        wav_path = f"{root_dir}/{speaker.speaker}/wav/{wav}"
        transcript_path = f"{root_dir}/{speaker.speaker}/transcript/{wav.replace('.wav','.txt')}"
        
        #Reading transcripts and pairs. Note the forced resampling in librosa.load()
        array,sampling_rate=librosa.load(wav_path,sr=16000)
        transcript=open(transcript_path).read()
        
        data_dict={
            'audio': {'array' : array, 'sampling_rate':sampling_rate},
            'sentence' : transcript,
            'path' : wav,
            'gender' : speaker.gender,
            'accent' : speaker.accent,
            'speaker': speaker.speaker}
        
        data_list.append(data_dict)
    break
df=pd.DataFrame(data_list)
#Making Huggingface dataset
l2_data=Dataset.from_pandas(df)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for wav in tqdm.tqdm_notebook(os.listdir(f'{root_dir}/{speaker.speaker}/wav/')):


  0%|          | 0/1130 [00:00<?, ?it/s]

  0%|                                                                                                                                                                                 | 0/8 [03:06<?, ?it/s]


In [6]:
#Listening to a sample for testing
rand_int = 339
print(l2_data[rand_int]['sentence'])
ipd.Audio(data=l2_data[rand_int]["audio"]["array"], autoplay=True, rate=16000)

why doggone you all shake again


# Text Preprocessing

### Cleaning

In [2]:

chars_to_remove_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\']'

def remove_special_characters(batch):
    batch["sentence"] = re.sub(chars_to_remove_regex, '', batch["sentence"]).lower()
    return batch
l2_data = l2_data.map(remove_special_characters)

0ex [00:00, ?ex/s]

### Vocab Extraction

In [3]:
def extract_all_chars(batch):
  all_text = " ".join(batch["sentence"])
  vocab = list(set(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}
vocab = l2_data.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=l2_data.column_names)
vocab_dict = {v: k for k, v in enumerate(sorted(vocab['vocab'][0]))}

#Accounting for blanks and unknowns
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

#Saving into file
with open(f'{root_dir}/vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

  0%|          | 0/1 [00:00<?, ?ba/s]

# Setting up audio processors and tokenizers

In [4]:
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("./", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)


processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Pusing the audio and transcript through wav2vec tokenizers and processors

In [9]:
def prepare_dataset(batch):
    audio = batch["audio"]

    # batched output is "un-batched"
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])

    with processor.as_target_processor():
        batch["labels"] = processor(batch["sentence"]).input_ids
    return batch
l2_data = l2_data.map(prepare_dataset, remove_columns=['audio','sentence'])

0ex [00:00, ?ex/s]



# Defining Train Val Test Split, Setting up dataloaders and data collators

In [25]:
indices = np.arange(len(l2_data))
np.random.shuffle(indices)
train_indices,val_indices,test_indices = np.split(indices,[int(0.7*len(indices)),int(0.8*len(indices))])

BATCH_SIZE=8

data_collator = DataCollatorCTCWithPadding(processor = processor, padding=True)


# training_dataset = torch.utils.data.DataLoader(torch.utils.data.Subset(l2_data, train_indices), batch_size=BATCH_SIZE, shuffle=False,collate_fn=data_collator)

# validation_dataset= torch.utils.data.DataLoader(torch.utils.data.Subset(l2_data, val_indices), batch_size=BATCH_SIZE, shuffle=False,collate_fn=data_collator)

# testing_dataset=torch.utils.data.DataLoader(torch.utils.data.Subset(l2_data, test_indices), batch_size=BATCH_SIZE, shuffle=False,collate_fn=data_collator)

training_dataset = torch.utils.data.Subset(l2_data, train_indices)

validation_dataset = torch.utils.data.Subset(l2_data, val_indices)

testing_dataset = torch.utils.data.Subset(l2_data, test_indices)


# Setting up evaluation Metrics

In [27]:
wer_metric = load_metric("wer")

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

# Loading Model, setting up Training Arguments

In [28]:
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-xls-r-300m",
    attention_dropout=0.0,
    hidden_dropout=0.0,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.0,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
)
model.freeze_feature_extractor()

comet_ml is installed but `COMET_API_KEY` is not set.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments, Trainer


training_args = TrainingArguments(
  output_dir=repo_name,
  group_by_length=True,
  per_device_train_batch_size=BATCH_SIZE,
  gradient_accumulation_steps=2,
  evaluation_strategy="steps",
  num_train_epochs=30,
  gradient_checkpointing=True,
  fp16=True,
  save_steps=400,
  eval_steps=400,
  logging_steps=400,
  learning_rate=3e-4,
  warmup_steps=500,
  save_total_limit=2,
  push_to_hub=False,
)


trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=training_dataset,
    eval_dataset=validation_dataset,
    tokenizer=processor.feature_extractor,
)