In [None]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset
cache = '/srv/scratch/z5313567/thesis/cache'
# load model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-small", cache_dir = cache)
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small", cache_dir = cache)
model.config.forced_decoder_ids = None

# load dummy dataset and read audio files
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", cache_dir = cache)
sample = ds[0]["audio"]
input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features 

# generate token ids
predicted_ids = model.generate(input_features)
# decode token ids to text
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)

transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)


In [2]:
# For accessing date and time
from datetime import date
from datetime import datetime
now = datetime.now()
# Print out dd/mm/YY H:M:S
# ------------------------------------------
dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
print("Started:", dt_string)
# ------------------------------------------ 
print("\n------> IMPORTING PACKAGES.... ---------------------------------------\n")
print("-->Importing datasets...")
# Import datasets and evaluation metric
from datasets import load_dataset, load_metric, ClassLabel
# Convert pandas dataframe to DatasetDict
from datasets import Dataset
# Generate word alignment files for OOV checking
print("-->Importing jiwer...")
import jiwer
# Generate random numbers
print("-->Importing random...")
import random
# Manipulate dataframes and numbers
print("-->Importing pandas & numpy...")
import pandas as pd
import numpy as np
# Use regex
print("-->Importing re...")
import re
# Read, Write, Open json files
print("-->Importing json...")
import json
# Use models and tokenizers
print("-->Importing Wav2VecCTC...")
from transformers import Wav2Vec2CTCTokenizer
#from transformers import AutoTokenizer
from transformers import HubertForCTC
#from transformers import AutoModelForCTC
from transformers import Wav2Vec2FeatureExtractor
#from transformers import AutoFeatureExtractor
from transformers import Wav2Vec2Processor
#from transformers import AutoProcessor
# Loading audio files
print("-->Importing soundfile...")
import soundfile as sf
print("-->Importing librosa...")
import librosa
# For training
print("-->Importing torch, dataclasses & typing...")
import torch
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
print("-->Importing from transformers for training...")
from transformers import TrainingArguments
from transformers import Trainer
print("-->Importing pyarrow for loading dataset...")
import pyarrow as pa
import pyarrow.csv as csv
print("-->SUCCESS! All packages imported.")

Started: 14/10/2023 16:54:43

------> IMPORTING PACKAGES.... ---------------------------------------

-->Importing datasets...
-->Importing jiwer...
-->Importing random...
-->Importing pandas & numpy...
-->Importing re...
-->Importing json...
-->Importing Wav2VecCTC...
-->Importing soundfile...
-->Importing librosa...
-->Importing torch, dataclasses & typing...
-->Importing from transformers for training...
-->Importing pyarrow for loading dataset...
-->SUCCESS! All packages imported.


In [1]:
# ------------------------------------------
#      Install packages if needed
# ------------------------------------------
#pip install datasets==1.8.0
#pip install transformers
#pip install soundfile
#pip install jiwer

# ------------------------------------------
#       Import required packages
# ------------------------------------------
# For printing filepath
import os
# ------------------------------------------
# ------------------------------------------
# For accessing date and time
from datetime import date
from datetime import datetime
now = datetime.now()
# Print out dd/mm/YY H:M:S
# ------------------------------------------
dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
print("Started:", dt_string)
# ------------------------------------------ 
print("\n------> IMPORTING PACKAGES.... ---------------------------------------\n")
print("-->Importing datasets...")
# Import datasets and evaluation metric
from datasets import load_dataset, load_metric, ClassLabel
# Convert pandas dataframe to DatasetDict
from datasets import Dataset
# Generate random numbers
print("-->Importing random...")
import random
# Manipulate dataframes and numbers
print("-->Importing pandas & numpy...")
import pandas as pd
import numpy as np
# Use regex
print("-->Importing re...")
import re
# Read, Write, Open json files
print("-->Importing json...")
import json
# Use models and tokenizers
print("-->Importing Wav2VecCTC...")
from transformers import WhisperTokenizer
from transformers import WhisperForConditionalGeneration
from transformers import WhisperFeatureExtractor
from transformers import WhisperProcessor
# Loading audio files
print("-->Importing soundfile...")
import soundfile as sf
print("-->Importing librosa...")
import librosa
# For training
print("-->Importing torch, dataclasses & typing...")
import torch
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
print("-->Importing from transformers for training...")
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer
print("-->Importing pyarrow for loading dataset...")
import pyarrow as pa
import pyarrow.csv as csv
print("-->SUCCESS! All packages imported.")

# ------------------------------------------
#      Setting experiment arguments
# ------------------------------------------
print("\n------> EXPERIMENT ARGUMENTS ----------------------------------------- \n")

base_fp = '/srv/scratch/z5313567/thesis/'
print('base_fp:', base_fp)

model = 'whisper'
print('model:', model)

dataset_name = 'AusKidTalk'
print('dataset_name:', dataset_name)

experiment_id = 'AusKidTalk_scripted_spontaneous_combined_finetune_20230925'
print('experiment_id:', experiment_id)

cache_name = 'AusKidTalk-finetune'
print('cache_name:', cache_name)


# Perform Training (True/False)
# If false, this will go straight to model evaluation 
training = True
print("training:", training)

# Resume training from/ use checkpoint (True/False)
# Set to True for:
# 1) resuming from a saved checkpoint if training stopped midway through
# 2) for using an existing finetuned model for evaluation 
# If 2), then must also set eval_pretrained = True
use_checkpoint = False
print("use_checkpoint:", use_checkpoint)

# Set checkpoint if resuming from/using checkpoint
#checkpoint = "/srv/scratch/z5160268/2020_TasteofResearch/kaldi/egs/renee_thesis/s5/myST-OGI_local/20210819-OGI-myST-120h"
checkpoint = "/srv/scratch/z5313567/thesis/wav2vec2/model/Renee_myST_OGI_TLT/20211016-base-myST-OGI-TLT17/checkpoint-20000"
if use_checkpoint:
    print("checkpoint:", checkpoint)

# Use a pretrained tokenizer (True/False)
#     True: Use existing tokenizer (if custom dataset has same vocab)
#     False: Use custom tokenizer (if custom dataset has different vocab)
use_pretrained_tokenizer = False
print("use_pretrained_tokenizer:", use_pretrained_tokenizer)

# Set tokenizer
pretrained_tokenizer = "facebook/wav2vec2-base-960h"
if use_pretrained_tokenizer:
    print("pretrained_tokenizer:", pretrained_tokenizer)

# Evaluate existing model instead of newly trained model (True/False)
#     True: use the model in the filepath set by 'eval_model' for eval
#     False: use the model trained from this script for eval
eval_pretrained = False
print("eval_pretrained:", eval_pretrained)

# Set existing model to evaluate, if evaluating on existing model
eval_model = checkpoint
if eval_pretrained:
    print("eval_model:", eval_model)

# Baseline model for evaluating baseline metric
# This model will be evaluated at the end for the baseline WER
baseline_model = "openai/whisper-small"
print("baseline_model:", baseline_model)

# Evalulate the baseline model or not (True/False)
#   True: evaluate baseline model on test set
#   False: do not evaluate baseline model on test set
eval_baseline = False
print("eval_baseline:", eval_baseline)


print("\n------> MODEL ARGUMENTS... -------------------------------------------\n")
# For setting model = Wav2Vec2ForCTC.from_pretrained()

set_hidden_dropout = 0.1                    # Default = 0.1
print("hidden_dropout:", set_hidden_dropout)
set_activation_dropout = 0.1                # Default = 0.1
print("activation_dropout:", set_activation_dropout)
set_attention_dropout = 0.1                 # Default = 0.1
print("attention_dropoutput:", set_attention_dropout)
set_feat_proj_dropout = 0.0                 # Default = 0.1
print("feat_proj_dropout:", set_feat_proj_dropout)
set_layerdrop = 0.01                         # Default = 0.1
print("layerdrop:", set_layerdrop)
set_mask_time_prob = 0.075                  # Default = 0.05
print("mask_time_prob:", set_mask_time_prob)
set_mask_time_length = 10                   # Default = 10
print("mask_time_length:", set_mask_time_length)
set_ctc_loss_reduction = "mean"             # Default = "sum"
print("ctc_loss_reduction:", set_ctc_loss_reduction)
set_ctc_zero_infinity = True               # Default = False
print("ctc_zero_infinity:", set_ctc_zero_infinity)
set_gradient_checkpointing = True           # Default = False
print("gradient_checkpointing:", set_gradient_checkpointing)

print("\n------> TRAINING ARGUMENTS... ----------------------------------------\n")
# For setting training_args = TrainingArguments()

set_evaluation_strategy = "steps"           # Default = "no"
print("evaluation strategy:", set_evaluation_strategy)
set_per_device_train_batch_size = 8         # Default = 8
print("per_device_train_batch_size:", set_per_device_train_batch_size)
set_gradient_accumulation_steps = 1         # Default = 1
print("gradient_accumulation_steps:", set_gradient_accumulation_steps)
set_learning_rate = 0.00005                 # Default = 0.00005
print("learning_rate:", set_learning_rate)
set_weight_decay = 0.01                     # Default = 0
print("weight_decay:", set_weight_decay)
set_adam_beta1 = 0.9                        # Default = 0.9
print("adam_beta1:", set_adam_beta1)
set_adam_beta2 = 0.98                       # Default = 0.999
print("adam_beta2:", set_adam_beta2)
set_adam_epsilon = 0.00000001               # Default = 0.00000001
print("adam_epsilon:", set_adam_epsilon)
set_num_train_epochs = 590                   # Default = 3.0
print("num_train_epochs:", set_num_train_epochs)
set_max_steps = 13000                          # Default = -1, overrides epochs
print("max_steps:", set_max_steps)
set_lr_scheduler_type = "linear"            # Default = "linear"
print("lr_scheduler_type:", set_lr_scheduler_type )
set_warmup_ratio = 0.1                      # Default = 0.0
print("warmup_ratio:", set_warmup_ratio)
set_logging_strategy = "steps"              # Default = "steps"
print("logging_strategy:", set_logging_strategy)
set_logging_steps = 1000                      # Default = 500
print("logging_steps:", set_logging_steps)
set_save_strategy = "steps"                 # Default = "steps"
print("save_strategy:", set_save_strategy)
set_save_steps = 1000                         # Default = 500
print("save_steps:", set_save_steps)
set_save_total_limit = 2                   # Optional                 
print("save_total_limit:", set_save_total_limit)
set_fp16 = True                             # Default = False
print("fp16:", set_fp16)
set_eval_steps = 1000                         # Optional
print("eval_steps:", set_eval_steps)
set_load_best_model_at_end = True           # Default = False
print("load_best_model_at_end:", set_load_best_model_at_end)
set_metric_for_best_model = "wer"           # Optional
print("metric_for_best_model:", set_metric_for_best_model)
set_greater_is_better = False               # Optional
print("greater_is_better:", set_greater_is_better)
set_group_by_length = True                  # Default = False
print("group_by_length:", set_group_by_length)

# ------------------------------------------
#        Generating file paths
# ------------------------------------------
print("\n------> GENERATING FILEPATHS... --------------------------------------\n")
# Path to dataframe csv for train dataset
# data_train_fp = base_fp + train_name + "_local/" + train_filename + ".csv"
data_train_fp = '/srv/scratch/z5313567/thesis/AusKidTalk_local/AusKidTalk_train_only_transcription_filepath.csv'
print("--> data_train_fp:", data_train_fp)

# Path to dataframe csv for test dataset
data_dev_fp = '/srv/scratch/z5313567/thesis/AusKidTalk_local/AusKidTalk_dev_only_transcription_filepath.csv'
print("--> data_dev_fp:", data_dev_fp)

# Path to dataframe csv for test dataset
#data_test_fp = base_fp + evaluation_name + "_local/" + evaluation_filename + ".csv"
data_test_fp = '/srv/scratch/z5313567/thesis/AusKidTalk_local/AusKidTalk_test_only_transcription_filepath.csv'
print("--> data_test_fp:", data_test_fp)

# Dataframe file 
# |-----------|---------------------|----------|---------|
# | file path | transcription_clean | duration | spkr_id |
# |-----------|---------------------|----------|---------|
# |   ...     |      ...            |  ..secs  | ......  |
# |-----------|---------------------|----------|---------|
# NOTE: The spkr_id column may need to be removed beforehand if
#       there appears to be a mixture between numerical and string ID's
#       due to this issue: https://github.com/apache/arrow/issues/4168
#       when calling load_dataset()

# Path to datasets cache
# data_cache_fp = base_cache_fp + datasetdict_id
data_cache_fp = '/srv/scratch/chacmod/.cache/huggingface/datasets/' + cache_name
print("--> data_cache_fp:", data_cache_fp)

# Path to save vocab.json
# vocab_fp = base_fp + train_name + "_local/vocab_" + experiment_id + ".json"
vocab_fp =  base_fp + model + '/vocab/' + dataset_name + '/' + experiment_id + '_vocab.json'
print("--> vocab_fp:", vocab_fp)

# Path to save model output
#model_fp = base_fp + train_name + "_local/" + experiment_id
model_fp = base_fp + model + '/model/' + dataset_name + '/' + experiment_id
print("--> model_fp:", model_fp)

# Path to save results output
# baseline_results_fp = base_fp + train_name + "_local/" + experiment_id + "_baseline_results.csv" 
baseline_results_fp = base_fp + model + '/baseline_result/' + dataset_name + '/'  + experiment_id + '_baseline_result.csv'
print("--> baseline_results_fp:", baseline_results_fp)

# finetuned_results_fp = base_fp + train_name + "_local/" + experiment_id + "_finetuned_results.csv"
finetuned_results_fp = base_fp + model + '/finetuned_result/' + dataset_name + '/'  + experiment_id + '_finetuned_result.csv'
print("--> finetuned_results_fp:", finetuned_results_fp)

# Pre-trained checkpoint model
# For 1) Fine-tuning or
#     2) resuming training from pre-trained model
# If 1) must set use_checkpoint = False
# If 2)must set use_checkpoint = True
# Default model to fine-tune is facebook's model
pretrained_mod = "openai/whisper-small"
if use_checkpoint:
    pretrained_mod = checkpoint
print("--> pretrained_mod:", pretrained_mod)
# Path to pre-trained tokenizer
# If use_pretrained_tokenizer = True
if use_pretrained_tokenizer:
    print("--> pretrained_tokenizer:", pretrained_tokenizer)

# ------------------------------------------
#         Preparing dataset
# ------------------------------------------
# Run the following scripts to prepare data
# 1) Prepare data from kaldi file: 
# /srv/scratch/z5160268/2020_TasteofResearch/kaldi/egs/renee_thesis/s5/wav2vec_exp/data_prep.py
# 3) [Optional] Limit the files to certain duration:
# /srv/scratch/z5160268/2020_TasteofResearch/kaldi/egs/renee_thesis/s5/wav2vec_projects/data_getShortWavs.py
# 2) Split data into train and test:
# /srv/scratch/z5160268/2020_TasteofResearch/kaldi/egs/renee_thesis/s5/wav2vec_projects/data_split.py

print("\n------> PREPARING DATASET... ------------------------------------\n")
# Read the existing csv saved dataframes and
# load as a DatasetDict 
data = load_dataset('csv', 
                    data_files={'train': data_train_fp,
                                'dev' : data_dev_fp,
                                'test': data_test_fp},
                    cache_dir=data_cache_fp)
# Remove the "duration" and "spkr_id" column
#data = data.remove_columns(["duration", "spkr_id"])
#data = data.remove_columns(["duration"])
print("--> dataset...")
print(data)
# Display some random samples of the dataset
print("--> Printing some random samples...")
def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Picking more elements than in dataset"
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    df = pd.DataFrame(dataset[picks])
    print(df)
show_random_elements(data["train"], num_examples=5)
print("SUCCESS: Prepared dataset.")
# ------------------------------------------
#       Processing transcription
# ------------------------------------------
# Create vocab.json
# Extracting all distinct letters of train and test set
# and building vocab from this set of letters
print("\n------> PROCESSING TRANSCRIPTION... ---------------------------------------\n")
# Mapping function that concatenates all transcriptions
# into one long transcription and then transforms the
# string into a set of chars. Set batched=True to the 
# map(...) function so that the mapping function has access
# to all transcriptions at once.

#chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"]'

def process_transcription(batch):
    #batch["transcription_clean"] = re.sub(chars_to_ignore_regex, '', batch["transcription_clean"]).upper()
    batch["transcription_clean"] = batch["transcription_clean"].upper()
    batch["transcription_clean"] = batch["transcription_clean"].replace("<UNK>", "<unk>")
    return batch

data = data.map(process_transcription)

def extract_all_chars(batch):
    all_text = " ".join(batch["transcription_clean"])
    vocab = list(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}
    
if not use_pretrained_tokenizer:
    print("--> Creating map(...) function for vocab...")
    vocabs = data.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=data.column_names["train"])
    # Create union of all distinct letters in train and test set
    # and convert resulting list into enumerated dictionary
    # Vocab includes a-z, ' , space, UNK, PAD
    vocab_list = list(set(vocabs["train"]["vocab"][0]) | set(vocabs["test"]["vocab"][0]))
    vocab_dict = {v: k for k, v in enumerate(vocab_list)}
    print("--> Vocab len:", len(vocab_dict), "\n", vocab_dict)
    # Give space " " a visible character " | "
    # Include "unknown" [UNK] token for dealing with characters
    # not encountered in training.
    # Add padding token to corresponds to CTC's "blank token".
    vocab_dict["|"] = vocab_dict[" "]
    del vocab_dict[" "]
    vocab_dict["[UNK]"] = len(vocab_dict)
    vocab_dict["[PAD]"] = len(vocab_dict)
    print("--> Vocab len:", len(vocab_dict), "\n", vocab_dict)
    # Save vocab as a json file
    with open(vocab_fp, 'w') as vocab_file:
        json.dump(vocab_dict, vocab_file)
    print("SUCCESS: Created vocabulary file at", vocab_fp)
# Use json file to instantiate an object of the 
# Wav2VecCTCTokenziser class if not using pretrained tokenizer
if use_pretrained_tokenizer:
    #tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(pretrained_tokenizer)
    tokenizer = WhisperTokenizer.from_pretrained(pretrained_tokenizer)
else:
    #tokenizer = Wav2Vec2CTCTokenizer(vocab_fp, unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
    tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="english", task="transcribe")
#tokenizer = save_pretrained(model_fp)
# ------------------------------------------
#    Create Wav2Vec2 Feature Extractor
# ------------------------------------------
print("\n------> CREATING WAV2VEC2 FEATURE EXTRACTOR... -----------------------\n")
# Instantiate a Wav2Vec2 feature extractor:
# - feature_size: set to 1 because model was trained on raw speech signal
# - sampling_rate: sampling rate the model is trained on
# - padding_value: for batched inference, shorter inputs are padded
# - do_normalize: whether input should be zero-mean-unit-variance
#   normalised or not. Usually, speech models perform better when true.
# - return_attention_mask: set to false for Wav2Vec2, but true for
#   fine-tuning large-lv60
#feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
# Feature extractor and tokenizer wrapped into a single
# Wav2Vec2Processor class so we only need a model and processor object
# processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="english", task="transcribe")
# Save to re-use the just created processor and the fine-tuned model
processor.save_pretrained(model_fp)
print("SUCCESS: Created feature extractor.")

# ------------------------------------------
#             Pre-process Data
# ------------------------------------------
print("\n------> PRE-PROCESSING DATA... ----------------------------------------- \n")
# Audio files are stored as .wav format
# We want to store both audio values and sampling rate
# in the dataset. 
# We write a map(...) function accordingly.

# def speech_file_to_array_fn(batch):
#    speech_array, sampling_rate = sf.read(batch["filepath"])
#    batch["speech"] = speech_array
#    batch["sampling_rate"] = sampling_rate
#    batch["target_text"] = batch["transcription_clean"]
#    return batch
def speech_file_to_array_fn(batch):
    #speech_array, sampling_rate = sf.read(batch["filepath"])
    speech_array, sampling_rate = librosa.load(batch['filepath'], sr=feature_extractor.sampling_rate)
    batch["speech"] = speech_array
    batch["sampling_rate"] = sampling_rate
    batch["target_text"] = batch["transcription_clean"]
    return batch

data = data.map(speech_file_to_array_fn, remove_columns=data.column_names["train"], num_proc=4)
# Check a few rows of data to verify data properly loaded
print("--> Verifying data with a random sample...")
rand_int = random.randint(0, len(data["train"])-1)
print("Target text:", data["train"][rand_int]["target_text"])
print("Input array shape:", np.asarray(data["train"][rand_int]["speech"]).shape)
print("Sampling rate:", data["train"][rand_int]["sampling_rate"])
# Process dataset to the format expected by model for training
# Using map(...)
# 1) Check all data samples have same sampling rate (16kHz)
# 2) Extract input_values from loaded audio file.
#    This only involves normalisation but could also correspond
#    to extracting log-mel features
# 3) Encode the transcriptions to label ids

# def prepare_dataset(batch):
    # # check that all files have the correct sampling rate
    # assert (
        # len(set(batch["sampling_rate"])) == 1
    # ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

    # batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values

    # with processor.as_target_processor():
        # batch["labels"] = processor(batch["target_text"]).input_ids
    # return batch
def prepare_dataset(batch):
    # check that all files have the correct sampling rate
    assert (
        len(set(batch["sampling_rate"])) == 1
    ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."
    
    # compute log-Mel input features from input audio array
    batch["input_features"] = feature_extractor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_features
    
    # encode target text to label ids
    batch["labels"] = tokenizer(batch["target_text"]).input_ids
    return batch
data_prepared = data.map(prepare_dataset, remove_columns=data.column_names["train"], batch_size=8, num_proc=4, batched=True)

print("SUCCESS: Data ready for training and evaluation.")

Started: 26/09/2023 15:53:52

------> IMPORTING PACKAGES.... ---------------------------------------

-->Importing datasets...


  from .autonotebook import tqdm as notebook_tqdm


-->Importing random...
-->Importing pandas & numpy...
-->Importing re...
-->Importing json...
-->Importing Wav2VecCTC...
-->Importing soundfile...
-->Importing librosa...
-->Importing torch, dataclasses & typing...
-->Importing from transformers for training...
-->Importing pyarrow for loading dataset...
-->SUCCESS! All packages imported.

------> EXPERIMENT ARGUMENTS ----------------------------------------- 

base_fp: /srv/scratch/z5313567/thesis/
model: whisper
dataset_name: AusKidTalk
experiment_id: AusKidTalk_scripted_spontaneous_combined_finetune_20230925
cache_name: AusKidTalk-finetune
training: True
use_checkpoint: False
use_pretrained_tokenizer: False
eval_pretrained: False
baseline_model: openai/whisper-small
eval_baseline: False

------> MODEL ARGUMENTS... -------------------------------------------

hidden_dropout: 0.1
activation_dropout: 0.1
attention_dropoutput: 0.1
feat_proj_dropout: 0.0
layerdrop: 0.01
mask_time_prob: 0.075
mask_time_length: 10
ctc_loss_reduction: mean


Found cached dataset csv (/srv/scratch/chacmod/.cache/huggingface/datasets/AusKidTalk-finetune/csv/default-14c1ba9ea46bfb6b/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
100%|██████████| 3/3 [00:00<00:00, 521.70it/s]
Loading cached processed dataset at /srv/scratch/chacmod/.cache/huggingface/datasets/AusKidTalk-finetune/csv/default-14c1ba9ea46bfb6b/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-5c4139f027647cd7.arrow
Loading cached processed dataset at /srv/scratch/chacmod/.cache/huggingface/datasets/AusKidTalk-finetune/csv/default-14c1ba9ea46bfb6b/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-31c051c82af50429.arrow
Loading cached processed dataset at /srv/scratch/chacmod/.cache/huggingface/datasets/AusKidTalk-finetune/csv/default-14c1ba9ea46bfb6b/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-c6b5bfa1a550cc22.arrow


--> dataset...
DatasetDict({
    train: Dataset({
        features: ['filepath', 'transcription_clean'],
        num_rows: 8093
    })
    dev: Dataset({
        features: ['filepath', 'transcription_clean'],
        num_rows: 1744
    })
    test: Dataset({
        features: ['filepath', 'transcription_clean'],
        num_rows: 1774
    })
})
--> Printing some random samples...
                                            filepath transcription_clean
0  /srv/scratch/chacmod/auskidtalk_audio/218_task...             breathe
1  /srv/scratch/chacmod/auskidtalk_audio/255_task...          watermelon
2  /srv/scratch/chacmod/auskidtalk_audio/191_task...              banana
3  /srv/scratch/chacmod/auskidtalk_audio/203_task...               beard
4  /srv/scratch/chacmod/auskidtalk_audio/304_task...                cart
SUCCESS: Prepared dataset.

------> PROCESSING TRANSCRIPTION... ---------------------------------------

--> Creating map(...) function for vocab...


                                                    

--> Vocab len: 26 
 {' ': 0, 'H': 1, 'D': 2, 'M': 3, 'U': 4, 'E': 5, 'G': 6, 'L': 7, 'B': 8, 'C': 9, 'A': 10, 'P': 11, 'T': 12, 'F': 13, 'R': 14, 'J': 15, 'S': 16, 'N': 17, 'W': 18, 'O': 19, 'Y': 20, 'V': 21, 'X': 22, 'K': 23, 'I': 24, 'Z': 25}
--> Vocab len: 28 
 {'H': 1, 'D': 2, 'M': 3, 'U': 4, 'E': 5, 'G': 6, 'L': 7, 'B': 8, 'C': 9, 'A': 10, 'P': 11, 'T': 12, 'F': 13, 'R': 14, 'J': 15, 'S': 16, 'N': 17, 'W': 18, 'O': 19, 'Y': 20, 'V': 21, 'X': 22, 'K': 23, 'I': 24, 'Z': 25, '|': 0, '[UNK]': 26, '[PAD]': 27}
SUCCESS: Created vocabulary file at /srv/scratch/z5313567/thesis/whisper/vocab/AusKidTalk/AusKidTalk_scripted_spontaneous_combined_finetune_20230925_vocab.json

------> CREATING WAV2VEC2 FEATURE EXTRACTOR... -----------------------



Loading cached processed dataset at /srv/scratch/chacmod/.cache/huggingface/datasets/AusKidTalk-finetune/csv/default-14c1ba9ea46bfb6b/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-1264cc2d36ff1043_*_of_00004.arrow
Loading cached processed dataset at /srv/scratch/chacmod/.cache/huggingface/datasets/AusKidTalk-finetune/csv/default-14c1ba9ea46bfb6b/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-96623961a2e346a0_*_of_00004.arrow
Loading cached processed dataset at /srv/scratch/chacmod/.cache/huggingface/datasets/AusKidTalk-finetune/csv/default-14c1ba9ea46bfb6b/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-3c3add9f60df9f04_*_of_00004.arrow


SUCCESS: Created feature extractor.

------> PRE-PROCESSING DATA... ----------------------------------------- 

--> Verifying data with a random sample...
Target text: DUCK
Input array shape: (7200,)
Sampling rate: 16000


Loading cached processed dataset at /srv/scratch/chacmod/.cache/huggingface/datasets/AusKidTalk-finetune/csv/default-14c1ba9ea46bfb6b/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-fd604e6d65394860_*_of_00004.arrow
Loading cached processed dataset at /srv/scratch/chacmod/.cache/huggingface/datasets/AusKidTalk-finetune/csv/default-14c1ba9ea46bfb6b/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-4095edb5699b8b71_*_of_00004.arrow
Loading cached processed dataset at /srv/scratch/chacmod/.cache/huggingface/datasets/AusKidTalk-finetune/csv/default-14c1ba9ea46bfb6b/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-5b74132e5d537859_*_of_00004.arrow


SUCCESS: Data ready for training and evaluation.


In [4]:
len(data_prepared['train'])

8093

In [None]:
data_prepared['train'][0]

In [1]:
from transformers import WhisperForConditionalGeneration
pretrained_mod = "openai/whisper-tiny"
model_cache_fp = '/srv/scratch/z5313567/thesis/cache'
model = WhisperForConditionalGeneration.from_pretrained(pretrained_mod, cache_dir=model_cache_fp)

for param in model.base_model.parameters():
    param.requires_grad = False
print("SUCCESS: Pre-trained checkpoint loaded.")

model.base_model

  from .autonotebook import tqdm as notebook_tqdm


SUCCESS: Pre-trained checkpoint loaded.


WhisperModel(
  (encoder): WhisperEncoder(
    (conv1): Conv1d(80, 384, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(384, 384, kernel_size=(3,), stride=(2,), padding=(1,))
    (embed_positions): Embedding(1500, 384)
    (layers): ModuleList(
      (0-3): 4 x WhisperEncoderLayer(
        (self_attn): WhisperAttention(
          (k_proj): Linear(in_features=384, out_features=384, bias=False)
          (v_proj): Linear(in_features=384, out_features=384, bias=True)
          (q_proj): Linear(in_features=384, out_features=384, bias=True)
          (out_proj): Linear(in_features=384, out_features=384, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (activation_fn): GELUActivation()
        (fc1): Linear(in_features=384, out_features=1536, bias=True)
        (fc2): Linear(in_features=1536, out_features=384, bias=True)
        (final_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      )


In [2]:
encoder = model.base_model.encoder
len(encoder.layers)

4

In [8]:
from transformers import Wav2Vec2ForCTC
pretrained_mod = "facebook/wav2vec2-base-960h"
model_cache_fp = '/srv/scratch/z5313567/thesis/cache'
model = Wav2Vec2ForCTC.from_pretrained(pretrained_mod, cache_dir=model_cache_fp)

for param in model.base_model.parameters():
    param.requires_grad = False
print("SUCCESS: Pre-trained checkpoint loaded.")

model.base_model.parameters

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


SUCCESS: Pre-trained checkpoint loaded.


<bound method Module.parameters of Wav2Vec2Model(
  (feature_extractor): Wav2Vec2FeatureEncoder(
    (conv_layers): ModuleList(
      (0): Wav2Vec2GroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): Wav2Vec2FeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Wav2Vec2Encoder(
    (pos_conv_embed): 

In [1]:
from transformers import HubertForCTC
pretrained_mod = "facebook/hubert-large-ls960-ft"
model_cache_fp = '/srv/scratch/z5313567/thesis/cache'
model = HubertForCTC.from_pretrained(pretrained_mod, cache_dir=model_cache_fp)

#for param in model.base_model.parameters():
#    param.requires_grad = False
#print("SUCCESS: Pre-trained checkpoint loaded.")
num_freeze_lower_layer = 3
print(f"Freezing lower {num_freeze_lower_layer} transformer layers")
for n in range(num_freeze_lower_layer):
    for param in model.base_model.encoder.layers[n].parameters():
         param.requires_grad = False
print("SUCCESS: Pre-trained checkpoint loaded.")



  from .autonotebook import tqdm as notebook_tqdm


Freezing lower 3 transformer layers
SUCCESS: Pre-trained checkpoint loaded.


In [42]:
model_cache_fp = '/srv/scratch/z5313567/thesis/cache'
from transformers import WhisperProcessor
pretrained_tokenizer = "openai/whisper-medium"
#pretrained_tokenizer = '/srv/scratch/z5313567/thesis/whisper/model/CU/whisper_medium_finetune_CU_lowercase_20231017'

In [43]:
processor = WhisperProcessor.from_pretrained(pretrained_tokenizer, language="english", task="transcribe", cache_dir=model_cache_fp)

In [44]:
processor

WhisperProcessor:
- feature_extractor: WhisperFeatureExtractor {
  "chunk_length": 30,
  "feature_extractor_type": "WhisperFeatureExtractor",
  "feature_size": 80,
  "hop_length": 160,
  "n_fft": 400,
  "n_samples": 480000,
  "nb_max_frames": 3000,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "WhisperProcessor",
  "return_attention_mask": false,
  "sampling_rate": 16000
}

- tokenizer: WhisperTokenizer(name_or_path='openai/whisper-medium', vocab_size=50258, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|endoftext|>', '<|startoftrans

In [41]:
model_cache_fp = '/srv/scratch/z5313567/thesis/cache'
from transformers import WhisperProcessor
#pretrained_tokenizer = "openai/whisper-tiny"
pretrained_tokenizer = '/srv/scratch/z5313567/thesis/whisper/model/CU/whisper_medium_finetune_CU_lowercase_20231017'
processor = WhisperProcessor.from_pretrained(pretrained_tokenizer, language="english", task="transcribe", cache_dir=model_cache_fp)
processor

WhisperProcessor:
- feature_extractor: WhisperFeatureExtractor {
  "chunk_length": 30,
  "feature_extractor_type": "WhisperFeatureExtractor",
  "feature_size": 80,
  "hop_length": 160,
  "n_fft": 400,
  "n_samples": 480000,
  "nb_max_frames": 3000,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "WhisperProcessor",
  "return_attention_mask": false,
  "sampling_rate": 16000
}

- tokenizer: WhisperTokenizer(name_or_path='/srv/scratch/z5313567/thesis/whisper/model/CU/whisper_medium_finetune_CU_lowercase_20231017', vocab_size=50258, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|endoft

In [48]:
from transformers import WhisperTokenizer
pretrained_tokenizer = "openai/whisper-medium"
tokenizer = WhisperTokenizer.from_pretrained(pretrained_tokenizer, language="english", task="transcribe", cache_dir=model_cache_fp)
tokenizer

WhisperTokenizer(name_or_path='openai/whisper-medium', vocab_size=50258, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|endoftext|>', '<|startoftranscript|>', '<|en|>', '<|zh|>', '<|de|>', '<|es|>', '<|ru|>', '<|ko|>', '<|fr|>', '<|ja|>', '<|pt|>', '<|tr|>', '<|pl|>', '<|ca|>', '<|nl|>', '<|ar|>', '<|sv|>', '<|it|>', '<|id|>', '<|hi|>', '<|fi|>', '<|vi|>', '<|he|>', '<|uk|>', '<|el|>', '<|ms|>', '<|cs|>', '<|ro|>', '<|da|>', '<|hu|>', '<|ta|>', '<|no|>', '<|th|>', '<|ur|>', '<|hr|>', '<|bg|>', '<|lt|>', '<|la|>', '<|mi|>', '<|ml|>', '<|cy|>', '<|sk|>', '<|te|

In [45]:
from transformers import WhisperTokenizer
#pretrained_tokenizer = "openai/whisper-medium"
pretrained_tokenizer = '/srv/scratch/z5313567/thesis/whisper/model/CU/whisper_medium_finetune_CU_lowercase_20231017'

In [46]:

tokenizer = WhisperTokenizer.from_pretrained(pretrained_tokenizer, language="english", task="transcribe", cache_dir=model_cache_fp)

In [47]:
tokenizer

WhisperTokenizer(name_or_path='/srv/scratch/z5313567/thesis/whisper/model/CU/whisper_medium_finetune_CU_lowercase_20231017', vocab_size=50258, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|endoftext|>', '<|startoftranscript|>', '<|en|>', '<|zh|>', '<|de|>', '<|es|>', '<|ru|>', '<|ko|>', '<|fr|>', '<|ja|>', '<|pt|>', '<|tr|>', '<|pl|>', '<|ca|>', '<|nl|>', '<|ar|>', '<|sv|>', '<|it|>', '<|id|>', '<|hi|>', '<|fi|>', '<|vi|>', '<|he|>', '<|uk|>', '<|el|>', '<|ms|>', '<|cs|>', '<|ro|>', '<|da|>', '<|hu|>', '<|ta|>', '<|no|>', '<|th|>', '<|ur|>', '<|hr|>', '<|bg|