<a href="https://colab.research.google.com/github/HamdanXI/nlp_adventure/blob/main/803/trying_w2v2_finetuning_for_speech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets torch torchaudio
!pip install transformers[torch]



In [2]:
from datasets import load_dataset

dataset = load_dataset("HamdanXI/uclass_asr_v3")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
import torchaudio

def speech_file_to_array_fn(batch):
    try:
        # Check if the audio path is available and load the audio file
        audio_path = batch["audio"]["path"]
        if audio_path is not None:
            speech_array, sampling_rate = torchaudio.load(audio_path)
            batch["speech"] = speech_array.squeeze().numpy()
            batch["sampling_rate"] = sampling_rate
            batch["target_text"] = batch["transcript"]
        else:
            # Handle cases where the audio path might be None
            batch["speech"] = None
            batch["sampling_rate"] = None
            batch["target_text"] = None
    except Exception as e:
        # Log the error for debugging
        print(f"Error loading audio file: {e}")
        batch["speech"] = None
        batch["sampling_rate"] = None
        batch["target_text"] = None
    return batch

dataset = dataset.map(speech_file_to_array_fn)

In [4]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")

def prepare_dataset(batch):
    if batch["speech"] is not None:
        input_values = processor(batch["speech"], sampling_rate=batch["sampling_rate"]).input_values
        with processor.as_target_processor():
            labels = processor(batch["target_text"]).input_ids

        batch["input_values"] = input_values
        batch["labels"] = labels
    else:
        # Skip or handle entries without audio data
        batch["input_values"] = None
        batch["labels"] = None
    return batch

# Apply map function but filter out None entries before training
dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names["train"])
filtered_dataset = dataset.filter(lambda x: x["input_values"] is not None and x["labels"] is not None)

In [5]:
from transformers import Wav2Vec2ForCTC, Trainer, TrainingArguments

model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
model.freeze_feature_extractor()  # Optional: improves training speed by freezing the convolutional layers

train_test_split = filtered_dataset['train'].train_test_split(test_size=0.1)


training_args = TrainingArguments(
    output_dir="./results",
    group_by_length=True,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    evaluation_strategy="steps",
    num_train_epochs=3,
    fp16=True,
    save_steps=500,
    eval_steps=500,
    logging_steps=10,
    learning_rate=1e-4,
    warmup_steps=500,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_test_split["train"],
    eval_dataset=train_test_split["test"],
    tokenizer=processor.feature_extractor,
)

Some weights of the model checkpoint at facebook/wav2vec2-large-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You s

In [7]:
class DataCollatorForWav2Vec2:
    def __init__(self, processor):
        self.processor = processor

    def __call__(self, features):
        # Ensure each input value is a tensor, then pad
        input_values = [torch.tensor(feature['input_values'], dtype=torch.float) for feature in features]
        labels = [feature['labels'] for feature in features]

        # Pad input values
        input_values_padded = pad_sequence(input_values, batch_first=True, padding_value=0.0)  # Update padding value if needed

        # Convert labels to tensors
        labels = torch.tensor(labels, dtype=torch.long)

        return {
            'input_values': input_values_padded,
            'labels': labels
        }

In [9]:
# Initialize the custom data collator with the processor
data_collator = DataCollatorForWav2Vec2(processor=processor)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_test_split["train"],
    eval_dataset=train_test_split["test"],
    data_collator=data_collator,
    tokenizer=processor.feature_extractor,
)

# Train the model
trainer.train()

IndexError: Invalid key: 0 is out of bounds for size 0