In [None]:
import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import Dataset, load_metric
from dataclasses import dataclass
from typing import Any, Dict, List, Union

# Step 1: Prepare the dataset
# Each pair is (path to an audio file, reference transcription for that file).
_AUDIO_TRANSCRIPT_PAIRS = [
    ('path/to/audio1.wav', 'Transcription for audio one.'),
    ('path/to/audio2.wav', 'Transcription for audio two.'),
    # Add more data...
]

# Expand the pairs into the record layout consumed by the preprocessing step.
data = [
    {'audio': audio_path, 'transcription': text}
    for audio_path, text in _AUDIO_TRANSCRIPT_PAIRS
]

dataset = Dataset.from_list(data)

# The processor exposes both the feature extractor (audio side) and the
# tokenizer (text side) used further down in this script.
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")

def preprocess_function(examples):
    """Turn one dataset record into Whisper model inputs.

    Despite the plural parameter name, this is written for non-batched
    ``Dataset.map``: ``examples`` is a single record whose ``"audio"`` value
    is a path to an audio file and whose ``"transcription"`` value is the
    reference text.

    Args:
        examples: Mapping with the keys ``"audio"`` and ``"transcription"``.

    Returns:
        A dict with ``"input_features"`` (the feature extractor output for
        this clip, without the leading batch axis) and ``"labels"`` (token
        ids of the transcription).
    """
    target_sampling_rate = 16000

    # torchaudio.load returns a (channels, frames) tensor by default.
    speech_array, sampling_rate = torchaudio.load(examples["audio"])

    # Down-mix to mono. A plain squeeze() only drops the channel axis for
    # single-channel files and would leave stereo audio two-dimensional.
    if speech_array.dim() > 1:
        speech_array = speech_array.mean(dim=0)

    # Resample if necessary
    if sampling_rate != target_sampling_rate:
        resampler = torchaudio.transforms.Resample(sampling_rate, target_sampling_rate)
        speech_array = resampler(speech_array)
        sampling_rate = target_sampling_rate

    features = processor(speech_array.numpy(), sampling_rate=sampling_rate)
    labels = processor.tokenizer(examples["transcription"]).input_ids

    # The extractor output is batched (one entry per clip). Keep only this
    # clip's entry so the collator later builds a three-dimensional batch
    # instead of one with a spurious extra axis of size 1.
    return {
        "input_features": features["input_features"][0],
        "labels": labels,
    }

# Apply the per-record preprocessing to every entry of the dataset.
dataset = dataset.map(function=preprocess_function)

# Step 2: Prepare for training
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate preprocessed speech examples into one padded training batch.

    Each incoming feature dict must provide ``"input_features"`` and
    ``"labels"``. Audio features and label ids are padded separately and
    returned together as PyTorch tensors.

    Attributes are documented inline below.
    """

    # Processor whose feature extractor pads the audio features and whose
    # tokenizer pads the label ids.
    processor: WhisperProcessor
    # Padding strategy forwarded unchanged to both ``pad`` calls.
    padding: Union[bool, str] = True
    # Optional maximum label length forwarded to the tokenizer's ``pad``.
    max_length: Union[int, None] = None

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Pad ``features`` and return a batch with a ``"labels"`` tensor.

        Args:
            features: One dict per example, each holding ``"input_features"``
                and ``"labels"``.

        Returns:
            The padded audio batch with an added ``"labels"`` entry in which
            padding positions are set to ``-100``.
        """
        tokenizer = self.processor.tokenizer

        # Audio and text need different padding logic, so split them first.
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.feature_extractor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        labels_batch = tokenizer.pad(
            label_features,
            padding=self.padding,
            return_tensors="pt",
            max_length=self.max_length,
        )

        # Mark padded label positions with -100 so the loss skips them.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # The tokenizer already put the start-of-transcript token in front of
        # every label sequence. The model prepends its decoder start token
        # again when it shifts the labels to build decoder inputs, so drop
        # the leading copy here to avoid training on a duplicated prefix.
        start_token_id = tokenizer.convert_tokens_to_ids("<|startoftranscript|>")
        if labels.size(1) > 0 and (labels[:, 0] == start_token_id).all().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

# Word-error-rate scorer used by compute_metrics.
# NOTE(review): `load_metric` is reported as deprecated in newer `datasets`
# releases - confirm the installed version still provides it.
wer_metric = load_metric(path="wer")

# Collator instance handed to the trainer to build padded batches.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

def compute_metrics(pred):
    """Compute the word error rate for a set of evaluation predictions.

    Args:
        pred: Object exposing ``predictions`` (predicted token ids) and
            ``label_ids`` (reference token ids, with ``-100`` at positions
            that must be ignored).

    Returns:
        ``{"wer": value}`` where ``value`` is whatever ``wer_metric.compute``
        returns for the decoded predictions and references.
    """
    tokenizer = processor.tokenizer
    pad_token_id = tokenizer.pad_token_id

    def _restore_padding(ids):
        # masked_fill returns a new tensor, so the arrays stored on ``pred``
        # are left untouched for any later consumer.
        ids = torch.as_tensor(ids)
        return ids.masked_fill(ids == -100, pad_token_id)

    # -100 is not a valid token id. It marks ignored label positions and can
    # also appear as filler in gathered predictions, so replace it on both
    # sides before decoding.
    pred_ids = _restore_padding(pred.predictions)
    label_ids = _restore_padding(pred.label_ids)

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

# Step 3: Load the model
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
model.config.forced_decoder_ids = None  # Disable language tokens forcing if necessary

# Step 4: Set training arguments
# NOTE(review): no eval_dataset or evaluation strategy is configured in this
# script, so eval_steps and compute_metrics only take effect once evaluation
# is enabled - confirm whether that is intended.
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-finetuned",
    per_device_train_batch_size=4,  # Adjust based on GPU memory
    gradient_accumulation_steps=4,
    learning_rate=1e-5,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=500,
    eval_steps=500,
    save_total_limit=2,
    # Half-precision training is only requested when a CUDA device exists;
    # asking for it on a CPU-only machine is rejected when the arguments are
    # validated.
    fp16=torch.cuda.is_available(),
    # compute_metrics decodes predictions as token ids, which requires the
    # trainer to call generate() during evaluation instead of returning logits.
    predict_with_generate=True,
)

# Step 5: Create the trainer
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
    tokenizer=processor.feature_extractor,
    compute_metrics=compute_metrics,
)

# Step 6: Start training
trainer.train()
