### author
It is original whisper-finetune script, taken from [colab](https://colab.research.google.com/github/sanchit-gandhi/notebooks/blob/main/fine_tune_whisper.ipynb), and adapted for my special task.

### data loading

In [1]:
# load data from huggingface hub
from datasets import load_dataset, DatasetDict

common_voice = DatasetDict()

# this is actually data for noise augmentation, TBU only if enough time
common_voice["train"] = load_dataset("mozilla-foundation/common_voice_11_0", "cs", split="train+validation")
common_voice["test"] = load_dataset("mozilla-foundation/common_voice_11_0", "cs", split="test")


  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 

In [12]:
# remove unnecessary columns
try: 
    common_voice = common_voice.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"])
except Exception as e:
    pass # print(e)

common_voice["train"] = common_voice["train"].select(range(10))
common_voice["test"] = common_voice["test"].select(range(10))
print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 10
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 10
    })
})


In [13]:
common_voice['train'].features

{'audio': Audio(sampling_rate=48000, mono=True, decode=True, id=None),
 'sentence': Value(dtype='string', id=None)}

In [14]:
print(common_voice["train"][0])

{'audio': {'path': '/home/johnny/.cache/huggingface/datasets/downloads/extracted/2e8616563213b840cdf3437844a3a166161e162770f3b9d40bb06cf2e316d895/cs_train_0/common_voice_cs_25695144.mp3', 'array': array([ 4.26325641e-14,  1.13686838e-13,  2.62900812e-13, ...,
       -1.01048208e-04, -1.48227118e-04, -8.67909548e-05]), 'sampling_rate': 48000}, 'sentence': 'S judem začínala v rodném Kjóto.'}


### prepare environment for data process

In [15]:
from transformers import WhisperFeatureExtractor, WhisperTokenizer
# whisper medium is the model used by @Veronika for her bachelor thesis
# this time it probably won't be used, as i shell use the czech model, but still I leave it here
# feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-medium" )
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")

# again, this tokenizer is for the medium model, but this time it will be used
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Czech", task="transcribe")


In [16]:
# test whether it works
input_str = common_voice["train"][0]["sentence"]
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

print(f"Input:                 {input_str}")
print(f"Decoded w/ special:    {decoded_with_special}")
print(f"Decoded w/out special: {decoded_str}")
print(f"Are equal:             {input_str == decoded_str}")


Input:                 S judem začínala v rodném Kjóto.
Decoded w/ special:    <|startoftranscript|><|cs|><|transcribe|><|notimestamps|>S judem začínala v rodném Kjóto.<|endoftext|>
Decoded w/out special: S judem začínala v rodném Kjóto.
Are equal:             True


#### data resampling

In [17]:
from datasets import Audio
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

In [18]:
def prepare_dataset(batch):
    # load and resample audio data from 48 to 16kHz
    audio = batch["audio"]

    # compute log-Mel input features from input audio array 
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # encode target text to label ids 
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch


In [19]:
common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names["train"], num_proc=4)

#### preparing the model

In [None]:
from transformers import WhisperForConditionalGeneration
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

# setting the parameters of model for correct working
model.generation_config.language = "english"
model.generation_config.task = "transcribe"
model.generation_config.forced_decoder_ids = None


In [3]:
from datasets import DatasetDict,load_dataset
data_for_train = DatasetDict().load_from_disk("../data/apimod/data.data")

In [4]:
data_for_train

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 75
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 19
    })
})

#### prepare data collator (for handle input data properly)

In [22]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch


In [None]:
# include the processor that will serve instead of tokenizer and feature extractor
from transformers import WhisperProcessor
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Czech", task="transcribe")
processor.get_prom

In [24]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

#### add the evaluation metrics

In [25]:
import evaluate
metric = evaluate.load("wer")

In [26]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}


#### define seq2seq training arguments and trainer and **start train**

In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-test",  # change to a repo name of your choice
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    lr_scheduler_type="constant",
    warmup_steps=50,
    max_steps=100,
    gradient_checkpointing=True,
    fp16=True,
    eval_strategy="steps", # changed from evaluation_strategy (because of warning)
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=100,
    eval_steps=50,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False, # change to True to push the model to the Hub (need to be logged in)
)

In [36]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=processor
)

max_steps is given, it will override any value given in num_train_epochs


In [34]:
trainer.evaluate()

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


  0%|          | 0/2 [00:00<?, ?it/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


{'eval_loss': 0.7921435832977295,
 'eval_model_preparation_time': 0.0036,
 'eval_wer': 103.50877192982458,
 'eval_runtime': 2.5037,
 'eval_samples_per_second': 3.994,
 'eval_steps_per_second': 0.799}

In [37]:
trainer.train()

  0%|          | 0/100 [00:00<?, ?it/s]

{'loss': 0.0, 'grad_norm': 2.22494600166101e-05, 'learning_rate': 1e-05, 'epoch': 25.0}
{'loss': 0.0, 'grad_norm': 9.923002835421357e-06, 'learning_rate': 1e-05, 'epoch': 50.0}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


  0%|          | 0/2 [00:00<?, ?it/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


{'eval_loss': 0.8920922875404358, 'eval_wer': 114.03508771929825, 'eval_runtime': 2.5294, 'eval_samples_per_second': 3.953, 'eval_steps_per_second': 0.791, 'epoch': 50.0}
{'loss': 0.0, 'grad_norm': 8.075524419837166e-06, 'learning_rate': 1e-05, 'epoch': 75.0}
{'loss': 0.0, 'grad_norm': 7.290151643246645e-06, 'learning_rate': 1e-05, 'epoch': 100.0}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


  0%|          | 0/2 [00:00<?, ?it/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


{'eval_loss': 0.8978124856948853, 'eval_wer': 115.78947368421053, 'eval_runtime': 2.3909, 'eval_samples_per_second': 4.183, 'eval_steps_per_second': 0.837, 'epoch': 100.0}


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


{'train_runtime': 195.4163, 'train_samples_per_second': 8.188, 'train_steps_per_second': 0.512, 'train_loss': 4.22151905240753e-06, 'epoch': 100.0}


TrainOutput(global_step=100, training_loss=4.22151905240753e-06, metrics={'train_runtime': 195.4163, 'train_samples_per_second': 8.188, 'train_steps_per_second': 0.512, 'total_flos': 2.8858540032e+17, 'train_loss': 4.22151905240753e-06, 'epoch': 100.0})