# Fine-Tune Whisper
The code is inspired by and adapted from the following blog post:
https://huggingface.co/blog/fine-tune-whisper

In [1]:
import os
import pickle

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Tue Apr 30 12:54:19 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.67                 Driver Version: 550.67         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 ...    Off |   00000000:01:00.0 Off |                  N/A |
| N/A   53C    P8             10W /   80W |       8MiB /   6144MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!pip install --upgrade pip
!pip install --upgrade datasets transformers accelerate soundfile librosa evaluate jiwer tensorboard gradio





## To continue, create a Hugging Face profile and an access token
Store the token in `../../data/SECRETS.yaml` under the key `huggingface_write_token`; the following code cells read it from there and log in to Hugging Face.

In [5]:
import yaml
# Parse the YAML secrets file into `secrets`; the Hugging Face token is looked up
# in it by the login cell.
with open('../../data/SECRETS.yaml') as secrets_file:
    secrets = yaml.safe_load(secrets_file)

In [6]:
from huggingface_hub import login
# Authenticate against the Hugging Face Hub with the write token from the secrets file.
login(token=secrets['huggingface_write_token'])

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/lvasina/.cache/huggingface/token
Login successful


## Load Dataset

In [4]:
from datasets import load_dataset, DatasetDict, concatenate_datasets

# Czech ("cs") subset of Common Voice 11.0.
# Training uses train+validation merged into one split; the test split is kept for evaluation.
common_voice = DatasetDict()

# `token=True` replaces the deprecated `use_auth_token=True` (uses the token stored by `login`).
# `trust_remote_code=True` is passed explicitly because this dataset is loaded through a
# loading script and `datasets` announces that the flag will become mandatory.
common_voice["train"] = load_dataset(
    "mozilla-foundation/common_voice_11_0",
    "cs",
    split="train+validation",
    token=True,
    trust_remote_code=True,
)
common_voice["test"] = load_dataset(
    "mozilla-foundation/common_voice_11_0",
    "cs",
    split="test",
    token=True,
    trust_remote_code=True,
)

print(common_voice)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 22155
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 7714
    })
})


In [5]:
# Drop the speaker/vote metadata so only the 'audio' and 'sentence' columns remain.
common_voice = common_voice.remove_columns(
    [
        "accent",
        "age",
        "client_id",
        "down_votes",
        "gender",
        "locale",
        "path",
        "segment",
        "up_votes",
    ]
)
print(common_voice)


DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 22155
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 7714
    })
})


## Prepare Feature Extractor, Tokenizer and Data

### Load WhisperFeatureExtractor

In [6]:
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base")

### Load WhisperTokenizer

In [7]:
from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base", language="czech", task="transcribe")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Combine To Create A WhisperProcessor

In [8]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-base", language="czech", task="transcribe")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Prepare Data

In [9]:
print(common_voice["train"][0])

{'audio': {'path': '/home/lvasina/.cache/huggingface/datasets/downloads/extracted/981d0d71b7ccdd8440b5a30d1846b0edbf80c7c3d4d6941b7d3193319a582fb6/cs_train_0/common_voice_cs_25695144.mp3', 'array': array([ 4.26325641e-14,  1.13686838e-13,  2.62900812e-13, ...,
       -1.01048208e-04, -1.48227118e-04, -8.67909548e-05]), 'sampling_rate': 48000}, 'sentence': 'S judem začínala v rodném Kjóto.'}


In [10]:
from datasets import Audio

# Re-declare the audio column at 16 kHz (the raw clips are 48 kHz, see the sample printed above).
# NOTE: `Audio` here is `datasets.Audio`. A later cell runs
# `from IPython.display import Audio, display`, which rebinds the name, so re-running
# this cell after that import would pick up the wrong class.
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

In [11]:
print(common_voice["train"][0])

{'audio': {'path': '/home/lvasina/.cache/huggingface/datasets/downloads/extracted/981d0d71b7ccdd8440b5a30d1846b0edbf80c7c3d4d6941b7d3193319a582fb6/cs_train_0/common_voice_cs_25695144.mp3', 'array': array([ 8.73114914e-11,  1.74622983e-10, -5.82076609e-11, ...,
       -2.33804210e-04, -5.76644670e-05, -5.10758255e-05]), 'sampling_rate': 16000}, 'sentence': 'S judem začínala v rodném Kjóto.'}


In [12]:
# from audiomentations import (
#     AddBackgroundNoise,
#     AddGaussianNoise,
#     Compose,Gain,OneOf,LowShelfFilter,PitchShift,PolarityInversion,TimeStretch,Mp3Compression,PeakingFilter,SevenBandParametricEQ
# )

# # define augmentation
# augmentation = Compose(
#     [
#         # TimeStretch(min_rate=0.9, max_rate=1.1, p=0.2, leave_length_unchanged=False),
#         # Gain(min_gain_in_db=-6, max_gain_in_db=6, p=0.1),
#         # PitchShift(min_semitones=-4, max_semitones=4, p=0.2),
#     #AddGaussianNoise(min_amplitude=0.005, max_amplitude=0.015, p=1.0),
#     LowShelfFilter(min_center_freq=20, max_center_freq=600, min_gain_db=-16.0, max_gain_db=16.0, min_q=0.5, max_q=1.0, p=1),
#     Mp3Compression(min_bitrate=8, max_bitrate=64, backend='pydub', p=1),
#     PeakingFilter(min_center_freq=51, max_center_freq=7400, min_gain_db=-22, max_gain_db=22, min_q=0.5, max_q=1.0, p=1),
#     SevenBandParametricEQ(min_gain_db=-10, max_gain_db=10, p=1),
#     ]
# )

# def augment_dataset(batch):
#     # load and (possibly) resample audio data to 16kHz
#     sample = batch['audio']

#     # apply augmentation
#     #print(f"SAMPLE ENTERING AUGMENTATION FUNCTION: {sample['array']}")
#     augmented_waveform = augmentation(sample['array'], sample_rate=sample['sampling_rate'])
#     batch['audio']['array'] = augmented_waveform
#     return batch

# common_voice = common_voice.map(augment_dataset, num_proc=1)

In [13]:
import sys
sys.path.append('../../AudioAugmentor')
from tqdm import tqdm_notebook as tqdm
import sox_parser
import importlib
import core
import transf_gen
import torch.multiprocessing as mp
import IPython
from scipy.signal import fftconvolve
from scipy.io import wavfile
import matplotlib.pyplot as plt
import numpy as np
import pyroomacoustics as pra
import soundfile as sf
import audiomentations as AA
import torch_audiomentations as TA
import ffmpeg
from rir_setup import ApplyRIR, get_all_materials_info
import rir_setup
import torch
import torchaudio
import torchaudio.transforms as T
import os
from IPython.display import Audio, display
import io
from pydub import AudioSegment
import sys
import random
importlib.reload(core)
importlib.reload(transf_gen)
importlib.reload(sox_parser)

# Fix the seeds of torch (CPU and CUDA), Python's `random` and NumPy for repeatable runs.
torch.manual_seed(0)
torch.cuda.manual_seed(0)
random.seed(0)
np.random.seed(0)

# Prefer the GPU when torch can see one. Note that `augment_dataset` below passes
# device='cpu' explicitly and does not read this variable.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Sample rate (Hz) shared by the augmentation configuration below.
sampling_rate = 16000

# Example SoX/codec command strings. Only the LAST uncommented assignment takes effect
# (the first one is overwritten by the AMR variant); the commented lines are alternatives.
# `example_sox` is only referenced from a commented-out configuration in `augment_dataset`.
example_sox = '--sox="norm gain 20 highpass 300 phaser 0.5 0.6 1 0.45 0.6 -s"'
#example_sox = '--sox="norm gain 20 highpass 300 phaser 0.5 0.6 1 0.45 0.6 -s" mp3 bitrate 8'
#example_sox = '--sox="norm gain 20 highpass 300 phaser 0.5 0.6 1 0.45 0.6 -s" pcm_mulaw'
#example_sox = '--sox="norm gain 20 highpass 300 phaser 0.5 0.6 1 0.45 0.6 -s" g726 audio_bitrate 40k'
#example_sox = '--sox="norm gain 20 highpass 300 phaser 0.5 0.6 1 0.45 0.6 -s" gsm'
example_sox = '--sox="norm gain 20 highpass 300 phaser 0.5 0.6 1 0.45 0.6 -s" amr audio_bitrate 4.75k'

# Load the example SoX command file through the project's parser and show what came back.
# NOTE: `sox_file_content` is reassigned further down in this cell by a plain `readlines()`.
sox_file_content = sox_parser.load_sox_file('../../data/sox_file_example.txt')
print('SOX FILE', sox_file_content, type(sox_file_content))

# Room-impulse-response settings, variant 1: an explicit room outline
# (`corners_coord`) with fixed source and microphone coordinates.
# NOTE: this dict is overwritten by the second `rir_kwargs` assignment right below,
# so it has no effect unless that second assignment is removed.
rir_kwargs = {
    'audio_sample_rate': sampling_rate,
    'corners_coord': [[0, 0], [0, 3], [5, 3], [5, 1], [3, 1], [3, 0]],
    'walls_mat': 'curtains_cotton_0.5',
    'room_height': 2.0,
    'max_order': 3,
    'floor_mat': 'carpet_cotton',
    'ceiling_mat': 'hard_surface',
    'ray_tracing': True,
    'air_absorption': True,
    'source_coord': [[1.0], [1.0], [0.5]],
    'microphones_coord': [[3.5], [2.0], [0.5]],
}

# Room-impulse-response settings, variant 2 (the one that stays bound): instead of
# explicit coordinates it gives x/y ranges, a vertex-count range and mic/source heights.
# Presumably the room is generated from these ranges by ApplyRIR — confirm in rir_setup.
# It is only referenced from commented-out code in this cell.
rir_kwargs = {
    'audio_sample_rate': sampling_rate,
    'x_range': (0, 10), 
    'y_range': (0, 10), 
    'num_vertices_range': (4, 4),
    'mic_height': 1.5,
    'source_height': 1.5,
    'walls_mat': 'curtains_cotton_0.5',
    'room_height': 2.0,
    'max_order': 3,
    'floor_mat': 'carpet_cotton',
    'ceiling_mat': 'hard_surface',
    'ray_tracing': True,
    'air_absorption': True,
}

# Hand-written transformation list kept as a reference of options that were tried.
# Every entry is commented out, so this is an empty list, and it is replaced by the
# `transf_gen` result assigned right after it.
transformations = [
    # core.torch_randomizer(T.Vol(2), 0.1),
    # core.torch_randomizer(T.Speed(orig_freq=sampling_rate, factor=1.3), 0.1),
    # ApplyRIR(**rir_kwargs),
    # T.Vol(1.5),
    # T.PitchShift(sample_rate=16000, n_steps=1.5),
    # T.Speed(orig_freq=sampling_rate, factor=0.7),
    # AA.TimeStretch(min_rate=0.9, max_rate=1.1, p=0.2,
    #                leave_length_unchanged=False),
    # AA.Gain(min_gain_in_db=-6, max_gain_in_db=6, p=0.1),
    # AA.PitchShift(min_semitones=-4, max_semitones=4, p=0.2),
    # AA.AddGaussianNoise(min_amplitude=0.005, max_amplitude=0.015, p=0.2),
    # TA.LowPassFilter(min_cutoff_freq=700, max_cutoff_freq=800,
    #                  sample_rate=sampling_rate, p=0.1),
    # TA.AddBackgroundNoise(background_paths='/home/lvasina/Desktop/IBP/BP/data/musan/noise/free-sound', min_snr_in_db=10, max_snr_in_db=20, p=0.3, sample_rate=sampling_rate),
    # AA.LowShelfFilter(min_center_freq=20, max_center_freq=600, min_gain_db=-16.0, max_gain_db=16.0, min_q=0.5, max_q=1.0, p=1),
    # AA.Mp3Compression(min_bitrate=8, max_bitrate=64, backend='pydub', p=1),
    # AA.PeakingFilter(min_center_freq=51, max_center_freq=7400, min_gain_db=-22, max_gain_db=22, min_q=0.5, max_q=1.0, p=1),
    # AA.SevenBandParametricEQ(min_gain_db=-10, max_gain_db=10, p=1),
]

# The transformation set actually used by `augment_dataset`, built by the project's
# `transf_gen` helper. Each keyword names a transformation; its value is either a
# parameter string or a parameter dict. With verbose=True the helper prints an
# "ADDED: ..." line per transformation (see the cell output).
# NOTE(review): list-valued parameters such as 'n_steps': [1, 1.5, 0.1] look like
# (start, stop, step) ranges — confirm against transf_gen.
transformations = transf_gen.transf_gen(verbose=True,
                                        # ApplyRIR=rir_kwargs,
                                        # Vol={'gain': [0.9, 1.5, 0.1],
                                        #      'p': 1.0},
                                        # Speed={'orig_freq': sampling_rate,
                                        #        'factor': [0.5, 1.5, 0.1],
                                        #        'p': 0.1},

                                        TimeStretch='min_rate=0.9, max_rate=1.1, p=0.2, leave_length_unchanged=False',
                                        Gain='min_gain_in_db=-6, max_gain_in_db=6, p=0.1',
                                        PitchShift={'sample_rate': sampling_rate,
                                                     'n_steps': [1, 1.5, 0.1],
                                                     'p': 0.2},
                                        AddGaussianNoise='min_amplitude=0.005, max_amplitude=0.015, p=0.2',          
                                        LowPassFilter={
                                            'min_cutoff_freq': 700,
                                            'max_cutoff_freq': 800,
                                            'sample_rate': sampling_rate,
                                            'p': 0.1},
                                        Mp3Compression={'min_bitrate': 8,
                                                        'max_bitrate': 8,
                                                        'backend': 'pydub',
                                                        'p': 0.2},
                                                        
                                        # pcm_alaw=True,
                                        # gsm=True,
                                        # g726={'audio_bitrate': '16k'},
                                        # amr={'audio_bitrate': '4.75k'},
                                        )
print(transformations)

with open('../../data/sox_file_example.txt', 'r') as file:
    sox_file_content = file.readlines()

# Augment using AudioAugmentor
def augment_dataset(batch):
    """Run the AudioAugmentor pipeline on one example and return the example.

    A `core.AugmentWaveform` is built on the CPU from the module-level
    `transformations` (no SoX effects, non-verbose) using the example's own
    sampling rate, applied to `batch['audio']['array']`, and the result is
    written back to that same slot.

    Other setups tried with the same constructor: device='cuda', or
    `transformations=None` with `sox_effects` set to `example_sox` or
    `sox_file_content`.
    """
    audio = batch['audio']
    augmenter = core.AugmentWaveform(
        transformations=transformations,
        device='cpu',
        sox_effects=None,
        sample_rate=audio['sampling_rate'],
        verbose=False,
    )
    # Replace the waveform with its augmented version.
    batch['audio']['array'] = augmenter(audio['array'])
    return batch

# Augment using Audiomentaions
# def augment_dataset(batch):
#     # load and (possibly) resample audio data to 16kHz
#     sample = batch['audio']
#     augment = AA.Compose([
#         AA.AddGaussianNoise(min_amplitude=0.005, max_amplitude=0.015, p=1)
# ])
#     # apply augmentation
#     # print(f"SAMPLE ENTERING AUGMENTATION FUNCTION: {sample['array']}")
#     augmented_waveform = augment(sample['array'], sample['sampling_rate'])
#     batch['audio']['array'] = augmented_waveform
#     return batch


# from datasets.utils.logging import set_verbosity_debug
# set_verbosity_debug()

# common_voice['train'] = common_voice['train'].map(augment_dataset, num_proc=os.cpu_count())


SOX FILE ['--sox="norm gain 20 highpass 300 phaser 0.5 0.6 1 0.45 0.6 -s"\n', '--sox="norm gain 20 highpass 300 phaser 0.5 0.6 1 0.45 0.6 -s" mp3 bitrate 8\n', '--sox="norm gain 20 highpass 300 phaser 0.5 0.6 1 0.45 0.6 -s" pcm_mulaw\n', '--sox="norm gain 20 highpass 300 phaser 0.5 0.6 1 0.45 0.6 -s" g726 audio_bitrate 40k\n', '--sox="norm gain 20 highpass 300 phaser 0.5 0.6 1 0.45 0.6 -s" gsm\n', '--sox="norm gain 20 highpass 300 phaser 0.5 0.6 1 0.45 0.6 -s" amr audio_bitrate 4.75k'] <class 'list'>
ADDED: TimeStretch, 
		{'min_rate': 0.9, 'max_rate': 1.1, 'p': 0.2, 'leave_length_unchanged': False}

ADDED: Gain, 
		{'min_gain_in_db': -6, 'max_gain_in_db': 6, 'p': 0.1}

ADDED: PitchShift, 
		{'sample_rate': 16000, 'n_steps': [1, 1.5, 0.1]}

ADDED: AddGaussianNoise, 
		{'min_amplitude': 0.005, 'max_amplitude': 0.015, 'p': 0.2}

ADDED: LowPassFilter, 
		{'min_cutoff_freq': 700, 'max_cutoff_freq': 800, 'sample_rate': 16000, 'p': 0.1}

ADDED: Mp3Compression, 
		{'min_bitrate': 8, 'max_bitr

In [14]:
CURRENT_RUN_NAME = "whisper-base-cs-cv11-timestetch02-gain01-pitch02-gaussian02-lowpass01-timemask50-freqmask50"

In [15]:
# Cache of the augmented dataset (before feature extraction) for the current run.
augmented_cache_path = f'{CURRENT_RUN_NAME}-TEST-BEFORE_FEATURE_EXTR.pkl'

if os.path.exists(augmented_cache_path):
    # NOTE: pickle.load can execute arbitrary code; only load cache files that this
    # notebook wrote itself.
    with open(augmented_cache_path, 'rb') as file:
        # Bind the cached data to `augmented_set`, the name the following cells read.
        # Previously it was bound to `augmented_train_set` only, so a re-run with an
        # existing cache failed with a NameError at `common_voice = augmented_set`.
        augmented_set = pickle.load(file)
    # Keep the old name available in case other cells still refer to it.
    augmented_train_set = augmented_set
    print(f'LOADED: \n{augmented_set}')
else:
    # Augment every split of `common_voice`, one example at a time, using all CPU cores.
    augmented_set = common_voice.map(augment_dataset, num_proc=os.cpu_count(), writer_batch_size=500)

    with open(augmented_cache_path, 'wb') as file:
        pickle.dump(augmented_set, file)
    # Report success only after the dump has actually completed.
    print('SAVED PICKLE FILE')

Map (num_proc=8):   0%|          | 0/22155 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/7714 [00:00<?, ? examples/s]

SAVED PICKLE FILE


In [16]:
def prepare_dataset(batch):
    """Turn one raw example into model inputs.

    Adds two entries to `batch` and returns it:
      * "input_features": first element of the `input_features` produced by the
        module-level `feature_extractor` for the example's audio array and sampling rate
        (log-Mel features).
      * "labels": token ids of `batch["sentence"]` from the module-level `tokenizer`.
    """
    clip = batch["audio"]

    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    encoded_sentence = tokenizer(batch["sentence"])
    batch["labels"] = encoded_sentence.input_ids

    return batch

In [17]:
print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 22155
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 7714
    })
})


In [18]:
# common_voice['train'] = concatenate_datasets([common_voice['train'], augmented_set])
# common_voice['test'] =  augmented_set
# common_voice['train'] =  augmented_set
common_voice =  augmented_set

In [19]:
print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 22155
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 7714
    })
})


In [20]:
# Feature-extracted dataset for this run: compute and cache it on first use,
# otherwise load the cached copy.
if not os.path.exists(f'{CURRENT_RUN_NAME}-READY.pkl'):
    # No cache yet: run `prepare_dataset` over every split, dropping the original
    # columns so only the new model inputs remain.
    common_voice = common_voice.map(
        prepare_dataset,
        remove_columns=common_voice.column_names["train"],
        num_proc=os.cpu_count(),
        writer_batch_size=500,
    )
    with open(f'{CURRENT_RUN_NAME}-READY.pkl', 'wb') as file:
        print('SAVED PICKLE FILE')
        pickle.dump(common_voice, file)
else:
    # Cache hit: deserialize the prepared dataset.
    with open(f'{CURRENT_RUN_NAME}-READY.pkl', 'rb') as file:
        common_voice = pickle.load(file)
    print(f'LOADED: \n{common_voice}')

Map (num_proc=8):   0%|          | 0/22155 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/7714 [00:00<?, ? examples/s]

SAVED PICKLE FILE


## Training and Evaluation

### Define a Data Collator

In [21]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate prepared examples into one padded batch.

    Audio features and label ids are padded separately: features go through
    `processor.feature_extractor.pad`, labels through `processor.tokenizer.pad`.
    Padded label positions are set to -100, and a leading BOS column is dropped
    when every label row starts with the tokenizer's BOS token.
    """

    # Object exposing `.feature_extractor` and `.tokenizer`, each with a `pad` method.
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Return the padded feature batch with an added "labels" tensor."""
        audio_items = []
        label_items = []
        for example in features:
            audio_items.append({"input_features": example["input_features"]})
            label_items.append({"input_ids": example["labels"]})

        # Audio side: let the feature extractor assemble the tensors.
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        # Label side: pad to the longest sequence in the batch.
        padded_labels = self.processor.tokenizer.pad(label_items, return_tensors="pt")

        # Positions that are padding (attention mask != 1) become -100.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # Drop the first column only when it is BOS in every row.
        bos_id = self.processor.tokenizer.bos_token_id
        every_row_starts_with_bos = (labels[:, 0] == bos_id).all().cpu().item()
        batch["labels"] = labels[:, 1:] if every_row_starts_with_bos else labels

        return batch
    
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

### Evaluation Metrics

In [23]:
import evaluate

metric = evaluate.load("wer")

def compute_metrics(pred):
    """Compute the word error rate, in percent, for a set of predictions.

    Args:
        pred: object exposing `predictions` (generated token ids) and
            `label_ids` (reference token ids, with -100 marking ignored positions).

    Returns:
        dict with the single key "wer": 100 * WER between the decoded
        predictions and the decoded references.
    """
    pred_ids = pred.predictions

    # Swap the -100 loss-mask value for the pad token id so the tokenizer can decode.
    # np.where builds a new array, so `pred.label_ids` is left untouched for the
    # caller (the previous in-place assignment overwrote the caller's array).
    label_ids = np.where(pred.label_ids == -100, tokenizer.pad_token_id, pred.label_ids)

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

## Load a Pre-Trained Checkpoint

In [24]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")

In [25]:
# Added because of the issue described here: https://github.com/huggingface/blog/issues/1794
# and here: https://github.com/huggingface/transformers/issues/28814
# Sets the language on the model's generation config to Czech ("cs").
model.generation_config.language = "cs" 

In [26]:
# Clear the forced decoder ids and the suppressed-token list on the model config.
# Presumably so generation is not constrained by the checkpoint's defaults, as in the
# referenced blog post — confirm against the installed transformers version.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

In [27]:
# model.config.apply_spec_augment = True
# model.config.mask_time_prob = 0.20
# model.config.mask_feature_prob = 0.20

### Define the Training Configuration

In [28]:
from transformers import Seq2SeqTrainingArguments

# Training configuration for this run.
# - Effective train batch per device: 8 examples x 4 accumulation steps = 32.
# - 4000 optimizer steps in total, the first 500 of them warm-up.
# - Evaluation and checkpointing both happen every 1000 steps, so the best checkpoint
#   (lowest "wer", since greater_is_better=False) can be reloaded at the end.
# - Evaluation generates text (predict_with_generate) so WER can be computed.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir=f'./{CURRENT_RUN_NAME}',  # local output folder; also used as the Hub repo name when pushing
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)

In [29]:
from transformers import Seq2SeqTrainer

# Wire the model, data, collator and metric function into the trainer.
# The evaluation split is shuffled with a fixed seed (16).
# NOTE(review): the feature extractor, not the text tokenizer, is passed in the
# `tokenizer` slot; this follows the referenced blog post — confirm it is intended.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"].shuffle(16),
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

# We'll save the processor object once before starting training. Since the processor is not trainable, it won't change over the course of training:
processor.save_pretrained(training_args.output_dir)

max_steps is given, it will override any value given in num_train_epochs


[]

### Training

In [30]:
trainer.evaluate()

  0%|          | 0/965 [00:00<?, ?it/s]

{'eval_loss': 3.206214666366577,
 'eval_wer': 112.0436880686632,
 'eval_runtime': 1948.0194,
 'eval_samples_per_second': 3.96,
 'eval_steps_per_second': 0.495}

In [1]:
print("""
TINY-CS baseline CV11 performance:
'eval_wer': 100.73488783290972,
""")

print("""
SMALL-CS baseline performance:
'eval_wer': 46.296091649169334,
""")


TINY-CS baseline CV11 performance:
'eval_wer': 100.73488783290972,


SMALL-CS baseline performance:
'eval_wer': 46.296091649169334,



In [2]:
print("""
BASE-CS baseline performance:whisper-bs-cs-train-noaug-test-noaug
'eval_wer': 72.12583342542455,
""")

print("""
BASE-CS baseline performance:whisper-bs-cs-train-noaug-test-tstretch20-gain10-pitch20-gaussian20-lowpass10-mp3
'eval_wer': 109.89796294249825,
""")


BASE-CS baseline performance:whisper-bs-cs-train-noaug-test-noaug
'eval_wer': 72.12583342542455,


BASE-CS baseline performance:whisper-bs-cs-train-noaug-test-tstretch20-gain10-pitch20-gaussian20-lowpass10-mp3
'eval_wer': 109.89796294249825,



In [33]:
trainer.train()
#trainer.train(resume_from_checkpoint=True)

  0%|          | 0/4000 [00:00<?, ?it/s]

`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


{'loss': 3.0544, 'grad_norm': 32.488033294677734, 'learning_rate': 4.800000000000001e-07, 'epoch': 0.04}
{'loss': 2.7699, 'grad_norm': 21.935815811157227, 'learning_rate': 9.800000000000001e-07, 'epoch': 0.07}
{'loss': 2.2183, 'grad_norm': 17.920761108398438, 'learning_rate': 1.48e-06, 'epoch': 0.11}
{'loss': 1.743, 'grad_norm': 11.836263656616211, 'learning_rate': 1.98e-06, 'epoch': 0.14}
{'loss': 1.5458, 'grad_norm': 11.166437149047852, 'learning_rate': 2.4800000000000004e-06, 'epoch': 0.18}
{'loss': 1.4539, 'grad_norm': 10.780823707580566, 'learning_rate': 2.9800000000000003e-06, 'epoch': 0.22}
{'loss': 1.308, 'grad_norm': 9.90709114074707, 'learning_rate': 3.48e-06, 'epoch': 0.25}
{'loss': 1.2953, 'grad_norm': 10.09154987335205, 'learning_rate': 3.980000000000001e-06, 'epoch': 0.29}
{'loss': 1.2742, 'grad_norm': 9.044551849365234, 'learning_rate': 4.48e-06, 'epoch': 0.32}
{'loss': 1.2325, 'grad_norm': 11.049210548400879, 'learning_rate': 4.980000000000001e-06, 'epoch': 0.36}
{'loss

  0%|          | 0/965 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.8180581331253052, 'eval_wer': 63.13773160938594, 'eval_runtime': 1712.1745, 'eval_samples_per_second': 4.505, 'eval_steps_per_second': 0.564, 'epoch': 1.44}
{'loss': 0.6367, 'grad_norm': 7.616935729980469, 'learning_rate': 8.502857142857143e-06, 'epoch': 1.48}
{'loss': 0.674, 'grad_norm': 8.686921119689941, 'learning_rate': 8.431428571428572e-06, 'epoch': 1.52}
{'loss': 0.6177, 'grad_norm': 9.185637474060059, 'learning_rate': 8.36e-06, 'epoch': 1.55}
{'loss': 0.6167, 'grad_norm': 8.570244789123535, 'learning_rate': 8.288571428571429e-06, 'epoch': 1.59}
{'loss': 0.6576, 'grad_norm': 9.47664737701416, 'learning_rate': 8.217142857142858e-06, 'epoch': 1.62}
{'loss': 0.6607, 'grad_norm': 8.574908256530762, 'learning_rate': 8.145714285714287e-06, 'epoch': 1.66}
{'loss': 0.6505, 'grad_norm': 7.943095684051514, 'learning_rate': 8.074285714285714e-06, 'epoch': 1.7}
{'loss': 0.5836, 'grad_norm': 7.809476852416992, 'learning_rate': 8.002857142857143e-06, 'epoch': 1.73}
{'loss': 0.

  0%|          | 0/965 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.7081364989280701, 'eval_wer': 56.870004052013115, 'eval_runtime': 1701.0699, 'eval_samples_per_second': 4.535, 'eval_steps_per_second': 0.567, 'epoch': 2.89}
{'loss': 0.4822, 'grad_norm': 7.049036026000977, 'learning_rate': 5.6485714285714285e-06, 'epoch': 2.92}
{'loss': 0.5002, 'grad_norm': 6.774135112762451, 'learning_rate': 5.5771428571428575e-06, 'epoch': 2.96}
{'loss': 0.4576, 'grad_norm': 7.788026332855225, 'learning_rate': 5.5057142857142865e-06, 'epoch': 3.0}
{'loss': 0.3568, 'grad_norm': 6.039063930511475, 'learning_rate': 5.4342857142857155e-06, 'epoch': 3.03}
{'loss': 0.3681, 'grad_norm': 6.392737865447998, 'learning_rate': 5.362857142857143e-06, 'epoch': 3.07}
{'loss': 0.3709, 'grad_norm': 6.505386829376221, 'learning_rate': 5.291428571428572e-06, 'epoch': 3.1}
{'loss': 0.3527, 'grad_norm': 6.562893867492676, 'learning_rate': 5.220000000000001e-06, 'epoch': 3.14}
{'loss': 0.3849, 'grad_norm': 6.991857528686523, 'learning_rate': 5.14857142857143e-06, 'epoch':

  0%|          | 0/965 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.6906446814537048, 'eval_wer': 54.51246914944561, 'eval_runtime': 1637.9738, 'eval_samples_per_second': 4.709, 'eval_steps_per_second': 0.589, 'epoch': 4.33}
{'loss': 0.3041, 'grad_norm': 5.907439708709717, 'learning_rate': 2.7914285714285716e-06, 'epoch': 4.37}
{'loss': 0.3136, 'grad_norm': 6.546174049377441, 'learning_rate': 2.7200000000000002e-06, 'epoch': 4.4}
{'loss': 0.3038, 'grad_norm': 6.906599998474121, 'learning_rate': 2.648571428571429e-06, 'epoch': 4.44}
{'loss': 0.3167, 'grad_norm': 5.942595958709717, 'learning_rate': 2.5771428571428574e-06, 'epoch': 4.48}
{'loss': 0.3016, 'grad_norm': 6.228283405303955, 'learning_rate': 2.5057142857142856e-06, 'epoch': 4.51}
{'loss': 0.3095, 'grad_norm': 6.4183669090271, 'learning_rate': 2.4342857142857146e-06, 'epoch': 4.55}
{'loss': 0.3125, 'grad_norm': 6.689169406890869, 'learning_rate': 2.362857142857143e-06, 'epoch': 4.58}
{'loss': 0.3083, 'grad_norm': 5.8001017570495605, 'learning_rate': 2.2914285714285718e-06, 'epoch

  0%|          | 0/965 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.687467098236084, 'eval_wer': 54.15331344163259, 'eval_runtime': 1656.7347, 'eval_samples_per_second': 4.656, 'eval_steps_per_second': 0.582, 'epoch': 5.78}


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


{'train_runtime': 22943.0769, 'train_samples_per_second': 5.579, 'train_steps_per_second': 0.174, 'train_loss': 0.5698619546890259, 'epoch': 5.78}


TrainOutput(global_step=4000, training_loss=0.5698619546890259, metrics={'train_runtime': 22943.0769, 'train_samples_per_second': 5.579, 'train_steps_per_second': 0.174, 'total_flos': 8.300458008576e+18, 'train_loss': 0.5698619546890259, 'epoch': 5.776173285198556})

In [34]:
trainer.evaluate(language="cs")

  0%|          | 0/965 [00:00<?, ?it/s]

{'eval_loss': 0.687467098236084,
 'eval_wer': 54.15331344163259,
 'eval_runtime': 1506.2654,
 'eval_samples_per_second': 5.121,
 'eval_steps_per_second': 0.641,
 'epoch': 5.776173285198556}

In [4]:
print("""
whisper-bs-cs-train-noaug-test-noaug 4000 steps performance:
'eval_loss': 0.3769497871398926, 'eval_wer': 35.114377279257376, 'eval_runtime': 1471.0067, 'eval_samples_per_second': 5.244, 'eval_steps_per_second': 0.656, 'epoch': 5.78
'train_runtime': 20830.4373, 'train_samples_per_second': 6.145, 'train_steps_per_second': 0.192, 'train_loss': 0.25434225314855574, 'epoch': 5.78
""")

print("""
whisper-base-cs-cv11-train-aug-test-aug 4000 steps performance:      
'eval_loss': 0.687467098236084, 'eval_wer': 54.15331344163259, 'eval_runtime': 1656.7347, 'eval_samples_per_second': 4.656, 'eval_steps_per_second': 0.582, 'epoch': 5.78
'train_runtime': 22943.0769, 'train_samples_per_second': 5.579, 'train_steps_per_second': 0.174, 'train_loss': 0.5698619546890259, 'epoch': 5.78
""")

print("""
whisper-base-cs-cv11-train-stretch20-gain10-pitch20-gaussian20-lowpass10-toest-noaug 4000 steps performance:
'eval_loss': 0.3659650683403015, 'eval_wer': 35.235937672671014, 'eval_runtime': 1270.0477, 'eval_samples_per_second': 6.074, 'eval_steps_per_second': 0.76, 'epoch': 5.78
'train_runtime': 17820.5123, 'train_samples_per_second': 7.183, 'train_steps_per_second': 0.224, 'train_loss': 0.4966413743495941, 'epoch': 5.78
"""
)

print("""
whisper-base-cs-cv11-train-noaug-test-timestretch20-gain10-pitch20-gaussian20-lowpass01-mp3compression01 4000 steps performance:
'eval_loss': 1.0829871892929077, 'eval_wer': 65.93546248204221, 'eval_runtime': 8263.055, 'eval_samples_per_second': 0.934, 'eval_steps_per_second': 0.117, 'epoch': 5.78
'train_runtime': 76764.5105, 'train_samples_per_second': 1.667, 'train_steps_per_second': 0.052, 'train_loss': 0.2543431313931942, 'epoch': 5.78
""")


whisper-bs-cs-train-noaug-test-noaug 4000 steps performance:
'eval_loss': 0.3769497871398926, 'eval_wer': 35.114377279257376, 'eval_runtime': 1471.0067, 'eval_samples_per_second': 5.244, 'eval_steps_per_second': 0.656, 'epoch': 5.78
'train_runtime': 20830.4373, 'train_samples_per_second': 6.145, 'train_steps_per_second': 0.192, 'train_loss': 0.25434225314855574, 'epoch': 5.78


whisper-base-cs-cv11-train-aug-test-aug 4000 steps performance:      
'eval_loss': 0.687467098236084, 'eval_wer': 54.15331344163259, 'eval_runtime': 1656.7347, 'eval_samples_per_second': 4.656, 'eval_steps_per_second': 0.582, 'epoch': 5.78
'train_runtime': 22943.0769, 'train_samples_per_second': 5.579, 'train_steps_per_second': 0.174, 'train_loss': 0.5698619546890259, 'epoch': 5.78


whisper-base-cs-cv11-train-stretch20-gain10-pitch20-gaussian20-lowpass10-toest-noaug 4000 steps performance:
'eval_loss': 0.3659650683403015, 'eval_wer': 35.235937672671014, 'eval_runtime': 1270.0477, 'eval_samples_per_second': 6.0

In [38]:
trainer.push_to_hub()

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


events.out.tfevents.1714514040.asus-lv.49669.1:   0%|          | 0.00/406 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/LadislavVasina1/whisper-bs-cs-train-aug-test-aug2/commit/03ff3564f5ad273982ea93a151d658878e0da4df', commit_message='End of training', commit_description='', oid='03ff3564f5ad273982ea93a151d658878e0da4df', pr_url=None, pr_revision=None, pr_num=None)

## Building a Demo

In [39]:
from transformers import pipeline
import gradio as gr


pipe = pipeline(model=f"LadislavVasina1/{CURRENT_RUN_NAME}")

def transcribe(audio):
    """Transcribe `audio` with the module-level ASR `pipe` and return the text.

    In the demo, Gradio passes the recording as a file path
    (`gr.Audio(type="filepath")`); it is forwarded to the pipeline unchanged.
    """
    return pipe(audio)["text"]

gradio_app = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Whisper Base CS",
    description=CURRENT_RUN_NAME,
)

gradio_app.launch()

config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/290M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.82k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


preprocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


