## initial checks

Check that we are using a CPU

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sun Dec 10 01:28:23 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    22W / 300W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

Install dependencies

In [None]:
!add-apt-repository -y ppa:jonathonf/ffmpeg-4
!apt update
!apt install -y ffmpeg

In [None]:
!pip install datasets>=2.6.1
!pip install git+https://github.com/huggingface/transformers
!pip install librosa
!pip install evaluate>=0.30
!pip install jiwer
!pip install gradio
!pip install hopsworks

Log in to huggingface hub with read premission

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Get features to google drive

The first step is to get our data, prepare features, and store these.

In [None]:
from datasets import load_dataset, DatasetDict

common_voice = DatasetDict()

common_voice["train"] = load_dataset("mozilla-foundation/common_voice_11_0", "sv-SE", split="train+validation", use_auth_token=True)
common_voice["test"] = load_dataset("mozilla-foundation/common_voice_11_0", "sv-SE", split="test", use_auth_token=True)

print(common_voice)

In [None]:
common_voice = common_voice.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"])

print(common_voice)

Prepare FeatureExtractor, Tokenizer and Processor

In [None]:
from transformers import WhisperFeatureExtractor
from transformers import WhisperTokenizer

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Swedish", task="transcribe")

In [None]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Swedish", task="transcribe")

In [None]:
print(common_voice["train"][0])
from datasets import Audio
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))
print(common_voice["train"][0])

In [None]:
def prepare_dataset(batch):
    # load and resample audio data from 48 to 16kHz
    audio = batch["audio"]

    # compute log-Mel input features from input audio array
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # encode target text to label ids
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch

In [None]:
common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names["train"], num_proc=2)

In [None]:
import os
print(os.getcwd())
print(os.listdir())

Mount google drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
common_voice.save_to_disk('/content/drive/MyDrive/ID2223/common_voice')

In [None]:
# print(os.listdir("/content/drive/MyDrive/ID2223"))
# print(os.listdir("/content/drive/MyDrive/ID2223/common_voice"))
# print(os.listdir("/content/drive/MyDrive/ID2223"))

Check that we can load the data

In [None]:
cc = DatasetDict.load_from_disk("/content/drive/MyDrive/ID2223/common_voice")
cc

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 12360
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 5069
    })
})

## Training & Eval


In [None]:
!pip install datasets>=2.6.1
!pip install git+https://github.com/huggingface/transformers
!pip install librosa
!pip install evaluate>=0.30
!pip install jiwer
!pip install gradio
!pip install hopsworks
!pip install evaluate
!pip install accelerate -U
# session sometimes needs to be restarted?

Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0


In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [None]:
from transformers import WhisperFeatureExtractor
from transformers import WhisperTokenizer

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Swedish", task="transcribe")

In [None]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Swedish", task="transcribe")

In [None]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [None]:
import evaluate

metric = evaluate.load("wer")

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [None]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [None]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.84k [00:00<?, ?B/s]

In [None]:
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

Mount drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Specify trainer arguments

In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    num_train_epochs=1,
    output_dir="/content/drive/MyDrive/ID2223/swedish_m_2",  # change to a repo name of your choice
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=250,
    max_steps=2000,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=250,
    eval_steps=250,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)

In [None]:
"""
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    num_train_epochs=1,
    output_dir="/content/drive/MyDrive/ID2223/swedish_training",  # change to a repo name of your choice
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="no",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1,
    eval_steps=1,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=False,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)
"""

Load dataset

In [None]:
from datasets import load_from_disk
common_voice_reloaded = load_from_disk('/content/drive/MyDrive/ID2223/common_voice')
common_voice_reloaded

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 12360
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 5069
    })
})

log in with write premission

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

put it all in the trainer

In [None]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice_reloaded["train"],
    eval_dataset=common_voice_reloaded["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

save the processor (as the processor is static/ wont change with training)

In [None]:
processor.save_pretrained(training_args.output_dir)

paste to browser console \n
another tip is https://nosleep.page/

---



In [None]:
"""
function ConnectButton(){
    console.log("Connect pushed");
    document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click()
}
setInterval(ConnectButton, 60000);
"""

Only do below this first time use next section/notebook otherwise

In [None]:
trainer.train()

`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss,Wer
250,0.3503,0.375748,33.284935
500,0.3448,0.346996,45.435724




## Resuming training

In [3]:
!pip install datasets>=2.6.1
!pip install git+https://github.com/huggingface/transformers
!pip install librosa
!pip install evaluate>=0.30
!pip install jiwer
!pip install gradio
!pip install hopsworks
!pip install evaluate
!pip install accelerate -U
# session sometimes needs to be restarted? if you are starting from checkpoint

Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0


Imports

In [2]:
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from transformers import WhisperFeatureExtractor
from transformers import WhisperTokenizer
from transformers import WhisperProcessor
import evaluate
from transformers import WhisperForConditionalGeneration
from transformers import Seq2SeqTrainingArguments
from datasets import load_from_disk
from transformers import Seq2SeqTrainer

log in with read and write premission

In [3]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

mount drive

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


change "glob_checkpoint_url" to the most recent model chechpoint.
glob_resume_url wont change unless you want to change model/repo

In [6]:
glob_resume_url = "/content/drive/MyDrive/ID2223/swedish_m_2"
glob_checkpoint_url = "/content/drive/MyDrive/ID2223/swedish_m_2/checkpoint-1500"

Re-run some of the code (since the session timed out)

In [7]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

Load the processor

In [8]:
processor_url = glob_resume_url

In [9]:
feature_extractor = WhisperFeatureExtractor.from_pretrained(processor_url)
tokenizer = WhisperTokenizer.from_pretrained(processor_url, language="Swedish", task="transcribe")
processor = WhisperProcessor.from_pretrained(processor_url, language="Swedish", task="transcribe")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


re-run some more

In [10]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

update to the most recent url

In [11]:
checkpoint_model_url = glob_checkpoint_url

load the most recent model

In [12]:
model = WhisperForConditionalGeneration.from_pretrained(checkpoint_model_url)
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

Re-run some more code, load dataset

In [13]:
training_args = Seq2SeqTrainingArguments(
    num_train_epochs=1,
    output_dir="/content/drive/MyDrive/ID2223/swedish_m_2",  # change to a repo name of your choice
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=250,
    max_steps=2000,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=250,
    eval_steps=250,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)

common_voice_reloaded = load_from_disk('/content/drive/MyDrive/ID2223/common_voice')

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice_reloaded["train"],
    eval_dataset=common_voice_reloaded["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

to see that we start from checkpoint

In [14]:
import transformers
transformers.logging.set_verbosity_info()

Resume training

In [15]:
checkpoint_trainer_url = glob_checkpoint_url

In [16]:
trainer.train(resume_from_checkpoint=checkpoint_trainer_url)


Loading model from /content/drive/MyDrive/ID2223/swedish_m_2/checkpoint-1500.
There were missing keys in the checkpoint model loaded: ['proj_out.weight'].
***** Running training *****
  Num examples = 12,360
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2,000
  Number of trainable parameters = 240,582,912
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 0
  Continuing training from global step 1500
  Will skip the first 0 epochs then the first 1500 batches in the first epoch.
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss,Wer
1750,0.114,0.28217,67.080745
2000,0.1082,0.278629,74.343255


***** Running Evaluation *****
  Num examples = 5069
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/ID2223/swedish_m_2/tmp-checkpoint-1750
Configuration saved in /content/drive/MyDrive/ID2223/swedish_m_2/tmp-checkpoint-1750/config.json
Configuration saved in /content/drive/MyDrive/ID2223/swedish_m_2/tmp-checkpoint-1750/generation_config.json
Model weights saved in /content/drive/MyDrive/ID2223/swedish_m_2/tmp-checkpoint-1750/pytorch_model.bin
Feature extractor saved in /content/drive/MyDrive/ID2223/swedish_m_2/tmp-checkpoint-1750/preprocessor_config.json
Feature extractor saved in /content/drive/MyDrive/ID2223/swedish_m_2/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 5069
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/ID2223/swedish_m_2/tmp-checkpoint-2000
Configuration saved in /content/drive/MyDrive/ID2223/swedish_m_2/tmp-checkpoint-2000/config.json
Configuration saved in /content/drive/MyDrive/ID2223/swedish_m_2/tmp-

TrainOutput(global_step=2000, training_loss=0.029645836114883424, metrics={'train_runtime': 8444.1157, 'train_samples_per_second': 1.895, 'train_steps_per_second': 0.237, 'total_flos': 4.61736640512e+18, 'train_loss': 0.029645836114883424, 'epoch': 1.29})

## Interface

COde for the Interface at huggingface spaces

In [None]:
from transformers import pipeline
import gradio as gr

model_id = "karl-sim/swedish_m_2"
pipe = pipeline(model=model_id)

def transcribe(audio):
    text = pipe(audio)["text"]
    return text

mic_transcribe = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs="text",
    title="Whisper Small microphone",
    description="Realtime demo for Swedish speech recognition using a fine-tuned Whisper small model.",
)

vid_transcribe = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs="text",
    title="Whisper Small upload",
    description="Demo for Swedish speech recognition using a fine-tuned Whisper small model.",
)

with gr.Blocks() as demo:
    gr.TabbedInterface(
        [mic_transcribe, vid_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )

demo.launch(debug=True, share=True)