# Sanity Check
To prevent confusion further down in the notebook, a simple sanity check keeps track of what has been run and what has not.

First of all, make sure that no GPU is attached to the runtime (we don't want to waste GPU time on feature engineering).

In [1]:
# Registry of completed notebook steps: requirement name -> True
sanity_prop = dict()


def sanity_check(keys):
    """Verify that every listed requirement has been recorded.

    Args:
        keys: iterable of requirement names to look up in ``sanity_prop``.

    Raises:
        Exception: for the first name in ``keys`` that is not present in
            ``sanity_prop``.
    """
    for required in keys:
        if required in sanity_prop:
            continue
        raise Exception(f"Requirement: '{required}' not satisfied!")

def sanity_check_not_added(keys):
    """Verify that none of the listed requirements has been recorded yet.

    Used to stop a cell from being executed a second time.

    Args:
        keys: iterable of requirement names to look up in ``sanity_prop``.

    Raises:
        Exception: for the first name in ``keys`` that is already present in
            ``sanity_prop``.
    """
    for forbidden in keys:
        if forbidden not in sanity_prop:
            continue
        raise Exception(f"Requirement: '{forbidden}' already satisfied!")

# Requirement keys recorded in sanity_prop, in the order the notebook sets them:
# should-not-use-gpu
# settings-set
# libraries-are-installed
# logged-in-to-huggingface
# whisper-loaded
# commonvoice-downloaded
# columns-filtered
# audio-resampled

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0 or "command not found" in gpu_info:
  sanity_prop['should-not-use-gpu'] = True

# Settings

In [4]:
sanity_check(["should-not-use-gpu"])

# ----------------------------------------------------------------------------
# Model
# ----------------------------------------------------------------------------
# Option A (disabled): start from the multilingual checkpoint
# pretrained_model = "openai/whisper-small"
# pretrained_model_language = "Swedish"
# pretrained_model_language_training = "Swedish"
# pretrained_model_task = "transcribe"
# pretrained_model_sampling_rate = 16000  # Hz

# Option B (active): start from the English checkpoint
pretrained_model = "openai/whisper-small.en"
pretrained_model_language = "Swedish"
pretrained_model_language_training = "Swedish"
pretrained_model_task = "transcribe"
pretrained_model_sampling_rate = 16_000  # Hz

# ----------------------------------------------------------------------------
# Dataset (Common Voice)
# ----------------------------------------------------------------------------
dataset_repo = "mozilla-foundation/common_voice_14_0"
dataset_version = "sv-SE"
# Columns removed from the dataset before feature extraction
dataset_column_filter = [
    "accent",
    "age",
    "client_id",
    "down_votes",
    "gender",
    "locale",
    "path",
    "segment",
    "up_votes",
    "variant",  # 'variant' is new in v13
]

# ----------------------------------------------------------------------------
# Task: Hub repository that receives the processed dataset
# ----------------------------------------------------------------------------
# Option A (disabled): from the multilingual checkpoint
# task_repo_dataset = "GroupSix/common-voice-sv"
# Option B (active): from the English checkpoint
task_repo_dataset = "GroupSix/common-voice-en-sv"

sanity_prop["settings-set"] = True

# Install Libraries
Install necessary libraries

In [3]:
sanity_check(['should-not-use-gpu', 'settings-set'])
sanity_check_not_added(['libraries-are-installed'])  # Don't rerun

# Install updated libaries
!pip install datasets>=2.6.1
!pip install git+https://github.com/huggingface/transformers
!pip install librosa
!pip install evaluate>=0.30
!pip install jiwer
!pip install gradio
!pip install accelerate==0.23.0
!pip install datasets transformers[sentencepiece]

sanity_prop['libraries-are-installed'] = True

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-z0og3khc
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-z0og3khc
  Resolved https://github.com/huggingface/transformers to commit e5079b0b2abcef11ecbdae60ba4a6636c57b725d
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.36.0.dev0-py3-none-any.whl size=8217653 sha256=f5af9cb563f583c6060888ba362151f5cccc33a0f38b433cbf508c5168ad0bbc
  Stored in directory: /tmp/pip-ephem-wheel-cache-nszvcyd3/wheels/c0/14/d6/6c9a5582d2ac191ec0a483be151a4495fe1eb2a6706ca49f1b
Successfully built transformers

# Hugging Face Login
Log in to Hugging Face so that the processed dataset can be saved to the Hub.

In [5]:
sanity_check([
    'should-not-use-gpu',
    'settings-set',
    'libraries-are-installed',
])

# Show the Hugging Face login widget inside the notebook
from huggingface_hub import notebook_login
notebook_login()

# NOTE(review): this flag is set as soon as notebook_login() returns; it does
# not verify that a token was actually submitted - confirm before relying on it.
sanity_prop['logged-in-to-huggingface'] = True

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Feature Engineering
Choose the model, the dataset to train/test on and their settings

## Feature Extractor, Tokenizer

In [6]:
sanity_check([
    'should-not-use-gpu',
    'settings-set',
    'libraries-are-installed',
    'logged-in-to-huggingface',
])

from transformers import WhisperFeatureExtractor, WhisperTokenizer

# Feature extractor that belongs to the pre-trained checkpoint
feature_extractor = WhisperFeatureExtractor.from_pretrained(pretrained_model)

# Tokenizer of the same checkpoint, configured with the language and task
# chosen in the settings cell
tokenizer = WhisperTokenizer.from_pretrained(
    pretrained_model,
    language=pretrained_model_language,
    task=pretrained_model_task,
)

sanity_prop['whisper-loaded'] = True

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.41M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.83k [00:00<?, ?B/s]

## Training and Testing Dataset

In [7]:
sanity_check([
    'should-not-use-gpu',
    'settings-set',
    'libraries-are-installed',
    'logged-in-to-huggingface',
    'whisper-loaded',
])

from datasets import load_dataset, DatasetDict

# Common Voice: the "train" entry combines the train and validation splits,
# the "test" entry is the test split.
# NOTE(review): newer `datasets` releases deprecate `use_auth_token` in favour
# of `token` - confirm against the installed version.
common_voice = DatasetDict()
common_voice["train"] = load_dataset(
    dataset_repo,
    dataset_version,
    split="train+validation",
    use_auth_token=True,
)
common_voice["test"] = load_dataset(
    dataset_repo,
    dataset_version,
    split="test",
    use_auth_token=True,
)

sanity_prop['commonvoice-downloaded'] = True



Downloading builder script:   0%|          | 0.00/8.18k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.4k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/67.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.6k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/202M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/141M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/155M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/170M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/44.1M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.75M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.15M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.48M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/324k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]


Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 7503it [00:00, 52226.42it/s]


Generating validation split: 0 examples [00:00, ? examples/s]


Reading metadata...: 5085it [00:00, 72033.98it/s]


Generating test split: 0 examples [00:00, ? examples/s]


Reading metadata...: 5141it [00:00, 81473.72it/s]


Generating other split: 0 examples [00:00, ? examples/s]


Reading metadata...: 6226it [00:00, 70517.44it/s]


Generating invalidated split: 0 examples [00:00, ? examples/s]


Reading metadata...: 1388it [00:00, 62118.61it/s]


## Filter Columns

In [13]:
sanity_check([
    'should-not-use-gpu',
    'settings-set',
    'libraries-are-installed',
    'logged-in-to-huggingface',
    'whisper-loaded',
    'commonvoice-downloaded',
])
# Only meaningful before the columns are filtered; raises once they have been
sanity_check_not_added(['columns-filtered'])

# Display the dataset with all of its original columns
common_voice

Exception: ignored

In [9]:
sanity_check([
    'should-not-use-gpu',
    'settings-set',
    'libraries-are-installed',
    'logged-in-to-huggingface',
    'whisper-loaded',
    'commonvoice-downloaded',
])
sanity_check_not_added(['columns-filtered'])  # Don't rerun

# Drop every column listed in dataset_column_filter
common_voice = common_voice.remove_columns(dataset_column_filter)

sanity_prop['columns-filtered'] = True

In [10]:
sanity_check([
    'should-not-use-gpu',
    'settings-set',
    'libraries-are-installed',
    'logged-in-to-huggingface',
    'whisper-loaded',
    'commonvoice-downloaded',
    'columns-filtered',
])

# Display the dataset again to confirm which columns remain
common_voice

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 12588
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 5141
    })
})

## Resample Audio to Match Model

In [11]:
sanity_check([
    'should-not-use-gpu',
    'settings-set',
    'libraries-are-installed',
    'logged-in-to-huggingface',
    'whisper-loaded',
    'commonvoice-downloaded',
    'columns-filtered',
])
# Only meaningful before resampling; raises once the audio has been resampled
sanity_check_not_added(['audio-resampled'])

# Display the first training example, including its current sampling rate
common_voice["train"][0]

{'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/26a69e6d80dfe65190cacdcf3173f3c82a2b22a0b63375e3b5142cc3a49a48de/sv-SE_train_0/common_voice_sv-SE_21922884.mp3',
  'array': array([ 0.00000000e+00, -4.05294071e-13, -5.49042907e-13, ...,
          1.22343961e-04,  8.68627030e-05,  5.25863652e-05]),
  'sampling_rate': 48000},
 'sentence': 'Gör hon?'}

In [12]:
sanity_check(['should-not-use-gpu', 'settings-set', 'libraries-are-installed', 'logged-in-to-huggingface', 'whisper-loaded', 'commonvoice-downloaded',
              'columns-filtered'])
# Don't rerun: the mapping step at the end of this cell removes the 'audio'
# column, so a second cast_column("audio", ...) could not succeed.
sanity_check_not_added(['audio-resampled'])

# Downsample to pretrained_model_sampling_rate (16 kHz)
from datasets import Audio
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=pretrained_model_sampling_rate))

# Check: a bare expression in the middle of a cell is never displayed, so
# verify the sampling rate of the first example explicitly instead.
assert common_voice["train"][0]["audio"]["sampling_rate"] == pretrained_model_sampling_rate

# Set language for coming data (only needed when the training language differs
# from the language the tokenizer was created with)
if pretrained_model_language != pretrained_model_language_training:
  tokenizer.set_prefix_tokens(language=pretrained_model_language_training)

def prepare_dataset(batch):
    """Turn one dataset example into model inputs.

    Args:
        batch: mapping with an "audio" entry (providing "array" and
            "sampling_rate") and a "sentence" entry.

    Returns:
        The same mapping, extended with "input_features" (first element of the
        feature extractor's ``input_features``) and "labels" (the tokenizer's
        ``input_ids`` for the sentence).
    """
    # Audio entry of the example, read once
    clip = batch["audio"]

    # Log-Mel input features computed from the raw audio array
    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    # Target text encoded to label ids
    encoded = tokenizer(batch["sentence"])
    batch["labels"] = encoded.input_ids

    return batch

# Run prepare_dataset over every split with two worker processes; the columns
# present in the "train" split are dropped from the result.
common_voice = common_voice.map(
    prepare_dataset,
    remove_columns=common_voice.column_names["train"],
    num_proc=2,
)

sanity_prop['audio-resampled'] = True

Map (num_proc=2):   0%|          | 0/12588 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/5141 [00:00<?, ? examples/s]

In [17]:
sanity_check([
    'should-not-use-gpu',
    'settings-set',
    'libraries-are-installed',
    'logged-in-to-huggingface',
    'whisper-loaded',
    'commonvoice-downloaded',
    'columns-filtered',
    'audio-resampled',
])

# Upload the processed dataset to the repository named in the settings cell
common_voice.push_to_hub(task_repo_dataset)

Uploading the dataset shards:   0%|          | 0/25 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/10 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]