In [1]:
# Preliminary
!pip install datasets>=2.6.1 jiwer librosa evaluate>=0.30 git+https://github.com/huggingface/transformers


  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-7bp6gwdf


### Load WhisperFeatureExtractor

In [2]:
from transformers import WhisperFeatureExtractor

# Build the feature extractor from the configuration published with the
# "openai/whisper-small" checkpoint; prepare_dataset uses it to turn raw
# audio arrays into model input features.
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    "openai/whisper-small"
)

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

### Load WhisperTokenizer

In [3]:
from transformers import WhisperTokenizer

# Tokenizer for the same checkpoint, configured for German transcription;
# prepare_dataset uses it to encode the target sentences into label ids.
tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-small",
    task="transcribe",
    language="German",
)

tokenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

### Combine To Create A WhisperProcessor

To simplify using the feature extractor and tokenizer, we can _wrap_
both into a single `WhisperProcessor` class. This processor object
bundles a `WhisperFeatureExtractor` and a `WhisperTokenizer`,
and can be used on the audio inputs and model predictions as required.
In doing so, we only need to keep track of two objects during training:
the `processor` and the `model`:

In [4]:
from transformers import WhisperProcessor

# The language must be a language name Whisper knows ("German"), not a
# country name: "Germany" is rejected by the tokenizer once it has to build
# its language prefix tokens. It also has to match the language used for the
# standalone tokenizer in this notebook so both encode labels the same way.
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small",
    language="German",
    task="transcribe",
)

## Connect to GDrive

In [5]:
# Pick the dataset location depending on where the notebook runs.
try:
    # On Colab: mount Google Drive and read the data from there.
    from google.colab import drive

    drive.mount('/content/drive', force_remount=True)
    path = "/content/drive/MyDrive/Lab2/ColabData/First"
    print("Running on colab")
except (ImportError, KeyError):
    # ImportError: the colab module is not present, so we assume a local run.
    # KeyError: for whatever reason the colab docker container is able to
    # import the module but not execute the mount and throws a KeyError.
    print("running locally")
    path = "/mnt/g/My Drive/Lab2/ColabData/First"

print(path)

Mounted at /content/drive
Running on colab
/content/drive/MyDrive/Lab2/ColabData/First


# Load Data
This takes a few minutes.

TODO: It might make sense to check how much data has already been processed and written before reading all of the "raw" data. For now, reading everything is good enough.

In [6]:
from datasets import load_from_disk

# `path` is chosen by the Drive-mount cell; it points at the raw dataset
# that was previously saved to disk.
print(f"Reading data from {path}")
common_voice = load_from_disk(path)

# Last expression of the cell: shows the splits and their row counts.
common_voice


Reading data from /content/drive/MyDrive/Lab2/ColabData/First


DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 495090
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 16082
    })
})

### Prepare Data

In [7]:
def prepare_dataset(batch):
    """Turn one dataset example into Whisper training inputs.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"``) and
    ``batch["sentence"]``, and adds two entries to the same mapping:

    * ``"input_features"``: the first element of the feature extractor's
      ``input_features`` for the audio array,
    * ``"labels"``: the tokenizer's ``input_ids`` for the sentence.

    Relies on the notebook-level ``feature_extractor`` and ``tokenizer``.
    The function does not resample by itself; it passes along whatever
    sampling rate the audio entry reports (the notebook casts the audio
    column to 16 kHz before mapping).

    Returns:
        The same ``batch`` object, updated in place.
    """
    sample = batch["audio"]

    # Compute the log-Mel input features from the raw waveform.
    extracted = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    # Encode the target text to label ids.
    encoded_sentence = tokenizer(batch["sentence"])
    batch["labels"] = encoded_sentence.input_ids

    return batch

In [8]:
from datasets import Audio

# Declare the audio column as 16 kHz audio so that examples are delivered
# at the sampling rate used throughout this notebook.
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

# Sanity check: show the first example and the size of each split.
for split_name in ("train", "test"):
    split = common_voice[split_name]
    print(split[0])
    print(f"Len: {len(split)}")


{'audio': {'path': 'common_voice_de_17545785.mp3', 'array': array([-1.70530257e-13, -8.52651283e-13, -5.68434189e-13, ...,
       -1.78488335e-11,  3.52997631e-11,  2.02859951e-11]), 'sampling_rate': 16000}, 'sentence': 'Wo ist denn die Fernbedienung?'}
Len: 495090
{'audio': {'path': 'common_voice_de_17922420.mp3', 'array': array([-3.55271368e-14,  4.79616347e-14, -2.13162821e-14, ...,
        1.40587009e-09, -3.40389184e-09,  1.92177829e-10]), 'sampling_rate': 16000}, 'sentence': 'Zieht euch bitte draußen die Schuhe aus.'}
Len: 16082


# Splitting the data for Processing

The roughly 495k training rows are too many to process in a single Google Colab session, so we process them in chunks. (The disk also kept running out of space. I can't explain why yet; I assume it is due to how Jupyter notebooks work, but I will have to look into that.)

First let's prepare a few things:
1. Change the path variable to a different folder
2. Chunk the input into chunks of size 1000
3. Prepare a save_to_drive function based on the chunk of data and the chunk_index.
4. Check if a certain chunk has already been processed
5. Process if not

In [9]:
# Processed chunks go to a separate folder: every "/First" in the raw-data
# path is replaced by "/Second". save_to_drive and is_processed read this
# notebook-level variable.
data_path = path.replace('/First', '/Second')
print(data_path)
# NOTE(review): glob is not used in the visible part of the notebook - confirm before removing.
import glob


def save_to_drive(chunk, chunk_index):
    """Persist one processed chunk below ``data_path`` on Google Drive.

    The chunk is first written to a staging directory and only renamed to
    its final name ``processed_chunk_<chunk_index>.dataset`` once
    ``save_to_disk`` has returned. ``is_processed`` only checks whether the
    final directory exists, so writing in place would make a save that was
    interrupted half-way (for example by a Drive disconnect) look finished,
    and the chunk would be skipped on every later run.

    Args:
        chunk: Object exposing ``save_to_disk(path, max_shard_size=...)``;
            in this notebook the result of ``chunk.map(prepare_dataset, ...)``.
        chunk_index: Value that makes the directory name unique (an int for
            the train split, a string such as ``"test3"`` for the test split).

    Raises:
        OSError: Propagated from the file system or from ``save_to_disk``.
            In that case the final directory is not created or replaced.
    """
    import os
    import shutil

    file_path = os.path.join(data_path, f"processed_chunk_{chunk_index}.dataset")
    # The staging name matches neither "processed_chunk_*" nor "*.dataset",
    # so code that lists finished chunks by pattern never picks it up.
    staging_path = os.path.join(data_path, f"incomplete_chunk_{chunk_index}.partial")

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(data_path, exist_ok=True)

    # Remove what an earlier, interrupted attempt may have left behind.
    if os.path.isdir(staging_path):
        shutil.rmtree(staging_path)

    # Written in the `datasets` on-disk format, i.e. what load_from_disk reads.
    chunk.save_to_disk(staging_path, max_shard_size="2GB")

    # Only replace an existing final directory after the new copy is complete.
    if os.path.isdir(file_path):
        shutil.rmtree(file_path)
    os.replace(staging_path, file_path)
    print(f"Saved chunk {chunk_index} to Google Drive at {file_path}")


def is_processed(chunk_index):
    """Report whether the output of a chunk already exists under ``data_path``.

    Args:
        chunk_index: Identifier used in the name
            ``processed_chunk_<chunk_index>.dataset``.

    Returns:
        bool: True if a file system entry with that name exists.
    """
    from pathlib import Path

    chunk_location = Path(data_path).joinpath(f"processed_chunk_{chunk_index}.dataset")
    return chunk_location.exists()


/content/drive/MyDrive/Lab2/ColabData/Second


Now let's calculate the indices for each of the chunks

In [None]:
# Work on the training part of the split.
common_voice_train = common_voice['train']
chunk_size = 1000
total_rows = len(common_voice_train)

# Ceil division: the last chunk may hold fewer than chunk_size rows.
num_chunks = (total_rows + chunk_size - 1) // chunk_size

# Half-open [first_row, stop_row) row ranges, one per chunk. With ceil
# division every range holds at least one row, so no chunk can be empty.
chunk_bounds = [
    (index * chunk_size, min((index + 1) * chunk_size, total_rows))
    for index in range(num_chunks)
]
chunks = [
    common_voice_train.select(range(first_row, stop_row))
    for first_row, stop_row in chunk_bounds
]


We split the initial dataset into chunks of size 1000.
With 495,090 training rows, that should give us ceil(495090 / 1000) = 496 chunks.
Let's confirm that:

In [None]:
print(len(chunks))

496


And finally put it all together

In [None]:
print("beep")

# `chunks` is reused for the test split further down in the notebook. Make
# sure it currently holds the training split, otherwise test rows would be
# saved under the training chunk names.
assert sum(len(chunk) for chunk in chunks) == len(common_voice_train), (
    "`chunks` does not match the training split - re-run the training chunking cell"
)

total_chunks = len(chunks)
for i, chunk in enumerate(chunks):
    # Resume support: chunks saved by an earlier session are not recomputed.
    if is_processed(i):
        print(f"Chunk {i} is already processed. Skipping...")
        continue
    print(f"Iteration number: {i} of {total_chunks}")

    # Compute the model inputs; the raw columns are dropped from the result.
    processed_chunk = chunk.map(prepare_dataset, remove_columns=chunk.column_names, num_proc=2)

    # Save the processed chunk to Google Drive.
    save_to_drive(processed_chunk, i)


This chunked process takes longer than the "normal" single-pass approach.
However, even the normal approach runs into timeouts on services like Google Colab.


In [18]:
# Same chunking as for the training split, now for the test split.
common_voice_test = common_voice['test']
chunk_size = 1000
total_rows = len(common_voice_test)

# Ceil division: the last chunk may hold fewer than chunk_size rows.
num_chunks = (total_rows + chunk_size - 1) // chunk_size

# Half-open [first_row, stop_row) row ranges, one per chunk. With ceil
# division every range holds at least one row, so no chunk can be empty.
chunk_bounds = [
    (index * chunk_size, min((index + 1) * chunk_size, total_rows))
    for index in range(num_chunks)
]
chunks = [
    common_voice_test.select(range(first_row, stop_row))
    for first_row, stop_row in chunk_bounds
]

print(len(chunks))

17


In [19]:
print("boop")

# `chunks` is also used for the training split earlier in the notebook. Make
# sure it currently holds the test split, otherwise training rows would be
# saved under the "test" chunk names.
assert sum(len(chunk) for chunk in chunks) == len(common_voice_test), (
    "`chunks` does not match the test split - re-run the test chunking cell"
)

total_chunks = len(chunks)
for chunk_number, chunk in enumerate(chunks):
    # Prefix the id with "test" so these chunks get their own file names
    # (processed_chunk_test<N>.dataset) next to the training chunks.
    chunk_id = f"test{chunk_number}"

    # Resume support: chunks saved by an earlier session are not recomputed.
    if is_processed(chunk_id):
        print(f"Chunk {chunk_id} is already processed. Skipping...")
        continue
    print(f"Iteration number: {chunk_id} of {total_chunks}")

    # Compute the model inputs; the raw columns are dropped from the result.
    processed_chunk = chunk.map(prepare_dataset, remove_columns=chunk.column_names, num_proc=2)

    # Save the processed chunk to Google Drive.
    save_to_drive(processed_chunk, chunk_id)

boop
Iteration number: test0 of 496


Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saved chunk test0 to Google Drive at /content/drive/MyDrive/Lab2/ColabData/Second/processed_chunk_test0.dataset
Iteration number: test1 of 496


Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saved chunk test1 to Google Drive at /content/drive/MyDrive/Lab2/ColabData/Second/processed_chunk_test1.dataset
Iteration number: test2 of 496


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saved chunk test2 to Google Drive at /content/drive/MyDrive/Lab2/ColabData/Second/processed_chunk_test2.dataset
Iteration number: test3 of 496


Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saved chunk test3 to Google Drive at /content/drive/MyDrive/Lab2/ColabData/Second/processed_chunk_test3.dataset
Iteration number: test4 of 496


Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saved chunk test4 to Google Drive at /content/drive/MyDrive/Lab2/ColabData/Second/processed_chunk_test4.dataset
Iteration number: test5 of 496


Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saved chunk test5 to Google Drive at /content/drive/MyDrive/Lab2/ColabData/Second/processed_chunk_test5.dataset
Iteration number: test6 of 496


Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saved chunk test6 to Google Drive at /content/drive/MyDrive/Lab2/ColabData/Second/processed_chunk_test6.dataset
Iteration number: test7 of 496


Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saved chunk test7 to Google Drive at /content/drive/MyDrive/Lab2/ColabData/Second/processed_chunk_test7.dataset
Iteration number: test8 of 496


Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saved chunk test8 to Google Drive at /content/drive/MyDrive/Lab2/ColabData/Second/processed_chunk_test8.dataset
Iteration number: test9 of 496


Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saved chunk test9 to Google Drive at /content/drive/MyDrive/Lab2/ColabData/Second/processed_chunk_test9.dataset
Iteration number: test10 of 496


Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saved chunk test10 to Google Drive at /content/drive/MyDrive/Lab2/ColabData/Second/processed_chunk_test10.dataset
Iteration number: test11 of 496


Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saved chunk test11 to Google Drive at /content/drive/MyDrive/Lab2/ColabData/Second/processed_chunk_test11.dataset
Iteration number: test12 of 496


Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saved chunk test12 to Google Drive at /content/drive/MyDrive/Lab2/ColabData/Second/processed_chunk_test12.dataset
Iteration number: test13 of 496


Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saved chunk test13 to Google Drive at /content/drive/MyDrive/Lab2/ColabData/Second/processed_chunk_test13.dataset
Iteration number: test14 of 496


Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saved chunk test14 to Google Drive at /content/drive/MyDrive/Lab2/ColabData/Second/processed_chunk_test14.dataset
Iteration number: test15 of 496


Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saved chunk test15 to Google Drive at /content/drive/MyDrive/Lab2/ColabData/Second/processed_chunk_test15.dataset
Iteration number: test16 of 496


Map (num_proc=2):   0%|          | 0/82 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/82 [00:00<?, ? examples/s]

Saved chunk test16 to Google Drive at /content/drive/MyDrive/Lab2/ColabData/Second/processed_chunk_test16.dataset


# Google Drive sometimes just disconnects :'\(

## Sometimes you need to pass `force_remount=True` in the mounting step to force Google Drive to remount.

OSError: [Errno 107] Transport endpoint is not connected

During handling of the above exception, another exception occurred:

OSError                                   Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/datasets/io/csv.py in write(self)
     91
     92         if isinstance(self.path_or_buf, (str, bytes, os.PathLike)):
---> 93             with open(self.path_or_buf, "wb+") as buffer:
     94                 written = self._write(file_obj=buffer, header=header, index=index, **self.to_csv_kwargs)
     95         else:

OSError: [Errno 107] Transport endpoint is not connected

