In [None]:
from huggingface_hub import notebook_login

# Authenticate this notebook session with the Hugging Face Hub.
# The load_dataset calls further down pass use_auth_token=True, so they
# presumably rely on the credentials stored by this login — confirm.
notebook_login()

In [None]:
import hopsworks

# Log in to Hopsworks. The returned project handle is used in the last cell
# to obtain the dataset API for uploading the processed dataset.
project = hopsworks.login()

In [None]:
# Constants
# Dataset repository and configuration name passed to load_dataset() below.
DATASET_PATH = "mozilla-foundation/common_voice_11_0"
DATASET_NAME = "sv-SE"  # presumably the Swedish subset; matches language="Swedish" used for the tokenizer

# Checkpoint from which the feature extractor, tokenizer and processor are loaded.
PRETRAINED_MODEL_PATH = "openai/whisper-small"

# Download data

In [None]:
from datasets import load_dataset, DatasetDict

# Build both splits in one expression. The training split is the
# concatenation of the "train" and "validation" splits; "test" is kept
# separate for evaluation. use_auth_token=True is passed for both downloads.
common_voice = DatasetDict(
    {
        "train": load_dataset(
            DATASET_PATH,
            DATASET_NAME,
            split="train+validation",
            use_auth_token=True,
        ),
        "test": load_dataset(
            DATASET_PATH,
            DATASET_NAME,
            split="test",
            use_auth_token=True,
        ),
    }
)

print(common_voice)

In [None]:
# The preprocessing below only reads the "audio" and "sentence" columns,
# so the speaker/clip metadata columns are removed from the DatasetDict.
common_voice = common_voice.remove_columns(
    [
        "accent",
        "age",
        "client_id",
        "down_votes",
        "gender",
        "locale",
        "path",
        "segment",
        "up_votes",
    ]
)

# Feature extraction
## Initialize:
- Feature Extractor
- Tokenizer
- Processor

In [None]:
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor

# All three components are loaded from the same pretrained checkpoint.

# Used by prepare_dataset to turn raw audio arrays into model input features.
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    PRETRAINED_MODEL_PATH,
)

# Used by prepare_dataset to encode the target sentences into label ids.
tokenizer = WhisperTokenizer.from_pretrained(
    PRETRAINED_MODEL_PATH,
    language="Swedish",
    task="transcribe",
)

# Loaded with the same language/task settings as the tokenizer.
# NOTE(review): not referenced by any other cell visible in this notebook section.
processor = WhisperProcessor.from_pretrained(
    PRETRAINED_MODEL_PATH,
    language="Swedish",
    task="transcribe",
)

## Prepare data

In [None]:
# Inspect data: print the first example of the training split to see which
# fields remain after the column drop (and what the "audio" entry looks like
# before it is cast to 16000 Hz in the next cell).
print(common_voice["train"][0])

In [None]:
# Downsample to 16 kHz (16000 Hz)
from datasets import Audio

# Re-declare the "audio" column with a 16000 Hz sampling rate
# (presumably the rate the Whisper feature extractor expects — confirm).
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

## Process data

In [None]:
def prepare_dataset(batch):
    """Convert one dataset example into model inputs and labels.

    Despite the parameter name, this is applied to a single example: it reads
    one "audio" entry and one "sentence" string.

    Args:
        batch: Mapping with an "audio" entry (providing "array" and
            "sampling_rate") and a "sentence" string.

    Returns:
        The same mapping, with two keys added:
        "input_features" (first element of the feature extractor output) and
        "labels" (token ids of the sentence).
    """
    # Reading the "audio" column yields the decoded clip; the column was cast
    # to 16000 Hz earlier, so the clip is presumably already resampled here.
    clip = batch["audio"]

    # The extractor returns a batch of features; only one clip was passed in,
    # so take the first (and only) element.
    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    # Encode the transcription text into label ids.
    encoded_sentence = tokenizer(batch["sentence"])
    batch["labels"] = encoded_sentence.input_ids

    return batch
# Apply prepare_dataset to the whole DatasetDict using 8 worker processes.
# The pre-existing columns (as listed for the train split) are removed, so
# only the columns added by prepare_dataset should remain afterwards.
common_voice = common_voice.map(
    prepare_dataset,
    num_proc=8,
    remove_columns=common_voice.column_names["train"],
)

In [None]:
# Persist the processed DatasetDict to the local "common_voice" directory;
# the upload cell at the end of the notebook reads its files from there.
common_voice.save_to_disk("common_voice")

In [None]:
# Reload the dataset that was just written to "common_voice".
# NOTE(review): `cc` is not used by any other cell visible here — this looks
# like a round-trip sanity check of the saved files; confirm or remove.
cc = DatasetDict.load_from_disk("common_voice")

In [None]:
# Upload to hopsworks
#
# Every file written by save_to_disk("common_voice") is uploaded, mirroring the
# local layout under Resources/common_voice_se:
#   common_voice/<file>          -> Resources/common_voice_se
#   common_voice/<split>/<file>  -> Resources/common_voice_se/<split>/
#
# The saved directory is walked instead of listing file names by hand: the
# name and number of the Arrow data files depend on the installed `datasets`
# version and on the dataset size (e.g. a single "dataset.arrow" versus
# several "data-XXXXX-of-XXXXX.arrow" shards), so a hard-coded list can miss
# files or point at files that do not exist.
from pathlib import Path

dataset_api = project.get_dataset_api()

LOCAL_DATASET_DIR = Path("common_voice")
REMOTE_DATASET_DIR = "Resources/common_voice_se"

if not LOCAL_DATASET_DIR.is_dir():
    raise FileNotFoundError(
        f"{LOCAL_DATASET_DIR} does not exist; run the save_to_disk cell first"
    )

# sorted() only makes the upload order deterministic between runs.
for local_file in sorted(p for p in LOCAL_DATASET_DIR.rglob("*") if p.is_file()):
    relative_dir = local_file.parent.relative_to(LOCAL_DATASET_DIR)
    if relative_dir.parts:
        # File inside a split directory, e.g. "train" or "test".
        remote_dir = f"{REMOTE_DATASET_DIR}/{relative_dir.as_posix()}/"
    else:
        # Top-level file such as dataset_dict.json.
        remote_dir = REMOTE_DATASET_DIR
    dataset_api.upload(f"./{local_file.as_posix()}", remote_dir, overwrite=True)