In [1]:
# Install ffmpeg and other dependencies
!add-apt-repository -y ppa:jonathonf/ffmpeg-4
!apt update
!apt install -y ffmpeg
!pip install datasets>=2.6.1
!pip install git+https://github.com/huggingface/transformers
!pip install librosa
!pip install evaluate>=0.30
!pip install jiwer
!pip install gradio
!pip install hopsworks

Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:5 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:6 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:7 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:8 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Get:9 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [83.3 kB]
Hit:10 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:11 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Hit:12 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Hit:13 http://ppa.launchpad.net/jonathonf

In [2]:
# Setup HuggingFace and Hopsworks
# Both logins are interactive, so this cell needs user input when it runs.
from huggingface_hub import notebook_login
import hopsworks

# Displays the Hugging Face login widget in the cell output. The dataset
# downloads later in this notebook pass use_auth_token=True, which presumably
# relies on the token stored by this login — TODO confirm.
notebook_login()
# Log in to Hopsworks. `project` is the handle used later to obtain the
# dataset API for uploading the processed dataset.
project = hopsworks.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/5274


In [3]:
# Download and initialize dataset
from datasets import load_dataset, DatasetDict, Dataset
import pandas as pd

common_voice = DatasetDict()

# Swedish Common Voice 11.0: train and validation are merged into one training
# split; the official test split is kept for evaluation.
common_voice["train"] = load_dataset("mozilla-foundation/common_voice_11_0", "sv-SE", split="train+validation", use_auth_token=True)
common_voice["test"] = load_dataset("mozilla-foundation/common_voice_11_0", "sv-SE", split="test", use_auth_token=True)

# NOTE(review): building DataFrames from the datasets presumably materializes
# every row (including audio) in memory — confirm this fits on the target machine.
df_test = pd.DataFrame(common_voice['test'])
df_train = pd.DataFrame(common_voice['train'])

# Drop 10% of the training data to handle size issues
# (fixed random_state keeps the retained subset reproducible).
df_train = df_train.sample(frac=0.9, random_state=42)

common_voice = DatasetDict(
    {
        # preserve_index=False: after .sample() the train frame has a shuffled,
        # non-Range index that from_pandas would otherwise store as an extra
        # "__index_level_0__" column. That column would exist only in the train
        # split, so any later step that removes column_names["train"] from
        # every split would fail on the test split.
        "train": Dataset.from_pandas(df_train, preserve_index=False),
        "test": Dataset.from_pandas(df_test, preserve_index=False),
    }
)

# Drop metadata columns that are not referenced by the preprocessing step;
# the remaining columns are the ones not listed here.
common_voice = common_voice.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"])



In [None]:
# Transform dataset to match the format of the pretrained model
from transformers import WhisperFeatureExtractor
from transformers import WhisperTokenizer
from transformers import WhisperProcessor
from datasets import Audio

# Preparing function to transform dataset
def prepare_dataset(batch):
    """Turn one dataset example into model inputs and label ids.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries)
    and ``batch["sentence"]``, and writes two new entries into the same
    mapping:

    * ``"input_features"`` — first element of the ``input_features`` returned
      by the module-level ``feature_extractor``.
    * ``"labels"`` — ``input_ids`` returned by the module-level ``tokenizer``
      for the sentence.

    Returns the same ``batch`` object, modified in place.
    """
    # Access the audio entry exactly once. The notebook casts this column to
    # 16 kHz before mapping, so decoding/resampling presumably happens on this
    # access — TODO confirm.
    clip = batch["audio"]

    # Log-Mel input features computed from the raw waveform.
    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    # Target transcription encoded as token ids.
    encoded = tokenizer(batch["sentence"])
    batch["labels"] = encoded.input_ids

    return batch

# Initialize the Whisper preprocessing objects from the "openai/whisper-small"
# checkpoint. The tokenizer and processor are configured for Swedish transcription.
tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-small",
    language="Swedish",
    task="transcribe",
)
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    "openai/whisper-small",
)
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small",
    language="Swedish",
    task="transcribe",
)

# Transform dataset:
# 1) declare the audio column as 16 kHz audio,
common_voice = common_voice.cast_column(
    "audio",
    Audio(sampling_rate=16000),
)
# 2) run prepare_dataset over every example in two worker processes, dropping
#    all columns that existed before the map (taken from the train split).
common_voice = common_voice.map(
    prepare_dataset,
    remove_columns=common_voice.column_names["train"],
    num_proc=2,
)

  

In [None]:
# Save the dataset to disk
import os

# Single source of truth for where the processed dataset lives. The dataset is
# saved here AND uploaded from here, so the upload can never point at a
# directory that was not written by this run.
# NOTE(review): assumes Google Drive is already mounted at /content/gdrive;
# no mount call is visible in this notebook — confirm before running.
DATASET_DIR = "/content/gdrive/My Drive/common_voice/"

# Destination directory in the Hopsworks project file system.
HOPSWORKS_DATASET_DIR = "/Projects/nathanotal/Voice/"


def upload_directory(dataset_api, local_dir, remote_dir):
    """Upload every file under ``local_dir`` to ``remote_dir``, keeping sub-folders.

    Walking the directory (instead of listing file names by hand) keeps the
    upload correct regardless of how the saved dataset names or shards its
    ``.arrow`` files.

    Args:
        dataset_api: Object exposing ``upload(local_path, upload_path, overwrite)``,
            as returned by ``project.get_dataset_api()``.
        local_dir: Local directory to mirror.
        remote_dir: Remote directory that receives the contents of ``local_dir``.
            Files in ``local_dir/<sub>/`` go to ``remote_dir/<sub>/``.
            Presumably the remote directories must already exist, as with the
            hand-written upload calls this replaces — TODO confirm.

    Returns:
        List of the values returned by ``dataset_api.upload``, in a
        deterministic (sorted) traversal order.

    Raises:
        FileNotFoundError: If ``local_dir`` is not an existing directory
            (``os.walk`` would otherwise silently upload nothing).
    """
    if not os.path.isdir(local_dir):
        raise FileNotFoundError(f"Dataset directory not found: {local_dir}")

    remote_root = remote_dir.rstrip("/")
    uploaded = []
    for current_dir, subdirs, filenames in os.walk(local_dir):
        subdirs.sort()  # in-place sort makes os.walk descend in a stable order
        relative = os.path.relpath(current_dir, local_dir)
        if relative == os.curdir:
            target = remote_root + "/"
        else:
            # Remote paths always use forward slashes.
            target = remote_root + "/" + relative.replace(os.sep, "/") + "/"
        for filename in sorted(filenames):
            uploaded.append(
                dataset_api.upload(
                    local_path=os.path.join(current_dir, filename),
                    upload_path=target,
                    overwrite=True,
                )
            )
    return uploaded


# Save your dataset to google drive
# (reload later with DatasetDict.load_from_disk(DATASET_DIR)).
common_voice.save_to_disk(DATASET_DIR)

# Upload the saved dataset (dataset_dict.json plus the train/ and test/
# folders with their state, info and .arrow files) to Hopsworks.
dataset_api = project.get_dataset_api()
uploaded_paths = upload_directory(dataset_api, DATASET_DIR, HOPSWORKS_DATASET_DIR)

# Print the paths to the uploaded files
print(*uploaded_paths)
