<a href="https://colab.research.google.com/github/Losiyu/test/blob/master/dataset_generate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q huggingface_hub torchaudio librosa 
!pip install -q datasets
!pip install -q transformers
!pip install -q jiwer

In [2]:
import os
import re

import datasets
import numpy as np
from datasets import load_dataset

# Hub repository that the processed train/test splits are pushed to.
DATASET_REPO_NAME = "Siyong/speech_millad"

# Hugging Face access token, read from the HF_TOKEN environment variable so the
# secret is never stored in the notebook or its version history.
# SECURITY: a real token used to be hardcoded on this line. It has been
# published together with the notebook and must be revoked in the Hugging Face
# account settings.
# TOKEN is None when HF_TOKEN is unset; the Hub calls further down then receive
# no explicit token (NOTE(review): they will presumably fail on private repos or
# fall back to locally stored credentials - confirm for the installed versions).
TOKEN = os.environ.get("HF_TOKEN")

# Dataset

In [3]:
from datasets import Features, Value, Sequence

# Build the working dataset in one pipeline:
#   1. load the 'train' split of speech.json from the Siyong/speech_v1 repo,
#   2. drop the columns that are not used later on,
#   3. return the 'audio' column in numpy format (other columns pass through),
#   4. cast to an explicit schema so the audio samples are stored as float32.
dataset = (
    load_dataset("Siyong/speech_v1", data_files='speech.json', split='train', use_auth_token=TOKEN)
    .remove_columns(['videoId', 'startTime', 'endTime'])
    .with_format("numpy", columns=["audio"], output_all_columns=True)
    .cast(
        Features({
            "speaker": Value("int64"),
            "sentence": Value("string"),
            "audio": {
                "array": Sequence(Value('float32')),
                "sampling_rate": Value("int64")
            }
        })
    )
)
dataset

Using custom data configuration Siyong--speech_v1-da43cd8f9170ea46
Reusing dataset json (/root/.cache/huggingface/datasets/json/Siyong--speech_v1-da43cd8f9170ea46/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/Siyong--speech_v1-da43cd8f9170ea46/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b/cache-ee9e2efb57a07191.arrow


Dataset({
    features: ['speaker', 'sentence', 'audio'],
    num_rows: 1847
})

In [4]:
# filter
def applyFilters(batch, *, minlen=0.2, maxlen=4.0, sample_rate=16_000):
  """Decide whether a single example stays in the dataset.

  An example is kept only when all three conditions hold:
    1. its audio has strictly more than ``minlen`` and strictly fewer than
       ``maxlen`` seconds' worth of samples (too-short AND too-long clips are
       dropped);
    2. its sentence contains no '<', i.e. no tag such as <unintelligible>;
    3. its speaker id is 0.

  Args:
    batch: one example providing batch['audio']['array'], batch['sentence']
      and batch['speaker'].
    minlen: lower duration bound in seconds (exclusive).
    maxlen: upper duration bound in seconds (exclusive).
    sample_rate: samples per second used to turn the sample count into a
      duration. The example's own 'sampling_rate' field is not consulted.

  Returns:
    A plain Python bool: True to keep the example, False to drop it.
  """
  num_samples = len(batch['audio']['array'])
  # 1. keep only clips whose length lies strictly between the two bounds
  duration_ok = minlen * sample_rate < num_samples < maxlen * sample_rate
  # 2. remove sentences carrying a tag such as <unintelligible>
  transcript_ok = '<' not in batch['sentence']
  # 3. keep speaker 0 only (the original note says this removes customer speech)
  speaker_ok = batch['speaker'] == 0
  # bool() so a numpy scalar from the comparison above never leaks to the caller
  return bool(duration_ok and transcript_ok and speaker_ok)
# Keep only the examples accepted by applyFilters (run on 4 worker processes).
dataset = dataset.filter(function=applyFilters, num_proc=4)
# 'speaker' was only needed by the filter; the guard makes re-running this
# cell harmless once the column is already gone.
if 'speaker' in dataset.features:
  dataset = dataset.remove_columns(['speaker'])
dataset



Dataset({
    features: ['sentence', 'audio'],
    num_rows: 583
})

In [5]:
# map
def applyMaps(batch):
  """Normalise the transcript of a single example.

  Strips the punctuation characters listed in the character class below,
  lower-cases what remains and appends one trailing space.

  Args:
    batch: one example with a string under batch["sentence"].

  Returns:
    The same ``batch`` object, with batch["sentence"] overwritten in place.
  """
  # 1. remove special characters
  # Raw string: the old non-raw literal relied on invalid escape sequences
  # (\, \? \. ...), which Python reports as a Deprecation/SyntaxWarning. Inside
  # a character class only '-' needs escaping, so the set of matched characters
  # is unchanged.
  chars_to_ignore_regex = r'[,?.!\-;:"“%‘”�]'
  batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
  return batch
# Normalise every transcript with applyMaps, using 4 worker processes.
dataset = dataset.map(function=applyMaps, num_proc=4)
dataset

Dataset({
    features: ['sentence', 'audio'],
    num_rows: 583
})

In [6]:
import IPython.display as ipd

# Spot-check one example: print its transcript, then render an audio player
# for its waveform as the cell output.
sample = dataset[5]
print('sentence:', sample['sentence'])

array = np.array(sample['audio']['array'])
# Playback rate is fixed at 16000 Hz here, not read from the example.
ipd.Audio(data=array, rate=16000, autoplay=True)

sentence: thanks for coming 


In [7]:
# Split
# Hold out 20% of the examples as the test set. The fixed seed makes the
# shuffle, and therefore the splits pushed to the Hub, identical on every
# Restart & Run All instead of changing from run to run.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'audio'],
        num_rows: 466
    })
    test: Dataset({
        features: ['sentence', 'audio'],
        num_rows: 117
    })
})

In [8]:
# Upload both splits to the Hub repository named in DATASET_REPO_NAME.
# Per the recorded output, `private` is ignored when the repository already
# exists.
dataset.push_to_hub(repo_id=DATASET_REPO_NAME, private=True, token=TOKEN)

Pushing split train to the Hub.
The repository already exists: the `private` keyword argument will be ignored.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split test to the Hub.
The repository already exists: the `private` keyword argument will be ignored.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]