In [1]:
# %matplotlib
# !git pull

In [2]:
# !wget http://www.openslr.org/resources/12/dev-clean.tar.gz
# !wget http://www.openslr.org/resources/12/test-clean.tar.gz
# !tar -zxf dev-clean.tar.gz
# !tar -zxf test-clean.tar.gz
# !rm dev-clean.tar.gz test-clean.tar.gz

In [3]:
# !wget https://raw.githubusercontent.com/pyannote/pyannote-audio/master/tutorials/data_preparation/download_ami.sh
# !mkdir ami
# !bash download_ami.sh ami

In [4]:
# !wget https://raw.githubusercontent.com/pyannote/pyannote-audio/master/tutorials/data_preparation/AMI/MixHeadset.development.rttm
# !wget https://raw.githubusercontent.com/pyannote/pyannote-audio/master/tutorials/data_preparation/AMI/MixHeadset.test.rttm
# !wget https://raw.githubusercontent.com/pyannote/pyannote-audio/master/tutorials/data_preparation/AMI/MixHeadset.train.rttm

In [5]:
import os

# Hide every CUDA device from the libraries imported in the following cells
# (presumably so that TensorFlow runs on CPU only - confirm this is intended).
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [6]:
import tensorflow as tf
import malaya_speech.train as train
import numpy as np
import malaya_speech
from tqdm import tqdm
import random

# Escalate division-by-zero and invalid floating-point operations from
# warnings to exceptions, so silent NaN/inf values cannot reach the dataset.
np.seterr(divide='raise', invalid='raise')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [7]:
import librosa

def random_stretch(samples, low = 0.5, high = 1.3):
    """
    Time-stretch a waveform by a randomly drawn rate.

    Parameters
    ----------
    samples: np.ndarray
        Input waveform. It is not modified; a float copy is stretched.
    low: float, optional (default=0.5)
        Lower bound of the uniform distribution the rate is drawn from.
    high: float, optional (default=1.3)
        Upper bound of the uniform distribution the rate is drawn from.

    Returns
    -------
    result: np.ndarray
        Output of `librosa.effects.time_stretch` for the drawn rate.
    """
    rate = np.random.uniform(low = low, high = high)
    # `astype` already returns a fresh array, so no explicit copy is needed.
    # `rate` is passed by keyword: it is keyword-only in librosa >= 0.10 and
    # is accepted by keyword in older releases too.
    return librosa.effects.time_stretch(samples.astype('float'), rate = rate)

def random_pitch(samples, low = 0.5, high = 1.0):
    """
    Resample a waveform at a random speed while keeping its original length.

    A ratio is drawn uniformly from [low, high]; the signal is then read at
    positions 0, 1/ratio, 2/ratio, ... using linear interpolation. The result
    is written into an array with the shape and dtype of `samples`; positions
    that the resampled signal does not reach are left blank.

    Parameters
    ----------
    samples: np.ndarray
        Input waveform. It is not modified.
    low: float, optional (default=0.5)
        Lower bound of the drawn ratio.
    high: float, optional (default=1.0)
        Upper bound of the drawn ratio.

    Returns
    -------
    result: np.ndarray
        Array of the same length and dtype as `samples`.
    """
    n_samples = len(samples)
    ratio = np.random.uniform(low = low, high = high)
    step = 1.0 / ratio

    # Linear interpolation of the signal at the fractional read positions.
    resampled = np.interp(
        np.arange(0, n_samples, step), np.arange(0, n_samples), samples
    )

    # Output buffer: a copy of the input blanked by multiplying with zero.
    output = samples.copy()
    output *= 0

    keep = min(samples.shape[0], resampled.shape[0])
    output[:keep] = resampled[:keep]
    return output

In [8]:
import pandas as pd
import random
import os
from glob import glob
from tqdm import tqdm

In [10]:
# Collect every LibriSpeech .flac recording and keep a random subset of 5000.
# NOTE(review): the path is machine-specific and `random` is not seeded, so
# the subset differs between runs; `random.sample` also raises ValueError if
# fewer than 5000 files are found.
files = glob('/home/husein/speech-bahasa/LibriSpeech/*/*/*/*.flac')
files = random.sample(files, 5000)
len(files)

5000

In [11]:
y, sr = malaya_speech.load(files[0])

In [12]:
import IPython.display as ipd

In [13]:
ipd.Audio(y, rate = sr)

In [14]:
ipd.Audio(random_pitch(y), rate = sr)

In [15]:
# Background-noise clips that the LibriSpeech generator mixes into frames.
# Paths and loaded waveforms get separate names so `noises` only ever holds
# waveforms.
noise_paths = glob('/home/husein/noise/noise/*.wav')
# Keep only files smaller than 10 MB (size measured in units of 1e6 bytes).
noise_paths = [f for f in noise_paths if os.path.getsize(f) / 1e6 < 10]
# `malaya_speech.load` returns a tuple; only its first element is kept.
noises = [malaya_speech.load(n)[0] for n in tqdm(noise_paths)]

100%|██████████| 7035/7035 [00:25<00:00, 281.13it/s]


In [16]:
def generator(sr = 16000):
    """
    Yield labelled 50 ms voice-activity examples built from `files`.

    For every recording:

    1. the waveform is loaded and, with probability 0.4, passed through
       `random_pitch`;
    2. 30 ms frames of the integer version of that same waveform are labelled
       by the WebRTC VAD;
    3. labelled frames are merged by `malaya_speech.group.group_frames`;
    4. with probability 0.2 a random clip from `noises` is mixed into a group;
    5. every group is cut again into 50 ms frames, each inheriting the label
       of its group.

    Parameters
    ----------
    sr: int, optional (default=16000)
        Kept for backward compatibility only. The sample rate returned by
        `malaya_speech.load` is the one actually used.

    Yields
    ------
    example: dict
        {'waveforms': List[float], 'targets': [0 or 1]}
    """
    for file in tqdm(files):
        audio, sample_rate = malaya_speech.load(file)
        audio_int = malaya_speech.astype.float_to_int(audio)
        # The amplitude threshold is taken from the un-augmented signal so the
        # blank tail produced by `random_pitch` cannot drag the quantile down.
        minimum_amplitude = int(np.quantile(np.abs(audio_int), 0.3))

        if random.random() > 0.6:
            audio = random_pitch(audio)
            # Re-derive the integer signal from the augmented audio. Without
            # this the VAD labels are computed on the original timeline while
            # the emitted frames come from the time-compressed one, so labels
            # and audio no longer line up.
            audio_int = malaya_speech.astype.float_to_int(audio)

        vad = malaya_speech.vad.webrtc(minimum_amplitude = minimum_amplitude)
        frames_int = malaya_speech.generator.frames(
            audio_int, 30, sample_rate, False
        )
        frames_float = malaya_speech.generator.frames(
            audio, 30, sample_rate, False
        )
        # Pair each float frame with the VAD decision of its integer twin.
        labelled = [
            (frames_float[no], vad(frame)) for no, frame in enumerate(frames_int)
        ]
        grouped = malaya_speech.group.group_frames(labelled)

        waveforms, targets = [], []
        for g in grouped:
            if random.random() > 0.8:
                # Groups labelled as speech draw the noise factor from
                # [0.1, 0.4], the others from [0.4, 0.9].
                if g[1]:
                    factor = random.uniform(0.1, 0.4)
                else:
                    factor = random.uniform(0.4, 0.9)

                n = random.choice(noises)
                g[0].array = malaya_speech.augmentation.waveform.add_noise(
                    g[0].array, n, factor = factor
                )
            chunks = malaya_speech.generator.frames(
                g[0].array, 50, sample_rate, False
            )
            chunks = [f.array for f in chunks]
            waveforms.extend(chunks)
            targets.extend([int(g[1])] * len(chunks))

        for waveform, target in zip(waveforms, targets):
            yield {
                'waveforms': waveform.tolist(),
                'targets': [target],
            }


# The name is rebound to the generator object consumed by the dataset writer.
generator = generator()

In [17]:
# Output directory for the LibriSpeech-based VAD shards.
DATA_DIR = os.path.expanduser('vad/data')
tf.gfile.MakeDirs(DATA_DIR)
# Remove files left by a previous run. Pure Python replaces
# `os.system('rm vad/data/*')`: the path is defined once, a failed delete
# raises, and nothing is attempted before the directory exists.
for stale_path in glob(os.path.join(DATA_DIR, '*')):
    if os.path.isfile(stale_path):
        os.remove(stale_path)

In [18]:
# Shard layout handed to the dataset writer: 95 'train' shards and 5 'dev'
# shards. The examples come from the `generator` object built above and the
# files go to DATA_DIR with the 'vad' prefix.
shards = [{'split': 'train', 'shards': 95}, {'split': 'dev', 'shards': 5}]
train.prepare_dataset(generator, DATA_DIR, shards, prefix = 'vad')





  0%|          | 0/5000 [00:00<?, ?it/s]


INFO:tensorflow:Generating case 0.


 14%|█▍        | 724/5000 [00:44<04:51, 14.68it/s]

INFO:tensorflow:Generating case 100000.


 29%|██▉       | 1450/5000 [01:29<04:26, 13.34it/s]

INFO:tensorflow:Generating case 200000.


 44%|████▍     | 2195/5000 [02:13<03:36, 12.95it/s]

INFO:tensorflow:Generating case 300000.


 59%|█████▉    | 2941/5000 [02:57<02:04, 16.47it/s]

INFO:tensorflow:Generating case 400000.


 73%|███████▎  | 3655/5000 [03:42<01:42, 13.15it/s]

INFO:tensorflow:Generating case 500000.


 88%|████████▊ | 4398/5000 [04:27<00:46, 13.06it/s]

INFO:tensorflow:Generating case 600000.


100%|██████████| 5000/5000 [05:04<00:00, 16.40it/s]


INFO:tensorflow:Generated 682651 Examples
INFO:tensorflow:Shuffling data...
Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`






INFO:tensorflow:Data shuffled.


In [19]:
# Index the AMI recordings: file name with every '.wav' removed -> file path.
# NOTE: `ami` is first a list of paths and is then rebound to that dict.
ami = glob('ami/amicorpus/*/*/*.wav')
ami = {os.path.split(f)[1].replace('.wav', ''): f for f in ami}
# RTTM annotation files found in the current working directory.
rttm = glob('*.rttm')
len(ami), len(rttm)

(171, 3)

In [20]:
def generator():
    """
    Yield labelled 50 ms voice-activity examples from the AMI recordings.

    For every RTTM file in `rttm`, up to 7 annotated recordings are drawn at
    random; those that are missing from `ami` are skipped. Each selected
    recording is cut into 50 ms frames, and a frame is labelled 1 when the
    annotation cropped to the frame's time span is non-empty, otherwise 0.

    Yields
    ------
    example: dict
        {'waveforms': List[float], 'targets': [0 or 1]}
    """
    for file in rttm:
        annotations = malaya_speech.extra.rttm.load(file)
        # `random.sample` needs a real sequence: sampling from a dict keys
        # view is deprecated since Python 3.9 and a TypeError from 3.11.
        # The sample size is capped so small files no longer raise ValueError.
        names = list(annotations.keys())
        for mix in random.sample(names, min(7, len(names))):
            if mix not in ami:
                continue
            print(mix)
            sample = annotations[mix]
            y, sr = malaya_speech.load(ami[mix])
            if random.random() > 0.6:
                # NOTE(review): `random_pitch` resamples the audio in time,
                # while the labels below still use the annotation's original
                # timestamps - confirm this misalignment is acceptable.
                y = random_pitch(y)
            frames = malaya_speech.generator.frames(y, 50, sr, False)
            for frame in tqdm(frames):
                cropped = sample.crop(
                    frame.timestamp, frame.timestamp + frame.duration
                )
                # NOTE(review): this reads the private `_labelNeedsUpdate`
                # attribute to test for emptiness; switch to a public
                # accessor if the annotation class offers one.
                label = 1 if len(cropped._labelNeedsUpdate) else 0
                yield {
                    'waveforms': frame.array.tolist(),
                    'targets': [label],
                }


# The name is rebound to the generator object consumed by the dataset writer.
generator = generator()

In [21]:
# Output directory for the AMI-based VAD shards.
DATA_DIR = os.path.expanduser('vad-ami/data')
tf.gfile.MakeDirs(DATA_DIR)
# Remove files left by a previous run. Pure Python replaces
# `os.system('rm vad-ami/data/*')`: the path is defined once, a failed delete
# raises, and nothing is attempted before the directory exists.
for stale_path in glob(os.path.join(DATA_DIR, '*')):
    if os.path.isfile(stale_path):
        os.remove(stale_path)

In [22]:
# Same shard layout as for the LibriSpeech data: 95 'train' shards and 5
# 'dev' shards. The examples come from the AMI `generator` object and the
# files go to DATA_DIR with the 'vad' prefix.
shards = [{'split': 'train', 'shards': 95}, {'split': 'dev', 'shards': 5}]
train.prepare_dataset(generator, DATA_DIR, shards, prefix = 'vad')

EN2002a.Mix-Headset


  0%|          | 0/42854 [00:00<?, ?it/s]

INFO:tensorflow:Generating case 0.


100%|██████████| 42854/42854 [01:35<00:00, 450.76it/s]


EN2002c.Mix-Headset


 96%|█████████▌| 57115/59445 [01:47<00:04, 538.33it/s]

INFO:tensorflow:Generating case 100000.


100%|██████████| 59445/59445 [01:52<00:00, 530.13it/s]


ES2004c.Mix-Headset


100%|██████████| 46687/46687 [01:10<00:00, 664.78it/s]


TS3003d.Mix-Headset


 97%|█████████▋| 50996/52364 [01:46<00:02, 481.55it/s]

INFO:tensorflow:Generating case 200000.


100%|██████████| 52364/52364 [01:49<00:00, 479.16it/s]


IS1009c.Mix-Headset


100%|██████████| 36416/36416 [00:38<00:00, 939.39it/s]


EN2002b.Mix-Headset


100%|██████████| 35736/35736 [00:56<00:00, 632.58it/s]


TS3007b.Mix-Headset


 39%|███▉      | 26445/67097 [00:39<01:00, 669.60it/s]

INFO:tensorflow:Generating case 300000.


100%|██████████| 67097/67097 [01:40<00:00, 668.56it/s]


IS1008b.Mix-Headset


100%|██████████| 35370/35370 [00:39<00:00, 887.36it/s]


ES2011d.Mix-Headset


 61%|██████    | 23999/39646 [00:38<00:25, 616.50it/s]

INFO:tensorflow:Generating case 400000.


100%|██████████| 39646/39646 [01:03<00:00, 620.67it/s]


TS3004a.Mix-Headset


100%|██████████| 26906/26906 [00:33<00:00, 806.65it/s]


IB4002.Mix-Headset


100%|██████████| 59400/59400 [02:02<00:00, 483.07it/s]


IB4003.Mix-Headset


100%|██████████| 40465/40465 [00:51<00:00, 787.31it/s]


TS3006c.Mix-Headset


 39%|███▊      | 19958/51684 [00:48<01:17, 408.18it/s]

INFO:tensorflow:Generating case 600000.


100%|██████████| 51684/51684 [02:07<00:00, 404.76it/s]


ES2010c.Mix-Headset


100%|██████████| 36755/36755 [00:48<00:00, 759.81it/s]


TS3008a.Mix-Headset


 10%|▉         | 2612/27044 [00:03<00:30, 802.75it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

100%|██████████| 49485/49485 [01:08<00:00, 718.49it/s]

INFO:tensorflow:Generated 930445 Examples
INFO:tensorflow:Shuffling data...





INFO:tensorflow:Data shuffled.


In [41]:
# Work on a copy: `ctc_featurizer_config` is a module-level object of
# malaya_speech, so editing it in place would change the library default for
# everything else that reads it during this session.
config = dict(malaya_speech.config.ctc_featurizer_config)
config['feature_type'] = 'mfcc'
config['num_feature_bins'] = 64
featurizer = malaya_speech.tf_featurization.STTFeaturizer(**config)
# Feature dimension of one frame. The name says "mels" although the features
# configured above are MFCCs.
n_mels = featurizer.num_feature_bins

In [42]:
def preprocess_inputs(example):
    """
    Add featurized inputs and their frame count to a parsed example.

    Parameters
    ----------
    example: dict
        Must contain 'waveforms'. The dict is updated in place.

    Returns
    -------
    example: dict
        The same dict with two extra keys: 'inputs', the featurizer output
        reshaped to (-1, n_mels), and 'inputs_length', a one-element int32
        tensor holding the size of the first dimension of 'inputs'.
    """
    features = tf.reshape(
        featurizer.vectorize(example['waveforms']), (-1, n_mels)
    )
    n_frames = tf.cast(tf.shape(features)[0], tf.int32)

    example['inputs'] = features
    example['inputs_length'] = tf.expand_dims(n_frames, 0)
    return example


def parse(serialized_example):
    """
    Decode one serialized example into model-ready tensors.

    Parameters
    ----------
    serialized_example: tf.Tensor
        A serialized record with variable-length 'waveforms' (float32) and
        'targets' (int64) features.

    Returns
    -------
    features: dict
        Only the keys 'inputs', 'inputs_length' and 'targets'.
    """
    schema = {
        'waveforms': tf.VarLenFeature(tf.float32),
        'targets': tf.VarLenFeature(tf.int64),
    }
    parsed = tf.parse_single_example(serialized_example, features=schema)

    # Each parsed feature exposes its data through `.values`; keep only that.
    dense = {name: feature.values for name, feature in parsed.items()}
    enriched = preprocess_inputs(dense)

    # The raw waveform is dropped once the features have been computed.
    wanted = ('inputs', 'inputs_length', 'targets')
    return {
        name: tensor for name, tensor in enriched.items() if name in wanted
    }


def get_dataset(files, batch_size=32, shuffle_size=32, num_cpu_threads=6,
                thread_count=24, is_training=True):
    """
    Build a zero-argument function that creates the batched input dataset.

    Parameters
    ----------
    files: List[str]
        TFRecord files to read.
    batch_size: int, optional (default=32)
        Batch size; in training it is also the interleave block length.
    shuffle_size: int, optional (default=32)
        Accepted but never read; the example shuffle buffer is fixed at 100.
    num_cpu_threads: int, optional (default=6)
        Upper bound for the interleave cycle length in training.
    thread_count: int, optional (default=24)
        Number of parallel calls used when mapping `parse`.
    is_training: bool, optional (default=True)
        Training shuffles file names and examples; otherwise the files are
        read in the given order. Both modes repeat indefinitely.

    Returns
    -------
    get: Callable[[], tf.data.Dataset]
    """

    def _training_records():
        # Shuffle at file level, read several files at once, then shuffle
        # the individual records.
        paths = tf.data.Dataset.from_tensor_slices(tf.constant(files))
        paths = paths.repeat().shuffle(buffer_size=len(files))
        records = paths.interleave(
            tf.data.TFRecordDataset,
            cycle_length=min(num_cpu_threads, len(files)),
            block_length=batch_size,
        )
        return records.shuffle(buffer_size=100)

    def _evaluation_records():
        return tf.data.TFRecordDataset(files).repeat()

    def get():
        records = _training_records() if is_training else _evaluation_records()
        examples = records.map(parse, num_parallel_calls=thread_count)
        # Pad every variable-sized dimension of a batch with zeros.
        return examples.padded_batch(
            batch_size,
            padded_shapes={
                'inputs': tf.TensorShape([None, n_mels]),
                'inputs_length': tf.TensorShape([None]),
                'targets': tf.TensorShape([None]),
            },
            padding_values={
                'inputs': tf.constant(0, dtype=tf.float32),
                'inputs_length': tf.constant(0, dtype=tf.int32),
                'targets': tf.constant(0, dtype=tf.int64),
            },
        )

    return get

In [43]:
# Gather the 'vad-train*' shards from both output directories (vad/data and
# vad-ami/data) and wrap them in a training input function.
train_files = tf.io.gfile.glob(
    'vad/data/vad-train*'
) + tf.io.gfile.glob('vad-ami/data/vad-train*')
train_dataset = get_dataset(train_files, is_training=True)

In [44]:
# Build the dataset and fetch the symbolic "next batch" from a one-shot
# iterator.
# NOTE: `dataset` is first the tf.data.Dataset and is then rebound to the
# dict of tensors returned by `get_next()`.
dataset = train_dataset()
dataset = dataset.make_one_shot_iterator().get_next()
dataset

{'targets': <tf.Tensor 'IteratorGetNext_2:2' shape=(?, ?) dtype=int64>,
 'inputs': <tf.Tensor 'IteratorGetNext_2:0' shape=(?, ?, 64) dtype=float32>,
 'inputs_length': <tf.Tensor 'IteratorGetNext_2:1' shape=(?, ?) dtype=int32>}

In [33]:
sess = tf.Session()

In [45]:
sess.run(dataset)

{'targets': array([[1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [0],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [0],
        [1],
        [0],
        [1],
        [1],
        [1],
        [1],
        [1],
        [0],
        [1],
        [0],
        [1],
        [1],
        [1],
        [0]]),
 'inputs': array([[[ 1.4091831 ,  0.99353606, -1.4133252 , ..., -1.2676312 ,
          -0.5081047 , -1.2723954 ],
         [-0.8078032 ,  0.37482122,  0.7500655 , ...,  1.1768011 ,
           1.3970188 ,  1.1707608 ],
         [-0.60137945, -1.3683548 ,  0.6632598 , ...,  0.09083005,
          -0.888914  ,  0.10163454]],
 
        [[ 1.2775162 ,  1.3479495 , -1.2348609 , ...,  0.49294904,
          -0.27757835, -0.05915939],
         [-0.11343477, -0.30347028,  0.02048903, ...,  0.9014592 ,
          -1.0621324 ,  1.2532525 ],
         [-1.1640811 , -1.0444797 ,  1