In [1]:
import parselmouth
import librosa
import pyworld as pw
from sklearn.preprocessing import StandardScaler
import numpy as np
import os

# An empty CUDA_VISIBLE_DEVICES hides every GPU from CUDA-aware libraries,
# so the preprocessing below runs on CPU only.
os.environ['CUDA_VISIBLE_DEVICES'] = ''
# TensorFlow option requesting on-demand GPU memory growth; kept for when the
# line above is changed to expose a GPU.
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

In [2]:
import yaml

# Feature-extraction settings (sampling rate, FFT size, trimming flags, ...)
# that later cells read through the `config` dict.
with open('config.yaml') as config_file:
    config = yaml.safe_load(config_file)

config

{'sampling_rate': 22050,
 'fft_size': 1024,
 'hop_size': 256,
 'win_length': None,
 'window': 'hann',
 'num_mels': 80,
 'fmin': 80,
 'fmax': 7600,
 'global_gain_scale': 1.0,
 'trim_silence': True,
 'trim_threshold_in_db': 60,
 'trim_frame_size': 2048,
 'trim_hop_size': 512}

In [3]:
import numpy as np

# https://github.com/TensorSpeech/TensorFlowTTS/blob/master/tensorflow_tts/utils/outliers.py
def is_outlier(x, p25, p75):
    """Tell whether ``x`` sits on or outside the 1.5 * IQR fences.

    Args:
        x: Scalar value to classify.
        p25: Lower percentile value (start of the inter-quartile range).
        p75: Upper percentile value (end of the inter-quartile range).

    Returns:
        Truthy when ``x <= p25 - 1.5 * (p75 - p25)`` or
        ``x >= p75 + 1.5 * (p75 - p25)``. The comparisons are inclusive, so
        with a zero-width range (``p25 == p75``) a value equal to the
        percentile is reported as an outlier.
    """
    iqr = p75 - p25
    fence_low, fence_high = p25 - 1.5 * iqr, p75 + 1.5 * iqr
    below = x <= fence_low
    return below or x >= fence_high


def remove_outlier(x, p_bottom: int = 25, p_top: int = 75):
    """Overwrite the outliers of ``x`` in place and return ``x``.

    Args:
        x: Indexable numeric array; it is modified in place.
        p_bottom: Percentile (0-100) used as the lower quartile.
        p_top: Percentile (0-100) used as the upper quartile.

    Returns:
        The same ``x`` object, with every element flagged by ``is_outlier``
        replaced by the maximum of the array taken after those elements
        were zeroed.
    """
    low_value = np.percentile(x, p_bottom)
    high_value = np.percentile(x, p_top)

    flagged = [
        position
        for position, sample in enumerate(x)
        if is_outlier(sample, low_value, high_value)
    ]

    # Zero the outliers first so they cannot win the max() taken on the next line.
    x[flagged] = 0.0
    x[flagged] = np.max(x)
    return x

In [4]:
import re

# Symbol inventory for text encoding. The three control tokens are multi-character
# strings, every other symbol is a single character.
_pad, _start, _eos = 'pad', 'start', 'eos'
_punctuation = "!'(),.:;? "
_special = '-'
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'

# Position in this list is the integer id of a symbol, so the order must not change.
MALAYA_SPEECH_SYMBOLS = [_pad, _start, _eos, *_special, *_punctuation, *_letters]

In [5]:
def tts_encode(string: str, add_eos: bool = True):
    """Map ``string`` to a list of indices into ``MALAYA_SPEECH_SYMBOLS``.

    Args:
        string: Text to encode, processed one character at a time.
        add_eos: When true, the index of the ``'eos'`` token is appended.

    Returns:
        List of integer ids. Characters that are not in
        ``MALAYA_SPEECH_SYMBOLS`` are dropped silently.
    """
    ids = []
    for char in string:
        if char in MALAYA_SPEECH_SYMBOLS:
            ids.append(MALAYA_SPEECH_SYMBOLS.index(char))
    if add_eos:
        ids.append(MALAYA_SPEECH_SYMBOLS.index('eos'))
    return ids

In [6]:
from glob import glob

# List the post-processed transcript manifests available under ../malaya.
files = glob('../malaya/postprocessing*.json')
files

['../malaya/postprocessing-edge-tts-news-yasmin.json',
 '../malaya/postprocessing-edge-tts-parliament.json',
 '../malaya/postprocessing-edge-tts-wiki-osman.json',
 '../malaya/postprocessing-edge-tts-news.json',
 '../malaya/postprocessing-edge-tts-parliament-yasmin.json']

In [7]:
import json

# Load the two transcript manifests used below: `text` (news) and `parliament`.
with open('../malaya/postprocessing-edge-tts-news.json') as news_file:
    text = json.load(news_file)

with open('../malaya/postprocessing-edge-tts-parliament.json') as parliament_file:
    parliament = json.load(parliament_file)

In [8]:
# !wget https://gist.githubusercontent.com/huseinzol05/98974ae8c6c7a65d4bc0af9f5003786a/raw/5aa5257608b61e8fcc828e99fbd070d5ca7358e3/mp.py

In [9]:
# Peek at the first manifest entry to confirm its [json path, sentence] layout.
text[0]

['osman-news-edge-tts-text/138.json',
 'Ketika perang Aceh meletus pada tahun seribu lapan ratus tujuh puluh tiga , Teuku Ibrahim Lamnga aktif berjuang di garisan depan .']

In [10]:
# Destructive: removes any previous output-osman tree so the run starts clean.
!rm -rf output-osman

In [11]:
# Create the output tree with os.makedirs rather than shelling out to `mkdir`:
# no shell string is built from variables, and re-running the cell is harmless
# when the folders already exist.
directory = 'output-osman'
directories = ['audios', 'mels', 'text_ids', 'f0s', 'energies', 'pitches']
os.makedirs(directory, exist_ok=True)
for d in directories:
    os.makedirs(os.path.join(directory, d), exist_ok=True)

In [12]:
# Build the work list: one (wav path, sentence, utterance id, output directory)
# tuple for every manifest entry whose wav file exists on disk.
txts = []
for t in text:
    json_path = t[0]
    index = os.path.basename(json_path).replace('.json', '')
    wav = json_path.replace('-text', '-wav').replace('.json', '.wav')
    if not os.path.exists(wav):
        continue
    txts.append((wav, t[1], index, directory))

In [13]:
# Check one work item: (wav path, sentence, utterance id, output directory).
txts[0]

('osman-news-edge-tts-wav/138.wav',
 'Ketika perang Aceh meletus pada tahun seribu lapan ratus tujuh puluh tiga , Teuku Ibrahim Lamnga aktif berjuang di garisan depan .',
 '138',
 'output-osman')

In [17]:
import malaya_speech
from malaya_speech import Pipeline
from tqdm import tqdm
# WebRTC voice-activity detector from malaya_speech; process() calls it once per frame.
vad = malaya_speech.vad.webrtc()

def _trim_silence(audio, start_silent_trail, middle_silent_trail,
                  end_silent_trail, process_middle_silent):
    """Shorten the non-voiced stretches of ``audio`` using the module-level ``vad``.

    The detector runs on a 16 kHz integer copy of the signal, while the samples
    that are kept come from the frames of the original-rate signal at the same
    frame index.

    Returns:
        The trimmed waveform, or ``None`` when there is nothing to concatenate.
    """
    y_ = malaya_speech.resample(audio, config['sampling_rate'], 16000)
    y_ = malaya_speech.astype.float_to_int(y_)
    # 30 is the frame length handed to the generator (presumably milliseconds -- TODO confirm).
    frames = list(malaya_speech.generator.frames(audio, 30, config['sampling_rate']))
    frames_ = list(malaya_speech.generator.frames(y_, 30, 16000, append_ending_trail = False))
    frames_webrtc = [(frames[no], vad(frame)) for no, frame in enumerate(frames_)]
    if not frames_webrtc:
        return None
    grouped_deep = malaya_speech.group.group_frames(frames_webrtc)
    grouped_deep = malaya_speech.group.group_frames_threshold(grouped_deep, 0.15)

    pieces = []
    last = len(grouped_deep) - 1
    for no, g in enumerate(grouped_deep):
        samples = g[0].array
        if not g[1]:
            # Group not flagged by the VAD: keep only a short trail of it.
            if no == 0:
                samples = samples[-start_silent_trail:]
            elif no == last:
                samples = samples[:end_silent_trail]
            elif process_middle_silent:
                samples = np.concatenate(
                    [samples[:middle_silent_trail], samples[-middle_silent_trail:]]
                )
        pieces.append(samples)

    if not pieces:
        return None
    return np.concatenate(pieces)


def process(txts, 
            start_silent_trail = int(0.15 * config['sampling_rate']),
            middle_silent_trail = int(0.12 * config['sampling_rate']),
            end_silent_trail = int(0.1 * config['sampling_rate']),
            process_middle_silent = True,
            maxlen = 25):
    """Load, trim and save the audio and encoded text of a batch of utterances.

    Args:
        txts: Sequence whose first element is the list of work items; each
            item is ``(wav path, sentence, utterance id, output directory)``.
            Any further elements are ignored.
        start_silent_trail: Samples dropped from the start of every clip, and
            the number of samples kept from a leading non-voiced group.
        middle_silent_trail: Samples kept at each end of an inner non-voiced group.
        end_silent_trail: Samples kept from a trailing non-voiced group.
        process_middle_silent: When false, inner non-voiced groups are kept whole.
        maxlen: Clips longer than this many seconds (after trimming) are skipped.

    Returns:
        ``[[audios, text_ids]]`` where ``audios`` holds the padded waveforms and
        ``text_ids`` holds ``[sentence, encoded ids]`` pairs, in processing order.

    Side effects:
        Writes ``{directory}/audios/{index}.npy`` and
        ``{directory}/text_ids/{index}.npy`` for every clip that is kept, and
        prints one line for every clip that is skipped.
    """
    txts = txts[0]
    audios, text_ids = [], []

    for f in txts:
        path, text, index, directory = f[0], f[1], f[2], f[3]

        try:
            audio, _ = malaya_speech.load(path, sr = config['sampling_rate'])
        except Exception as e:
            # Best effort: one unreadable file must not abort the batch, but the
            # failure is reported, and KeyboardInterrupt is no longer swallowed.
            print(f'skipped, failed to load {path}: {e}')
            continue
        audio = audio[start_silent_trail:]

        if config['trim_silence']:
            audio = _trim_silence(
                audio,
                start_silent_trail,
                middle_silent_trail,
                end_silent_trail,
                process_middle_silent,
            )

        # An empty clip would make np.pad(mode="edge") raise and kill the worker.
        if audio is None or len(audio) == 0:
            print(f'skipped, no audio left after trimming: {path}')
            continue

        if (len(audio) / config['sampling_rate']) > maxlen:
            print('skipped, audio too long')
            continue

        audio = np.pad(audio, (0, config["fft_size"]), mode="edge")

        text = [text, tts_encode(text)]
        # [str, list] is a ragged sequence: recent NumPy refuses to convert it
        # implicitly, so build the 2-element object array explicitly. The saved
        # layout (index 0 = sentence, index 1 = ids) is unchanged.
        record = np.empty(2, dtype = object)
        record[0] = text[0]
        record[1] = text[1]

        np.save(f'{directory}/audios/{index}.npy', audio)
        np.save(f'{directory}/text_ids/{index}.npy', record)

        audios.append(audio)
        text_ids.append(text)
    
    return [[audios, text_ids]]

In [18]:
import matplotlib.pyplot as plt
import IPython.display as ipd

In [19]:
%%time

# Smoke-test process() on ten utterances. process() only reads element 0 of
# its argument, so the trailing 0 is a placeholder.
i = 1608
r = process((txts[i: i + 10], 0))[0]

CPU times: user 167 ms, sys: 10.7 ms, total: 178 ms
Wall time: 186 ms


  return array(a, dtype, copy=False, order=order, subok=True)


In [20]:
# List each processed sentence next to its position in the batch.
for n, entry in enumerate(r[1]):
    print(n, entry[0])

0 Menurut Roman purba , Afrika tersebar di barat Egypt , manakala " Asia " yang dahulunya dirujuk sebagai Anatolia tersebar ke arah timur .
1 Bangunan yang digunakan ialah Blok ( Sekarang Kelas Penyayang ) .
2 Beberapa sesepuh di tempat itu berusaha menjelaskan setiap bangunan kuno yang ada di tempatnya .
3 Sekolah Kebangsaan Raja Bahar atau nama ringkasnya SK Raja Bahar , merupakan sebuah Sekolah kebangsaan yang terletak di Kota Jembal .
4 Usaha Pada seribu sembilan ratus sembilan puluh lapan , Jemiam melancarkan Jemima Kan Ltd sebuah syarikat membuat pakaian yang diusahakan oleh rakyat miskin Pakistan .
5 Mencari rekod fotografi , sebelas lagi nova yang ditemui .
6 Hasilnya dua lagi gol berjaya dijaringkan oleh pelawat , " katanya .
7 Maka prejudis juga satu hak bukan ?.
8 Kali ini , ia diterima oleh Lembaga Adat - istiadat .
9 Kerana kecantikannya , ia dikenal sebagai " Bunga Roos ( Mawar ) dari Cikembang " .


In [21]:
# Play back one processed clip. The rate comes from config (the rate process()
# loads audio at) instead of a hard-coded 22050, so the two cannot drift apart.
k = 9
ipd.Audio(r[0][k], rate = config['sampling_rate'])

In [24]:
import mp

# Hand the work list to mp.multiprocessing in slices of 1000 items; slicing
# past the end of a list is clamped, so the last slice is simply shorter.
for i in tqdm(range(0, len(txts), 1000)):
    mp.multiprocessing(txts[i: i + 1000], process, cores = 5, returned = False)

  0%|                                                    | 0/49 [00:00<?, ?it/s]

skipped, audio too long


  2%|▉                                           | 1/49 [00:06<04:55,  6.16s/it]

skipped, audio too long


  4%|█▊                                          | 2/49 [00:12<04:50,  6.18s/it]

skipped, audio too long


 16%|███████▏                                    | 8/49 [00:49<04:14,  6.20s/it]

skipped, audio too long


 18%|████████                                    | 9/49 [00:55<04:12,  6.32s/it]

skipped, audio too long


 24%|██████████▌                                | 12/49 [01:14<03:48,  6.19s/it]

skipped, audio too long


 59%|█████████████████████████▍                 | 29/49 [03:24<02:46,  8.34s/it]

skipped, audio too long


100%|███████████████████████████████████████████| 49/49 [05:46<00:00,  7.07s/it]


In [25]:
# Report the total on-disk size of the generated output-osman tree.
!du -hs output-osman

48G	output-osman


In [26]:
# Create the output tree with os.makedirs rather than shelling out to `mkdir`:
# no shell string is built from variables, and re-running the cell is harmless
# when the folders already exist.
directory = 'output-osman-parliament'
directories = ['audios', 'mels', 'text_ids', 'f0s', 'energies', 'pitches']
os.makedirs(directory, exist_ok=True)
for d in directories:
    os.makedirs(os.path.join(directory, d), exist_ok=True)

In [29]:
# Build the work list for the parliament manifest: one
# (wav path, sentence, utterance id, output directory) tuple for every entry
# whose wav file exists on disk.
txts = []
for t in parliament:
    json_path = t[0]
    index = os.path.basename(json_path).replace('.json', '')
    wav = json_path.replace('-text', '-wav').replace('.json', '.wav')
    if not os.path.exists(wav):
        continue
    txts.append((wav, t[1], index, directory))
len(txts)

57130

In [30]:
# Smoke-test process() on ten parliament utterances. process() only reads
# element 0 of its argument, so the trailing 0 is a placeholder.
i = 80
r = process((txts[i: i + 10], 0))[0]

In [32]:
# List each processed sentence next to its position in the batch.
for n, entry in enumerate(r[1]):
    print(n, entry[0])

0 Bagi Himpunan Rakyat Bersatu puia , pihak penganjur telah mengemukakan notis penganjuran kepada pihak polis dengan objektif penganjuran seperti berikut : satu .
1 Buat masa ini , pihak DBKL tidak mempunyai perancangan untuk mengambil alih tanah tersebut untuk dimajukan kerana tanah tersebut bukan milik .
2 Pihak Kerajaan sesungguhnya komited dalam memastikan mekanisme penyampaian bantuan PBR yang telus dan adil agar bantuan dapat dinikmati oleh golongan sasar .
3 Contohnya bagi pegawai perakaunan , permohonan tidak terbuka hanya kepada mereka yang mempunyai ijazah dalam bidang perakaunan .
4 lni akan menjejas peluang pekerjaan dan seterusnya ekonomi negara .
5 masih belum mencapai tempoh untuk diserahsimpan ke SSM .
6 Zon kawasan larangan merokok .
7 Walau bagaimanapun , jika pelaburan syarikat MKD melibatkan penubuhan anak syarikat , cadangan pelaburan ini perlu dikemukakan untuk pertimbangan MOF terlebih dahulu .
8 Bantuan Kumpulan Wang Amanah Pelajar .
9 lapan hingga sepuluh sepul

In [34]:
# Play back one processed clip. The rate comes from config (the rate process()
# loads audio at) instead of a hard-coded 22050, so the two cannot drift apart.
k = 1
ipd.Audio(r[0][k], rate = config['sampling_rate'])

In [35]:
import mp

In [37]:
# Hand the work list to mp.multiprocessing in slices of 1000 items; slicing
# past the end of a list is clamped, so the last slice is simply shorter.
for i in tqdm(range(0, len(txts), 1000)):
    mp.multiprocessing(txts[i: i + 1000], process, cores = 6, returned = False)

  3%|█▌                                          | 2/58 [00:12<06:03,  6.49s/it]

skipped, audio too long


  7%|███                                         | 4/58 [00:26<06:00,  6.68s/it]

skipped, audio too long
skipped, audio too long
skipped, audio too long
skipped, audio too long


  9%|███▊                                        | 5/58 [00:35<06:46,  7.67s/it]

skipped, audio too long


 12%|█████▎                                      | 7/58 [00:50<06:17,  7.39s/it]

skipped, audio too long


 16%|██████▊                                     | 9/58 [01:07<06:31,  7.99s/it]

skipped, audio too long


 21%|████████▉                                  | 12/58 [01:31<06:04,  7.93s/it]

skipped, audio too long


 22%|█████████▋                                 | 13/58 [01:37<05:40,  7.57s/it]

skipped, audio too long


 31%|█████████████▎                             | 18/58 [02:18<05:29,  8.23s/it]

skipped, audio too long


 34%|██████████████▊                            | 20/58 [02:33<05:01,  7.93s/it]

skipped, audio too long


 38%|████████████████▎                          | 22/58 [02:55<05:37,  9.38s/it]

skipped, audio too long
skipped, audio too long


 41%|█████████████████▊                         | 24/58 [03:11<05:04,  8.97s/it]

skipped, audio too long


 50%|█████████████████████▌                     | 29/58 [03:57<04:12,  8.70s/it]

skipped, audio too long


 62%|██████████████████████████▋                | 36/58 [05:05<03:18,  9.03s/it]

skipped, audio too long


 67%|████████████████████████████▉              | 39/58 [05:34<02:53,  9.11s/it]

skipped, audio too long


 69%|█████████████████████████████▋             | 40/58 [05:44<02:48,  9.37s/it]

skipped, audio too long


 71%|██████████████████████████████▍            | 41/58 [05:56<02:53, 10.21s/it]

skipped, audio too long


 72%|███████████████████████████████▏           | 42/58 [06:04<02:33,  9.60s/it]

skipped, audio too long


 83%|███████████████████████████████████▌       | 48/58 [07:01<01:30,  9.03s/it]

skipped, audio too long


 86%|█████████████████████████████████████      | 50/58 [07:29<01:32, 11.60s/it]

skipped, audio too long


 90%|██████████████████████████████████████▌    | 52/58 [07:53<01:10, 11.67s/it]

skipped, audio too long


100%|███████████████████████████████████████████| 58/58 [09:08<00:00,  9.45s/it]


In [38]:
# Report the total on-disk size of the generated output-osman-parliament tree.
!du -hs output-osman-parliament

67G	output-osman-parliament


In [None]:
# Gather the saved waveforms of all three preprocessing runs into one sorted list.
# NOTE(review): these are absolute, machine-specific paths.
audio_patterns = (
    '/home/husein/speech-bahasa/output-osman/audios/*.npy',
    '/home/husein/speech-bahasa/output-osman-parliament/audios/*.npy',
    '/home/husein/speech-bahasa/output-osman-synthetic/audios/*.npy',
)

# files = glob('/home/husein/speech-bahasa/output-osman-synthetic/audios/*.npy')
files = sorted(path for pattern in audio_patterns for path in glob(pattern))
len(files)

In [None]:
# Absolute path of the folder that receives the exported wav files.
directory = os.path.join(os.getcwd(), 'osman-audio')
directory

In [None]:
# Destructive: recreate the export folder from scratch. {directory} is
# interpolated from the Python variable of the same name.
!rm -r {directory}
!mkdir {directory}

In [None]:
import json
import malaya
import re
from unidecode import unidecode

# malaya normalizer built with date=False, time=False, money=True.
# NOTE(review): no function in this cell references it -- confirm it is still needed.
normalizer = malaya.normalize.normalizer(date = False, time = False, money = True)

def put_spacing_num(string):
    """Surround every run of ASCII letters with spaces, then tidy the spacing.

    Letters end up separated from adjacent digits and punctuation, e.g.
    ``'RM10'`` becomes ``'RM 10'``. Runs of spaces are squeezed to one and the
    result is stripped at both ends.
    """
    padded = re.sub('[A-Za-z]+', lambda match: f' {match[0]} ', string)
    squeezed = re.sub(r'[ ]+', ' ', padded)
    return squeezed.strip()

def convert_to_ascii(string):
    """Return ``string`` passed through ``unidecode`` (ASCII transliteration)."""
    return unidecode(string)

def collapse_whitespace(string):
    """Replace every run of whitespace in ``string`` with a single space.

    The pattern is inlined because no ``_whitespace_re`` constant is defined
    in this notebook, so referencing one raises NameError on the first call.
    NOTE(review): ``\\s+`` is the conventional definition of that constant in
    TTS text cleaners -- confirm it is the intended pattern.
    """
    return re.sub(r'\s+', ' ', string)

def cleaning(string, normalize = True, add_eos = False):
    """Normalise a sentence's characters, spacing and final punctuation.

    Steps: transliterate with ``convert_to_ascii``, squeeze spaces, drop one
    trailing ``-`` or ``,``, make sure the sentence ends with ``.``, then
    separate letter runs from neighbouring characters with ``put_spacing_num``.

    Args:
        string: Sentence to clean.
        normalize: Unused; kept so existing callers keep working.
        add_eos: Unused; kept so existing callers keep working.

    Returns:
        The cleaned sentence. Input that is empty once the trailing ``-``/``,``
        is removed is returned as an empty string instead of raising IndexError.
    """
    string = convert_to_ascii(string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    if string.endswith(('-', ',')):
        string = string[:-1]
    if not string:
        # Nothing left to punctuate; indexing string[-1] here would raise.
        return string
    if not string.endswith('.'):
        string = string + '.'
    string = put_spacing_num(string)
    return re.sub(r'[ ]+', ' ', string).strip()

In [None]:
from tqdm import tqdm

# `sf.write` below needs soundfile, which no cell of this notebook imports;
# without this line the loop raises NameError on its first iteration.
import soundfile as sf

# Export every saved waveform as a wav file and collect (wav path, sentence) pairs.
osman = []
for f in tqdm(files):
    # The text_ids files are pickled object arrays (index 0 = sentence), hence
    # allow_pickle=True. Only load files produced by this pipeline: unpickling
    # untrusted data can execute arbitrary code.
    text_ids = np.load(f.replace('audios', 'text_ids'), allow_pickle=True)[
        0
    ]
    if 'output-osman-synthetic' in f:
        text_ids = cleaning(text_ids)
    text_ids = text_ids.replace('ju ta', 'juta')
    
    # Flatten the source path into a single file name so clips from different
    # runs cannot collide inside the export folder.
    filename = f.replace('/', '-').replace('.npy', '.wav')
    audio = np.load(f)
    left = os.path.join(directory, filename)
    sf.write(left, audio, 22050)
    
    osman.append((left, text_ids))

In [None]:
import json

# Persist the (wav path, sentence) pairs collected in `osman` as JSON.
with open('osman-vits.json', 'w') as manifest_file:
    json.dump(osman, manifest_file)