In [1]:
import tensorflow as tf
import numpy as np
import librosa
import soundfile as sf
from pydub import AudioSegment
from glob import glob
import random



In [2]:
def int_to_float(array, type = np.float32):
    """
    Convert an integer PCM array into a peak-normalised float array.

    Parameters
    ----------
    array: np.array
        Input samples. Arrays that are already float16/32/64 (or already of
        dtype ``type``) are returned untouched.
    type: np.float32
        Target floating dtype for integer input.

    Returns
    -------
    result : np.array
        Integer input is cast to ``type`` and divided by its absolute peak so
        values lie in [-1, 1]. Empty or all-zero input is only cast, never divided.
    """

    if array.dtype == type:
        return array

    if array.dtype in [np.float16, np.float32, np.float64]:
        return array

    # Cast first: np.abs on int16 wraps for -32768, which understates the peak.
    converted = array.astype(type)
    if converted.size == 0:
        return converted

    peak = np.max(np.abs(converted))
    if peak == 0:
        # Pure silence; dividing would produce NaN (or raise once warnings are errors).
        return converted

    return converted / peak

In [3]:
from scipy import interpolate

def change_samplerate(data, old_samplerate, new_samplerate):
    """
    Resample audio by linear interpolation along the first axis.

    Parameters
    ----------
    data: np.array
        Samples, indexed by time on axis 0.
    old_samplerate: int
        Sample rate of ``data``.
    new_samplerate: int
        Desired sample rate.

    Returns
    -------
    result : np.array
        ``int(len(data) * new_samplerate / old_samplerate)`` samples covering
        the same time span as the input.
    """
    n_old = data.shape[0]
    n_new = int(n_old * new_samplerate / old_samplerate)
    seconds = n_old / old_samplerate

    # interp1d interpolates along the last axis, hence the transposes.
    resample = interpolate.interp1d(np.linspace(0, seconds, n_old), data.T)
    return resample(np.linspace(0, seconds, n_new)).T


def read_flac(file, sample_rate = 16000):
    """
    Load an audio file with soundfile, keep the first channel and bring it
    to ``sample_rate``.

    Returns
    -------
    result : (np.array, int)
        The samples and ``sample_rate``.
    """
    samples, native_rate = sf.read(file)
    # Two-dimensional output means several channels; only column 0 is kept.
    if samples.ndim == 2:
        samples = samples[:, 0]
    if native_rate == sample_rate:
        return samples, sample_rate
    return change_samplerate(samples, native_rate, sample_rate), sample_rate


def read_wav(file, sample_rate = 16000):
    """
    Load an audio file through ``librosa.load`` at ``sample_rate``.

    Returns
    -------
    result : tuple
        The ``(samples, sample_rate)`` pair produced by librosa.
    """
    samples, rate = librosa.load(file, sr = sample_rate)
    return (samples, rate)

def read_mp3(file, sample_rate = 16000):
    """
    Decode an MP3 with pydub, convert it to mono at ``sample_rate`` and
    return the samples as floats.

    Returns
    -------
    result : (np.array, int)
        Output of ``int_to_float`` on the decoded samples, and ``sample_rate``.
    """
    segment = AudioSegment.from_mp3(file)
    mono = segment.set_frame_rate(sample_rate).set_channels(1)
    pcm = np.array(mono.get_array_of_samples())
    return int_to_float(pcm), sample_rate

def read_file(file):
    """
    Read an audio file, choosing the reader from the file extension.

    Parameters
    ----------
    file: str
        Path ending in ``.flac``, ``.wav`` or ``.mp3`` (case-insensitive).

    Returns
    -------
    result : (np.array, int)
        Samples and sample rate as returned by the matching reader.

    Raises
    ------
    ValueError
        If the extension is not one of the supported formats.
    """
    import os

    # Dispatch on the real extension: a substring test would misfire on paths
    # such as 'dir.mp3/clip.wav' and pick the wrong decoder.
    extension = os.path.splitext(file)[1].lower()
    if extension == '.flac':
        return read_flac(file)
    if extension == '.wav':
        return read_wav(file)
    if extension == '.mp3':
        return read_mp3(file)
    raise ValueError(f'unsupported audio format: {file!r}')

In [4]:
def sampling(combined, frame_duration_ms = 700, sample_rate = 16000):
    """
    Yield consecutive, non-overlapping slices of ``combined``.

    Parameters
    ----------
    combined: sequence
        Samples to cut; anything supporting ``len`` and slicing.
    frame_duration_ms: int
        Length of each slice in milliseconds.
    sample_rate: int
        Samples per second, used to turn the duration into a sample count.

    Yields
    ------
    Slices of ``int(sample_rate * frame_duration_ms / 1000)`` samples; the
    final slice holds whatever is left and may be shorter.

    Raises
    ------
    ValueError
        If the computed slice length is not positive.
    """
    n = int(sample_rate * (frame_duration_ms / 1000.0))
    if n <= 0:
        # A zero or negative step would never advance through the input.
        raise ValueError(
            f'frame length must be positive, got {n} samples '
            f'({frame_duration_ms} ms at {sample_rate} Hz)'
        )
    for offset in range(0, len(combined), n):
        yield combined[offset : offset + n]

In [5]:
# Class names. The position of a name in this list is the integer target
# stored with each example (see `labels.index(...)` in `loop`), so the order
# must not change once records have been written.
labels = [
    'english',
    'indonesian',
    'malay',
    'mandarin',
    'manglish',
    'others',
    'not a language',
]

In [6]:
# Number of files matching the local English clean-wav pattern.
len(glob('english/clean-wav/*.wav'))

936

In [7]:
# English = up to 1000 randomly chosen LibriSpeech FLAC files plus every local
# clean WAV. The sample size is capped at what is on disk so a partial
# LibriSpeech download does not make random.sample raise.
librispeech_files = glob('LibriSpeech/*/*/*/*.flac')
english = random.sample(librispeech_files, min(len(librispeech_files), 1000)) \
+ glob('english/clean-wav/*.wav')
english = [(m, 'english') for m in english]
len(english)

1936

In [8]:
# Number of files matching the local Indonesian clean-wav pattern.
len(glob('indon/clean-wav/*.wav'))

366

In [9]:
# Indonesian = every local clean WAV plus up to 1000 randomly chosen Common
# Voice `id` clips. The sample size is capped at the number of clips found so
# a missing or partial corpus does not make random.sample raise.
cv_id_clips = glob('speech/cv-corpus-5.1-2020-06-22/id/clips/*.mp3')
indon = glob('indon/clean-wav/*.wav') + random.sample(cv_id_clips, min(len(cv_id_clips), 1000))
indon = [(m, 'indonesian') for m in indon]
len(indon)

1366

In [10]:
# Number of files matching the local Malay clean-wav pattern.
len(glob('malay/clean-wav/*.wav'))

759

In [11]:
# Every local Malay clean WAV, paired with its class label.
malay = [(path, 'malay') for path in glob('malay/clean-wav/*.wav')]
len(malay)

759

In [12]:
# Number of files matching the local Mandarin clean-wav pattern.
len(glob('mandarin/clean-wav/*.wav'))

304

In [13]:
# Mandarin = every local clean WAV plus up to 500 randomly chosen clips from
# each of the Common Voice zh-CN, zh-HK and zh-TW sets. Each sample size is
# capped at the number of clips found so random.sample cannot raise.
mandarin = glob('mandarin/clean-wav/*.wav')
for zh_locale in ('zh-CN', 'zh-HK', 'zh-TW'):
    zh_clips = glob(f'speech/cv-corpus-5.1-2020-06-22/{zh_locale}/clips/*.mp3')
    mandarin += random.sample(zh_clips, min(len(zh_clips), 500))
mandarin = [(m, 'mandarin') for m in mandarin]
len(mandarin)

1804

In [14]:
# Every local Manglish clean WAV, paired with its class label.
manglish = [(path, 'manglish') for path in glob('manglish/clean-wav/*.wav')]
len(manglish)

1188

In [15]:
# Common Voice locale code -> language name.
lang_names = {
    'en': 'English',
    'de': 'German',
    'fr': 'French',
    'cy': 'Welsh',
    'br': 'Breton',
    'cv': 'Chuvash',
    'tr': 'Turkish',
    'tt': 'Tatar',
    'ky': 'Kyrgyz',
    'ga-IE': 'Irish',
    'kab': 'Kabyle',
    'ca': 'Catalan',
    'zh-TW': 'Chinese (Taiwan)',
    'sl': 'Slovenian',
    'it': 'Italian',
    'nl': 'Dutch',
    'cnh': 'Hakha Chin',
    'eo': 'Esperanto',
    'et': 'Estonian',
    'fa': 'Persian',
    'eu': 'Basque',
    'es': 'Spanish',
    'zh-CN': 'Chinese (China)',
    'mn': 'Mongolian',
    'sah': 'Sakha',
    'dv': 'Dhivehi',
    'rw': 'Kinyarwanda',
    'sv-SE': 'Swedish',
    'ru': 'Russian',
    'id': 'Indonesian',
    'ar': 'Arabic',
    'ta': 'Tamil',
    'ia': 'Interlingua',
    'pt': 'Portuguese',
    'lv': 'Latvian',
    'ja': 'Japanese',
    'vot': 'Votic',
    'ab': 'Abkhaz',
    'zh-HK': 'Chinese (Hong Kong)',
    'rm-sursilv': 'Romansh Sursilvan',
    'hsb': 'Sorbian, Upper',
    'ro': 'Romanian',
    'fy-NL': 'Frisian',
    'cs': 'Czech',
    'el': 'Greek',
    'rm-vallader': 'Romansh Vallader',
    'pl': 'Polish',
    'as': 'Assamese',
    'uk': 'Ukrainian',
    'mt': 'Maltese',
    'ka': 'Georgian',
    'pa-IN': 'Punjabi',
    'or': 'Odia',
    'vi': 'Vietnamese',
}
# Locales that already have their own class (english, mandarin, indonesian).
not_in = ['en', 'zh-TW', 'zh-CN', 'zh-HK', 'id']
# `lang` is the list of locale codes that feed the 'others' class. It is built
# in dictionary order rather than from a set difference, because set order
# depends on string hashing and changes between interpreter runs.
lang = [code for code in lang_names if code not in not_in]

In [16]:
from tqdm import tqdm

# 'others' = at most 1000 randomly chosen clips from each remaining Common
# Voice locale, each paired with the 'others' label.
others = []
for locale_code in tqdm(lang):
    locale_clips = glob(f'speech/cv-corpus-5.1-2020-06-22/{locale_code}/clips/*.mp3')
    picked = random.sample(locale_clips, min(len(locale_clips), 1000))
    others.extend((clip, 'others') for clip in picked)

100%|██████████| 49/49 [01:21<00:00,  1.67s/it]


In [17]:
# Total number of (path, 'others') entries collected across all locales.
len(others)

45951

In [18]:
# Non-speech class: local not-music clips plus every WAV found recursively
# under musan/music and musan/noise.
not_music = [
    (path, 'not a language')
    for path in (
        glob('not-music/clean-wav/*.wav')
        + glob('musan/music/**/*.wav', recursive = True)
        + glob('musan/noise/**/*.wav', recursive = True)
    )
]
not_music[:10]

[('not-music/clean-wav/Relaxing-Background-Music_-relaxdaily-B-Sides-N°1-qycqF1CWcXg-part-006.wav',
  'not a language'),
 ('not-music/clean-wav/Positive-Tropical-Vibes_-Happy-Music-Beats-to-Relax,-Work,-Study-o22uGQ-efQg-part-006.wav',
  'not a language'),
 ('not-music/clean-wav/The-Best-Music-Collection-For-Studying🎵-(Concentration!),-Relaxing-Music,-composed-by-Tido-Kang-★1-nn-0rd2fDsU-part-008.wav',
  'not a language'),
 ('not-music/clean-wav/Dark-Music_-Lucifers-Hymn-_-Choir-gfG9aJzFPd4.wav',
  'not a language'),
 ('not-music/clean-wav/Peaceful-Piano-&-Soft-Rain_-Relaxing-Sleep-Music,-A-Bitter-Rain-hj83cwfOF3Y-part-012.wav',
  'not a language'),
 ('not-music/clean-wav/Upbeat-Music_-Happy-Music-Beats-to-Relax,-Work,-Study-p1IChPfD2-s-part-010.wav',
  'not a language'),
 ('not-music/clean-wav/Upbeat-Music_-Happy-Music-Beats-to-Relax,-Work,-Study-p1IChPfD2-s-part-002.wav',
  'not a language'),
 ('not-music/clean-wav/Relaxing-Background-Music_-relaxdaily-B-Sides-N°1-qycqF1CWcXg-part-01

In [19]:
# Merge every class into one list of (path, label) pairs and shuffle it in
# place, so consecutive slices contain a mix of classes.
combined_all = [
    *english,
    *indon,
    *malay,
    *mandarin,
    *manglish,
    *others,
    *not_music,
]
random.shuffle(combined_all)
len(combined_all)

55030

In [20]:
import os

# Print every entry whose file is larger than 50 MB (size in units of 1e6 bytes).
for entry in combined_all:
    size_mb = os.path.getsize(entry[0]) / 1e6
    if size_mb > 50:
        print(entry, size_mb)

In [21]:
# Spot check: integer class id of the label on the last shuffled entry.
labels.index(combined_all[-1][1])

4

In [22]:
# y, sr = read_file(combined_all[0][0])

In [23]:
# y, sr, combined_all[0][1]

In [24]:
import os
import tensorflow as tf

# Output directory for the TFRecord shards. It is created first so the cleanup
# also works on a fresh checkout; leftovers from a previous run are then
# removed in Python instead of shelling out to `rm`.
DATA_DIR = os.path.expanduser('language-detection/data')
tf.gfile.MakeDirs(DATA_DIR)
for stale_path in glob(os.path.join(DATA_DIR, '*')):
    # Same reach as `rm dir/*`: files and symlinks go, sub-directories stay.
    if os.path.islink(stale_path) or not os.path.isdir(stale_path):
        os.remove(stale_path)

In [25]:
import malaya_speech

# Voice-activity detector built by malaya_speech.vad.webrtc(); `loop` calls it
# on each frame and keeps only the frames for which it returns a truthy value.
vad = malaya_speech.vad.webrtc()

In [26]:
from tqdm import tqdm
from malaya_speech.train import prepare_data
from collections import defaultdict
import warnings
# Turn every warning into an exception for the rest of the session. Inside
# `loop` these are caught by its `except Exception` handlers, so a segment or
# file that triggers a warning is skipped rather than written.
warnings.filterwarnings('error')

def loop(files, dupe_factor = 2):
    """
    Write one TFRecord shard of audio segments and count examples per label.

    Parameters
    ----------
    files: tuple
        ``(entries, no)``: ``entries`` is a list of ``(path, label)`` pairs and
        ``no`` is the shard id used in the output file name.
        NOTE(review): presumably built by ``mp.multiprocessing`` - confirm.
    dupe_factor: int
        Number of passes over each file. Every pass cuts the audio again with
        a fresh random segment length between 500 and 2000 ms.

    Returns
    -------
    result : list
        One-element list holding a defaultdict mapping label to the number of
        examples written for it.
    """
    files, no = files
    fname = f'{DATA_DIR}/part-{no}.tfrecords'
    writer = tf.python_io.TFRecordWriter(fname)
    counts = defaultdict(int)
    skipped_files = 0
    skipped_segments = 0
    try:
        for file in tqdm(files):
            try:
                path, label = file[0], file[1]
                wav = read_file(path)[0]
                for _ in range(dupe_factor):
                    for s in sampling(wav, random.randint(500, 2000)):
                        try:
                            if label != 'not a language':
                                # Speech classes: keep only the frames the VAD accepts.
                                n = malaya_speech.utils.astype.float_to_int(s)
                                frames = malaya_speech.utils.generator.frames(n, 30, 16000, append_ending_trail=False)
                                frames = [f.array for f in frames if vad(f)]
                                if not frames:
                                    # Nothing voiced in this segment; np.concatenate([]) would raise.
                                    continue
                                n = malaya_speech.utils.astype.int_to_float(np.concatenate(frames))
                            else:
                                n = s
                            if len(n) > 50:
                                example = prepare_data.to_example({'inputs': n.tolist(),
                                                                   'targets': [labels.index(label)]})
                                writer.write(example.SerializeToString())
                                counts[label] += 1
                        except Exception:
                            # Best effort: drop the segment, but remember that it happened.
                            skipped_segments += 1
            except Exception:
                # Unreadable or malformed entry: drop the whole file.
                skipped_files += 1
    finally:
        # Close the shard even if the worker is interrupted.
        writer.close()

    if skipped_files or skipped_segments:
        print(f'part-{no}: skipped {skipped_files} files and {skipped_segments} segments after errors')
    return [counts]

In [27]:
import mp
# Run `loop` over `combined_all` with cores = 10 through the `mp` helper module.
# NOTE(review): assumes mp.multiprocessing hands each worker a (chunk, index)
# pair and concatenates the lists the workers return - confirm against mp.py.
returned = mp.multiprocessing(combined_all, loop, cores = 10)

 26%|██▌       | 1425/5503 [17:23<29:25,  2.31it/s]t]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 44%|████▍     | 2430/5503 [30:38<33:35,  1.52it/s]t]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 62%|██████▏   | 3393/5503 [43:37<12:22,  2.84it/s]  IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp

In [28]:
# Sum the per-shard example counts into one label -> total mapping.
combined_d = defaultdict(int)
for shard_counts in returned:
    for label_name, n_examples in shard_counts.items():
        combined_d[label_name] = combined_d[label_name] + n_examples
combined_d

defaultdict(int,
            {'others': 368462,
             'not a language': 722172,
             'indonesian': 270169,
             'malay': 380161,
             'manglish': 637166,
             'english': 963546,
             'mandarin': 171210})