# 1. Import Libraries

In [1]:
from IPython.display import Javascript
from google.colab import output
from base64 import b64decode
from io import BytesIO
!pip -q install pydub
from pydub import AudioSegment
import numpy as np

In [2]:
# JavaScript injected into the Colab output frame. It defines a global
# `record(ms)` that captures microphone audio for `ms` milliseconds and
# resolves to the recording encoded as a base64 data URL.
RECORD = """
const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = e => resolve(e.target.result)
  reader.readAsDataURL(blob)
})
var record = time => new Promise(async (resolve, reject) => {
  let stream
  // Stop every track so the microphone is released after each recording.
  const release = () => stream && stream.getTracks().forEach(track => track.stop())
  try {
    stream = await navigator.mediaDevices.getUserMedia({ audio: true })
    const recorder = new MediaRecorder(stream)
    const chunks = []
    recorder.ondataavailable = e => chunks.push(e.data)
    recorder.onstop = async () => {
      release()
      resolve(await b2text(new Blob(chunks)))
    }
    recorder.start()
    await sleep(time)
    recorder.stop()
  } catch (err) {
    // Reject instead of leaving the promise pending (e.g. permission denied).
    release()
    reject(err)
  }
})
"""

def record(name, count, sec, v=True):
  """Record `count` microphone clips of `sec` seconds and save them as MP3.

  Each clip is captured in the browser through the `RECORD` JavaScript helper,
  returned as a base64 data URL, decoded, and exported to
  `<name>#<k>.mp3` with `k` counting from 1.

  Args:
    name: File-name prefix for the saved clips.
    count: Number of clips to record.
    sec: Length of every clip in seconds.
    v: When true, print a start/end line around each recording.

  Returns:
    List of the MP3 file names that were written, in recording order.
  """
  saved = []
  for take in range(1, count + 1):
    tag = str(take)

    if v:
      print('Record start #'+tag)
    # Re-inject the helper before each take, then block until the clip is done.
    display(Javascript(RECORD))
    data_url = output.eval_js('record(%d)' % (sec*1000))
    if v:
      print('Record end #'+tag)

    # A data URL looks like "<header>,<base64 payload>"; keep the payload only.
    payload = b64decode(data_url.split(',')[1])
    mp3_path = name+'#'+tag+'.mp3'
    AudioSegment.from_file(BytesIO(payload)).export(mp3_path, format='mp3')
    saved.append(mp3_path)

  return saved

In [3]:
import librosa
import pandas as pd
import os

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import scipy
import librosa.display
from IPython.display import Audio

# 2. Data Augmentation

Here we use:
  1. Noise (add random noise to the audio)
  2. Random Shifting (shift the audio so that it starts earlier or later)
  3. Value Augmentation (scale the amplitude by a random factor)

In [4]:
def add_noise(y, sr=0):
    """Return a float64 copy of ``y`` with low-amplitude Gaussian noise added.

    The noise is scaled by ``0.005 * u * max(y)``, where ``u`` is drawn
    uniformly from [0, 1), so the noise level differs between calls.

    Args:
        y: Audio samples as a NumPy array of any shape. It is not modified.
        sr: Unused by this function.

    Returns:
        A new float64 array with the same shape as ``y``.
    """
    noise_amp = 0.005 * np.random.uniform() * np.amax(y)
    # Draw noise for the full shape of ``y`` rather than only its first
    # dimension, so multi-channel input works instead of failing to broadcast.
    noise = np.random.normal(size=y.shape)

    # astype() already returns a new array, so the input is left untouched.
    return y.astype('float64') + noise_amp * noise

def random_shift(y, sr=0):
    """Shift 1-D audio ``y`` in time by a random offset, zero-filling the gap.

    The offset is drawn uniformly from about -20% to +20% of the signal
    length. A positive offset delays the audio (zeros at the start); a
    negative offset advances it (zeros at the end).

    Args:
        y: 1-D NumPy array of audio samples. It is not modified.
        sr: Unused by this function.

    Returns:
        A new array with the same length as ``y``.
    """
    n_samples = y.shape[0]
    timeshift_fac = 0.2 * 2 * (np.random.uniform()-0.5)  # up to 20% of length
    start = int(n_samples * timeshift_fac)

    if start > 0:
        # Delay: prepend zeros and drop the samples pushed past the end.
        return np.pad(y, (start, 0), mode='constant')[:n_samples]
    if start < 0:
        # Advance: append zeros and drop the leading samples. Keeping the first
        # n_samples here would cut the padding off and leave the audio unshifted.
        return np.pad(y, (0, -start), mode='constant')[-start:]
    return y.copy()

def value_augmentation(y, sr=0):
    """Return a copy of ``y`` multiplied by a random gain.

    The gain is drawn uniformly from [1.5, 3). ``sr`` is unused, and the
    input array is not modified.
    """
    louder = y.copy()
    gain = np.random.uniform(low=1.5, high=3)
    return louder * gain

# 3. Make the Pickle Files

We read every available audio file and extract its Mel spectrogram, MFCCs, and zero-crossing rate (ZCR).

We then repeat the same extraction on copies of each file modified by the augmentations defined above.

In [5]:
def extract(name, files, n_frames=130):
  """Extract features from audio files and pickle them as a DataFrame.

  Every file yields four rows, in this order: the unmodified signal, then
  the signal passed through add_noise, random_shift and value_augmentation.
  Each row holds the label plus the Mel spectrogram, 13 MFCCs and the
  zero-crossing rate, each padded or truncated to `n_frames` columns.

  Args:
    name: Path of the pickle file to write.
    files: Iterable of audio file paths. The label is the part of the file
      name (without extension) before the first '#'.
    n_frames: Number of time frames every feature matrix is fixed to.

  Returns:
    None. The DataFrame (columns 'label', 'mel_spec', 'mfccs', 'zcr') is
    written to `name`.
  """
  # None stands for "use the signal as loaded"; the rest are augmentations.
  variants = (None, add_noise, random_shift, value_augmentation)

  def fit(feature):
    # Force a common width so rows can later be stacked into one array.
    return librosa.util.fix_length(feature, size=n_frames, axis=1)

  data = []
  for file in files:
    # Decode and parse the label once per file instead of once per variant;
    # each augmentation returns a new array, so the loaded signal is reusable.
    y_raw, sr = librosa.load(file)
    label = os.path.splitext(os.path.basename(file))[0].split('#')[0]

    for augment in variants:
      y = y_raw if augment is None else augment(y_raw)

      mel_spec = fit(librosa.feature.melspectrogram(y=y, sr=sr))
      mfccs = fit(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=512))
      zcr = fit(librosa.feature.zero_crossing_rate(y))

      data.append([label, mel_spec, mfccs, zcr])

  df = pd.DataFrame(data, columns=['label', 'mel_spec', 'mfccs', 'zcr'])
  df.to_pickle(name)

In [6]:
# Record three clips per label. Labels '1' and '2' index into the `people`
# list used at prediction time; '69' is the label the prediction loop reports
# as "No one is talking".
record('1', count=3, sec=5)
record('2', count=3, sec=3)
record('69', count=3, sec=3)

Record start #1


<IPython.core.display.Javascript object>

KeyboardInterrupt: 

In [None]:
"The quick brown fox jumps over the lazy dog."
"Sally sells seashells by the seashore on sunny Sundays."
"In the moonlight, shadows dance across the tranquil water."

people = ['Josh', 'Aurick']
ids = ['1', '2']
all_files = []
for i in ids:
  print('Recording for '+i)
  files = record(i, 3, 2)
  all_files += files

#all_files = ['1#1.mp3', '1#2.mp3', '1#3.mp3', '2#1.mp3', '2#2.mp3', '2#3.mp3', '69#1.mp3', '69#2.mp3', '69#3.mp3']

extract('data.pkl', all_files)

Recording for 1


NameError: name 'record' is not defined

# 4. Start the training using Deep Learning

In [7]:
# Remove any installed tflearn, then install a fork from the
# `fix/is_sequence_missing` branch of MihaMarkic/tflearn.
# NOTE(review): this installs from a branch name rather than a fixed commit,
# so a later run may pick up different code - consider pinning a commit hash.
!pip uninstall tflearn -y
!pip install git+https://github.com/MihaMarkic/tflearn.git@fix/is_sequence_missing

[0mCollecting git+https://github.com/MihaMarkic/tflearn.git@fix/is_sequence_missing
  Cloning https://github.com/MihaMarkic/tflearn.git (to revision fix/is_sequence_missing) to /tmp/pip-req-build-uaolednw
  Running command git clone --filter=blob:none --quiet https://github.com/MihaMarkic/tflearn.git /tmp/pip-req-build-uaolednw
  Running command git checkout -b fix/is_sequence_missing --track origin/fix/is_sequence_missing
  Switched to a new branch 'fix/is_sequence_missing'
  Branch 'fix/is_sequence_missing' set up to track remote branch 'fix/is_sequence_missing' from 'origin'.
  Resolved https://github.com/MihaMarkic/tflearn.git to commit 6472b8588e758ff4a33a2764d4ee638bbd0e42f0
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: tflearn
  Building wheel for tflearn (setup.py) ... [?25l[?25hdone
  Created wheel for tflearn: filename=tflearn-0.5.0-py3-none-any.whl size=130659 sha256=f26d0c4ecb0818dd04502a72615261d16d09ef3cd9a665fbbbb749e45f0

In [8]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tflearn
from tflearn.data_utils import to_categorical
import librosa

Instructions for updating:
non-resource variables are not supported in the long term


In [None]:
# Load the pickled feature table; `df` and `train_data` name the same object.
train_data = df = pd.read_pickle("data.pkl")

In [None]:
def _stack_deltas(features):
  """Append first- and second-order deltas to every feature matrix.

  For each 2-D matrix in `features`, the matrix, its delta and its
  second-order delta are joined along the last axis, which triples the
  number of columns. Returns the results stacked into one array.
  """
  stacked = []
  for feat in features:
    delta = librosa.feature.delta(feat)
    delta2 = librosa.feature.delta(feat, order=2)
    stacked.append(np.c_[feat, delta, delta2])
  return np.array(stacked)


# Unpack each DataFrame column of per-sample matrices into one array.
x_mel_spec = np.array(train_data['mel_spec'].tolist())
x_mfccs = np.array(train_data['mfccs'].tolist())
x_zcr = np.array(train_data['zcr'].tolist())
y = np.array(train_data['label']).astype(int)

x_train_mel_spec = _stack_deltas(x_mel_spec)
x_train_mfccs = _stack_deltas(x_mfccs)
x_train_zcr = _stack_deltas(x_zcr)

# The one-hot width is max(label) + 1, so each integer label is used directly
# as its class index.
y_train = to_categorical(y, nb_classes=max(y)+1)

print('x_train_mel_spec: ', x_train_mel_spec.shape)
print('x_train_mfccs: ', x_train_mfccs.shape)
print('x_train_zcr: ', x_train_zcr.shape)
print('y_train: ', y_train.shape, set(y))

x_train_mel_spec:  (36, 128, 390)
x_train_mfccs:  (36, 13, 390)
x_train_zcr:  (36, 1, 390)
y_train:  (36, 70) {1, 2, 69}


In [None]:
from tflearn.layers.core import input_data, fully_connected, dropout
from tflearn.layers.estimator import regression
from tflearn.layers.merge_ops import merge

# Hyperparameters.
layer_size = 128
# NOTE(review): tflearn's dropout() documents its second argument as the keep
# probability, so 0.7 would keep 70% of the units - confirm this is intended.
dropout_rate = 0.7
learning_rate = 0.001

# Start from an empty graph so re-running this cell does not stack layers.
tf.compat.v1.reset_default_graph()
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

# One input per feature type, created in the order the arrays are fed to fit().
f_layers = [
  input_data(shape=[None, *features.shape[1:]])
  for features in (x_train_mel_spec, x_train_mfccs, x_train_zcr)
]

# Join the three inputs along axis 1 (the feature-row axis).
f_merged = merge(f_layers, mode='concat', axis=1)

# Three equally sized ReLU layers, dropout, then a softmax over the classes.
net = f_merged
for _hidden in range(3):
  net = fully_connected(net, layer_size, activation='relu')
net = dropout(net, dropout_rate)
net = fully_connected(net, max(y)+1, activation='softmax')
net = regression(
  net,
  optimizer='adam',
  loss='categorical_crossentropy',
  learning_rate=learning_rate,
)

In [None]:
# Wrap the graph in a DNN trainer and fit it on the three feature arrays,
# passing validation_set=0.1 to hold part of the data out for validation.
model = tflearn.DNN(net)
model.fit(
  [x_train_mel_spec, x_train_mfccs, x_train_zcr],
  y_train,
  n_epoch=10,
  validation_set=0.1,
  show_metric=True,
  snapshot_step=1000,
)

---------------------------------
Run id: POKQK6
Log directory: /tmp/tflearn_logs/
---------------------------------
Training samples: 32
Validation samples: 4
--
Training Step: 1  | time: 1.386s
| Adam | epoch: 001 | loss: 0.00000 - acc: 0.0000 | val_loss: 3.18098 - val_acc: 0.0000 -- iter: 32/32
--
Training Step: 2  | time: 1.101s
| Adam | epoch: 002 | loss: 0.00000 - acc: 0.0000 | val_loss: 1.62062 - val_acc: 0.0000 -- iter: 32/32
--
Training Step: 3  | time: 1.077s
| Adam | epoch: 003 | loss: 0.00000 - acc: 0.0000 | val_loss: 0.65924 - val_acc: 1.0000 -- iter: 32/32
--
Training Step: 4  | time: 1.062s
| Adam | epoch: 004 | loss: 0.00000 - acc: 0.0000 | val_loss: 0.68317 - val_acc: 0.7500 -- iter: 32/32
--
Training Step: 5  | time: 1.064s
| Adam | epoch: 005 | loss: 0.00000 - acc: 0.0000 | val_loss: 0.51632 - val_acc: 0.7500 -- iter: 32/32
--
Training Step: 6  | time: 1.085s
| Adam | epoch: 006 | loss: 0.00000 - acc: 0.0000 | val_loss: 0.14969 - val_acc: 1.0000 -- iter: 32/32
--
Tra

In [None]:
def format():
  """Record a one-second clip and return its features ready for the model.

  The clip is recorded silently under the name '0', run through extract()
  into 'test.pkl', and read back. Each feature matrix is then extended with
  its first- and second-order deltas.

  Returns:
    A list `[mel_spec, mfccs, zcr]` of arrays, in the order the model's
    inputs were declared. Each array has one entry per row written by
    extract().
  """
  def with_deltas(features):
    # Join each matrix with its delta and second-order delta along the last axis.
    stacked = []
    for feat in features:
      delta = librosa.feature.delta(feat)
      delta2 = librosa.feature.delta(feat, order=2)
      stacked.append(np.c_[feat, delta, delta2])
    return np.array(stacked)

  files = record('0', 1, 1, False)
  extract('test.pkl', files)
  test_data = pd.read_pickle("test.pkl")

  return [
    with_deltas(np.array(test_data[column].tolist()))
    for column in ('mel_spec', 'mfccs', 'zcr')
  ]

In [None]:
from time import sleep
import warnings
import IPython
from IPython.display import clear_output

# np.VisibleDeprecationWarning is no longer a top-level NumPy name from 2.0 on;
# fall back to np.exceptions so this cell runs on either version.
_visible_deprecation = getattr(np, 'VisibleDeprecationWarning', None)
if _visible_deprecation is None:
  _visible_deprecation = np.exceptions.VisibleDeprecationWarning
warnings.filterwarnings("ignore", category=_visible_deprecation)

i = 1

people = ['Josh', 'Aurick']

# Record, classify and report continuously until the cell is interrupted.
while True:
  #sleep(3)
  test_data = format()
  pred = model.predict(test_data)
  y_pred = np.argmax(pred, axis=1)

  # Only the first prediction row is reported.
  label = int(y_pred[0])

  clear_output()
  print('#'+str(i), end=' - ')
  i += 1
  if label == 69:
    print('No one is talking')
  elif 1 <= label <= len(people):
    print(people[label-1] + ' is talking')
  else:
    # The softmax has an output for every integer up to the largest label, so
    # the argmax can be a class with no matching entry in `people`. Report it
    # instead of raising IndexError (or wrapping around to the last name at 0).
    print('Unknown speaker (class ' + str(label) + ')')

#7 - No one is talking


<IPython.core.display.Javascript object>

KeyboardInterrupt: ignored