# Import the required packages

In [1]:
import numpy as np
from pydub import AudioSegment
import random
import os

In [2]:
Ty = 330  # Number of time steps in each label vector (labels are arrays of shape (1, Ty))

# Helper functions

In [3]:
def match_target_amplitude(sound, target_dBFS):
    """
    Shift the loudness of an audio segment to a requested level.

    Arguments:
    sound -- audio object exposing a `dBFS` attribute and an `apply_gain` method
    target_dBFS -- the loudness level to reach, in dBFS

    Returns:
    the result of applying to `sound` the gain that separates its current level from the target
    """
    gain_needed = target_dBFS - sound.dBFS
    adjusted = sound.apply_gain(gain_needed)
    return adjusted

In [4]:
def load_raw_audio():
    """
    Load every .wav file found in the three raw-data folders.

    Files that cannot be read are reported (their path is printed) and skipped.

    Returns:
    activates -- list of clips loaded from ./raw_data/activates (positive examples)
    negatives -- list of clips loaded from ./raw_data/negatives (negative examples)
    backgrounds -- list of clips loaded from ./raw_data/backgrounds (background sounds)
    """

    def _load_wavs(directory):
        # Load all "*wav" files of one folder, skipping the ones that fail to load.
        clips = []
        for filename in os.listdir(directory):
            if not filename.endswith("wav"):
                continue
            path = directory + filename
            try:
                clip = AudioSegment.from_wav(path)
            except Exception:
                # Report the offending file and move on. Without the `continue`
                # the previous clip would be appended a second time (or a
                # NameError raised if the very first file is the broken one).
                print(path)
                continue
            clips.append(clip)
        return clips

    activates = _load_wavs("./raw_data/activates/")      # Positive examples
    backgrounds = _load_wavs("./raw_data/backgrounds/")  # Background sounds
    negatives = _load_wavs("./raw_data/negatives/")      # Negative examples
    return activates, negatives, backgrounds

In [5]:
# Load all raw clips once; load_raw_audio returns them in the order (activates, negatives, backgrounds).
activates, negatives, backgrounds = load_raw_audio()

In [6]:
# Show how many clips of each kind were loaded.
len(activates),len(negatives),len(backgrounds)

(557, 1882, 1800)

# Generate the dataset
A positive example is generated by taking a random 2-second segment of the background audio and overlaying a car horn at a random position within it, so that the horn covers only a small portion of the clip.

In [7]:
def get_random_time_segment(segment_ms, background_ms=2000):
    """
    Gets a random time segment of duration segment_ms inside a background clip.

    Arguments:
    segment_ms -- the duration of the segment to place, in ms ("ms" stands for "milliseconds")
    background_ms -- the duration of the background clip in ms (defaults to 2,000)

    Returns:
    segment_time -- a tuple of (segment_start, segment_end) in ms; both ends are inclusive,
                    so segment_end - segment_start == segment_ms - 1

    Raises:
    ValueError -- if the segment is longer than the background
    """

    latest_start = background_ms - segment_ms
    if latest_start < 0:
        raise ValueError(
            "segment of %d ms does not fit in a %d ms background" % (segment_ms, background_ms)
        )

    # randint's `high` is exclusive, so add 1 to allow the segment to end exactly on the
    # last millisecond of the background (and to accept a segment as long as the background).
    segment_start = np.random.randint(low=0, high=latest_start + 1)
    segment_end = segment_start + segment_ms - 1

    return (segment_start, segment_end)

In [8]:
def is_overlapping(segment_time, previous_segments):
    """
    Checks if the time of a segment overlaps with the times of existing segments.

    Segments are closed intervals: two segments that share even a single
    millisecond (including an end point) count as overlapping.

    Arguments:
    segment_time -- a tuple of (segment_start, segment_end) for the new segment
    previous_segments -- a list of tuples of (segment_start, segment_end) for the existing segments

    Returns:
    True if the time segment overlaps with any of the existing segments, False otherwise
    """

    segment_start, segment_end = segment_time
    # Two closed intervals intersect exactly when each one starts no later than the
    # other ends. This also covers identical segments and a new segment that fully
    # contains an existing one, which an end-point-inside-the-other test misses.
    return any(
        segment_start <= previous_end and segment_end >= previous_start
        for previous_start, previous_end in previous_segments
    )

In [9]:
def insert_audio_clip(background, audio_clip, previous_segments, max_attempts=10000):
    """
    Insert a new audio segment over the background noise at a random time step, ensuring that the
    audio segment does not overlap with existing segments.

    Arguments:
    background -- a 2 second background audio recording.
    audio_clip -- the audio clip to be inserted/overlaid.
    previous_segments -- times where audio segments have already been placed; the position
                         chosen for audio_clip is appended to this list in place
    max_attempts -- how many random positions to try before giving up

    Returns:
    new_background -- the updated background audio
    segment_time -- the (segment_start, segment_end) tuple, in ms, where the clip was placed

    Raises:
    RuntimeError -- if no non-overlapping position was found within max_attempts tries
    """
    segment_ms = len(audio_clip)

    # Rejection sampling with an upper bound: when the already placed segments leave
    # no room for this clip, an unbounded retry loop would never terminate.
    for _ in range(max_attempts):
        segment_time = get_random_time_segment(segment_ms)
        if not is_overlapping(segment_time, previous_segments):
            break
    else:
        raise RuntimeError(
            "could not place a %d ms clip without overlap after %d attempts"
            % (segment_ms, max_attempts)
        )

    previous_segments.append(segment_time)
    new_background = background.overlay(audio_clip, position=segment_time[0])
    return new_background, segment_time

In [10]:
def insert_ones(y, segment_end_ms):
    """
    Update the label vector y. The labels of the 10 output steps strictly after the end of the
    segment are set to 1. By strictly we mean that the label of segment_end_y itself is left
    untouched while the 10 following labels become ones. Steps that would fall past the end
    of y are dropped.

    Arguments:
    y -- numpy array of shape (1, Ty), the labels of the training example; modified in place
    segment_end_ms -- the end time of the segment in ms (expected to be non-negative)

    Returns:
    y -- updated labels (the same array that was passed in)
    """
    # Take the number of time steps from the array itself so the labels can never be
    # written with a width that disagrees with the array being updated.
    n_steps = y.shape[1]

    # Map the end time (within the 2,000 ms clip) to an output step index.
    segment_end_y = int(segment_end_ms * n_steps / 2000.0)

    first = segment_end_y + 1
    last = min(segment_end_y + 11, n_steps)  # exclusive; clipped to the array width
    y[0, first:last] = 1

    return y

In [17]:
def create_training_example(background, activates, negatives, j):
    """
    Creates a training example with a given background, activates, and negatives.

    One or two randomly chosen "activate" clips and at most one randomly chosen negative
    clip are overlaid on the background. The result is normalised to -20 dBFS and written
    to ./training_data/train<j>.wav.

    Arguments:
    background -- a 2 second background audio recording
    activates -- a list of audio segments of the word "activate"
    negatives -- a list of audio segments of random words that are not "activate"
    j -- index of the example, used to build the name of the exported .wav file

    Returns:
    y -- the label at each time step of the spectrogram, array of shape (1, Ty)
    """
    y = np.zeros((1, Ty))
    previous_segments = []

    # Overlay 1 or 2 positive clips and label the steps right after each of them.
    number_of_activates = np.random.randint(1, 3)
    random_indices = np.random.randint(len(activates), size=number_of_activates)
    for index in random_indices:
        background, (_, segment_end) = insert_audio_clip(background, activates[index], previous_segments)
        y = insert_ones(y, segment_end)

    # Overlay 0 or 1 negative clips; these leave the labels untouched.
    number_of_negatives = np.random.randint(0, 2)
    random_indices = np.random.randint(len(negatives), size=number_of_negatives)
    for index in random_indices:
        background, _ = insert_audio_clip(background, negatives[index], previous_segments)

    background = match_target_amplitude(background, -20.0)

    # export() hands back the file object it wrote to; close it so that generating
    # many examples does not accumulate open file handles.
    file_handle = background.export("./training_data/train" + str(j) + ".wav", format="wav")
    file_handle.close()

    return y

In [24]:
# Build one training example per background clip.
# `j` numbers the exported files starting at 1 and is left at the last index used,
# so that a later cell can keep counting from it; `y_` collects the label array of
# every example, in file order.
y_ = []
j = 0
for j, background in enumerate(backgrounds, start=1):
    y = create_training_example(background, activates, negatives, j)
    y_.append(y)
# The labels are not written to disk here (the np.save call at this point was left disabled).

In [25]:
# Show the shape of the most recent label array and how many label arrays have been collected.
y.shape,len(y_)

((1, 330), 1800)

In [26]:
# Top up the dataset with background-only examples: each background clip is exported
# unchanged with an all-zero label vector, continuing the file numbering from `j`,
# until 2500 examples exist in total. The labels of all examples are then saved.
# NOTE(review): unlike the examples written by create_training_example, these clips are
# exported without match_target_amplitude -- confirm that this is intended.
for background in backgrounds:
    j+=1
    y = np.zeros((1,Ty))
    # export() returns the file object it wrote to; close it to avoid leaking handles.
    background.export("./training_data/train" + str(j) + ".wav", format="wav").close()
    y_.append(y)
    if(j>=2500):
        break
np.save("./training_data/y.npy",y_)

In [27]:
len(y_)  # Number of label arrays collected, i.e. the number of examples generated

2500

# Other helper functions

## Splitting the background sound into parts
For splitting the background recording into 2-second clips.

In [None]:
# Cut the background recording into consecutive 2-second clips, written to
# ./raw_data/backgrounds/b1.wav, b2.wav, ...  At most the first 3600 seconds are used.
t1 = 0  # start of the current clip, in seconds
x = 0   # number of clips exported so far (also the file-name suffix)
A = AudioSegment.from_mp3("./bG.mp3")
total_seconds = len(A) / 1000.0  # len() of the recording is its duration in ms
# Stop at the end of the recording as well as at the 3600 s limit: slicing past the
# end would otherwise export empty (or shorter than 2 s) background clips.
while(t1<3600 and t1+2<=total_seconds):
    t2 = t1+2
    x+=1
    nA = A[t1*1000:t2*1000]
    # export() returns the file object it wrote to; close it to avoid leaking handles.
    nA.export('./raw_data/backgrounds/b'+str(x)+'.wav', format = "wav").close()
    t1 = t2

## Splitting the dataset
For making 1-second clips from the raw examples (the cell below is set up for the negative ones) to superimpose over the background.

In [None]:
# Cut every negative recording into consecutive 1-second clips, written to
# ./raw_data/split/negative1.wav, negative2.wav, ...
x = 0  # running counter used to number the exported files
for negative in negatives:
    # One clip starts every 1000 ms while still inside the recording, and at most
    # 6 clips (the first 6 seconds) are taken from any one recording. The last clip
    # of a recording may be shorter than 1 second.
    for start_ms in range(0, min(len(negative), 6000), 1000):
        x += 1
        piece = negative[start_ms:start_ms + 1000]
        # export() returns the file object it wrote to; close it to avoid leaking handles.
        piece.export('./raw_data/split/negative'+str(x)+'.wav', format = "wav").close()
    # No extra export for recordings shorter than 400 ms: the loop above has already
    # written such a recording in full, and repeating the export under the same number
    # would, for an empty recording, overwrite the clip written for the previous one.