`conda install -c anaconda yaml`
`conda install -c conda-forge pydub`
`conda install -c conda-forge dvc`

In [2]:
data_version = "d2"
# make it accessible in other notebooks
%store data_version
test_fraction = 0.2
duration = 1000 # in milliseconds
overlap = 100
annotation_file = "data/annotations.yaml"
data_dir = "data/raw"

Stored 'data_version' (str)


In [3]:
import yaml
from pydub import AudioSegment  # You can open and save WAV files with pure python
from random import random
import os
from dvc.api import DVCFileSystem



In [4]:
def build_directory_structure(annotation_dict):
    types = {}
    for annotation_list in annotation_dict.values():
        for entry in annotation_list:
            types[entry["type"]] = 1
    for type in types:
        test_dir = f"data/test/{type}"
        train_dir = f"data/train/{type}"
        if not os.path.exists(test_dir):
            os.makedirs(test_dir)
        if not os.path.exists(train_dir):
            os.makedirs(train_dir)


def split_to_file(segment, name, start_file_number):
    splitFileCount = 1
    start = 0
    fullDuration = segment.duration_seconds * 1000

    while (start + duration) <= fullDuration:
        end = start + duration
        audioSlice = segment[start:end]
        audioSlice = audioSlice.set_channels(1) #convert to mono
        audioSlice = audioSlice.set_frame_rate(16000) # 16 kHz
        trainOrTest = "train" if random() > test_fraction else "test"
        resultFile = f"./data/{trainOrTest}/{name}/{start_file_number + splitFileCount}.wav"
        audioSlice.export(out_f=resultFile, format="wav")
        splitFileCount += 1
        start = start + duration - overlap
    return splitFileCount

In [5]:
with open(annotation_file, 'r') as file:
    annotations = yaml.safe_load(file)
build_directory_structure(annotations)

dvc_fs = DVCFileSystem(data_dir, rev = data_version)
raw_files = dvc_fs.find("/", detail=False, dvc_only=True)

generated_file_count = 0

for raw_file in raw_files:
    audio_file = os.path.basename(raw_file)
    if not (audio_file in annotations): continue
    print(f"Processing {annotations[audio_file]}")
    audio_input = AudioSegment.from_wav(f"{data_dir}/{audio_file}")
    for annotation in annotations[audio_file]:
        start = annotation["start"] * 1000 + 1 # start position is not included, see yaml annotation
        end = annotation["end"] * 1000
        current_segment = audio_input[start:end]
        generated_file_count += split_to_file(current_segment, annotation["type"], generated_file_count)
    #print(f"\tFile ids to {generated_file_count}")

Processing [{'start': 0, 'end': 48, 'type': 'Background'}, {'start': 49, 'end': 64, 'type': 'Sniffing'}, {'start': 67, 'end': 93, 'type': 'Background'}, {'start': 97, 'end': 99, 'type': 'Sniffing'}, {'start': 101, 'end': 135, 'type': 'Background'}]
Processing [{'start': 1, 'end': 65, 'type': 'Quiet'}]
Processing [{'start': 1, 'end': 65, 'type': 'Quiet'}]
Processing [{'start': 1, 'end': 65, 'type': 'Quiet'}]
Processing [{'start': 1, 'end': 65, 'type': 'Quiet'}]
Processing [{'start': 0, 'end': 28, 'type': 'Background'}, {'start': 30, 'end': 40, 'type': 'Sniffing'}, {'start': 41, 'end': 70, 'type': 'Background'}, {'start': 71, 'end': 76, 'type': 'Sniffing'}, {'start': 76, 'end': 102, 'type': 'Background'}, {'start': 103, 'end': 136, 'type': 'Sniffing'}, {'start': 137, 'end': 180, 'type': 'Background'}, {'start': 181, 'end': 190, 'type': 'Sniffing'}, {'start': 191, 'end': 226, 'type': 'Background'}, {'start': 228, 'end': 247, 'type': 'Sniffing'}, {'start': 249, 'end': 290, 'type': 'Backgro