### <i>Pre-processing & splitting normal and/or augmented data</i>

In [None]:
import os
import librosa
import torchaudio
import torch
import math
import pandas as pd
import numpy as np
from IPython.display import Audio

In [None]:
# Build mel-spectrograms (.npy) plus a labels CSV for the training clips.
# Set include_augmented to False to generate the non-augmented dataset only.
include_augmented = True

# Every clip is trimmed, peak-normalised and looped to this many seconds.
target_duration = 13


def _trim_normalize_loop(audio, target_length):
    """Trim silence, peak-normalise and loop ``audio`` to ``target_length`` samples.

    ``audio`` is a torch tensor whose last axis is time (1-D, or
    channels x samples as returned by ``torchaudio.load``). Returns a 1-D
    tensor holding the first channel, exactly ``target_length`` samples long.

    Raises:
        ValueError: if nothing is left of the clip after trimming.
    """
    # librosa documents numpy input, so hand it an ndarray and use only the
    # returned [start, end] sample indices; the slice itself stays a tensor.
    _, (start, end) = librosa.effects.trim(audio.numpy(), top_db=40)
    trimmed = audio[..., int(start):int(end)]
    n_samples = trimmed.shape[-1]
    if n_samples == 0:
        raise ValueError("clip is empty after trimming silence")
    # Normalise by the peak of the untrimmed clip.
    trimmed = trimmed / torch.max(torch.abs(audio))
    # Loop just often enough to reach the target, then crop. Deriving the
    # count from the trimmed sample count avoids dividing by a floored
    # duration, which is zero for clips shorter than one second.
    repeats = math.ceil(target_length / n_samples)
    return trimmed.repeat(1, repeats)[0][:target_length]


train_dirs = [f"../data/Train/{name}" for name in os.listdir("../data/Train")]
if include_augmented:
    train_dirs += [f"augmented/{name}" for name in os.listdir("augmented")]
train_dirs.sort()

# np.save does not create missing directories.
os.makedirs("preprocessed/all/specs", exist_ok=True)

label_rows = []
for path in train_dirs:
    audio, sr = torchaudio.load(path)
    file_name = path.split("/")[-1].replace(".wav", ".npy")
    # The first character of the file name is used as the label.
    label = file_name[0]
    label_rows.append(f"{file_name},{label}\n")
    # Duration in seconds of the untrimmed clip.
    # NOTE(review): the test pre-processing cell may read this notebook
    # global, so the name is kept even though this cell no longer needs it.
    length = len(audio[0]) / sr
    target_length = target_duration * sr
    # cut off silence, normalize and loop to the target length
    audio = _trim_normalize_loop(audio, target_length)
    # NOTE(review): sr is not passed, so librosa's default sampling rate is
    # used for the mel filter bank. Left unchanged so these features stay
    # consistent with the test-set export, which makes the same call.
    S = librosa.feature.melspectrogram(y=audio.numpy())
    np.save(f'preprocessed/all/specs/{file_name}', S)

labels_csv_out = "".join(label_rows)
with open("preprocessed/all/labels.csv", "w") as f:
    f.write(labels_csv_out)

In [2]:
from sklearn.model_selection import train_test_split
import shutil

# Split data: a stratified 80/20 split over the original recordings, then
# every pre-processed spectrogram is copied to the side its id belongs to.
train_files = sorted(name.replace(".wav", "") for name in os.listdir("../data/Train"))

# The part of the name before the first "_" is the stratification class.
stratify = [name.split("_")[0] for name in train_files]

train_ids, val_ids = train_test_split(train_files, test_size=0.2, random_state=42, stratify=stratify)
# Set lookup instead of scanning the list once per spectrogram.
train_id_set = set(train_ids)

# shutil.copy only copies *into* a directory that already exists; with a
# missing destination it would write a single file called "specs" instead.
for set_type in ["train", "val"]:
    os.makedirs(f"preprocessed/{set_type}/specs", exist_ok=True)

for file_name in os.listdir("preprocessed/all/specs"):
    # The first 7 characters are matched against the ids from the split.
    # NOTE(review): presumably augmented files keep the original clip's name
    # as a prefix so they land on the same side as their source - confirm.
    clip_id = file_name[:7]
    src = f"preprocessed/all/specs/{file_name}"
    dst = "preprocessed/train/specs" if clip_id in train_id_set else "preprocessed/val/specs"
    shutil.copy(src, dst)

# Generate labels: one "file_name,label" row per spectrogram, where the
# label is the first character of the file name.
for set_type in ["train", "val"]:
    # Sorted so the CSV does not depend on the file system's listing order.
    # NOTE(review): files left in the folder by an earlier run are listed too.
    spec_names = sorted(os.listdir(f"preprocessed/{set_type}/specs"))
    labels = "".join(f"{file_name},{file_name[0]}\n" for file_name in spec_names)
    with open(f"preprocessed/{set_type}/labels.csv", "w") as f:
        f.write(labels)

## Process test data

In [None]:
# Pre-process test set
import convenience
import torch
df_test, sample_rates = convenience.load_test()
display(df_test.head())

target_duration = 13
# sampling rate
# NOTE(review): sample_rates returned by load_test() is not consulted; this
# assumes every test clip is sampled at 16 kHz - confirm.
sr = 16000

target_length = target_duration * sr


def _trim_normalize_loop(audio, target_length):
    """Trim silence, peak-normalise and loop ``audio`` to ``target_length`` samples.

    ``audio`` is a torch tensor whose last axis is time (1-D, or
    channels x samples). Returns a 1-D tensor holding the first channel,
    exactly ``target_length`` samples long.

    Raises:
        ValueError: if nothing is left of the clip after trimming.
    """
    # librosa documents numpy input, so hand it an ndarray and use only the
    # returned [start, end] sample indices; the slice itself stays a tensor.
    _, (start, end) = librosa.effects.trim(audio.numpy(), top_db=40)
    trimmed = audio[..., int(start):int(end)]
    n_samples = trimmed.shape[-1]
    if n_samples == 0:
        raise ValueError("clip is empty after trimming silence")
    # Normalise by the peak of the untrimmed clip.
    trimmed = trimmed / torch.max(torch.abs(audio))
    # The repeat count is derived from this clip's own trimmed sample count.
    # The previous expression read a `length` variable this cell never
    # defines, so it used whatever an earlier cell left behind (or failed),
    # and divided by zero for durations below one second.
    repeats = math.ceil(target_length / n_samples)
    return trimmed.repeat(1, repeats)[0][:target_length]


prep_test_audio = [_trim_normalize_loop(audio, target_length) for audio in df_test.audio]


Unnamed: 0,id,file_name,audio,length
0,5775,5775.wav,"[tensor(0.0003), tensor(0.0002), tensor(0.0002...",3.498687
1,5652,5652.wav,"[tensor(-9.1553e-05), tensor(-9.1553e-05), ten...",5.125625
2,5740,5740.wav,"[tensor(0.), tensor(3.0518e-05), tensor(-3.051...",2.6375
3,6428,6428.wav,"[tensor(0.), tensor(6.1035e-05), tensor(0.0002...",7.253312
4,8021,8021.wav,"[tensor(0.0003), tensor(0.0002), tensor(0.0001...",4.010625


In [4]:
# Export test set
import librosa
import numpy as np
import os

# Create the whole output path in one call; a no-op when it already exists.
os.makedirs("preprocessed/test/specs", exist_ok=True)

# The test set has no labels, so every row is "<id>," under the header.
label_rows = ["Id,label\n"]

# Iterate positionally so ids, file names and pre-processed audio stay
# aligned even if df_test does not carry a default 0..n-1 index.
for clip_id, file_name, audio in zip(df_test.id, df_test.file_name, prep_test_audio):
    # NOTE(review): sr is not passed, so librosa's default sampling rate is
    # used for the mel filter bank. Left unchanged so these features stay
    # consistent with the training spectrograms, which make the same call.
    S = librosa.feature.melspectrogram(y=audio.numpy())
    label_rows.append(f"{clip_id},\n")
    # [:-4] drops the 4-character extension (e.g. ".wav") from the file name.
    np.save(f'preprocessed/test/specs/{file_name[:-4]}.npy', S)

labels_csv_out = "".join(label_rows)
with open("preprocessed/test/labels.csv", "w") as f:
    f.write(labels_csv_out)