In [6]:
import pandas as pd
import numpy as np
import matplotlib as plt
from pathlib import Path
import os
import math, random
import torch
import torchaudio
from torchaudio import transforms
from IPython.display import Audio

from sklearn.preprocessing import LabelEncoder

In [8]:
# Load the ESC-50 metadata table (one row per clip).
# The path is assembled with pathlib instead of a backslash-separated literal:
# '\E', '\m' and '\e' are invalid escape sequences (a SyntaxWarning on recent
# Python versions) and a backslash path only resolves on Windows.
df = pd.read_csv(Path('ESC-50') / 'ESC-50-master' / 'meta' / 'esc50.csv')
df.head()

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,1-100032-A-0.wav,1,0,dog,True,100032,A
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A


In [10]:
# Collect selected metadata columns of `df` into module-level lists.
# NOTE: every list ends up with at most ONE element -- the complete pandas
# Series of that column -- rather than the individual cell values.
_wanted_columns = ('filename', 'fold', 'target', 'category', 'src_file', 'take')
_column_series = {name: [] for name in _wanted_columns}

for column_name in df.columns:
    if column_name in _column_series:
        _column_series[column_name].append(df[column_name])

filenames = _column_series['filename']
fold = _column_series['fold']
target = _column_series['target']
category = _column_series['category']
src_file = _column_series['src_file']
take = _column_series['take']

    

In [12]:
import numpy as np
import pydub
import librosa

class Clip:
    """A single 5-sec long recording with its MFCC and zero-crossing-rate features.

    Constructing a ``Clip`` loads the audio once, computes ``melspectrogram``,
    ``logamplitude``, ``mfcc`` and ``zcr``, and then releases the raw samples.

    Attributes:
        filename: Base name of the audio file.
        path: Absolute path of the audio file.
        directory: Directory containing the audio file.
        category: Name of the directory that directly contains the file.
        audio: ``Clip.Audio`` context manager for (re)loading the samples.
    """

    RATE = 44100   # All recordings in ESC are 44.1 kHz
    FRAME = 512    # Frame size in samples

    class Audio:
        """The actual audio data of the clip.

            Uses a context manager to load/unload the raw audio data. This way clips
            can be processed sequentially with reasonable memory usage.
        """

        def __init__(self, path):
            """Remember ``path``; nothing is read until the context is entered."""
            self.path = path

        def __enter__(self):
            """Load the clip and expose it as ``data`` (pydub segment) and ``raw`` (floats)."""
            # Actual recordings are sometimes not frame accurate, so we trim/overlay to exactly 5 seconds
            self.data = pydub.AudioSegment.silent(duration=5000)
            self.data = self.data.overlay(pydub.AudioSegment.from_file(self.path)[0:5000])
            # The buffer is interpreted as 16-bit integers and scaled to roughly [-1, 1].
            # NOTE(review): assumes 16-bit samples -- confirm for non-ESC input files.
            self.raw = (np.frombuffer(self.data._data, dtype="int16") + 0.5) / (0x7FFF + 0.5)   # convert to float

            return self

        def __exit__(self, exception_type, exception_value, traceback):
            """Report any exception raised inside the block, then drop the loaded audio.

            Returns ``None``, so the exception still propagates to the caller.
            """
            if exception_type is not None:
                print(exception_type, exception_value, traceback)
            del self.data
            del self.raw

    def __init__(self, filename):
        """Load ``filename`` and compute its features.

        Args:
            filename: Path (relative or absolute) to the audio file.
        """
        self.filename = os.path.basename(filename)
        self.path = os.path.abspath(filename)
        self.directory = os.path.dirname(self.path)
        # basename() honours the platform separator; splitting on '/' returned
        # the whole path on Windows, where abspath() produces backslashes.
        self.category = os.path.basename(self.directory)

        self.audio = Clip.Audio(self.path)

        with self.audio as audio:
            self._compute_mfcc(audio)
            self._compute_zcr(audio)

    def _compute_mfcc(self, audio):
        """Compute the mel spectrogram, its dB version and 13 MFCCs from ``audio.raw``."""
        # MFCC computation with default settings (2048 FFT window length, 512 hop length, 128 bands)
        self.melspectrogram = librosa.feature.melspectrogram(y=audio.raw, sr=Clip.RATE, hop_length=Clip.FRAME)
        # melspectrogram() returns a *power* spectrogram by default, so the dB
        # conversion must be power_to_db; amplitude_to_db would double every value.
        self.logamplitude = librosa.power_to_db(self.melspectrogram)
        # Transposed so that the first axis of `mfcc` is the frame index.
        self.mfcc = librosa.feature.mfcc(S=self.logamplitude, n_mfcc=13).transpose()

    def _compute_zcr(self, audio):
        """Compute the zero-crossing rate of every ``FRAME``-sample frame into ``self.zcr``."""
        # Zero-crossing rate
        # len(audio.data) is used as a duration in milliseconds.
        frames = int(np.ceil(len(audio.data) / 1000.0 * Clip.RATE / Clip.FRAME))

        rates = []
        for i in range(frames):
            frame = Clip._get_frame(audio, i)
            # |diff(sign)| is 2 at a sign change, so the 0.5 factor counts crossings.
            rates.append(np.mean(0.5 * np.abs(np.diff(np.sign(frame)))))

        self.zcr = np.asarray(rates)

    @classmethod
    def _get_frame(cls, audio, index):
        """Return frame number ``index`` of ``audio.raw`` (``None`` for a negative index).

        The last frame may be shorter than ``FRAME`` samples.
        """
        if index < 0:
            return None
        return audio.raw[(index * cls.FRAME):(index + 1) * cls.FRAME]

    def __repr__(self):
        return '<{0}/{1}>'.format(self.category, self.filename)

In [15]:
import librosa
import numpy as np
from scipy import signal

class AudioUtil:
    """Stateless helpers for loading, normalising and augmenting audio.

    Waveforms are NumPy arrays whose LAST axis is time: 1-D ``(samples,)`` for
    mono, 2-D ``(channels, samples)`` otherwise. Spectrograms are arrays whose
    last two axes are ``(n_mels, n_frames)``.
    """

    # Sample rate assumed when the caller does not supply one; the ESC
    # recordings used in this notebook are 44.1 kHz.
    DEFAULT_SR = 44100

    @staticmethod
    def open(file_path, return_sr=False):
        """Load an audio file at its native sample rate.

        Args:
            file_path: Path of the audio file.
            return_sr: When True, return ``(audio, sample_rate)`` instead of
                just ``audio`` so the real rate can be passed to ``resample``.

        Returns:
            The waveform, or ``(waveform, sample_rate)`` if ``return_sr``.
        """
        audio, sr = librosa.load(file_path, sr=None)
        if return_sr:
            return audio, sr
        return audio

    @staticmethod
    def resample(audio, target_sr, orig_sr=DEFAULT_SR):
        """Resample ``audio`` from ``orig_sr`` to ``target_sr``.

        The source rate used to be taken from ``len(audio)``, which is the
        number of samples, not a rate. It is now an explicit argument.

        Args:
            audio: Waveform to resample.
            target_sr: Desired sample rate in Hz.
            orig_sr: Sample rate of ``audio`` in Hz (defaults to ``DEFAULT_SR``).

        Returns:
            The resampled waveform (``audio`` itself when the rates match).
        """
        if orig_sr == target_sr:
            return audio
        return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)

    @staticmethod
    def rechannel(audio, target_channels):
        """Convert ``audio`` to ``target_channels`` channels.

        A 1-D array counts as one channel. Down-mixing to mono averages the
        channels and returns a 1-D array; otherwise channels are repeated (or
        dropped) and a ``(target_channels, samples)`` array is returned.

        Raises:
            ValueError: If ``target_channels`` is smaller than 1.
        """
        if target_channels < 1:
            raise ValueError("target_channels must be at least 1")

        audio = np.asarray(audio)
        n_channels = 1 if audio.ndim == 1 else audio.shape[0]
        if n_channels == target_channels:
            return audio
        if target_channels == 1:
            return audio.mean(axis=0)

        channels = audio.reshape(1, -1) if audio.ndim == 1 else audio
        # Ceiling division: enough repetitions to reach target_channels rows.
        reps = -(-target_channels // channels.shape[0])
        return np.tile(channels, (reps, 1))[:target_channels]

    @staticmethod
    def pad_trunc(audio, target_length):
        """Zero-pad at the end, or truncate, so the time axis has ``target_length`` samples."""
        audio = np.asarray(audio)
        n_samples = audio.shape[-1]
        if n_samples < target_length:
            # Pad only the (last) time axis.
            pad_width = [(0, 0)] * (audio.ndim - 1) + [(0, target_length - n_samples)]
            return np.pad(audio, pad_width)
        return audio[..., :target_length]

    @staticmethod
    def time_shift(audio, shift_pct):
        """Circularly shift ``audio`` along time by ``shift_pct`` of its length."""
        audio = np.asarray(audio)
        shift_amount = int(audio.shape[-1] * shift_pct)
        return np.roll(audio, shift_amount, axis=-1)

    @staticmethod
    def spectrogram(audio, n_mels=64, n_fft=1024, hop_len=None, sr=DEFAULT_SR):
        """Compute a log-scaled (dB, relative to the maximum) mel spectrogram.

        Args:
            audio: Waveform.
            n_mels: Number of mel bands.
            n_fft: FFT window length.
            hop_len: Hop length, forwarded to librosa as ``hop_length``.
            sr: Sample rate of ``audio`` in Hz (previously ``len(audio)`` was
                passed, which is not a sample rate).

        Returns:
            The mel spectrogram converted with ``librosa.power_to_db``.
        """
        # `y` is passed by keyword; recent librosa versions reject it positionally.
        mel_spec = librosa.feature.melspectrogram(
            y=audio, sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_len
        )
        return librosa.power_to_db(mel_spec, ref=np.max)

    @staticmethod
    def spectro_augment(spectrogram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2):
        """Mask random frequency bands and time spans of a spectrogram.

        Each frequency mask overwrites up to ``max_mask_pct`` of the mel bands
        (rows) and each time mask up to ``max_mask_pct`` of the frames
        (columns) with the mean of the input. The input is left untouched.

        Widths and positions are drawn from NumPy's global RNG, so call
        ``np.random.seed`` beforehand for repeatable output.

        Args:
            spectrogram: Array whose last two axes are ``(n_mels, n_frames)``.
            max_mask_pct: Maximum masked fraction per mask, along its axis.
            n_freq_masks: Number of frequency masks.
            n_time_masks: Number of time masks.

        Returns:
            A masked copy of ``spectrogram``.
        """
        augmented_spec = np.array(spectrogram, copy=True)
        if augmented_spec.ndim < 2 or augmented_spec.size == 0:
            return augmented_spec

        n_mels, n_frames = augmented_spec.shape[-2:]
        mask_value = augmented_spec.mean()
        max_freq_width = max(0, min(n_mels, int(max_mask_pct * n_mels)))
        max_time_width = max(0, min(n_frames, int(max_mask_pct * n_frames)))

        for _ in range(n_freq_masks):
            width = np.random.randint(0, max_freq_width + 1)
            start = np.random.randint(0, n_mels - width + 1)
            augmented_spec[..., start:start + width, :] = mask_value

        for _ in range(n_time_masks):
            width = np.random.randint(0, max_time_width + 1)
            start = np.random.randint(0, n_frames - width + 1)
            augmented_spec[..., start:start + width] = mask_value

        return augmented_spec


In [14]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import seaborn as sb
# Apply seaborn's "white" style and "muted" colour palette to subsequent plots.
sb.set(style="white", palette="muted")

import pandas as pd
import random
# Seed Python's built-in `random` module so its draws are repeatable.
# This does not seed NumPy's or PyTorch's random generators.
random.seed(20150420)

In [20]:
from torch.utils.data import DataLoader, Dataset, random_split
import torchaudio

# ----------------------------
# Sound Dataset
# ----------------------------
class SoundDS(Dataset):
  """Dataset over the rows of a metadata DataFrame of audio clips.

  By default an item is ``(full_path, class_id)``. With
  ``return_spectrogram=True`` the file is loaded and run through the
  ``AudioUtil`` pipeline, and an item is ``(augmented_spectrogram, class_id)``.

  Args:
    df: DataFrame with at least a 'filename' and a 'target' column.
    data_path: Directory that contains the audio files.
    return_spectrogram: Opt in to loading/processing the audio. Defaults to
      False, which keeps the path-returning behaviour.
  """

  def __init__(self, df, data_path, return_spectrogram=False):
    self.df = df
    self.data_path = str(data_path)
    # NOTE(review): treated as milliseconds when converted to a sample count
    # in _load_spectrogram -- confirm the intended unit.
    self.duration = 4000
    self.sr = 44000
    self.channel = 1
    self.shift_pct = 0.4
    self.return_spectrogram = return_spectrogram

  # ----------------------------
  # Number of items in dataset
  # ----------------------------
  def __len__(self):
    return len(self.df)

  # ----------------------------
  # Get i'th item in dataset
  # ----------------------------
  def __getitem__(self, idx):
    """Return the ``idx``-th item (by position).

    Raises:
      IndexError: If ``idx`` is negative or not smaller than ``len(self)``.
    """
    if idx < 0 or idx >= len(self.df):
      # Handle cases where the index is out of range
      raise IndexError(f"Index {idx} is out of range for the DataFrame")

    # Positional lookup: label-based .loc[idx] fails as soon as the frame has
    # been filtered, shuffled or otherwise lacks a default 0..n-1 index.
    filename = self.df['filename'].iloc[idx]
    # Full path of the audio file inside the data directory
    full_path = os.path.join(self.data_path, filename)

    # Get the Class ID
    class_id = self.df['target'].iloc[idx]

    if not self.return_spectrogram:
      return full_path, class_id

    return self._load_spectrogram(full_path), class_id

  def _load_spectrogram(self, full_path):
    """Load one file and return its augmented mel spectrogram."""
    aud = AudioUtil.open(full_path)

    # Some sounds have a higher sample rate, or fewer channels compared to the
    # majority. So make all sounds have the same number of channels and same
    # sample rate. Unless the sample rate is the same, the pad_trunc will still
    # result in arrays of different lengths, even though the sound duration is
    # the same.
    reaud = AudioUtil.resample(aud, self.sr)
    rechan = AudioUtil.rechannel(reaud, self.channel)

    # pad_trunc works in samples, so convert the duration first.
    n_samples = int(self.sr * self.duration / 1000)
    dur_aud = AudioUtil.pad_trunc(rechan, n_samples)
    shift_aud = AudioUtil.time_shift(dur_aud, self.shift_pct)
    sgram = AudioUtil.spectrogram(shift_aud, n_mels=64, n_fft=1024, hop_len=None)
    return AudioUtil.spectro_augment(sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)






In [17]:
os.getcwd()

'c:\\Users\\matti\\Documents\\GitHub\\HDA_project'

In [None]:
# NOTE(review): the metadata CSV is read from under 'ESC-50/ESC-50-master/',
# so the audio may live in 'ESC-50/ESC-50-master/audio' -- confirm this path.
data_dir = 'ESC-50/audio'

myds = SoundDS(df, data_dir)

# Random split of 80:20 between training and validation
num_items = len(myds)
num_train = round(num_items * 0.8)
num_val = num_items - num_train
# A seeded generator makes the split identical on every run of the notebook.
split_generator = torch.Generator().manual_seed(20150420)
train_ds, val_ds = random_split(myds, [num_train, num_val], generator=split_generator)

# creating train test split manually for further usage
# Each item is fetched exactly once; indexing [0] and [1] separately called
# the dataset's __getitem__ twice per sample.
train_items = [train_ds[i] for i in range(len(train_ds))]
train_X = [features for features, _ in train_items]
train_y = [label for _, label in train_items]

val_items = [val_ds[i] for i in range(len(val_ds))]
test_X = [features for features, _ in val_items]
test_y = [label for _, label in val_items]
