In [8]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

import librosa

In [9]:
metadata = "../UrbanSound8K/metadata/UrbanSound8K.csv"
df = pd.read_csv(metadata)

labels = df['class'].unique()    # obtaining the class labels
print(labels)

['dog_bark' 'children_playing' 'car_horn' 'air_conditioner' 'street_music'
 'gun_shot' 'siren' 'engine_idling' 'jackhammer' 'drilling']


## Reflective padding: our alternative to zero padding 

Reflective padding:

    Reflecting the signal at its boundaries instead of zero padding can help preserve the continuity of the signal, preventing alterations of the signal's characteristics, something that might occur with zero padding.
    It can also help reducing artifacts at the edges of the signal and providing a smooth transition from the original signal to the padded region.
    We believe this approach is effective with sound data, due to its symmetric nature.

In [15]:
def reflective_padding(signal, target_duration, target_rate):
    target_duration = target_duration*target_rate
    current_duration = len(signal)
    
    # Calculate the required padding on each side
    padding_needed = target_duration - current_duration
    left_padding = padding_needed // 2
    right_padding = padding_needed - left_padding
    
    # Reflective padding on both sides
    padded_signal = np.pad(signal, (left_padding, right_padding), 'reflect')
    
    return padded_signal

## Loading and resampling all the data

Parameters definition

In [18]:
signal_time = 4    # each signal will have 4 seconds of duration
target_rate = 1000    # resampling frequence

# MFCC parameters
n_mfcc=40
hop_length=round(target_rate*0.0125)
win_length=round(target_rate*0.023)
n_fft=2**14
mfcc_time_size = 4*target_rate//hop_length+1

dataset = []
dataset_mfcc = []

Obtaining all the resampled data

In [25]:
# for i in range(1,11):
i=5
_wav_dir_="../UrbanSound8K/audio/fold" + str(i) + '/'
files = librosa.util.find_files(_wav_dir_)
fold=np.zeros(shape=[len(files),4*target_rate])
fold_mfcc=np.zeros(shape=[len(files),n_mfcc,mfcc_time_size])
idx = 0
for f in files:
    signal, rate = librosa.load(f, sr=1000)
    # print(len(new_signal))
    new_signal = librosa.resample(signal, orig_sr=rate, target_sr=target_rate)
    # print(len(new_signal))
    print(f)
    new_signal=signal
    if len(new_signal)<4*target_rate:
        new_signal = reflective_padding(new_signal, 4, target_rate)
    fold[idx] = new_signal
    sig_mfcc =librosa.feature.mfcc(y=new_signal,sr=target_rate,n_fft=n_fft,hop_length=hop_length,win_length=win_length,n_mfcc=n_mfcc)
    fold_mfcc[idx] = sig_mfcc
    idx += 1
dataset.append(fold)
dataset_mfcc.append(fold_mfcc)
print(f"fold {i} done")

4000
318
c:\Users\gapmd\workspace\GitHub projects\AC II - project\UrbanSound8K\audio\fold5\100032-3-0-0.wav
4000
4000
c:\Users\gapmd\workspace\GitHub projects\AC II - project\UrbanSound8K\audio\fold5\100263-2-0-117.wav
4000
4000
c:\Users\gapmd\workspace\GitHub projects\AC II - project\UrbanSound8K\audio\fold5\100263-2-0-121.wav
4000
4000
c:\Users\gapmd\workspace\GitHub projects\AC II - project\UrbanSound8K\audio\fold5\100263-2-0-126.wav
4000
4000
c:\Users\gapmd\workspace\GitHub projects\AC II - project\UrbanSound8K\audio\fold5\100263-2-0-137.wav
4000
4000
c:\Users\gapmd\workspace\GitHub projects\AC II - project\UrbanSound8K\audio\fold5\100263-2-0-143.wav
4000
4000
c:\Users\gapmd\workspace\GitHub projects\AC II - project\UrbanSound8K\audio\fold5\100263-2-0-161.wav
4000
4000
c:\Users\gapmd\workspace\GitHub projects\AC II - project\UrbanSound8K\audio\fold5\100263-2-0-3.wav
4000
4000
c:\Users\gapmd\workspace\GitHub projects\AC II - project\UrbanSound8K\audio\fold5\100263-2-0-36.wav
4000
40

ValueError: could not broadcast input array from shape (4008,) into shape (4000,)