In [1]:
import librosa
import soundfile as sf
import numpy as np
import os
import shutil
import glob
from sklearn.model_selection import StratifiedKFold

In [2]:
# Preprocessing settings: adjust the sample rate and the clip length to suit your needs.
sr = 20000        # sample rate handed to librosa.load / soundfile.write
samples = 5 * sr  # samples per clip, i.e. 5 seconds of audio at `sr`

In [3]:
# Resolve the dataset locations relative to the directory two levels above
# the current working directory.
mainDir = os.path.dirname(os.path.dirname(os.getcwd()))
dataset_path = os.path.join(mainDir, 'datasets', 'urbansound8k')
audio_path = os.path.join(dataset_path, 'UrbanSound8K', 'audio')

# Scratch directory that receives the resampled, fixed-length clips.
master = os.path.join(dataset_path, 'master')

# Fail early with a clear message: without this check a wrong path makes the
# later glob calls match nothing, and the error only surfaces cells later.
if not os.path.isdir(audio_path):
    raise FileNotFoundError(
        'UrbanSound8K audio directory not found: {}'.format(audio_path))

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race.
os.makedirs(master, exist_ok=True)

In [4]:
# Convert every .wav file found in fold1..fold10 into a mono clip of exactly
# `samples` samples at sample rate `sr`, and write all of them into `master`.
for fold in range(1, 11):
    print('Processing fold {}'.format(fold))

    fold_dir = os.path.join(audio_path, 'fold{}'.format(fold))
    for src_file in sorted(glob.glob(os.path.join(fold_dir, '*.wav'))):
        # Derive the destination from the file name instead of doing a string
        # substitution on the full source path.
        dst_file = os.path.join(master, os.path.basename(src_file))

        # librosa.load returns (waveform, sample_rate); the rate equals `sr`.
        audio_data, _ = librosa.load(src_file, sr=sr, mono=True)

        if audio_data.size == 0:
            # A zero-length clip has nothing to repeat; the previous
            # `samples // length` computation raised ZeroDivisionError here.
            print('  Skipping empty clip: {}'.format(src_file))
            continue

        # np.resize fills the target length with repeated copies of the clip
        # and cuts off the excess, so short clips are looped and long clips
        # are truncated to exactly `samples` samples.
        audio_data = np.resize(audio_data, samples)

        sf.write(dst_file, audio_data, sr)
        

Processing fold 1
Processing fold 2
Processing fold 3
Processing fold 4
Processing fold 5
Processing fold 6
Processing fold 7
Processing fold 8
Processing fold 9
Processing fold 10


In [9]:
# Gather the processed clips from `master` in sorted filename order.
processed_files = sorted(glob.glob(os.path.join(master, '*.wav')))

# The second dash-separated field of each file name is parsed as the integer label.
y = [int(os.path.basename(path).split('-')[1]) for path in processed_files]

# librosa.load returns (waveform, sample_rate); only the waveform is kept.
X = [librosa.load(path, sr=sr, mono=True)[0] for path in processed_files]

In [10]:
# Turn the collected Python lists into NumPy arrays.
X, y = np.array(X), np.array(y)

# Both names are arrays at this point, so their shapes can be read directly.
print('X shape: {}'.format(X.shape))
print('y shape: {}'.format(y.shape))

X shape: (8732, 100000)
y shape: (8732,)


In [11]:
# Re-split the clips into 5 stratified, shuffled folds with a fixed seed.
# NOTE(review): the UrbanSound8K documentation recommends keeping its predefined
# 10 folds rather than reshuffling -- confirm this re-split is intended.
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Maps 'fold1'..'fold5' to that fold's held-out sounds and labels.
urbanSound8k = {}

for fold, (train_index, test_index) in enumerate(stratified_kfold.split(X, y)):
    fold_key = 'fold{}'.format(fold + 1)

    # Materialise both partitions as plain lists of per-clip items.
    X_train = [X[i] for i in train_index]
    y_train = [y[i] for i in train_index]
    X_test = [X[i] for i in test_index]
    y_test = [y[i] for i in test_index]

    # Report the partition sizes and the per-class count of the held-out part.
    print(f"Fold {fold + 1}:")
    print(f"  Training samples: {len(X_train)}")
    print(f"  Testing samples: {len(X_test)}")

    print('Testing bin count -  {}'.format(np.bincount(y_test)))

    # Only the held-out part is stored under each fold key; the training
    # lists above are used solely for the size report.
    urbanSound8k[fold_key] = {'sounds': X_test, 'labels': y_test}

# Each fold dict becomes one named entry of the .npz archive; the file name
# carries the sample rate in kHz.
# NOTE(review): the entries are dicts, so loading presumably needs
# allow_pickle=True -- confirm against the consumer of this file.
save_path = os.path.join(dataset_path, 'wav{}.npz'.format(sr//1000))
np.savez(save_path, **urbanSound8k)
print('Dataset created')

Fold 1:
  Training samples: 6985
  Testing samples: 1747
Testing bin count -  [200  86 200 200 200 200  75 200 186 200]
Fold 2:
  Training samples: 6985
  Testing samples: 1747
Testing bin count -  [200  86 200 200 200 200  75 200 186 200]
Fold 3:
  Training samples: 6986
  Testing samples: 1746
Testing bin count -  [200  86 200 200 200 200  75 200 185 200]
Fold 4:
  Training samples: 6986
  Testing samples: 1746
Testing bin count -  [200  86 200 200 200 200  74 200 186 200]
Fold 5:
  Training samples: 6986
  Testing samples: 1746
Testing bin count -  [200  85 200 200 200 200  75 200 186 200]
Dataset created


In [12]:
# Remove the scratch folder holding the intermediate clips.
# The existence check keeps this cell re-runnable: a bare rmtree raises
# FileNotFoundError when the folder has already been deleted.
if os.path.isdir(master):
    shutil.rmtree(master)  # delete the master folder