In [91]:
import pandas as pd
import librosa 
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from FeatureExtractor import FeatureExtractor

In [92]:
%matplotlib inline

In [93]:
# Frame length in samples; passed as frame_size / n_fft to the feature extractors.
FRAME_SIZE = 512
# Number of samples between the starts of consecutive frames.
HOP_LENGTH = 256
# Number of MFCCs requested from librosa (passed as n_mfcc).
N_MFCC = 20

In [94]:
# Load the UrbanSound8K metadata table; its slice_file_name, fold and class
# columns are what the audio loader uses to locate and label each clip.
sound_db = pd.read_csv('./db/UrbanSound8K.csv')

In [95]:
# Check column dtypes and non-null counts of the metadata table.
sound_db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8732 entries, 0 to 8731
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   slice_file_name  8732 non-null   object 
 1   fsID             8732 non-null   int64  
 2   start            8732 non-null   float64
 3   end              8732 non-null   float64
 4   salience         8732 non-null   int64  
 5   fold             8732 non-null   int64  
 6   classID          8732 non-null   int64  
 7   class            8732 non-null   object 
dtypes: float64(2), int64(4), object(2)
memory usage: 545.9+ KB


In [96]:
# Summary statistics for the numeric metadata columns.
sound_db.describe()

Unnamed: 0,fsID,start,end,salience,fold,classID
count,8732.0,8732.0,8732.0,8732.0,8732.0,8732.0
mean,116033.493816,38.645409,42.253312,1.347,5.385937,4.592877
std,57991.017218,74.292126,74.369669,0.476043,2.84682,2.894544
min,344.0,0.0,0.105962,1.0,1.0,0.0
25%,69942.25,3.0,6.839398,1.0,3.0,2.0
50%,118279.0,10.376492,14.0,1.0,5.0,4.0
75%,166942.0,35.131372,38.866979,2.0,8.0,7.0
max,209992.0,600.125356,604.125356,2.0,10.0,9.0


In [97]:
# Peek at the first metadata rows.
sound_db.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


In [98]:
class FeatureExtractor:
    """Stateless helpers that compute audio features with NumPy and librosa.

    Every method is a ``@staticmethod``; the class is only a namespace.

    NOTE(review): this cell redefines ``FeatureExtractor`` and therefore
    shadows the class of the same name imported from the ``FeatureExtractor``
    module in the imports cell -- confirm which definition is meant to be used.
    """

    @staticmethod
    def amplitude_envelope(signal: np.ndarray, frame_size: int, hop_length: int) -> np.ndarray:
        """Return the maximum sample value of each frame of ``signal``.

        Frames start every ``hop_length`` samples and span up to ``frame_size``
        samples; frames that run past the end of the signal are simply shorter.

        Args:
            signal: 1-D sequence of samples.
            frame_size: Frame length in samples; must be positive.
            hop_length: Step between frame starts in samples; must be positive.

        Returns:
            1-D array with one maximum per frame (empty for an empty signal).

        Raises:
            ValueError: If ``frame_size`` or ``hop_length`` is not positive.
        """
        if frame_size <= 0:
            raise ValueError(f"frame_size must be positive, got {frame_size}")
        if hop_length <= 0:
            raise ValueError(f"hop_length must be positive, got {hop_length}")
        # np.max runs in C and propagates NaN, unlike the builtin max() whose
        # result on NaN-containing data depends on element order.
        return np.array([
            np.max(signal[start:start + frame_size])
            for start in range(0, len(signal), hop_length)
        ])

    @staticmethod
    def root_mean_square(signal: np.ndarray, frame_size: int, hop_length: int) -> np.ndarray:
        """Return ``librosa.feature.rms`` of ``signal`` for the given framing."""
        return librosa.feature.rms(y=signal, frame_length=frame_size, hop_length=hop_length)

    @staticmethod
    def zero_crossing_rate(signal: np.ndarray, frame_size: int, hop_length: int) -> np.ndarray:
        """Return ``librosa.feature.zero_crossing_rate`` of ``signal`` for the given framing."""
        return librosa.feature.zero_crossing_rate(y=signal, frame_length=frame_size, hop_length=hop_length)

    @staticmethod
    def short_time_fourier_transform(signal: np.ndarray, hop_length: int,
                                     n_fft: "int | None" = None) -> np.ndarray:
        """Return ``librosa.stft`` of ``signal``.

        Args:
            signal: Audio samples.
            hop_length: Step between frames in samples.
            n_fft: Optional FFT window size. When omitted, librosa's own
                default is used, which is the behavior this method always had.
        """
        extra = {} if n_fft is None else {"n_fft": n_fft}
        return librosa.stft(y=signal, hop_length=hop_length, **extra)

    @staticmethod
    def mfcc(signal: np.ndarray, n_mfcc: int, sr: int,
             n_fft: "int | None" = None, hop_length: "int | None" = None) -> np.ndarray:
        """Return ``librosa.feature.mfcc`` of ``signal``.

        Args:
            signal: Audio samples.
            n_mfcc: Number of coefficients to request.
            sr: Sampling rate of ``signal``.
            n_fft: Optional FFT window size forwarded to librosa.
            hop_length: Optional hop length forwarded to librosa.

        ``n_fft`` and ``hop_length`` are only forwarded when given, so callers
        that omit them keep getting librosa's default framing.
        """
        framing = {
            key: value
            for key, value in (("n_fft", n_fft), ("hop_length", hop_length))
            if value is not None
        }
        return librosa.feature.mfcc(y=signal, n_mfcc=n_mfcc, sr=sr, **framing)

    @staticmethod
    def spectral_centroids(signal: np.ndarray, sr: int, n_fft: int, hop_length: int) -> np.ndarray:
        """Return ``librosa.feature.spectral_centroid`` of ``signal``."""
        return librosa.feature.spectral_centroid(y=signal, sr=sr, n_fft=n_fft, hop_length=hop_length)

    @staticmethod
    def bandwidth(signal: np.ndarray, sr: int, n_fft: int, hop_length: int) -> np.ndarray:
        """Return ``librosa.feature.spectral_bandwidth`` of ``signal``."""
        return librosa.feature.spectral_bandwidth(y=signal, sr=sr, n_fft=n_fft, hop_length=hop_length)

    @staticmethod
    def normalize_feature(feature: np.ndarray) -> np.floating:
        """Collapse a feature array to a single scalar: ``sqrt(sum(x ** 2))``.

        Despite the name, nothing is rescaled -- the result is the L2 norm of
        all elements, so it never decreases as more elements are added.
        The NumPy scalar is returned as-is (no cast), keeping the input's
        floating dtype.

        Args:
            feature: Array or plain sequence of numbers.
        """
        # asarray lets plain lists/tuples work; an ndarray passes through unchanged.
        values = np.asarray(feature)
        return np.sqrt(np.sum(values ** 2))

    @staticmethod
    def normalize_matrix(matrix: np.ndarray) -> np.ndarray:
        """Return a flattened 1-D copy of ``matrix`` (no scaling is applied)."""
        return np.asarray(matrix).flatten()

In [99]:
class AudioDataLoader:
    """Turns a metadata table of audio clips into one scalar-feature record per clip."""

    def load_from_df(self, df: pd.DataFrame, db_dir: str = './db'):
        """Load every clip listed in ``df`` and compute its feature record.

        Args:
            df: Metadata frame with ``slice_file_name``, ``fold`` and ``class``
                columns. Each clip is read from
                ``{db_dir}/fold{fold}/{slice_file_name}``.
            db_dir: Root directory that contains the ``fold*`` sub-directories.
                Defaults to ``'./db'``, the location previously hard-coded.

        Returns:
            A list of dicts, one per row of ``df`` in row order, with keys
            ``class``, ``fold``, ``amplitude_envelope``, ``root_mean_square``,
            ``zero_crossing_rate``, ``mfcc``, ``spectral_centroid`` and
            ``bandwidth``. An empty frame yields an empty list.
        """
        if df.empty:
            return []

        entries = []
        # Walk the three needed columns in lockstep instead of building a
        # temporary Series for every row with df.iloc[i][[...]].
        rows = zip(df["slice_file_name"], df["fold"], df["class"])
        for file_name, fold, class_ in rows:
            y, sr = librosa.load(f'{db_dir}/fold{fold}/{file_name}')
            entries.append({'class': class_, 'fold': fold, **self._extract_features(y, sr)})

        return entries

    @staticmethod
    def _extract_features(y, sr):
        """Compute the six scalar features for one loaded signal ``y`` at rate ``sr``."""
        to_scalar = FeatureExtractor.normalize_feature
        flatten = FeatureExtractor.normalize_matrix
        return {
            'amplitude_envelope': to_scalar(
                FeatureExtractor.amplitude_envelope(y, frame_size=FRAME_SIZE, hop_length=HOP_LENGTH)),
            'root_mean_square': to_scalar(
                FeatureExtractor.root_mean_square(y, frame_size=FRAME_SIZE, hop_length=HOP_LENGTH)),
            'zero_crossing_rate': to_scalar(
                FeatureExtractor.zero_crossing_rate(y, frame_size=FRAME_SIZE, hop_length=HOP_LENGTH)),
            # NOTE(review): MFCCs are computed without FRAME_SIZE / HOP_LENGTH,
            # so they use librosa's own framing defaults while every other
            # feature here uses the notebook's configuration -- confirm intended.
            'mfcc': to_scalar(flatten(
                FeatureExtractor.mfcc(y, n_mfcc=N_MFCC, sr=sr))),
            'spectral_centroid': to_scalar(flatten(
                FeatureExtractor.spectral_centroids(y, sr=sr, n_fft=FRAME_SIZE, hop_length=HOP_LENGTH))),
            'bandwidth': to_scalar(flatten(
                FeatureExtractor.bandwidth(y, sr=sr, n_fft=FRAME_SIZE, hop_length=HOP_LENGTH))),
        }

In [100]:
data_loader = AudioDataLoader()
new_data_list = data_loader.load_from_df(sound_db)
# Preview only the first few records; echoing the whole list floods the cell output.
new_data_list[:5]

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


[{'class': 'dog_bark',
  'fold': 5,
  'amplitude_envelope': 2.5011067,
  'root_mean_square': 0.85973305,
  'zero_crossing_rate': 0.701484631675251,
  'mfcc': 1114.7451,
  'spectral_centroid': 11002.153867171372,
  'bandwidth': 9055.166226660098},
 {'class': 'children_playing',
  'fold': 5,
  'amplitude_envelope': 0.20763576,
  'root_mean_square': 0.07694757,
  'zero_crossing_rate': 2.7154218977469546,
  'mfcc': 5759.2744,
  'spectral_centroid': 38984.71506911663,
  'bandwidth': 35987.44993860048},
 {'class': 'children_playing',
  'fold': 5,
  'amplitude_envelope': 0.14198396,
  'root_mean_square': 0.056997657,
  'zero_crossing_rate': 1.999477318175537,
  'mfcc': 6220.243,
  'spectral_centroid': 33621.214468117425,
  'bandwidth': 35413.73135881517},
 {'class': 'children_playing',
  'fold': 5,
  'amplitude_envelope': 0.2545674,
  'root_mean_square': 0.09267064,
  'zero_crossing_rate': 2.646289055292365,
  'mfcc': 5578.505,
  'spectral_centroid': 41672.17439089133,
  'bandwidth': 41306.24

In [101]:
# Build the feature table in one shot from the list of per-clip dicts
# (one column per dict key).
features_data_frame = pd.DataFrame(new_data_list)
features_data_frame

Unnamed: 0,class,fold,amplitude_envelope,root_mean_square,zero_crossing_rate,mfcc,spectral_centroid,bandwidth
0,dog_bark,5,2.501107,0.859733,0.701485,1114.745117,11002.153867,9055.166227
1,children_playing,5,0.207636,0.076948,2.715422,5759.274414,38984.715069,35987.449939
2,children_playing,5,0.141984,0.056998,1.999477,6220.243164,33621.214468,35413.731359
3,children_playing,5,0.254567,0.092671,2.646289,5578.504883,41672.174391,41306.245923
4,children_playing,5,0.154707,0.060585,2.300883,6048.385742,36180.503459,35524.610921
...,...,...,...,...,...,...,...,...
8727,car_horn,7,0.242281,0.089146,2.199027,5477.838379,32959.303459,33843.998862
8728,car_horn,7,0.654602,0.240770,3.501109,3499.038086,38526.752163,24415.915710
8729,car_horn,7,0.835565,0.258022,2.360673,4000.881104,37881.330320,35187.318023
8730,car_horn,7,0.381664,0.137357,2.407475,3802.002930,29847.130883,27313.371371


In [102]:
# Peek at the first feature rows.
features_data_frame.head()

Unnamed: 0,class,fold,amplitude_envelope,root_mean_square,zero_crossing_rate,mfcc,spectral_centroid,bandwidth
0,dog_bark,5,2.501107,0.859733,0.701485,1114.745117,11002.153867,9055.166227
1,children_playing,5,0.207636,0.076948,2.715422,5759.274414,38984.715069,35987.449939
2,children_playing,5,0.141984,0.056998,1.999477,6220.243164,33621.214468,35413.731359
3,children_playing,5,0.254567,0.092671,2.646289,5578.504883,41672.174391,41306.245923
4,children_playing,5,0.154707,0.060585,2.300883,6048.385742,36180.503459,35524.610921


In [103]:
# Check feature column dtypes and confirm there are no missing values.
features_data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8732 entries, 0 to 8731
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   class               8732 non-null   object 
 1   fold                8732 non-null   int64  
 2   amplitude_envelope  8732 non-null   float32
 3   root_mean_square    8732 non-null   float32
 4   zero_crossing_rate  8732 non-null   float64
 5   mfcc                8732 non-null   float32
 6   spectral_centroid   8732 non-null   float64
 7   bandwidth           8732 non-null   float64
dtypes: float32(3), float64(3), int64(1), object(1)
memory usage: 443.5+ KB


In [104]:
# Summary statistics for the numeric feature columns.
features_data_frame.describe()

Unnamed: 0,fold,amplitude_envelope,root_mean_square,zero_crossing_rate,mfcc,spectral_centroid,bandwidth
count,8732.0,8732.0,8732.0,8732.0,8732.0,8732.0,8732.0
mean,5.385937,3.245677,1.292797,2.214297,3361.473633,35841.925843,35523.13068
std,2.84682,2.799315,1.181825,1.717464,1596.536743,18639.323276,11848.752326
min,1.0,0.008834,0.003374,0.043934,220.031311,3341.335126,3049.713004
25%,3.0,1.179502,0.470023,1.100895,2161.331238,22945.143451,27589.114024
50%,5.0,2.509311,0.97746,1.755627,3131.631592,32157.266233,35257.050425
75%,8.0,4.473047,1.726893,2.63866,4408.380737,43896.092834,43844.191998
max,10.0,18.870834,10.808523,14.097483,10263.19043,123871.154911,79524.233972


In [105]:
# Persist the feature table to the working directory; index=False omits the row index column.
features_data_frame.to_csv('features_data.csv', index=False)