In [1]:
# import numpy for math calculations
import numpy as np

# import pandas for data (csv) manipulation
import pandas as pd

# import gc to collect garbage
import gc


# import matplotlib for plotting
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('fivethirtyeight')
%matplotlib inline

# import seaborn for more plotting options(built on top of matplotlib)
import seaborn as sns

# import librosa for analysing audio signals : visualize audio, display the spectogram
import librosa
import soundfile as sf

# import librosa for analysing audio signals : visualize audio, display the spectogram
import librosa.display


# import wav for reading and writing wav files
import wave

# import IPython.dispaly for playing audio in Jupter notebook
import IPython.display as ipd

# import os for system operations
import os

# import random for get random values/choices
import random

# import sklearn for machine learning modelling and preprocessing
import sklearn

# import tensorflow for deep learning modelling
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, BatchNormalization, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import load_model
from tensorflow.keras import backend as K

# importing from sklearn the evaluation metrics for classification
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix

# importing from sklearn model selection
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold, learning_curve


# import tqdm to show a smart progress meter
from tqdm.notebook import trange,tqdm

# import warnings to hide the unnessairy warniings
import warnings
warnings.filterwarnings('ignore')

In [2]:
import random
SEED = 42
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(SEED)

In [11]:
data = "audio"
dataset = pd.read_csv("ESC-50-master\meta\esc50.csv")
all_files = []
for path, subdirs, files in os.walk(data):
    for name in files:
        all_files.append(os.path.join(path, name))

# Model Modelling 👩🏿

## Data preparation

### Data for RawNet

In [None]:
import os
import numpy as np
import librosa
from scipy.io import wavfile
from pydub import AudioSegment

In [17]:
# Data Augmentation and Reshape
def handle_and_reshape(input_data, target_length=20480, window_size=1024, hop_size=512):
    # Split an audio signal into non-silent intervals
    segments = librosa.effects.split(input_data, top_db=200000000)

    reshaped_data = []
    for (start, end) in segments:
        segment = input_data[start:end]

        if len(segment) < target_length:
            # Pad the segment if it's shorter than the target length
            padding = target_length - len(segment)
            segment = np.pad(segment, (0, padding))
        elif len(segment) > target_length:
            # Truncate the segment if it's longer than the target length
            segment = segment[:target_length]

        # Calculate the number of hops based on the window and hop size
        num_hops = (len(segment) - window_size) // (hop_size // 2) + 1

        for i in range(num_hops):
            hop_start = int(i * hop_size // 2)
            hop_end = int(hop_start + window_size)

            # Ensure each segment has the same length
            reshaped_data.append(np.expand_dims(segment[hop_start:hop_end], axis=-1))


    return np.array(reshaped_data)

In [16]:
# Data Augmentation and Reshape
def augment_mel(input_data, target_length=20480, window_size=1024, hop_size=512):
    audio = input_data
    augmented_audio = time_stretch(audio, factor=0.81)
    augmented_audio = pitch_shift(audio, semitones=-2)
    augmented_audio = white_noise(audio, noise_level=0.005)
    augmented_audio = time_warp(audio, warping_factor=0.3)

    segments = librosa.effects.split(augmented_audio, top_db=200000000)
    augmented_data = []
    for (start, end) in segments:
        segment = augmented_audio[start:end]

        if len(segment) < target_length:
            # Pad the segment if it's shorter than the target length
            padding = target_length - len(segment)
            segment = np.pad(segment, (0, padding))
        elif len(segment) > target_length:
            # Truncate the segment if it's longer than the target length
            segment = segment[:target_length]

        # Calculate the number of hops based on the window and hop size
        num_hops = (len(segment) - window_size) // (hop_size // 2) + 1

        for i in range(num_hops):
            hop_start = int(i * hop_size // 2)
            hop_end = int(hop_start + window_size)

            # Ensure each segment has the same length
            #augmented_data.append(np.expand_dims(segment[hop_start:hop_end], axis=-1))


    return np.array(segment)

In [18]:
def time_stretch(audio, factor):
    # Convert NumPy array to AudioSegment
    audio_segment = AudioSegment(audio.tobytes(), frame_rate=22050, sample_width=audio.dtype.itemsize, channels=1)
    # Apply time stretching
    stretched_audio = audio_segment.speedup(playback_speed=factor)
    # Convert back to NumPy array
    stretched_array = np.array(stretched_audio.get_array_of_samples())
    return stretched_array

def pitch_shift(audio, semitones):
    # Convert NumPy array to AudioSegment
    audio_segment = AudioSegment(audio.tobytes(), frame_rate=22050, sample_width=audio.dtype.itemsize, channels=1)
    # Apply pitch shifting
    shifted_audio = audio_segment._spawn(audio_segment.raw_data, overrides={
        "frame_rate": int(audio_segment.frame_rate * (2 ** (semitones / 12.0)))
    })
    # Convert back to NumPy array
    shifted_array = np.array(shifted_audio.get_array_of_samples())
    return shifted_array

def white_noise(audio, noise_level=0.005):
    # Add white noise
    noise = np.random.normal(0, noise_level, len(audio))
    audio_with_noise = audio + noise
    # Clip values to stay within the valid range for int16 audio
    audio_with_noise = np.clip(audio_with_noise, -32768, 32767).astype(np.int16)
    return audio_with_noise

def time_warp(audio, warping_factor):
    num_frames = len(audio)
    frame_indices = np.arange(0, num_frames)
    warped_indices = np.int16(frame_indices + warping_factor * np.sin(np.linspace(0, 2*np.pi, num_frames)))

    # Ensure that the warped indices are within the valid range
    warped_indices = np.clip(warped_indices, 0, num_frames - 1)

    # Apply time warping
    warped_audio = audio[warped_indices]

    return warped_audio


In [19]:
def extract_log_mel_features(audio, sr):
    # Load audio file
    y = audio

    # Extract log-scaled mel spectrogram features
    mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=22050, n_mels=60)
    log_mel_spectrogram = librosa.power_to_db(mel_spectrogram)

    # Divide the sound segments into frames with overlap
    frames = librosa.util.frame(log_mel_spectrogram, frame_length=23, hop_length=11)

    # Calculate the first temporal derivative
    delta_log_mel = librosa.feature.delta(log_mel_spectrogram)

    # Combine static and temporal derivative features
    feature_map = np.stack([log_mel_spectrogram, delta_log_mel], axis=-1)

    return feature_map

In [21]:
X = []
y = []
for i in tqdm(range(len(dataset))):
    audio_path = os.path.join('ESC-50-master/audio', dataset['filename'][i])
    audio,_ = librosa.load(audio_path, sr=22050, mono=True)
    #segments = augment_mel(audio)
    spettrogramma = extract_log_mel_features(audio, _)#logmel, delta = process_audio_file(segments)
    X.append(spettrogramma)
    y.append(dataset['target'][i])

X = np.array(X)
y = np.array(y)

  0%|          | 0/2000 [00:00<?, ?it/s]

In [22]:
# Save X and y to CSV
df = pd.DataFrame(data={'label': y, 'feature': list(X)})
df.to_csv('data_logmel.csv', index=False)
X_train_mel, X_test_mel, y_train_mel, y_test_mel = train_test_split(X, to_categorical(y, num_classes=50), test_size=0.2, random_state=42)
print(X_train_mel.shape, X_test_mel.shape, y_train_mel.shape, y_test_mel.shape)

(1600, 60, 216, 2) (400, 60, 216, 2) (1600, 50) (400, 50)


## Model Utilities

In [23]:
import math
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.callbacks import ModelCheckpoint

stop = tf.keras.callbacks.EarlyStopping(monitor='loss',
                                            restore_best_weights = True,
                                            patience=10,
                                            verbose=1)

checkpoint_path = "training_1/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves the model's weights
cp_callback = ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)

# Define the dynamic learning rate schedule
def lr_schedule(epoch, lr):
    min_learning_rate = 0.0001
    max_learning_rate = 0.001
    decay_speed = 5.0
    return min_learning_rate + (max_learning_rate - min_learning_rate) * math.exp(-epoch / decay_speed)

initial_learning_rate = 0.003
optimizer = tf.keras.optimizers.Adam(learning_rate=initial_learning_rate)

## RawNet Model

In [25]:
# let's build rawnet CNN in tensorflow
# import libraries
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, BatchNormalization, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import load_model
from tensorflow.keras import backend as K

# reset the memory of tensorflow
tf.keras.backend.clear_session()

df = pd.read_csv('ESC-50-master/meta/esc50.csv')
audio_directory = 'ESC-50-master/audio'
labels = df.set_index('filename')['target'].to_dict()
num_classes = len(set(labels.values()))
input_shape = (X_train_mel.shape[1], X_train_mel.shape[2], 1)

rawnet_model = Sequential([
    tf.keras.layers.Conv2D(24, (6, 6), padding='same',strides = (1,1), input_shape=input_shape, activation='relu'),
    tf.keras.layers.Conv2D(24, (6, 6), padding='same',strides = (1,1), input_shape=input_shape, activation='relu'),
    tf.keras.layers.Conv2D(48, (5, 5), padding='same',strides = (2,2), input_shape=input_shape, activation= 'relu'),
    tf.keras.layers.Conv2D(48, (5, 5), padding='same',strides = (2,2), input_shape=input_shape, activation='relu'),
    tf.keras.layers.Conv2D(64, (4, 4), padding='same',strides = (2,2), input_shape=input_shape, activation='relu'),

    # batch normalization layer to improve the learning process
    tf.keras.layers.BatchNormalization(),
    # fully connected layer with 200 units
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(200, activation='relu'),
    # applying dropout to prevent overfitting as described in the paper
    tf.keras.layers.Dropout(0.5),
    # output layer with 50 units
    tf.keras.layers.Dense(50, activation='softmax')
])

rawnet_model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 60, 216, 24)       888       
                                                                 
 conv2d_1 (Conv2D)           (None, 60, 216, 24)       20760     
                                                                 
 conv2d_2 (Conv2D)           (None, 30, 108, 48)       28848     
                                                                 
 conv2d_3 (Conv2D)           (None, 15, 54, 48)        57648     
                                                                 
 conv2d_4 (Conv2D)           (None, 8, 27, 64)         49216     
                                                                 
 batch_normalization (BatchN  (None, 8, 27, 64)        256       
 ormalization)                                                   
                                                        