In [None]:
# Shell escape: installs the system package `libav-tools` non-interactively (-y).
# NOTE(review): presumably needed as an audio decoding backend; the package may not
# exist on recent Ubuntu images (ffmpeg is the usual replacement) — confirm.
!apt-get install libav-tools -y

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Same install command as the first cell of the notebook; running it again is
# redundant once the package is present.
!apt-get install libav-tools -y

In [None]:
# Math libraries
import math
import statistics
import scipy.stats

## ML Libraries
from sklearn import feature_extraction, linear_model, model_selection, preprocessing, metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import f1_score, roc_curve, auc, accuracy_score, confusion_matrix, classification_report, mean_squared_error, roc_auc_score , recall_score , precision_score
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA

## Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap
from subprocess import check_output
from tqdm import tqdm

#Keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.metrics import AUC
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.callbacks import EarlyStopping
from keras import backend as K

#Audio libraries
import IPython.display as ipd
import librosa
from librosa import display
from scipy.io import wavfile as wav

#Torch libraries
import torch
import torchvision
import torchvision.transforms as tt
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch import Tensor
import torch.optim as optim
from torch.optim import lr_scheduler

#Others
import time
import logging
import warnings
import gc

# Silence log records at WARNING level and below, and all Python warnings.
# NOTE(review): this also hides deprecation warnings from the audio/ML libraries.
logging.disable(logging.WARNING)
warnings.filterwarnings("ignore")
gc.enable()

# Seed both RNGs the notebook relies on: NumPy (noise augmentation, splits) and
# TensorFlow (weight initialisation, dropout, batch shuffling).
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
# Read the ESC-50 metadata CSV into a DataFrame; the columns used later in this
# notebook are 'filename' and 'category'.
data = pd.read_csv('/kaggle/input/environmental-sound-classification-50/esc50.csv')
data

In [None]:
# Second handle on the metadata table; later cells read the metadata through `data2`.
data2=pd.DataFrame(data)
data2.head()

In [None]:
# Last rows of the metadata table.
data2.tail()

In [None]:
# Summary statistics of the numeric metadata columns.
data2.describe()

In [None]:
# Column dtypes and non-null counts.
data2.info()

In [None]:
# (rows, columns) of the metadata table.
data2.shape

In [None]:
# Names of the metadata columns.
data2.columns

In [None]:
# Distinct category names, in order of first appearance in the metadata.
labels = list(data['category'].unique())
labels

In [None]:
#Visualizing how many clips each category has.
# Counting the list of *unique* labels would draw every bar with height 1, so the
# plot is built from the full 'category' column, passed by keyword (`x=`).
plt.figure(figsize=(11, 8))  # size this figure explicitly; sns.set below only affects later ones
sns.countplot(x=data2['category']).set_title("Unique values labels")
plt.xticks(rotation=90)
sns.set(rc={'figure.figsize':(11,8)})

In [None]:
# Number of clips per category, counting missing values as their own bucket.
data2['category'].value_counts(dropna=False)

In [None]:
#Searching for null values
# Per-column null counts are fed to np.where, which returns the positions of the
# columns whose count is non-zero (an empty array means no nulls anywhere).
np.where(pd.isnull(data2).sum())

In [None]:
#Searching for duplicates
# Number of rows that are exact repeats of an earlier row.
data2.duplicated().sum()

In [None]:
# Map every category to the path of its first clip in the metadata table.
files = {}
for label in labels:
    first_clip = data.loc[data['category'] == label, 'filename'].iloc[0]
    files[label] = '../input/environmental-sound-classification-50/audio/audio/{}'.format(first_clip)

In [None]:
# Waveforms of the first ten categories on a 5x2 grid.
fig = plt.figure(figsize=(15,15))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
for i, label in enumerate(labels[:10]):
    ax = fig.add_subplot(5, 2, i+1)
    ax.set_title(label)
    # Keep the waveform in its own name: `data` is the metadata DataFrame and is
    # still needed by later cells, so it must not be overwritten here.
    signal, sample_rate = librosa.load(files[label])
    display.waveshow(signal, sr=sample_rate, color="pink", ax=ax)

In [None]:
# First example clip (titled "Vacuum Cleaner" in the plots below).
s1 = r"../input/environmental-sound-classification-50/audio/audio/1-100210-B-36.wav"
x, sr = librosa.load(s1)
freq = sr  # both names are used by the following cells

In [None]:
# Inline audio player for the first example clip.
sound1=ipd.Audio(s1)
sound1

In [None]:
# Time-domain waveform of the clip currently held in `x`.
plt.figure(figsize=(8,2))
plt.title("Vacuum Cleaner Sound")
librosa.display.waveshow(x,sr=freq, color='pink')

In [None]:
# Spectrogram: STFT magnitude converted to decibels, linear frequency axis.
X = librosa.stft(x)
desibel = librosa.amplitude_to_db(np.abs(X))
spec_fig, spec_ax = plt.subplots(figsize=(10, 4))
spec_img = librosa.display.specshow(desibel, sr=sr, x_axis="time", y_axis="hz", cmap='PiYG', ax=spec_ax)
spec_ax.set_title("Vacuum Cleaner Spectogram")
spec_fig.colorbar(spec_img, ax=spec_ax)

In [None]:
# Split the clip into harmonic and percussive components, then compute a mel
# power spectrogram of each, expressed in dB relative to its own maximum.
data_h, data_p = librosa.effects.hpss(x)
# The audio is passed as `y=`: recent librosa releases accept it by keyword only.
spec_h = librosa.feature.melspectrogram(y=data_h, sr=sr)
spec_p = librosa.feature.melspectrogram(y=data_p, sr=sr)
db_spec_h = librosa.power_to_db(spec_h,ref=np.max)
db_spec_p = librosa.power_to_db(spec_p,ref=np.max)

In [None]:
# Mel spectrogram (dB) of the harmonic component.
librosa.display.specshow(db_spec_h,y_axis='mel', x_axis='s', sr=sr, cmap='PiYG')
plt.title("Vacuum Cleaner Harmonic Mel Spectogram")
plt.colorbar()

In [None]:
# Mel spectrogram (dB) of the percussive component.
librosa.display.specshow(db_spec_p,y_axis='mel', x_axis='s', sr=sr, cmap='PiYG')
plt.title("Vacuum Cleaner Percuisive Mel Spectogram")
plt.colorbar()

In [None]:
# MFCC matrix of the clip (coefficients x frames) and its heat map.
# The audio is passed as `y=`: recent librosa releases accept it by keyword only.
mfcc=librosa.feature.mfcc(y=x,sr=sr)
print("shape of mfcc:" ,mfcc.shape)

plt.figure(figsize=(15,6))
librosa.display.specshow(mfcc,x_axis="s", cmap='PiYG')
plt.title("Vacuum Cleaner Mel-Frequency Cepstral Coefficients")
plt.colorbar()

In [None]:
# Zero crossings: one boolean per sample, True where the signal changes sign.
zero_crossing = librosa.zero_crossings(x)
print("Type of Zero Crossing Rate", type(zero_crossing))
print(zero_crossing, " --> See it contains booleans")
# Count the True entries with the array's own reduction.
print("Total Number of Zero Crossing is: ", zero_crossing.sum())

In [None]:
# Zoom on samples 4000..5099 of the raw waveform so individual sign changes are visible.
plt.figure(figsize=(15,5))
plt.title("Vacuum Cleaner Zero Crossing Rate")
plt.plot(x[4000:5100],  color='pink')
plt.grid()

In [None]:
#Spectral Centroid
# Audio goes in as `y=` (keyword-only in recent librosa) and the clip's own
# sampling rate is passed explicitly instead of relying on the library default.
spec_cent=librosa.feature.spectral_centroid(y=x, sr=sr)
print(spec_cent.shape)

plt.figure(figsize=(15,5))
plt.title("Vacuum Cleaner Spectral Centroid")
# Only `color=` is given: combining it with the "r" format string defined the colour twice.
plt.semilogy(spec_cent.T, color='pink')
plt.ylabel("Hz")

In [None]:
#Spectral Roll off
# Audio goes in as `y=`: recent librosa releases accept it by keyword only.
spec_roll=librosa.feature.spectral_rolloff(y=x,sr=sr)
print(spec_roll.shape)

plt.figure(figsize=(15,5))
plt.title("Vacuum Cleaner Spectral Roll off")
# Only `color=` is given: combining it with the "r" format string defined the colour twice.
plt.semilogy(spec_roll.T, color='pink')
plt.ylabel("Hz")

In [None]:
# Spectral bandwidth per frame.
# Audio goes in as `y=`: recent librosa releases accept it by keyword only.
spec_band=librosa.feature.spectral_bandwidth(y=x,sr=sr)
print("Spectral Bandwidth Shape: ",spec_band.shape)
print("Spectral Bandwidth: ", spec_band)

In [None]:
# RMS energy computed from the magnitude spectrogram (rectangular window, no
# centring), shown above the log-frequency spectrogram it was derived from.
S = librosa.magphase(librosa.stft(x, window=np.ones, center=False))[0]
RMSEn= librosa.feature.rms(S=S)
print(RMSEn.shape)


fig, ax = plt.subplots(nrows=2, sharex=True)
times = librosa.times_like(RMSEn)
# Figure-level title: a pyplot title here would land on the bottom axes and be
# replaced by the per-axes title set further down.
fig.suptitle("Root Mean Squared Energy & log Pawer Spectogram")
ax[0].semilogy(times, RMSEn[0], label='RMS Energy', color='pink')
ax[0].set(xticks=[])
ax[0].legend()
ax[0].label_outer()
librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max), y_axis='log', x_axis='time', ax=ax[1], cmap='PiYG')
ax[1].set(title='log Power spectrogram')
plt.show()

In [None]:
# Second example clip (titled "Airplane" in the plots below).
s2 = r"/kaggle/input/environmental-sound-classification-50/audio/audio/1-11687-A-47.wav"
x, sr = librosa.load(s2)
freq = sr  # both names are used by the following cells

In [None]:
# Inline audio player for the second example clip.
sound2=ipd.Audio(s2)
sound2

In [None]:
# Time-domain waveform of the clip currently held in `x`.
plt.figure(figsize=(8,2))
plt.title("Airplane Sound")
librosa.display.waveshow(x,sr=freq, color="pink")

In [None]:
# Spectrogram: STFT magnitude converted to decibels, linear frequency axis.
X = librosa.stft(x)
desibel = librosa.amplitude_to_db(np.abs(X))
spec_fig, spec_ax = plt.subplots(figsize=(10, 4))
spec_img = librosa.display.specshow(desibel, sr=sr, x_axis="time", y_axis="hz", cmap='PiYG', ax=spec_ax)
spec_ax.set_title("Airplane Spectogram")
spec_fig.colorbar(spec_img, ax=spec_ax)

In [None]:
# Split the clip into harmonic and percussive components, then compute a mel
# power spectrogram of each, expressed in dB relative to its own maximum.
data_h, data_p = librosa.effects.hpss(x)
# The audio is passed as `y=`: recent librosa releases accept it by keyword only.
spec_h = librosa.feature.melspectrogram(y=data_h, sr=sr)
spec_p = librosa.feature.melspectrogram(y=data_p, sr=sr)
db_spec_h = librosa.power_to_db(spec_h,ref=np.max)
db_spec_p = librosa.power_to_db(spec_p,ref=np.max)

In [None]:
# Mel spectrogram (dB) of the harmonic component.
librosa.display.specshow(db_spec_h,y_axis='mel', x_axis='s', sr=sr, cmap='PiYG')
plt.title("Airplane Harmonic Mel Spectogram")
plt.colorbar()

In [None]:
# Mel spectrogram (dB) of the percussive component.
librosa.display.specshow(db_spec_p,y_axis='mel', x_axis='s', sr=sr, cmap='PiYG')
plt.title("Airplane Percuisive Mel Spectogram")
plt.colorbar()

In [None]:
# MFCC matrix of the clip (coefficients x frames) and its heat map.
# The audio is passed as `y=`: recent librosa releases accept it by keyword only.
mfcc=librosa.feature.mfcc(y=x,sr=sr)
print("shape of mfcc:" ,mfcc.shape)

plt.figure(figsize=(15,6))
librosa.display.specshow(mfcc,x_axis="s", cmap='PiYG')
plt.title("Airplane Mel-Frequency Cepstral Coefficients")
plt.colorbar()

In [None]:
# Zero crossings: one boolean per sample, True where the signal changes sign.
zero_crossing = librosa.zero_crossings(x)
print("Type of Zero Crossing Rate", type(zero_crossing))
print(zero_crossing, " --> See it contains booleans")
# Count the True entries with the array's own reduction.
print("Total Number of Zero Crossing is: ", zero_crossing.sum())

In [None]:
# Zoom on samples 4000..5099 of the raw waveform so individual sign changes are visible.
plt.figure(figsize=(15,5))
plt.title("Airplane Zero Crossing Rate")
plt.plot(x[4000:5100], color='pink')
plt.grid()

In [None]:
#Spectral Centroid
# Audio goes in as `y=` (keyword-only in recent librosa) and the clip's own
# sampling rate is passed explicitly instead of relying on the library default.
spec_cent=librosa.feature.spectral_centroid(y=x, sr=sr)
print(spec_cent.shape)
plt.figure(figsize=(15,5))
plt.title("Airplane Spectral Centroid")
# Only `color=` is given: combining it with the "r" format string defined the colour twice.
plt.semilogy(spec_cent.T, color='pink')
plt.ylabel("Hz")

In [None]:
#Spectral Roll off
# Audio goes in as `y=`: recent librosa releases accept it by keyword only.
spec_roll=librosa.feature.spectral_rolloff(y=x,sr=sr)
print(spec_roll.shape)

plt.figure(figsize=(15,5))
plt.title("Airplane Spectral Roll off")
# Only `color=` is given: combining it with the "r" format string defined the colour twice.
plt.semilogy(spec_roll.T, color='pink')
plt.ylabel("Hz")

In [None]:
# Spectral bandwidth per frame.
# Audio goes in as `y=`: recent librosa releases accept it by keyword only.
spec_band=librosa.feature.spectral_bandwidth(y=x,sr=sr)
print("Spectral Bandwidth Shape: ",spec_band.shape)
print("Spectral Bandwidth: ", spec_band)

In [None]:
# RMS energy computed from the magnitude spectrogram (rectangular window, no
# centring), shown above the log-frequency spectrogram it was derived from.
S = librosa.magphase(librosa.stft(x, window=np.ones, center=False))[0]
RMSEn= librosa.feature.rms(S=S)
print(RMSEn.shape)


fig, ax = plt.subplots(nrows=2, sharex=True)
times = librosa.times_like(RMSEn)
# Figure-level title: a pyplot title here would land on the bottom axes and be
# replaced by the per-axes title set further down.
fig.suptitle("Root Mean Squared Energy & log Pawer Spectogram")
ax[0].semilogy(times, RMSEn[0], label='RMS Energy', color='pink')
ax[0].set(xticks=[])
ax[0].legend()
ax[0].label_outer()
librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max), y_axis='log', x_axis='time', ax=ax[1], cmap='PiYG')
ax[1].set(title='log Power spectrogram')
plt.show()

In [None]:
# Third example clip (titled "Thunderstorm" in the plots below).
s3 = r"/kaggle/input/environmental-sound-classification-50/audio/audio/1-101296-A-19.wav"
x, sr = librosa.load(s3)
freq = sr  # both names are used by the following cells

In [None]:
# Inline audio player for the third example clip.
sound3=ipd.Audio(s3)
sound3

In [None]:
# Time-domain waveform of the clip currently held in `x`.
plt.figure(figsize=(8,2))
plt.title("Thunderstorm Sound")
librosa.display.waveshow(x,sr=freq, color="pink")

In [None]:
# Spectrogram: STFT magnitude converted to decibels, linear frequency axis.
X = librosa.stft(x)
desibel = librosa.amplitude_to_db(np.abs(X))
spec_fig, spec_ax = plt.subplots(figsize=(10, 4))
spec_img = librosa.display.specshow(desibel, sr=sr, x_axis="time", y_axis="hz", cmap='PiYG', ax=spec_ax)
spec_ax.set_title("Thunderstorm Spectogram")
spec_fig.colorbar(spec_img, ax=spec_ax)

In [None]:
# Split the clip into harmonic and percussive components, then compute a mel
# power spectrogram of each, expressed in dB relative to its own maximum.
data_h, data_p = librosa.effects.hpss(x)
# The audio is passed as `y=`: recent librosa releases accept it by keyword only.
spec_h = librosa.feature.melspectrogram(y=data_h, sr=sr)
spec_p = librosa.feature.melspectrogram(y=data_p, sr=sr)
db_spec_h = librosa.power_to_db(spec_h,ref=np.max)
db_spec_p = librosa.power_to_db(spec_p,ref=np.max)

In [None]:
# Mel spectrogram (dB) of the harmonic component.
librosa.display.specshow(db_spec_h,y_axis='mel', x_axis='s', sr=sr, cmap='PiYG')
plt.title("Thunderstorm Harmonic Mel Spectogram")
plt.colorbar()

In [None]:
# Mel spectrogram (dB) of the percussive component.
librosa.display.specshow(db_spec_p,y_axis='mel', x_axis='s', sr=sr, cmap='PiYG')
plt.title("Thunderstorm Percuisive Mel Spectogram")
plt.colorbar()

In [None]:
# MFCC matrix of the clip (coefficients x frames) and its heat map.
# The audio is passed as `y=`: recent librosa releases accept it by keyword only.
mfcc=librosa.feature.mfcc(y=x,sr=sr)
print("shape of mfcc:" ,mfcc.shape)

plt.figure(figsize=(15,6))
librosa.display.specshow(mfcc,x_axis="s", cmap='PiYG')
plt.title("Thunderstorm Mel-Frequency Cepstral Coefficients")
plt.colorbar()

In [None]:
# Zero crossings: one boolean per sample, True where the signal changes sign.
zero_crossing = librosa.zero_crossings(x)
print("Type of Zero Crossing Rate", type(zero_crossing))
print(zero_crossing, " --> See it contains booleans")
# Count the True entries with the array's own reduction.
print("Total Number of Zero Crossing is: ", zero_crossing.sum())

In [None]:
# Zoom on samples 4000..5099 of the raw waveform so individual sign changes are visible.
plt.figure(figsize=(15,5))
plt.title("Zero Crossing Rate")
plt.plot(x[4000:5100], color="pink")
plt.grid()

In [None]:
#Spectral Centroid
# Audio goes in as `y=` (keyword-only in recent librosa) and the clip's own
# sampling rate is passed explicitly instead of relying on the library default.
spec_cent=librosa.feature.spectral_centroid(y=x, sr=sr)
print(spec_cent.shape)
plt.figure(figsize=(15,5))
plt.title("Thunderstorm Spectral Centroid")
# Only `color=` is given: combining it with the "r" format string defined the colour twice.
plt.semilogy(spec_cent.T, color='pink')
plt.ylabel("Hz")

In [None]:
#Spectral Roll off
# Audio goes in as `y=`: recent librosa releases accept it by keyword only.
spec_roll=librosa.feature.spectral_rolloff(y=x,sr=sr)
print(spec_roll.shape)

plt.figure(figsize=(15,5))
plt.title("Thunderstorm Spectral Roll off")
# Only `color=` is given: combining it with the "r" format string defined the colour twice.
plt.semilogy(spec_roll.T, color='pink')
plt.ylabel("Hz")

In [None]:
# Spectral bandwidth per frame.
# Audio goes in as `y=`: recent librosa releases accept it by keyword only.
spec_band=librosa.feature.spectral_bandwidth(y=x,sr=sr)
print("Spectral Bandwidth Shape: ",spec_band.shape)
print("Spectral Bandwidth: ", spec_band)

In [None]:
# RMS energy computed from the magnitude spectrogram (rectangular window, no
# centring), shown above the log-frequency spectrogram it was derived from.
S = librosa.magphase(librosa.stft(x, window=np.ones, center=False))[0]
RMSEn= librosa.feature.rms(S=S)
print(RMSEn.shape)


fig, ax = plt.subplots(nrows=2, sharex=True)
times = librosa.times_like(RMSEn)
# Figure-level title: a pyplot title here would land on the bottom axes and be
# replaced by the per-axes title set further down.
fig.suptitle("Root Mean Squared Energy & log Pawer Spectogram")
ax[0].semilogy(times, RMSEn[0], label='RMS Energy', color='pink')
ax[0].set(xticks=[])
ax[0].legend()
ax[0].label_outer()
librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max), y_axis='log', x_axis='time', ax=ax[1], cmap='PiYG')
ax[1].set(title='log Power spectrogram')
plt.show()

In [None]:
# Directory the feature-extraction and evaluation loops read clips from
# (file names from the metadata are appended to it).
audio_fpath = "../input/environmental-sound-classification-50/audio/audio/44100/"

In [None]:
class AudioAugmentation:
    """Waveform-level augmentation helpers operating on 1-D NumPy sample arrays."""

    # Fixed clip length, in samples, that loaded and stretched signals are fitted to.
    # NOTE(review): 220500 samples is 5 s at 44.1 kHz — presumably the ESC-50 clip length.
    INPUT_LENGTH = 220500

    # Circular shift applied by `shift`, in samples.
    SHIFT_SAMPLES = 22050

    def _fit_length(self, data):
        """Return `data` truncated, or right-padded with zeros, to INPUT_LENGTH samples."""
        if len(data) > self.INPUT_LENGTH:
            return data[:self.INPUT_LENGTH]
        return np.pad(data, (0, self.INPUT_LENGTH - len(data)), "constant")

    def read_audio_file(self, file_path):
        """Load `file_path` with librosa's default settings and fit it to INPUT_LENGTH."""
        data = librosa.load(file_path)[0]
        return self._fit_length(data)

    def add_noise(self, data):
        """Return `data` plus Gaussian noise scaled by 0.005 (uses NumPy's global RNG)."""
        noise = np.random.randn(len(data))
        data_noise = data + 0.005 * noise
        return data_noise

    def shift(self, data):
        """Return `data` rotated circularly by SHIFT_SAMPLES samples."""
        return np.roll(data, self.SHIFT_SAMPLES)

    def stretch(self, data, rate=1):
        """Time-stretch `data` by `rate` and fit the result to INPUT_LENGTH.

        `rate` is passed by keyword because recent librosa releases no longer
        accept it positionally.
        """
        data = librosa.effects.time_stretch(data, rate=rate)
        return self._fit_length(data)

    def write_audio_file(self, file, data, sample_rate=44100):
        """Write `data` to `file` as a 32-bit float WAV at `sample_rate` Hz.

        `librosa.output.write_wav` no longer exists in current librosa, so the
        SciPy WAV writer already imported by this notebook (`wav`) is used.
        """
        wav.write(file, sample_rate, np.asarray(data, dtype=np.float32))

In [None]:
#Data Augmentation
# For every clip, build 8 augmented variants and keep the time-averaged MFCC
# vector of each together with the clip's category.
aa = AudioAugmentation()
extracted_data = []
# Iterate the metadata through `data2` so this cell does not depend on the name
# `data` still being bound to the DataFrame when it runs.
for index, row in tqdm(data2.iterrows(), total=len(data2)):
    file_name = os.path.join(os.path.abspath(audio_fpath),str(row["filename"]))
    class_labels = row['category']
    y, sr = librosa.load(file_name, sr=44100)
    for i in range(8):
        # Chain the selected transforms on the running signal; applying each one
        # to `y` would let the last matching transform discard the earlier ones
        # (e.g. i == 7 is meant to combine noise, shift and stretch).
        augmented = y
        if i == 1 or i >3:
            augmented = aa.add_noise(augmented)
        if i%3 == 2 or i == 7:
            augmented = aa.shift(augmented)
        if i%3 == 0 or i == 7:
            augmented = aa.stretch(augmented, rate=1.25)
        # NOTE(review): i == 0 and i == 3 both select "stretch only", so they
        # yield identical samples — confirm whether that duplication is intended.
        feature = librosa.feature.mfcc(y=augmented, sr=sr)
        scaled_feature = np.mean(feature.T,axis=0)
        extracted_data.append([scaled_feature, class_labels])
# Rows are [vector, str] pairs, which cannot be stacked into a regular array,
# so report the number of extracted samples instead of an array shape.
len(extracted_data)

In [None]:
# One row per augmented sample: the MFCC-mean vector and its category name.
feature_df = pd.DataFrame(extracted_data, columns=['feature','class'])
print(feature_df.head())

In [None]:
# Stack the per-sample feature vectors into a 2-D array (samples x coefficients).
# Note: this rebinds `x`, which earlier cells used for a raw waveform.
x = np.array(feature_df['feature'].tolist())
x.shape

In [None]:
# Category name of every sample, in the same order as the rows of `x`.
target = np.array(feature_df['class'].tolist())

In [None]:
# One-hot encode the category names; the columns of `y_new` are the category names,
# so column position is the class index the model is trained on.
y_new = pd.get_dummies(target)
print(y_new.shape)
y_new.head()

In [None]:
# Hold out 20% of the samples as a test set.
X_train, X_test, y_train, y_test = train_test_split(x, y_new, test_size=0.2,shuffle = True, random_state=42)
print(str(X_train.shape)+ ', ' + str(X_test.shape))
# Per-sample input shape expected by the LSTM: 1 time step of 20 coefficients.
inp = (1, 20)
# Sanity-check the reshape with an inferred row count (-1) instead of a
# hard-coded one, and show only the resulting shape rather than the whole array.
np.array(X_train).reshape(-1, *inp).shape

In [None]:
# Second split, again taken from the full `x` / `y_new`: it rebinds X_train and
# y_train (75%) and creates X_val / y_val (25%).
# NOTE(review): because it re-splits the full data, rows of the earlier X_test can
# also appear in this X_train; only X_val is disjoint from this X_train.
X_train, X_val, y_train, y_val = train_test_split(x, y_new, test_size=0.25, random_state=42)
print(str(X_train.shape)+ ', ' + str(X_val.shape))

In [None]:
# Per-sample input shape expected by the LSTM: 1 time step of 20 coefficients.
inp = (1, 20)
# Sanity-check the reshape with an inferred row count (-1) instead of a
# hard-coded one, and show only the resulting shape rather than the whole array.
np.array(X_train).reshape(-1, *inp).shape

In [None]:
# Stacked bidirectional LSTM classifier: input is one time step of 20 MFCC means,
# output is a softmax over 50 classes.
# Every layer is taken from the same `keras` namespace used for the model itself,
# rather than mixing in layers imported from a different Keras package.
model_bidirectional = keras.models.Sequential()
model_bidirectional.add(keras.Input(shape = (1,20)))
model_bidirectional.add(keras.layers.Bidirectional(keras.layers.LSTM(1024, return_sequences = True)))
model_bidirectional.add(keras.layers.Dropout(0.2))
model_bidirectional.add(keras.layers.Bidirectional(keras.layers.LSTM(2048, return_sequences = True)))
model_bidirectional.add(keras.layers.Dropout(0.2))
model_bidirectional.add(keras.layers.Bidirectional(keras.layers.LSTM(3036, return_sequences = False)))
model_bidirectional.add(keras.layers.Dropout(0.2))
# The last LSTM already returns a flat vector per sample; Flatten is kept so the
# layer list matches the original architecture.
model_bidirectional.add(keras.layers.Flatten())
model_bidirectional.add(keras.layers.Dense(50, activation="softmax"))
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model_bidirectional.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])

In [None]:
# Layer-by-layer overview of the model and its parameter counts.
model_bidirectional.summary()

In [None]:
# Early-stopping callback on validation accuracy that restores the best weights.
# NOTE(review): patience=30 is larger than the epochs=10 passed to fit() below, so
# the patience limit cannot be reached within that run — confirm the intended values.
es = EarlyStopping(
    monitor='val_accuracy', 
    restore_best_weights=True, 
    patience=30, min_delta = 0.001
)

In [None]:
# Number of training rows after the most recent split.
len(y_train)

In [None]:
# Train the model. Row counts are inferred (-1) instead of hard-coded so the cell
# keeps working if the dataset or the split sizes change.
# Validation uses X_val / y_val: they come from the same split as X_train and are
# therefore disjoint from it, whereas X_test (from the earlier split of the full
# data) can share rows with X_train.
# NOTE(review): augmented variants of one clip can still fall on both sides of the
# split — consider splitting by clip before augmenting.
history=model_bidirectional.fit(np.array(X_train).reshape(-1, 1, 20),
        np.array(y_train, dtype="float32"),
        epochs=10,
        callbacks = [es],
        shuffle = True,
        batch_size = 100,
        validation_data = (np.array(X_val).reshape(-1, 1, 20),
                            np.array(y_val, dtype="float32"))
                 )

In [None]:
# Training curves: one figure for the loss, then one for the accuracy.
curve_specs = (
    ('loss', 'val_loss', 'Train loss', 'Val loss', 'Loss'),
    ('accuracy', 'val_accuracy', 'Train accuracy', 'Val accuracy', 'Accuracy'),
)
for train_key, val_key, train_label, val_label, axis_label in curve_specs:
    plt.figure()
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.legend()
    plt.xlabel('Epoch')
    plt.ylabel(axis_label)

In [None]:
# Class probabilities for every sample in `x` (training rows included).
# The row count is inferred (-1) rather than hard-coded to 16000.
y_pred = model_bidirectional.predict(np.array(x).reshape(-1, 1, 20))

In [None]:
# Index of the most probable class for each sample.
y_pred_classes = np.argmax(y_pred, axis=1)
y_pred_classes

In [None]:
# True class index of each sample, recovered from the one-hot matrix.
y_true = np.argmax(np.array(y_new), axis=1)
y_true

In [None]:
# Accuracy and AUC over all of `x`.
# NOTE(review): `y_pred` was computed on the full feature set, which includes the
# rows used for training, so these figures are not a held-out estimate.
print(np.mean(y_pred_classes == y_true))
m = AUC()
m.update_state(y_new, y_pred)
print(m.result())

In [None]:
import itertools

# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the true
# classes and columns the predicted ones, matching the axis labels set below.
cm = confusion_matrix(y_true, y_pred_classes)
fig = plt.figure(figsize = (12,12))
ax = fig.add_subplot(111)
ax.set_aspect(1)
plt.imshow(cm, cmap = plt.cm.PiYG, interpolation='nearest')

plt.title("Confusion Matrix")
plt.colorbar()
# One tick per class present in the matrix.
tick_marks = np.arange(cm.shape[0])
plt.xticks(tick_marks, tick_marks, rotation=45)
plt.yticks(tick_marks, tick_marks)
# Annotate every cell; switch text colour on dark cells for readability.
thresh = cm.max()/2
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(j, i , cm[i,j], horizontalalignment="center", color="white" if cm[i,j]>thresh else "black")

plt.ylabel("True Label")
plt.xlabel("Predicted Label")
# Lay out after the labels exist so they are included in the saved figure.
plt.tight_layout()
plt.savefig('con_matrix_LSTM_ESC50.png')

In [None]:
# Evaluate the model on the original (un-augmented) clips.
# The model outputs a class *index*; it is translated back to a category *name*
# through the one-hot column order before being compared with the metadata,
# since comparing integer indices with name strings can never match.
# NOTE(review): augmented versions of these same clips were used for training, so
# this is not an independent test set.
class_names = np.array(y_new.columns)
clip_features = []
answers = []
for index, row in tqdm(data2.iterrows(), total=len(data2)):
    # Local names are used so the feature matrix `x` is not overwritten.
    signal, sr = librosa.load(audio_fpath+row['filename'], sr=44100)
    mfcc = librosa.feature.mfcc(y=signal, sr=sr)
    clip_features.append(np.mean(mfcc.T, axis=0))
    answers.append(row['category'])
# Predict all clips in one batch instead of one predict() call per clip.
pred = model_bidirectional.predict(np.array(clip_features).reshape(-1, 1, 20))
test_results = class_names[np.argmax(pred, axis=1)].tolist()
print(np.mean(np.array(test_results) == np.array(answers)))