In [0]:
%%bash
pip install librosa

In [0]:
import os
from random import shuffle

import numpy as np
import pandas as pd
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder

import librosa as rosa

In [0]:
# Colab-specific step: mount the user's Google Drive at /content/drive so the
# data set stored there can be read like a local folder.
# NOTE(review): `directory` (defined in the config cell) is the relative path
# "drive/My Drive/UE_Proell"; it presumably relies on the working directory
# being /content -- confirm.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# --- Locations -------------------------------------------------------------
# Project root on the mounted drive, and the folder receiving the saved arrays.
directory = "drive/My Drive/UE_Proell"
preprocessed = f"{directory}/preprocessed"

# --- Feature extraction settings --------------------------------------------
# res_type is handed to librosa's loader, n_mfcc to its MFCC routine.
res_type, n_mfcc = "kaiser_fast", 40

# Create the output folder up front; a no-op when it already exists.
os.makedirs(preprocessed, exist_ok=True)

In [0]:
# Read the two comma-separated index tables (train first, then test).
train, test = (
    pd.read_csv(csv_path, sep=",")
    for csv_path in (
        f"{directory}/data/train_long.csv",
        f"{directory}/data/test.csv",
    )
)

# Peek at five randomly drawn training rows as the cell output.
train.sample(n=5)

In [0]:
def feature_extract(row):
    """Load one audio clip and summarise it as a mean-MFCC feature vector.

    Parameters
    ----------
    row : mapping (e.g. a ``pandas.Series`` handed over by ``DataFrame.apply``)
        Must provide ``row["ID"]`` (file stem of the clip below
        ``{directory}/data/urban_sound_files``) and ``row["Class"]`` (its label).

    Returns
    -------
    list or tuple
        ``[feature, label]`` on success, where ``feature`` holds one mean
        value per MFCC coefficient (averaged over the time frames) and
        ``label`` is ``row["Class"]``.
        ``(None, None)`` when the file is missing or cannot be decoded; a
        message is printed in both cases so the caller can keep going.
    """
    file_name = f"{directory}/data/urban_sound_files/{row['ID']}.wav"
    if not os.path.exists(file_name):
        print(f"File {file_name} does not exist. Abort...")
        return None, None

    # A corrupted or unsupported file must not stop the whole extraction run,
    # so decoding problems are reported and turned into a (None, None) result.
    try:
        # res_type (e.g. "kaiser_fast") selects librosa's resampling method.
        X, sample_rate = rosa.load(file_name, res_type=res_type)
        # mfcc() yields one row per coefficient; transpose and average over
        # axis 0 to collapse the time frames into a fixed-length vector.
        mfccs = np.mean(
            rosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=n_mfcc).T, axis=0
        )
    except Exception as e:
        # The original handler printed the undefined name `file`, which
        # raised a NameError and masked the real problem.
        print("Error encountered while parsing file: ", file_name, "-", e)
        return None, None

    return [mfccs, row["Class"]]

In [0]:
# DataFrame.apply axis reminder:
#   axis=0 / "index"   -> the function receives one column at a time
#   axis=1 / "columns" -> the function receives one row at a time
# Every row describes one clip, so the extractor is applied row by row.

train_temp = train.apply(feature_extract, axis="columns")
print("Feature extraction for train data completed.")

test_temp = test.apply(feature_extract, axis="columns")
print("Feature extraction for test data completed.")

In [0]:
def _to_feature_frame(extracted):
    """Turn the per-row extraction results into a label/feature DataFrame.

    ``extracted`` is an iterable of two-element results ``(feature, label)``.
    If it already is a DataFrame with "label" and "feature" columns it is
    returned unchanged: this cell stores its result under the same name as
    its input, so without the guard a second run would iterate over column
    names and silently build a garbage frame.
    """
    if isinstance(extracted, pd.DataFrame) and {"label", "feature"} <= set(extracted.columns):
        return extracted

    # Materialise once so both columns are built from a single pass.
    pairs = list(extracted)
    return pd.DataFrame(
        {
            "label": [pair[1] for pair in pairs],
            "feature": [pair[0] for pair in pairs],
        }
    )


train_temp = _to_feature_frame(train_temp)
test_temp = _to_feature_frame(test_temp)

In [0]:
def encode(data, encoder=None):
    """Convert a label/feature frame into model-ready arrays.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs a "feature" column (one feature vector per row) and a "label"
        column (one class name per row).
    encoder : LabelEncoder, optional
        An already fitted encoder to reuse. When omitted, a new encoder is
        fitted on ``data`` itself (the original behaviour). Passing the
        encoder fitted on the training labels keeps class indices and the
        one-hot width identical across data splits.

    Returns
    -------
    list
        ``[X, y]``: the stacked feature vectors and the one-hot encoded labels.
    """
    # Clips whose extraction failed carry None in both columns; they cannot
    # be stacked into a numeric array or label-encoded, so leave them out.
    valid = data.dropna(subset=["feature", "label"])
    n_dropped = len(data) - len(valid)
    if n_dropped:
        print(f"Dropping {n_dropped} row(s) without extracted features.")

    X = np.array(valid["feature"].tolist())
    labels = np.array(valid["label"].tolist())

    if encoder is None:
        encoder = LabelEncoder()
        class_indices = encoder.fit_transform(labels)
    else:
        class_indices = encoder.transform(labels)

    # Fix the width to the number of known classes so a split that happens
    # to miss the last class still gets the same number of columns.
    y = np_utils.to_categorical(class_indices, num_classes=len(encoder.classes_))

    return [X, y]

# Fit the label mapping once, on the training labels, and share it with the
# test split so that a given class maps to the same column in both.
label_encoder = LabelEncoder().fit(train_temp["label"].dropna().tolist())

X_train, y_train = encode(train_temp, encoder=label_encoder)
X_test, y_test = encode(test_temp, encoder=label_encoder)

In [0]:
# Persist the four arrays in the preprocessed folder (features first, then labels).
for target, array in (
    (f"{preprocessed}/X_train", X_train),
    (f"{preprocessed}/X_test", X_test),
    (f"{preprocessed}/y_train", y_train),
    (f"{preprocessed}/y_test", y_test),
):
    np.save(target, array)