<a href="https://colab.research.google.com/github/mcui5/dl-final/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing needed libraries for preprocessing

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import librosa
import librosa.display
import os, sys 
import pandas as pd
import pickle

Sound feature extraction method: Mel-frequency cepstral coefficients

MFCC extraction yields one time-series feature matrix per sound clip, with shape (173, 40): 173 time frames by 40 MFCC coefficients.

Each column holds the time series of a single coefficient, whereas each row holds the feature vector at one specific time frame. Only clips that are exactly four seconds long are used, so every input has the same size.

In [None]:
# Mount Google Drive so that the paths under /content/gdrive used below resolve.
from google.colab import drive # Colab-only: skip this cell when running locally
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
#!tar -xvf  '/content/gdrive/Shared drives/CS1470-Final/UrbanSound8K.tar.gz' -C '/content/gdrive/Shared drives/CS1470-Final'

In [None]:
'''
Original - removes every sound clip not exactly 4 seconds
'''
def get_features(filename):
  """
    Load one audio file and compute its MFCC matrix.

    :inputs:
    filename: path of the audio file to load

    :returns:
    the array produced by librosa.feature.mfcc (40 coefficients requested),
    or None if loading or feature extraction raised an exception
  """
  num_mfcc = 40
  try:
    audio, SR = librosa.load(filename)
    # Pass the signal by keyword: recent librosa releases make `y`
    # keyword-only, so the positional form raises TypeError, which the
    # handler below would swallow and turn into None for every file.
    mfcc = librosa.feature.mfcc(y=audio, sr=SR, n_mfcc=num_mfcc)
  except Exception as e:
    # Best effort: report the failure together with its cause and let the
    # caller decide what to do with the missing features.
    print('Error encountered getting features, file: ', filename, '-', repr(e))
    return None

  return mfcc

audio_path = '/content/gdrive/Shared drives/CS1470-Final/UrbanSound8K/audio'
csv_metadata = pd.read_csv(audio_path + '/../metadata/UrbanSound8K.csv')

# One [inputs, label, folder] record per clip that is kept.
collected_mfccs = []

# Loop-invariant, so resolve the absolute audio directory only once.
audio_root = os.path.abspath(audio_path)

for i, row in csv_metadata.iterrows():
  # Keep only clips whose annotated duration is exactly 4 seconds.
  # NOTE(review): this is an exact float comparison; clips whose end - start
  # differs from 4 only by rounding error are dropped. Left unchanged so the
  # resulting dataset stays the same - confirm whether a tolerance is wanted.
  if row['end'] - row['start'] != 4:
    continue

  curr_file = os.path.join(audio_root, 'fold' + str(row["fold"]) + '/', str(row["slice_file_name"]))
  curr_class = row['classID']

  curr_mfcc = get_features(curr_file)
  if curr_mfcc is None:
    # get_features already reported the failure. Skip the clip instead of
    # storing a record with no features, which would break later loading.
    continue

  # Transpose so that each stored row is one time frame.
  # inputs, label, folder
  collected_mfccs.append([np.transpose(curr_mfcc).tolist(), curr_class, row["fold"]])

# Use a context manager so the pickle file is flushed and closed.
with open('mfccs.pkl', 'wb') as pickle_file:
  pickle.dump(collected_mfccs, pickle_file)

In [None]:
# Colab-only: download the mfccs.pkl file written by the extraction cell above.
from google.colab import files
files.download('mfccs.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
'''
Moved to LSTM_3 file
'''
def preprocess(filepath, num_frames=173): 
  """
    1. Unpickle file
    2. Separate each record into inputs, label and folder
    3. Truncate every input to its first num_frames frames

    Labels are returned as the raw class IDs stored in the pickle; no
    one-hot encoding is done here.

    :inputs: 
    filepath: filepath to the pickle file in Drive 
    num_frames: number of leading frames kept from each input (default 173)

    :returns: 
    (inputs, labels, folders)
  """

  # NOTE: pickle.load can execute arbitrary code - only use it on trusted files.
  with open(filepath, 'rb') as fo:
    pickle_output = pickle.load(fo, encoding='bytes')

  inputs = np.array([row[0][:num_frames] for row in pickle_output])

  # Build labels and folders from their own columns. Converting the whole
  # record list with np.array(pickle_output) mixes a nested list with two
  # scalars per row, which is a ragged array: NumPy >= 1.24 raises ValueError
  # for it, and older versions needlessly convert all the feature data twice.
  labels = np.array([row[1] for row in pickle_output])
  folders = np.array([row[2] for row in pickle_output])

  return (inputs, labels, folders)

In [None]:
pickled_path = '/content/gdrive/Shared drives/CS1470-Final/mfccs.pkl'
inputs, labels, folders = preprocess(pickled_path)

# np.nonzero result (a tuple of index arrays) for each class ID 0 through 9.
class_indices = [np.nonzero(labels == class_id) for class_id in range(10)]
id0, id1, id2, id3, id4, id5, id6, id7, id8, id9 = class_indices

# Report how many clips fall into each class.
for class_id, class_idx in enumerate(class_indices):
  print('Class ID ' + str(class_id) + ': ' + str(np.shape(class_idx)))

Class ID 0: (1, 970)
Class ID 1: (1, 180)
Class ID 2: (1, 938)
Class ID 3: (1, 566)
Class ID 4: (1, 677)
Class ID 5: (1, 817)
Class ID 6: (1, 16)
Class ID 7: (1, 734)
Class ID 8: (1, 726)
Class ID 9: (1, 949)


In [None]:
'''
Moved to LSTM_3 file
'''
def split(inputs, labels, folders, test_folder_idx):
  """
    Partition the data by folder: the chosen folder is held out for testing
    and every other folder is used for training.

    :inputs: 
    inputs, labels, folders: the outputs from preprocess 
    test_folder_idx: index of the folder that will be used for testing

    :return: 
    one quadruple, (train_inputs, train_labels, test_inputs, test_labels)
  """
  # Boolean masks over the examples, one for each side of the split.
  in_test = folders == test_folder_idx
  in_train = folders != test_folder_idx

  train_inputs, train_labels = inputs[in_train], labels[in_train]
  test_inputs, test_labels = inputs[in_test], labels[in_test]

  return (train_inputs, train_labels, test_inputs, test_labels)

In [None]:
'''
Move into model files 
'''
def shuffle(inputs, labels, test_fraction, seed=None):
  '''
  Shuffle inputs and labels together, then split them into testing and
  training sets (for example, test_fraction=0.15 gives a 15%:85% split).

  :inputs: 
    the outputs from preprocess (inputs and labels)
    test_fraction: fraction (between 0 and 1) of the examples that will be
      used for testing; the number of test examples is rounded down
    seed: optional seed that makes the shuffle reproducible. When None
      (the default), NumPy's global random state is used.
  
  :return: 
    one quadruple, (train_inputs, train_labels, test_inputs, test_labels)

  :raises:
    ValueError if test_fraction is not between 0 and 1
  '''
  # A fraction outside [0, 1] would silently give an inverted or empty split.
  if not 0 <= test_fraction <= 1:
    raise ValueError('test_fraction must be between 0 and 1, got ' + str(test_fraction))

  num_examples = labels.shape[0]

  # A private RandomState keeps a seeded call from disturbing the global
  # random stream; without a seed the global stream is used.
  rng = np.random if seed is None else np.random.RandomState(seed)
  indices = np.arange(num_examples)
  rng.shuffle(indices)

  # Apply the same permutation to both arrays so pairs stay aligned.
  inputs = np.take(inputs, indices, axis=0)
  labels = np.take(labels, indices, axis=0)

  num_test = int(test_fraction * num_examples)
  test_inputs, train_inputs = inputs[:num_test], inputs[num_test:]
  test_labels, train_labels = labels[:num_test], labels[num_test:]

  return (train_inputs, train_labels, test_inputs, test_labels)