# Data Pre-Processing - Full Vocabulary Testing data<br>

Author: Jamie McQuire <br>

* This Python notebook provides the code for the pre-processing of the competition's testing set.
* The code will translate all the audio files into log spectrograms.
* The output of this Python notebook will be the spectrogram data (saved in three parts) and the list of file names for the testing set.

In [2]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from random import gauss
from scipy.io import wavfile
import csv
from scipy import signal
import librosa
import scipy

* Change the path to your own directory to create the data.

In [3]:
#change the working directory to the Data directory
# Set the SPEECH_DATA_DIR environment variable to point at your own copy of the
# data; when it is unset, the original hard-coded location is used as the default.
DATA_DIR = os.environ.get(
    "SPEECH_DATA_DIR",
    "C:\\Users\\b9027741\\OneDrive - Newcastle University\\Masters\\Computer Science\\Machine_Learning_Project\\Data",
)
os.chdir(DATA_DIR)

* These are the labels for the competition data.

In [4]:
#labels needed for classification
labels = [
    # the ten spoken command words
    "yes", "no",
    "up", "down",
    "left", "right",
    "on", "off",
    "stop", "go",
    # followed by the class named "silence"
    "silence",
]

* This function will return the log spectrogram of an audio file.

In [5]:
def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    """Compute the log spectrogram of an audio signal.

    Parameters
    ----------
    audio : array_like
        Audio samples (the notebook passes the 1-D array read from a .wav file).
    sample_rate : int or float
        Sampling frequency of ``audio`` in Hz.
    window_size : float, optional
        Length of each analysis window in milliseconds.
    step_size : float, optional
        Distance between the starts of consecutive windows in milliseconds.
    eps : float, optional
        Small constant added before taking the logarithm so that zero-valued
        bins do not produce ``-inf``.

    Returns
    -------
    freqs : ndarray
        Sample frequencies returned by ``scipy.signal.spectrogram``.
    times : ndarray
        Segment times returned by ``scipy.signal.spectrogram``.
    log_spec : ndarray
        float32 log spectrogram, transposed so that for 1-D input the first
        axis is time and the second axis is frequency.

    Raises
    ------
    ValueError
        If the step is not positive or is longer than the window.
    """
    nperseg = int(round(window_size * sample_rate / 1e3))
    hop = int(round(step_size * sample_rate / 1e3))
    if not 0 < hop <= nperseg:
        raise ValueError(
            "step_size must be positive and no longer than window_size"
        )
    # scipy wants the number of *overlapping* samples, not the hop length.
    # The two only coincide when the step is half the window (the defaults),
    # so derive the overlap explicitly to honour any other step_size.
    noverlap = nperseg - hop
    freqs, times, spec = signal.spectrogram(audio,
                                    fs=sample_rate,
                                    window='hann',
                                    nperseg=nperseg,
                                    noverlap=noverlap,
                                    detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

* This function will return the log spectrogram of a .wav file.

In [12]:
def wav2spec(wav_name,directory):
    """Read a .wav file and return its log spectrogram.

    Parameters
    ----------
    wav_name : str
        File name (or relative path) of the .wav file inside ``directory``.
    directory : str
        Directory containing the file; a trailing separator is optional.

    Returns
    -------
    ndarray
        The log spectrogram from ``log_specgram``, transposed so that the
        first axis is frequency and the second axis is time.
    """
    # os.path.join gives the same path as plain concatenation when `directory`
    # already ends with a separator, and also works when it does not.
    sample_rate, samples = wavfile.read(os.path.join(directory, wav_name))

    # Clips shorter than 16000 samples are extended on the right to exactly
    # 16000 samples using numpy's "linear_ramp" padding mode.
    if (len(samples) < 16000):
        samples = np.pad(samples, (0, (16000 - len(samples))), "linear_ramp")

    _, _, spectrogram = log_specgram(samples,sample_rate)

    # log_specgram returns (time, frequency); flip to (frequency, time).
    return spectrogram.T

* This function will return the spectrogram values and the labels for a list of data files.
* This function is going to be used for the testing files.

In [13]:
def create_data_sets(file_list,directory):
    """Build the spectrogram array and label array for a list of .wav files.

    Parameters
    ----------
    file_list : sequence of str
        File paths relative to ``directory``. The text before the first "/"
        of each entry is looked up in ``all_labels`` to obtain the label index.
    directory : str
        Directory prefix handed to ``wav2spec``.

    Returns
    -------
    X : ndarray, shape (len(file_list), 161, 99)
        One spectrogram per file. Rows whose conversion raised ``ValueError``
        are left as zeros.
    Y : ndarray, shape (len(file_list),)
        Index of each file's label in ``all_labels``.

    Notes
    -----
    NOTE(review): ``all_labels`` is not defined in the visible cells of this
    notebook; it must exist at module level before this function is called.
    """
    X = np.zeros([len(file_list),161,99])
    Y = np.zeros([len(file_list)])
    for index, file in enumerate(file_list):
        # progress marker every 2000 files
        if index%2000 == 0:
            print(index,file)
        try:
            X[index] = wav2spec(file,directory)
        except ValueError as err:
            # Report the actual error message (previously the ValueError class
            # itself was printed, which says nothing about what went wrong).
            print(index,file,err)
        Y[index] = all_labels.index(file.rsplit("/")[0])

    return X, Y
    

* This function creates part 1 of the testing data.
* This is done in parts due to memory constraints.

In [14]:
def create_testing_data_part1():
    """Convert the first 50000 entries of "test/audio" to log spectrograms.

    Every directory entry is enumerated (a progress index is printed every
    2000 entries), but only entries with index below 50000 are converted.

    Returns
    -------
    ndarray, shape (50000, 161, 99)
        One spectrogram per converted file; unused rows stay zero.
    """
    chunk_size = 50000
    spectrograms = np.zeros([chunk_size, 161, 99])
    for position, wav_file in enumerate(os.listdir("test/audio")):
        # progress marker
        if position % 2000 == 0:
            print(position)
        if position >= chunk_size:
            # belongs to a later part; nothing to convert here
            continue
        spectrograms[position] = wav2spec(wav_name=wav_file, directory="test/audio/")
    return spectrograms

X1 = create_testing_data_part1()

0
2000
4000
6000
8000
10000
12000
14000
16000
18000
20000
22000
24000
26000
28000
30000
32000
34000
36000
38000
40000
42000
44000
46000
48000
50000
52000
54000
56000
58000
60000
62000
64000
66000
68000
70000
72000
74000
76000
78000
80000
82000
84000
86000
88000
90000
92000
94000
96000
98000
100000
102000
104000
106000
108000
110000
112000
114000
116000
118000
120000
122000
124000
126000
128000
130000
132000
134000
136000
138000
140000
142000
144000
146000
148000
150000
152000
154000
156000
158000


* Save the part of the testing data.
* Change path for where you want to store it.

In [7]:
# Append a trailing axis of length 1, add 1.3 to every value, and write the result.
np.save(
    "train/data/X_test_p1",
    np.expand_dims(X1, axis=-1) + 1.3,
)

* Set X1 to 0 to free up memory.
* Repeat the previous steps for the remaining data.

In [10]:
# Rebind X1 to a small integer so the large array is no longer referenced by this name.
X1 = 0

In [12]:
def create_testing_data_part2():
    """Convert entries 50000..99999 of "test/audio" to log spectrograms.

    Every directory entry is enumerated (a progress index is printed every
    2000 entries), but only entries whose index falls in [50000, 100000)
    are converted.

    Returns
    -------
    ndarray, shape (50000, 161, 99)
        One spectrogram per converted file; unused rows stay zero.
    """
    first, last = 50000, 100000
    spectrograms = np.zeros([last - first, 161, 99])
    for position, wav_file in enumerate(os.listdir("test/audio")):
        # progress marker
        if position % 2000 == 0:
            print(position)
        if first <= position < last:
            # row index is relative to the start of this part
            spectrograms[position - first] = wav2spec(wav_name=wav_file, directory="test/audio/")
    return spectrograms

X1 = create_testing_data_part2()

0
2000
4000
6000
8000
10000
12000
14000
16000
18000
20000
22000
24000
26000
28000
30000
32000
34000
36000
38000
40000
42000
44000
46000
48000
50000
52000
54000
56000
58000
60000
62000
64000
66000
68000
70000
72000
74000
76000
78000
80000
82000
84000
86000
88000
90000
92000
94000
96000
98000
100000
102000
104000
106000
108000
110000
112000
114000
116000
118000
120000
122000
124000
126000
128000
130000
132000
134000
136000
138000
140000
142000
144000
146000
148000
150000
152000
154000
156000
158000


In [13]:
# Append a trailing axis of length 1, add 1.3 to every value, and write the result.
np.save(
    "train/data/X_test_p2",
    np.expand_dims(X1, axis=-1) + 1.3,
)

In [14]:
# Rebind X1 to a small integer so the large array is no longer referenced by this name.
X1 = 0

In [15]:
def create_testing_data_part3():
    """Convert entries 100000 onwards of "test/audio" to log spectrograms.

    Every directory entry is enumerated (a progress index is printed every
    2000 entries), but only entries with index 100000 or above are converted.
    A file whose conversion raises ``ValueError`` is reported and its row is
    left as zeros.

    Returns
    -------
    ndarray, shape (58538, 161, 99)
        One spectrogram per converted file; unused or failed rows stay zero.
    """
    X1 = np.zeros([58538,161,99])
    for ind,files in enumerate(os.listdir("test/audio")):
        if ind%2000 == 0:
            print(ind)

        if (ind >= 100000):
            # The original `except` had no matching `try` (a SyntaxError) and
            # printed an undefined name; wrap the conversion and report the
            # index, file name and actual error instead.
            try:
                X1[ind - 100000] = wav2spec(wav_name=files,directory="test/audio/")
            except ValueError as err:
                print(ind,files,err)

    return X1

X1 = create_testing_data_part3()

0
2000
4000
6000
8000
10000
12000
14000
16000
18000
20000
22000
24000
26000
28000
30000
32000
34000
36000
38000
40000
42000
44000
46000
48000
50000
52000
54000
56000
58000
60000
62000
64000
66000
68000
70000
72000
74000
76000
78000
80000
82000
84000
86000
88000
90000
92000
94000
96000
98000
100000
102000
104000
106000
108000
110000
112000
114000
116000
118000
120000
122000
124000
126000
128000
130000
132000
134000
136000
138000
140000
142000
144000
146000
148000
150000
152000
154000
156000
158000


In [16]:
# Append a trailing axis of length 1, add 1.3 to every value, and write the result.
np.save(
    "train/data/X_test_p3",
    np.expand_dims(X1, axis=-1) + 1.3,
)

* This function will create a pandas dataframe that contains the filenames of the testing data.
* This will be used later for submission to Kaggle.

In [14]:
def output_filename_list():
    """Return the names of the entries in "test/audio" as a one-column table.

    Returns
    -------
    pandas.DataFrame
        A frame with a single column ``"fname"`` holding the directory
        entries in the order ``os.listdir`` returns them. An empty directory
        yields an empty frame that still has the ``"fname"`` column.
    """
    # Building the frame from a dict names the column up front, so an empty
    # listing no longer fails on the column assignment.
    return pd.DataFrame({"fname": os.listdir("test/audio")})

# Build the filename table from the current contents of "test/audio".
Y = output_filename_list()

* Check that the format of the dataframe is correct.

In [15]:
# Show the (rows, columns) shape and the first rows of the filename table.
print(Y.shape)
print(Y.head())

(158538, 1)
                fname
0  clip_000044442.wav
1  clip_0000adecb.wav
2  clip_0000d4322.wav
3  clip_0000fb6fe.wav
4  clip_0001d1559.wav


* Export to .csv to import to the colab environment later for Kaggle submission.

In [16]:
# Write the filename table to CSV. to_csv is called with its defaults, so the
# row index is written as an extra leading column alongside "fname".
Y.to_csv("test/filenames.csv")