# Data Preparation
- Audio files are read from each of the train/validation/test folders; the STFT is applied to every file and the resulting magnitude values are saved in a list.
- After applying the STFT and taking the absolute value, each signal yields a NumPy array of shape \[dimensions, timesteps\]; all such arrays are stored in a list.
- The file names are sorted before modifying the signals so that the source/noise/mixed signals are aligned with list indices.
- Once all the training data has been obtained, it is saved as pickle files so that the processed data can easily be read back next time.

In [None]:
import os
from os import listdir
from os.path import isfile, join
import numpy as np
import pickle
import librosa
import matplotlib.pyplot as plt
%matplotlib notebook

## Training Data

In [None]:
# Builds the training set: collects the source ('trs'), noise ('trn') and
# mixed ('trx') file names under train_path, then stores the STFT magnitude
# of every signal (plus the raw source waveforms and the mixed-signal phase).
train_path = '/opt/e533/timit-homework/tr/'
s_files_train = [] # list of filenames of source signals
n_files_train = [] # list of filenames of noise signals
x_files_train = [] # list of filenames of mixed signals
s_spect_train = [] # list of abs values of stft applied source signals
n_spect_train = [] # list of abs values of stft applied noise signals
x_spect_train = [] # list of abs values of stft applied mixed signals
s_train_vec = []   # list of source signal vectors before applying STFT
x_phase_train = [] # list of phase information of mixed signal
count = 0          # number of files processed so far (drives the progress print)

# Reading filenames
for f in listdir(train_path):
    if isfile(join(train_path, f)):
        if 'trx' in f:
            x_files_train.append(join(train_path, f))
        elif 'trs' in f:
            s_files_train.append(join(train_path, f))
        elif 'trn' in f:
            n_files_train.append(join(train_path, f))

# Sorting filenames to match indices
s_files_train.sort()
n_files_train.sort()
x_files_train.sort()

# Applying STFT
# sr=None keeps each file's native sampling rate instead of resampling to
# librosa's default, matching how the validation files are loaded in this file.
for f in s_files_train:
    sn, sr = librosa.load(f, sr=None)
    s_train_vec.append(sn)
    X = librosa.stft(sn, n_fft=1024, hop_length=512)
    abs_X = np.abs(X)
    s_spect_train.append(abs_X)
    count += 1
    if count % 30 == 0:
        print(len(s_spect_train), " -- ", len(n_spect_train), " -- ", len(x_spect_train))
for f in n_files_train:
    sn, sr = librosa.load(f, sr=None)
    X = librosa.stft(sn, n_fft=1024, hop_length=512)
    abs_X = np.abs(X)
    n_spect_train.append(abs_X)
    count += 1
    if count % 30 == 0:
        print(len(s_spect_train), " -- ", len(n_spect_train), " -- ", len(x_spect_train))
for f in x_files_train:
    sn, sr = librosa.load(f, sr=None)
    X = librosa.stft(sn, n_fft=1024, hop_length=512)
    abs_X = np.abs(X)
    x_spect_train.append(abs_X)
    # Unit-modulus phase term. X/abs_X would be NaN (0/0) in every bin whose
    # magnitude is exactly zero; np.angle(0) is 0, so those bins get phase 1.
    phase = np.exp(1j * np.angle(X))
    x_phase_train.append(phase)
    count += 1
    if count % 30 == 0:
        print(len(s_spect_train), " -- ", len(n_spect_train), " -- ", len(x_spect_train))

## Saving Training Data

In [None]:
# Saving all the processed training data into pickle files.
# Each file is opened in a `with` block so the handle is closed even if
# pickle.dump raises part-way through.
with open('train_source', 'wb') as source_file:
    pickle.dump(s_spect_train, source_file)
with open('train_noise', 'wb') as source_file:
    pickle.dump(n_spect_train, source_file)
with open('train_mix', 'wb') as source_file:
    pickle.dump(x_spect_train, source_file)
with open('train_mix_phase', 'wb') as source_file:
    pickle.dump(x_phase_train, source_file)
# Raw (pre-STFT) source waveforms.
with open('istft_source', 'wb') as source_file:
    pickle.dump(s_train_vec, source_file)

## Validation Data

In [None]:
# Builds the validation set: collects the source ('vs'), noise ('vn') and
# mixed ('vx') file names under valid_path, then stores the STFT magnitude
# of every signal (plus the raw source waveforms and the mixed-signal phase).
valid_path = '/opt/e533/timit-homework/v/'
s_files_valid = [] # list of filenames of source signals
n_files_valid = [] # list of filenames of noise signals
x_files_valid = [] # list of filenames of mixed signals
s_spect_valid = [] # list of abs values of stft applied source signals
n_spect_valid = [] # list of abs values of stft applied noise signals
x_spect_valid = [] # list of abs values of stft applied mixed signals
s_valid_vec = []   # list of source signal vectors before applying STFT
x_phase_valid = [] # list of phase information of mixed signal

# Reading filenames
for f in listdir(valid_path):
    if isfile(join(valid_path, f)):
        if 'vx' in f:
            x_files_valid.append(join(valid_path, f))
        elif 'vs' in f:
            s_files_valid.append(join(valid_path, f))
        elif 'vn' in f:
            n_files_valid.append(join(valid_path, f))

# Sorting filenames so the three lists line up index by index
s_files_valid.sort()
n_files_valid.sort()
x_files_valid.sort()

# Applying STFT (sr=None: files are loaded at their native sampling rate)
for f in s_files_valid:
    sn, sr = librosa.load(f, sr=None)
    s_valid_vec.append(sn)
    X = librosa.stft(sn, n_fft=1024, hop_length=512)
    abs_X = np.abs(X)
    s_spect_valid.append(abs_X)
for f in n_files_valid:
    sn, sr = librosa.load(f, sr=None)
    X = librosa.stft(sn, n_fft=1024, hop_length=512)
    abs_X = np.abs(X)
    n_spect_valid.append(abs_X)
for f in x_files_valid:
    sn, sr = librosa.load(f, sr=None)
    X = librosa.stft(sn, n_fft=1024, hop_length=512)
    abs_X = np.abs(X)
    x_spect_valid.append(abs_X)
    # Unit-modulus phase term. X/abs_X would be NaN (0/0) in every bin whose
    # magnitude is exactly zero; np.angle(0) is 0, so those bins get phase 1.
    phase = np.exp(1j * np.angle(X))
    x_phase_valid.append(phase)

## Saving Validation Data

In [None]:
# Saving all the processed validation data into pickle files.
# Each file is opened in a `with` block so the handle is closed even if
# pickle.dump raises part-way through.
with open('valid_source', 'wb') as source_file:
    pickle.dump(s_spect_valid, source_file)
with open('valid_noise', 'wb') as source_file:
    pickle.dump(n_spect_valid, source_file)
with open('valid_mix', 'wb') as source_file:
    pickle.dump(x_spect_valid, source_file)
with open('valid_mix_phase', 'wb') as source_file:
    pickle.dump(x_phase_valid, source_file)
# Raw (pre-STFT) source waveforms.
with open('istft_valid', 'wb') as source_file:
    pickle.dump(s_valid_vec, source_file)

## Test Data

In [None]:
# Builds the test set: every regular file under test_path is treated as a
# mixed signal; its STFT magnitude and phase are stored in parallel lists.
test_path = '/opt/e533/timit-homework/te/'
x_files_test = [] # list of filenames of mixed signals
x_spect_test = [] # list of abs values of stft applied mixed signals
x_phase_test = [] # list of phase information of mixed signal
for f in listdir(test_path):
    if isfile(join(test_path, f)):
        x_files_test.append(join(test_path, f))
x_files_test.sort()

# sr=None keeps each file's native sampling rate instead of resampling to
# librosa's default, matching how the validation files are loaded in this file.
for f in x_files_test:
    sn, sr = librosa.load(f, sr=None)
    X = librosa.stft(sn, n_fft=1024, hop_length=512)
    abs_X = np.abs(X)
    # Unit-modulus phase term. X/abs_X would be NaN (0/0) in every bin whose
    # magnitude is exactly zero; np.angle(0) is 0, so those bins get phase 1.
    phase = np.exp(1j * np.angle(X))
    x_phase_test.append(phase)
    x_spect_test.append(abs_X)

## Saving Test Data

In [None]:
# Saving the processed test data into pickle files.
# Each file is opened in a `with` block so the handle is closed even if
# pickle.dump raises part-way through.
with open('test_mix', 'wb') as source_file:
    pickle.dump(x_spect_test, source_file)
with open('test_mix_phase', 'wb') as source_file:
    pickle.dump(x_phase_test, source_file)