# Spoken Digit Classification

## Preparing Data
Before beginning, we need to process the audio files by extracting the wavefile and pairing them with their respective labels. We'd expect the length of the list of all the wave files and labels to be 90,000 each.

In [3]:
import numpy as np
import pandas as pd
from scipy.io import wavfile
PATH = './train/train_new/train_'
TEST_PATH = './test/test_new/test_'

def load_speeches(path):
    all_waves = []
    for i in range(90000):
        file = path + str(i) + '.wav'
        _, samples = wavfile.read(file)
        all_waves.append(samples)
    data = pd.read_csv('train.csv')
    labels = [data.iloc[:, 1][i] for i in range(90000)]
    return all_waves,labels


all_waves,labels = load_speeches(PATH)
print(len(all_waves))
print(len(labels))

90000
90000


Next, we then encode the labels (since there are 6), and pair them with the spectrogram transformation of the audio signals

In [4]:
from scipy import signal
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
def get_spectrograms(waves):
    sample_rate = 8000
    spectros = []
    freqs = []
    tims = []
    for wav in waves:
        frequencies, times, spectrogram = signal.spectrogram(wav, sample_rate)
        freqs.append(frequencies)
        tims.append(times)
        spectros.append(spectrogram)
    return freqs,tims,spectros

labelencoder = LabelEncoder().fit(labels)
encoded_labels = tf.keras.utils.to_categorical(labelencoder.transform(labels), 6)
freqs,tims,spectros = get_spectrograms(all_waves)
spectros = np.array(spectros)
spectros.shape

(90000, 129, 26)

## Logistic Regression

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
log_reg_waves = np.array(all_waves)
log_reg_spectros = spectros.reshape(90000, -1)
print(log_reg_waves.shape)
print(log_reg_waves.shape)
print(encoded_labels.shape)

encoded_labels = labelencoder.transform(labels)
def logistic_regression_accuracy(data, encoded_labels):
    X, X_test, Y, Y_test = train_test_split(data, encoded_labels, test_size=0.15, random_state=42)
    reg = LogisticRegression().fit(X, Y)
    predictions = reg.predict(X_test)
    accuracy = reg.score(X_test, Y_test)
    return accuracy


(90000, 6000)
(90000, 6000)
(90000,)


In [18]:
wav_accuracy = logistic_regression_accuracy(log_reg_waves, encoded_labels)
spectro_accuracy = logistic_regression_accuracy(log_reg_spectros, encoded_labels)
print(f'Accuracy for logistic regression with wave: {wav_accuracy}')
print(f'Accuracy for logistic regression with spectrogram: {spectro_accuracy}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for logistic regression with wave: 1.0
Accuracy for logistic regression with spectrogram: 0.9945925925925926


## Naive Bayes

In [20]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
NB_waves = log_reg_waves
NB_spectros = log_reg_spectros

encoded_labels = labelencoder.transform(labels)
def NB_accuracy(data, encoded_labels):
    X, X_test, Y, Y_test = train_test_split(data, encoded_labels, test_size=0.15, random_state=42)
    clf = GaussianNB().fit(X, Y)
    predictions = clf.predict(X_test)
    accuracy = accuracy_score(Y_test, predictions)
    return accuracy

wav_accuracy = NB_accuracy(NB_waves, encoded_labels)
spectro_accuracy = NB_accuracy(NB_spectros, encoded_labels)
print(f'Accuracy for Naive Bayes with wave: {wav_accuracy}')
print(f'Accuracy for Naive Bayes with spectrogram: {spectro_accuracy}')

Accuracy for Naive Bayes with wave: 0.20977777777777779
Accuracy for Naive Bayes with spectrogram: 0.88


## Modifications
* Append 43 examples and labels to our current training data
* These examples were stored in an array named intersection 
* We should only append labels that were are certain of being '43'

In [22]:
PATH = './train/train_new/train_'
TEST_PATH = './test/test_new/test_'

def load_speeches(path):
    all_waves = []
    for i in range(18000):
        file = path + str(i) + '.wav'
        _, samples = wavfile.read(file)
        all_waves.append(samples)
    data = pd.read_csv('train.csv')
    labels = [data.iloc[:, 1][i] for i in range(18000)]
    return all_waves,labels
def append_43(all_waves, labels, intersection):
    for i in intersection:
        file = TEST_PATH + str(i) + '.wav'
        _, samples = wavfile.read(file)
        all_waves.append(samples)
        labels.append(43)
    return all_waves, labels
all_waves,labels = load_speeches(PATH)
print(f'All waves before appending 43 labels: {len(all_waves)}')
intersection = np.loadtxt('./intersection.txt').astype(int)
all_waves, labels = append_43(all_waves, labels, intersection)
print(f'All waves after appending 43 labels: {len(all_waves)}')
labelencoder = LabelEncoder().fit(labels)
encoded_labels = tf.keras.utils.to_categorical(labelencoder.transform(labels), 6)

All waves before appending 43 labels: 18000
All waves after appending 43 labels: 20229


In [24]:
freqs,tims,spectros = get_spectrograms(all_waves)
spectros = np.array(spectros)
modified_waves = np.array(all_waves)
modified_spectros = spectros.reshape(20229, -1)
encoded_labels = labelencoder.transform(labels)
log_wav_accuracy = logistic_regression_accuracy(modified_waves, encoded_labels)
log_spectro_accuracy = logistic_regression_accuracy(modified_spectros, encoded_labels)
print(f'Accuracy for logistic regression with modified (43) wave: {log_wav_accuracy}')
print(f'Accuracy for logistic regression with modified (43) spectrogram: {log_spectro_accuracy}')
NB_wav_accuracy = NB_accuracy(modified_waves, encoded_labels)
NB_spectro_accuracy = NB_accuracy(modified_spectros, encoded_labels)
print(f'Accuracy for Naive Bayes with modified (43) wave: {NB_wav_accuracy}')
print(f'Accuracy for Naive Bayes with modified (43) spectrogram: {NB_spectro_accuracy}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for logistic regression with modified (43) wave: 1.0
Accuracy for logistic regression with modified (43) spectrogram: 0.9696869851729819
Accuracy for Naive Bayes with modified (43) wave: 0.185502471169687
Accuracy for Naive Bayes with modified (43) spectrogram: 0.8520593080724876


## Trying a new model - Convolutional Neural Network
* 11 Layers
* Max Pooling and Batch Normalization
* Fully Connected Layers
* Dropout Layers and L2-regularization


In [35]:
import tensorflow as tf

spectros = np.array(spectros) #spectros[0].shape --> (129, 26)
spectros = spectros.reshape(len(all_waves), 129, 26, 1)
encoded_labels = tf.keras.utils.to_categorical(labelencoder.transform(labels), 6)
X, X_test, Y, Y_test = train_test_split(spectros, encoded_labels, test_size=0.15, random_state=98)

model = tf.keras.models.Sequential([
tf.keras.layers.Conv2D(filters = 32, kernel_size = 5, strides = 1, activation = 'relu', input_shape = (129,26,1), kernel_regularizer=tf.keras.regularizers.l2(0.0005)),
tf.keras.layers.Conv2D(filters = 32, kernel_size = 5, strides = 1, use_bias=False),
tf.keras.layers.BatchNormalization(),
tf.keras.layers.Activation('relu'),
tf.keras.layers.MaxPooling2D(pool_size = 2, strides = 2),
tf.keras.layers.Dropout(0.25),
tf.keras.layers.Conv2D(filters = 64, kernel_size = 3, strides = 1, activation = 'relu', kernel_regularizer=tf.keras.regularizers.l2(0.0005)),
tf.keras.layers.Conv2D(filters = 64, kernel_size = 3, strides = 1, use_bias=False),
tf.keras.layers.BatchNormalization(),
tf.keras.layers.Activation('relu'),
tf.keras.layers.MaxPooling2D(pool_size = 2, strides = 2),
tf.keras.layers.Dropout(0.25),
tf.keras.layers.Flatten(),
tf.keras.layers.Dense(units = 256, use_bias=False),
tf.keras.layers.BatchNormalization(),
tf.keras.layers.Activation('relu'),
tf.keras.layers.Dense(units = 128, use_bias=False),
tf.keras.layers.BatchNormalization(),
tf.keras.layers.Activation('relu'),
tf.keras.layers.Dense(units = 84, use_bias=False),
tf.keras.layers.BatchNormalization(),
tf.keras.layers.Activation('relu'),
tf.keras.layers.Dropout(0.25),
tf.keras.layers.Dense(units = 6, activation = 'softmax')
])
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_16 (Conv2D)           (None, 125, 22, 32)       832       
_________________________________________________________________
conv2d_17 (Conv2D)           (None, 121, 18, 32)       25600     
_________________________________________________________________
batch_normalization_20 (Batc (None, 121, 18, 32)       128       
_________________________________________________________________
activation_20 (Activation)   (None, 121, 18, 32)       0         
_________________________________________________________________
max_pooling2d_8 (MaxPooling2 (None, 60, 9, 32)         0         
_________________________________________________________________
dropout_12 (Dropout)         (None, 60, 9, 32)         0         
_________________________________________________________________
conv2d_18 (Conv2D)           (None, 58, 7, 64)        

In [36]:
model.compile(optimizer=tf.keras.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0), loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X,Y,batch_size=128,epochs=8,validation_data=(X_test,Y_test))

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<tensorflow.python.keras.callbacks.History at 0x7fb646ca7c40>

## Making Predictions (to be submitted on kaggle)

In [37]:
def load_speeches_test(path):
    all_waves = []
    for i in range(24750):
        file = path + str(i) + '.wav'
        _, samples = wavfile.read(file)
        all_waves.append(samples)
    return all_waves


test_waves = load_speeches_test(TEST_PATH)
_, _, test_spectros = get_spectrograms(test_waves)
test_spectros = np.array(test_spectros)
test_spectros = test_spectros.reshape(24750, 129, 26, 1)
predictions = model.predict(test_spectros)
# decoded_predictions = np.argwhere(predictions ==1 )#[:, 1]
inverse_predictions = np.array([ np.array([i, np.argmax(prediction)]) for i, prediction in enumerate(predictions)])
inverse_predictions[:, 1] = labelencoder.inverse_transform(inverse_predictions[:, 1]) #len(np.argwhere(inverse_predictions[:, 1] == 43))
# for replace in replacements: inverse_predictions[replace][1] = 43
df = pd.DataFrame(inverse_predictions, columns=['ID', 'Label'])
df.to_csv('submission.csv', index=False)

## Ensemble methods
* Find the intersection of '43' labels over 51 predictions and retrain convolutional neural net with these labels 

In [38]:
all_predictions = np.loadtxt("./predictions/prediction0.txt").astype(int).reshape(24750, 1) #shape:(24750, 50)
for i in range(1,51):
     next_prediction = np.loadtxt(f"./predictions/prediction{i}.txt").astype(int).reshape(24750,1)
     all_predictions = np.append(all_predictions, next_prediction, axis=1)


intersection = np.argwhere(all_predictions[:, 0] ==43).flatten()
for i in range(1,51):
    pred = np.argwhere(all_predictions[:, i] ==43).flatten()
    intersection = np.intersect1d(intersection, pred)
    
print(intersection.shape)
intersection

(2229,)


array([    2,     3,     7, ..., 24711, 24731, 24742])

As expected, approximately 2229/24750 ~ 10% of data have '43' labels.  We use this cummulative intersection to continuously retrain our model.