<a href="https://colab.research.google.com/github/Kanaagalakshmi/Mavericks/blob/master/spokenrecognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os
import time
import random
!pip install numpy==1.14.3
!pip install librosa==0.6.1
!pip install tensorflow==1.8.0
!pip install TFLearn
import tflearn

import numpy as np
import librosa.display
import tensorflow as tf
import matplotlib.pyplot as plt
tf.reset_default_graph()
# EXTRACT MFCC FEATURES
#
def extract_mfcc(file_path, utterance_length):
    # Get raw .wav data and sampling rate from librosa's load function
    raw_w, sampling_rate = librosa.load(file_path+"/", mono=True)

    # Obtain MFCC Features from raw data
    mfcc_features = librosa.feature.mfcc(raw_w, sampling_rate)
    if mfcc_features.shape[1] > utterance_length:
        mfcc_features = mfcc_features[:, 0:utterance_length]
    else:
        mfcc_features = np.pad(mfcc_features, ((0, 0), (0, utterance_length - mfcc_features.shape[1])),
                               mode='constant', constant_values=0)
    
    return mfcc_features

#
# GET TRAINING BATCH, returns data in batches 
#
def get_mfcc_batch(file_path, batch_size, utterance_length):
    print("hello")
    files = os.listdir(file_path)
    ft_batch = []
    label_batch = []

    while True:
        # Shuffle Files
        random.shuffle(files)
        for fname in files:
            # print("Total %d files in directory" % len(files))

            # Make sure file is a .wav file
            if not fname.endswith(".wav"):
                continue
            
            # Get MFCC Features for the file
            mfcc_features = extract_mfcc(file_path +"/"+fname, utterance_length)
            
            # One-hot encode label for 10 digits 0-9
            label = np.eye(10)[int(fname[0])]
            
            # Append to label batch
            label_batch.append(label)
            
            # Append mfcc features to ft_batch
            ft_batch.append(mfcc_features)

            # Check to see if default batch size is < than ft_batch
            if len(ft_batch) >= batch_size:
                # send over batch
                yield ft_batch, label_batch
                # reset batches
                ft_batch = []
                labels_batch = []

#
# DISPLAY FEATURE SHAPE
#
# wav_file_path: Input a file path to a .wav file
#
def display_power_spectrum(wav_file_path, utterance_length):
    mfcc = extract_mfcc(wav_file_path, utterance_length)
    
    # Plot
    plt.figure(figsize=(10, 6))
    plt.subplot(2, 1, 1)
    librosa.display.specshow(mfcc, x_axis='time')
    plt.show()

    # Feature information
    print("Feature Shape: ", mfcc.shape)
    print("Features: " , mfcc[:,0])

#
# MAIN
#
def main():
    # Initial Parameters
    lr = 0.001
    iterations_train = 50
    bsize = 64
    audio_features = 20  
    utterance_length = 35  # Modify to see what different results you can get
    ndigits = 10

    # Get training data
    train_batch = get_mfcc_batch('/content/drive/My Drive/new spoken digit/Spoken-Digit-Recognition/new_spoken_digit/dataset', 150, utterance_length)
    
    # # Build Model
    sp_network = tflearn.input_data([None, audio_features, utterance_length])
    sp_network = tflearn.lstm(sp_network, 128*4, dropout=0.5)
    sp_network = tflearn.fully_connected(sp_network, ndigits, activation='softmax')
    sp_network = tflearn.regression(sp_network, optimizer='adam', learning_rate=lr, loss='categorical_crossentropy')
    sp_model = tflearn.DNN(sp_network, tensorboard_verbose=0)

    # Train Model
    while iterations_train > 0:
        X_tr, y_tr = next(train_batch)
        X_test, y_test = next(train_batch)
        sp_model.fit(X_tr, y_tr, n_epoch=10, validation_set=(X_test, y_test), show_metric=True, batch_size=bsize)
        iterations_train -=1
    sp_model.save("/content/drive/My Drive/new spoken digit/Spoken-Digit-Recognition/new_spoken_digit/lstm files/speech_recognition.lstm.index")

    # Test Model
    sp_model.load('/content/drive/My Drive/new spoken digit/Spoken-Digit-Recognition/new_spoken_digit/lstm files/speech_recognition.lstm.index')
    mfcc_features = extract_mfcc('/content/drive/My Drive/test/0_jackson_49.wav', utterance_length)
    mfcc_features = mfcc_features.reshape((1,mfcc_features.shape[0],mfcc_features.shape[1]))
    prediction_digit = sp_model.predict(mfcc_features)
    print(prediction_digit)
    print("Digit predicted: ", np.argmax(prediction_digit))

    # Done
    return 0


if __name__ == '__main__':
    main()

Training Step: 1499  | total loss: [1m[32m2.29775[0m[0m | time: 0.622s
| Adam | epoch: 500 | loss: 2.29775 - acc: 0.1333 -- iter: 128/150
Training Step: 1500  | total loss: [1m[32m2.28910[0m[0m | time: 2.012s
| Adam | epoch: 500 | loss: 2.28910 - acc: 0.1434 | val_loss: 2.31813 - val_acc: 0.1067 -- iter: 150/150
--
INFO:tensorflow:/content/drive/My Drive/new spoken digit/Spoken-Digit-Recognition/new_spoken_digit/lstm files/speech_recognition.lstm.index is not in all_model_checkpoint_paths. Manually adding it.
INFO:tensorflow:Restoring parameters from /content/drive/My Drive/new spoken digit/Spoken-Digit-Recognition/new_spoken_digit/lstm files/speech_recognition.lstm.index
[[0.10514367 0.08418491 0.09734729 0.08213335 0.08085749 0.12703276
  0.11499744 0.1152242  0.1216312  0.07144772]]
Digit predicted:  5
