In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd
import librosa, librosa.display
import numpy as np
from scipy.io.wavfile import write
import os
from pathlib import Path
import csv
import pandas as pd
import sklearn.metrics
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
import soundfile as sf
import warnings
warnings.filterwarnings('ignore')

In [2]:
def trim_audio(
    x: np.ndarray,
    threshold: float = 0.1,
    window: int = 500,
    step: int = 50,
) -> np.ndarray:
    """Crop a waveform to the loud region surrounding its peak.

    The peak is the largest sample found between 10% and 95% of the signal.
    From there the left and right edges are walked outwards in ``step``-sample
    increments for as long as the adjacent ``window`` samples contain an
    absolute amplitude above ``threshold``. The result keeps 500 extra samples
    before the left edge (only when the edge lies more than 500 samples into
    the signal) and up to 1000 extra samples after the right edge.

    Args:
        x: 1-D array of amplitudes.
        threshold: Absolute amplitude above which a window counts as loud.
        window: Number of samples inspected at each edge.
        step: Number of samples an edge moves per iteration.

    Returns:
        A slice of ``x`` containing the trimmed audio. An empty input is
        returned unchanged.
    """
    x = np.asarray(x)
    n = len(x)
    if n == 0:
        return x

    # Search for the peak only inside the 10%..95% span; fall back to the
    # whole signal when that span is empty (very short inputs).
    lo, hi = int(n * 0.1), int(n * 0.95)
    if hi <= lo:
        lo, hi = 0, n
    # Offset by ``lo`` so the index refers to ``x`` and is guaranteed to lie
    # inside the searched span.
    middle = lo + int(np.argmax(x[lo:hi]))

    magnitude = np.abs(x)

    def is_loud(start: int, stop: int) -> bool:
        # Clamp the start so a negative index never wraps around to the end
        # of the array; an empty window is treated as silent.
        chunk = magnitude[max(start, 0):stop]
        return bool(chunk.size > 0 and chunk.max() > threshold)

    min_i = middle
    while min_i > 0 and is_loud(min_i, min_i + window):
        # Never step below index 0, otherwise the final slice would wrap.
        min_i = max(min_i - step, 0)

    max_i = middle
    while max_i < n and is_loud(max_i - window, max_i):
        max_i += step

    # Lead-in padding is applied only when the left edge is more than 500
    # samples into the signal (same rule as before).
    lead_in = 500 if min_i > 500 else 0
    return x[min_i - lead_in:max_i + 1000]

In [34]:
def process_audio(fn: str, target_len: int = 44100) -> tuple:
    """Load, trim and length-normalise one audio file.

    The file is loaded with librosa, cropped with ``trim_audio`` and written
    to ``<folder of fn>/.Processed/p<name of fn>``. The saved clip is then
    reloaded at a sample rate chosen so that it spans about ``target_len``
    samples, and finally padded with zeros or truncated to exactly
    ``target_len`` samples. The file is reprocessed on every call.

    Args:
        fn: Path of the audio file to process.
        target_len: Number of samples in the returned array.

    Returns:
        ``(amplitudes, sr)`` where ``amplitudes`` has exactly ``target_len``
        samples and ``sr`` is the sample rate used for the reload.

    Raises:
        TypeError: If ``fn`` is None.
        ValueError: If no samples are left after trimming.
    """
    # Validate before building a Path: Path(None) would fail with a less
    # helpful message.
    if fn is None:
        raise TypeError("Not valid File")
    path = Path(fn)

    # loads and trims audio
    amplitudes, sr = librosa.load(fn)
    amplitudes = trim_audio(amplitudes)
    if len(amplitudes) == 0:
        raise ValueError(f"No audio left after trimming {fn}")

    # creates a .Processed folder next to the source file; path.parent keeps
    # bare filenames relative instead of resolving to the filesystem root
    out_dir = path.parent / ".Processed"
    out_dir.mkdir(parents=True, exist_ok=True)

    # saves the processed audio file into a new audio file in the .Processed folder
    new_filename = str(out_dir / ("p" + path.name))
    sf.write(new_filename, amplitudes, sr)

    # reloads at a sample rate that stretches/squeezes the clip to roughly
    # target_len samples
    duration = librosa.get_duration(y=amplitudes, sr=sr)
    amplitudes, sr = librosa.load(new_filename, sr=int(target_len / duration))

    # int() truncation of the sample rate can leave the clip a few samples
    # short, so force the exact length expected by downstream consumers
    if len(amplitudes) < target_len:
        amplitudes = np.pad(amplitudes, (0, target_len - len(amplitudes)))
    else:
        amplitudes = amplitudes[:target_len]
    return amplitudes, sr

In [30]:
class NeuralNetwork:
    """Fully connected network with one hidden layer and sigmoid activations.

    Weights are updated one sample at a time by gradient descent on the
    difference between target and output.

    Attributes:
        inodes, hnodes, onodes: Sizes of the input, hidden and output layers.
        lr: Learning rate applied to every weight update.
        wih: Input-to-hidden weights, shape ``(hnodes, inodes)``.
        who: Hidden-to-output weights, shape ``(onodes, hnodes)``.
        activation_function: Element-wise sigmoid.
        activiation_function: Legacy (misspelled) alias of the above.
    """

    def __init__(self, inputnodes, hiddennodes, outputnodes, learningrate, seed=None):
        """Create the network and draw its initial weights.

        Args:
            inputnodes: Number of input nodes.
            hiddennodes: Number of hidden nodes.
            outputnodes: Number of output nodes.
            learningrate: Step size for weight updates.
            seed: Optional seed for reproducible initial weights. When None
                the global ``np.random`` state is used.
        """
        self.inodes = inputnodes
        self.hnodes = hiddennodes
        self.onodes = outputnodes

        self.lr = learningrate

        rng = np.random if seed is None else np.random.default_rng(seed)

        # Standard deviation is 1/sqrt(fan-in), i.e. the number of links
        # feeding INTO each node, so the summed input to a node stays in the
        # responsive range of the sigmoid even for very wide input layers.
        self.wih = rng.normal(0.0, pow(self.inodes, -0.5), (self.hnodes, self.inodes))
        self.who = rng.normal(0.0, pow(self.hnodes, -0.5), (self.onodes, self.hnodes))

        self.activation_function = lambda x: 1/(1+np.exp(-x))
        # Keep the original misspelled attribute so existing code still works.
        self.activiation_function = self.activation_function

    def _forward(self, inputs):
        """Propagate a column vector of inputs; return (hidden, final) outputs."""
        hidden_outputs = self.activation_function(np.dot(self.wih, inputs))
        final_outputs = self.activation_function(np.dot(self.who, hidden_outputs))
        return hidden_outputs, final_outputs

    def train(self, inputs_list, targets_list):
        """Perform one weight update from a single (inputs, targets) pair.

        Args:
            inputs_list: Sequence of ``inodes`` input values.
            targets_list: Sequence of ``onodes`` target values.
        """
        targets = np.array(targets_list, ndmin = 2).T
        inputs = np.array(inputs_list, ndmin = 2).T

        hidden_outputs, final_outputs = self._forward(inputs)

        output_errors = targets - final_outputs
        # Hidden errors must be computed with the weights used in the forward
        # pass, i.e. before ``who`` is modified below.
        hidden_errors = np.dot(self.who.T, output_errors)
        self.who += self.lr * np.dot((output_errors * final_outputs * (1-final_outputs)), hidden_outputs.T)
        self.wih += self.lr * np.dot((hidden_errors * hidden_outputs * (1-hidden_outputs)), inputs.T)

    def query(self, inputs_list):
        """Return the network output for one input as an ``(onodes, 1)`` array."""
        inputs = np.array(inputs_list, ndmin = 2).T
        return self._forward(inputs)[1]

In [6]:
def read_wordlist(path: str = 'UASpeech/speaker_wordlist.csv') -> dict:
    """Read the word-list CSV into a dictionary for easy retrieval.

    The second column of each row becomes the key and the first column the
    value. Ex. CW100 corresponds to the word "Yes". The header entry keyed
    by 'FILE NAME' is dropped when present.

    Args:
        path: Location of the CSV file.

    Returns:
        Mapping from the second-column code to the first-column word.
    """
    # newline='' lets the csv module handle line endings itself.
    with open(path, mode='r', newline='') as infile:
        # Rows with fewer than two fields (e.g. blank lines) carry no mapping.
        word_dict = {row[1]: row[0] for row in csv.reader(infile) if len(row) >= 2}
    # pop() with a default tolerates a file that has no header row.
    word_dict.pop('FILE NAME', None)
    return word_dict

In [7]:
def get_word_from_filename(fn: str, word_dict: dict) -> str:
    """Look up the word for an audio file from the code in its name.

    The file name (directories stripped) is split on underscores and the
    third piece is used as the key into ``word_dict``.
    """
    word_code = Path(fn).name.split('_')[2]
    return word_dict[word_code]

In [43]:
# Network dimensions: one input per audio sample, one output per word.
input_nodes = 44100
hidden_nodes = 1000
output_nodes = 100
learning_rate = 0.2

nn = NeuralNetwork(input_nodes, hidden_nodes, output_nodes, learning_rate)

# Each row is [word number, sample 0, sample 1, ...].
# Recordings from block B2 are held out for testing; B1 and B3 are used for training.
training_list = []
testing_list = []
for word_idx in range(1, 101):
    for block_idx in (1, 2, 3):
        wav_path = f"UASpeech/audio/M16/M16_B{block_idx}_CW{word_idx}_M5.wav"
        labelled = np.insert(process_audio(wav_path)[0], 0, word_idx)
        bucket = testing_list if block_idx == 2 else training_list
        bucket.append(labelled)
training_list = np.array(training_list)
testing_list = np.array(testing_list)


In [25]:
# Show the dimensions of the stacked training array as a quick sanity check.
training_list.shape

array([ 2.000000e+00,  2.691697e-04,  3.421405e-04, ..., -5.401783e-03,
       -4.076488e-03,  0.000000e+00], dtype=float32)

In [44]:
# train: 20 passes over the training rows
for epoch in range(20):
    for sample in training_list:
        # column 0 is the 1-based word number, the remainder is the waveform
        label = int(sample[0])
        targets = np.full(output_nodes, 0.01)
        targets[label - 1] = 0.99
        nn.train(sample[1:], targets)

# test: compare the strongest output node with the expected word index
scorecard = []
for sample in testing_list:
    expected = int(sample[0]) - 1
    received = int(np.argmax(nn.query(sample[1:])))
    print(expected, received)
    scorecard.append(expected == received)

accuracy = sum(scorecard) / len(scorecard)
print(f"score = {accuracy}")

KeyboardInterrupt: 