In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pyttsx3
import os
import scipy.io.wavfile
import scipy.signal
import scipy.spatial.distance
import librosa
import librosa.display
import copy

In [2]:
# Vocabulary of candidate words. A reference WAV file is generated for each
# entry and every segment of the recording is compared against all of them.
WORDS = [
    "time", "prepare", "solution", "make", "mistake", "no", "the", "probable", "long", "lecture",
    "method", "disaster", "fail", "work", "advice", "idea", "succeed", "easy", "is", "for", "give",
    "to" # Added the word "to", which is not in the assignment list but appears in the audio
]

# Directory holding one "<word>.wav" reference file per entry in WORDS.
WORDS_DIR = "words/"

# Expected word sequence, compared in order against the segmented parts:
# the work easy the time long to prepare the solution
SOLUTION = ["the", "work", "easy", "the", "time", "long", "to", "prepare", "the", "solution"]

In [3]:
### Generate words WAV files

# Create the output directory (and any missing parents). exist_ok=True avoids
# the check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs(WORDS_DIR, exist_ok=True)

engine = pyttsx3.init()
for word in WORDS:
    wav_path = os.path.join(WORDS_DIR, "%s.wav" % word)
    # Skip words whose reference file already exists so re-running is cheap.
    if os.path.exists(wav_path):
        continue
    engine.save_to_file(word, wav_path)
# Process everything queued via save_to_file() above.
engine.runAndWait()

In [4]:
### Helper functions

def trim_signal(s1, trim_threshold=10):
    """Gate low-amplitude samples to zero and strip leading/trailing zeros.

    Every sample strictly between ``-trim_threshold`` and ``+trim_threshold``
    is set to 0 (anywhere in the signal, not only at the edges); the zeros at
    both ends are then removed.

    The input is left untouched: the gating is applied to a copy, so arrays
    (or views into larger arrays) passed by callers are never modified.

    Args:
        s1: 1-D array-like of samples.
        trim_threshold: Amplitude below which a sample counts as silence.

    Returns:
        A 1-D array with the gated samples and no leading/trailing zeros
        (empty if every sample is below the threshold).
    """
    gated = np.array(s1, copy=True)
    # Two explicit comparisons instead of np.abs(): abs() overflows for the
    # most negative value of signed integer dtypes such as int16.
    gated[(-trim_threshold < gated) & (gated < trim_threshold)] = 0
    return np.trim_zeros(gated)
    
def trim_pad_pair(s1, s2, trim_threshold=10):
    """Trim two signals and zero-pad both to one common length.

    Each signal is passed through ``trim_signal`` with ``trim_threshold``
    and then right-padded with zeros up to the next multiple of 1000 samples
    above the longer trimmed signal, so both outputs have identical size.

    Args:
        s1: First 1-D signal.
        s2: Second 1-D signal.
        trim_threshold: Silence threshold forwarded to ``trim_signal``.

    Returns:
        Tuple ``(ps1, ps2)`` of equally sized, zero-padded arrays.
    """
    # Forward the caller's threshold; previously it was accepted but ignored.
    ts1 = trim_signal(s1, trim_threshold)
    ts2 = trim_signal(s2, trim_threshold)

    # "// 1000 + 1" always rounds up past the longer signal, so at least one
    # padding zero is appended even when the length is a multiple of 1000.
    psize = (max(ts1.size, ts2.size) // 1000 + 1) * 1000
    ps1 = np.pad(ts1, (0, psize - ts1.size))
    ps2 = np.pad(ts2, (0, psize - ts2.size))

    return ps1, ps2

def load_file(filename):
    """Read a WAV file and return its sample rate and trimmed samples.

    The samples are passed through ``trim_signal`` (default threshold) and
    converted to float64 afterwards.

    Returns:
        Tuple ``(sample_rate, samples)``.
    """
    sample_rate, raw_samples = scipy.io.wavfile.read(filename)
    trimmed = trim_signal(raw_samples)
    return sample_rate, trimmed.astype(np.float64)

def load_word(word):
    """Load the reference recording ``<WORDS_DIR>/<word>.wav`` via ``load_file``."""
    return load_file(os.path.join(WORDS_DIR, "%s.wav" % word))

def write_wav(data, filename, rate=22050):
    """Write ``data`` to ``filename`` as a WAV file sampled at ``rate`` Hz."""
    scipy.io.wavfile.write(filename=filename, rate=rate, data=data)
    
def split_data(data, window_size=1000, threshold=1600):
    """Split a recording into voiced segments separated by silence.

    The signal is zero-padded to a whole number of windows and scanned one
    window at a time. A window is "loud" when its peak absolute amplitude is
    at least ``threshold``. A segment opens at the first loud window and is
    closed on the first window seen after more than one quiet window has been
    counted inside it. Note that quiet windows are counted cumulatively (they
    need not be consecutive) and that the closing window itself is included
    in the segment. Each segment runs from the end of the previous one and is
    cleaned up with ``trim_signal``.

    A segment that is still open when the recording ends is emitted as well,
    so the last word is not lost when there is too little trailing silence.

    Args:
        data: 1-D numpy array of samples; it is not modified.
        window_size: Number of samples per analysis window.
        threshold: Peak amplitude from which a window counts as voiced.

    Returns:
        List of 1-D arrays, one per detected segment. If no voiced window is
        found at all, a single-element list holding the padded signal.
    """
    # np.pad returns a new array, so the caller's data stays untouched.
    padded_size = (data.size // window_size + 1) * window_size
    data = np.pad(data, (0, padded_size - data.size))

    base = 0
    is_voice = False
    n_empty = 0
    parts = []

    # Integer section count: the padded size is an exact multiple of the
    # window size, and np.split expects an int rather than a float here.
    n_windows = data.size // window_size
    for i, window in enumerate(np.split(data, n_windows)):
        _max = np.max(np.abs(window))
        if n_empty > 1 and is_voice:
            # Enough silence was seen: close the segment at the end of this
            # window and start looking for the next one right after it.
            segment_end = (i + 1) * window_size
            parts.append(trim_signal(data[base:segment_end]))

            base = segment_end
            is_voice = False
            n_empty = 0
        elif _max >= threshold and not is_voice:
            is_voice = True
        elif _max < threshold and is_voice:
            n_empty += 1

    if is_voice:
        # The recording ended while a segment was still open; flush it.
        tail = trim_signal(data[base:])
        if tail.size:
            parts.append(tail)

    if not parts:
        return [data]
    return parts

In [5]:
### LOAD data

# The recording is stored as plain-text numbers. It is not truncated to a
# multiple of 1000 samples here (an earlier experiment did that).
data = np.loadtxt(fname="Signal1.txt")

In [6]:
### Split data
# Segment the recording into per-word chunks using split_data's defaults.
parts = split_data(data=data)

In [7]:
def _mfcc_features(signal, sr):
    """Return the MFCC matrix computed from the dB-scaled mel spectrogram of `signal`."""
    mel = librosa.feature.melspectrogram(y=signal, sr=sr, n_fft=512)
    return librosa.feature.mfcc(S=librosa.power_to_db(mel))


# Load every reference word once up front: the reference recordings do not
# depend on the segment being classified, so re-reading all WAV files for
# each segment was redundant disk I/O.
references = []
for word in WORDS:
    sr, X = load_word(word)
    references.append((word, sr, X))

# Earlier experiments (cosine distance on spectrogram, FFT, CQT and STFT
# features, and DTW on the raw samples) were dropped; only the MFCC cosine
# distance is used for classification.
correct = 0
for answer, Y in zip(SOLUTION, parts):
    distances = []
    for word, sr, X in references:
        # Both signals must have the same length so the flattened MFCC
        # matrices are comparable element by element.
        pX, pY = trim_pad_pair(X, Y)
        dist_mfccs = scipy.spatial.distance.cosine(
            _mfcc_features(pX, sr).flatten(),
            _mfcc_features(pY, sr).flatten(),
        )
        distances.append((word, dist_mfccs))

    # Three closest reference words, smallest cosine distance first.
    closest = sorted(distances, key=lambda x: x[1])[0:3]
    # Convert distance to cosine similarity (1 - distance); float() keeps the
    # printed tuples free of NumPy scalar wrappers on newer NumPy versions.
    guess = [(word, float(np.round(1 - dist, 3))) for word, dist in closest]
    print(answer, *guess)
    if answer == guess[0][0]:
        correct += 1

# zip() stops at the shorter sequence, so count solution words that had no
# segment (and surplus segments) as misses instead of ignoring them.
total = max(len(SOLUTION), len(parts))
correct_percentage = correct / total * 100
print("Correct: %s/%s (%s%%)" % (correct, total, correct_percentage))

the ('the', 0.936) ('give', 0.934) ('is', 0.93)
work ('work', 0.954) ('make', 0.938) ('no', 0.937)
easy ('easy', 0.947) ('long', 0.947) ('fail', 0.944)
the ('the', 0.935) ('give', 0.932) ('is', 0.93)
time ('time', 0.954) ('fail', 0.952) ('long', 0.947)
long ('no', 0.958) ('long', 0.952) ('work', 0.951)
to ('the', 0.949) ('to', 0.935) ('give', 0.931)
prepare ('prepare', 0.957) ('idea', 0.945) ('probable', 0.919)
the ('the', 0.935) ('give', 0.932) ('is', 0.931)
solution ('solution', 0.949) ('disaster', 0.939) ('probable', 0.922)
Correct: 8/10 (80.0%)
