# Relevant Packages

In [1]:
#Import packages
import IPython.display as ipd
import numpy as np
import pyaudio, wave
import math
import matplotlib.pyplot as plt
from numpy.matlib import repmat
from scipy.fftpack import dct

# MFCC PART

Here, I write three functions to get MFCC feature (39 dim) one is from project 2 another two functions are made of librosa and python_speech_features. 

In [2]:
def getMFCC2(wavename):#without normalization
    import numpy as np
    import scipy.io.wavfile as wav
    from python_speech_features import mfcc
    fs, audio = wav.read(wavename)
    feature_mfcc = mfcc(audio, samplerate=fs)
    mfcc=[]
    mfcc.append(np.hstack([feature_mfcc[0],feature_mfcc[0],feature_mfcc[0]]))
    for i in range(1,len(feature_mfcc)-1):
        delta=np.zeros(13)
        for j in range(13):
            delta[j]=feature_mfcc[i+1][j]-feature_mfcc[i-1][j]
        mfcc.append(np.hstack([feature_mfcc[i],delta]))
    mfcc.append(np.hstack([feature_mfcc[-1],feature_mfcc[-1],feature_mfcc[-1]]))

    for i in range(1,len(mfcc)-1):
        acc=np.zeros(13)
        for j in range(13):
            acc[j]=mfcc[i+1][13+j]-mfcc[i-1][13+j]
        mfcc[i]=np.hstack([mfcc[i],acc])
    mfcc=np.array(mfcc)
    return mfcc

In [3]:
def getMFCC3(wavename):#with normalization
    import librosa
    audio, sr = librosa.load(wavename,sr=None)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr,n_mfcc=39)
    return mfccs.T

# Create Templates


There are total 100 wave files, 10 samples for digit 1-10. All these files are saved in the following format by Huangrui Chu: digit_0.wav, digit_1.wav.  digit could be 0,1,2,...9.

In [4]:
def create_templates(foldername,start_instance,end_instance):
    #we could also use this function for project 1,2,3 for testing our algorithm
    #INPUT:
        #foldername: the name of folder where Huangrui save the audio files
        #end-start:the number of instance we are expected to use as the templates.
    #OUTPUT:
        #templates: we can easily fetch the template for the corresponding digit using:
                #len(templates)%10+digit
    templates=[]
    max_template_length=0
    for i_th_instance in range(start_instance,end_instance):
        for digit in range(0,10):
            #print(str(digit )+'_'+str(i_th_instance)+'.wav')
            mfcc=getMFCC2(foldername+"/"+str(digit )+'_'+str(i_th_instance)+'.wav')
            if max_template_length<len(mfcc):
                max_template_length=len(mfcc)
            templates.append(mfcc)
    return templates,max_template_length

# DTW(Dynamic Time Warping)

In [7]:
def dtw(template,data):
    #Author: Huangrui Chu
    #Input:
        #template 39 dim array
        #data 39 dim array
    import numpy as np
    # insert fin at the beginning of the template and data
    zeros=np.zeros([39])
    template=np.vstack([zeros,template])
    data=np.vstack([zeros,data])
    
    t=len(template) # means i th template frame aligns with j-th input frame
    d=len(data)#means input frame j
    #create empty best path cost matrix "P" 
    P=np.zeros([t])
    
    
    for j in range(d): #input frame j
        P1=np.zeros([t])
        for i in range(t): # i th template frame aligns with j-th input frame
            Cij=np.sqrt(np.sum(np.square(template[i]-data[j])))
            #print(Cij)
            if i-2>=0:
                P1[i]=min(P[i],P[i-1],P[i-2])+Cij
            elif i-1>=0:
                P1[i]=min(P[i],P[i-1])+Cij
            else:
                P1[i]=P[i]+Cij
        P=P1.copy() #Use deep copy to updata the best path cost
        #print(P)
    #Use DTW cost / frame of input speech, instead of total DTW cost, before determining threshold
    # 5 PPT  p.g 32
    normalized_cost=P[-1]/d
    return normalized_cost

# dtw-based recognition

In [8]:
def DTW_Based_recognition(templates,check_mfcc):
    smallest_distance=[100000,10000]
    #smallest_distance[0] save the current smallest distance,smallest_distance[1] save the corresponding template id
    for i in range(len(templates)):
        distance=dtw(templates[i],check_mfcc)
        if distance<smallest_distance[0]:
            smallest_distance[0]=distance
#             print(distance)
            smallest_distance[1]=i
#             print("id is {}".format(i))
#             print("maybe {}".format(i%10))
    final_answer=smallest_distance[1]%10
#     print("I guess the answer is {}".format(final_answer))
    return final_answer

In [9]:
foldername="../data"
test_data,max_test_length=create_templates(foldername,5,10)
templates,max_template_length=create_templates(foldername,0,1)

In [10]:
check_mfcc=test_data[6]

In [11]:
DTW_Based_recognition(templates,check_mfcc)

74.43894128880532
id is 0
maybe 0
73.76736993186549
id is 2
maybe 2
72.97301794037506
id is 3
maybe 3
71.55232133780056
id is 5
maybe 5
65.97772347895047
id is 6
maybe 6
I guess the answer is 6


6

# Time Synchronous Search DTW

In [87]:
def TS_DTW_Based_recognition(templates,data,threshold=0.1):
    #Author: Yueqi Wu
    #Input:
        #templates a list containing several 39 dim arrays
        #data 39 dim array
        #threshold: the threshold to stop the search for certain template
                    #the threshold should be soft 想想教授上课怎么讲的，我忘了
    import numpy as np
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
  

In [89]:
TS_DTW_Based_recognition(templates,max_template_length,check_mfcc,threshold=0.1)

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 270 and the array at index 1 has size 250

# check the accuracy

In [96]:
def check_the_accuracy(foldername,start_instance,end_instance,test_data):
    print("Huangrui is going to check the accuracy based dtw with {} templates".format(end_instance-start_instance))
    the_input_digit=[]
    the_dtw_result=[]
    templates=create_templates(foldername,start_instance,end_instance)
    for i in range(len(test_data)):
        current_digit=i%10
        current_guess=DTW_Based_recognition(templates,test_data[i])
        the_input_digit.append(current_digit)
        the_dtw_result.append( current_guess)
    return the_input_digit,the_dtw_result

In [97]:
foldername="../data"
test_data=create_templates(foldername,5,10)