# Import GMMHMM and other libraries

In [1]:
from GMMHMM import *
import numpy as np
import math
import copy
import matplotlib.pyplot as plt

In [2]:
import pickle
def save_pickle(model,filepath,save_name):
    """Serialise ``model`` with pickle to ``<filepath>/<save_name>.pkl``.

    Args:
        model: Any picklable object.
        filepath: Directory to write into. A trailing separator is optional;
            an empty string means the current working directory.
        save_name: File name without the ``.pkl`` extension.

    Raises:
        OSError: If the target file cannot be opened for writing.
        pickle.PicklingError: If ``model`` cannot be pickled.
    """
    import os

    # os.path.join inserts the separator only when needed, so the location
    # matches what load_pickle reads for the same (filepath, save_name) pair.
    pkl_filename = os.path.join(filepath, save_name+'.pkl')

    # The context manager closes the file even if pickle.dump raises.
    with open(pkl_filename, 'wb') as model_pkl:
        pickle.dump(model,model_pkl)
def load_pickle(filepath,save_name):
    """Load and return the object pickled at ``<filepath>/<save_name>.pkl``.

    Args:
        filepath: Directory containing the pickle. A trailing separator is
            optional; an empty string means the current working directory.
        save_name: File name without the ``.pkl`` extension.

    Returns:
        The unpickled object.

    Raises:
        OSError: If the file does not exist or cannot be read.
    """
    import os

    # os.path.join avoids the doubled separator and, for an empty filepath,
    # avoids resolving to the filesystem root as filepath+"/"+name would.
    classification_pkl_filename = os.path.join(filepath, save_name+'.pkl')

    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load model files that come from a trusted source.
    with open(classification_pkl_filename, 'rb') as classification_model_pkl:
        classification_model = pickle.load(classification_model_pkl)
    return classification_model

In [3]:
def load_all_digit_GMMHMM(filepath,filenames):
    """Load one pickled model per entry of ``filenames``.

    Args:
        filepath: Directory handed to ``load_pickle`` for every model.
        filenames: Iterable of model names (ints or strings); each is
            converted with ``str()`` to form both the file stem and the key.

    Returns:
        dict mapping ``str(name)`` to the object returned by ``load_pickle``.
    """
    models_by_name = {}
    for name in filenames:
        label = str(name)
        print("Huangrui is loading the digit {} GMMHMM".format(name))
        models_by_name[label] = load_pickle(filepath, label)
    return models_by_name

In [4]:
# Directory that holds one pickled model per word (<name>.pkl).
filepath="new_project6_models/"
# Vocabulary: the ten digits plus a "silence" model.
filenames=[0,1,2,3,4,5,6,7,8,9,"silence"]
# Alternative vocabulary without the silence model:
#filenames=[0,1,2,3,4,5,6,7,8,9]
# Maps str(name) -> loaded model; later cells read model.hmm.* from these.
GMMHMMS=load_all_digit_GMMHMM(filepath,filenames)

Huangrui is loading the digit 0 GMMHMM
Huangrui is loading the digit 1 GMMHMM
Huangrui is loading the digit 2 GMMHMM
Huangrui is loading the digit 3 GMMHMM
Huangrui is loading the digit 4 GMMHMM
Huangrui is loading the digit 5 GMMHMM
Huangrui is loading the digit 6 GMMHMM
Huangrui is loading the digit 7 GMMHMM
Huangrui is loading the digit 8 GMMHMM
Huangrui is loading the digit 9 GMMHMM
Huangrui is loading the digit silence GMMHMM


In [5]:
# Display the transition-cost matrix stored on the model for digit "0".
GMMHMMS["0"].hmm.transition_cost

array([[0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.07374347, 2.64380793, 0.        , 0.        ],
       [0.        , 0.        , 0.01878577, 3.98403354, 0.        ],
       [0.        , 0.        , 0.        , 0.01959566, 3.94222885],
       [0.        , 0.        , 0.        , 0.        , 3.94222885]])

In [6]:
from collections import Counter
class LexNode:
    """A single node of the lexical tree.

    Attributes are plain instance fields:
      val      -- the payload of the node; here it is the GMM of one HMM state.
      word     -- label of the word this node belongs to.
      children -- list of successor ``LexNode`` objects (starts empty).
      property -- node role marker:
                    0 = normal (interior) node,
                    1 = start node,
                    2 = end-of-word node.
    """
    def __init__(self, val,word):
        # Payload (the GMM) and the owning word's label.
        self.val, self.word = val, word
        # Every node gets its own, initially empty, successor list.
        self.children = list()
        # Nodes start out as "normal"; builders overwrite this with 1 or 2
        # for the start node and for end-of-word nodes respectively.
        self.property = 0

In [7]:
class BuildLextree:
    """Builds a looped lexical tree from a dictionary of word models.

    Every model in ``dic`` must expose ``hmm.N``, ``hmm.mix`` (indexable by
    state, 0 .. N-1) and ``hmm.transition_cost``; those are the only
    attributes this class reads.

    After construction:
      self.words           -- list of the models, in ``dic`` key order.
      self.keys            -- list of the matching dictionary keys.
      self.transition_cost -- dict key -> that model's ``hmm.transition_cost``.
      self.tree            -- root ``LexNode`` (word "*", property 1).
    """
    def __init__(self, dic, feature_dim=39):
        """Collect the word models and create the root node.

        Args:
            dic: Mapping from word label to word model.
            feature_dim: Length of the mean/variance vectors given to the
                root's placeholder GMM (default 39, the value previously
                hard-coded).
        """
        self.dic2words(dic)
        # Placeholder single-Gaussian GMM (zero mean, unit variance) attached
        # to the root node.
        fine_GMM=mixInfo()
        fine_GMM.Gaussian_mean.append(np.zeros([feature_dim]))
        fine_GMM.Gaussian_var.append(np.ones([feature_dim]))
        fine_GMM.Gaussian_weight=[1]
        # Convert the lists into numpy arrays.
        fine_GMM.Gaussian_mean=np.array(fine_GMM.Gaussian_mean)
        fine_GMM.Gaussian_var=np.array(fine_GMM.Gaussian_var)
        fine_GMM.Num_of_Gaussian = 1
        # "*" is a dummy symbol for the root of the tree; property 1 marks
        # it as the start node.
        self.tree=LexNode(fine_GMM,"*")
        self.tree.property = 1
        print("There are {} words in this dictionary".format(len(self.words)))

    def dic2words(self,dic):
        """Split ``dic`` into parallel lists and gather per-word transition costs."""
        self.words=[]
        self.keys=list(dic.keys())
        self.transition_cost={}
        for key in self.keys:
            model=dic[key]
            self.words.append(model)
            self.transition_cost[key]=model.hmm.transition_cost

    def append_lex_node(self,parent, child):
        """Append ``child`` to ``parent.children``.

        Raises:
            AssertionError: If either argument is not exactly a ``LexNode``.
        """
        assert type(parent) is LexNode and type(child) is LexNode
        parent.children.append(child)

    def build_lextree(self):
        """Attach one linear chain of nodes per word beneath the root.

        For each word, a chain of ``hmm.N`` nodes is created (node j carries
        ``hmm.mix[j]``). The last node of each chain is marked end-of-word
        (property 2) and gets the root as its child, which closes the loop
        back to the start.

        Calling this method again rebuilds the chains from scratch instead
        of appending a second copy of every word to the root.
        """
        # Drop any chains from a previous call so the method is idempotent.
        self.tree.children=[]
        for key,word in zip(self.keys,self.words):
            previous_node=LexNode(word.hmm.mix[0],key)
            self.append_lex_node(self.tree,previous_node)
            for j in range(1,word.hmm.N):
                current_node=LexNode(word.hmm.mix[j],key)
                self.append_lex_node(previous_node,current_node)
                previous_node=current_node
            # Loop back to the root and flag the chain's tail as a word end.
            self.append_lex_node(previous_node,self.tree)
            previous_node.property=2
        
   

In [8]:
# Build the looped lexical tree from the loaded models.
buildlextree=BuildLextree(GMMHMMS)
buildlextree.build_lextree()
# Root node of the tree and the per-word transition-cost lookup,
# both consumed by the recogniser's fit() in a later cell.
lextree=buildlextree.tree
transition_cost=buildlextree.transition_cost

There are 11 words in this dictionary


In [9]:
class ContinousSpeechRecognition():
    """Decodes a frame sequence into a string of word labels.

    The lexical tree handed to ``fit`` is flattened into ``self.nodes``
    (index 0 is the root) and a dynamic-programming cost matrix with one row
    per node and one column per frame is filled in by ``digit_vertibe`` /
    ``digit_vertibe47``. The method names keep their original spelling so
    existing callers continue to work.

    Local frame costs come from ``mixture_log_gaussian(node.val, frame)``,
    which is defined outside this class; it is used here as a cost (smaller
    is better) -- presumably a negative log-likelihood, confirm in GMMHMM.
    """
    def __init__(self):
        self.lextree=None
        self.dist_fun=None

    def fit(self,lextree,transition_cost):
        """Flatten ``lextree`` and pre-compute the index tables used for decoding.

        Args:
            lextree: Root ``LexNode`` of the tree.
            transition_cost: dict mapping a word label to a matrix indexed as
                ``[from_state][to_state]``.

        Sets ``self.nodes``, ``self.states``, ``self.initial_nodes``,
        ``self.initial_nodes_idx``, ``self.transition_cost``,
        ``self.get_parent`` (child index -> parent index),
        ``self.get_children`` (index -> list of child indices) and
        ``self.word_ends`` (indices of nodes whose ``property`` is 2).

        Raises:
            AssertionError: If ``lextree`` is not exactly a ``LexNode``.
        """
        self.lextree=lextree
        assert type(self.lextree) is LexNode
        self.get_nodes(self.lextree)

        # Identity-keyed index of every node. setdefault keeps the first
        # position, which is what list.index would have returned.
        node_index={}
        for idx,node in enumerate(self.nodes):
            node_index.setdefault(id(node),idx)

        self.initial_nodes_idx=[node_index[id(node)] for node in self.initial_nodes]
        self.transition_cost=transition_cost
        self.get_parent={}
        self.get_children={}
        # Indices of end-of-word nodes; the traceback reads the cost of
        # finishing each word from these rows.
        self.word_ends=[]
        for i,node in enumerate(self.nodes):
            if node.property==2:
                self.word_ends.append(i)
            self.get_children[i]=[]
            for child in node.children:
                child_idx=node_index[id(child)]
                self.get_children[i].append(child_idx)
                # End-of-word nodes have the root as child, so
                # get_parent[0] ends up pointing at the last word's tail.
                self.get_parent[child_idx]=i

    def get_nodes(self, lexnode):
        """Flatten the tree under ``lexnode`` into parallel lists.

        Only ``children[0]`` of each word node is followed, i.e. each word is
        treated as a linear chain. ``self.states[i]`` is the 1-based position
        of ``self.nodes[i]`` inside its word; the root has state 0.
        """
        self.nodes=[lexnode]
        self.states=[0]
        self.initial_nodes=[]
        for first_node in lexnode.children:
            state=0
            current=first_node
            self.initial_nodes.append(current)
            # Walk the chain until the end-of-word node is reached.
            while current.property!=2:
                state+=1
                self.states.append(state)
                self.nodes.append(current)
                current=current.children[0]
            # Record the end-of-word node itself.
            state+=1
            self.states.append(state)
            self.nodes.append(current)

    def idx2words(self,result):
        """Concatenate the word labels of the nodes at the given indices."""
        return "".join(self.nodes[idx].word for idx in result)

    def traceback4or7(self,z_level,c):
        """Trace back through the per-word-count cost matrices.

        ``z_level[k]`` is the cost matrix for the (k+1)-th word. When at
        least seven levels exist, the cheaper of level 7 and level 4 at
        column ``c`` (measured over the end-of-word rows) decides whether
        seven or four words are emitted. Otherwise up to four words are
        emitted; if fewer than four levels were opened, every available
        level is used instead of indexing past the end of ``z_level``.

        Args:
            z_level: Non-empty list of cost matrices (rows = nodes,
                columns = frames).
            c: Column index at which to start the traceback.

        Returns:
            The concatenated word labels, earliest word first.
        """
        if len(z_level)>=7:
            min7=min(z_level[6][self.word_ends,c])
            min4=min(z_level[3][self.word_ends,c])
            start=6 if min7<min4 else 3
        else:
            # Clamp so that short utterances with fewer than four opened
            # levels do not raise IndexError.
            start=min(3,len(z_level)-1)

        final_result=""
        for i in range(start,-1,-1):
            current_digit,c=self._traceback(z_level[i],c)
            final_result=current_digit+final_result
        return final_result

    def traceback(self,z_matrix,c):
        """Trace back through a single cost matrix from column ``c`` to column 0.

        Returns the concatenated word labels, earliest word first; an empty
        string when ``c`` is 0.
        """
        final_result=""
        while c>0:
            current_digit,c=self._traceback(z_matrix,c)
            final_result=current_digit+final_result
        return final_result

    def _traceback(self,z_matrix,c):
        """Trace one word backwards starting at column ``c``.

        Picks the cheapest end-of-word row at column ``c`` and then walks
        left, at each step moving to whichever of (same row, parent row) is
        cheaper in the previous column, until the root row or column 0 is
        reached.

        Returns:
            (word label of the chosen end-of-word node, column where the
            walk stopped).
        """
        min_idx=np.argmin(z_matrix[self.word_ends,c])
        r=self.word_ends[min_idx]
        while r>0 and c>0:
            stay_cost=z_matrix[r,c-1]
            parent_cost=z_matrix[self.get_parent[r],c-1]
            # argmin over two candidates is 0 or 1; ties favour staying.
            if np.argmin([stay_cost,parent_cost])==1:
                r=self.get_parent[r]
            c-=1
        return self.nodes[self.word_ends[min_idx]].word,c

    def digit_vertibe47(self,data,loop_cost=300,mute_threshold=500):
        """Decode ``data`` as a string of four or seven words.

        One cost matrix is kept per word position (at most seven). Whenever
        the cheapest cell of a level's current column is an end-of-word
        node, the next level is entered at the root with that cost plus
        ``loop_cost``.

        A node whose local cost on a frame exceeds ``mute_threshold`` is
        flagged for the following frame. A flagged node may only be reached
        from its parent; if the parent is flagged, only the self-loop is
        considered; if both are flagged the node is unreachable on that
        frame and stays flagged.

        Args:
            data: 2-D array, one row per frame (``data.shape[1]`` is read).
            loop_cost: Penalty added when starting a new word.
            mute_threshold: Local-cost limit above which a node is flagged.
                The original author recommends 500; other values made
                errors more likely.

        Returns:
            The recognised string (also printed).
        """
        # Prepend an all-zero dummy frame so that column 0 is the start.
        zero39=np.zeros([data.shape[1]])
        data=np.vstack([zero39,data])
        n_cols=len(data)
        n_rows=len(self.nodes)
        if n_cols<2:
            # No real frames: nothing to decode.
            final_result=""
            print("final_result is {}".format(final_result))
            return final_result

        # Template matrix (all inf) and template flag vector (all clear).
        costs=np.full([n_rows,n_cols],np.inf)
        mute=np.zeros(n_rows)
        # Level 0 starts at the root in column 0.
        initial_cost=copy.deepcopy(costs)
        initial_cost[0,0]=0

        y_level=[mute]
        z_level=[initial_cost]
        for c in range(1,n_cols):
            next_y_level=[]
            # range() is evaluated once, so a level appended during this
            # frame is first processed on the next frame.
            for current_possible_choice in range(len(z_level)):
                z_matrix=z_level[current_possible_choice]
                current_nodes=y_level[current_possible_choice]
                next_to_check_nodes=copy.deepcopy(mute)
                for r in range(1,n_rows):
                    distance=mixture_log_gaussian(self.nodes[r].val,data[c])
                    parent=self.get_parent[r]
                    word_costs=self.transition_cost[self.nodes[r].word]
                    stay=z_matrix[r][c-1]+word_costs[self.states[r]][self.states[r]]
                    advance=z_matrix[parent][c-1]+word_costs[self.states[parent]][self.states[r]]

                    # The "both flagged" case must be tested first; placed
                    # after the single-flag tests it could never be reached.
                    if current_nodes[parent] and current_nodes[r]:
                        to_check=[np.inf]
                        next_to_check_nodes[r]=1
                    elif current_nodes[r]:
                        to_check=[advance]
                    elif current_nodes[parent]:
                        to_check=[stay]
                    else:
                        to_check=[stay,advance]

                    z_matrix[r][c]=min(to_check)+distance
                    if distance>mute_threshold:
                        next_to_check_nodes[r]=1
                next_y_level.append(next_to_check_nodes)

                # Check whether a new word can start at this frame.
                min_idx=np.argmin(z_matrix[:,c])
                min_cost=min(z_matrix[:,c])
                if min_idx in self.word_ends:
                    if len(z_level)-1>current_possible_choice:
                        # The next level already exists: refresh its entry cost.
                        next_z=z_level[current_possible_choice+1]
                        next_z[0,c]=min_cost+loop_cost
                    elif len(z_level)<7:
                        # Room for another word: open a new level.
                        new_z_matrix=copy.deepcopy(costs)
                        new_z_matrix[0,c]=min_cost+loop_cost
                        z_level.append(new_z_matrix)
                        next_y_level.append(copy.deepcopy(mute))

            y_level=next_y_level

        final_result=self.traceback4or7(z_level,n_cols-1)
        print("final_result is {}".format(final_result))
        return final_result

    def digit_vertibe(self,data,threshold=400,loop_cost=300):
        """Decode ``data`` as a word string of unconstrained length.

        A single trellis (rows = nodes, columns = frames) is filled column
        by column. From the third frame on, cells of the previous column
        that exceed that column's best cost by more than ``threshold`` are
        pruned (set to inf). Whenever the cheapest cell of a column is an
        end-of-word node, the root row of that column is set to that cost
        plus ``loop_cost`` so that a new word can start.

        Args:
            data: 2-D array, one row per frame (``data.shape[1]`` is read).
            threshold: Beam width used for pruning.
            loop_cost: Penalty added when starting a new word.

        Returns:
            The recognised string (also printed); empty when ``data`` has
            no frames.
        """
        # Prepend an all-zero dummy frame so that column 0 is the start.
        zero39=np.zeros([data.shape[1]])
        data=np.vstack([zero39,data])
        n_cols=len(data)
        n_rows=len(self.nodes)
        trellis=np.full([n_rows,n_cols],np.inf)
        trellis[0][0]=0

        def pruning(column,threshold):
            # In-place on a view of the trellis: drop cells outside the beam.
            column[column>column.min()+threshold]=np.inf

        for c in range(1,n_cols):
            if c>=3:
                pruning(trellis[:,c-1],threshold)
            for r in range(1,n_rows):
                parent=self.get_parent[r]
                word_costs=self.transition_cost[self.nodes[r].word]
                stay=trellis[r][c-1]+word_costs[self.states[r]][self.states[r]]
                advance=trellis[parent][c-1]+word_costs[self.states[parent]][self.states[r]]
                best_previous=min(stay,advance)
                if best_previous==np.inf:
                    # Unreachable (e.g. pruned): leave the cell at inf and
                    # skip the emission evaluation, whose result would be
                    # discarded anyway.
                    continue
                trellis[r][c]=best_previous+mixture_log_gaussian(self.nodes[r].val,data[c])

            # Check whether a new word can start at this frame.
            min_idx=np.argmin(trellis[:,c])
            min_cost=min(trellis[:,c])
            if min_idx in self.word_ends and min_cost!=np.inf:
                trellis[0,c]=min_cost+loop_cost

        # n_cols-1 equals the last loop index and is also defined when the
        # loop body never ran.
        final_result=self.traceback(trellis,n_cols-1)
        print("final_result is {}".format(final_result))
        return final_result

In [10]:
# Create the recogniser and hand it the lexical tree plus the per-word
# transition costs built in the earlier cell.
csr=ContinousSpeechRecognition()
csr.fit(lextree,transition_cost)

# Test on the recorded project6 data

In [12]:
# Recognise a single recording and compare against the label in its file name.
file_folder="project6data/"
wavefile="0987654321_8.wav"

# Strip the 4-character ".wav" extension to get the reference label.
digit=wavefile[:-4]
# getMFCC2 is defined outside this notebook cell (presumably by the GMMHMM
# star-import); digit_vertibe expects a 2-D array with one row per frame.
data=getMFCC2(file_folder+wavefile)
digit_result=csr.digit_vertibe(data)
print("Huangrui recognize {} as {}".format(digit,digit_result))

final_result is silence0987354321silence
Huangrui recognize 0987654321_8 as silence0987354321silence


# Test on the problem 2 data

In [37]:
import os

In [39]:
# Recognise every .wav recording in the problem 2 folder.
file_folder="test_data/problem2/"
# os.listdir returns entries in arbitrary order and includes non-audio files
# (e.g. hidden system files); sort for reproducible output and keep only .wav.
wavefiles=sorted(name for name in os.listdir(file_folder) if name.lower().endswith(".wav"))
for wavefile in wavefiles:
    # The file name without its extension is the reference digit string.
    digit=os.path.splitext(wavefile)[0]
    data=getMFCC2(os.path.join(file_folder,wavefile))
    digit_result=csr.digit_vertibe(data,loop_cost=300)
    print("Huangrui recognize {} as {}".format(digit,digit_result))

final_result is silence123456
Huangrui recognize 123456 as silence123456
final_result is 296789436
Huangrui recognize 25678543 as 296789436
final_result is silence37274920
Huangrui recognize 37274921 as silence37274920
final_result is silence5555
Huangrui recognize 55555 as silence5555
final_result is 96890372344
Huangrui recognize 6890372344 as 96890372344
final_result is silence729843479246
Huangrui recognize 72184347924 as silence729843479246
final_result is silence73433321903776
Huangrui recognize 7343332190377 as silence73433321903776
final_result is silence82121763426
Huangrui recognize 8212176342 as silence82121763426
final_result is 9826414052002
Huangrui recognize 826414052002 as 9826414052002
final_result is silence911386
Huangrui recognize 911385 as silence911386


## Final accuracy calculated by Huangrui: 83/89 ≈ 0.93