In [376]:
from GMMHMM import *

In [377]:
import pickle

def save_pickle(model, filepath, save_name):
    pkl_filename = filepath + save_name + '.pkl'  

    with open(pkl_filename, 'wb') as model_pkl:
        pickle.dump(model, model_pkl)


def load_pickle(filepath, save_name):
    classification_pkl_filename = filepath + "/" + save_name + '.pkl'

    with open(classification_pkl_filename, 'rb') as classification_model_pkl:
        classification_model = pickle.load(classification_model_pkl)

    return classification_model

In [378]:
def load_all_digit_GMMHMM(filepath, filenames):
    """
    Returns:
        A dictionary containing loaded GMMHMM models, with digit labels as keys.
    """
    GMMHMMs = {} 
    for digit in filenames:
        print("Loading the digit {} GMMHMM".format(digit))
        current_digit_GMMHMM = load_pickle(filepath, str(digit))  # Load the GMMHMM model for the current digit
        GMMHMMs[str(digit)] = current_digit_GMMHMM  # Add the loaded model to the dictionary with the digit label as key
    return GMMHMMs


In [379]:
filepath = "models/"
filenames = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
GMMHMMS = load_all_digit_GMMHMM(filepath, filenames)

Loading the digit 0 GMMHMM
Loading the digit 1 GMMHMM
Loading the digit 2 GMMHMM
Loading the digit 3 GMMHMM
Loading the digit 4 GMMHMM
Loading the digit 5 GMMHMM
Loading the digit 6 GMMHMM
Loading the digit 7 GMMHMM
Loading the digit 8 GMMHMM
Loading the digit 9 GMMHMM


In [380]:
class LexNode:
    def __init__(self, val,word):
        self.val = val
        self.word= word
        self.children = []
        self.property = 0

In [381]:
class BuildLextree:
    def __init__(self, dic):
        """
        Initialize the BuildLextree class with a dictionary of words.

        Args:
            dic: A dictionary where keys are identifiers and values are Word objects.

        Returns:
            None
        """
        self.dic2words(dic)
        zeros = np.zeros([39])
        ones = np.ones([39])

        # Create a fine GMM with a single Gaussian component
        fine_GMM = mixInfo()
        fine_GMM.Gaussian_mean.append(zeros)
        fine_GMM.Gaussian_var.append(ones)
        fine_GMM.Gaussian_weight = [1]
        fine_GMM.Gaussian_mean = np.array(fine_GMM.Gaussian_mean)
        fine_GMM.Gaussian_var = np.array(fine_GMM.Gaussian_var)
        fine_GMM.Num_of_Gaussian = 1

        self.tree = LexNode(fine_GMM, "*")
        self.tree.property = 1

    def dic2words(self, dic):
        """
        Convert the input dictionary to a list of words and store transition costs.

        Args:
            dic: A dictionary where keys are identifiers and values are Word objects.

        Returns:
            None
        """
        self.words = []  
        self.keys = list(dic.keys())  
        self.transition_cost = {}  
        for key in self.keys:
            self.words.append(dic[key])
            self.transition_cost[key] = dic[key].hmm.transition_cost  # Store transition costs

    def append_lex_node(self, parent, child):
        """
        Append a child node to a parent node.

        Args:
            parent: The parent LexNode.
            child: The child LexNode to append.

        Returns:
            None
        """
        assert type(parent) is LexNode and type(child) is LexNode
        parent.children.append(child)  # Append the child node to the parent node

    def build_lextree(self):
        """
        Build the lextree from the list of words.

        Args:
            None

        Returns:
            None
        """
        for i in range(len(self.words)):
            word = self.words[i]  
            key = self.keys[i]  
            previous_node = LexNode(word.hmm.mix[0], key)  # Create a LexNode for the first mixture
            self.tree.children.append(previous_node)  # Append the node to the root
            for j in range(1, word.hmm.N):
                current_node = LexNode(word.hmm.mix[j], key)  # Create a LexNode for the current mixture
                previous_node.children.append(current_node)  
                previous_node = current_node  
            previous_node.children.append(self.tree)  # Append the root node to complete the loop
            previous_node.property = 2  


In [382]:
buildlextree = BuildLextree(GMMHMMS)
buildlextree.build_lextree()
lextree = buildlextree.tree
transition_cost = buildlextree.transition_cost

In [383]:
import copy

class ContinousSpeechRecognition():
    def __init__(self):
        self.lextree = None
        self.dist_fun = None
        
    def fit(self, lextree, transition_cost):
        self.lextree = lextree
        assert type(self.lextree) is LexNode
        self.nodes = []
        self.get_nodes(self.lextree)
        initial_nodes_idx = []  # Define initial_nodes_idx as a local variable
        for i in self.initial_nodes:
            initial_nodes_idx.append(self.nodes.index(i))
        self.transition_cost = transition_cost
        self.get_parent = {}
        self.get_children = {}
        n_nodes = len(self.nodes)
        self.word_ends = []
        for i in range(n_nodes):
            n = self.nodes[i]
            if n.property == 2:
                self.word_ends.append(i)
            self.get_children[i] = []
            if len(n.children) > 0:
                for child in n.children:
                    self.get_children[i].append(self.nodes.index(child))
                    self.get_parent[self.nodes.index(child)] = i


    def get_nodes(self, lexnode):
        """
        Extracts nodes, states, and initial nodes from the lexnode and stores them.

        Args:
            lexnode: The LexNode to process.

        Returns:
            None
        """
        # Initialize lists to store nodes, states, and initial nodes
        self.nodes = []
        self.states = []
        self.initial_nodes = []
        
        # Process children of the lexnode
        words = lexnode.children
        self.states.append(0)  # Initial state
        self.nodes.append(lexnode)  # Add the lexnode to the nodes list
        for word in words:
            state = 0
            current_GMM = word
            self.initial_nodes.append(current_GMM)  # Add current GMM to the initial nodes
            # Traverse through the children until reaching the end of a word
            while current_GMM.property != 2:
                state += 1
                self.states.append(state)
                self.nodes.append(current_GMM)
                current_GMM = current_GMM.children[0]  # Move to the next child
            state += 1
            self.states.append(state)
            self.nodes.append(current_GMM)  # Add the last GMM node


    def idx2words(self, result):
        """
        Converts indices to words based on stored nodes.

        Args:
            result: List of indices to convert.

        Returns:
            A string representing the words corresponding to the indices.
        """
        sentence = ""
        # Concatenate words corresponding to the indices
        for idx in result:
            sentence += self.nodes[idx].word
        return sentence


    def traceback4or7(self, z_level, c):
        """
        Performs traceback to determine the final result based on provided z-level and c values.

        Args:
            z_level: List of z-level values.
            c: The c value.

        Returns:
            The final result determined through traceback.
        """
        # Determine where to start the traceback (either at level 7 or level 4)
        if len(z_level) >= 7:
            min7 = min(z_level[6][self.word_ends, c])
            min4 = min(z_level[3][self.word_ends, c])
            if min7 < min4:
                start = 6
            else:
                start = 3
        else:
            start = 3

        final_result = ""
        # Perform traceback from the determined start level
        for i in range(start, -1, -1):
            current_digit, c = self._traceback(z_level[i], c)
            final_result = current_digit + final_result
        return final_result

    def traceback(self, z_matrix, c):
        """
        Performs traceback to determine the final result based on the provided z-matrix and c value.

        Args:
            z_matrix: The z-matrix containing costs.
            c: The c value.

        Returns:
            The final result determined through traceback.
        """
        final_result = ""
        # Perform traceback until c becomes 0
        while c > 0:
            current_digit, c = self._traceback(z_matrix, c)
            final_result = current_digit + final_result
        return final_result


    def _traceback(self, z_matrix, c):
        """
        Performs a single step of traceback using the provided z-matrix and c value.

        Args:
            z_matrix: The z-matrix containing costs.
            c: The c value.

        Returns:
            The current digit and updated c value after traceback.
        """
        # Find the index of the minimum cost
        min_idx = np.argmin(z_matrix[self.word_ends, c])
        r = self.word_ends[min_idx]
        # Traverse backward until reaching the start of the word or end of the input
        while r > 0 and c > 0:
            to_check = [z_matrix[r, c - 1], z_matrix[self.get_parent[r], c - 1]]
            track = np.argmin(to_check)
            if track == 0:
                c -= 1
            elif track == 1:
                c -= 1
                r = self.get_parent[r]
            else:
                r = self.get_parent[r]
        # Return the current word and updated c value
        return self.nodes[self.word_ends[min_idx]].word, c

    def digit_viterbi_47(self, data, loop_cost=300):
        """
        Perform digit recognition using the Viterbi algorithm with word-ending constraint.

        Args:
            data: The input data.
            loop_cost: The cost associated with looping.

        Returns:
            The recognized digit.
        """
        loop_cost = loop_cost
        zero39 = np.zeros([data.shape[1]])
        data = np.vstack([zero39, data])  # Add a row of zeros at the beginning
        n_cols = len(data)
        n_rows = len(self.nodes)
        costs = np.full([n_rows, n_cols], np.inf)  # Initialize cost matrix with infinity values
        mute = np.zeros(n_rows)  # Initialize an array of zeros

        # Initialize the initial cost matrix
        initial_cost = np.full_like(costs, np.inf)
        initial_cost[0, 0] = 0  # Set the cost of the initial node to 0

        y_level = [mute]
        z_level = [initial_cost]

        for c in range(1, n_cols):
            next_y_level = []
            for current_possible_choice in range(len(z_level)):
                z_matrix = z_level[current_possible_choice]
                current_nodes = y_level[current_possible_choice]
                next_to_check_nodes = np.zeros_like(mute)

                # Update the y level costs
                for r in range(1, n_rows):
                    distance = mixture_log_gaussian(self.nodes[r].val, data[c])

                    if current_nodes[r]:
                        to_check = [z_matrix[self.get_parent[r]][c - 1] +
                                    self.transition_cost[self.nodes[r].word][self.states[self.get_parent[r]]][
                                        self.states[r]]]
                    elif current_nodes[self.get_parent[r]]:
                        to_check = [z_matrix[r][c - 1] +
                                    self.transition_cost[self.nodes[r].word][self.states[r]][self.states[r]]]
                    elif current_nodes[self.get_parent[r]] and current_nodes[r]:
                        to_check = [np.inf]
                        next_to_check_nodes[r] = 1
                    else:
                        to_check = [z_matrix[r][c - 1] +
                                    self.transition_cost[self.nodes[r].word][self.states[r]][self.states[r]],
                                    z_matrix[self.get_parent[r]][c - 1] +
                                    self.transition_cost[self.nodes[r].word][self.states[self.get_parent[r]]][
                                        self.states[r]]]

                    z_matrix[r][c] = min(to_check) + distance
                    if distance > 500:  # Adjust this threshold as needed
                        next_to_check_nodes[r] = 1

                next_y_level.append(next_to_check_nodes)

                # Check if a new word can be formed
                min_idx = np.argmin(z_matrix[:, c])
                min_cost = min(z_matrix[:, c])
                if min_idx in self.word_ends:
                    if len(z_level) - 1 > current_possible_choice:
                        next_z = z_level[current_possible_choice + 1]
                        next_z[0, c] = min_cost + loop_cost
                    elif len(z_level) < 7:
                        new_z_matrix = np.full_like(costs, np.inf)
                        new_z_matrix[0, c] = min_cost + loop_cost
                        z_level.append(new_z_matrix)
                        next_y_level.append(np.zeros_like(mute))

            y_level = next_y_level

        final_result = self.traceback4or7(z_level, c)
        print("final_result is {}".format(final_result))
        return final_result


    def digit_viterbi(self, data, threshold=400, loop_cost=300):
        # Prepend a row of zeros to the data to handle initial state.
        zero_row = np.zeros([data.shape[1]])
        data = np.vstack([zero_row, data])
        
        # Initialize the cost matrix (trellis).
        n_cols = len(data)
        n_rows = len(self.nodes)
        trellis = np.full([n_rows, n_cols], np.inf)  # Use np.inf for initially unreachable states.
        trellis[0][0] = 0  # The start state has zero cost.

        # Define a pruning function to eliminate paths exceeding a certain threshold.
        def pruning(column, threshold):
            best = min(column)
            for i in range(len(column)):
                if column[i] > best + threshold:
                    column[i] = np.inf

        # Iterate through each column (time step) in the data.
        for c in range(1, n_cols):
            # Prune paths to reduce computation for columns from the third onwards.
            if c >= 3:
                column = trellis[:, c - 1]
                pruning(column, threshold)
            
            # Update the cost for each state (row) based on the previous column.
            for r in range(1, n_rows):
                # Calculate the distance using a Gaussian mixture model.
                distance = mixture_log_gaussian(self.nodes[r].val, data[c])

                # Evaluate the cost of staying in the same state vs. transitioning from the parent state.
                to_check = [
                    trellis[r][c - 1] + self.transition_cost[self.nodes[r].word][self.states[r]][self.states[r]],  # Self-transition
                    trellis[self.get_parent[r]][c - 1] + self.transition_cost[self.nodes[r].word][self.states[self.get_parent[r]]][self.states[r]]  # Transition from parent
                ]
                
                # Update the trellis if any of the paths are viable.
                if not min(to_check) == np.inf:
                    trellis[r][c] = min(to_check) + distance

            # Check if a new word can start by examining the minimum cost in the current column.
            min_idx = np.argmin(trellis[:, c])
            min_cost = min(trellis[:, c])
            if min_idx in self.word_ends and min_cost != np.inf:
                trellis[0, c] = min_cost + loop_cost  # Allow starting a new word with an additional loop cost.

        # Traceback from the final column to find the optimal path.
        final_result = self.traceback(trellis, c)
        print("final_result is {}".format(final_result))
        return final_result


In [384]:
csr = ContinousSpeechRecognition()
csr.fit(lextree,transition_cost)

# Problem 1

In [385]:
import os

file_folder = "test_data/problem1/"
wavefiles = os.listdir(file_folder)

for wavefile in wavefiles:
    digit = wavefile[:-4]
    data = getMFCC2(file_folder + wavefile)
    digit_result = csr.digit_viterbi_47(data)
    print("Recognize {} as {}".format(digit,digit_result))

final_result is 2347575
Recognize 2347895 as 2347575
final_result is 2212
Recognize 2212 as 2212
final_result is 6575
Recognize 5678 as 6575
final_result is 6398
Recognize 6398 as 6398
final_result is 0751
Recognize 3785 as 0751
final_result is 6181
Recognize 6789 as 6181
final_result is 1324711
Recognize 3247895 as 1324711
final_result is 3657855
Recognize 3657895 as 3657855
final_result is 1234567
Recognize 1234567 as 1234567
final_result is 1647511
Recognize 8647895 as 1647511
final_result is 9395
Recognize 1398 as 9395
final_result is 1399
Recognize 1399 as 1399
final_result is 1521755
Recognize 3217895 as 1521755
final_result is 4511
Recognize 4391 as 4511
final_result is 6657511
Recognize 8657895 as 6657511
final_result is 1399
Recognize 7398 as 1399
final_result is 2345678
Recognize 2345678 as 2345678
final_result is 9395
Recognize 5398 as 9395
final_result is 9399
Recognize 9398 as 9399


# Problem 2

In [386]:
file_folder = "test_data/problem2/"
wavefiles = os.listdir(file_folder)

for wavefile in wavefiles:
    digit = wavefile[:-4]
    data = getMFCC2(file_folder + wavefile)
    digit_result = csr.digit_viterbi(data)
    print("Recognize {} as {}".format(digit, digit_result))

final_result is 85555
Recognize 55555 as 85555
final_result is 7343332190371
Recognize 7343332190377 as 7343332190371
final_result is 101385
Recognize 911385 as 101385
final_result is 6800372344
Recognize 6890372344 as 6800372344
final_result is 721814547124
Recognize 72184347924 as 721814547124
final_result is 29675543
Recognize 25678543 as 29675543
final_result is 923456
Recognize 123456 as 923456
final_result is 28212776342
Recognize 8212176342 as 28212776342
final_result is 37274121
Recognize 37274921 as 37274121
final_result is 0825414052002
Recognize 826414052002 as 0825414052002
