In [1]:
import numpy as np
import math

class HMMState:
    """
    Represents a state in a Hidden Markov Model with Gaussian emissions.

    Attributes:
        mean (np.ndarray): Mean vector of the emission Gaussian, shape (d,), or a
            stack of K component means, shape (K, d), for an equal-weight mixture.
        covariance (np.ndarray): Covariance matrix, shape (d, d), or a stack of K
            matrices, shape (K, d, d). A 1-D array is read as the diagonal of the matrix.
        label (int, optional): Associated digit with the state. None if it's an initial state.
        parent (HMMState, optional): Parent state. None if it's an initial state.
    """
    def __init__(self, mean: np.ndarray, covariance: np.ndarray, label: int = None, parent: "HMMState" = None):
        """
        Initializes the HMMState instance.

        Parameters:
            mean (np.ndarray): Mean vector(s) of the Gaussian emission(s).
            covariance (np.ndarray): Covariance matrix/matrices of the Gaussian emission(s).
            label (int, optional): Associated digit with the state.
            parent (HMMState, optional): Parent state.
        """
        self.mean = mean
        self.covariance = covariance
        self.label = label
        self.parent = parent

    def __hash__(self):
        """
        Returns a hash value for the state based on its label.

        Unlabeled states fall back to the identity-based hash: `__hash__` must
        return an integer, so returning a `None` label would raise TypeError the
        first time such a state is used as a dictionary key.
        """
        label = getattr(self, "label", None)
        if label is None:
            return super().__hash__()
        return hash(label)

    @staticmethod
    def _log_gaussian_pdf(x, mean, covariance, epsilon):
        """Log-density of one Gaussian N(mean, covariance + epsilon * I) evaluated at x."""
        x = np.asarray(x, dtype=float)
        diff = x - np.asarray(mean, dtype=float)
        dim = x.shape[0]

        cov = np.asarray(covariance, dtype=float)
        if cov.ndim == 1:
            # A vector can only describe the diagonal of a d x d covariance.
            cov = np.diag(cov)
        cov_safe = cov + epsilon * np.eye(dim)

        # slogdet works in log space; log(det(.)) under/overflows for
        # high-dimensional feature vectors.
        sign, log_det_cov = np.linalg.slogdet(cov_safe)
        if sign <= 0:
            # Not positive definite: no valid density, so treat x as impossible.
            return -np.inf

        const_term = -0.5 * dim * np.log(2 * np.pi)
        # Solve the linear system instead of forming the explicit inverse.
        quadratic_term = -0.5 * np.dot(diff.T, np.linalg.solve(cov_safe, diff))
        return const_term - 0.5 * log_det_cov + quadratic_term

    def log_multivariate_gaussian_pdf_diag_cov(self, x: np.ndarray, epsilon: float = 1e-9) -> float:
        """
        Calculates the log PDF of the state's (single) multivariate Gaussian at x.

        Despite the name, the covariance is handled as a full matrix; a diagonal
        matrix is simply a special case.

        Parameters:
            x (np.ndarray): The input vector, shape (d,).
            epsilon (float): Small value added to the diagonal of the covariance matrix for numerical stability.

        Returns:
            float: The log probability density function value
            (-inf when the regularised covariance is not positive definite).
        """
        return self._log_gaussian_pdf(x, self.mean, self.covariance, epsilon)

    def get_log_emission_prob(self, observation: np.ndarray) -> float:
        """
        Calculates the log emission probability of an observation under an
        equal-weight mixture of the state's Gaussian components.

        Parameters:
            observation (np.ndarray): The observed data vector, shape (d,).

        Returns:
            float: The log emission probability; -inf if the state has no
            emission parameters (a non-emitting state).

        Raises:
            ValueError: If the number of means and covariances cannot be matched.
        """
        means = np.asarray(self.mean, dtype=float)
        covariances = np.asarray(self.covariance, dtype=float)
        if means.size == 0 or covariances.size == 0:
            return -np.inf  # This state does not emit any observations.

        means = np.atleast_2d(means)
        if covariances.ndim < 3:
            # A single covariance (matrix or diagonal vector) shared by all components.
            covariances = covariances[np.newaxis, ...]
        if len(covariances) == 1 and len(means) > 1:
            covariances = np.repeat(covariances, len(means), axis=0)
        if len(covariances) != len(means):
            raise ValueError(
                f"got {len(means)} component means but {len(covariances)} covariances"
            )

        log_probs = np.array([
            self._log_gaussian_pdf(observation, component_mean, component_cov, 1e-9)
            for component_mean, component_cov in zip(means, covariances)
        ], dtype=float)

        max_log_prob = log_probs.max()
        if not np.isfinite(max_log_prob):
            # Avoid (-inf) - (-inf) = nan inside the log-sum-exp below.
            return max_log_prob

        # log-sum-exp over components, then divide by the number of components.
        log_sum = np.log(np.sum(np.exp(log_probs - max_log_prob))) + max_log_prob
        return log_sum - np.log(len(log_probs))


In [2]:
# Sanity check of the Gaussian log-density on a small diagonal-covariance example.
# HMMState documents its parameters as np.ndarray (and get_log_emission_prob reads
# `.size` on them), so build arrays here rather than plain Python lists.
mean = np.array([1, 2, 3])
covar = np.array([[1, 0, 0], [0, 2, 0], [0, 0, 3]])

state = HMMState(mean, covar)

# Only the second coordinate deviates from the mean (by 18, with variance 2).
observation = np.array([1, 20, 3])
log_pdf = state.log_multivariate_gaussian_pdf_diag_cov(observation)

print(log_pdf)

-84.65269529464472


In [3]:
import os
from typing import List, Dict

def hmm_load_features(data_dir: str) -> list:
    """
    Loads feature data from .npy files within a given directory, each file representing a sample.

    Only files named `<label>-<anything>.npy` (exactly one '-') are considered;
    `<label>` must parse as an integer. Files are visited in sorted name order so
    that the returned list (and any index into it) is reproducible across runs
    and file systems — `os.listdir` itself returns entries in arbitrary order.

    Loading is best-effort: a file whose label is not an integer or whose
    contents cannot be read is reported and skipped, without discarding the
    files that follow it. An unreadable directory is reported and yields [].

    Parameters:
        data_dir (str): The path to the directory containing the .npy files.

    Returns:
        list: A list of dictionaries, where each dictionary contains 'label' as an integer and 'features' as a numpy array.
    """
    samples = []
    try:
        file_names = sorted(os.listdir(data_dir))
    except OSError as e:
        print(f"Error loading features: {e}")
        return samples

    for file_name in file_names:
        if not file_name.endswith('.npy'):
            continue
        parts = file_name.split('-')
        if len(parts) != 2:
            continue
        try:
            label = int(parts[0])
            features = np.load(os.path.join(data_dir, file_name))
        except (ValueError, OSError, EOFError) as e:
            # Skip just this file; one bad sample must not truncate the data set.
            print(f"Error loading features: {e}")
            continue
        samples.append({'label': label, 'features': features})
    return samples

def filter_samples_by_label(samples: List[Dict], label: int) -> List[np.ndarray]:
    """
    Collects the feature arrays of every sample carrying the requested label.

    Parameters:
        samples (List[Dict]): Sample dictionaries, each holding an integer under 'label' and a numpy array under 'features'.
        label (int): The label to keep.

    Returns:
        List[np.ndarray]: The 'features' entries of the matching samples, in their original order.
    """
    matching_features = []
    for entry in samples:
        if entry['label'] != label:
            continue
        matching_features.append(entry["features"])
    return matching_features



In [4]:
from typing import List, Dict
import numpy as np

class HMM:
    """
    A Hidden Markov Model (HMM) with Gaussian emissions for sequence modeling.

    This class represents a Hidden Markov Model designed for analyzing sequences of observations.
    It uses Gaussian distributions to model the emissions from each state, allowing for continuous observation spaces.
    Training uses hard (Viterbi) alignments: every template is aligned to the states, the frames
    assigned to each state are pooled, and the state parameters are re-estimated from the pools.

    Attributes:
        states (List[HMMState]): The states of the HMM.
        templates (List[np.ndarray]): Training sequences for this model's label.
        num_states (int): Number of states (kept in sync by add_state/initialize).
        transitions (np.ndarray): Transition probability matrix between states.
        observations (List[np.ndarray]): List of observed sequences.
        state_index (Dict[HMMState, int]): Mapping of states to their indices in the transition matrix.
        initial_probabilities (List[float]): Probability of starting in each state.
    """
    def __init__(self, label, training_folder_path='../training'):
        """
        Initializes the HMM with a specific label and loads training data from a specified folder path.
        Args:
            label (str): The label associated with this HMM.
            training_folder_path (str, optional): The path to the folder containing the training data. Defaults to '../training'.
        """
        self.states: List["HMMState"] = []
        data = hmm_load_features(training_folder_path)
        # Only the sequences recorded for this label are kept as training templates.
        self.templates = filter_samples_by_label(data, label)
        self.num_states = self.get_num_state()
        self.observations: List[np.ndarray] = []
        self.transitions: np.ndarray = np.zeros((len(self.states), len(self.states)))
        self.state_index: Dict["HMMState", int] = {}
        self.initial_probabilities: List[float] = []  # Probability of starting in each state

    def calculate_mean_and_covariance(self, vectors):
        """
        Calculates the mean vector and covariance matrix for a given list of vectors.

        Args:
            vectors (List[np.ndarray]): A list of observation vectors (one row per observation).

        Returns:
            tuple: A tuple containing the mean vector and covariance matrix.
        """
        # Force a float array: object-dtype input (e.g. from older cluster
        # arrays) cannot be fed to np.linalg routines downstream.
        vectors_np = np.asarray(vectors, dtype=float)
        mean_vector = np.mean(vectors_np, axis=0)
        covariance_matrix = np.cov(vectors_np.T)
        return mean_vector, covariance_matrix

    def normalize_sequence(self, seq):
        """
        Normalizes a sequence by maintaining increasing trends and filtering out short-term decreases.

        An element smaller than the last kept element is dropped unless the
        element after it is at least as large as it.

        Args:
            seq (List[float]): The sequence to normalize.

        Returns:
            List[float]: The normalized sequence (possibly shorter than the input).
        """
        if len(seq) == 0:
            return seq  # Return empty input unchanged

        normalized_seq = [seq[0]]  # Start with the first element

        for i in range(1, len(seq)):
            current = seq[i]
            previous = normalized_seq[-1]
            if current >= previous:
                normalized_seq.append(current)
            elif i + 1 < len(seq) and seq[i + 1] >= current:
                normalized_seq.append(current)
        return normalized_seq

    def print_status(self):
        """
        Prints a comprehensive status report of the HMM, including the number of states, observation details,
        the transition matrix, state index mapping, and initial probabilities.
        """
        print("HMM Status Report")
        print("=================")
        print(f"Number of States: {len(self.states)}")

        # Printing state index mapping
        print(f"State Index Map: {len(self.state_index)} entries")
        for state, index in self.state_index.items():
            print(f"  State {state} -> Index {index}")

        # Optionally, print details about each state
        for i, state in enumerate(self.states):
            print(f"  State {i}: Mean {state.mean}, Covariance shape {np.shape(state.covariance)}")

        print(f"Number of Observations: {len(self.observations)}")
        for i, obs in enumerate(self.observations):
            print(f"  Observation {i}: Shape {obs.shape}")

        # The truth value of a multi-element ndarray is ambiguous (ValueError),
        # so test for emptiness explicitly.
        if np.size(self.transitions):
            print(f"Transition Matrix: {len(self.transitions)}x{len(self.transitions[0])}")
        else:
            print("Transition Matrix: Not defined")
        for i, row in enumerate(self.transitions):
            print(f"  Transition from State {i}: {row}")

        print(f"Initial Probabilities: {self.initial_probabilities}")

    def add_state(self, state: "HMMState"):
        """
        Adds a new state to the HMM model, updating the transition matrix and state index accordingly.

        The existing transition probabilities are preserved; the new row and
        column are zero. Initial probabilities are reset to uniform.

        Args:
            state (HMMState): The state to be added to the HMM.
        """
        self.states.append(state)
        self.state_index[state] = len(self.states) - 1  # Assign index to the new state
        self.num_states = len(self.states)

        new_transitions = np.zeros((len(self.states), len(self.states)))
        new_transitions[:-1, :-1] = self.transitions
        self.transitions = new_transitions

        self.initial_probabilities = [1.0 / len(self.states) for _ in self.states]

    def _load_clusters(self, label, training_folder_path, num_states):
        """Loads the templates for `label`, splits each evenly into `num_states` segments and pools them per segment."""
        data = hmm_load_features(training_folder_path)
        templates_for_label = filter_samples_by_label(data, label)
        segmented_templates = self.initial_segmentation(templates_for_label, num_states)
        clusters = self.get_clusters(segmented_templates, num_states)
        return templates_for_label, clusters

    def _estimate_state_parameters(self, clusters, num_states):
        """Computes one (mean, covariance) pair per state from its pooled frames; raises ValueError on too little data."""
        means, covariances = [], []
        for index in range(num_states):
            cluster = clusters.get(index)
            if cluster is None or len(cluster) < 2:
                # A covariance estimate needs at least two observations.
                raise ValueError(
                    f"not enough training frames to initialise state {index} "
                    f"(need at least 2)"
                )
            mean_vector, covariance_matrix = self.calculate_mean_and_covariance(cluster)
            means.append(mean_vector)
            covariances.append(covariance_matrix)
        return means, covariances

    def _update_transitions(self, clusters, num_templates):
        """Re-estimates the left-to-right transition probabilities in place from the per-state frame counts."""
        num_states = len(self.states)
        for i in range(num_states):
            if i == num_states - 1:
                self.transitions[i][i] = 1.0  # Last state only points to itself
                continue
            num_frames = len(clusters[i]) if i in clusters else 0
            if num_frames == 0:
                continue  # No evidence for this state: keep the previous estimate.
            # Each template leaves state i once, so exits / frames estimates the
            # leave probability. Clamp so the self-loop can never go negative.
            leave_prob = min(1.0, num_templates / num_frames)
            self.transitions[i][i + 1] = leave_prob
            self.transitions[i][i] = 1 - leave_prob  # Probability of staying in the same state

    def initialize_HMM_states(self, label, training_folder_path='../training', num_states=5):
        """
        Initializes the HMM states by loading training data, segmenting it, and clustering
        to calculate mean vectors and covariance matrices for each cluster.

        This method performs an initial segmentation of the training data for a specific label into segments,
        clusters these segments, and then calculates the mean vector and covariance matrix for each cluster.
        It does not modify the model; `initialize` builds the states from the same estimates.

        Args:
            label (str): The label of the digit to initialize states for.
            training_folder_path (str, optional): The path to the folder containing the training data. Defaults to '../training'.
            num_states (int, optional): Number of segments/states to estimate. Defaults to 5.

        Returns:
            tuple: A tuple containing two lists; the first list contains mean vectors, and the second list contains covariance matrices for each state initialized.

        Raises:
            ValueError: If some state has fewer than two training frames.
        """
        _, clusters = self._load_clusters(label, training_folder_path, num_states)
        return self._estimate_state_parameters(clusters, num_states)

    def initialize(self, label, training_folder_path='../training', num_states=5):
        """
        Initializes the HMM by setting up its states with mean vectors and covariance matrices,
        initial probabilities, and transition probabilities.

        This method loads training data for a specific label, performs initial segmentation and clustering,
        and initializes HMM states based on the clusters. Any previously defined states are replaced.
        The resulting topology is left-to-right: the model starts in state 0 and each state can only
        stay or move to its successor.

        Args:
            label (str): The label of the digit to initialize states for.
            training_folder_path (str, optional): The path to the folder containing the training data. Defaults to '../training'.
            num_states (int, optional): The number of states for the HMM. Defaults to 5.

        Raises:
            ValueError: If some state has fewer than two training frames.
        """
        templates_for_label, clustered_data = self._load_clusters(label, training_folder_path, num_states)
        means, covariances = self._estimate_state_parameters(clustered_data, num_states)

        # Start from a clean slate so that calling initialize twice does not
        # duplicate states or leave stale index entries behind.
        self.states = []
        self.state_index = {}
        for i in range(num_states):
            new_state = HMMState(means[i], covariances[i], label=i)
            self.states.append(new_state)
            self.state_index[new_state] = i
        self.num_states = num_states

        self.initial_probabilities = [1.0 if i == 0 else 0.0 for i in range(num_states)]

        self.transitions = np.zeros((num_states, num_states))
        self._update_transitions(clustered_data, len(templates_for_label))

    def set_observations(self, observations: List[np.ndarray]):
        """Sets the sequence of observations for the HMM."""
        self.observations = observations

    @staticmethod
    def _safe_log(probabilities):
        """Element-wise log that maps non-positive probabilities to -inf without warnings."""
        probabilities = np.asarray(probabilities, dtype=float)
        logs = np.full(probabilities.shape, -np.inf)
        positive = probabilities > 0
        logs[positive] = np.log(probabilities[positive])
        return logs

    def _log_emissions(self, observation):
        """Log emission density of `observation` under every state, in `self.states` order."""
        values = np.array(
            [state.log_multivariate_gaussian_pdf_diag_cov(observation) for state in self.states],
            dtype=float,
        )
        # An undefined density can never win a comparison; treat it as impossible.
        values[np.isnan(values)] = -np.inf
        return values

    def most_probable_sequence(self, obs_seq):
        """
        Computes the most probable sequence of states for a given sequence of observations using the Viterbi algorithm.

        Emission densities are evaluated once per (time step, state) and the best
        path is recovered from back-pointers instead of copying paths at every step.
        Ties are resolved in favour of the state that comes first in `self.states`.

        Args:
            obs_seq (List[np.ndarray]): A list of observation vectors.

        Returns:
            tuple: A tuple containing the maximum log probability of the most probable sequence and the sequence of states itself.

        Raises:
            ValueError: If `obs_seq` is empty or the model has no states.
        """
        num_obs = len(obs_seq)
        num_states = len(self.states)
        if num_obs == 0:
            raise ValueError("obs_seq must contain at least one observation")
        if num_states == 0:
            raise ValueError("the HMM has no states; call initialize() or add_state() first")

        # Bring the parameters into `self.states` order via the index map.
        order = [self.state_index[state] for state in self.states]
        log_initial = self._safe_log(np.asarray(self.initial_probabilities, dtype=float)[order])
        log_transitions = self._safe_log(np.asarray(self.transitions, dtype=float)[np.ix_(order, order)])

        scores = log_initial + self._log_emissions(obs_seq[0])
        back_pointers = np.zeros((num_obs, num_states), dtype=int)
        columns = np.arange(num_states)

        # Dynamic programming forward pass for t > 0
        for t in range(1, num_obs):
            # candidates[prev, cur] = best score ending in prev, then prev -> cur
            candidates = scores[:, np.newaxis] + log_transitions
            best_prev = np.argmax(candidates, axis=0)
            back_pointers[t] = best_prev
            scores = candidates[best_prev, columns] + self._log_emissions(obs_seq[t])

        # Find the final state with the highest probability and trace back.
        position = int(np.argmax(scores))
        max_final_log_prob = float(scores[position])
        positions = [position]
        for t in range(num_obs - 1, 0, -1):
            position = int(back_pointers[t][position])
            positions.append(position)
        positions.reverse()

        return (max_final_log_prob, [self.states[p] for p in positions])

    def decode(self, obs_seq):
        """
        Returns the most probable state sequence for `obs_seq` as a list of state indices.

        Args:
            obs_seq (List[np.ndarray]): A list of observation vectors.

        Returns:
            List[int]: One state index (as in `state_index`) per observation.
        """
        _, state_sequence = self.most_probable_sequence(obs_seq)
        return [self.state_index[state] for state in state_sequence]

    def initial_segmentation(self, templates, num_segments):
        """
        Segments each template into a specified number of segments, distributing the observations evenly.

        This method divides each observation sequence (template) in the provided list into a fixed number of segments.
        When the length is not a multiple of `num_segments`, the leading segments receive one extra observation each.

        Args:
            templates (List[List[np.ndarray]]): A list of templates, where each template is a list of observation vectors.
            num_segments (int): The number of segments to divide each template into.

        Returns:
            List[List[List[np.ndarray]]]: A list of segmented templates, where each template is now a list of segments,
                                        and each segment is a list of observation vectors.
        """
        segmented_templates = []

        for template in templates:
            base_size, extra = divmod(len(template), num_segments)

            segments = []
            start_idx = 0
            for segment_number in range(num_segments):
                # The first `extra` segments absorb the remainder, one frame each.
                end_idx = start_idx + base_size + (1 if segment_number < extra else 0)
                segments.append(template[start_idx:end_idx])
                start_idx = end_idx

            segmented_templates.append(segments)

        return segmented_templates

    def get_clusters(self, segmented_templates, num_segments=5):
        """
        Clusters segments across all templates into a dictionary, where each key represents a segment index
        and its value is an array of all observations from all templates that fall into that segment.

        Empty segments are skipped, so a segment index that never receives data is absent from the result.

        Args:
            segmented_templates (List[List[List[np.ndarray]]]): A list of segmented templates.
            num_segments (int, optional): Segments with an index >= this value are ignored. Defaults to 5.

        Returns:
            dict: A dictionary where keys are segment indices and values are float arrays with one row per observation.
        """
        collected = {}

        for template in segmented_templates:
            for j, segment in enumerate(template):
                if j >= num_segments:
                    break
                if len(segment) == 0:
                    continue
                collected.setdefault(j, []).append(np.asarray(segment, dtype=float))

        # Concatenate once per cluster instead of re-copying on every append.
        return {j: np.concatenate(parts) for j, parts in collected.items()}

    def segment_based_on_indices(self, template, indices):
        """
        Segments a single template based on a list of indices, effectively slicing the template into segments.

        Each index is the position at which a new segment starts. With no indices
        the whole template forms a single segment.

        Args:
            template (List[np.ndarray]): The template to be segmented.
            indices (List[int]): A list of indices at which the template should be segmented.

        Returns:
            List[List[np.ndarray]]: A list of segments, where each segment is a list of observations.
        """
        if len(indices) == 0:
            # No split point: keep the data as one segment rather than dropping it.
            return [template]

        segmented_template = [template[:indices[0]]]
        for i in range(len(indices) - 1):
            segmented_template.append(template[indices[i]:indices[i + 1]])
        segmented_template.append(template[indices[-1]:])

        return segmented_template

    def get_num_state(self):
        """
        Returns the number of states currently defined in the HMM model.

        Returns:
            int: The number of states.
        """
        return len(self.states)

    def train_single_iteration(self):
        """
        Performs a single iteration of training on the HMM model using the templates.

        This method iterates through each template, computes the most probable state sequence, normalizes the sequence,
        and then calculates new mean and covariance values for each state based on the segmentation of the templates.
        It also updates the state transition probabilities based on the newly clustered data.
        States that receive fewer than two frames keep their previous parameters.

        Returns:
            float: The average score (log probability) of the most probable sequences for all templates.

        Raises:
            ValueError: If the model has no training templates.
        """
        if len(self.templates) == 0:
            raise ValueError("no training templates available for this HMM")

        score_total = []
        segmented_templates = []

        for template in self.templates:
            probability, state_sequence = self.most_probable_sequence(template)
            score_total.append(probability)
            # NOTE(review): segment k is assumed to belong to state k, which holds
            # for the left-to-right topology built by `initialize` (paths start in
            # state 0 and never skip or go back) — confirm for other topologies.
            normalized_sequence = self.normalize_sequence([self.state_index[state] for state in state_sequence])

            # A new segment starts at i + 1, the first frame carrying the new state;
            # splitting at i would hand the last frame of each run to the next state.
            boundaries = [
                i + 1
                for i in range(len(normalized_sequence) - 1)
                if normalized_sequence[i] != normalized_sequence[i + 1]
            ]
            segmented_templates.append(self.segment_based_on_indices(template, boundaries))

        score = np.mean(score_total)

        clustered_data = self.get_clusters(segmented_templates, len(self.states))

        for i, state in enumerate(self.states):
            cluster = clustered_data.get(i)
            if cluster is None or len(cluster) < 2:
                continue  # Too little data for a covariance estimate.
            state.mean, state.covariance = self.calculate_mean_and_covariance(cluster)

        self._update_transitions(clustered_data, len(self.templates))

        return score

    def train(self, iterations=10):
        """
        Trains the HMM model over a specified number of iterations.

        For each iteration, it performs a single iteration of training and prints the training score.

        Args:
            iterations (int, optional): The number of training iterations to perform. Defaults to 10.
        """
        for iteration in range(iterations):
            score = self.train_single_iteration()
            print(f'HMM training for iteration {iteration}, training score: {score}')

    def evaluate(self, sequences, labels):
        """
        Evaluate the HMM on a test set.
        Args:
            sequences (List[List[np.ndarray]]): A list of observation sequences.
            labels (List[List[int]]): The true state sequences for each observation sequence.
        Returns:
            float, float: The sentence accuracy and the word accuracy (0.0 when there is nothing to score).
        """
        correct_sentences = 0
        correct_words = 0
        total_words = 0
        num_sequences = 0

        for obs_seq, true_states in zip(sequences, labels):
            predicted_states = self.decode(obs_seq)
            num_sequences += 1

            if list(predicted_states) == list(true_states):
                correct_sentences += 1

            correct_words += sum(p == t for p, t in zip(predicted_states, true_states))
            total_words += len(true_states)

        sentence_accuracy = correct_sentences / num_sequences if num_sequences else 0.0
        word_accuracy = correct_words / total_words if total_words else 0.0

        return sentence_accuracy, word_accuracy



In [5]:
import pickle

def save_hmm(hmm, filename):
    """
    Serialize a trained Hidden Markov Model (HMM) to disk with pickle.

    The target file is opened in binary write mode, so an existing file with
    the same name is overwritten. A confirmation line is printed afterwards.

    Parameters:
    - hmm: The HMM object to save.
    - filename: The name of the file where the HMM should be saved.
    """
    with open(filename, 'wb') as handle:
        pickler = pickle.Pickler(handle)
        pickler.dump(hmm)
    print(f"HMM model has been saved to '{filename}'")
    
def load_hmm(filename):
    """
    Restore a Hidden Markov Model (HMM) previously written with pickle.

    Unpickling executes whatever the file instructs it to, so this must only
    be pointed at model files from a trusted source. A confirmation line is
    printed once the object has been read.

    Parameters:
    - filename: The name of the file from which to load the HMM.

    Returns:
    - The loaded HMM object.
    """
    with open(filename, 'rb') as handle:
        restored = pickle.Unpickler(handle).load()
    print(f"HMM model has been loaded from '{filename}'")
    return restored

In [6]:
# Training driver for the ten digit models. It is wrapped in a string literal, so
# nothing here is executed when the cell runs; the cell merely evaluates to (and
# echoes) the string. Presumably this was done to avoid retraining and overwriting
# the saved 'Digit <i> HMM' files — confirm before re-enabling.
# NOTE(review): HMM.initialize() already calls initialize_HMM_states() itself, so
# the explicit initialize_HMM_states(label=i) call below looks redundant.
"""
def train_all_HMM():
    all_label=range(0,10)
    for i in all_label:
        filename=f'Digit {i} HMM'
        print(f"Training {filename}")
        hmm=HMM(label=i)
        hmm.initialize_HMM_states(label=i)
        hmm.initialize(label=i)
        hmm.train(iterations=5)
        save_hmm(hmm, filename)
        print(f'{filename} training finished! Moving to the next.')
        
train_all_HMM()
"""


'\ndef train_all_HMM():\n    all_label=range(0,10)\n    for i in all_label:\n        filename=f\'Digit {i} HMM\'\n        print(f"Training {filename}")\n        hmm=HMM(label=i)\n        hmm.initialize_HMM_states(label=i)\n        hmm.initialize(label=i)\n        hmm.train(iterations=5)\n        save_hmm(hmm, filename)\n        print(f\'{filename} training finished! Moving to the next.\')\n        \ntrain_all_HMM()\n'

In [7]:
import math

def load_all_hmm():
    """
    Loads the saved HMM of every digit.

    For each digit d from 0 to 9 the model stored in the file 'Digit d HMM'
    is read with `load_hmm`; the models are returned in digit order, so the
    list position of a model equals the digit it represents.

    Returns:
        List: A list containing HMM models for digits 0 through 9.
    """
    return [load_hmm(f'Digit {digit} HMM') for digit in range(10)]

def recognize(hmms, data, digit):
    """
    Recognizes the digit from given data using a list of HMM models.

    This function scores the given data with the Viterbi log probability of each
    digit model and identifies the digit whose model scores highest as the
    recognized digit. On ties the lowest digit wins.

    Args:
        hmms (List): A list containing HMM models for digits 0 through 9.
        data: The data to be recognized/classified.
        digit (int): The true digit value for validation.

    Prints:
        The recognized digit and validation result.

    Returns:
        int: The recognized digit (the index of the best model in `hmms`),
        or -1 if no model produced a score above -inf.
    """
    p_max = -math.inf
    recognized_digit = -1
    for i, hmm in enumerate(hmms):
        # The method is spelled `most_probable_sequence`; the previous
        # `most_prabable_sequence` raised AttributeError on every call.
        probability, _ = hmm.most_probable_sequence(data)
        if probability > p_max:
            p_max = probability
            recognized_digit = i

    print(f"The voice is recognized as {recognized_digit}, the true value is {digit}.")
    if recognized_digit == digit:
        print("Congrats, you recognized the digit correctly.")
    else:
        print("Oops, it seems that you are wrong.")

    return recognized_digit



In [8]:
# Load the ten saved digit models; index i of the list is the model for digit i.
# NOTE(review): this unpickles 'Digit <i> HMM' files from the working directory,
# so they must have been produced beforehand (the training cell above is disabled).
hmm1 = load_all_hmm()

HMM model has been loaded from 'Digit 0 HMM'
HMM model has been loaded from 'Digit 1 HMM'
HMM model has been loaded from 'Digit 2 HMM'
HMM model has been loaded from 'Digit 3 HMM'
HMM model has been loaded from 'Digit 4 HMM'
HMM model has been loaded from 'Digit 5 HMM'
HMM model has been loaded from 'Digit 6 HMM'
HMM model has been loaded from 'Digit 7 HMM'
HMM model has been loaded from 'Digit 8 HMM'
HMM model has been loaded from 'Digit 9 HMM'


In [9]:

# Smoke test: classify one training recording of a known digit with the loaded models.
digit = 8
data = hmm_load_features('../training')
templates_for_digit = filter_samples_by_label(data, digit)

print(len(hmm1))
# The second recording of the digit is used as the probe utterance.
recognize(hmm1, templates_for_digit[1], digit)

10
The voice is recognized as 8, the true value is 8
Congrats, you recognized digit right


In [10]:
class Language_HMM:
    """
    Composite HMM that chains the per-digit HMMs into a digit-string model.

    The state list is laid out as: one null state, the states of hmms[2:]
    (digits 2-9), one null state, then six repetitions of (the states of all
    ten digit HMMs followed by one null state). The hard-coded offsets in
    `set_transitions` assume ten digit models with five states each.

    NOTE(review): this class looks unfinished. Nothing calls `get_all_states`
    or sets `initial_probabilities`, and several issues flagged inline depend
    on design intent that is not visible here. Only defects with an
    unambiguous fix have been corrected.

    Attributes:
        states (List[HMMState]): All states of the composite model.
        hmms (List): The per-digit HMMs, indexed by digit.
        observations (List[np.ndarray]): Observation sequences (unused here).
        transitions: Transition matrix, built by `initialize_transition`.
        state_index (Dict[HMMState, int]): Mapping from state to its position.
        initial_probabilities (List[float]): Start probability of each state.
    """
    def __init__(self, hmms):
        """
        Stores the digit models and creates empty model structures.

        Args:
            hmms (List): The per-digit HMMs, indexed by digit.
        """
        self.states: List[HMMState] = []
        self.hmms = hmms
        self.observations: List[np.ndarray] = []
        self.transitions: List[List[float]] = []
        self.state_index: Dict[HMMState, int] = {}
        self.initial_probabilities: List[float] = []
        # No states exist yet, so this yields an empty map; `get_all_states`
        # rebuilds it once the states are in place.
        self.set_state_index()

    def get_all_states(self, hmms):
        """
        Populates `self.states` with null states and the digit models' states.

        Args:
            hmms: Unused; the models passed to the constructor (`self.hmms`) are read instead.
        """
        # NOTE(review): the HMMState class defined in this notebook takes
        # (mean, covariance, label, parent) and has no `isNull` parameter, so
        # these constructor calls raise TypeError against it — confirm which
        # HMMState definition this class is meant to use.
        null_state = HMMState(isNull=True)
        self.states.append(null_state)
        # First position: digits 2-9 only.
        for hmm in self.hmms[2:]:
            for state in hmm.states:
                self.states.append(state)
        null_state = HMMState(isNull=True)
        self.states.append(null_state)
        # Six further positions, each accepting any of the ten digits.
        for i in range(1, 7):
            for hmm in self.hmms:
                for state in hmm.states:
                    self.states.append(state)
            null_state = HMMState(isNull=True)
            self.states.append(null_state)
        # The index map built in __init__ predates these states; refresh it.
        # NOTE(review): the same state objects are appended once per digit
        # position, so a state -> index dict can only remember the last
        # position of each object — the mapping is not one-to-one.
        self.set_state_index()

    def set_state_index(self):
        """Rebuilds `state_index` so that each state maps to its position in `self.states`."""
        for i in range(len(self.states)):
            self.state_index[self.states[i]] = i

    def initialize_transition(self):
        """Allocates a zeroed square transition matrix and fills it via `set_transitions`."""
        num_states = len(self.states)
        # np.zeros takes the shape as ONE tuple; passing the second dimension as a
        # separate argument makes numpy read it as the dtype and raise TypeError.
        self.transitions = np.zeros((num_states, num_states))
        self.set_transitions()

    def set_transitions(self):
        """
        Copies each digit model's internal transitions into the composite matrix
        and wires the blocks together through the null states.

        NOTE(review): the index arithmetic is reproduced unchanged because the
        intended grammar is not documented. Points to confirm:
          * the first block is written at rows 5*i+j although index 0 is a null
            state, which looks off by one relative to `null_states_indices`;
          * the final loop visits every state index, not only the null-state
            indices, and compares against the literal 1 instead of
            null_states_indices[1];
          * with the full state list, `i + 5*m + 1` can exceed the matrix size.
        """
        null_states_indices = [0, 41, 41+51, 41+51*2, 41+51*3, 41+51*4, 41+51*5, 41+51*6]
        # First digit position: models for digits 2-9.
        for i in range(0, 8):
            hmm = self.hmms[2+i]
            for j in range(5):
                if j != 4:
                    self.transitions[5*i+j][5*i+j+1] = hmm.transitions[j][j+1]
                    self.transitions[5*i+j][5*i+j] = hmm.transitions[j][j]
                else:
                    self.transitions[5*i+j][5*i+j] = hmm.transitions[j][j]
        # Remaining six digit positions: all ten models, offset by c.
        for k in range(1, 7):
            c = 42+51*(k-1)
            for i in range(0, 10):
                hmm = self.hmms[i]
                for j in range(5):
                    if j != 4:
                        self.transitions[5*i+j+c][5*i+j+c+1] = hmm.transitions[j][j+1]
                        self.transitions[5*i+j+c][5*i+j+c] = hmm.transitions[j][j]
                    else:
                        self.transitions[5*i+j+c][5*i+j+c] = hmm.transitions[j][j]

        # Connections into and out of the null states.
        for i in range(len(self.states)):
            if i == 0:
                for m in range(8):
                    self.transitions[i][i+5*m+1] = 1/9
            elif i == 1:
                for m in range(8):
                    self.transitions[i-5*m-1][i] = 1/8
            elif i == null_states_indices[-1]:
                for m in range(10):
                    self.transitions[i-5*m-1][i] = 1/10
            elif i == null_states_indices[3]:
                for m in range(10):
                    self.transitions[i-5*m-1][i] = 1/11
                    self.transitions[i][i+5*m+1] = 1/10
            else:
                for m in range(10):
                    self.transitions[i-5*m-1][i] = 1/10
                    self.transitions[i][i+5*m+1] = 1/10

    def most_probable_sequence(self, obs_seq):
        """
        Computes the most probable state sequence for `obs_seq` with the Viterbi algorithm.

        NOTE(review): every state, including the null states, is scored with an
        emission density here — confirm how non-emitting states are meant to be handled.

        Args:
            obs_seq: A sequence of observation vectors.

        Returns:
            tuple: (log probability of the best path, list of states on that path).
        """
        V = [{}]
        path = {}

        for state in self.states:
            index = self.state_index[state]
            initial_prob = self.initial_probabilities[index]
            V[0][index] = (math.log(initial_prob) if initial_prob > 0 else -math.inf) + state.log_multivariate_gaussian_pdf_diag_cov(obs_seq[0])
            path[index] = [state]

        # Run Viterbi for t > 0
        for t in range(1, len(obs_seq)):
            V.append({})
            newpath = {}
            for cur_state in self.states:
                cur_index = self.state_index[cur_state]
                # The emission does not depend on the predecessor: evaluate it
                # once per (t, state) instead of once per predecessor.
                log_emission = cur_state.log_multivariate_gaussian_pdf_diag_cov(obs_seq[t])
                max_log_prob = -math.inf  # Initialize with negative infinity for comparison
                best_prev_state = None  # Initialize with None to find the best previous state
                for prev_state in self.states:
                    prev_index = self.state_index[prev_state]
                    transition_prob = self.transitions[prev_index][cur_index]
                    log_transition_prob = math.log(transition_prob) if transition_prob > 0 else -math.inf
                    log_prob = V[t-1][prev_index] + log_transition_prob + log_emission
                    if log_prob > max_log_prob:
                        max_log_prob = log_prob
                        best_prev_state = prev_state
                V[t][cur_index] = max_log_prob
                if best_prev_state is not None:  # Check to ensure there is a valid previous state
                    newpath[cur_index] = path[self.state_index[best_prev_state]] + [cur_state]
            path = newpath

        # Find the final state with the highest probability
        max_final_log_prob = max(V[-1].values())
        final_state = [state for state, prob in V[-1].items() if prob == max_final_log_prob][0]

        return (max_final_log_prob, path[final_state])

In [11]:
import librosa
import numpy as np
import os

def compute_mfcc_features(file_path, n_mfcc=39):
    """
    Computes MFCCs plus their first- and second-order deltas for one audio file.

    The audio is read with librosa's default loading options. The three
    feature matrices are stacked vertically in the order MFCC, delta,
    delta-delta.

    NOTE(review): with the default n_mfcc=39 the stacked result has three
    times as many rows as `n_mfcc`; if 39-dimensional features are intended
    (13 coefficients plus deltas), confirm this default. Also presumably rows
    are feature dimensions and columns are frames — verify against the layout
    expected by the HMM code, which iterates over the first axis.

    Args:
        file_path: Path of the audio file to analyse.
        n_mfcc (int, optional): Number of MFCC coefficients to compute. Defaults to 39.

    Returns:
        np.ndarray: The vertically stacked MFCC, delta and delta-delta features.
    """
    signal, sample_rate = librosa.load(file_path)
    base_coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)
    first_order = librosa.feature.delta(base_coefficients)
    second_order = librosa.feature.delta(base_coefficients, order=2)
    return np.vstack([base_coefficients, first_order, second_order])

def process_folder(folder_path):
    """
    Extracts features for every .wav file directly inside a folder.

    Args:
        folder_path: Directory to scan (not searched recursively).

    Returns:
        dict: Maps each .wav file name to the feature array returned by
        `compute_mfcc_features` for that file; empty if there are no .wav files.
    """
    # Ensure processing only wav files
    wav_names = [name for name in os.listdir(folder_path) if name.endswith('.wav')]
    return {
        name: compute_mfcc_features(os.path.join(folder_path, name))
        for name in wav_names
    }

# Extract features for every .wav recording in the test folder. The path is
# relative to the notebook's working directory and points to a sibling
# 'Project5/problem1' folder one level up, not to the current directory.
folder_path = '../Project5/problem1'
features_dict = process_folder(folder_path)