# In [None]:
import numpy as np
import pandas as pd

# Part 1: Define HMM Parameters
# Hidden states: E = exon, 5 = splice site, I = intron.
states = list('E5I')
# Observable symbols, in the column order used by the emission matrix.
nucleotides = list('ACGT')

# Transition probabilities: row = state being left, column = state entered.
#   E mostly stays in E (0.90) and otherwise moves to 5 or to I (0.05 each);
#   5 always moves on to I;
#   I always stays in I.
transition_matrix = pd.DataFrame(
    np.array([
        [0.90, 0.05, 0.05],  # from E: to E, to 5, to I
        [0.00, 0.00, 1.00],  # from 5: to E, to 5, to I
        [0.00, 0.00, 1.00],  # from I: to E, to 5, to I
    ]),
    index=states,
    columns=states,
)

# Emission probabilities: row = emitting state, column = nucleotide.
#   E emits all four nucleotides equally, 5 favours G, I favours A and T.
emission_matrix = pd.DataFrame(
    np.array([
        [0.25, 0.25, 0.25, 0.25],  # E: uniform
        [0.20, 0.20, 0.40, 0.20],  # 5: G-rich
        [0.30, 0.20, 0.20, 0.30],  # I: A/T-rich
    ]),
    index=states,
    columns=nucleotides,
)

# Part 2: Function to Compute Log Probability of a Given Path
def get_log_prob_of_a_given_path(path, sequence):
    """Return the natural-log joint probability of a state path and a sequence.

    The score is the log of a uniform initial probability
    (``1 / len(states)``), plus the log transition probability for every
    consecutive pair of states, plus the log emission probability of every
    symbol from its state. Probabilities are read from the module-level
    ``transition_matrix`` and ``emission_matrix``.

    Args:
        path: State labels, one per position (e.g. ``"EE5I"``).
        sequence: Observed symbols, one per position; each symbol is
            upper-cased before lookup.

    Returns:
        The log probability rounded to two decimals, or ``-np.inf`` as soon
        as a step has a zero transition probability, a symbol that is not in
        ``nucleotides``, or a zero emission probability.

    Raises:
        ValueError: If ``path`` and ``sequence`` differ in length.
    """
    if len(path) != len(sequence):
        raise ValueError("Path and sequence must have the same length")

    total = 0.0
    previous = None  # state visited at the preceding position, once there is one
    for position, (state, symbol) in enumerate(zip(path, sequence)):
        base = symbol.upper()

        if position:
            # Every step after the first pays for the move previous -> state.
            move_prob = transition_matrix.loc[previous, state]
            if move_prob == 0:
                return -np.inf  # Invalid transition
            total += np.log(move_prob)
        else:
            # First step: uniform prior over the states.
            total += np.log(1.0 / len(states))

        # Emission of the observed base from the current state.
        if base not in nucleotides:
            return -np.inf  # Invalid nucleotide
        base_prob = emission_matrix.loc[state, base]
        if base_prob == 0:
            return -np.inf  # Invalid emission
        total += np.log(base_prob)

        previous = state

    return round(total, 2)

# Part 3: Viterbi Algorithm
def viterbi_algorithm(sequence):
    """Find the most likely hidden-state path for a nucleotide sequence.

    Runs the Viterbi recursion in log space using the module-level
    ``states``, ``nucleotides``, ``transition_matrix`` and
    ``emission_matrix``, with a uniform initial distribution over states.

    Args:
        sequence: Observed symbols as a string; it is upper-cased before use.
            A symbol that is not in ``nucleotides`` is given the same small
            emission probability (1e-10) in every state, so it lowers the
            score without favouring any particular state.

    Returns:
        A ``(path, log_prob)`` tuple. ``path`` is the state labels of the
        best path joined into one string, and ``log_prob`` is the natural-log
        joint probability of that path and the sequence (not rounded). An
        empty sequence yields ``('', 0.0)``.
    """
    sequence = sequence.upper()
    T = len(sequence)
    N = len(states)

    # Nothing to decode: the empty path has probability 1 (log 0.0). Without
    # this guard the initialisation below would index sequence[0] and fail.
    if T == 0:
        return '', 0.0

    unknown_emit_prob = 1e-10  # stand-in emission for symbols outside `nucleotides`

    # Pull the model out of pandas once; label-based .loc lookups inside the
    # recursion are loop-invariant and dominate the running time otherwise.
    trans = transition_matrix.loc[states, states].to_numpy(dtype=float)
    emit = emission_matrix.loc[states, nucleotides].to_numpy(dtype=float)

    # Zero-probability transitions become -inf so they can never be chosen;
    # `where=` skips them, which also avoids log(0) warnings.
    log_trans = np.full(trans.shape, -np.inf)
    np.log(trans, out=log_trans, where=trans > 0)

    # emit_probs[s, t]: probability that state s emits sequence[t].
    column_of = {nuc: j for j, nuc in enumerate(nucleotides)}
    emit_probs = np.full((N, T), unknown_emit_prob)
    for t, symbol in enumerate(sequence):
        j = column_of.get(symbol)
        if j is not None:
            emit_probs[:, t] = emit[:, j]
    log_emit = np.full(emit_probs.shape, -np.inf)
    np.log(emit_probs, out=log_emit, where=emit_probs > 0)

    # V[s, t]: best log probability of any path ending in state s at time t.
    # backpointer[s, t]: index of the state at t-1 on that best path.
    V = np.zeros((N, T))
    backpointer = np.zeros((N, T), dtype=int)

    # Initialization: uniform prior times the first emission.
    init_prob = 1.0 / N
    with np.errstate(divide='ignore'):
        V[:, 0] = np.log(init_prob * emit_probs[:, 0])

    # Recursion
    for t in range(1, T):
        # candidates[i, j]: best score ending in state i at t-1, then i -> j.
        candidates = V[:, t - 1, np.newaxis] + log_trans
        # argmax returns the first maximum, i.e. the lowest state index on ties.
        backpointer[:, t] = candidates.argmax(axis=0)
        V[:, t] = candidates.max(axis=0) + log_emit[:, t]

    # Termination
    last_state_idx = int(V[:, -1].argmax())
    max_final_prob = V[last_state_idx, -1]

    # Backtracking: follow the backpointers from the last column to the first.
    path = [states[last_state_idx]]
    for t in range(T - 1, 0, -1):
        last_state_idx = backpointer[last_state_idx, t]
        path.append(states[last_state_idx])

    return ''.join(reversed(path)), max_final_prob

# Main execution
def main():
    """Score a fixed example path, then decode the same sequence with Viterbi.

    Prints the log probability of the hand-written path, the Viterbi path,
    and the Viterbi log probability rounded to two decimals.
    """
    example_sequence = "CTTCATGTGAAAGCAGACGTAAGTCA"
    example_path = "EEEEEEEEEEEEEEEEEE5IIIIIII"

    # Score the hand-written annotation of the example sequence.
    path_score = get_log_prob_of_a_given_path(example_path, example_sequence)
    print(f"Log probability of path {example_path}: {path_score}")

    # Decode the same sequence and report the best path and its score.
    best_path, best_score = viterbi_algorithm(example_sequence)
    print(f"Most likely path: {best_path}")
    print(f"Log probability of most likely path: {round(best_score, 2)}")


# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    main()