In [16]:
import numpy as np
import math

# Define HMM parameters
states = ['E', '5', 'I']  # Exon, 5' splice site, Intron
nucleotides = ['A', 'C', 'G', 'T']
# Transition probabilities (log scale)
transition = {
    'E': {'E': math.log(0.9), '5': math.log(0.1), 'I': float('-inf')},
    '5': {'E': float('-inf'), '5': float('-inf'), 'I': math.log(1.0)},
    'I': {'E': float('-inf'), '5': float('-inf'), 'I': math.log(0.9)}
}
# Emission probabilities (log scale)
emission = {
    'E': {'A': math.log(0.25), 'C': math.log(0.25), 'G': math.log(0.25), 'T': math.log(0.25)},
    '5': {'A': math.log(0.05), 'C': float('-inf'), 'G': math.log(0.95), 'T': float('-inf')},
    'I': {'A': math.log(0.4), 'C': math.log(0.1), 'G': math.log(0.1), 'T': math.log(0.4)}
}
# Initial probabilities (log scale)
initial = {'E': math.log(1.0), '5': float('-inf'), 'I': float('-inf')}

def get_log_prob_of_a_given_path(path, sequence):
    """
    Calculate log probability of a given state path for an observed sequence.
    
    Args:
        path (str): State path (e.g., "EEEEEEEEEEEEEEEEEE5IIIIIII")
        sequence (str): Observed DNA sequence (e.g., "CTTCATGTGAAAGCAGACGTAAGTCA")
    
    Returns:
        float: Log probability of the path
    """
    if len(path) != len(sequence):
        raise ValueError("Path and sequence lengths must match")
    
    log_prob = 0.0
    for position in range(len(sequence)):
        if position == 0:
            if path[position] != 'E':
                return float('-inf')
            log_prob = emission[path[position]][sequence[position]]
        else:
            log_prob += transition[path[position-1]][path[position]] + emission[path[position]][sequence[position]]
    
    # Add terminal transition probability
    log_prob += math.log(0.1)
    
    return log_prob

def viterbi(sequence):
    """
    Implement Viterbi algorithm to find most likely state sequence.
    
    Args:
        sequence (str): Observed DNA sequence
    
    Returns:
        tuple: (most likely path, log probability)
    """
    N = len(sequence)
    S = len(states)
    
    # Initialize Viterbi and backpointer matrices
    viterbi = [{} for _ in range(N)]
    backpointer = [{} for _ in range(N)]
    
    # Initialization
    for state in states:
        viterbi[0][state] = initial[state] + emission[state][sequence[0]]
        backpointer[0][state] = None
    
    # Recursion
    for t in range(1, N):
        for state in states:
            max_prob = float('-inf')
            best_prev = None
            for prev_state in states:
                prob = viterbi[t-1][prev_state] + transition[prev_state][state] + emission[state][sequence[t]]
                if prob > max_prob:
                    max_prob = prob
                    best_prev = prev_state
            viterbi[t][state] = max_prob
            backpointer[t][state] = best_prev
    
    # Termination
    max_prob = float('-inf')
    best_final_state = None
    for state in states:
        prob = viterbi[N-1][state] + math.log(0.1)  # Add terminal transition
        if prob > max_prob:
            max_prob = prob
            best_final_state = state
    
    # Backtracking
    path = [best_final_state]
    for t in range(N-1, 0, -1):
        best_final_state = backpointer[t][best_final_state]
        path.insert(0, best_final_state)
    
    return ''.join(path), max_prob

# Test the functions
if __name__ == "__main__":
    test_path = "EEEEEEEEEEEEEEEEEE5IIIIIII"
    test_sequence = "CTTCATGTGAAAGCAGACGTAAGTCA"
    viterbi_sequence = "AGGCTCTCCATAAGG"
    
    # Calculate log probability for given path
    log_prob = get_log_prob_of_a_given_path(test_path, test_sequence)
    print(f"Log probability of sequence given state path: {log_prob:.2f}")
    
    # Find most likely path using Viterbi
    most_likely_path, path_prob = viterbi(viterbi_sequence)
    print(f"Most likely path: {most_likely_path}")
    print(f"Log probability of most likely path: {path_prob:.2f}")

Log probability of sequence given state path: -41.22
Most likely path: EEEEEEEEEEEEEEE
Log probability of most likely path: -24.57
