In [8]:
import numpy as np
import math

# Nucleotides and states
nucs = ['A', 'C', 'G', 'T']
states = ['E', '5', 'I']

# Initial probabilities
init_prob = {'E': 1.0, '5': 0.0, 'I': 0.0}

# Transition matrix
trans_prob = {
    'Start': {'E': 1.0, '5': 0.0, 'I': 0.0, 'End': 0.0},
    'E': {'E': 0.9, '5': 0.1, 'I': 0.0, 'End': 0.0},
    '5': {'E': 0.0, '5': 0.0, 'I': 1.0, 'End': 0.0},
    'I': {'E': 0.0, '5': 0.0, 'I': 0.9, 'End': 0.1}
}

# Emission probabilities
emit_prob = {
    'E': {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25},
    '5': {'A': 0.05, 'C': 0.00, 'G': 0.95, 'T': 0.00},
    'I': {'A': 0.4, 'C': 0.1, 'G': 0.1, 'T': 0.4},
}

# Logarithm function with protection for zero
def log(x):
    return -math.inf if x == 0 else math.log(x)

# Log probability calculation
def log_prob(path, seq):
    log_prob_value = 0.0
    if len(path) != len(seq):
        raise ValueError("The length of observed and state sequences must be the same")
    
    prev_state = 'Start'
    
    for i in range(len(seq)):
        curr_state = path[i]
        observed_nuc = seq[i]
        log_prob_value += log(trans_prob[prev_state][curr_state]) + log(emit_prob[curr_state][observed_nuc])
        prev_state = curr_state
        
    if prev_state == 'I':
        log_prob_value += log(trans_prob[prev_state]['End'])

    return log_prob_value
STATE_PATH = "EEEEEEEEEEEEEEEEEE5IIIIIII"
OBSERVED_SEQ = "CTTCATGTGAAAGCAGACGTAAGTCA"
ans = log_prob(STATE_PATH, OBSERVED_SEQ)

print(ans)
def viterbi(observed_seq):
    num_states = len(states)
    num_obs = len(observed_seq)

    V = np.full((num_states, num_obs), -np.inf)
    backtrack = np.zeros((num_states, num_obs), dtype=int)

    state_index = {s: i for i, s in enumerate(states)}

    for i, s in enumerate(states):
        V[i, 0] = log(init_prob[s]) + log(emit_prob[s][observed_seq[0]])

    for t in range(1, num_obs):
        for curr_i, curr_state in enumerate(states):
            max_prob = -np.inf
            best_prev = 0
            for prev_i, prev_state in enumerate(states):
                prob = V[prev_i, t - 1] + log(trans_prob[prev_state][curr_state])
                if prob > max_prob:
                    max_prob = prob
                    best_prev = prev_i
            V[curr_i, t] = max_prob + log(emit_prob[curr_state][observed_seq[t]])
            backtrack[curr_i, t] = best_prev

    path = []
    last_state = np.argmax(V[:, -1])
    path.append(states[last_state])

    for t in range(num_obs - 1, 0, -1):
        last_state = backtrack[last_state, t]
        path.insert(0, states[last_state])

    return path, np.max(V[:, -1])
obs_seq = "CTTCATGTGAAAGCAGACGTAAGTCA"
path, prob = viterbi(obs_seq)

print("Most likely state path:", path)
print("Log probability:", prob)


-41.21967768602254
Most likely state path: ['E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E']
Log probability: -38.677666280562796
