## Task ba10c

In [33]:
import numpy as np
import copy

In [138]:
def read_data(fname='rosalind_ba10c.txt'):
    """Parse an HMM decoding problem from a text file.

    The file is read strictly by position: the observed string, a separator
    line, the alphabet, a separator line, the state names, then two tables
    (transition, emission). Each table is preceded by a separator line and a
    column-header line, both of which are skipped, and has one row per state
    consisting of a leading label followed by the probabilities.

    Args:
        fname: path of the input file.

    Returns:
        Tuple ``(x, alphabet, states, transition, emission)`` where ``x`` is
        the observed string, ``alphabet`` and ``states`` are lists of tokens,
        and the two tables are ``np.matrix`` objects with one row per state.
    """
    def skip_lines(handle, count):
        # Separator and header lines carry no data we need.
        for _ in range(count):
            handle.readline()

    def read_table(handle, n_rows):
        # One probability row per state, in file order.
        parsed = []
        for _ in range(n_rows):
            fields = handle.readline().strip().split()
            # The leading label is not used (row order is taken from the
            # states line); indexing it still rejects a blank/missing row.
            _label = fields[0]
            parsed.append(np.array([float(token) for token in fields[1:]]))
        return parsed

    with open(fname, 'r') as handle:
        observed = handle.readline().strip()
        skip_lines(handle, 1)
        alphabet = handle.readline().strip().split()
        skip_lines(handle, 1)
        states = handle.readline().strip().split()
        skip_lines(handle, 2)  # separator + transition column header
        transition_rows = read_table(handle, len(states))
        skip_lines(handle, 2)  # separator + emission column header
        emission_rows = read_table(handle, len(states))

    return (observed, alphabet, states,
            np.matrix(transition_rows), np.matrix(emission_rows))

In [139]:
class hmm:
    """Hidden Markov model with a uniform initial state distribution.

    Attributes:
        states: sequence of hidden-state labels; row ``i`` of both tables
            belongs to ``states[i]``.
        alphabet: sequence of observable symbols; column ``k`` of
            ``emission`` belongs to ``alphabet[k]``.
        start: 1-D array holding the uniform start probability per state.
        transition: ``(n_states, n_states)`` table, row = source state,
            column = destination state. Stored exactly as passed in.
        emission: ``(n_states, n_symbols)`` table. Stored exactly as passed in.
    """

    def __init__(self, states, alphabet, transition, emission):
        self.states = states
        self.alphabet = alphabet
        # No start distribution is supplied, so every state is equally likely.
        self.start = np.array([1/len(states) for _ in range(len(states))])
        self.transition = transition
        self.emission = emission

    @staticmethod
    def _rescale(prob):
        """Normalise ``prob`` to sum to 1; leave it untouched if the sum is not positive."""
        total = prob.sum()
        # Rescaling only guards against underflow on long sequences; it does
        # not change any argmax. Skipping it for an all-zero vector avoids 0/0.
        return prob / total if total > 0 else prob

    def get_viterbi_path(self, x):
        """Return the most probable hidden-state path for the observations ``x``.

        Args:
            x: sequence (typically a string) of symbols from ``self.alphabet``.

        Returns:
            List with one label from ``self.states`` per observation; an empty
            list when ``x`` is empty. Ties are resolved in favour of the state
            with the lowest index.

        Raises:
            KeyError: if ``x`` contains a symbol that is not in the alphabet.
        """
        n_obs = len(x)
        if n_obs == 0:
            return []
        n_states = len(self.states)

        # Column index of every observable symbol in the emission table.
        symbol_index = {symbol: k for k, symbol in enumerate(self.alphabet)}

        # Work on plain 2-D/1-D ndarrays so the broadcasting below is explicit
        # and gives the same answer for np.matrix, ndarray or nested-list input.
        trans = np.asarray(self.transition, dtype=float)
        emit = np.asarray(self.emission, dtype=float)
        start = np.asarray(self.start, dtype=float).ravel()

        # prob[j]: (rescaled) probability of the best path ending in state j.
        prob = self._rescale(emit[:, symbol_index[x[0]]] * start)

        # backptr[t, j]: best predecessor of state j at position t (row 0 unused).
        backptr = np.zeros((n_obs, n_states), dtype=int)
        for t in range(1, n_obs):
            obs_idx = symbol_index[x[t]]
            # scores[j, k] = prob[k] * trans[k, j] * emit[j, obs]
            # (row = destination state j, column = source state k)
            scores = (prob[np.newaxis, :] * trans.T) * emit[:, obs_idx][:, np.newaxis]
            backptr[t] = scores.argmax(axis=1)
            prob = self._rescale(scores.max(axis=1))

        # Start from the best final state and follow the back-pointers.
        state = int(prob.argmax())
        reversed_path = [state]
        for t in range(n_obs - 1, 0, -1):
            state = int(backptr[t, state])
            reversed_path.append(state)

        return [self.states[idx] for idx in reversed(reversed_path)]

In [140]:
def main(fname='rosalind_ba10c.txt'):
    """Decode the observation string stored in ``fname`` and report the result.

    Reads the problem with ``read_data``, builds an ``hmm`` from the parsed
    tables, prints the decoded hidden path as a single string and returns
    that same string.
    """
    observed, alphabet, states, transition, emission = read_data(fname)
    decoder = hmm(states, alphabet, transition, emission)
    decoded = ''.join(decoder.get_viterbi_path(observed))
    print(decoded)
    return decoded

In [141]:
# Decode the first sample input. main() both prints and returns the path,
# which is why it appears twice in this cell's output.
main(fname='sample_data/ba10c/sample.txt')

AAABBAAAAA


'AAABBAAAAA'

In [142]:
# Decode the second sample input (printed once by main(), echoed once as the cell value).
main(fname='sample_data/ba10c/sample2.txt')

AB


'AB'

In [143]:
# Decode the full input and keep the path in `res` for the comparison cell that follows.
res = main(fname='sample_data/ba10c/input.txt')

AAAAAAAAAAAAAABBBBBBBBBBBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABBBBBBBBBBBAAAAAAAAAAAAAAAAAAAAABBBBBBBBBBAAA


In [144]:
# Print the computed path directly above the expected one for a visual comparison.
# NOTE(review): `exp` is presumably the reference answer for input.txt — confirm its origin.
exp = 'AAAAAAAAAAAAAABBBBBBBBBBBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABBBBBBBBBBBAAAAAAAAAAAAAAAAAAAAABBBBBBBBBBAAA'
print(res)
print(exp)

AAAAAAAAAAAAAABBBBBBBBBBBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABBBBBBBBBBBAAAAAAAAAAAAAAAAAAAAABBBBBBBBBBAAA
AAAAAAAAAAAAAABBBBBBBBBBBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABBBBBBBBBBBAAAAAAAAAAAAAAAAAAAAABBBBBBBBBBAAA


In [147]:
# driver code
# main()