# Decoding of HMMs

We have two different dictionaries with the probabilities for scores.

We want to dynamically build up a most-likely path.

Question: How do we realize when it is time for transition? That is kind of figured out by taking into account the stretch of following nucleotides. Should we iterate backwards? Based on following nucleotides, were we encountering transition or not?

In [64]:
import math
from math import log, log2
import numpy as np

In [44]:
full_seq = ''
with open('data/gpc.fasta', 'r') as in_fh:
    for line in in_fh:
        line = line.rstrip()
        if line.startswith('>'):
            continue
        full_seq += line

print('{} nts'.format(len(full_seq)))

32743 nts


In [4]:
reg_trans = {'A':{'A': 0.299, 'C': 0.205, 'G': 0.285, 'T': 0.210, '+': 0.001},
             'C':{'A': 0.321, 'C': 0.298, 'G': 0.078, 'T': 0.302, '+': 0.001},
             'G':{'A': 0.248, 'C': 0.246, 'G': 0.297, 'T': 0.208, '+': 0.001},
             'T':{'A': 0.177, 'C': 0.239, 'G': 0.292, 'T': 0.291, '+': 0.001}}


cpg_trans = {'A':{'A': 0.179, 'C': 0.273, 'G': 0.425, 'T': 0.120, '-': 0.003},
             'C':{'A': 0.171, 'C': 0.367, 'G': 0.273, 'T': 0.187, '-': 0.003},
             'G':{'A': 0.160, 'C': 0.338, 'G': 0.374, 'T': 0.125, '-': 0.003},
             'T':{'A': 0.079, 'C': 0.354, 'G': 0.383, 'T': 0.181, '-': 0.003}}



In [3]:
test_seq = 'AGC'

In [67]:
def setup_matrices(seq):
    
    prob_m = np.zeros((2, len(seq)+1))
    trace_m = np.zeros((2, len(seq)+1), (str,1))
    
#     for i in range(0, nbr_states):
    prob_m[0,0] = log2(0.5)
    prob_m[1,0] = log2(0.5)
    
    return prob_m, trace_m

In [74]:
def calculate_trace(seq, test_print=False):
    
    prob_m, trace_m = setup_matrices(seq)
    
    if test_print:
        print(seq)
        print(prob_m)
        print(trace_m)
        
    for col in range(1, len(seq)+1):

        prev_letter = seq[col-2]
        letter = seq[col-1]

        # Reg
#         print(prob_m[0][col-1])
        reg_same_prob = prob_m[0][col-1] + log2(reg_trans[letter][prev_letter])
        reg_diff_prob = prob_m[1][col-1] + log2(reg_trans[letter]['+'])

        if reg_same_prob > reg_diff_prob:
            prob_m[0][col] = reg_same_prob
            trace_m[0][col] = ' '
        else:
            prob_m[0][col] = reg_diff_prob
            trace_m[0][col] = '+'

        # CpG
        cpg_same_prob = prob_m[1][col-1] + log2(cpg_trans[letter][prev_letter])
        cpg_diff_prob = prob_m[0][col-1] + log2(cpg_trans[letter]['-'])

        if cpg_same_prob > cpg_diff_prob:
            prob_m[1][col] = cpg_same_prob
            trace_m[1][col] = ' '
        else:
            prob_m[1][col] = cpg_diff_prob
            trace_m[1][col] = '-'
#         print(prob_m)

    return trace_m

trace_m = calculate_trace(full_seq)
# print(prob_m)
print('Trace 0: {}'.format(''.join(trace_m[0][0:1000])))
print('Trace 1: {}'.format(''.join(trace_m[1][0:1000])))


Trace 0:                                   ++++++++++++ +++   + +++    +++++++    + ++++  ++++ ++   +           ++++++++   +++++   ++++ +++    + +++  + +++++++   + +                                ++   +++       +++++ + +++    + +++++      + +++ ++++  +  +++ ++++  +++ +       ++++ + ++++++        + ++    +         ++++ ++++++++++ ++++++                    +      +++   ++++++       ++++ +++++++ ++   + +  +                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

In [100]:
def get_cpg_string(trace_m, seq):
    curr_cpg = 0
    cpg_string = ''
    for pos in range(len(trace_m[0])-2, -1, -1):
        
#         print(pos)
        if trace_m[curr_cpg][pos] != ' ':
            curr_cpg = (curr_cpg+1) % 2
        
        if curr_cpg == 1:
            cpg_string += 'C'
        else:
            cpg_string += ' '

    return cpg_string

cpg_string = get_cpg_string(trace_m, full_seq)
# print(cpg_string[0:10000])

In [102]:
cpg_start = -1
print(len(cpg_string))
for i in range(len(cpg_string)):
    if cpg_start == -1:
        if cpg_string[i] == 'C':
            cpg_start = i
    else:
        if cpg_string[i] == ' ':
            print('Island in range: {} to {}'.format(cpg_start, i))
            cpg_start = -1

32743
Island in range: 9936 to 10068
Island in range: 14788 to 14887
Island in range: 20600 to 20739
Island in range: 32343 to 32742
