In [16]:
def read_fasta_file(filename):
    """
    Reads the given FASTA file f and returns a dictionary of sequences.

    Lines starting with ';' in the FASTA file are ignored.
    """
    sequences_lines = {}
    current_sequence_lines = None
    with open(filename) as fp:
        for line in fp:
            line = line.strip()
            if line.startswith(';') or not line:
                continue
            if line.startswith('>'):
                sequence_name = line.lstrip('>')
                current_sequence_lines = []
                sequences_lines[sequence_name] = current_sequence_lines
            else:
                if current_sequence_lines is not None:
                    current_sequence_lines.append(line)
    sequences = {}
    for name, lines in sequences_lines.items():
        sequences[name] = ''.join(lines)
    return sequences

def create_fasta_dict(n):
    d = {'genome' : [], 'annotation' : [], 'genome_name' : []}
    for i in n:
        d['genome_name'].append('genome' + str(i) + '.fa')
        d['genome'].append(read_fasta_file('genome' + str(i) + '.fa')['genome' + str(i)])
        if i < 6:
            d['annotation'].append(read_fasta_file('annotation'+ str(i) + '.fa')['annotation' + str(i)])
        else:
            d['annotation'].append(None)
    return d

In [18]:
train_set = create_fasta_dict([1,2,3,4,5])
test_set = create_fasta_dict([6,7,8,9,10])


In [157]:
import numpy as np
def viterbi(transition, emission, pi, hidden, sequence, observables):
    #init viterbi table
    viterbi_table = np.zeros((len(hidden), len(sequence)))
    viterbi_table = viterbi_table.astype(np.float)
    #init result list Z
    Z = [0] * len(sequence)
    
    for i in range(len(hidden)):
        viterbi_table[i,0] = np.log(pi[i]) + np.log(emission[i, observables[sequence[0]]])
    for n in range(1,len(sequence)):
        for k in range(len(hidden)):
            viterbi_table[k,n] = float("-inf")
            if emission[k, observables[sequence[n]]] != float(0):
                for j in range(len(hidden)):
                    if transition[j,k] != float(0):
                        viterbi_table[k,n] = max(viterbi_table[k,n], np.log(emission[k, observables[sequence[n]]]) + viterbi_table[j, n - 1] + np.log(transition[j,k]))
    Z[-1] = np.argmax(viterbi_table[:,-1])
    for n in reversed(range(len(sequence)-1)):
        state = float("-inf")
        for k in range(len(hidden)):
            z = np.log(emission[Z[n+1], observables[sequence[n+1]]]) + viterbi_table[k,n] + np.log(transition[k, Z[n+1]])
            if z > state:
                state = z
                index = k
        Z[n] = index
    return Z

def train_by_counting_2_state(dictionary):
	observables = ['A', 'G', 'C', 'T']
	emissions  = dict(zip(observables,range(len(observables))))
	seqnames = dictionary['genome_name']
	seqs = dictionary['genome']
	hiddenseqs = dictionary['annotation']
	hiddenstates = {'N':0, 'C':1, 'R':1}

	emissionMatrix = np.zeros((2, len(observables)))
	for i in range(len(seqs)):
		for j, char in enumerate(seqs[i]):
			emissionMatrix[hiddenstates[hiddenseqs[i][j]], emissions[char]] += 1
	for i in range(2):
		rowsum = emissionMatrix[i,:].sum()
		for j in range(len(observables)):
			emissionMatrix[i,j] = emissionMatrix[i,j] / rowsum

	transitionMatrix = np.zeros((2,2))
	pi = [0.0] * 2

	for i in range(len(hiddenseqs)):
		for j, state in enumerate(hiddenseqs[i]):
			if j < 1:
				pi[hiddenstates[state]] += 1
			else:
				transitionMatrix[hiddenstates[hiddenseqs[i][j-1]], hiddenstates[state]] += 1
	pisum = sum(pi)
	for i in range(2):
		rowsum = transitionMatrix[i,:].sum()

		pi[i] = pi[i] / pisum

		for j in range(2):
			transitionMatrix[i,j] = transitionMatrix[i,j] / rowsum
	return pi, transitionMatrix, emissionMatrix, hiddenstates




#training by counting
SYMBOL_DICT={'A':0,'C':1,'G':2,'T':3}
NUMBER_OF_SYMBOLS=4
import numpy as np
numberOfStates=43
A=np.zeros([numberOfStates,numberOfStates])
startVector=np.zeros([numberOfStates])
startVector[0]=1.0
emissionTabel=np.zeros([numberOfStates,NUMBER_OF_SYMBOLS])

def updateEmissionTabel(observations,states):
    for i in range(len(states)):
        emissionTabel[states[i],SYMBOL_DICT[observations[i]]]+=1
    return(emissionTabel)
def counting(genome,annotation, A = None, emissionTabel = None):
    if A == None:
        A=np.zeros([numberOfStates,numberOfStates])
        emissionTabel = np.zeros([numberOfStates,NUMBER_OF_SYMBOLS])
    a = genome
    b = annotation
    j=0
    jo=-1

    state=0
    while j <len(a):
        if j==jo:
            print("her", state,i,j,b[j:j+4], a[j:j+4])
            break
        if state==0:
            if b[j]=='C':
                if a[j:j+3]=="TTG": #state=1
                    A[0,1]=A[0,1]+1
                    state=3
                    j+=3
                elif a[j:j+3]=="GTG": #state=4
                    A[0,4]+=1
                    state=6
                    j+=3
                elif a[j:j+3]=="ATG": #state=7
                    A[0,7]+=1
                    state=9
                    j+=3
                else:
                    A[0,0]+=1
                    emissionTabel[0,SYMBOL_DICT[a[j]]]+=1
                    j+=1
            elif b[j]=='R':
                if a[j:j+3]=="CTA": #state=22
                    A[0,22]+=1
                    state=24
                    j+=3
                elif a[j:j+3]=="TTA": #state=25
                    A[0,25]+=1
                    state=27
                    j+=3
                elif a[j:j+3]=="TCA": #state=28
                    A[0,28]+=1
                    state=30
                    j+=3
                else:
                    A[0,0]+=1
                    emissionTabel[0,SYMBOL_DICT[a[j]]]+=1
                    j+=1
                    
            else:
                A[0,0]+=1
                emissionTabel[0,SYMBOL_DICT[a[j]]]+=1
                j+=1
        elif (state>=10) & (state<=12):
            if b[j:j+4]=="CCCN":
                if state!=12:
                    print("UPS 4",i,j,b[j:j+4],a[j:j+4])
                else:
                    if a[j:j+3]=="TAG": #state=13
                        A[12,13]+=1
                        state=15
                        j+=3
                    elif a[j:j+3]=="TGA": #state=16
                        A[12,16]+=1
                        state=18
                        j+=3
                    elif a[j:j+3]=="TAA": #state=19
                        A[12,19]+=1
                        state=21
                        j+=3
                    else:
                        print("UPS 3",i,j,b[j:j+4],a[j:j+4])
                        state=0
            elif b[j:j+4]=="CCCC":
                emissionTabel = updateEmissionTabel(a[j:j+3],[10,11,12])
                A[12,10]+=1
                state=12
                j+=3
            else:
                print("UPS 1",i,j,b[j:j+4],a[j:j+4])
                state = 0
                for char in b[j:j+4]:
                    if char == 'C':
                        j += 1

        elif (state>=31) & (state<=33):
            if b[j:j+4]=="RRRN":
                if state!=33:
                    print("UPS 5",i,j,b[j:j+4],a[j:j+4])
                else:
                    if a[j:j+3]=="CAC": #state=34
                        A[33,34]+=1
                        state=36
                        j+=3
                    elif a[j:j+3]=="CAT": #state=37
                        A[33,37]+=1
                        state=39
                        j+=3
                    elif a[j:j+3]=="CAA": #state=40
                        A[33,40]+=1
                        state=42
                        j+=3
                    else:
                        print("UPS 6",i,j,b[j:j+4],a[j:j+4])
                        state = 0
                        j += 3
            elif b[j:j+4]=="RRRR":
                emissionTabel = updateEmissionTabel(a[j:j+3],[31,32,33])
                state=33
                j+=3
            else:
                print("UPS 2",i,j,b[j:j+4],a[j:j+4])
                state = 0
                j += 3
        elif (state==36) | (state==39)|(state==42)| (state==15) | (state==18)|(state==21):
            if b[j]=='N':
                A[state,0]+=1
                emissionTabel[0,SYMBOL_DICT[a[j]]]+=1
                state=0
                j+=1
            else:
                print("UPS 7",i,j,b[j],a[j]) 
        elif (state==3) | (state==6)|(state==9):
            if b[j:j+4]=="CCCC":
                emissionTabel = updateEmissionTabel(a[j:j+3],[10,11,12])
                A[state,10] += 1
                state=12
                j+=3
            else:
                print("UPS 9",i,j,b[j:j+4],a[j:j+4])
        elif (state==24) | (state==27)|(state==30):
            if b[j:j+4]=="RRRR":
                emissionTabel = updateEmissionTabel(a[j:j+3],[31,32,33])
                A[state, 31] += 1
                state=33
                j+=3
            else:
                print("UPS 10",i,j,b[j:j+4],a[j:j+4])
            
        else:
            print("UPS 8",i,j,b[j],a[j])
    
    return(A, emissionTabel)




In [158]:

A = None
emission = None

for i in range(0,5):
    genome = train_set['genome'][i]
    annotation = train_set['annotation'][i]
    if i == 0:
        A= None
        emission = None
    A, emission = counting(train_set['genome'][i], train_set['annotation'][i], A, emission)




UPS 6 1 311945 RRRN AATT
UPS 1 1 435295 CCNN AAGA
UPS 2 1 502874 RRRC CATA
UPS 1 1 503664 CCNN AAAG
UPS 1 1 504496 CCNN AATG
UPS 1 1 538010 CCCR TGAT
UPS 6 1 1169715 RRRN CAGT
UPS 6 1 1617553 RRRN GATA
UPS 6 1 1772084 RRRN AATA
UPS 6 1 2018096 RRRN AATA
UPS 6 1 2040180 RRRN TATA
UPS 1 2 360271 CCNN AAAT
UPS 1 2 1882075 CCNN AATG
UPS 6 2 2242651 RRRN TATC
UPS 6 2 2329880 RRRN AATC
UPS 1 4 1545117 CCCR TAGT
UPS 1 4 2072387 CCCR TAAT


In [179]:

emission[1,3] = 1
emission[2,3] = 1
emission[3,2] = 1
emission[4,2] = 1
emission[5,3] = 1
emission[6,2] = 1
emission[7,0] = 1
emission[8,3] = 1
emission[9,2] = 1
emission[13,3] = 1
emission[14,0] = 1
emission[15,2] = 1
emission[16,3] = 1
emission[17,2] = 1
emission[18,0] = 1
emission[19,3] = 1
emission[20,0] = 1
emission[21,0] = 1
emission[22,1] = 1
emission[23,3] = 1
emission[24,0] = 1
emission[25,3] = 1
emission[26,3] = 1
emission[27,0] = 1
emission[28,3] = 1
emission[29,1] = 1
emission[30,0] = 1
emission[34,1] = 1
emission[35,0] = 1
emission[36,1] = 1
emission[37,1] = 1
emission[38,0] = 1
emission[39,3] = 1
emission[40,1] = 1
emission[41,0] = 1
emission[42,0] = 1

A[1,2] = 1
A[2,3] = 1
A[3,10] = 1
A[4,5] = 1
A[5,6] = 1
A[6,10] = 1
A[7,8] = 1
A[8,9] = 1
A[9,10] = 1
A[10,11] = 1
A[11,12] = 1
A[19,20] = 1
A[13,14] = 1
A[14,15] = 1
A[15,0] = 1
A[16,17] = 1
A[17,18] = 1
A[18,0] = 1
A[20,21] = 1
A[21,0] = 1
A[22,23] = 1
A[23,24] = 1
A[24,31] = 1
A[25,26] = 1
A[26,27] = 1
A[27,31] = 1
A[28, 20] = 1
A[29,30] = 1
A[30,31] = 1
A[31,32] = 1
A[32,33] = 1
A[34,35] = 1
A[35,36] = 1
A[36,0] = 1
A[37,38] = 1
A[38,39] = 1
A[39,0] = 1
A[40,41] = 1
A[41,42] = 1
A[42,0] = 1

pi = [1]
pi = pi + [0.0 for x in range(42)]

Atotal = A.sum(axis = 1, keepdims = True)
new_A = A / Atotal

emissiontotal = emission.sum(axis = 1, keepdims = True)
new_emission = emission / emissiontotal

[[ 0.33434726  0.16479575  0.16612391  0.33473308]
 [ 0.          0.          0.          1.        ]
 [ 0.          0.          0.          1.        ]
 [ 0.          0.          1.          0.        ]
 [ 0.          0.          1.          0.        ]
 [ 0.          0.          0.          1.        ]
 [ 0.          0.          1.          0.        ]
 [ 1.          0.          0.          0.        ]
 [ 0.          0.          0.          1.        ]
 [ 0.          0.          1.          0.        ]
 [ 0.32053233  0.16016488  0.32451124  0.19479155]
 [ 0.35200492  0.20135675  0.13669658  0.30994175]
 [ 0.33870462  0.13090755  0.12819783  0.40219   ]
 [ 0.          0.          0.          1.        ]
 [ 1.          0.          0.          0.        ]
 [ 0.          0.          1.          0.        ]
 [ 0.          0.          0.          1.        ]
 [ 0.          0.          1.          0.        ]
 [ 1.          0.          0.          0.        ]
 [ 0.          0.          0.  

In [40]:
trained_model = train_by_counting_2_state(dictionary = train_set)
pi = trained_model[0]
transition = trained_model[1]
emission = trained_model[2]
hiddenstates = {'N' : 0, 'C' : 1}
observables = {'A':0, 'G':1, 'C':2, 'T':3}

In [36]:
def tenfold_cross_validation(datafolder, states = 3, decoding = "viterbi"):
	observables = ['A', 'C', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'M', 'L', 'N', 'Q', 'P', 'S', 'R', 'T', 'W', 'V', 'Y']
	emissions  = dict(zip(observables,range(len(observables))))
	results = []
	names = []
	sequences = []
	files = []
	transitionMatrix = []
	pi = []
	filelist = os.listdir(datafolder)
	for i in range(len(filelist)):
		trainingdata = filelist[:]
		validationdata = trainingdata.pop(i)

		trainingset = [[], [], []]
		for filename in trainingdata:
			data = read_sequence(datafolder + "/" + filename)
			trainingset[0] = trainingset[0] + data[0]
			trainingset[1] = trainingset[1] + data[1]
			trainingset[2] = trainingset[2] + data[2]
		if states == 3:
			model = train_by_counting_3_state(trainingset)
		if states == 4:
			model = train_by_counting_4_state(trainingset)
		if states == 24:
			model = train_by_counting_24_states(trainingset)
		hidden = model[3]
		transitionMatrix.append(model[1])
		pi.append(model[0])
		newhidden = {v: k for k, v in hidden.iteritems()}
		validationset = read_sequence(datafolder + "/" + validationdata)
		predictions = []
		names.append(validationset[0])
		for valseq in validationset[1]:
			if decoding == 'viterbi':
				predictions.append(viterbi.viterbi(transition = model[1], emission = model[2], pi = model[0], hidden = hidden, sequence = valseq, observables = emissions))
			if decoding == 'posterior':
				predictions.append(posterior.posterior(transition = model[1], emission = model[2], pi = model[0], hidden = hidden, sequence = valseq, observables = emissions))
		for r in range(len(predictions)):
			predictions[r] = [newhidden[x] for x in predictions[r]]
			if states == 4:
				for x in range(len(predictions[r])):
					if predictions[r][x] == 'N':
						predictions[r][x] = 'M'
			if states == 24:
				for x in range(len(predictions[r])):
					if predictions[r][x] != 'i':
						if predictions[r][x] != 'o':
							predictions[r][x] = 'M'
			predictions[r] = "".join(predictions[r])
		results.append(predictions)
		sequences.append(validationset[1])
		files.append(validationdata)
	return files, names, results, sequences, transitionMatrix, pi


[[ 0.33434315  0.16612728  0.16479453  0.33473505]
 [ 0.32083579  0.17959771  0.17934359  0.32022291]]


In [None]:
hiddenstates = dict(zip(range(0,42), range(0,42)))
observables = {'A':0, 'C':1, 'G':2, 'T':3}
test_model = viterbi(new_A, new_emission, pi, hiddenstates, test_set['genome'][0], observables = observables)
for i in range(len(test_model)):
    if test_model[i] == 0:
        test_model[i] = 'N'
    if (test_model[i] >= 1) and (test_model[i] <= 21):
        test_model[i] = 'C'
    if (test_model[i] >= 22) and (test_model[i] <= 42):
        test_model[i] = 'R'



