### Task ba5j

In [11]:
import numpy as np


def read_scores(fname='blosum.txt'):
    """Load a substitution matrix from a whitespace-separated text file.

    The first line of the file supplies the column symbols. One further line
    is then read per symbol; the first token of each of those lines is
    skipped and the remaining tokens are parsed as integers.

    Args:
        fname: Path of the scoring-matrix file.

    Returns:
        A ``(scores, bases)`` tuple: ``scores`` is a list of integer rows and
        ``bases`` is the list of symbols taken from the header line.
    """
    with open(fname, 'r') as handle:
        bases = handle.readline().split()
        # One readline() per header symbol; a missing line yields an empty row.
        scores = [
            [int(token) for token in handle.readline().split()[1:]]
            for _ in bases
        ]
    return scores, bases


def read_data(fname='rosalind_ba5j.txt'):
    """Read the two sequences to align from the first two lines of a file.

    Args:
        fname: Path of the dataset file.

    Returns:
        A ``(v, w)`` tuple holding the first and second line with trailing
        whitespace removed. A missing line comes back as an empty string.
    """
    with open(fname, 'r') as handle:
        first, second = (handle.readline().rstrip() for _ in range(2))
    return first, second


def get_steps_matrices_affine_gap(v, w, scoring_mtx, bases, SO=10, SE=1):
    """Fill the affine-gap dynamic-programming tables for a global alignment.

    Three score tables are kept over the ``(len(v)+1) x (len(w)+1)`` grid:

    * ``I`` - alignments of ``v[:i]`` / ``w[:j]`` ending with ``v[i-1]``
      opposite a gap (gap in ``w``),
    * ``D`` - alignments ending with ``w[j-1]`` opposite a gap (gap in ``v``),
    * ``M`` - the best alignment of ``v[:i]`` / ``w[:j]`` of any kind.

    The first symbol of a gap costs ``SO + SE``; every further symbol of the
    same gap costs ``SE``.

    The boundary row and column are initialised with the cost of a leading
    gap, and states that cannot occur there (``I`` on row 0, ``D`` on
    column 0) are -inf. Without this, leading gaps would be free and the
    result would not be a global alignment.

    Args:
        v: First sequence.
        w: Second sequence.
        scoring_mtx: Substitution scores, indexed ``[row][col]`` by the
            positions of the two symbols in ``bases``.
        bases: Ordered list of symbols matching ``scoring_mtx``.
        SO: Gap opening surcharge (added to ``SE`` for the first gap symbol).
        SE: Gap extension penalty.

    Returns:
        ``(final_score, I_steps, M_steps, D_steps)``. ``final_score`` is the
        optimal global score as an int. Each ``*_steps[i][j]`` is a tuple
        ``(table, i', j')`` naming the table and cell the value came from;
        only ``M_steps[0][0]`` (and unreachable boundary cells) hold ``0``,
        which marks the end of a traceback.

    Raises:
        ValueError: If a symbol of ``v`` or ``w`` is not present in ``bases``.
    """
    lv, lw = len(v), len(w)
    gap_open = SO + SE  # total cost of the first symbol of a gap
    neg_inf = float('-inf')

    # score tables: gap-in-w (I), match/mismatch (M), gap-in-v (D)
    I = np.full((lv + 1, lw + 1), neg_inf)
    M = np.full((lv + 1, lw + 1), neg_inf)
    D = np.full((lv + 1, lw + 1), neg_inf)
    M[0, 0] = 0

    # traceback tables for I, M and D
    I_steps = [[0] * (lw + 1) for _ in range(lv + 1)]
    M_steps = [[0] * (lw + 1) for _ in range(lv + 1)]
    D_steps = [[0] * (lw + 1) for _ in range(lv + 1)]

    # first column: a prefix of v aligned entirely against gaps
    for i in range(1, lv + 1):
        I[i, 0] = M[i, 0] = -(gap_open + (i - 1) * SE)
        I_steps[i][0] = ('M', 0, 0) if i == 1 else ('I', i - 1, 0)
        M_steps[i][0] = ('I', i, 0)

    # first row: a prefix of w aligned entirely against gaps
    for j in range(1, lw + 1):
        D[0, j] = M[0, j] = -(gap_open + (j - 1) * SE)
        D_steps[0][j] = ('M', 0, 0) if j == 1 else ('D', 0, j - 1)
        M_steps[0][j] = ('D', 0, j)

    # resolve every symbol to its scoring-matrix index once, not per cell
    v_idx = [bases.index(symbol) for symbol in v]
    w_idx = [bases.index(symbol) for symbol in w]

    for i in range(1, lv + 1):
        row_scores = scoring_mtx[v_idx[i - 1]]
        for j in range(1, lw + 1):
            # gap in w: extend a running gap or open a new one from M
            extend = I[i - 1, j] - SE
            I[i, j] = max(extend, M[i - 1, j] - gap_open)
            # ties favour extending the existing gap
            if I[i, j] == extend:
                I_steps[i][j] = ('I', i - 1, j)
            else:
                I_steps[i][j] = ('M', i - 1, j)

            # gap in v: extend a running gap or open a new one from M
            extend = D[i, j - 1] - SE
            D[i, j] = max(extend, M[i, j - 1] - gap_open)
            if D[i, j] == extend:
                D_steps[i][j] = ('D', i, j - 1)
            else:
                D_steps[i][j] = ('M', i, j - 1)

            # best of: close a gap in w, close a gap in v, align two symbols
            M[i, j] = max(I[i, j], D[i, j],
                          M[i - 1, j - 1] + row_scores[w_idx[j - 1]])
            if M[i, j] == I[i, j]:
                M_steps[i][j] = ('I', i, j)
            elif M[i, j] == D[i, j]:
                M_steps[i][j] = ('D', i, j)
            else:
                M_steps[i][j] = ('M', i - 1, j - 1)

    final_score = int(M[lv, lw])
    return final_score, I_steps, M_steps, D_steps


def get_alignment_using_steps(v, w, I_steps, M_steps, D_steps):
    """Rebuild the two aligned strings by walking the traceback tables.

    The walk starts from ``M_steps[len(v)][len(w)]`` in the match/mismatch
    table and follows ``(table, i, j)`` entries until it reaches an entry
    equal to ``0``.

    Args:
        v: First sequence.
        w: Second sequence.
        I_steps: Traceback table of the gap-in-``w`` state.
        M_steps: Traceback table of the match/mismatch state.
        D_steps: Traceback table of the gap-in-``v`` state.

    Returns:
        ``(v_aligned, w_aligned)`` with ``'-'`` marking gap positions.
    """
    tables = {'M': M_steps, 'I': I_steps, 'D': D_steps}
    top, bottom = [], []  # alignment columns, collected back to front
    state = 'M'
    step = M_steps[len(v)][len(w)]

    while step != 0:
        origin, row, col = step[0], step[1], step[2]
        gap_column = False

        if state == 'M':
            if origin == 'M':
                # diagonal move: two symbols face each other
                top.append(v[row])
                bottom.append(w[col])
            else:
                # drop into a gap table; anything other than 'I' means 'D'
                state = 'I' if origin == 'I' else 'D'
                gap_column = True
        elif origin == state:
            # still extending the same gap
            gap_column = True
        else:
            # the gap was opened here: return to the match/mismatch table
            state = 'M'

        if gap_column:
            if state == 'I':
                top.append(v[row - 1])
                bottom.append("-")
            else:
                top.append("-")
                bottom.append(w[col - 1])

        step = tables[state][row][col]

    return ''.join(top)[::-1], ''.join(bottom)[::-1]

    
def main(dataset_fname='rosalind_ba5j.txt', score_fname='blosum.txt'):
    """Align the two sequences of a dataset file and print the result.

    Prints three lines: the alignment score, then the aligned first
    sequence, then the aligned second sequence.

    Args:
        dataset_fname: Path of the file holding the two sequences.
        score_fname: Path of the scoring-matrix file.
    """
    scoring_mtx, alphabet = read_scores(fname=score_fname)
    seq_v, seq_w = read_data(fname=dataset_fname)

    tables = get_steps_matrices_affine_gap(seq_v, seq_w, scoring_mtx, alphabet)
    best_score, ins_steps, mid_steps, del_steps = tables

    aligned_v, aligned_w = get_alignment_using_steps(
        seq_v, seq_w, ins_steps, mid_steps, del_steps
    )
    print(best_score, aligned_v, aligned_w, sep='\n')


In [12]:
# Run the solver on the sample dataset and sample scoring matrix
# (both paths are relative to the current working directory).
main(dataset_fname='sample_data/ba5j/sample.txt', score_fname='sample_data/ba5j/blosum.txt')

8
PRT---EINS
PRTWPSEIN-


In [14]:
# driver code
# main(score_fname='sample_data/ba5j/blosum.txt')