In [34]:
import re

def validate_input(reference_sequence, new_sequence, allowed_characters):
    """ This function validates that the input sequences contain only allowed characters."""
    for character in reference_sequence:
        if character.upper() not in allowed_characters:
            raise Exception('incorrect character in reference sequence')

    for character in new_sequence:
        if character.upper() not in allowed_characters:
            raise Exception('incorrect character in new sequence')

            
def init_alignment_matrix(reference_sequence, new_sequence, gap_penalty):
    alignment_matrix = [[[0 for x in range(len(new_sequence))]for y in range(len(reference_sequence))]for z in range(2)]

    for x in range(len(new_sequence)):
                        alignment_matrix[0][0][x]=new_sequence[x]
                        alignment_matrix[1][0][x]=new_sequence[x]
    for y in range(len(reference_sequence)):
                        alignment_matrix[0][y][0]=reference_sequence[y]
                        alignment_matrix[1][y][0]=reference_sequence[y]

    for x in range(len(new_sequence)-2):
                        alignment_matrix[0][1][x+2]=(x+1)*-gap_penalty
    for y in range(len(reference_sequence)-2):
                        alignment_matrix[0][y+2][1]=(y+1)*-gap_penalty 
    return alignment_matrix
    

def align_sequences(reference_sequence, new_sequence, allowed_characters, gap_penalty, similarity_score_fn):
    """ This function calculates the best alignments of two sequences. """

    # Normalise sequences
    reference_sequence = reference_sequence.upper()
    new_sequence = new_sequence.upper()
    
    validate_input(reference_sequence, new_sequence, allowed_characters)

    # Add zero prefix
    reference_sequence = '00' + reference_sequence
    new_sequence = '00' + new_sequence

    alignment_matrix = init_alignment_matrix(reference_sequence, new_sequence, gap_penalty)
    
    j = 0
    i = 0
    h = 0
    for x in range(len(new_sequence)-2):
        for y in range(len(reference_sequence)-2):
            sxy = similarity_score(reference_sequence[y+2], new_sequence[x+2])
            alignment_matrix[1][y+2][x+2]= sxy

            diag  =  int(sxy) + alignment_matrix[0][y+2-1][x+2-1]
            right =  alignment_matrix[0][y+2][x+2-1] - gap_penalty
            down  =  alignment_matrix[0][y+2-1][x+2]- gap_penalty
            if  diag > right and diag > down :
                h = diag
            elif right > down:
                h = right
            else:
                h = down
            alignment_matrix[0][y+2][x+2]= h

    x = len(new_sequence)
    y = len(reference_sequence)
    previous_choice = 'none'
    aligned_reference_sequence = ''
    aligned_new_sequence = ''
    for cycle in range(x+y):
        choices = ''
        
        if x == 2 and y ==2:
            break
        elif x == 2:
            aligned_reference_sequence =  reference_sequence[y-1] + aligned_reference_sequence
            aligned_new_sequence =  '-' + aligned_new_sequence
            x = x
            y = y-1

        elif y == 2:
            aligned_reference_sequence =  '-' + aligned_reference_sequence
            aligned_new_sequence =  new_sequence[x-1] + aligned_new_sequence
            x = x-1
            y = y

        else:

            if alignment_matrix[0][y-1][x-1] == alignment_matrix[0][y-1-1][x-1] - gap_penalty:
                choices = choices + 'u'
            if alignment_matrix[0][y-1][x-1] == alignment_matrix[0][y-1-1][x-1-1] + alignment_matrix[1][y-1][x-1]:
                choices = choices + 'd'        
            if alignment_matrix[0][y-1][x-1] == alignment_matrix[0][y-1][x-1-1] - gap_penalty:
                choices = choices + 'l'   
            print(f'{cycle}: {choices}')
            if choices.find(previous_choice)>-1:
                choice = previous_choice

            else:
                choice = choices[0]

            if choice == 'u':
                aligned_reference_sequence =  reference_sequence[y-1] + aligned_reference_sequence
                aligned_new_sequence =  '-' + aligned_new_sequence
                x = x
                y = y-1
            elif choice == 'd':
                aligned_reference_sequence =  reference_sequence[y-1] + aligned_reference_sequence
                aligned_new_sequence =  new_sequence[x-1] + aligned_new_sequence
                x = x-1
                y = y-1
            elif choice == 'l':
                aligned_reference_sequence =  '-' + aligned_reference_sequence
                aligned_new_sequence =  new_sequence[x-1] + aligned_new_sequence
                x = x-1
                y = y
            else:
                print('error at choice')
            previous_choice = choice

    line_length = 40
    ref_pos = 0
    new_pos = 0
    for line in range(int(len(aligned_reference_sequence)/line_length)):    
        ref_chunk = aligned_reference_sequence[0+20*line:line_length+20*line]
        new_chunk = aligned_new_sequence[0+20*line:line_length+20*line]

        print('ref\t' + str(ref_pos) + '\t' + ref_chunk + '\t' + str(ref_pos + len(ref_chunk.replace('-','')) -1))
        print('new\t' + str(new_pos) + '\t' + new_chunk + '\t' + str(new_pos + len(new_chunk.replace('-','')) -1) + '\n')
        ref_pos = ref_pos + len(ref_chunk.replace('-',''))
        new_pos = new_pos + len(new_chunk.replace('-',''))
        

def load_blosum_matrix(filename):
    """ This function loads the blosum matrix from the given file. """
    blosum_matrix = []
    with open(filename,'r') as file:
        line_number=0
        for line in file:
            blosum_matrix.append([])
            entrys=line.strip('\n').split('\t')
            entry_number=0
            for entry in entrys:
                blosum_matrix[line_number].append(entry)
            line_number = line_number+1
    return blosum_matrix

In [35]:
allowed_characters = {'A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y'}
reference_sequence = 'SKSYYDILGVPKSASERQIKKAFHKLAMKYHPDKNKSPDAEAKFREIAEAYETLSDANRRKEYDTLGHSAFTSGKGQRGSGSSFEQSFNFNFDDLFKDFGFFGQNQNTGSKKRFENHFQTRQDGGSSRQRHHFQEFSFGGGLFDDMFEDMEKMFSFSGFDSTNQHTVQTENRFHGSSKHCRTVTQRRGNMVTTYTDCSGQ'
new_sequence = 'KQDYYEILGVSKTAEEREIRKAYKRLAMKYHPDRNQGDKEAEAKFKEIKEAYEVLTDSQKRAAYDQYGHA'

blosum_matrix = load_blosum_matrix('blosum.txt')

#print(blosum_matrix)

def similarity_score(symbol_a, symbol_b):
    j = blosum_matrix[0].index(symbol_a)
    i = blosum_matrix[0].index(symbol_b)
    return int(blosum_matrix[j][i])

align_sequences(reference_sequence=reference_sequence,
                new_sequence=new_sequence,
                allowed_characters=allowed_characters,
                gap_penalty=2,
                similarity_score_fn=similarity_score)
#reference_sequence = 'GCATGCU'
#new_sequence = 'GATTACA'  

0: u
1: u
2: d
3: u
4: u
5: u
6: u
7: u
8: u
9: u
10: u
11: u
12: u
13: u
14: u
15: u
16: u
17: u
18: u
19: u
20: u
21: d
22: u
23: u
24: u
25: d
26: u
27: d
28: u
29: u
30: u
31: u
32: ud
33: u
34: u
35: u
36: d
37: u
38: u
39: u
40: d
41: d
42: u
43: d
44: u
45: d
46: u
47: ud
48: d
49: u
50: u
51: u
52: u
53: u
54: u
55: u
56: u
57: u
58: u
59: u
60: u
61: u
62: u
63: u
64: u
65: u
66: u
67: u
68: u
69: u
70: d
71: d
72: u
73: ud
74: d
75: u
76: u
77: d
78: u
79: u
80: d
81: ud
82: d
83: u
84: u
85: d
86: d
87: u
88: u
89: u
90: d
91: u
92: u
93: u
94: ud
95: u
96: ud
97: u
98: u
99: u
100: u
101: u
102: d
103: d
104: u
105: d
106: u
107: u
108: u
109: u
110: u
111: u
112: u
113: u
114: u
115: d
116: u
117: u
118: u
119: u
120: u
121: u
122: u
123: u
124: u
125: d
126: u
127: u
128: u
129: d
130: u
131: u
132: u
133: u
134: u
135: u
136: u
137: u
138: u
139: d
140: u
141: u
142: u
143: ud
144: u
145: u
146: u
147: u
148: u
149: u
150: d
151: d
152: d
153: u
154: d
155: u
156: u
157: