## Import modules

In [None]:
import pandas as pd
import numpy as np
import os
from joblib import Parallel, delayed

pd.options.display.max_rows=999
pd.options.display.max_columns=999

## Load functions 

In [619]:
def align(s1,s2,debug=False,verbose=False):
    assert len(s1) > 0 and len(s2) > 0
    
    m = len(s1)
    n = len(s2)
    
    # for debugging
    if debug:
        global dp
    
    # initialize table for memoization
    dp = np.zeros((m+1,n+1),dtype=np.int16)
    
    # memoization
    for i in range(1,m+1):
        for j in range(1,n+1):
            if s1[i-1:i] == s2[j-1:j]:
                dp[i][j] = max(dp[i-1][j-1] + 1, max(dp[i-1][j]-2, dp[i][j-1]-2))
            else:
                dp[i][j] = max(dp[i-1][j-1] - 2, max(dp[i-1][j]-2, dp[i][j-1]-2))
        
    # keep track of bit mask
    mask1 = np.zeros((m,),dtype=np.byte)
    mask2 = np.zeros((n,),dtype=np.byte)
    
    # search for local alignment
    max_local_val = np.NINF
    max_local_row = np.nan
    max_local_col = np.nan
    
    for irow in range(1,m+1):
        for icol in range(1,n+1):
            if dp[irow][icol] > max_local_val:
                max_local_val = dp[irow][icol]
                max_local_row = irow
                max_local_col = icol
    assert max_local_val
    assert max_local_row 
    assert max_local_col 
    
    # edge case 0 - alignment length is 0
    if max_local_val < 0:
        if verbose:
            print('No alignment found. Returning empty masks and some negative score.')
        return mask1, mask2, max_local_val
    
    if verbose:
        if max_local_col == n:
            print('Using semi-global alignment. s1 is reference, s2 is query')
        elif max_local_row == m:
            print('Using semi-global alignment. s2 is reference, s1 is query')
        else:
            print(f'Using local alignment. s1 ends at {max_local_row} and s2 ends at {max_local_col}')
    
    # initialize i and j for backtracing the dp table
    i = max_local_row
    j = max_local_col
    score = 0
    
    # initialize mask with the starting position
    if verbose:
        print(f'Assigning 1 at index {i-1} of AF_mask and index {j-1} of jpred_mask.')
    mask1[i-1] = 1
    mask2[j-1] = 1

    # edge case 1 - alignment length is 1
    if max_local_row == 1 or max_local_col == 1:
        if verbose:
            print('Alignment of length 1.')
        return mask1, mask2, max_local_val
    
    # backtrace the dp table
    while i != 0:
        while j != 0:
            curr = dp[i][j]
            diag = dp[i-1][j-1] 
            left = dp[i][j-1] 
            abov = dp[i-1][j]
            score += curr
            if diag >= left and diag >= abov:
                mask1[i-1] = 1
                mask2[j-1] = 1
                if max(diag, max(left, abov)) < 0:
                    if verbose:
                        print(f'Truncating alignment to start at s1[{i}:] and s2[{j}:]. No alignment beyond this position N-terminally.')
                    break
                i = i - 1
                j = j - 1
            elif left >= diag and left >= abov:
                j = j - 1
            elif abov >= left and abov >= diag:
                i = i - 1
            else:
                print(f'unhandled error for {i},{j}')
                break
        break
     
    return mask1,mask2,score


def align_all(input_df):
    
    df = pd.DataFrame()

    def for_joblib(series):
        seqID = series.seqID
        AF_sequence = series.AF_sequence
        jpred_sequence = series.jpred_sequence
        AF_mask,jpred_mask,score = align(AF_sequence,jpred_sequence,debug=True,verbose=False)
        return df.append([{'seqID' : seqID, 
                           'score' : score, 
                           'AF_sequence' : AF_sequence,
                           'AF_mask' : AF_mask, 
                           'jpred_sequence' : jpred_sequence,
                           'jpred_mask' : jpred_mask}])

    df_list = Parallel(n_jobs=-1,verbose=1)(
        delayed(for_joblib)(row) for _,row in input_df.iterrows()
    )
    df = pd.concat(df_list, ignore_index=True)
    
    return df

def split(string):
    return ['-']+[c for c in string]

def compress(seq,mask):
    return ''.join([s for s, m in zip(seq,mask) if m])

def string(byte_array):
    return ''.join(byte_array.astype(str))

def pad_jpred(debug=False):
    global AF_mask, jpred_mask, AF_sequence, jpred_sequence
    i = 0
    j = 0
    lower = []
    while i < len(AF_mask): # and j < len(jpred_align):
        
        j = min(j, len(jpred_mask)-1)
        
        if AF_mask[i] and jpred_mask[j]:
            lower.append(jpred_sequence[j])
            i += 1
            j += 1
        elif AF_mask[i] and not jpred_mask[j]:
            if debug:
                # for case 24716
                j += 1
            else:
                lower.append('-')
                j += 1
        elif not AF_mask[i]:
            lower.append('-')
            i += 1
        else:
            print('unhandled error')
            break
    return lower

## Load data

In [696]:
#os.chdir('/cluster/gjb_lab/2472402/.')
os.chdir('/YOUR/DIRECTORY/HERE')
df = pd.read_csv('summary_table.csv')

aligned_df = align_all(df[72:150]) # seqID 24744

it = aligned_df.iterrows()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 64 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  78 | elapsed:    0.7s remaining:    1.2s
[Parallel(n_jobs=-1)]: Done  78 out of  78 | elapsed:    3.9s finished


## Run to generate new sequence

In [703]:
i, (seqID, score, AF_sequence, AF_mask, jpred_sequence, jpred_mask) = it.__next__()
print(seqID)
print(score)
#print(AF_sequence)
#print(string(AF_align))
#print(jpred_sequence)
#print(string(jpred_align))
#print(compress(AF_sequence,AF_align))
#print(compress(jpred_sequence,jpred_align))
#print(' '*100)
print(AF_sequence)
print(''.join(pad_jpred(debug=True)))
print(jpred_sequence)

24909
48849
MNIDMAALHAIEVDRGISVNELLETIKSALLTAYRHTQGHQTDARIEIDRKTGVVRVIARETDEAGNLISEWDDTPEGFGRIAATTARQVMLQRFRDAENERTYGEFSTREGEIVAGVIQRDSRANARGLVVVRIGTETKASEGVIPAAEQVPGESYEHGNRLRCYVVGVTRGAREPLITLSRTHPNLVRKLFSLEVPEIADGSVEIVAVAREAGHRSKIAVRSNVAGLNAKGACIGPMGQRVRNVMSELSGEKIDIIDYDDDPARFVANALSPAKVVSVSVIDQTARAARVVVPDFQLSLAIGKEGQNARLAARLTGWRIDIRGDAPPPPPGQPEPGVSRGMAHDR
VMNIDMAALHAIEVDRGISVNELLETIKSALLTAYRHTQGHQTDARIEIDRKTGVVRVIARETDEAGNLISEWDDTPEGFGRIAATTARQVMLQRFRDA--------ESTREGEIVAGVIQRDSRANARGLVVVRIGTETKASEGVIPAAEQVPGESYEHGNRLRCYVVGVTRGAREPLITLSRTHPNLVRKLFSLEVPEIADGSVEIVAVAREAGHRSKIAVRSNVAGLNAKGACIGPMGQRVRNVMSELSGEKIDIIDYDDDPARFVANALSPAKVVSVSVIDQTARAARVVVPDFQLSLAIGKEGQNARLAARLTGWRIDIRGDAP-----------------P
VSRRHMNIDMAALHAIEVDRGISVNELLETIKSALLTAYRHTQGHQTDARIEIDRKTGVVRVIARETDEAGNLISEWDDTPEGFGRIAATTARQVMLQRFRDAESTREGEIVAGVIQRDSRANARGLVVVRIGTETKASEGVIPAAEQVPGESYEHGNRLRCYVVGVTRGAREPLITLSRTHPNLVRKLFSLEVPEIADGSVEIVAVAREAGHRSKIAVRSNVAGLNAKGACIGPMGQRVRNVMSELSGEKIDIIDYDDDPARFVANALSPAKVVSVSVIDQTARAARVVVP

## Debugging

In [705]:
dp = pd.DataFrame()
AF_mask, jpred_mask, score = align(AF_sequence,jpred_sequence,debug=True,verbose=True)
print(seqID)
print('Alignment score: ',score)
print(' ')
print(string(AF_mask))
print(string(jpred_mask))
print(AF_sequence)
print(''.join(pad_jpred(debug=True)))
print(jpred_sequence)
dp = pd.DataFrame(dp,index=split(AF_sequence),columns=split(jpred_sequence))

Using semi-global alignment. s1 is reference, s2 is query
Assigning 1 at index 328 of AF_mask and index 325 of jpred_mask.
24909
Alignment score:  48849
 
11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111100000000111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111000000000000000001
10000111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111
MNIDMAALHAIEVDRGISVNELLETIKSALLTAYRHTQGHQTDARIEIDRKTGVVRVIARETDEAGNLISEWDDTPEGFGRIAATTARQVMLQRFRDAENERTYGEFSTREGEIVAGVIQRDSRANARGLVVVRIGTETKASEGVIPAAEQVPGESYEHGNRLRCYVVGV

TODO: need to model in some sort of gap extension penalty, during the backtracing stage

So far, best combination I have found is +1,-2,-2 and -2,-2,-2