In [315]:
import pandas as pd
import numpy as np
import os
from joblib import Parallel, delayed

pd.options.display.max_rows=999
pd.options.display.max_columns=999

def align(s1,s2,debug=False):
    assert len(s1) > 0 and len(s2) > 0
    
    m = len(s1)
    n = len(s2)
    
    # for debugging
    global dp
    dp = np.zeros((m+1,n+1),dtype=np.int32)
    
    for i in range(1,m+1):
        for j in range(1,n+1):
            if s1[i-1:i] == s2[j-1:j]:
                dp[i][j] = max(dp[i-1][j-1] + 1, max(dp[i-1][j]-2, dp[i][j-1]-2))
            else:
                dp[i][j] = max(dp[i-1][j-1] - 1, max(dp[i-1][j]-2, dp[i][j-1]-2))
        
    # we want whole of s2 to be aligned
    # initialize j as last col
    # initialize i as max row
    max_row_val = np.NINF
    max_i = np.nan
    for irow in range(1,len(s1)+1):
        if dp[irow][len(s2)-1] > max_row_val:
            max_row_val = dp[irow][len(s2)-1]
            max_i = irow
    
    max_col_val = np.NINF
    max_j = np.nan
    for icol in range(1,len(s2)+1):
        if dp[len(s1)-1][icol] > max_col_val:
            max_col_val = dp[len(s1)-1][icol]
            max_j = icol
    
    if debug:
        print(f'max_row_val: {max_row_val}, max_i: {max_i}')
        print(f'max_col_val: {max_col_val}, max_j: {max_j}')
    
    # keep track of bit mask
    mask1 = np.zeros((m,),dtype=np.byte)
    mask2 = np.zeros((n,),dtype=np.byte)

    # decide whether to start from last row or last col
    # initialize i and j accordingly
    # at the same time add starting cell to mask
    if max_row_val >= max_col_val:
        i = max_i
        j = len(s2)
        mask1[i-1] = 1
        mask2[j-1] = 1
        if debug:
            print('starting from last column')
            print(f'Assigned 1 at mask1: {i-1}, mask2: {j-1}')
    else:
        i = len(s1)
        j = max_j
        mask1[i-1] = 1
        mask2[j-1] = 1
        if debug:
            print('starting from last row')
            print(f'Assigned 1 at mask1: {i-1}, mask2: {j-1}')

    if debug:
        print(f'Starting scoring of dp with i: {i}, j: {j}')
        
    # score for best path
    score = 0
    
    while i != 0:
        while j != 0:
            curr = dp[i][j]
            diag = dp[i-1][j-1] 
            left = dp[i][j-1] 
            abov = dp[i-1][j]
            score += curr
            if diag >= left and diag >= abov:
                mask1[i-1] = 1
                mask2[j-1] = 1
                i = i - 1
                j = j - 1
            elif left >= diag and left >= abov:
                j = j - 1
            elif abov >= left and abov >= diag:
                i = i - 1
            else:
                print(f'unhandled error for {i},{j}')
                break
        break
     
    return mask1,mask2,score


def align_all(input_df):
    
    df = pd.DataFrame()

    def for_joblib(series):
        seqID = series.seqID
        s1 = series.AF_sequence
        s2 = series.jpred_sequence
        hits_s1,hits_s2,score = align(s1,s2)
        return df.append([{'seqID': seqID, 'score' : score, 'AF_sequence':s1,'AF_align':hits_s1, 'jpred_sequence':s2,'jpred_align':hits_s2}])

    df_list = Parallel(n_jobs=-1,verbose=1)(
        delayed(for_joblib)(row) for _,row in input_df.iterrows()
    )
    df = pd.concat(df_list, ignore_index=True)
    
    return df

def split(word):
    return [c for c in word]

def compress(seq,mask):
    return ''.join([s for s, m in zip(seq,mask) if m])

def string(byte_array):
    return ''.join(byte_array.astype(str))

def pad_jpred():
    global AF_align, jpred_align, AF_sequence, jpred_sequence
    i = 0
    j = 0
    lower = []
    while i < len(AF_align): # and j < len(jpred_align):
        j = min(j, len(jpred_align)-1)
        if AF_align[i] and jpred_align[j]:
            lower.append(jpred_sequence[j])
            i += 1
            j += 1
        elif AF_align[i] and not jpred_align[j]:
            lower.append('-')
            j += 1
        elif not AF_align[i]:
            lower.append('-')
            i += 1
        else:
            print('unhandled error')
            break
    return lower

In [306]:
os.chdir('/cluster/gjb_lab/2472402/.')
df = pd.read_csv('summary_table.csv')

aligned_df = align_all(df[:72])

it = aligned_df.iterrows()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 64 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  72 | elapsed:    0.4s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    4.1s finished


In [313]:
i, (seqID, score, AF_sequence, AF_align, jpred_sequence, jpred_align) = it.__next__()
print(seqID)
print(score)
#print(AF_sequence)
#print(string(AF_align))
#print(jpred_sequence)
#print(string(jpred_align))
#print(compress(AF_sequence,AF_align))
#print(compress(jpred_sequence,jpred_align))
#print(' '*100)
print(AF_sequence)
print(''.join(pad_jpred()))
print(jpred_sequence)

24716
7377
MSQWNQVQQLEIKFLEQVDQFYDDNFPMEIRHLLAQWIETQDWEVASNNETMATILLQNLLIQLDEQLGRVSKEKNLLLIHNLKRIRKVLQGKFHGNPMHVAVVISNCLREERRILAAANMPIQGPLEKSLQSSSVSERQRNVEHKVSAIKNSVQMTEQDTKYLEDLQDEFDYRYKTIQTMDQGDKNSILVNQEVLTLLQEMLNSLDFKRKEALSKMTQIVNETDLLMNSMLLEELQDWKKRQQIACIGGPLHNGLDQLQNCFTLLAESLFQLRQQLEKLQEQSTKMTYEGDPIPAQRAHLLERATFLIYNLFKNSFVVERQPCMPTHPQRPMVLKTLIQFTVKLRLLIKLPELNYQVKVKASIDKNVSTLSNRRFVLCGTHVKAMSSEESSNGSLSVEFRHLQPKEMKCSTGSKGNEGCHMVTEELHSITFETQICLYGLTINLETSSLPVVMISNVSQLPNAWASIIWYNVSTNDSQNLVFFNNPPSVTLGQLLEVMSWQFSSYVGRGLNSEQLNMLAEKLTVQSNYNDGHLTWAKFCKEHLPGKTFTFWTWLEAILDLIKKHILPLWIDGYIMGFVSKEKERLLLKDKMPGTFLLRFSESHLGGITFTWVDQSENGEVRFHSVEPYNKGRLSALAFADILRDYKVIMAENIPENPLKYLYPDIPKDKAFGKHYSSQPCEVSRPTERGDKGYVPSVFIPISTIRSDSTEPQSPSDLLPMSPSAYAVLRENLSPTTIETAMNSPYSAE
GGSQWNQVQQLEIKFLEQVDQFYDDNFPMEIRHLLAQWIETQDWEVASNNETMATILLQNLLIQLDEQLGRVSKEKNLLLIHNLKRIRKVLQGKFHGNPMHVAVVISNCLREERRILAAANM---------------------------------------------------------------------------------------------------------------------

Debugging use

In [314]:
print(seqID)
print(score)
print(AF_sequence)
print(string(AF_align),'\n')
print(''.join(pad_jpred()),'\n')
print(jpred_sequence)
print(string(jpred_align),'\n')
print(compress(AF_sequence,AF_align))
print(compress(jpred_sequence,jpred_align))


24716
7377
MSQWNQVQQLEIKFLEQVDQFYDDNFPMEIRHLLAQWIETQDWEVASNNETMATILLQNLLIQLDEQLGRVSKEKNLLLIHNLKRIRKVLQGKFHGNPMHVAVVISNCLREERRILAAANMPIQGPLEKSLQSSSVSERQRNVEHKVSAIKNSVQMTEQDTKYLEDLQDEFDYRYKTIQTMDQGDKNSILVNQEVLTLLQEMLNSLDFKRKEALSKMTQIVNETDLLMNSMLLEELQDWKKRQQIACIGGPLHNGLDQLQNCFTLLAESLFQLRQQLEKLQEQSTKMTYEGDPIPAQRAHLLERATFLIYNLFKNSFVVERQPCMPTHPQRPMVLKTLIQFTVKLRLLIKLPELNYQVKVKASIDKNVSTLSNRRFVLCGTHVKAMSSEESSNGSLSVEFRHLQPKEMKCSTGSKGNEGCHMVTEELHSITFETQICLYGLTINLETSSLPVVMISNVSQLPNAWASIIWYNVSTNDSQNLVFFNNPPSVTLGQLLEVMSWQFSSYVGRGLNSEQLNMLAEKLTVQSNYNDGHLTWAKFCKEHLPGKTFTFWTWLEAILDLIKKHILPLWIDGYIMGFVSKEKERLLLKDKMPGTFLLRFSESHLGGITFTWVDQSENGEVRFHSVEPYNKGRLSALAFADILRDYKVIMAENIPENPLKYLYPDIPKDKAFGKHYSSQPCEVSRPTERGDKGYVPSVFIPISTIRSDSTEPQSPSDLLPMSPSAYAVLRENLSPTTIETAMNSPYSAE
11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000

In [300]:
dp = pd.DataFrame()
mask1, mask2, _ = align(AF_sequence,jpred_sequence,True)
#pd.DataFrame(dp,index=['-']+split(AF_sequence),columns=['-']+split(jpred_sequence))

max_row_val: 120, max_i: 122
max_col_val: -1, max_j: 1
Assigned 1 at mask1: 121, mask2: 123
starting i: 122, j: 124


In [288]:
string(mask1)

'11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001'

In [289]:
string(mask2)

'1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111'