## Aligning protein sequence of uniprot (AlphaFold) and SCOPe (jpred)

In [1]:
import pylcs
import os
import pandas as pd
import re
import sys

Function to convert 8 to 3 state dssp

In [2]:
def reduce_dssp(sec_string):
    sec_string = re.compile(r'[ST_bGIC?]').sub('-',sec_string) # convert all known non-sheet, non-helix structures to coil
    sec_string = re.compile(r'[B]').sub('E',sec_string) # convert B forms to E
    sec_string = re.compile(r'~[HE]').sub('-',sec_string) # convert everything not H or E to coil for backup
    return sec_string

Run tests for reduce_dssp

In [3]:
test_dssp_8 = '---------SEEEEEETTEEEETTS-EEEEEEEE-SS-EEEEEE-------S---HHHHTT-SS-SEEEEETTTEEEETTS-EEEEEEEEEETTEEEEEEEE----'
test_dssp_3 = '----------EEEEEE--EEEE----EEEEEEEE----EEEEEE-----------HHHH-------EEEEE---EEEE----EEEEEEEEEE--EEEEEEEE----'
assert test_dssp_3 == reduce_dssp(test_dssp_8)
test_dssp_8 = 'WQ0wZtjViB8XpKfQlD8I'
test_dssp_3 = 'WQ0wZtjViE8XpKfQlD8-'
assert test_dssp_3 == reduce_dssp(test_dssp_8)
test_dssp_8 = 'FF5tPEQqE4JnwD2Rob84'
test_dssp_3 = 'FF5tPEQqE4JnwD2Ro-84'
assert test_dssp_3 == reduce_dssp(test_dssp_8)
test_dssp_8 = 'SKREHZfXfgjwJGjdbyAm'
test_dssp_3 = '-KREHZfXfgjwJ-jd-yAm'
assert test_dssp_3 == reduce_dssp(test_dssp_8)
test_dssp_8 = 'fe3qTylkabnoewFxwp9K'
test_dssp_3 = 'fe3q-ylka-noewFxwp9K'
assert test_dssp_3 == reduce_dssp(test_dssp_8)
test_dssp_8 = 'yVGL1j00mrMIvb6G3wQP'
test_dssp_3 = 'yV-L1j00mrM-v-6-3wQP'
assert test_dssp_3 == reduce_dssp(test_dssp_8)

Function to score an alignment between two sequences of same length 

In [4]:
def per_residue_score(ref,comp):
    n = len(ref)
    assert n == len(comp)
    n_match = 0
    for i in range(0,n):
        if ref[i:i+1]==comp[i:i+1]:
            n_match += 1
    return n_match/(n+1e-32)

Run tests for residue by residue scoring function

In [5]:
testref=''
testcomp=''
assert per_residue_score(testref,testcomp)==0

testref  = 'MSQWNQVQQLEIKFLEQVDQFYDDNFPMEIRHLLAQWIETQDWEVASNNETMATILLQNLLIQLDEQLGRVSKEKNLLLIHNLKRIRKVLQGKFHGNPMHVAVVISNCLREERRILAAANMPIQ'
testcomp = 'GGSQWNQVQQLEIKFLEQVDQFYDDNFPMEIRHLLAQWIETQDWEVASNNETMATILLQNLLIQLDEQLGRVSKEKNLLLIHNLKRIRKVLQGKFHGNPMHVAVVISNCLREERRILAAANMPI'
assert per_residue_score(testref,testcomp) == 13/124

testref  = 'SQWNQVQQLEIKFLEQVDQFYDDNFPMEIRHLLAQWIETQDWEVASNNETMATILLQNLLIQLDEQLGRVSKEKNLLLIHNLKRIRKVLQGKFHGNPMHVAVVISNCLREERRILAAANMPI'
testcomp = 'SQWNQVQQLEIKFLEQVDQFYDDNFPMEIRHLLAQWIETQDWEVASNNETMATILLQNLLIQLDEQLGRVSKEKNLLLIHNLKRIRKVLQGKFHGNPMHVAVVISNCLREERRILAAANMPI'
assert per_residue_score(testref,testcomp) == 1

Function to align jpred and alphafold sequences

In [6]:
# left_align - alignment based on longest common subsequence
# assumption - no indels i.e. no gaps
# returns start and end indices to slice compstring with
# distance between start and end indices is exactly the length of refstring
# in practice 
# ... refstring is the jpred protein sequence,
# ... and compstring is the uniprot/alphafold sequence
# this function is used to trim uniprot sequence
# if threshold e.g. 0.9 is not met, then start and end indices of 0 and -1 is returned

def left_align(refstring,compstring,threshold):
    assert threshold <= 1
    lref = len(refstring) 
    lcomp = len(compstring)
    assert lref <= lcomp
    lsearch = lcomp - lref + 1 # how many amino acid is stringc longer than stringb by
    start,end = 0, lref # index to slice comparator string 
    best_overlap = per_residue_score(refstring,compstring[start:end]) # alignment start from N terminus
    best_offset = 0
    for offset in range(0,lsearch):
        # compare percentage similarity
        assert end+offset <= lcomp
        test_window = compstring[start+offset:end+offset]
        test_overlap = per_residue_score(refstring, test_window)
        if test_overlap > best_overlap:
            best_overlap = test_overlap
            best_offset = offset
        else:
            pass
    
    # sanity check on best_offset by using it to slice compstring
    best_window = compstring[best_offset:best_offset + lref]
    # check we indeed have the best answer (probably unecessary but I'm not a computer scientist)
    assert per_residue_score(best_window, refstring) == best_overlap
    
    if best_overlap >= threshold: 
        return best_offset, best_overlap
    else:
        # threshold is not met
        return -1, best_overlap

Run tests for alignment function

In [7]:
assert left_align('','',0)[0] == 0

assert left_align('','',0)[1] == 0

assert left_align('ALIGNMENT','---ALIGNMENT',0)==(3,1)

assert left_align('+ALIGNMENT','---ALIGNMENT',0)==(2,0.9)

assert left_align('++ALIGNMENT','-ALIGNMENT--',0)==(0,0.0)

assert left_align('ALIGAMENTS','ALIGNMENTS',0)==(0,0.9)

assert left_align('AILMENTS','ALIGNMENTS',0)==(2,0.625)

assert left_align('NKKPVNSWTCEDFLAVDESFQPTAVGFAEALNNKDKPEDAVLDVQGIATVTPAIVQACTQDKQANFKDKVKGEWDKIKK',
                 'MKKVLGVILGGLLLLPVVSNAADAQKAADNKKPVNSWTCEDFLAVDESFQPTAVGFAEALNNKDKPEDAVLDVQGIATVTPAIVQACTQDKQANFKDKVKGEWDKIKKDM',
                 1)==(29,1)

assert left_align('GSASPTPPYLKWAESLHSLLDDQDGISLFRTFLKQEGCADLLDFWFACTGFRKLEPCDSNEEKRLKLARAIYRKYILDNNGIVSRQTKPATKSFIKGCIMKQLIDPAMFDQAQTEIQATMEENTYPSFLKSDIYLEYTRTGSESPKV',
                 'MNIQEQGFPLDLGASFTEDAPRPPVPGEEGELVSTDPRPASYSFCSGKGVGIKGETSTATPRRSDLDLGYEPEGSASPTPPYLKWAESLHSLLDDQDGISLFRTFLKQEGCADLLDFWFACTGFRKLEPCDSNEEKRLKLARAIYRKYILDNNGIVSRQTKPATKSFIKGCIMKQLIDPAMFDQAQTEIQATMEENTYPSFLKSDIYLEYTRTGSESPKVCSDQSSGSGTGKGISGYLPTLNEDEEWKCDQDMDEDDGRDAAPPGRLPQKLLLETAAPRVSSSRRYSEGREFRYGSWREPVNPYYVNAGYALAPATSANDSEQQSLSSDADTLSLTDSSVDGIPPYRIRKQHRREMQESVQVNGRVPLPHIPRTYRVPKEVRVEPQKFAEELIHRLEAVQRTREAEEKLEERLKRVRMEEEGEDGDPSSGPPGPCHKLPPAPAWHHFPPRCVDMGCAGLRDAHEENPESILDEHVQRVLRTPGRQSPGPGHRSPDSGHVAKMPVALGGAASGHGKHVPKSGAKLDAAGLHHHRHVHHHVHHSTARPKEQVEAEATRRAQSSFAWGLEPHSHGARSRGYSESVGAAPNASDGLAHSGKVGVACKRNAKKAESGKSASTEVPGASEDAEKNQKIMQWIIEGEKEISRHRRTGHGSSGTRKPQPHENSRPLSLEHPWAGPQLRTSVQPSHLFIQDPTMPPHPAPNPLTQLEEARRRLEEEEKRASRAPSKQRYVQEVMRRGRACVRPACAPVLHVVPAVSDMELSETETRSQRKVGGGSAQPCDSIVVAYYFCGEPIPYRTLVRGRAVTLGQFKELLTKKGSYRYYFKKVSDEFDCGVVFEEVREDEAVLPVFEEKIIGKVEKVD',
                 1)==(73,1)

assert left_align('RIPTDPTMYRFYEMLQVYGTTLKALVHEKFGDGIISAINFKLDVKKVADPEGGERAVITLDGKYLPTKPF',
                 'MIQSQINRNIRLDLADAILLSKAKKDLSFAEIADGTGLAEAFVTAALLGQQALPADAARLVGAKLDLDEDSILLLQMIPLRGCIDDRIPTDPTMYRFYEMLQVYGTTLKALVHEKFGDGIISAINFKLDVKKVADPEGGERAVITLDGKYLPTKPF',
                 1)==(86,1)

assert left_align('NERNISRLWRAFRTVKEMVKDRGYFITQEEVELPLEDFKAKYCDSMGRPQRKMMSFQANPTEESISKFPDMGSLWVEFCDEPSVGVKTMKTFVIHIQEKNFQTGIFVYQNNITPSAMKLVPSIPPATIETFNEAALVVN',
                 'MDQENERNISRLWRAFRTVKEMVKDRGYFITQEEVELPLEDFKAKYCDSMGRPQRKMMSFQANPTEESISKFPDMGSLWVEFCDEPSVGVKTMKTFVIHIQEKNFQTGIFVYQNNITPSAMKLVPSIPPATIETFNEAALVVNITHHELVPKHIRLSSDEKRELLKRYRLKESQLPRIQRADPVALYLGLKRGEVVKIIRKSETSGRYASYRICM',
                 1)==(4,1)

assert left_align('SQWNQVQQLEIKFLEQVDQFYDDNFPMEIRHLLAQWIETQDWEVASNNETMATILLQNLLIQLDEQLGRVSKEKNLLLIHNLKRIRKVLQGKFHGNPMHVAVVISNCLREERRILAAANMPI',
                  'MSQWNQVQQLEIKFLEQVDQFYDDNFPMEIRHLLAQWIETQDWEVASNNETMATILLQNLLIQLDEQLGRVSKEKNLLLIHNLKRIRKVLQGKFHGNPMHVAVVISNCLREERRILAAANMPIQGPLEKSLQSSSVSERQRNVEHKVSAIKNSVQMTEQDTKYLEDLQDEFDYRYKTIQTMDQGDKNSILVNQEVLTLLQEMLNSLDFKRKEALSKMTQIVNETDLLMNSMLLEELQDWKKRQQIACIGGPLHNGLDQLQNCFTLLAESLFQLRQQLEKLQEQSTKMTYEGDPIPAQRAHLLERATFLIYNLFKNSFVVERQPCMPTHPQRPMVLKTLIQFTVKLRLLIKLPELNYQVKVKASIDKNVSTLSNRRFVLCGTHVKAMSSEESSNGSLSVEFRHLQPKEMKCSTGSKGNEGCHMVTEELHSITFETQICLYGLTINLETSSLPVVMISNVSQLPNAWASIIWYNVSTNDSQNLVFFNNPPSVTLGQLLEVMSWQFSSYVGRGLNSEQLNMLAEKLTVQSNYNDGHLTWAKFCKEHLPGKTFTFWTWLEAILDLIKKHILPLWIDGYIMGFVSKEKERLLLKDKMPGTFLLRFSESHLGGITFTWVDQSENGEVRFHSVEPYNKGRLSALAFADILRDYKVIMAENIPENPLKYLYPDIPKDKAFGKHYSSQPCEVSRPTERGDKGYVPSVFIPISTIRSDSTEPQSPSDLLPMSPSAYAVLRENLSPTTIETAMNSPYSAE',
                  1)==(1,1)

assert left_align('GGSQWNQVQQLEIKFLEQVDQFYDDNFPMEIRHLLAQWIETQDWEVASNNETMATILLQNLLIQLDEQLGRVSKEKNLLLIHNLKRIRKVLQGKFHGNPMHVAVVISNCLREERRILAAANMPI',
                  'MSQWNQVQQLEIKFLEQVDQFYDDNFPMEIRHLLAQWIETQDWEVASNNETMATILLQNLLIQLDEQLGRVSKEKNLLLIHNLKRIRKVLQGKFHGNPMHVAVVISNCLREERRILAAANMPIQGPLEKSLQSSSVSERQRNVEHKVSAIKNSVQMTEQDTKYLEDLQDEFDYRYKTIQTMDQGDKNSILVNQEVLTLLQEMLNSLDFKRKEALSKMTQIVNETDLLMNSMLLEELQDWKKRQQIACIGGPLHNGLDQLQNCFTLLAESLFQLRQQLEKLQEQSTKMTYEGDPIPAQRAHLLERATFLIYNLFKNSFVVERQPCMPTHPQRPMVLKTLIQFTVKLRLLIKLPELNYQVKVKASIDKNVSTLSNRRFVLCGTHVKAMSSEESSNGSLSVEFRHLQPKEMKCSTGSKGNEGCHMVTEELHSITFETQICLYGLTINLETSSLPVVMISNVSQLPNAWASIIWYNVSTNDSQNLVFFNNPPSVTLGQLLEVMSWQFSSYVGRGLNSEQLNMLAEKLTVQSNYNDGHLTWAKFCKEHLPGKTFTFWTWLEAILDLIKKHILPLWIDGYIMGFVSKEKERLLLKDKMPGTFLLRFSESHLGGITFTWVDQSENGEVRFHSVEPYNKGRLSALAFADILRDYKVIMAENIPENPLKYLYPDIPKDKAFGKHYSSQPCEVSRPTERGDKGYVPSVFIPISTIRSDSTEPQSPSDLLPMSPSAYAVLRENLSPTTIETAMNSPYSAE',
                  0)==(286,0.14516129032258066)

assert left_align('fDVC0%Nc-g^$a3NZQGT2hO_[SsxNsHSEOU3^sS*YP1HEdoQj9_RP/2aq)2lv-D£^£P*)M=0U&WB6yxc[-1a(SQt£hjj0-mmG&*h"',
                  'S!&-CbW3z5)n$m$=[zv[zA91O21ZeKe36-s-[xBy.dIQm3Tl%&X.-8cuLpOCYooC6Wz5]3!Jt262c/2aq)2lv-D£^£P*)(cNk=v£xi9F2_Un/Q5SOS6c',
                  0)==(16,0.03)

assert left_align('%FDDDSjnmgJ"=0ht#(N.Aa$QTvVm9-jnUUmLwABZh!y4Sv#v/!u9nsh5V3YatA2ADG$M4"f/X2KJEU7MXx%$09Qo4zN^VZ^8yf8a',
                  '7X1nq£Nvp*-^£LBxGufVm9-naQxYq/nI!&E(YqOv)AD3U=677EXekl4S£mC+&LBPHqtA2ADG$M43^xk&3W(q2L7]3j93Z9(erptrRnQ5AC%5G£BV"=#',
                  0)==(6,0.09)

Function to align a row by calling left_align and dealing with symmetry

In [8]:
# static function
def align_row(original_row, threshold=0.9):
    row = original_row.copy(deep=True)
    truncate = True
    lcs_length = pylcs.lcs(row.jpred_sequence, row.AF_sequence)
    if len(row.jpred_sequence) <= len(row.AF_sequence):
        AF_start, similarity = left_align(row.jpred_sequence, row.AF_sequence, threshold)
        
        if AF_start == -1:
            # could fail because the lcs could start at residue n
            # remove n resides from start of jpred sequence
            test_jpred_start = len(row.jpred_sequence) - lcs_length
            AF_start, similarity = left_align(row.jpred_sequence[test_jpred_start:], row.AF_sequence, threshold)
            if AF_start == -1:
                # no subsequence with similarity > threshold found
                truncate = False
            else:
                # truncate n terminus of jpred sequence (later)
                pass
        
        if truncate:
            # perform truncation
            # truncate N terminus of alphafold sequence to match N terminus of jpred sequence
            row.AF_sequence = row.AF_sequence[AF_start:]
            row.AFpred_8 = row.AFpred_8[AF_start:]
            row.AFpred_3 = row.AFpred_3[AF_start:]

            # truncate N terminus of jpred sequence
            # lcs length does not include single base substitutions
            # this can be measured by the edit distance
            # offset jpred_start by LCS length AND edit stance 
            n_substitutions = pylcs.edit_distance(row.AF_sequence)
            jpred_start = len(row.jpred_sequence) - lcs_length
            row.jpred_sequence = row.jpred_sequence[jpred_start:]
            row.jpred_3 = row.jpred_3[jpred_start:]

            # truncate C terminus of alphafold sequence
            AF_end = lcs_length + jpred_start
            row.AF_sequence = row.AF_sequence[:AF_end]
            row.AFpred_8 = row.AFpred_8[:AF_end]
            row.AFpred_3 = row.AFpred_3[:AF_end]

    else:
        # jpred sequence is longer than the alphafold sequence
        # so do exact same as above but call per_residue_align with arguments swapped
        jpred_start, similarity = left_align(row.AF_sequence, row.jpred_sequence, threshold)
        
        if jpred_start == -1:
            # could fail because the lcs could start at residue n
            # remove n residues from the jpred sequence
            test_AF_start = len(row.AF_sequence) - lcs_length
            jpred_start, similarity = left_align(row.AF_sequence[test_AF_start:], row.jpred_sequence, threshold)
            # try again
            if jpred_start == -1:
                # no subsequence with similarity > threshold found
                truncate = False
            else:
                # truncate n terminus of alphafold sequence (later)
                pass
            
        if truncate:
            
            # truncate N terminus of jpred sequence to match N terminus of alphafold sequence
            row.jpred_sequence = row.jpred_sequence[jpred_start:]
            row.jpred_3 = row.jpred_3[jpred_start:]
            
            # truncate N terminus of alphafold sequence
            AF_start = len(row.AF_sequence) - lcs_length
            row.AF_sequence = row.AF_sequence[AF_start:]
            row.AFpred_8 = row.AFpred_8[AF_start:]
            row.AFpred_3 = row.AFpred_3[AF_start:]
            
            # truncate C terminus of jpred sequence
            jpred_end = lcs_length + AF_start
            row.jpred_sequence = row.jpred_sequence[:jpred_end]
            row.jpred_3 = row.jpred_3[:jpred_end]
            
    assert similarity # should be at least 0
    row.loc['similarity'] = similarity
    
    # for debug
    # if no truncation was performed, it is because sequences don't align
    if not truncate:
        print(f'seqId {row.seqID}: best alignment has similarity below threshold', file=sys.stderr)
    
    return row

# Main routine

Load in input dataframe - already comes with 3 state dssp information

In [55]:
os.chdir('/cluster/gjb_lab/2472402')
debug_df = pd.read_csv('debug_table.csv')

Truncate input dataframe

In [56]:
%%time
threshold=0.0
l = []
for i,row in debug_df.iterrows():
    aligned_row = align_row(row,threshold)
    l.append(aligned_row)
debug_df_aligned = pd.concat(l,axis=1).T

CPU times: user 51.7 ms, sys: 2.22 ms, total: 53.9 ms
Wall time: 52.1 ms


## Print to summary table, this timed called debug.txt

Add full sequence information

In [57]:
for i,seqID in enumerate([24730,24739,24741,24800,24810,24813]):
    jpred_full = sum_df.loc[seqID]['jpred_sequence']
    AF_full = sum_df.loc[seqID]['AF_sequence']
    debug_df_aligned.loc[i,'jpred_full'] = jpred_full
    debug_df_aligned.loc[i,'AF_full'] = AF_full

In [58]:
with open('debug_threshold_%.2f.txt'%threshold,'w') as f:
    print('Only sequences with similarity > %.2f have been aligned' % threshold, file=f)
    for i,row in sum_df_aligned.iterrows():
        print('seqID          :',row.seqID,file=f)
        print('jpred_full     :',row.jpred_full,file=f)
        print('jpred_trunc    :',row.jpred_sequence,file=f)
        print('AF____full     :',row.AF_full,file=f)
        print('AF____trunc    :',row.AF_sequence,file=f)
        print('Similarity     :',row.similarity,file=f)
        print('',file=f)

Do not use

In [None]:
with open('debug_threshold_%.2f.txt'%threshold,'w') as f:
    print('Only sequences with similarity > %.2f have been aligned' % threshold, file=f)
    for i,row in sum_df_aligned.iterrows():
        print('seqID          :',row.seqID,file=f)
        print('domain         :',row.domain,file=f)
        print('UniprotId      :',row.uniprotId,file=f)
        print('PDBe_sequence  :',row.PDBe_sequence,file=f)
        print('DSSP_8         :',row.DSSP8,file=f)
        print('DSSP_3         :',row.DSSP3,file=f)
        print('jpred_sequence :',row.jpred_sequence,file=f)
        print('jpred_3        :',row.jpred_3,file=f)
        print('AF_sequence    :',row.AF_sequence,file=f)
        print('AFpred_8       :',row.AFpred_8,file=f)
        print('AFpred_3       :',row.AFpred_3,file=f)
        print('Similarity     :',row.similarity,file=f)
        print('',file=f)

# Debugging (new view)

In [69]:
debug_df_aligned = debug_df_aligned.set_index('seqID')
debug_df_aligned

Unnamed: 0_level_0,domain,uniprotId,PDBe_sequence,DSSP8,AFpred_8,AF_sequence,jpred_sequence,jpred_3,DSSP3,AFpred_3,jpred_full,AF_full,similarity
seqID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
24730,d1cfza_,P37182,MRILVLGVGNILLTDEAIGVRIVEALEQRYILPDYVEILDGGTAGM...,--EEEEEES-TTBGGGGHHHHHHHHHHHHEE--TTEEEEEEET--G...,--EEEEEES-TTBGGGGHHHHHHHHHHHHEE--TTEEEEEEET--G...,MRILVLGVGNILLTDEAIGVRIVEALEQRYILPDYVEILDGGTAGM...,ILVLGVGNILLTDEAIGVRIVEALEQRYILPDYVEILDGGTAGMEL...,EEEEEE---------HHHHHHHHHHHH------EEEEEE----HHH...,--EEEEEE----E----HHHHHHHHHHHHEE----EEEEEEE----...,--EEEEEE----E----HHHHHHHHHHHHEE----EEEEEEE----...,MRILVLGVGNILLTDEAIGVRIVEALEQRYILPDYVEILDGGTAGM...,MRILVLGVGNILLTDEAIGVRIVEALEQRYILPDYVEILDGGTAGM...,0.981481
24739,d1cuka1,P0A809,MIGRLRGIIIEKQPPLVLIEVGGVGYEVHMPMTCFYELPEAGQEAI...,--HHHHHHHHHHHHHT--HHHHHHHHHHS--SS--HHHHHHHHHHTT-,HHHHHHHHHHHHHHTT--HHHHHHHHHTT--TT--HHHHHHHHHHHT-,TDDAEQEAVAALVALGYKPQEASRMVSKIARPDASSETLIREALRAAL,DDAEQEAVARLVALGYKPQEASRMVSKIARPDASSETLIREALRAAL,--HHHHHHHHHHH----HHHHHHHHHHH------HHHHHHHHHHH--,--HHHHHHHHHHHHH---HHHHHHHHHH-------HHHHHHHHHH---,HHHHHHHHHHHHHH----HHHHHHHHH--------HHHHHHHHHHH--,TDDAEQEAVARLVALGYKPQEASRMVSKIARPDASSETLIREALRAAL,MIGRLRGIIIEKQPPLVLIEVGGVGYEVHMPMTCFYELPEAGQEAI...,0.979167
24741,d1cxzb_,P61586,SMAAIRKKLVIVGDVACGKTCLLIVFSKDQFPEVYVPTVFENYVAD...,--HHHHTT-TTS-TTSHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH...,IIHHHHHHH-TTS-EEEEEE-GGGGG-HHHHHHHHHTT-----HHH...,KWTPEVKHFCPNVPIILVGNKKDLRNDEHTRRELAKMKQEPVKPEE...,ENLRRATTDLGRSLGPVELLLRGSSRRLDLLHQQLQELHAHV,HHHHHHH---HHHHHHHHHHHHHHHHHHHHHHHHHHH-----,--HHHH----------HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH...,--HHHHHHH-----EEEEEE-------HHHHHHHHH-------HHH...,WSLLEQLGLAGADLAAPGVQQQLELERERLRREIRKELKLKEGAEN...,MAAIRKKLVIVGDGACGKTCLLIVFSKDQFPEVYVPTVFENYVADI...,0.139535
24800,d1fpoa2,P0A6L9,MDYFTLFGLPARYQLDTQALSLRFQDLQRQYHPDKFASGSQAEQLA...,--TT-SSS--S-HHHHHHHHHHHHHHHHHHHHT-HHHHHHHHHHHH...,--SSSTTS----HHHHHHHHHHHHHHHHHHHHT-HHHHHHHHHHHH...,FDLASEQHTVRDTAFLMEQLELREELDEIEQAKDEARLESFIKRVK...,DLASEQHTVRDTAFLMEQLELREELDEIEQAKDEARLESFIKRVKK...,-----------HHHHHHHHHHHHHHHHHH----HHHHHHHHHHHHH...,------------HHHHHHHHHHHHHHHHHHHH--HHHHHHHHHHHH...,------------HHHHHHHHHHHHHHHHHHHH--HHHHHHHHHHHH...,FDLASEQHTVRDTAFLMEQLELREELDEIEQAKDEARLESFIKRVK...,MDYFTLFGLPARYQLDTQALSLRFQDLQRQYHPDKFASGSQAEQLA...,0.989474
24810,d1g73a_,Q9NR28,AVPIAQKSEPHSLSSEALMRRAVSLVTDSTSTDLSQTTYALIEAIT...,-EE----------HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH...,----------SSHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH...,AVPIAQKSEPHSLSSEALMRRAVSLVTDSTSTFLSQTTYALIEAIT...,VPIAQKSEPHSLSSEALMRRAVSLVTDSTSTDLSQTTYALIEAITE...,-------------HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH...,-EE----------HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH...,------------HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH...,AVPIAQKSEPHSLSSEALMRRAVSLVTDSTSTDLSQTTYALIEAIT...,MAALKSWLSRSVTSFFRYRQCLCVPVVANFKKRCFSELIRPWHKTV...,0.993631
24813,d1g8qa_,P60033,FVNKDQIAKDVKQFYDQALQQAVVDDDANNAKAVVKTFHETLDCCG...,---HHHHHHHHHHHHHHHHHHHHH-TT-HHHHHHHHHHHHHHT--S...,HHTHHHHHHHHHHHHHHHHHHHHH-SS-HHHHHHHHHHHHHHT--S...,FVNKDQIAKDVKQFYDQALQQAVVDDDANNAKAVVKTFHETLDCCG...,VNKDQIAKDVKQFYDQALQQAVVDDDANNAKAVVKTFHETLDCCGS...,---HHHHHHHHHHHHHHHHHH-------HHHHHHHHHHHHH-----...,---HHHHHHHHHHHHHHHHHHHHH----HHHHHHHHHHHHHH----...,HH-HHHHHHHHHHHHHHHHHHHHH----HHHHHHHHHHHHHH----...,FVNKDQIAKDVKQFYDQALQQAVVDDDANNAKAVVKTFHETLDCCG...,MGVEGCTKCIKYLLFVFNFVFWLAGGVILGVALWLRHDPQTTNLLY...,0.988889


In [70]:
debug_df_aligned.index

Int64Index([24730, 24739, 24741, 24800, 24810, 24813], dtype='int64', name='seqID')

In [74]:
jpred_sequence = debug_df_aligned.loc[24800]['jpred_sequence']
AF_sequence = debug_df_aligned.loc[24800]['AF_sequence']
jpred_full = debug_df_aligned.loc[24800]['jpred_full']
AF_full = debug_df_aligned.loc[24800]['AF_full']

In [72]:
jpred_sequence

'DLASEQHTVRDTAFLMEQLELREELDEIEQAKDEARLESFIKRVKKMFDTRHQLMVEQLDNETWDAAADTCRKLRFLDKLRSSAEQLEEKLLDF'

In [73]:
AF_sequence

'FDLASEQHTVRDTAFLMEQLELREELDEIEQAKDEARLESFIKRVKKMFDTRHQLMVEQLDNETWDAAADTVRKLRFLDKLRSSAEQLEEKLLDF'

In [75]:
jpred_full

'FDLASEQHTVRDTAFLMEQLELREELDEIEQAKDEARLESFIKRVKKMFDTRHQLMVEQLDNETWDAAADTCRKLRFLDKLRSSAEQLEEKLLDF'

In [76]:
AF_full

'MDYFTLFGLPARYQLDTQALSLRFQDLQRQYHPDKFASGSQAEQLAAVQQSATINQAWQTLRHPLMRAEYLLSLHGFDLASEQHTVRDTAFLMEQLELREELDEIEQAKDEARLESFIKRVKKMFDTRHQLMVEQLDNETWDAAADTVRKLRFLDKLRSSAEQLEEKLLDF'

In [77]:
lcs_length = pylcs.lcs(jpred_sequence, AF_sequence)
lcs_length

93

In [78]:
len(jpred_sequence)<=len(AF_sequence)

True

In [85]:
AF_start, similarity = left_align(jpred_full,AF_full,0.9)

In [83]:
len(jpred_sequence)

94

In [86]:
AF_sequence = AF_full[AF_start:]

In [None]:
jpred_start = len(jpred_sequence) - lcs_length