## Aligning protein sequence of uniprot (AlphaFold) and SCOPe (jpred)

In [1]:
import pylcs
import os
import pandas as pd
import re
import sys

Function to convert 8 to 3 state dssp

In [2]:
def reduce_dssp(sec_string):
    sec_string = re.compile(r'[ST_bGIC?]').sub('-',sec_string) # convert all known non-sheet, non-helix structures to coil
    sec_string = re.compile(r'[B]').sub('E',sec_string) # convert B forms to E
    sec_string = re.compile(r'~[HE]').sub('-',sec_string) # convert everything not H or E to coil for backup
    return sec_string

Run tests for reduce_dssp

In [3]:
test_dssp_8 = '---------SEEEEEETTEEEETTS-EEEEEEEE-SS-EEEEEE-------S---HHHHTT-SS-SEEEEETTTEEEETTS-EEEEEEEEEETTEEEEEEEE----'
test_dssp_3 = '----------EEEEEE--EEEE----EEEEEEEE----EEEEEE-----------HHHH-------EEEEE---EEEE----EEEEEEEEEE--EEEEEEEE----'
assert test_dssp_3 == reduce_dssp(test_dssp_8)
test_dssp_8 = 'WQ0wZtjViB8XpKfQlD8I'
test_dssp_3 = 'WQ0wZtjViE8XpKfQlD8-'
assert test_dssp_3 == reduce_dssp(test_dssp_8)
test_dssp_8 = 'FF5tPEQqE4JnwD2Rob84'
test_dssp_3 = 'FF5tPEQqE4JnwD2Ro-84'
assert test_dssp_3 == reduce_dssp(test_dssp_8)
test_dssp_8 = 'SKREHZfXfgjwJGjdbyAm'
test_dssp_3 = '-KREHZfXfgjwJ-jd-yAm'
assert test_dssp_3 == reduce_dssp(test_dssp_8)
test_dssp_8 = 'fe3qTylkabnoewFxwp9K'
test_dssp_3 = 'fe3q-ylka-noewFxwp9K'
assert test_dssp_3 == reduce_dssp(test_dssp_8)
test_dssp_8 = 'yVGL1j00mrMIvb6G3wQP'
test_dssp_3 = 'yV-L1j00mrM-v-6-3wQP'
assert test_dssp_3 == reduce_dssp(test_dssp_8)

Function to align jpred and alphafold sequences

In [4]:
# lcs_align - alignment based on longest common subsequence
# assumption - no indels i.e. no gaps
# returns start and end indices to slice compstring with
# distance between start and end indices is exactly the length of refstring
# in practice 
# ... refstring is the jpred protein sequence,
# ... and compstring is the uniprot/alphafold sequence
# this function is used to trim uniprot sequence
# if threshold e.g. 0.9 is not met, then start and end indices of 0 and -1 is returned
def lcs_align(refstring, compstring, threshold):
    assert threshold <= 1
    lref = len(refstring) 
    lcomp = len(compstring)
    assert lref <= lcomp
    lsearch = lcomp - lref + 1 # how many amino acid is stringc longer than stringb by
    start,end = 0, lref # index to slice comparator string 
    best_overlap = pylcs.lcs(refstring,compstring[start:end]) # alignment start from N terminus
    best_offset = 0
    for offset in range(0,lsearch):
        # compare percentage similarity
        assert end+offset <= lcomp
        test_window = compstring[start+offset:end+offset]
        test_overlap = pylcs.lcs(refstring, test_window)
        if test_overlap > best_overlap:
            best_overlap = test_overlap
            best_offset = offset
        else:
            pass
    
    # sanity check on best_offset by using it to slice compstring
    best_window = compstring[best_offset:best_offset + lref]
    # check we indeed have the best answer (probably unecessary but I'm not a computer scientist)
    assert pylcs.lcs(best_window, refstring) == best_overlap
    
    if lref == 0:
        best_similarity = 0.0
    else:
        best_similarity = best_overlap / lref
    
    if best_overlap >= threshold * lref: # recompute so as to avoid dividing for more precision
        return best_offset, best_similarity
    else:
        # threshold is not met
        return -1, best_similarity

Run tests for lcs_align

In [5]:
# test 1 - compare nothing
assert lcs_align('','',1)[0] == 0
assert lcs_align('','',1)[1] == 0

# test 2 - compare simple strings
assert lcs_align('ABC','DABAABCAB',1)[0]==4
assert lcs_align('ABC','DABAABCAB',1)[1]==1.0

# test 3 - fail threshold
assert lcs_align('ABDDC','DABAABCAB',0.8)[0]==-1
assert lcs_align('ABDDC','DABAABCAB',0.8)[1]==0.6

# test 4 - compare normal length strings
assert lcs_align('NKKPVNSWTCEDFLAVDESFQPTAVGFAEALNNKDKPEDAVLDVQGIATVTPAIVQACTQDKQANFKDKVKGEWDKIKK',
                 'MKKVLGVILGGLLLLPVVSNAADAQKAADNKKPVNSWTCEDFLAVDESFQPTAVGFAEALNNKDKPEDAVLDVQGIATVTPAIVQACTQDKQANFKDKVKGEWDKIKKDM',
                 1)[0]==29
assert lcs_align('NKKPVNSWTCEDFLAVDESFQPTAVGFAEALNNKDKPEDAVLDVQGIATVTPAIVQACTQDKQANFKDKVKGEWDKIKK',
                 'MKKVLGVILGGLLLLPVVSNAADAQKAADNKKPVNSWTCEDFLAVDESFQPTAVGFAEALNNKDKPEDAVLDVQGIATVTPAIVQACTQDKQANFKDKVKGEWDKIKKDM',
                 1)[1]==1

# test 5 
assert lcs_align('GSASPTPPYLKWAESLHSLLDDQDGISLFRTFLKQEGCADLLDFWFACTGFRKLEPCDSNEEKRLKLARAIYRKYILDNNGIVSRQTKPATKSFIKGCIMKQLIDPAMFDQAQTEIQATMEENTYPSFLKSDIYLEYTRTGSESPKV',
                 'MNIQEQGFPLDLGASFTEDAPRPPVPGEEGELVSTDPRPASYSFCSGKGVGIKGETSTATPRRSDLDLGYEPEGSASPTPPYLKWAESLHSLLDDQDGISLFRTFLKQEGCADLLDFWFACTGFRKLEPCDSNEEKRLKLARAIYRKYILDNNGIVSRQTKPATKSFIKGCIMKQLIDPAMFDQAQTEIQATMEENTYPSFLKSDIYLEYTRTGSESPKVCSDQSSGSGTGKGISGYLPTLNEDEEWKCDQDMDEDDGRDAAPPGRLPQKLLLETAAPRVSSSRRYSEGREFRYGSWREPVNPYYVNAGYALAPATSANDSEQQSLSSDADTLSLTDSSVDGIPPYRIRKQHRREMQESVQVNGRVPLPHIPRTYRVPKEVRVEPQKFAEELIHRLEAVQRTREAEEKLEERLKRVRMEEEGEDGDPSSGPPGPCHKLPPAPAWHHFPPRCVDMGCAGLRDAHEENPESILDEHVQRVLRTPGRQSPGPGHRSPDSGHVAKMPVALGGAASGHGKHVPKSGAKLDAAGLHHHRHVHHHVHHSTARPKEQVEAEATRRAQSSFAWGLEPHSHGARSRGYSESVGAAPNASDGLAHSGKVGVACKRNAKKAESGKSASTEVPGASEDAEKNQKIMQWIIEGEKEISRHRRTGHGSSGTRKPQPHENSRPLSLEHPWAGPQLRTSVQPSHLFIQDPTMPPHPAPNPLTQLEEARRRLEEEEKRASRAPSKQRYVQEVMRRGRACVRPACAPVLHVVPAVSDMELSETETRSQRKVGGGSAQPCDSIVVAYYFCGEPIPYRTLVRGRAVTLGQFKELLTKKGSYRYYFKKVSDEFDCGVVFEEVREDEAVLPVFEEKIIGKVEKVD',
                 1)[0]==73
assert lcs_align('GSASPTPPYLKWAESLHSLLDDQDGISLFRTFLKQEGCADLLDFWFACTGFRKLEPCDSNEEKRLKLARAIYRKYILDNNGIVSRQTKPATKSFIKGCIMKQLIDPAMFDQAQTEIQATMEENTYPSFLKSDIYLEYTRTGSESPKV',
                 'MNIQEQGFPLDLGASFTEDAPRPPVPGEEGELVSTDPRPASYSFCSGKGVGIKGETSTATPRRSDLDLGYEPEGSASPTPPYLKWAESLHSLLDDQDGISLFRTFLKQEGCADLLDFWFACTGFRKLEPCDSNEEKRLKLARAIYRKYILDNNGIVSRQTKPATKSFIKGCIMKQLIDPAMFDQAQTEIQATMEENTYPSFLKSDIYLEYTRTGSESPKVCSDQSSGSGTGKGISGYLPTLNEDEEWKCDQDMDEDDGRDAAPPGRLPQKLLLETAAPRVSSSRRYSEGREFRYGSWREPVNPYYVNAGYALAPATSANDSEQQSLSSDADTLSLTDSSVDGIPPYRIRKQHRREMQESVQVNGRVPLPHIPRTYRVPKEVRVEPQKFAEELIHRLEAVQRTREAEEKLEERLKRVRMEEEGEDGDPSSGPPGPCHKLPPAPAWHHFPPRCVDMGCAGLRDAHEENPESILDEHVQRVLRTPGRQSPGPGHRSPDSGHVAKMPVALGGAASGHGKHVPKSGAKLDAAGLHHHRHVHHHVHHSTARPKEQVEAEATRRAQSSFAWGLEPHSHGARSRGYSESVGAAPNASDGLAHSGKVGVACKRNAKKAESGKSASTEVPGASEDAEKNQKIMQWIIEGEKEISRHRRTGHGSSGTRKPQPHENSRPLSLEHPWAGPQLRTSVQPSHLFIQDPTMPPHPAPNPLTQLEEARRRLEEEEKRASRAPSKQRYVQEVMRRGRACVRPACAPVLHVVPAVSDMELSETETRSQRKVGGGSAQPCDSIVVAYYFCGEPIPYRTLVRGRAVTLGQFKELLTKKGSYRYYFKKVSDEFDCGVVFEEVREDEAVLPVFEEKIIGKVEKVD',
                 1)[1]==1

# test 6 
# fixed: lsearch = lcomp - lref +1 
assert lcs_align('RIPTDPTMYRFYEMLQVYGTTLKALVHEKFGDGIISAINFKLDVKKVADPEGGERAVITLDGKYLPTKPF',
                 'MIQSQINRNIRLDLADAILLSKAKKDLSFAEIADGTGLAEAFVTAALLGQQALPADAARLVGAKLDLDEDSILLLQMIPLRGCIDDRIPTDPTMYRFYEMLQVYGTTLKALVHEKFGDGIISAINFKLDVKKVADPEGGERAVITLDGKYLPTKPF',
                 1)[0]==86
assert lcs_align('RIPTDPTMYRFYEMLQVYGTTLKALVHEKFGDGIISAINFKLDVKKVADPEGGERAVITLDGKYLPTKPF',
                 'MIQSQINRNIRLDLADAILLSKAKKDLSFAEIADGTGLAEAFVTAALLGQQALPADAARLVGAKLDLDEDSILLLQMIPLRGCIDDRIPTDPTMYRFYEMLQVYGTTLKALVHEKFGDGIISAINFKLDVKKVADPEGGERAVITLDGKYLPTKPF',
                 1)[1]==1

# test 7
assert lcs_align('NERNISRLWRAFRTVKEMVKDRGYFITQEEVELPLEDFKAKYCDSMGRPQRKMMSFQANPTEESISKFPDMGSLWVEFCDEPSVGVKTMKTFVIHIQEKNFQTGIFVYQNNITPSAMKLVPSIPPATIETFNEAALVVN',
                 'MDQENERNISRLWRAFRTVKEMVKDRGYFITQEEVELPLEDFKAKYCDSMGRPQRKMMSFQANPTEESISKFPDMGSLWVEFCDEPSVGVKTMKTFVIHIQEKNFQTGIFVYQNNITPSAMKLVPSIPPATIETFNEAALVVNITHHELVPKHIRLSSDEKRELLKRYRLKESQLPRIQRADPVALYLGLKRGEVVKIIRKSETSGRYASYRICM',
                 1)[0]==4
assert lcs_align('NERNISRLWRAFRTVKEMVKDRGYFITQEEVELPLEDFKAKYCDSMGRPQRKMMSFQANPTEESISKFPDMGSLWVEFCDEPSVGVKTMKTFVIHIQEKNFQTGIFVYQNNITPSAMKLVPSIPPATIETFNEAALVVN',
                 'MDQENERNISRLWRAFRTVKEMVKDRGYFITQEEVELPLEDFKAKYCDSMGRPQRKMMSFQANPTEESISKFPDMGSLWVEFCDEPSVGVKTMKTFVIHIQEKNFQTGIFVYQNNITPSAMKLVPSIPPATIETFNEAALVVNITHHELVPKHIRLSSDEKRELLKRYRLKESQLPRIQRADPVALYLGLKRGEVVKIIRKSETSGRYASYRICM',
                 1)[1]==1

Load in input array

In [6]:
os.chdir('/cluster/gjb_lab/2472402')
sum_df = pd.read_csv('summary_table.csv')
sum_df.head()

Unnamed: 0,seqID,domain,uniprotId,PDBe_sequence,DSSP8,AFpred_8,AF_sequence,jpred_sequence,jpred_3,DSSP3
0,24695,d1a12a_,P18754,RRSPPADAIPKSKKVKVSHRSHSTEPGLVLTLGQGDVGQLGLGENV...,-------TT-----BEEEEEEE-TTSTT-S-TT--EEEEEEEE--S...,-------------S-------------TT-----BEEEEEEE-TTS...,MSPKRIAKRRSPPADAIPKSKKVKVSHRSHSTEPGLVLTLGQGDVG...,KKVKVSHRSHSTEPGLVLTLGQGDVGQLGLGENVMERKKPALVSIP...,-----EEEEEEE----EEEEE-------------------EEE---...,--------------EEEEEEEE-------------EEEEEEEE---...
1,24696,d1a1xa_,P56278,GSAGEDVGAPPDHLWVHQEGIYRDEYQRTWVAVVEEETSFLRARVQ...,---------SEEEEEETTEEEETTS-EEEEEEEE-SS-EEEEEE--...,-------S--SEEEEEETTEEEETT--EEEEEEEEETTEEEEEEE-...,MAGEDVGAPPDHLWVHQEGIYRDEYQRTWVAVVEEETSFLRARVQQ...,AGEDVGAPPDHLWVHQEGIYRDEYQRTWVAVVEEETSFLRARVQQI...,----------EEEEE---EEEE----EEEEEEEE----EEEEEEE-...,----------EEEEEE--EEEE----EEEEEEEE----EEEEEE--...
2,24697,d1a62a1,P0AG30,*NLTELKNTPVSELITLGEN*GLENLAR*RKQDIIFAILKQHAKSG...,-BHHHHHTS-HHHHHHHHHTTT----TTS-HHHHHHHHHHHHHHTT-,-BHHHHHHS-HHHHHHHHHHTT--SGGGS-HHHHHHHHHHHHHHTT...,MNLTELKNTPVSELITLGENMGLENLARMRKQDIIFAILKQHAKSG...,MNLTELKNTPVSELITLGENMGLENLARMRKQDIIFAILKQHAKSGE,--HHH-----HHHHHHHHHH----------HHHHHHHHHHHHHH---,-EHHHHH---HHHHHHHHH-----------HHHHHHHHHHHHHH---
3,24706,d1aiea_,P04637,EYFTLQIRGRERFEMFRELNEALELKDAQAG,--------SHHHHHHHHHHHHHHHHHHHH--,---------------TTTHHHHHHTS-S--S---------------...,MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLS...,EYFTLQIRGRERFEMFRELNEALELKDAQAG,-EEEEEEE--HHHHHHHHHHHHHHHHH----,---------HHHHHHHHHHHHHHHHHHHH--
4,24713,d1b79a_,P0ACB0,MASHMERDPQVAGLKVPPHSIEAEQSVLGGLMLDNERWDDVAERVV...,---HHHHHHHHHHHHHH-GGGHHHHHTT--GGGSSSHHHHHHHHHH...,------------------TTTTTS------HHHHHHHHHHHHH-GG...,MAGNKPFNKQQAEPRERDPQVAGLKVPPHSIEAEQSVLGGLMLDNE...,PPHSIEAEQSVLGGLMLDNERWDDVAERVVADDFYTRPHRHIFTEM...,----HHHHHHHHHHHH----HHHHHHHHH----HHHHHHHHHHHHH...,---HHHHHHHHHHHHHH----HHHHH----------HHHHHHHHHH...


Run code below to initialize/reset the input for in-place truncation subroutine

In [113]:
print('resetting input dataframe')
sum_df_trunc = sum_df.copy(deep=True)

resetting input dataframe


Run the code below for in-place truncation of jpred/alphafold sequences

In [114]:
# threshold for accepting sequences as capable of aligning to each other
# defined terms of length of longest common subsequence (IMPT: NOT substring) divided by length of the shorter sequence
threshold = 0.5

In [115]:
for i,row in sum_df_trunc.iterrows():
    sequences_do_not_align = False
    # order of arguments in lcs_align has to be len(arg1) < len(arg2)
    # that is the reason for if-else handling 
    if len(row.jpred_sequence) <= len(row.AF_sequence):
        AF_start, similarity = lcs_align(row.jpred_sequence, row.AF_sequence, threshold)
        if AF_start == -1:
            sequences_do_not_align = True
        else:
            # truncate N terminus of alphafold sequence to match N terminus of jpred sequence
            sum_df_trunc.loc[i,'AF_sequence'] = sum_df_trunc.loc[i,'AF_sequence'][AF_start:]
            sum_df_trunc.loc[i,'AFpred_8'] = sum_df_trunc.loc[i,'AFpred_8'][AF_start:]
            
            # truncate N terminus of jpred sequence
            overlap_length = pylcs.lcs(row.jpred_sequence, row.AF_sequence)
            jpred_start = len(row.jpred_sequence) - overlap_length
            sum_df_trunc.loc[i,'jpred_sequence'] = sum_df_trunc.loc[i,'AF_sequence'][jpred_start:]
            sum_df_trunc.loc[i,'jpred_3'] = reduce_dssp(sum_df_trunc.loc[i,'AFpred_8'])
            
            # truncate C terminus of alphafold sequence
            AF_end = AF_start + overlap_length + jpred_start
            sum_df_trunc.loc[i,'AF_sequence'] = sum_df_trunc.loc[i,'AF_sequence'][:AF_end]
            sum_df_trunc.loc[i,'AFpred_8'] = sum_df_trunc.loc[i,'AFpred_8'][:AF_end]
            
            # add column consisting of AF pred 3 state version
            sum_df_trunc.loc[i,'AFpred_3'] = reduce_dssp(sum_df_trunc.loc[i,'AFpred_8'])

    else:
        # jpred sequence is longer than the alphafold sequence
        jpred_start, similarity = lcs_align(row.AF_sequence, row.jpred_sequence, threshold)
        if jpred_start == -1:
            sequences_do_not_align = True
        else:
            # truncate N terminus of jpred sequence to match N terminus of alphafold sequence
            sum_df_trunc.loc[i,'jpred_sequence'] = sum_df_trunc.loc[i,'jpred_sequence'][jpred_start:]
            sum_df_trunc.loc[i,'jpred_3'] = sum_df_trunc.loc[i,'jpred_3'][jpred_start:]
            
            # truncate N terminus of alphafold sequence
            overlap_length = pylcs.lcs(row.AF_sequence, row.jpred_sequence)
            AF_start = len(row.AF_sequence) - overlap_length
            sum_df_trunc.loc[i,'AF_sequence'] = sum_df_trunc.loc[i,'AF_sequence'][AF_start:]
            sum_df_trunc.loc[i,'AFpred_8'] = sum_df_trunc.loc[i,'AFpred_8'][AF_start:]
            
            # truncate C terminus of jpred sequence
            jpred_end = jpred_start + overlap_length + AF_start
            sum_df_trunc.loc[i,'jpred_sequence'] = sum_df_trunc.loc[i,'jpred_sequence'][:jpred_end]
            sum_df_trunc.loc[i,'jpred_3'] = sum_df_trunc.loc[i,'jpred_3'][:jpred_end]
            
            # add column with AF pred 3 state version
            sum_df_trunc.loc[i,'AFpred_3'] = reduce_dssp(sum_df_trunc.loc[i,'AFpred_8'])
    
    assert similarity # should be at least 0
    sum_df_trunc.loc[i,'similarity'] = similarity
    
    # for debug
    if sequences_do_not_align:
        print(f'row {i} / pdbId {row.domain}: sequences do not align', file=sys.stderr)

row 14 / pdbId d1cxzb_: sequences do not align
row 23 / pdbId d1dtdb_: sequences do not align
row 31 / pdbId d1em8b_: sequences do not align
row 44 / pdbId d1fm0e_: sequences do not align
row 58 / pdbId d1gzsb_: sequences do not align
row 104 / pdbId d1lm8v_: sequences do not align
row 107 / pdbId d1m1eb_: sequences do not align
row 114 / pdbId d1nh2b_: sequences do not align
row 115 / pdbId d1nh2c_: sequences do not align
row 126 / pdbId d1oc0b_: sequences do not align
row 135 / pdbId d1or7c_: sequences do not align
row 159 / pdbId d1rk8c_: sequences do not align
row 173 / pdbId d1syxb_: sequences do not align
row 183 / pdbId d1twfc2: sequences do not align
row 184 / pdbId d1twff_: sequences do not align
row 185 / pdbId d1twfj_: sequences do not align
row 195 / pdbId d1uptb_: sequences do not align
row 196 / pdbId d1us7b_: sequences do not align
row 197 / pdbId d1usub_: sequences do not align
row 224 / pdbId d1xu1r_: sequences do not align
row 226 / pdbId d1y5ic1: sequences do not ali

## Print to summary table, this timed called debug.txt

In [122]:
with open('debug.txt','w') as f:
    print('Only sequences with similarity > %.2f have been aligned' % threshold, file=f)
    for i,row in sum_df_trunc.iterrows():
        print('seqID          :',row.seqID,file=f)
        print('domain         :',row.domain,file=f)
        print('UniprotId      :',row.uniprotId,file=f)
        print('PDBe_sequence  :',row.PDBe_sequence,file=f)
        print('jpred_sequence :',row.jpred_sequence,file=f)
        print('DSSP_8         :',row.DSSP8,file=f)
        print('DSSP_3         :',row.DSSP3,file=f)
        print('jpred_3        :',row.jpred_3,file=f)
        print('AF_sequence    :',row.AF_sequence,file=f)
        print('AFpred_8       :',row.AFpred_8,file=f)
        print('AFpred_3       :',row.AFpred_3,file=f)
        print('AFpred_3       :',row.AFpred_3,file=f)
        print('Similarity     :',row.similarity,file=f)
        print('',file=f)

# Retry debugging

Debug for seqID 24716

In [7]:
debug_df = sum_df[sum_df['seqID']==24716] # problematic one is 24716
debug_df

Unnamed: 0,seqID,domain,uniprotId,PDBe_sequence,DSSP8,AFpred_8,AF_sequence,jpred_sequence,jpred_3,DSSP3
6,24716,d1bgfa_,P42228,GGSQWNQVQQLEIKFLEQVDQFYDDNFPMEIRHLLAQWIETQDWEV...,---HHHHHHHS-GGGHHHHGGG-BTTB-HHHHHHTHHHHHHS-HHH...,--HHHHHHTS-HHHHHHHHTT--TTS-HHHHHHTHHHHHHS-HHHH...,MSQWNQVQQLEIKFLEQVDQFYDDNFPMEIRHLLAQWIETQDWEVA...,GGSQWNQVQQLEIKFLEQVDQFYDDNFPMEIRHLLAQWIETQDWEV...,-HHHHHHHH----HHHHHHHHH------HHHHHHHH----------...,---HHHHHHH-----HHHH----E--E-HHHHHH-HHHHHH--HHH...


In [8]:
def per_residue_score(ref,comp):
    n = len(ref)
    assert n == len(comp)
    n_match = 0
    for i in range(0,n):
        if ref[i:i+1]==comp[i:i+1]:
            n_match += 1
    return n_match/(n+1e-32)

In [9]:
testref=''
testcomp=''
per_residue_score(testref,testcomp)

0.0

In [10]:
testref  = 'MSQWNQVQQLEIKFLEQVDQFYDDNFPMEIRHLLAQWIETQDWEVASNNETMATILLQNLLIQLDEQLGRVSKEKNLLLIHNLKRIRKVLQGKFHGNPMHVAVVISNCLREERRILAAANMPIQ'
testcomp = 'GGSQWNQVQQLEIKFLEQVDQFYDDNFPMEIRHLLAQWIETQDWEVASNNETMATILLQNLLIQLDEQLGRVSKEKNLLLIHNLKRIRKVLQGKFHGNPMHVAVVISNCLREERRILAAANMPI'
per_residue_score(testref,testcomp)

0.10483870967741936

In [11]:
testref  = 'SQWNQVQQLEIKFLEQVDQFYDDNFPMEIRHLLAQWIETQDWEVASNNETMATILLQNLLIQLDEQLGRVSKEKNLLLIHNLKRIRKVLQGKFHGNPMHVAVVISNCLREERRILAAANMPI'
testcomp = 'SQWNQVQQLEIKFLEQVDQFYDDNFPMEIRHLLAQWIETQDWEVASNNETMATILLQNLLIQLDEQLGRVSKEKNLLLIHNLKRIRKVLQGKFHGNPMHVAVVISNCLREERRILAAANMPI'
per_residue_score(testref,testcomp)

1.0

In [24]:
def per_residue_align(refstring,compstring,threshold):
    assert threshold <= 1
    lref = len(refstring) 
    lcomp = len(compstring)
    assert lref <= lcomp
    lsearch = lcomp - lref + 1 # how many amino acid is stringc longer than stringb by
    start,end = 0, lref # index to slice comparator string 
    best_overlap = per_residue_score(refstring,compstring[start:end]) # alignment start from N terminus
    best_offset = 0
    for offset in range(0,lsearch):
        # compare percentage similarity
        assert end+offset <= lcomp
        test_window = compstring[start+offset:end+offset]
        test_overlap = per_residue_score(refstring, test_window)
        if test_overlap > best_overlap:
            best_overlap = test_overlap
            best_offset = offset
        else:
            pass
    
    # sanity check on best_offset by using it to slice compstring
    best_window = compstring[best_offset:best_offset + lref]
    # check we indeed have the best answer (probably unecessary but I'm not a computer scientist)
    assert per_residue_score(best_window, refstring) == best_overlap
    
    if best_overlap >= threshold: 
        return best_offset, best_overlap
    else:
        # threshold is not met
        return -1, best_overlap

In [13]:
testref = 'GGSQWNQVQQLEIKFLEQVDQFYDDNFPMEIRHLLAQWIETQDWEVASNNETMATILLQNLLIQLDEQLGRVSKEKNLLLIHNLKRIRKVLQGKFHGNPMHVAVVISNCLREERRILAAANMPI'
testcomp  = 'MSQWNQVQQLEIKFLEQVDQFYDDNFPMEIRHLLAQWIETQDWEVASNNETMATILLQNLLIQLDEQLGRVSKEKNLLLIHNLKRIRKVLQGKFHGNPMHVAVVISNCLREERRILAAANMPIQGPLEKSLQSSSVSERQRNVEHKVSAIKNSVQMTEQDTKYLEDLQDEFDYRYKTIQTMDQGDKNSILVNQEVLTLLQEMLNSLDFKRKEALSKMTQIVNETDLLMNSMLLEELQDWKKRQQIACIGGPLHNGLDQLQNCFTLLAESLFQLRQQLEKLQEQSTKMTYEGDPIPAQRAHLLERATFLIYNLFKNSFVVERQPCMPTHPQRPMVLKTLIQFTVKLRLLIKLPELNYQVKVKASIDKNVSTLSNRRFVLCGTHVKAMSSEESSNGSLSVEFRHLQPKEMKCSTGSKGNEGCHMVTEELHSITFETQICLYGLTINLETSSLPVVMISNVSQLPNAWASIIWYNVSTNDSQNLVFFNNPPSVTLGQLLEVMSWQFSSYVGRGLNSEQLNMLAEKLTVQSNYNDGHLTWAKFCKEHLPGKTFTFWTWLEAILDLIKKHILPLWIDGYIMGFVSKEKERLLLKDKMPGTFLLRFSESHLGGITFTWVDQSENGEVRFHSVEPYNKGRLSALAFADILRDYKVIMAENIPENPLKYLYPDIPKDKAFGKHYSSQPCEVSRPTERGDKGYVPSVFIPISTIRSDSTEPQSPSDLLPMSPSAYAVLRENLSPTTIETAMNSPYSAE'
per_residue_align(testref,testcomp,threshold=0.9)

(-1, 0.14516129032258066)

In [28]:
testref = 'GGSQWNQVQQLEIKFLEQVDQFYDDNFPMEIRHLLAQWIETQDWEVASNNETMATILLQNLLIQLDEQLGRVSKEKNLLLIHNLKRIRKVLQGKFHGNPMHVAVVISNCLREERRILAAANMPI'
testcomp  = 'MSQWNQVQQLEIKFLEQVDQFYDDNFPMEIRHLLAQWIETQDWEVASNNETMATILLQNLLIQLDEQLGRVSKEKNLLLIHNLKRIRKVLQGKFHGNPMHVAVVISNCLREERRILAAANMPIQGPLEKSLQSSSVSERQRNVEHKVSAIKNSVQMTEQDTKYLEDLQDEFDYRYKTIQTMDQGDKNSILVNQEVLTLLQEMLNSLDFKRKEALSKMTQIVNETDLLMNSMLLEELQDWKKRQQIACIGGPLHNGLDQLQNCFTLLAESLFQLRQQLEKLQEQSTKMTYEGDPIPAQRAHLLERATFLIYNLFKNSFVVERQPCMPTHPQRPMVLKTLIQFTVKLRLLIKLPELNYQVKVKASIDKNVSTLSNRRFVLCGTHVKAMSSEESSNGSLSVEFRHLQPKEMKCSTGSKGNEGCHMVTEELHSITFETQICLYGLTINLETSSLPVVMISNVSQLPNAWASIIWYNVSTNDSQNLVFFNNPPSVTLGQLLEVMSWQFSSYVGRGLNSEQLNMLAEKLTVQSNYNDGHLTWAKFCKEHLPGKTFTFWTWLEAILDLIKKHILPLWIDGYIMGFVSKEKERLLLKDKMPGTFLLRFSESHLGGITFTWVDQSENGEVRFHSVEPYNKGRLSALAFADILRDYKVIMAENIPENPLKYLYPDIPKDKAFGKHYSSQPCEVSRPTERGDKGYVPSVFIPISTIRSDSTEPQSPSDLLPMSPSAYAVLRENLSPTTIETAMNSPYSAE'
pylcs.lcs(testcomp,testref)

122

In [26]:
testref = 'SQWNQVQQLEIKFLEQVDQFYDDNFPMEIRHLLAQWIETQDWEVASNNETMATILLQNLLIQLDEQLGRVSKEKNLLLIHNLKRIRKVLQGKFHGNPMHVAVVISNCLREERRILAAANMPI'
testcomp  = 'MSQWNQVQQLEIKFLEQVDQFYDDNFPMEIRHLLAQWIETQDWEVASNNETMATILLQNLLIQLDEQLGRVSKEKNLLLIHNLKRIRKVLQGKFHGNPMHVAVVISNCLREERRILAAANMPIQGPLEKSLQSSSVSERQRNVEHKVSAIKNSVQMTEQDTKYLEDLQDEFDYRYKTIQTMDQGDKNSILVNQEVLTLLQEMLNSLDFKRKEALSKMTQIVNETDLLMNSMLLEELQDWKKRQQIACIGGPLHNGLDQLQNCFTLLAESLFQLRQQLEKLQEQSTKMTYEGDPIPAQRAHLLERATFLIYNLFKNSFVVERQPCMPTHPQRPMVLKTLIQFTVKLRLLIKLPELNYQVKVKASIDKNVSTLSNRRFVLCGTHVKAMSSEESSNGSLSVEFRHLQPKEMKCSTGSKGNEGCHMVTEELHSITFETQICLYGLTINLETSSLPVVMISNVSQLPNAWASIIWYNVSTNDSQNLVFFNNPPSVTLGQLLEVMSWQFSSYVGRGLNSEQLNMLAEKLTVQSNYNDGHLTWAKFCKEHLPGKTFTFWTWLEAILDLIKKHILPLWIDGYIMGFVSKEKERLLLKDKMPGTFLLRFSESHLGGITFTWVDQSENGEVRFHSVEPYNKGRLSALAFADILRDYKVIMAENIPENPLKYLYPDIPKDKAFGKHYSSQPCEVSRPTERGDKGYVPSVFIPISTIRSDSTEPQSPSDLLPMSPSAYAVLRENLSPTTIETAMNSPYSAE'
per_residue_align(testref,testcomp,threshold=1)

(1, 1.0)

In [30]:
def truncate_row(original_row, threshold=0.9):
    row = original_row.copy(deep=True)
    truncate = True
    lcs_length = pylcs.lcs(row.jpred_sequence, row.AF_sequence)
    if len(row.jpred_sequence) <= len(row.AF_sequence):
        AF_start, similarity = per_residue_align(row.jpred_sequence, row.AF_sequence, threshold)
        
        if AF_start == -1:
            # could fail because the lcs could start at residue n
            # remove n resides from start of jpred sequence
            jpred_start = len(row.jpred_sequence) - lcs_length # find n
            jpred_sequence_backup = row.jpred_sequence
            row.jpred_sequence = row.jpred_sequence[jpred_start:]
            # try again
            AF_start, similarity = per_residue_align(row.jpred_sequence, row.AF_sequence, threshold)
            
            if AF_start == -1:
                # no subsequence with similarity > threshold found
                truncate = False
                # reset jpred sequence to original
                row.jpred_sequence = jpred_sequence_backup
        
        if truncate:
            # perform truncation
            # truncate N terminus of alphafold sequence to match N terminus of jpred sequence
            row.AF_sequence = row.AF_sequence[AF_start:]
            row.AFpred_8 = row.AFpred_8[AF_start:]

            # truncate N terminus of jpred sequence
            jpred_start = len(row.jpred_sequence) - lcs_length
            row.jpred_sequence = row.jpred_sequence[jpred_start:]
            row.jpred_3 = row.jpred_3[jpred_start:]

            # truncate C terminus of alphafold sequence
            AF_end = AF_start + lcs_length + jpred_start
            row.AF_sequence = row.AF_sequence[:AF_end]
            row.AFpred_8 = row.AFpred_8[:AF_end]

            # add column consisting of AF pred 3 state version
            row.AFpred_3 = reduce_dssp(row.AFpred_8)

    else:
        # jpred sequence is longer than the alphafold sequence
        # so do exact same as above but call per_residue_align with arguments swapped
        jpred_start, similarity = per_residue_align(row.AF_sequence, row.jpred_sequence, threshold)
        
        if jpred_start == -1:
            # could fail because the lcs could start at residue n
            # remove n residues from the jpred sequence
            AF_start = len(row.AF_sequence) - lcs_length
            AF_sequence_backup = row.AF_sequence
            row.AF_sequence = row.AF_sequence[AF_start:]
            jpred_start, similarity = per_residue_align(row.AF_sequence, row.jpred_sequence, threshold)
            # try again
            if jpred_start == -1:
                # no subsequence with similarity > threshold found
                truncate = False
                # reset AF_sequence to original
                row.AF_sequence = AF_sequence_backup
            
        if truncate:
            # calculate length of lcs
            overlap_length = pylcs.lcs(row.AF_sequence, row.jpred_sequence)
            
            # truncate N terminus of jpred sequence to match N terminus of alphafold sequence
            row.jpred_sequence = row.jpred_sequence[jpred_start:]
            row.jpred_3 = row.jpred_3[jpred_start:]
            
            # truncate N terminus of alphafold sequence
            AF_start = len(row.AF_sequence) - lcs_length
            row.AF_sequence = row.AF_sequence[AF_start:]
            row.AFpred_8 = row.AFpred_8[AF_start:]
            
            # truncate C terminus of jpred sequence
            jpred_end = jpred_start + lcs_length + AF_start
            row.jpred_sequence = row.jpred_sequence[:jpred_end]
            row.jpred_3 = row.jpred_3[:jpred_end]
            
            # add column with AF pred 3 state version
            row.AFpred_3 = reduce_dssp(row.AFpred_8)
    
    assert similarity # should be at least 0
    row.loc['similarity'] = similarity
    
    # for debug
    # if no truncation was performed, it is because sequences don't align
    if not truncate:
        print(f'seqId {row.seqID}: sequences do not align', file=sys.stderr)
    
    return row

In [37]:
debug_df = sum_df[sum_df['seqID']<25000] # problematic one is 24716


In [38]:
l = []
for i,row in debug_df.iterrows():
    new_row = truncate_row(row,threshold=0.9)
    l.append(new_row)
pd.options.display.max_colwidth = 100
pd.concat(l,axis=1).T[['seqID','AF_sequence','jpred_sequence']]

seqId 24715: sequences do not align
seqId 24741: sequences do not align
seqId 24744: sequences do not align
seqId 24746: sequences do not align
seqId 24753: sequences do not align
seqId 24773: sequences do not align
seqId 24798: sequences do not align
seqId 24833: sequences do not align
seqId 24849: sequences do not align
seqId 24852: sequences do not align
seqId 24861: sequences do not align
seqId 24863: sequences do not align
seqId 24892: sequences do not align
seqId 24905: sequences do not align
seqId 24909: sequences do not align
seqId 24921: sequences do not align
seqId 24937: sequences do not align
seqId 24970: sequences do not align
seqId 24982: sequences do not align


Unnamed: 0,seqID,AF_sequence,jpred_sequence
0,24695,KKVKVSHRSHSTEPGLVLTLGQGDVGQLGLGENVMERKKPALVSIPEDVVQAEAGGMHTVCLSKSGQVYSFGCNDEGALGRDTSVEGSEMVPGKVE...,KKVKVSHRSHSTEPGLVLTLGQGDVGQLGLGENVMERKKPALVSIPEDVVQAEAGGMHTVCLSKSGQVYSFGCNDEGALGRDTSVEGSEMVPGKVE...
1,24696,AGEDVGAPPDHLWVHQEGIYRDEYQRTWVAVVEEETSFLRARVQQIQVPLGDAARPSHLLTSQLPLMWQLYPEERYMDNNSRLWQIQHHLMVRGVQ...,AGEDVGAPPDHLWVHQEGIYRDEYQRTWVAVVEEETSFLRARVQQIQVPLGDAARPSHLLTSQLPLMWQLYPEERYMDNNSRLWQIQHHLMVRGVQ...
2,24697,MNLTELKNTPVSELITLGENMGLENLARMRKQDIIFAILKQHAKSGE,MNLTELKNTPVSELITLGENMGLENLARMRKQDIIFAILKQHAKSGE
3,24706,EYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD,EYFTLQIRGRERFEMFRELNEALELKDAQAG
4,24713,PPHSIEAEQSVLGGLMLDNERWDDVAERVVADDFYTRPHRHIFTEMARLQESGSPIDLITLAESLERQGQLDSVGGFAYLAELSKNTPSAANISAY...,PPHSIEAEQSVLGGLMLDNERWDDVAERVVADDFYTRPHRHIFTEMARLQESGSPIDLITLAESLERQGQLDSVGGFAYLAELSKNTPSAANISAY...
...,...,...,...
105,24971,AAYPITGKLGSELTMTDTVGQVVLGWKVSDLKSSTAVIPGYPVAGQVWEATATVNAIRGSVTPAVSQFNARTADGINYRVLWQAAGPDTISGATIP...,AYPITGKLGSELTMTDTVGQVVLGWKVSDLKSSTAVIPGYPVAGQVWEATATVNAIRGSVTPAVSQFNARTADGINYRVLWQAAGPDTISGATIPQ...
106,24978,QDGGWSHWSPWSSCSVTCGDGVITRIRLCNSPSPQMNGKPCEGEARETKACKKDACPINGGWGPWSPWDICSVTCGGGVQKRSRLCNNPTPQFGGK...,QDGGWSHWSPWSSCSVTCGDGVITRIRLCNSPSPQMNGKPCEGEARETKACKKD
107,24982,MATQADLMELDMAMEPDRKAAVSHWQQQSYLDSGIHSGATTTAPSLSGKGNPEEEDVDTSQVLYEWEQGFSQSFTQEQVADIDGQYAMTRAQRVRA...,KSPEEMYIQQKVRVLLMLRKMGSNLTASEEEFLRTYAGVVNSQLSQIDQGAEDVVMAFSRSETED
108,24990,CPQEDSDIAFLIDGSGSIIPHDFRRMKEFVSTVMEQLKKSKTLFSLMQYSEEFRIHFTFKEFQNNPNPRSLVKPITQLLGRTHTATGIRKVVRELF...,CPQEDSDIAFLIDGSGSIIPHDFRRMKEFVSTVMEQLKKSKTLFSLMQYSEEFRIHFTFKEFQNNPNPRSLVKPITQLLGRTHTATGIRKVVRELF...
