# Antigen surface analysis

- Date of run: 2024-10-14
- Environment: python 3.12
- Packages required: pandas, numpy, sklearn, statsmodels, seaborn, matplotlib, rust-sasa-python, **sbg**

**Note**: This code depends on the SBG workspace, so it won't work on other machines. Please contact Leandro Radusky if you want to install the dependencies and run this script yourself.

In [1]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Import the sbg workspace and local utils
import sys
sys.path.append('/home/leandro/Dropbox/workspacesbg/sbg/')
sys.path.append('../../../code/')

from sbg.orf.Orf import OnlineOrf
from sbg.structure.AlphaFoldHandler import AlphaFoldHandler
from sequence_utils import highlight_sequence, align_sequences_biopython, display_alignment_with_highlighting
from sbg.pdbtools.pdb_sasa import pdbSASA

In [3]:
# Other imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from rust_sasa_python import calculate_sasa_at_residue_level


# Sequences definition

This is the data extracted from the file [Antigen secuences](<https://mimarkdx.sharepoint.com/:w:/s/Scientific/EfUQVeXjaZtPrtHjBqypXQkBkSqHkctdpEofaIkgUl92SQ?e=y6X5Nw>) available in the sharepoint, processed to work with it in python.

In [4]:
# Biomarker identifiers analysed in this notebook; they key every
# per-biomarker dictionary defined below.
BIOMARKERS = [
    "AGRIN",
    "CLIC1",
    "HSPB1",
    "KPYM",
    "MMP9",
    "PERM",
    "PIGR",
]

In [5]:
# N-terminal sequence of each antigen construct, keyed by biomarker.
# AGRIN, CLIC1, HSPB1 and PERM share the same N-terminal sequence, so it is
# written once and reused.
# NOTE(review): presumably these are the signal peptide / fusion tag that
# precede the wild-type region in each construct — confirm against the
# source document.
_SHARED_N_TERM = "MKWVTFISLLFLFSSAYSRGVFRREAHKSEIAHRFNDVGEEHFIGLVLITFSQYLQKAPYEEHAKLVKEVTDLAKACVADESAANCDKSLHDIFGDKICALPSLRDTYGDVADCCEKKEPERNECFLHHKDDKPDLPPFARPEADVLCKAFHDDEKAFFGHYLYEVARRHPYFYAPELLYYAQKYKAILTECCEAADKGACLTPKLDGGGGSGGGGSGGGAS"

N_TERM = {
    "AGRIN": _SHARED_N_TERM,
    "CLIC1": _SHARED_N_TERM,
    "HSPB1": _SHARED_N_TERM,
    "KPYM": "MGHHHHHHHHHENLYFQGG",
    "MMP9": "MSLWQPLVLVLLVLGCCFA",
    "PERM": _SHARED_N_TERM,
    "PIGR": "MLLFVLTCLLAVFPAIST",
}

In [6]:
# C-terminal sequence of each antigen construct, keyed by biomarker.
# Every construct carries the same C-terminal sequence except KPYM, which
# has none.
_SHARED_C_TERM = "GSGHHHHHH"

C_TERM = dict.fromkeys(
    ("AGRIN", "CLIC1", "HSPB1", "KPYM", "MMP9", "PERM", "PIGR"),
    _SHARED_C_TERM,
)
C_TERM["KPYM"] = ""

In [7]:
# Wild-type (antigen) amino-acid sequence of each biomarker, keyed by
# biomarker name. One-letter amino-acid codes, one string per biomarker.
# NOTE(review): presumably this is the region placed between the N_TERM and
# C_TERM sequences of each construct — confirm against the source document.
WT_SEQ = {
    "AGRIN": "GRRPPAPQQPPKPCDSQPCFHGGTCQDWALGGGFTCSCPAGRGGAVCEKVLGAPVPAFEGRSFLAFPTLRAYHTLRLALEFRALEPQGLLLYNGNARGKDFLALALLDGRVQLRFDTGSGPAVLTSAVPVEPGQWHRLELSRHWRRGTLSVDGETPVLGESPSGTDGLNLDTDLFVGGVPEDQAAVALERTFVGAGLRGCIRLLDVNNQRLELGIGPGAATRGSGVGECGDHPCLPNPCHGGAPCQNLEAGRFHCQCPPGRVGPTCADEKSPCQPNPCHGAAPCRVLPEGGAQCECPLGREGTFCQTASGQDGSGPFLADFNGFSHLELRGLHTFARDLGEKMALEVVFLARGPSGLLLYNGQKTDGKGDFVSLALRDRRLEFRYDLGKGAAVIRSREPVTLGAWTRVSLERNGRKGALRVGDGPRVLGESPKSRKVPHTVLNLKEPLYVGGAPDFSKLARAAAVSSGFDGAIQLVSLGGRQLLTPEHVLRQVDVTSFAGHPCTRASGHPCLNGASCVPREAAYVCLCPGGFSGPHCEKGLVEKSAGDVDTLAFDGRTFVEYLNAVTESELANEIPVPETLDSGALHSEKALQSNHFELSLRTEATQGLVLWSGKATERADYVALAIVDGHLQLSYNLGSQPVVLRSTVPVNTNRWLRVVAHREQREGSLQVGNEAPVTGSSPLGATQLDTDGALWLGGLPELPVGPALPKAYGTGFVGCLRDVVVGRHPLHLLEDAVTKPELRPCPTP",
    "CLIC1": "AEEQPQVELFVKAGSDGAKIGNCPFSQRLFMVLWLKGVTFNVTTVDTKRRTETVQKLCPGGQLPFLLYGTEVHTDTNKIEEFLEAVLCPPRYPKLAALNPESNTAGLDIFAKFSAYIKNSNPALNDNLEKGLLKALKVLDNYLTSPLPEEVDETSAEDEGVSQRKFLDGNELTLADCNLLPKLHIVQVVCKKYRGFTIPEAFRGVHRYLSNAYAREEFASTCPDDEEIELAYEQVAKALK",
    "HSPB1": "TERRVPFSLLRGPSWDPFRDWYPHSRLFDQAFGLPRLPEEWSQWLGGSSWPGYVRPLPPAAIESPAVAAPAYSRALSRQLSSGVSEIRHTADRWRVSLDVNHFAPDELTVKTKDGVVEITGKHEERQDEHGYISRCFTRKYTLPPGVDPTQVSSSLSPEGTLTVEAPMPKLATQSNEITIPVTFESRAQLGGPEAAKSDETAAK",
    "KPYM": "MSKPHSEAGTAFIQTQQLHAAMADTFLEHMCRLDIDSPPITARNTGIICTIGPASRSVETLKEMIKSGMNVARLNFSHGTHEYHAETIKNVRTATESFASDPILYRPVAVALDTKGPEIRTGLIKGSGTAEVELKKGATLKITLDNAYMEKCDENILWLDYKNICKVVEVGSKIYVDDGLISLQVKQKGADFLVTEVENGGSLGSKKGVNLPGAAVDLPAVSEKDIQDLKFGVEQDVDMVFASFIRKASDVHEVRKVLGEKGKNIKIISKIENHEGVRRFDEILEASDGIMVARGDLGIEIPAEKVFLAQKMMIGRCNRAGKPVICATQMLESMIKKPRPTRAEGSDVANAVLDGADCIMLSGETAKGDYPLEAVRMQHLIAREAEAAIYHLQLFEELRRLAPITSDPTEATAVGAVEASFKCCSGAIIVLTKSGRSAHQVARYRPRAPIIAVTRNPQTARQAHLYRGIFPVLCKDPVQEAWAEDVDLRVNFAMNVGKARGFFKKGDVVIVLTGWRPGSGFTNTMRVVPVP",
    "MMP9": "APRQRQSTLVLFPGDLRTNLTDRQLAEEYLYRYGYTRVAEMRGESKSLGPALLLLQKQLSLPETGELDSATLKAMRTPRCGVPDLGRFQTFEGDLKWHHHNITYWIQNYSEDLPRAVIDDAFARAFALWSAVTPLTFTRVYSRDADIVIQFGVAEHGDGYPFDGKDGLLAHAFPPGPGIQGDAHFDDDELWSLGKGVVVPTRFGNADGAACHFPFIFEGRSYSACTTDGRSDGLPWCSTTANYDTDDRFGFCPSERLYTQDGNADGKPCQFPFIFQGQSYSACTTDGRSDGYRWCATTANYDRDKLFGFCPTRADSTVMGGNSAGELCVFPFTFLGKEYSTCTSEGRGDGRLWCATTSNFDSDKKWGFCPDQGYSLFLVAAHEFGHALGLDHSSVPEALMYPMYRFTEGPPLHKDDVNGIRHLYGPRPEPEPRPPTTTTPQPTAPPTVCPTGPPTVHPSERPTAGPTGPPSAGPTGPPTAGPSTATTVPLSPVDDACNVNIFDAIAEIGNQLYLFKDGKYWRFSEGRGSRPQGPFLIADKWPALPRKLDSVFEERLSKKLFFFSGRQVWVYTGASVLGPRRLDKLGLGADVAQVTGALRSGRGKMLLFSGRRLWRFDVKAQMVDPRSASEVDRMFPGVPLDTHDVFQYREKAYFCQDRFYWRVSSRSELNQVDQVGYVTYDILQCPED",
    "PERM": "AAPAVLGEVDTSLVLSSMEEAKQLVDKAYKERRESIKQRLRSGSASPMELLSYFKQPVAATRTAVRAADYLHVALDLLERKLRSLWRRPFNVTDVLTPAQLNVLSKSSGCAYQDVGVTCPEQDKYRTITGMCNNRRSPTLGASNRAFVRWLPAEYEDGFSLPYGWTPGVKRNGFPVALARAVSNEIVRFPTDQLTPDQERSLMFMQWGQLLDHDLDFTPEPAARASFVTGVNCETSCVQQPPCFPLKIPPNDPRIKNQADCIPFFRSCPACPGSNITIRNQINALTSFVDASMVYGSEEPLARNLRNMSNQLGLLAVNQRFQDNGRALLPFDNLHDDPCLLTNRSARIPCFLAGDTRSSEMPELTSMHTLLLREHNRLATELKSLNPRWDGERLYQEARKIVGAMVQIITYRDYLPLVLGPTAMRKYLPTYRSYNDSVDPRIANVFTNAFRYGHTLIQPFMFRLDNRYQPMEPNPRVPLSRVFFASWRVVLEGGIDPILRGLMATPAKLNRQNQIAVDEIRERLFEQVMRIGLDLPALNMQRSRDHGLPGYNAWRRFCGLPQPETVGQLGTVLRNLKLARKLMEQYGTPNNIDIWMGGVSEPLKRKGRVGPLLACIIGTQFRKLRDGDRFWWENEGVFSMQQRQALAQISLPRIICDNTGITTVSKNNIFMSNSYPRDFVNCSTLPALNLASWREAS",
    "PIGR": "SPIFGPEEVNSVEGNSVSITCYYPPTSVNRHTRKYWCRQGARGGCITLISSEGYVSSKYAGRANLTNFPENGTFVVNIAQLSQDDSGRYKCGLGINSRGLSFDVSLEVSQGPGLLNDTKVYTVDLGRTVTINCPFKTENAQKRKSLYKQIGLYPVLVIDSSGYVNPNYTGRIRLDIQGTGQLLFSVVINQLRLSDAGQYLCQAGDDSNSNKKNADLQVLKPEPELVYEDLRGSVTFHCALGPEVANVAKFLCRQSSGENCDVVVNTLGKRAPAFEGRILLNPQDKDGSFSVVITGLRKEDAGRYLCGAHSDGQLQEGSPIQAWQLFVNEESTIPRSPTVVKGVAGGSVAVLCPYNRKESKSIKYWCLWEGAQNGRCPLLVDSEGWVKAQYEGRLSLLEEPGNGTFTVILNQLTSRDAGFYWCLTNGDTLWRTTVEIKIIEGEPNLKVPGNVTAVLGETLKVPCHFPCKFSSYEKYWCKWNNTGCQALPSQDEGPSKAFVNCDENSRLVSLTLNLVTRADEGWYWCGVKQGHFYGETAAVYVAVEERKAAGSRDVSLAKADAAPDEKVLDSGFREIENKAIQDPR",
}

In [8]:
# Peptides of interest per biomarker (two each). They are searched for in
# the UniProt and isoform sequences further down.
PEPTIDES = {
    biomarker: [first_peptide, second_peptide]
    for biomarker, first_peptide, second_peptide in (
        ("AGRIN", "VLGAPVPAFEGR", "LELGIGPGAATR"),
        ("CLIC1", "NSNPALNDNLEK", "LAALNPESNTAGLDIFAK"),
        ("HSPB1", "LFDQAFGLPR", "LATQSNEITIPVTFESR"),
        ("KPYM", "NTGIICTIGPASR", "APIIAVTR"),
        ("MMP9", "SLGPALLLLQK", "AFALWSAVTPLTFTR"),
        ("PERM", "IANVFTNAFR", "VVLEGGIDPILR"),
        ("PIGR", "GGCITLISSEGYVSSK", "VYTVDLGR"),
    )
}

# Load UniProt protein data

In [9]:
# Protein-level table; despite the .csv extension it is read as
# tab-separated, with the 'Protein.Names' column as the index.
PROTEIN_DATA_PATH = '../../../data/ms/Womec_DIA_protein_data.csv'
df_protein_data = pd.read_csv(
    PROTEIN_DATA_PATH,
    sep='\t',
    header=0,
    index_col='Protein.Names',
)


In [10]:
# Show the protein-data rows of every biomarker; the table is indexed by
# names of the form "<BIOMARKER>_HUMAN".
biomarker_entry_names = [f"{b}_HUMAN" for b in BIOMARKERS]
df_protein_data.loc[biomarker_entry_names]

Unnamed: 0_level_0,Protein.Group,Protein.Ids,Genes,First.Protein.Description
Protein.Names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AGRIN_HUMAN,O00468-6,O00468;O00468-3;O00468-4;O00468-5;O00468-6;O00...,AGRN,Isoform 6 of Agrin
CLIC1_HUMAN,O00299,O00299,CLIC1,Chloride intracellular channel protein 1
HSPB1_HUMAN,P04792,P04792;A0A6Q8PGK1,HSPB1,Heat shock protein beta-1
KPYM_HUMAN,P14618-2,P14618-2;A0A804F6T5;A0A804F729;H3BTN5,PKM,Isoform M1 of Pyruvate kinase PKM
KPYM_HUMAN,P14618;P14618-2,P14618;P14618-2;A0A804F6T5;A0A804F729;A0A8V8TN...,PKM,Pyruvate kinase PKM
KPYM_HUMAN,P14618;P14618-2;P14618-3,P14618;P14618-2;A0A804F6T5;A0A804F729;A0A8V8TN...,PKM,Pyruvate kinase PKM
KPYM_HUMAN,P14618;P14618-3,P14618;A0A8V8TNX9;P14618-3;B4DNK4,PKM,Pyruvate kinase PKM
MMP9_HUMAN,P14780,P14780,MMP9,Matrix metalloproteinase-9
PERM_HUMAN,P05164-2,P05164;P05164-2,MPO,Isoform H14 of Myeloperoxidase
PERM_HUMAN,P05164-2;P05164-3,P05164;P05164-2;P05164-3,MPO,Isoform H14 of Myeloperoxidase


**Observation**: For *AGRIN*, *KPYM* & *PERM*, the protein data file shows that there are several isoforms (not only for *KPYM*), so maybe we should do something about these specific isoforms?

In [11]:
# UniProt accession used to fetch each biomarker's entry and AlphaFold model.
UNIPROT_ACCESSION = dict(
    AGRIN="O00468",
    CLIC1="O00299",
    HSPB1="P04792",
    KPYM="P14618",
    MMP9="P14780",
    PERM="P05164",
    PIGR="P01833",
)

In [12]:
# One OnlineOrf object per biomarker, built from its UniProt accession.
UNIPROT_OBJECTS = {
    biomarker: OnlineOrf(UNIPROT_ACCESSION[biomarker])
    for biomarker in BIOMARKERS
}
# Isoforms reported by each of those objects, keyed by biomarker.
UNIPROT_ISOFORMS = {
    biomarker: UNIPROT_OBJECTS[biomarker].getIsoforms()
    for biomarker in BIOMARKERS
}

In [13]:
# Report how many isoforms were retrieved for each biomarker.
for bmk in BIOMARKERS:
    isoform_count = len(UNIPROT_ISOFORMS[bmk].keys())
    print(f"{bmk}: {isoform_count} isoforms")

AGRIN: 7 isoforms
CLIC1: 0 isoforms
HSPB1: 0 isoforms
KPYM: 4 isoforms
MMP9: 0 isoforms
PERM: 3 isoforms
PIGR: 0 isoforms


We have loaded the isoforms when available to make sequence comparisons. Just as a sample, let's see which isoforms are there for KPYM.

In [14]:
# List the identifier, name and sequence of every KPYM isoform.
kpym_isoforms = UNIPROT_ISOFORMS["KPYM"]
for up_id in kpym_isoforms:
    up_obj = kpym_isoforms[up_id]
    print(f"{up_id} - {up_obj.name}")
    print(f"Sequence: {up_obj.getSequence()}")

P14618-1 - Isoform M2 of Pyruvate kinase PKM
Sequence: MSKPHSEAGTAFIQTQQLHAAMADTFLEHMCRLDIDSPPITARNTGIICTIGPASRSVETLKEMIKSGMNVARLNFSHGTHEYHAETIKNVRTATESFASDPILYRPVAVALDTKGPEIRTGLIKGSGTAEVELKKGATLKITLDNAYMEKCDENILWLDYKNICKVVEVGSKIYVDDGLISLQVKQKGADFLVTEVENGGSLGSKKGVNLPGAAVDLPAVSEKDIQDLKFGVEQDVDMVFASFIRKASDVHEVRKVLGEKGKNIKIISKIENHEGVRRFDEILEASDGIMVARGDLGIEIPAEKVFLAQKMMIGRCNRAGKPVICATQMLESMIKKPRPTRAEGSDVANAVLDGADCIMLSGETAKGDYPLEAVRMQHLIAREAEAAIYHLQLFEELRRLAPITSDPTEATAVGAVEASFKCCSGAIIVLTKSGRSAHQVARYRPRAPIIAVTRNPQTARQAHLYRGIFPVLCKDPVQEAWAEDVDLRVNFAMNVGKARGFFKKGDVVIVLTGWRPGSGFTNTMRVVPVP
P14618-2 - Isoform M1 of Pyruvate kinase PKM
Sequence: MSKPHSEAGTAFIQTQQLHAAMADTFLEHMCRLDIDSPPITARNTGIICTIGPASRSVETLKEMIKSGMNVARLNFSHGTHEYHAETIKNVRTATESFASDPILYRPVAVALDTKGPEIRTGLIKGSGTAEVELKKGATLKITLDNAYMEKCDENILWLDYKNICKVVEVGSKIYVDDGLISLQVKQKGADFLVTEVENGGSLGSKKGVNLPGAAVDLPAVSEKDIQDLKFGVEQDVDMVFASFIRKASDVHEVRKVLGEKGKNIKIISKIENHEGVRRFDEILEASDGIMVARGDLGIEIPAEKVFLAQKMMIGRCNRAGKPVICATQMLESMIKKPRPTRAEGSDVANAVLDGADC

Just as a small check, let's see if M1 and M2 isoforms have different sequences.

In [15]:
# True only if the two KPYM isoform sequences are identical.
m2_sequence = UNIPROT_ISOFORMS["KPYM"]["P14618-1"].getSequence()
m1_sequence = UNIPROT_ISOFORMS["KPYM"]["P14618-2"].getSequence()
m2_sequence == m1_sequence

False

They are not the same, so we are loading them OK.

Let's check whether the antigen sequences are the same as in UniProt or are included within the UniProt sequence. This will allow us to refer to positions within the sequence with the proper numbering.

In [16]:
# Compare each antigen (WT_SEQ) sequence with its UniProt entry and report
# whether it is identical to it, contained in it, or contains it.
# "The database" in the messages refers to UniProt.
for b in BIOMARKERS:
    orf_seq = UNIPROT_OBJECTS[b].getSequence()
    wt_seq = WT_SEQ[b]

    # Ignore the starting methionine on both sides. The trimmed values are
    # local copies, so WT_SEQ keeps the original antigen sequence. Slicing
    # with [:1] (instead of indexing [0]) also tolerates empty sequences.
    if orf_seq[:1] == "M":
        orf_seq = orf_seq[1:]
    if wt_seq[:1] == "M":
        wt_seq = wt_seq[1:]

    if orf_seq == wt_seq:
        print(f"{b} sequence matches the one in the database")
    elif wt_seq in orf_seq:
        print(f"{b} sequence is a substring of the one in the database")
    elif orf_seq in wt_seq:
        print(f"{b} sequence contains the one in the database")
    else:
        # Label each sequence by where it comes from so that a mismatch
        # can be read without looking at the code.
        print(f"WARNING: {b} sequence does not match the one in the database")
        print(f"Antigen sequence : {wt_seq}")
        print(f"Database sequence: {orf_seq}")
    

AGRIN sequence is a substring of the one in the database
CLIC1 sequence matches the one in the database
HSPB1 sequence matches the one in the database
KPYM sequence matches the one in the database
MMP9 sequence is a substring of the one in the database
PERM sequence is a substring of the one in the database
PIGR sequence is a substring of the one in the database


For all the proteins, the antigen sequence is either the same as in UniProt or a substring of it. The differences are possibly due to small changes at the termini, but the sequence we are using as antigen is at least included in the UniProt one. This will allow us to map sites.

## Load glycosylation sites

In [17]:
# Glycosylation sites reported by each biomarker's UniProt object.
GLYCOLISATION_SITES = {
    biomarker: UNIPROT_OBJECTS[biomarker].getGlycosylationSites()
    for biomarker in BIOMARKERS
}

## Load AlphaFold models

In [18]:
# AlphaFold model of each biomarker, looked up by UniProt accession.
# A fresh AlphaFoldHandler is created for every lookup.
ALPHAFOLD_OBJECTS = {
    biomarker: AlphaFoldHandler().get_model(UNIPROT_ACCESSION[biomarker])
    for biomarker in BIOMARKERS
}

## Compute Solvent Accessible Surface Area

In [19]:
# Reference SASA per residue type (three-letter code); used as the
# denominator when a residue's SASA is converted to a percentage.
max_sasa = {
    'ALA': 121.0,
    'ARG': 265.0,
    'ASN': 187.0,
    'ASP': 187.0,
    'CYS': 148.0,
    'GLN': 214.0,
    'GLU': 214.0,
    'GLY': 97.0,
    'HIS': 216.0,
    'ILE': 195.0,
    'LEU': 191.0,
    'LYS': 230.0,
    'MET': 203.0,
    'PHE': 228.0,
    'PRO': 154.0,
    'SER': 143.0,
    'THR': 163.0,
    'TRP': 264.0,
    'TYR': 255.0,
    'VAL': 165.0,
}

In [20]:
def compute_sasa_mapping(sasa_values):
    """Turn per-residue SASA values into ``(position, 100 - exposure)`` pairs.

    Each item of ``sasa_values`` is a ``(label, sasa)`` pair. The label is an
    underscore-separated string whose second field is the residue type
    (e.g. ``'MET'``) and whose last field is the residue position
    (e.g. ``'1'``).

    The exposure of a residue is its SASA expressed as a percentage of the
    ``max_sasa`` entry for its type; the value stored is ``100 - exposure``.
    Residues whose type is not in ``max_sasa`` are left out.

    Returns:
        A list of ``(position, 100 - exposure)`` tuples, in input order.
    """
    position_values = []
    for label, area in sasa_values:
        fields = label.split('_')
        res_type = fields[1]
        position = int(fields[-1])

        # Residue types without a reference value cannot be normalised.
        if res_type not in max_sasa:
            continue

        exposure_pct = (area / max_sasa[res_type]) * 100
        position_values.append((position, 100 - exposure_pct))

    return position_values

In [21]:
# Per biomarker: {residue position: 100 - exposure percentage}, computed
# from the file of its AlphaFold model.
SASA = {}
for biomarker in BIOMARKERS:
    model_path = ALPHAFOLD_OBJECTS[biomarker].filePath
    residue_sasa = calculate_sasa_at_residue_level(model_path)
    SASA[biomarker] = dict(compute_sasa_mapping(residue_sasa))

# Display the results

## Wild type sequences

For the wild-type sequences, we are highlighting different features. Here are some references:

- The unique peptides detected in mass spec are highlighted in blue
- The glycosylation sites are highlighted in red
- The arginines, lysines and tyrosines (asked by David Ledden) are highlighted in orange
- The second line is mapped to the modelling confidence that we have in the AlphaFold model of the protein. Low values in green, high in purple. Usually, low-confidence parts are involved in exposed loops.
- The third line is the solvent accessibility for each residue (in %): highly exposed residues in green, poorly exposed ones in purple. We want to target with the antibodies areas with some greens... but this is more complex since it is three-dimensional, so it has to be analyzed case by case.

In [22]:
# Render every wild-type UniProt sequence with its highlights and the two
# per-residue tracks (model confidence and SASA-derived values).
for b in BIOMARKERS:
    sequence = UNIPROT_OBJECTS[b].getSequence()
    n_residues = len(sequence)
    model = ALPHAFOLD_OBJECTS[b]

    # Start with every R, K and Y, keyed by 1-based position ...
    highlight_positions = {
        idx + 1: "navajowhite"
        for idx in range(n_residues)
        if sequence[idx] in ("R", "K", "Y")
    }
    # ... then let glycosylation sites override the colour at their position.
    for site in GLYCOLISATION_SITES[b]:
        highlight_positions[int(site[0])] = "indianred"

    highlight_substrings = dict.fromkeys(PEPTIDES[b], "lightblue")

    # Confidence track: B-factor of the first atom of each chain "A" residue.
    # NOTE(review): this dict is keyed 0..n-1 (value taken from residue
    # key + 1), while highlight_positions is 1-based and SASA[b] is keyed by
    # the position parsed from the residue label — confirm which indexing
    # highlight_sequence expects for its tracks.
    confidence_maping = {
        idx: int(model.residues["A"][idx + 1].atoms[0].bfactor)
        for idx in range(n_residues)
    }

    highlight_sequence(
        str(sequence),
        highlight_positions,
        highlight_substrings,
        b+"-WT",
        [confidence_maping, SASA[b]],
    )


## Isoforms

For the isoforms, we just want to see if the peptides detected in mass spec are present. Unfortunately, the AlphaFold models are not available for isoforms, so we should make an alignment in order to see which regions correspond to the wild type and evaluate manually.

In [29]:
# For every biomarker with at least two isoforms, report whether each of
# its peptides occurs in each isoform sequence.
for b in BIOMARKERS:
    isoforms_by_code = UNIPROT_ISOFORMS[b]
    if len(isoforms_by_code.items()) < 2:
        continue
    print(f" *** {b} *** ")
    for code, isoform in isoforms_by_code.items():
        isoform_seq = isoform.getSequence()
        for pep in PEPTIDES[b]:
            message = (
                f"{b} {code} - {isoform.name} contains {pep}"
                if pep in isoform_seq
                else f"{b} {code} - {isoform.name} does not contain {pep}"
            )
            print(message)

        



 *** AGRIN *** 
AGRIN O00468-1 - Isoform 1 of Agrin contains VLGAPVPAFEGR
AGRIN O00468-1 - Isoform 1 of Agrin contains LELGIGPGAATR
AGRIN O00468-2 - Isoform 2 of Agrin contains VLGAPVPAFEGR
AGRIN O00468-2 - Isoform 2 of Agrin contains LELGIGPGAATR
AGRIN O00468-3 - Isoform 3 of Agrin contains VLGAPVPAFEGR
AGRIN O00468-3 - Isoform 3 of Agrin contains LELGIGPGAATR
AGRIN O00468-4 - Isoform 4 of Agrin contains VLGAPVPAFEGR
AGRIN O00468-4 - Isoform 4 of Agrin contains LELGIGPGAATR
AGRIN O00468-5 - Isoform 5 of Agrin contains VLGAPVPAFEGR
AGRIN O00468-5 - Isoform 5 of Agrin contains LELGIGPGAATR
AGRIN O00468-6 - Isoform 6 of Agrin contains VLGAPVPAFEGR
AGRIN O00468-6 - Isoform 6 of Agrin contains LELGIGPGAATR
AGRIN O00468-7 - Isoform 7 of Agrin contains VLGAPVPAFEGR
AGRIN O00468-7 - Isoform 7 of Agrin contains LELGIGPGAATR
 *** KPYM *** 
KPYM P14618-1 - Isoform M2 of Pyruvate kinase PKM contains NTGIICTIGPASR
KPYM P14618-1 - Isoform M2 of Pyruvate kinase PKM contains APIIAVTR
KPYM P14618-2 - 

As can be seen, except for one isoform of KPYM, the peptides detected in mass spec are present in all the isoforms (for the biomarkers that have them), so we are detecting all of them. If we want to target specific isoforms, we should design very carefully (and manually).

## Isoform alignment

As an aid for future design, let's show the alignments of each biomarker with all their isoforms.

In [30]:
# Align and display the isoform sequences of every biomarker that has at
# least two isoforms.
for b in BIOMARKERS:
    isoforms_by_code = UNIPROT_ISOFORMS[b]
    if len(isoforms_by_code.items()) < 2:
        continue
    print(f" *** {b} *** ")

    # Collect (code, sequence) pairs in one pass, then split them into the
    # two parallel lists the alignment helper takes.
    coded_sequences = [
        (code, isoform.getSequence())
        for code, isoform in isoforms_by_code.items()
    ]
    names = [code for code, _ in coded_sequences]
    sequences = [seq for _, seq in coded_sequences]

    alignment = align_sequences_biopython(sequences, names)

    display_alignment_with_highlighting(alignment)


 *** AGRIN *** 


 *** KPYM *** 


 *** PERM *** 
