In [1]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
# %pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import the sbg workspace and local utils
import sys
sys.path.append('/home/leandro/Dropbox/workspacesbg/sbg/')
sys.path.append('../../../code/')

from sbg.orf.Orf import OnlineOrf
from sbg.structure.AlphaFoldHandler import AlphaFoldHandler
from sbg.structure.Structure import Structure
from sbg.common.FileHandler import FileHandler
from sbg.scripts.foldx.TangoHandler import TangoHandler

import pipeline_functions as pf

In [3]:
# External libraries
import pandas as pd
import os
from sequence_utils import highlight_sequence, align_sequences_biopython, display_alignment_with_highlighting


ModuleNotFoundError: No module named 'Bio'

In [None]:
# Load the data from data/protein_data.csv
# data = pd.read_csv('data/protein_data.csv', index_col=0)

In [None]:
# Run for Irene
up_accs = ['P05067','P02452','O00468','Q5S248','P10451','P35321','A8K2U0','P22528','P50454','P04275','P16035','P80188']

# Computation Pipeline

In [None]:
entry = 0
for up_acc in data.index:
    try:
        entry += 1
        print(f'Processing {up_acc} ') #{data.loc[up_acc, 'swiss_prot']}, {entry} of {len(data.index)}')

        results_dir = f'/home/leandro/Insync/gdrive/Mimark/code/cohort-analysis/notebooks/other_analyses/antigens/data/results_screenEC/{up_acc[0:2]}/{up_acc}/'
        FileHandler.ensureDir(results_dir)

        # Step 1: Get the Uniprot object
        step = 1
        uniprot_object = OnlineOrf(up_acc)
        step = 1.1
        glycosilation_sites = uniprot_object.getGlycosylationSites()
        if len(uniprot_object.getGlycosylationSites()) > 0:
            glycosylation_df = pd.DataFrame(uniprot_object.getGlycosylationSites(), columns=['res', 'glycosylation'])
            glycosylation_df.to_csv(results_dir + f'{up_acc}_glycosylation.csv')
        
        step = 1.2
        modified_residues = uniprot_object.getModifiedResidues()
        if len(uniprot_object.getModifiedResidues()) > 0:
            modified_residues_df = pd.DataFrame(uniprot_object.getModifiedResidues(), columns=['res', 'modification'])
            modified_residues_df.to_csv(results_dir + f'{up_acc}_modifications.csv')
        
        # step = 1.3
        try:
            isoforms = uniprot_object.getIsoforms()
        except:
            isoforms = []
        step = 1.4
        uniprot_string_id = uniprot_object.getStringId()
        step = 1.5
        try:
            subcellular_location = uniprot_object.getSubcellularLocation()
        except:
            subcellular_location = None
        step=1.6
        crytals = uniprot_object.getCrystals()

        # Step 2: Get the AlphaFold object
        step = 2
        alphafold_object = AlphaFoldHandler().get_model(up_acc)

        # Step 3: Run discotope 3.0 and save the results, then load them
        step = 3
        if alphafold_object is not None:
            discotope3_file = f'{up_acc}_A_discotope3.csv'
            # If no results, run discotope
            # if not FileHandler.fileExists(path=results_dir + discotope3_file):
            #     pf.run_discotope(up_acc, save_to=results_dir)
            
            # Now, if the results are there, load them
            if FileHandler.fileExists(path=results_dir + discotope3_file):
                discotope_results = pd.read_csv(results_dir + discotope3_file, index_col=0)
            else:
                discotope_results = None

        # Step 4: Compute the expression 
        step = 4
        exp_file = f'{up_acc}_exp_results.csv'
        #if FileHandler.fileExists(results_dir + exp_file):
        #    exp_results = pd.read_csv(results_dir + exp_file, index_col=0)
        #else:
        try:
            exp_auc, exp_normal, exp_cancer = pf.compare_gene_expression(gene_name=uniprot_object.getGeneName(), up_acc=up_acc, save_to=results_dir)
        except:
            exp_auc, exp_normal, exp_cancer = None, None, None
        
        exp_results = pd.DataFrame({'exp_auc': [exp_auc], 'exp_normal': [exp_normal], 'exp_cancer': [exp_cancer]})
        exp_results.to_csv(results_dir + exp_file)

        # Step 5: Align the isoforms
        step = 5
        if not FileHandler.fileExists(results_dir + f'{up_acc}_isoforms_alignment.html') and len(isoforms) > 1:
            secuences = []
            names = []
            for isoform in isoforms:
                isoform_obj = OnlineOrf(isoform)
                secuences.append(isoform_obj.getSequence())
                names.append(isoform)
            alignment = align_sequences_biopython(secuences, names)
            display_alignment_with_highlighting(alignment, save_to=results_dir + f'{up_acc}_isoforms_alignment.html')

        # Step 6: Compute the aggregation propensity
        step = 6
        if alphafold_object is not None:
            if not FileHandler.fileExists(results_dir + f'{up_acc}_agg.txt'):
                TangoHandler.getAggregation(alphafold_object,results_dir)

            agg_df = pd.read_csv(results_dir + f'{up_acc}_agg.txt', sep='\t', header=0, index_col="res")[["Aggregation"]]
        
        # Step 7: Get interaction partners
        step = 7
        if not FileHandler.fileExists(results_dir + f'{up_acc}_interactors.tsv'):
            interactors = pf.get_interactors(uniprot_string_id, up_acc, save_to=results_dir)
        else:
            interactors = pd.read_csv(results_dir + f'{up_acc}_interactors.tsv', sep='\t', index_col=0)

        # Step 8: Get the protein homology
        step = 8
        if not FileHandler.fileExists(results_dir + f'{up_acc}_homologs.tsv'):
            homology = pf.get_homologs(uniprot_object.getGeneName(), up_acc, save_to=results_dir)
        else:
            homology = pd.read_csv(results_dir + f'{up_acc}_homologs.tsv', sep='\t', index_col=0)

        # Step 9: Get the protein structures bioassemblies to check for homo and multimers
        step = 9
        if not FileHandler.fileExists(results_dir + f'{up_acc}_bioassemblies.csv'):
            try:
                all_dataframes = []
                for crystal in crytals:
                    structure = Structure(crystal, replaceExistent=False)
                    df = structure.analyzeStructureType()
                    df['crystal_id'] = crystal
                    all_dataframes.append(df)

                if len(all_dataframes) > 1:
                    bioassemblies_df = pd.concat(all_dataframes)
                elif len(all_dataframes) == 1:
                    bioassemblies_df = all_dataframes[0]
                else:
                    bioassemblies_df = None

                if len(all_dataframes) > 0:
                    bioassemblies_df.to_csv(results_dir+f'{up_acc}_bioassemblies.csv')
            except:
                bioassemblies_df = None
        else:
            bioassemblies_df = pd.read_csv(results_dir + f'{up_acc}_bioassemblies.csv')

    except Exception as e:
        print(f'Error with {up_acc} in step {step}: {e}')

Processing P07355 ANXA2_HUMAN, 1 of 505
Processing O00425 IF2B3_HUMAN, 2 of 505
Processing O15392 BIRC5_HUMAN, 3 of 505
Processing P00749 UROK_HUMAN, 4 of 505
Processing P02787 TRFE_HUMAN, 5 of 505
Processing P03372 ESR1_HUMAN, 6 of 505
Processing P04792 HSPB1_HUMAN, 7 of 505
Processing P07339 CATD_HUMAN, 8 of 505
Processing P08253 MMP2_HUMAN, 9 of 505
Processing P09603 CSF1_HUMAN, 10 of 505
Processing P10415 BCL2_HUMAN, 11 of 505
Processing P11166 GTR1_HUMAN, 12 of 505
Processing P14780 MMP9_HUMAN, 13 of 505
Processing P15692 VEGFA_HUMAN, 14 of 505
Processing P15941 MUC1_HUMAN, 15 of 505
Processing P17813 EGLN_HUMAN, 16 of 505
Processing P35354 PGH2_HUMAN, 17 of 505
Processing P38936 CDN1A_HUMAN, 18 of 505
Processing P42771 CD2A1_HUMAN, 19 of 505
Processing P60484 PTEN_HUMAN, 20 of 505
Processing P61604 CH10_HUMAN, 21 of 505
Processing Q13938 CAYP1_HUMAN, 22 of 505
Processing Q14508 WFDC2_HUMAN, 23 of 505
Processing P12830 CADH1_HUMAN, 24 of 505
Processing P11802 CDK4_HUMAN, 25 of 505

# Auxiliary: epitopes

In [6]:
up_accs = {
    "KPYM": "P14618",
    "MMP9": "P14780",
    "HSPB1": "P04792",
    "AGRIN": "O00468",
    "CLIC1": "O00299"
}

epitopes = {
    "KPYM": ["EAVRMQHLIARE"],
    "MMP9": ["TFLGKEY", "GYPFDGKD"],
    "HSPB1": ["KDGVV"],
    "AGRIN": ["RLELSRHW", "FVGAGLRGC", "NPCHGAAPC", "RDRRLEF", "GHPCLNGASC", "VCLCPGGF"],
    "CLIC1": ["KRRTE", "QVELF", "KRRTET"] 
}


up_objs = {k: OnlineOrf(v) for k, v in up_accs.items()}



In [7]:
def find_all_epitopes(seq, epitope):
    positions = []
    for i in range(len(seq) - len(epitope)):
        if seq[i:i+len(epitope)] == epitope:
            positions.append((i, i+len(epitope)))
    return positions

for up in up_objs.keys():
    print(f'Processing {up}')
    up_obj = up_objs[up]
    seq = up_obj.getSequence()

    for epitope in epitopes[up]:
        print(f'Epitope: {epitope}')
        print(f'Positions: {find_all_epitopes(seq, epitope)}')

    print('\n\n')


Processing KPYM
Epitope: EAVRMQHLIARE
Positions: [(372, 384)]



Processing MMP9
Epitope: TFLGKEY
Positions: [(351, 358)]
Epitope: GYPFDGKD
Positions: [(177, 185)]



Processing HSPB1
Epitope: KDGVV
Positions: [(113, 118)]



Processing AGRIN
Epitope: RLELSRHW
Positions: [(1455, 1463)]
Epitope: FVGAGLRGC
Positions: [(1510, 1519)]
Epitope: NPCHGAAPC
Positions: [(1594, 1603)]
Epitope: RDRRLEF
Positions: [(1695, 1702)]
Epitope: GHPCLNGASC
Positions: [(1826, 1836)]
Epitope: VCLCPGGF
Positions: [(1843, 1851)]



Processing CLIC1
Epitope: KRRTE
Positions: [(48, 53)]
Epitope: QVELF
Positions: [(6, 11)]
Epitope: KRRTET
Positions: [(48, 54)]



