In [1]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Import the sbg workspace and local utils
import sys
sys.path.append('/home/leandro/Dropbox/workspacesbg/sbg/')
sys.path.append('../../../code/')

from sbg.orf.Orf import OnlineOrf
from sbg.structure.AlphaFoldHandler import AlphaFoldHandler
from sbg.structure.Structure import Structure
from sbg.common.FileHandler import FileHandler
from sbg.scripts.foldx.TangoHandler import TangoHandler

import pipeline_functions as pf

In [3]:
# External libraries
import pandas as pd
import os
from sequence_utils import highlight_sequence, align_sequences_biopython, display_alignment_with_highlighting


In [4]:
# Load the data from data/protein_data.csv
data = pd.read_csv('data/protein_data.csv', index_col=0)

# Computation Pipeline

In [6]:
entry = 0
for up_acc in data.index:
    try:
        entry += 1
        print(f'Processing {up_acc}, {entry} of {len(data.index)}')

        results_dir = f'/home/leandro/Insync/gdrive/Mimark/code/cohort-analysis/notebooks/other_analyses/antigens/data/results/{up_acc[0:2]}/{up_acc}/'
        FileHandler.ensureDir(results_dir)

        # Step 1: Get the Uniprot object
        step = 1
        uniprot_object = OnlineOrf(up_acc)
        step = 1.1
        glycosilation_sites = uniprot_object.getGlycosylationSites()
        step = 1.2
        modified_residues = uniprot_object.getModifiedResidues()
        step = 1.3
        try:
            isoforms = uniprot_object.getIsoforms()
        except:
            isoforms = []
        step = 1.4
        uniprot_string_id = uniprot_object.getStringId()
        step = 1.5
        try:
            subcellular_location = uniprot_object.getSubcellularLocation()
        except:
            subcellular_location = None
        step=1.6
        crytals = uniprot_object.getCrystals()

        # Step 2: Get the AlphaFold object
        step = 2
        alphafold_object = AlphaFoldHandler().get_model(up_acc)

        # Step 3: Run discotope 3.0 and save the results, then load them
        step = 3
        if alphafold_object is not None:
            discotope3_file = f'{up_acc}_A_discotope3.csv'
            # If no results, run discotope
            # if not FileHandler.fileExists(path=results_dir + discotope3_file):
            #     pf.run_discotope(up_acc, save_to=results_dir)
            
            # Now, if the results are there, load them
            if FileHandler.fileExists(path=results_dir + discotope3_file):
                discotope_results = pd.read_csv(results_dir + discotope3_file, index_col=0)
            else:
                discotope_results = None

        # Step 4: Compute the expression 
        step = 4
        exp_file = f'{up_acc}_exp_results.csv'
        if FileHandler.fileExists(results_dir + exp_file):
            exp_results = pd.read_csv(results_dir + exp_file, index_col=0)
        else:
            try:
                exp_auc, exp_normal, exp_cancer = pf.compare_gene_expression(gene_name=data.loc[up_acc, 'swiss_prot'].split("_")[0], up_acc=up_acc, save_to=results_dir)
            except:
                exp_auc, exp_normal, exp_cancer = None, None, None
            
            exp_file = f'{up_acc}_exp_results.csv'
            exp_results = pd.DataFrame({'exp_auc': [exp_auc], 'exp_normal': [exp_normal], 'exp_cancer': [exp_cancer]})
            exp_results.to_csv(results_dir + exp_file)

        # Step 5: Align the isoforms
        step = 5
        if not FileHandler.fileExists(results_dir + f'{up_acc}_isoforms_alignment.html') and len(isoforms) > 1:
            secuences = []
            names = []
            for isoform in isoforms:
                isoform_obj = OnlineOrf(isoform)
                secuences.append(isoform_obj.getSequence())
                names.append(isoform)
            alignment = align_sequences_biopython(secuences, names)
            display_alignment_with_highlighting(alignment, save_to=results_dir + f'{up_acc}_isoforms_alignment.html')

        # Step 6: Compute the aggregation propensity
        step = 6
        if alphafold_object is not None:
            if not FileHandler.fileExists(results_dir + f'{up_acc}_agg.txt'):
                TangoHandler.getAggregation(alphafold_object,results_dir)

            agg_df = pd.read_csv(results_dir + f'{up_acc}_agg.txt', sep='\t', header=0, index_col="res")[["Aggregation"]]
        
        # Step 7: Get interaction partners
        step = 7
        if not FileHandler.fileExists(results_dir + f'{up_acc}_interactors.tsv'):
            interactors = pf.get_interactors(uniprot_string_id, up_acc, save_to=results_dir)
        else:
            interactors = pd.read_csv(results_dir + f'{up_acc}_interactors.tsv', sep='\t', index_col=0)

        # Step 8: Get the protein homology
        step = 8
        if not FileHandler.fileExists(results_dir + f'{up_acc}_homologs.tsv'):
            homology = pf.get_homologs(uniprot_object.getGeneName(), up_acc, save_to=results_dir)
        else:
            homology = pd.read_csv(results_dir + f'{up_acc}_homologs.tsv', sep='\t', index_col=0)

        # Step 9: Get the protein structures bioassemblies to check for homo and multimers
        step = 9
        if not FileHandler.fileExists(results_dir + f'{up_acc}_bioassemblies.csv'):
            print("entre")
            try:
                all_dataframes = []
                for crystal in crytals:
                    structure = Structure(crystal, replaceExistent=False)
                    df = structure.analyzeStructureType()
                    df['crystal_id'] = crystal
                    all_dataframes.append(df)

                if len(all_dataframes) > 1:
                    bioassemblies_df = pd.concat(all_dataframes)
                elif len(all_dataframes) == 1:
                    bioassemblies_df = all_dataframes[0]
                else:
                    bioassemblies_df = None

                if len(all_dataframes) > 0:
                    bioassemblies_df.to_csv(results_dir+f'{up_acc}_bioassemblies.csv')
            except:
                bioassemblies_df = None
        else:
            bioassemblies_df = pd.read_csv(results_dir + f'{up_acc}_bioassemblies.csv')

        #print(bioassemblies_df)

    except Exception as e:
        print(f'Error with {up_acc} in step {step}: {e}')

Processing P07355, 1 of 505
Processing O00425, 2 of 505
Processing O15392, 3 of 505
Processing P00749, 4 of 505
Processing P02787, 5 of 505
Processing P03372, 6 of 505
Processing P04792, 7 of 505
Processing P07339, 8 of 505
Processing P08253, 9 of 505
Processing P09603, 10 of 505
Processing P10415, 11 of 505
Processing P11166, 12 of 505
Processing P14780, 13 of 505
Processing P15692, 14 of 505
Error with P15692 in step 7: 400 Client Error: Bad Request for url: https://string-db.org/api/tsv/interaction_partners?identifiers=&species=9606&limit=50
Processing P15941, 15 of 505
Processing P17813, 16 of 505
Processing P35354, 17 of 505
Processing P38936, 18 of 505
Processing P42771, 19 of 505
Processing P60484, 20 of 505
Processing P61604, 21 of 505
Processing Q13938, 22 of 505
Processing Q14508, 23 of 505
entre
Processing P12830, 24 of 505
Processing P11802, 25 of 505
Processing P17661, 26 of 505
entre
Processing O14497, 27 of 505
Processing Q07812, 28 of 505
Processing P04626, 29 of 505
Pr