In [1]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Import the sbg workspace and local utils
# Import the sbg workspace and local utils
import sys
sys.path.append('/home/leandro/Dropbox/workspacesbg/sbg/')
sys.path.append('../../../code/')

from sbg.orf.Orf import OnlineOrf
import pipeline_functions as pf

In [3]:
# External libraries
import pandas as pd
import os

In [4]:
# Load the data from data/protein_data.csv
# data = pd.read_csv('data/protein_data.csv', index_col=0)

In [5]:
# Run for Irene
up_accs = ['P05067','P02452','O00468','Q5S248','P10451','P35321','A8K2U0','P22528','P50454','P04275','P16035','P80188']

# create a data dataframe
data = pd.DataFrame(up_accs, columns=['up_acc'])
data = data.set_index('up_acc')

# Results file generation pipeline

In [6]:
results_base_dir = "/home/leandro/Insync/gdrive/Mimark/code/cohort-analysis/notebooks/other_analyses/antigens/data/results_screenEC/"
summary_data = []  # List to store each protein's summary information

In [None]:
step = 0
summary_data = []
for up_acc in data.index:
    try:
        step += 1
        # Define result directory for this entry
        results_dir = f'{results_base_dir}/{up_acc[0:2]}/{up_acc}/'

        # For Irene, we will use the following fields
        oorf = OnlineOrf(up_acc)
        gene_name = oorf.getGeneName()
        protein_name = oorf.getName()
        swiss_prot = "NA"
        up_down_regulated = "NA"
        family = "NA"
        biomarker_application = "NA"

        # For the original MS data, we have used these:
        # Retrieve relevant fields from the original data
        # gene_name = data.loc[up_acc, 'gene_name']
        # protein_name = data.loc[up_acc, 'protein_name']
        # swiss_prot = data.loc[up_acc, 'swiss_prot']
        # up_down_regulated = data.loc[up_acc, 'up_down_regulated']
        # family = data.loc[up_acc, 'family']
        # biomarker_application = data.loc[up_acc, 'biomarker_application']
        
        # Step 1: Get the Uniprot object (like in the original pipeline)
        try:
            uniprot_object = OnlineOrf(up_acc)
        except:
            print(f"Error retrieving {up_acc}")
            continue
        
        # Quick computations from OnlineOrf object
        glycosylation_sites = uniprot_object.getGlycosylationSites()
        modified_residues = uniprot_object.getModifiedResidues()
        try:
            subcellular_location = " \\| ".join(uniprot_object.getSubcellularLocation()) if uniprot_object.getSubcellularLocation() else 'N/A'
        except:
            subcellular_location = 'N/A'
        try:
            isoforms = uniprot_object.getIsoforms()
        except:
            isoforms = []
        number_of_isoforms = len(isoforms) if isoforms else 0
        
        # Load data files
        discotope_results = pd.read_csv(f'{results_dir}{up_acc}_A_discotope3.csv') if os.path.exists(f'{results_dir}{up_acc}_A_discotope3.csv') else None
        agg_data = pd.read_csv(f'{results_dir}{up_acc}_agg.txt', sep='\t') if os.path.exists(f'{results_dir}{up_acc}_agg.txt') else None
        exp_results = pd.read_csv(f'{results_dir}{up_acc}_exp_results.csv') if os.path.exists(f'{results_dir}{up_acc}_exp_results.csv') else None
        interactors = pd.read_csv(f'{results_dir}{up_acc}_interactors.tsv', sep='\t') if os.path.exists(f'{results_dir}{up_acc}_interactors.tsv') else None
        homologs = pd.read_csv(f'{results_dir}{up_acc}_homologs.tsv', sep='\t') if os.path.exists(f'{results_dir}{up_acc}_homologs.tsv') else None
        bioassemblies = pd.read_csv(f'{results_dir}{up_acc}_bioassemblies.csv') if os.path.exists(f'{results_dir}{up_acc}_bioassemblies.csv') else None
        
        # Retrieve important metrics for summary
        exp_auc, fold_change, up_down_regulated_transcriptomics = pf.get_expression_metrics(exp_results)
        epitope_count = pf.count_discotope_epitopes(discotope_results)
        agg_critical_residues = pf.count_critical_aggregation_sites(agg_data)
        homo_max_n_uniprots, hetero_max_n_uniprots = pf.get_bioassemblies_metrics(bioassemblies)

        # Create a unified DataFrame per residue
        residues_df = pf.create_unified_residue_df(discotope_results, agg_data, glycosylation_sites, modified_residues)
        
        # Format interactors and homologs for the summary table
        interactor_count, formatted_interactors = pf.format_interactors_homologs(interactors, "preferredName_B")
        homolog_count, formatted_homologs = pf.format_interactors_homologs(homologs, "gene_id")

        # Compile metrics for the summary
        metrics = {
            'AUC': exp_auc,
            'Fold Change': fold_change,
            'Up/Down Regulated' : up_down_regulated_transcriptomics,
            'Discotope Epitope Count': epitope_count,
            'Max n_uniprots Homo': homo_max_n_uniprots,
            'Max n_uniprots Hetero': hetero_max_n_uniprots
        }
        
        # Append data to the summary list
        summary_data.append({
            'Uniprot ID': up_acc,
            'Gene Name': gene_name,
            'Protein Name': protein_name,
            'Swiss Prot': swiss_prot,
            'Up/Down Regulated': up_down_regulated,
            'Family': family,
            'Biomarker Application': biomarker_application,
            '(transcriptomics) AUC': exp_auc,
            '(transcriptomics) Fold Change ': fold_change,
            '(transcriptomics) Up/Down Regulated ': up_down_regulated_transcriptomics,
            'Seq Length': len(uniprot_object.getSequence()),
            'Glycosylation Sites': len(glycosylation_sites),
            'Modified Residues': len(modified_residues),
            'Subcellular Location': subcellular_location,
            'Discotope Epitope Count': epitope_count,
            'Critical Aggregation Sites (>50)': agg_critical_residues,
            'Interactor Count': interactor_count,
            'Interactors': formatted_interactors,
            'Homolog Count': homolog_count,
            'Homologs': formatted_homologs,
            'Max n_uniprots Homo': homo_max_n_uniprots,
            'Max n_uniprots Hetero': hetero_max_n_uniprots,
            'Number of Isoforms': number_of_isoforms
        })
        
        # Save detailed markdown file
        pf.save_detailed_markdown(
            up_acc, results_dir, residues_df, metrics, exp_results, 
            glycosylation_sites, modified_residues, interactors, homologs, bioassemblies, {
                'gene_name': gene_name,
                'protein_name': protein_name,
                'swiss_prot': swiss_prot,
                'up_down_regulated': up_down_regulated,
                'family': family,
                'biomarker_application': biomarker_application
            },
            number_of_isoforms
        )
        
        print(f'Processed {up_acc}, {step} out of {len(data.index)}')
        
    except Exception as e:
        print(f'Error processing {up_acc}: {e}')

# Create a DataFrame from the collected summary data
summary_df = pd.DataFrame(summary_data)

# Save the summary DataFrame to Excel
excel_file_path = os.path.join(results_base_dir, "general_summary.xlsx")
summary_df.to_excel(excel_file_path, index=False)

print(f"General summary saved to {excel_file_path}")




Error processing P05067: name 'swiss_prot' is not defined


Error processing P02452: name 'swiss_prot' is not defined


Error processing O00468: name 'swiss_prot' is not defined
Error processing Q5S248: name 'swiss_prot' is not defined


Error processing P10451: name 'swiss_prot' is not defined


Error processing P35321: name 'swiss_prot' is not defined

Error processing A8K2U0: name 'swiss_prot' is not defined


Error processing P22528: name 'swiss_prot' is not defined


Error processing P50454: name 'swiss_prot' is not defined


Error processing P04275: name 'swiss_prot' is not defined


Error processing P16035: name 'swiss_prot' is not defined


Error processing P80188: name 'swiss_prot' is not defined
General summary saved to /home/leandro/Insync/gdrive/Mimark/code/cohort-analysis/notebooks/other_analyses/antigens/data/results_screenEC/general_summary.xlsx


In [8]:
def sanitize_dataframe(df):
    # Iterate over each column to handle mixed or complex types
    for column in df.columns:
        def flatten_value(value):
            # If the value is a pandas Series, extract the first element or convert to string
            if isinstance(value, pd.Series):
                return str(value.iloc[0]) if not value.empty else 'N/A'
            # Convert None or NaN to 'N/A'
            elif pd.isna(value):
                return 'N/A'
            # Convert other types to string
            else:
                return str(value)
        
        # Apply the flattening to each column
        df[column] = df[column].apply(flatten_value)
    
    return df

# Sanitize the summary_df
summary_df = sanitize_dataframe(summary_df)


In [9]:
# Save the general markdown file
pf.save_general_markdown(summary_df, results_base_dir)

KeyError: 'Uniprot ID'