In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import scipy.stats as stats
from statsmodels.stats.multitest import multipletests

In [2]:
def perform_regression_fdr(df_pheno, df_proteome, Protein_ID, platform_name, df_proteome_median, type):
    """
    Test every protein for association with Age, Sex and Bmi and correct for multiple testing.

    For each column of ``df_proteome`` and each phenotype, a GLM is fitted with
    the phenotype as the response and the protein level, the two remaining
    phenotypes and the two ethnicity dummy columns as predictors (Gaussian
    family for Age and Bmi, Binomial for Sex). The protein coefficient, its
    standard error, test statistic and p-value are collected per phenotype,
    Benjamini-Hochberg adjusted p-values are added, a summary is printed and
    the tables are written to tab-separated files.

    Parameters:
        df_pheno: pandas.DataFrame
            Must contain 'Age', 'Sex', 'Bmi', 'FREG5_Ethnic_Group_I' and
            'FREG5_Ethnic_Group_M'. Needs the same number of rows as
            ``df_proteome`` because the predictors are stacked positionally.
        df_proteome: pandas.DataFrame
            One column per protein/aptamer; converted to a NumPy array.
        Protein_ID: sequence
            Identifier for each column of ``df_proteome``, indexed by position.
        platform_name: str
            Prefix used in the output file names.
        df_proteome_median: any
            Currently unused; kept so existing callers keep working.
        type: str
            Label embedded in the output file names. The name shadows the
            builtin but is kept because callers pass it by keyword.

    Returns:
        tuple: (significant_results, top_10_results, lookup_results, allresults).
            Each is a dict keyed by phenotype name:
            - significant_results: rows with FDR_P below 0.05 (not annotated).
            - top_10_results: first 10 rows of the annotated table sorted by FDR_P.
            - lookup_results: rows of significant_results whose Protein_ID is "P15502".
            - allresults: the full annotated table sorted by FDR_P.

    Side effects:
        Reads the UniProt annotation TSV, prints summary counts, and writes
        files under ./output/all and ./output/significant (directories are
        created when missing).
    """
    import os  # local import: only needed to create the output directories

    def run_regression(y, X, family):
        """Fit a GLM of ``y`` on ``X`` plus an intercept, using the requested family."""
        X = sm.add_constant(X)  # Add constant term for intercept
        glm_family = sm.families.Binomial() if family == 'binomial' else sm.families.Gaussian()
        return sm.GLM(y, X, family=glm_family).fit()

    df_proteome = df_proteome.to_numpy()

    # Define the dependent variables and their respective covariates
    regression_config = {
        'Age': (['Sex', 'Bmi'], 'gaussian'),
        'Sex': (['Age', 'Bmi'], 'binomial'),
        'Bmi': (['Age', 'Sex'], 'gaussian')
    }
    # Covariates included in every model
    common_covariates = ['FREG5_Ethnic_Group_I', 'FREG5_Ethnic_Group_M']
    results = {dep_var: [] for dep_var in regression_config}

    for i in range(df_proteome.shape[1]):
        protein_data = df_proteome[:, i]

        for dep_var, (covariates, family) in regression_config.items():
            # The protein is the first predictor column, so after the intercept
            # is prepended its coefficient sits at position 1.
            X = np.column_stack((protein_data, df_pheno[covariates].values, df_pheno[common_covariates].values))
            model = run_regression(df_pheno[dep_var], X, family)
            results[dep_var].append([
                Protein_ID[i],
                model.params.iloc[1],
                model.bse.iloc[1],
                model.tvalues.iloc[1],
                model.pvalues.iloc[1],
            ])

    significance_value = 0.05
    bonferroni_threshold = significance_value / len(Protein_ID)
    significant_results = {}
    top_10_results = {}
    allresults = {}

    print(f"Number of samples: {df_pheno.shape[0]}")
    print(f"Number of proteins: {df_proteome.shape[1]}")

    # The annotation table is identical for every phenotype, so load it once
    # instead of re-reading the file on each loop iteration.
    uniprot = pd.read_csv("../data/uniprotkb_Human_AND_model_organism_9606_2024_05_20.tsv", sep='\t')
    uniprot = uniprot.iloc[:, [0, 3, 4, 7]]

    result_columns = ['Protein_ID', 'Est', 'SE', 't_value', 'P']
    for dep_var, result_list in results.items():
        res_df = pd.DataFrame(result_list, columns=result_columns)

        # Print uncorrected p-value threshold
        print(f"Uncorrected p-value threshold: {significance_value}")
        print(f"Bonferroni correction threshold: {bonferroni_threshold}")

        # Copy so the new columns below are written to an independent frame
        # rather than to a slice of the unfiltered table.
        res_df = res_df.dropna(subset=['P']).copy()

        # FDR correction (Benjamini-Hochberg adjusted p-values)
        res_df['FDR_P'] = multipletests(res_df['P'], method='fdr_bh')[1]

        # Bonferroni correction: boolean flag, True when the raw p-value passes
        # the Bonferroni threshold (despite the name this is not a p-value).
        res_df['Bonferroni_P'] = res_df['P'] < bonferroni_threshold

        is_fdr_significant = res_df['FDR_P'] < significance_value

        # Largest adjusted p-value that is still significant (NaN when none is)
        fdr_threshold = res_df[is_fdr_significant]['FDR_P'].max()
        print(f"FDR-corrected p-value threshold for {dep_var}: {fdr_threshold}")

        print(f"Number of significant results for {dep_var} (uncorrected): {res_df[res_df['P'] < significance_value].shape[0]}")
        print(f"Number of significant results for {dep_var} (FDR corrected): {res_df[res_df['FDR_P'] < significance_value].shape[0]}")
        print(f"Number of significant results for {dep_var} (Bonferroni corrected): {res_df[res_df['Bonferroni_P']].shape[0]}")

        # Filter significant results using FDR-corrected p-values
        sig_res_df = res_df[is_fdr_significant]
        significant_results[dep_var] = sig_res_df

        # Annotate with UniProt entry information
        final_out = pd.merge(res_df, uniprot, left_on='Protein_ID', right_on='Entry', how='left')
        final_out = final_out.sort_values(by='FDR_P')

        allresults[dep_var] = final_out
        # Save annotated results file
        output_file = f"./output/all/{platform_name}_{dep_var.lower()}_associations_{type}_fdr_corrected.csv"
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        final_out.to_csv(output_file, sep='\t', index=False)
        print(f"Saved all {dep_var} associations to {output_file}")

        # Save significant results
        sig_output_file = f"./output/significant/{platform_name}_{dep_var.lower()}_significant_associations_{type}_fdr_corrected.csv"
        os.makedirs(os.path.dirname(sig_output_file), exist_ok=True)
        sig_res_df.to_csv(sig_output_file, sep='\t', index=False)
        print(f"Saved significant {dep_var} associations to {sig_output_file}")

        # Store the 10 rows with the smallest FDR_P for each dependent variable
        top_10_results[dep_var] = final_out.head(10)

    # Lookup known associations by Protein_ID (example: P15502) among the significant rows
    lookup_results = {
        dep_var: sig_df[sig_df['Protein_ID'] == "P15502"]
        for dep_var, sig_df in significant_results.items()
    }

    return significant_results, top_10_results, lookup_results, allresults


In [3]:
# Load and preprocess the Somalogic data (ANML-normalised or pre-ANML) and join it to the phenotypes

def process_somalogic_data(type, tech_rep, normalization):
    """
    Load the merged Somalogic data, log-transform it and join it to the phenotypes.

    Parameters:
        type: str
            'anml' reads the merged file from ``Preprocessed_data``; any other
            value reads the one from ``Preprocessed_data_preANML``.
        tech_rep: int or str
            Value of the 'tech_rep' column that selects which row to keep for
            replicated samples.
        normalization: bool
            When truthy, 'Age' and 'Bmi' in the returned ``df_pheno`` are
            replaced by their rank-based inverse normal transform. When falsy
            they are left on their original scale.

    Returns:
        tuple: (df_pheno, df_proteome, df_proteome_median, df_unique)
            - df_pheno: the first 27 columns of the merged frame.
            - df_proteome: the remaining columns, minus those with a single
              unique value.
            - df_proteome_median: per-row median of ``df_proteome`` in a
              'Median' column, indexed by 'Sample'.
            - df_unique: the full merged frame. NOTE(review): its 'Age'/'Bmi'
              columns are never transformed, so code that works from
              ``df_unique`` does not see the normalisation.
    """
    # Column positions this pipeline relies on — TODO confirm against the input files.
    first_analyte_col = 20   # merged Somalogic file: columns from here on are log-transformed
    n_pheno_cols = 27        # merged phenotype+Somalogic frame: split point between the two parts

    def read_data(type):
        """Read the merged Somalogic CSV for the requested normalisation stage."""
        if type == 'anml':
            return pd.read_csv('../../data/somalogic/Preprocessed_data/Somalogic_Merged_All.csv')
        return pd.read_csv('../../data/somalogic/Preprocessed_data_preANML/Somalogic_Merged_All.csv')

    merged_df = read_data(type)

    # Keep only rows whose SampleType is 'Sample'. Copy so the assignment below
    # writes to an independent frame rather than a slice of the raw data.
    merged_df = merged_df[merged_df['SampleType'] == 'Sample'].copy()

    # log2(x + 1) transform, applied regardless of `type`
    merged_df.iloc[:, first_analyte_col:] = merged_df.iloc[:, first_analyte_col:].apply(lambda x: np.log2(x + 1))

    # Load phenotype data
    pheno = pd.read_csv("../data/HELIOS_Core_v4.csv")

    # Keep rows that are not flagged as replicates, plus the requested technical replicate
    is_not_replicate = (merged_df['tech_rep_id'] == "N") & (merged_df['bio_rep_id'] == "N")
    is_requested_replicate = merged_df['tech_rep'] == tech_rep
    df_unique = merged_df[is_not_replicate | is_requested_replicate]

    # Derive Bmi/Age/Sex, keep the needed columns and drop ethnic group "O"
    pheno = (pheno
             .assign(Bmi=lambda x: 100*100*x['DBI14_Weight'] / (x['DBI13_Height']**2))
             .assign(Age=lambda x: x['FREG8_Age'],
                     Sex=lambda x: np.where(x['FREG7_Gender'] == "F", 1, 0))
             .loc[:, ['FREG1_Barcode', 'Age', 'Sex', 'FREG5_Ethnic_Group', 'Bmi']]
             .query('FREG5_Ethnic_Group != "O"')
             .copy())

    pheno['Sex'] = pheno['Sex'].astype('category')
    # Keep the original label because get_dummies replaces the source column
    pheno['ethnicity'] = pheno['FREG5_Ethnic_Group']
    pheno = pd.get_dummies(pheno, columns=['FREG5_Ethnic_Group'], drop_first=False)

    # Merge phenotype and proteomics data
    df_unique = pd.merge(pheno, df_unique, on='FREG1_Barcode')

    # Separate phenotype and proteomic datasets. Copy so the column assignments
    # below cannot raise SettingWithCopyWarning or touch df_unique.
    df_pheno = df_unique.iloc[:, :n_pheno_cols].copy()

    def inverse_normal_transform(series):
        """Rank-based inverse normal transform: ranks -> (rank - 0.5) / n -> normal quantiles."""
        ranks = stats.rankdata(series)  # Rank the data
        ranks = (ranks - 0.5) / len(series)  # Convert ranks to percentiles
        return stats.norm.ppf(ranks)  # Apply inverse normal transformation

    # Rank-based inverse normal transformation of Age and Bmi, only when requested.
    # (Previously the transform was also applied unconditionally after this
    # check, so the `normalization` flag had no effect.)
    if normalization:
        df_pheno['Age'] = inverse_normal_transform(df_pheno['Age'])
        df_pheno['Bmi'] = inverse_normal_transform(df_pheno['Bmi'])

    df_proteome = df_unique.iloc[:, n_pheno_cols:]

    # Remove proteins with only one unique value
    df_proteome = df_proteome.loc[:, df_proteome.apply(lambda x: x.nunique()) != 1]

    # Per-row (axis=1) median across the retained proteomic columns
    df_proteome_median = df_proteome.median(axis=1).reset_index()
    df_proteome_median.columns = ['Sample', 'Median']
    df_proteome_median = df_proteome_median.set_index('Sample')

    return df_pheno, df_proteome, df_proteome_median, df_unique



In [None]:

# ANML data: process with tech_rep=1 and phenotype normalisation switched on
(
    df_phen_anml,
    df_proteome_anml,
    df_proteome_median_anml,
    df_unique_anml,
) = process_somalogic_data(type='anml', tech_rep=1, normalization=True)
proteins_anml = df_proteome_anml.columns
display(df_phen_anml.head())

In [5]:
# Pre-ANML data: same settings as the ANML cell (tech_rep=1, normalisation on)
(
    df_phen_preanml,
    df_proteome_preanml,
    df_proteome_median_preanml,
    df_unique_preanml,
) = process_somalogic_data(type='preanml', tech_rep=1, normalization=True)
proteins_preanml = df_proteome_preanml.columns

  merged_df = pd.read_csv('../../data/somalogic/Preprocessed_data_preANML/Somalogic_Merged_All.csv')


In [6]:
# Read the information on the different platform versions
def return_protein_list():
    """
    Build UniProt and aptamer lists for three analyte lists plus the LoD file.

    Reads four files (somalogic_5k.xlsx, somalogic_7k.xlsx, the analyte
    annotation CSV and the HighProportionProteins CSV) and derives:

    - set1: everything in the 5k file
    - set2: entries in the 7k file that are not in the 5k file
    - set3: entries in the annotation CSV that are not in the 7k file
      (note: the 5k file is not subtracted here)
    - lod:  everything in the HighProportionProteins file

    The same split is produced twice: once on the 'UniProt' column and once on
    aptamer names. Aptamer names for the two Excel files are built from
    'SeqId' ('1000-28' -> 'seq.1000.28'); the two CSV files supply 'AptName'
    directly. set2 and set3 come from set differences, so their order is
    arbitrary.

    Returns:
        tuple of 8 lists: (uniprot_set1, uniprot_set2, uniprot_set3, uniprot_lod,
        aptamer_set1, aptamer_set2, aptamer_set3, aptamer_lod)
    """
    def format_seqid(seqid_list):
        """Convert SeqIds to aptamer-name format, e.g. '1000-28' -> 'seq.1000.28'."""
        return [f"seq.{entry.replace('-', '.')}" for entry in seqid_list]

    # 5k analyte list (second spreadsheet row holds the column names)
    df1 = pd.read_excel('../../platform_analytes_list/somalogic_5k.xlsx', header=1)
    uniprot_list_v1 = df1['UniProt'].tolist()
    sequence_list_v1 = format_seqid(df1['SeqId'].tolist())

    # 7k analyte list
    df2 = pd.read_excel('../../platform_analytes_list/somalogic_7k.xlsx', header=1)
    uniprot_list_v2 = df2['UniProt'].tolist()
    sequence_list_v2 = format_seqid(df2['SeqId'].tolist())

    # Annotation of the analytes in the preprocessed data
    df3 = pd.read_csv('../../data/somalogic/Preprocessed_data/Somalogic_Analyte_Annotation_All.csv', header=0)
    uniprot_list_v3 = df3['UniProt'].tolist()
    sequence_list_v3 = df3['AptName'].tolist()

    # Analytes listed in the HighProportionProteins (LoD) file
    df_lod = pd.read_csv("../../data/somalogic/Preprocessed_data/SomaLogic_HighProportionProteins_0.2.csv")
    uniprot_lod = df_lod['UniProt'].to_list()
    seq_lod = df_lod['AptName'].to_list()

    uniprot_list_set1 = uniprot_list_v1
    uniprot_list_set2 = list(set(uniprot_list_v2) - set(uniprot_list_v1))
    uniprot_list_set3 = list(set(uniprot_list_v3) - set(uniprot_list_v2))

    sequence_list_set1 = sequence_list_v1
    sequence_list_set2 = list(set(sequence_list_v2) - set(sequence_list_v1))
    sequence_list_set3 = list(set(sequence_list_v3) - set(sequence_list_v2))

    return uniprot_list_set1, uniprot_list_set2, uniprot_list_set3, uniprot_lod, sequence_list_set1, sequence_list_set2, sequence_list_set3, seq_lod
    
# Count how many of the given proteins/aptamers fall into each platform set and the LoD list

def return_set_breakdown (proteinlist, aptamerlist):
    """
    Count the overlap of the given proteins and aptamers with each reference list.

    Parameters:
        proteinlist: iterable of UniProt IDs.
        aptamerlist: iterable of aptamer names.

    Returns:
        tuple of 8 ints: overlap sizes with (set1, set2, set3, lod) for the
        proteins, followed by the same four for the aptamers, in the order the
        lists come back from ``return_protein_list``.
    """
    reference_lists = return_protein_list()
    protein_refs = reference_lists[:4]
    aptamer_refs = reference_lists[4:]

    query_proteins = set(proteinlist)
    query_aptamers = set(aptamerlist)

    protein_counts = tuple(len(query_proteins & set(ref)) for ref in protein_refs)
    aptamer_counts = tuple(len(query_aptamers & set(ref)) for ref in aptamer_refs)
    return protein_counts + aptamer_counts

In [7]:
# Load the common-protein file (no header row) and name its single column 'Proteins'
common_proteins = (
    pd.read_csv("common_proteins_soma_olink_thermo.txt", header=None)
    .set_axis(['Proteins'], axis=1)
)
common_proteins_list = common_proteins['Proteins'].tolist()

#convert the common_proteins_list which are uniprot IDs to ALL matching SomaLogic IDs
def convert_uniprot_to_soma(uniprot_list):
    """
    Return every AptName whose UniProt annotation appears in ``uniprot_list``.

    The mapping is read from "Somalogic_Analyte_Annotation_All_anml.csv" in the
    working directory. One UniProt ID can match several rows, so the result
    may be longer than the input; file order is preserved.
    """
    annotation = pd.read_csv("Somalogic_Analyte_Annotation_All_anml.csv")[['AptName', 'UniProt']]

    # Boolean mask of annotation rows whose UniProt ID was requested
    is_requested = annotation['UniProt'].isin(uniprot_list)

    return annotation.loc[is_requested, 'AptName'].tolist()

# Map the common UniProt IDs to every matching SomaLogic aptamer name
common_proteins_soma = convert_uniprot_to_soma(common_proteins_list)

print(f"Number of common proteins (UniProt IDs): {len(common_proteins_list)}")
print(f"Number of corresponding SomaLogic IDs: {len(common_proteins_soma)}")

# Optional sanity check: how many annotation rows does each common UniProt ID have?
soma_annotation = pd.read_csv("Somalogic_Analyte_Annotation_All_anml.csv")
is_common_protein = soma_annotation['UniProt'].isin(common_proteins_list)
common_annotation_rows = soma_annotation[is_common_protein]
mapping_counts = common_annotation_rows.groupby('UniProt').size()
print(f"Average SomaLogic IDs per UniProt ID: {mapping_counts.mean():.2f}")
print(f"Max SomaLogic IDs for a single UniProt ID: {mapping_counts.max()}")

Number of common proteins (UniProt IDs): 1740
Number of corresponding SomaLogic IDs: 2136
Average SomaLogic IDs per UniProt ID: 1.23
Max SomaLogic IDs for a single UniProt ID: 11


In [13]:
def common(df_unique, use_common_samples=True, use_common_proteins=True):
    """
    Restrict a merged phenotype+proteome frame to the common samples and/or proteins.

    Parameters:
        df_unique: pandas.DataFrame
            Merged frame; the first 27 columns are treated as phenotype data
            and the rest as proteomic data.
        use_common_samples: bool
            When True, keep only rows whose 'FREG0_PID' is listed in
            "common_samples.csv" (read from the working directory).
        use_common_proteins: bool
            When True, keep only the proteomic columns named in the
            module-level ``common_proteins_soma`` list.

    Returns:
        tuple: (df_pheno_filter, df_proteome_filter); proteomic columns with
        exactly one unique value are dropped.
    """
    # Row selection
    if not use_common_samples:
        df_unique_filter = df_unique
    else:
        common_samples = pd.read_csv("common_samples.csv", header=None)
        common_samples.columns = ['FREG0_PID']
        in_common_samples = df_unique['FREG0_PID'].isin(common_samples['FREG0_PID'])
        df_unique_filter = df_unique[in_common_samples]

    # Column split at position 27: phenotypes first, proteome after
    df_pheno_filter = df_unique_filter.iloc[:, :27]
    proteome = df_unique_filter.iloc[:, 27:]

    # Optional restriction to the common aptamers
    if use_common_proteins:
        proteome = proteome[common_proteins_soma]

    # Drop proteins with only one value
    has_variation = proteome.apply(pd.Series.nunique) != 1
    df_proteome_filter = proteome.loc[:, has_variation]

    return df_pheno_filter, df_proteome_filter

In [14]:
# ANML data restricted to the common samples and common proteins
df_pheno_common_anml, df_proteome_common_anml = common(
    df_unique_anml,
    use_common_samples=True,
    use_common_proteins=True,
)
Protein_ID_common_anml = df_proteome_common_anml.columns

# Pre-ANML data restricted the same way
df_pheno_common_preanml, df_proteome_common_preanml = common(
    df_unique_preanml,
    use_common_samples=True,
    use_common_proteins=True,
)
Protein_ID_common_preanml = df_proteome_common_preanml.columns

def somaid_to_uniprot(somalist):
    """
    Map aptamer names ('seq.1000.28' style) to their unique UniProt IDs.

    The mapping is built from the 'SeqId' and 'UniProt' columns of the analyte
    annotation CSV; SeqIds are converted with '1000-28' -> 'seq.1000.28'.

    Parameters:
        somalist: iterable of aptamer names.

    Returns:
        list: unique UniProt values in order of first appearance. Names absent
        from the annotation, and annotation rows with a missing or empty
        UniProt value, contribute nothing. (Missing values arrive from the CSV
        as NaN, which is truthy, so a plain ``filter(None, ...)`` would keep
        them and count them as a protein.)
    """
    annotation = pd.read_csv('../../data/somalogic/Preprocessed_data/Somalogic_Analyte_Annotation_All.csv', header=0)

    aptamer_names = [f"seq.{seqid.replace('-', '.')}" for seqid in annotation['SeqId'].tolist()]
    seqid_to_uniprot = dict(zip(aptamer_names, annotation['UniProt'].tolist()))

    # A dict is used as an ordered set so the output order is deterministic
    unique_uniprot = {}
    for soma in somalist:
        uniprot = seqid_to_uniprot.get(soma)
        if pd.notna(uniprot) and uniprot:
            unique_uniprot[uniprot] = None
    return list(unique_uniprot)

# Get UniProt IDs for ANML common proteins
# (the aptamer column names are collapsed to unique UniProt IDs)
Protein_ID_common_anml_uniprot = somaid_to_uniprot(Protein_ID_common_anml)

# Get UniProt IDs for pre-ANML common proteins
Protein_ID_common_preanml_uniprot = somaid_to_uniprot(Protein_ID_common_preanml)

# Print summary statistics
print("="*60)
print("COMMON SAMPLES AND PROTEINS SUMMARY")
print("="*60)

# Sample count = rows of the filtered phenotype frame; protein counts are given
# both per aptamer column (SeqID) and per unique UniProt ID.
print(f"ANML Dataset:")
print(f"  Number of common samples: {len(df_pheno_common_anml)}")
print(f"  Number of common proteins (SeqID): {len(Protein_ID_common_anml)}")
print(f"  Number of common proteins (UniProt): {len(Protein_ID_common_anml_uniprot)}")

print(f"\nPre-ANML Dataset:")
print(f"  Number of common samples: {len(df_pheno_common_preanml)}")
print(f"  Number of common proteins (SeqID): {len(Protein_ID_common_preanml)}")
print(f"  Number of common proteins (UniProt): {len(Protein_ID_common_preanml_uniprot)}")

# Sanity check: this compares counts only, not the identity of samples/proteins.
print(f"\nComparison:")
print(f"  Same number of samples: {len(df_pheno_common_anml) == len(df_pheno_common_preanml)}")
print(f"  Same number of proteins: {len(Protein_ID_common_anml) == len(Protein_ID_common_preanml)}")

COMMON SAMPLES AND PROTEINS SUMMARY
ANML Dataset:
  Number of common samples: 46
  Number of common proteins (SeqID): 2136
  Number of common proteins (UniProt): 1740

Pre-ANML Dataset:
  Number of common samples: 46
  Number of common proteins (SeqID): 2136
  Number of common proteins (UniProt): 1740

Comparison:
  Same number of samples: True
  Same number of proteins: True


# ANML common samples, common proteins

In [16]:
# Perform regression analysis on common samples and proteins for ANML data
significant_results_common_anml, top_10_common_anml, lookup_result_common_anml, results_common_anml = perform_regression_fdr(
    df_pheno_common_anml, 
    df_proteome_common_anml, 
    Protein_ID_common_anml, 
    "soma_anml", 
    df_proteome_median_anml, 
    type="common_protein_common_sample"
)

# Totals over all tested proteins/aptamers do not depend on the phenotype, so
# compute them once. Each return_set_breakdown call re-reads the annotation
# files, which made the per-phenotype call needlessly slow.
(set1_count_total, set2_count_total, set3_count_total, lod_count_total,
 apt_set1_count_total, apt_set2_count_total, apt_set3_count_total, apt_lod_count_total) = return_set_breakdown(
    Protein_ID_common_anml_uniprot, Protein_ID_common_anml
)

# Display results breakdown for ANML common data
for key in significant_results_common_anml.keys():
    significant_aptamers = significant_results_common_anml[key]['Protein_ID'].to_list()
    associated_proteins = somaid_to_uniprot(significant_aptamers)

    print(f"\n{key} (ANML):")
    print("Number of total aptamers: ", len(Protein_ID_common_anml), "Number of total proteins: ", len(Protein_ID_common_anml_uniprot))
    print("Number of lod aptamers: ", apt_lod_count_total, "Number of lod proteins: ", lod_count_total)
    print(f"Number of significant aptamers: {len(significant_aptamers)}", "Number of significant proteins: ", len(associated_proteins))

    # Same breakdown, restricted to the significant hits for this phenotype
    (set1_count_associated, set2_count_associated, set3_count_associated, lod_count_associated,
     apt_set1_count_associated, apt_set2_count_associated, apt_set3_count_associated, apt_lod_count_associated) = return_set_breakdown(
        associated_proteins, significant_aptamers
    )

    print("Protein Breakdown")
    print(f"{key}: Set 1: {set1_count_associated}/{set1_count_total}, Set 2: {set2_count_associated}/{set2_count_total}, Set 3: {set3_count_associated}/{set3_count_total}")
    print(f"{key}: Below LoD: {lod_count_associated}/{lod_count_total}, Above LoD: {len(associated_proteins) - lod_count_associated}/{len(Protein_ID_common_anml_uniprot) - lod_count_total}")

    print("Aptamer Breakdown")
    print(f"{key}: Set 1: {apt_set1_count_associated}/{apt_set1_count_total}, Set 2: {apt_set2_count_associated}/{apt_set2_count_total}, Set 3: {apt_set3_count_associated}/{apt_set3_count_total}")
    print(f"{key}: Below LoD: {apt_lod_count_associated}/{apt_lod_count_total}, Above LoD: {len(significant_aptamers) - apt_lod_count_associated}/{len(Protein_ID_common_anml) - apt_lod_count_total}")

Number of samples: 46
Number of proteins: 2136
Uncorrected p-value threshold: 0.05
Bonferroni correction threshold: 2.3408239700374533e-05
FDR-corrected p-value threshold for Age: 0.04843353987344548
Number of significant results for Age (uncorrected): 271
Number of significant results for Age (FDR corrected): 26
Number of significant results for Age (Bonferroni corrected): 10
Saved all Age associations to ./output/all/soma_anml_age_associations_common_protein_common_sample_fdr_corrected.csv
Saved significant Age associations to ./output/significant/soma_anml_age_significant_associations_common_protein_common_sample_fdr_corrected.csv
Uncorrected p-value threshold: 0.05
Bonferroni correction threshold: 2.3408239700374533e-05
FDR-corrected p-value threshold for Sex: nan
Number of significant results for Sex (uncorrected): 226
Number of significant results for Sex (FDR corrected): 0
Number of significant results for Sex (Bonferroni corrected): 0
Saved all Sex associations to ./output/all/

In [19]:
# collapse the results to unique uniprot ids

def collapse_results(results):
    """
    Collapse per-phenotype result tables to lists of unique UniProt IDs.

    Parameters:
        results: dict mapping a phenotype name to a DataFrame with a
            'Protein_ID' column of aptamer names.

    Returns:
        dict with the same keys, each mapped to the list returned by
        ``somaid_to_uniprot`` for that table's aptamers.
    """
    return {
        dep_var: somaid_to_uniprot(table['Protein_ID'].to_list())
        for dep_var, table in results.items()
    }

collapsed_results_common = collapse_results(significant_results_common_anml)
# Report the number of unique UniProt IDs per phenotype
for key, uniprot_ids in collapsed_results_common.items():
    print(key, len(uniprot_ids))


Age 23
Sex 0
Bmi 82


# ANML common samples, all proteins

In [17]:
# Process ANML data with common samples and all proteins
df_pheno_commonsample_anml, df_proteome_commonsample_anml = common(df_unique_anml, use_common_samples=True, use_common_proteins=False)
Protein_ID_commonsample_anml = df_proteome_commonsample_anml.columns
Protein_ID_commonsample_anml_uniprot = somaid_to_uniprot(Protein_ID_commonsample_anml)


significant_results_commonsample_anml, top_10_commonsample_anml, lookup_result_commonsample_anml, results_commonsample_anml = perform_regression_fdr(df_pheno_commonsample_anml, df_proteome_commonsample_anml, Protein_ID_commonsample_anml, "soma_anml", df_proteome_median_anml, type="all_protein_common_sample")

# Totals over all tested proteins/aptamers do not depend on the phenotype, so
# compute them once. Each return_set_breakdown call re-reads the annotation
# files, which made the per-phenotype call needlessly slow.
(set1_count_total, set2_count_total, set3_count_total, lod_count,
 apt_set1_count_total, apt_set2_count_total, apt_set3_count_total, apt_lod_count) = return_set_breakdown(
    Protein_ID_commonsample_anml_uniprot, Protein_ID_commonsample_anml
)

for key in significant_results_commonsample_anml.keys():
    significant_aptamers = significant_results_commonsample_anml[key]['Protein_ID'].to_list()
    associated_proteins = somaid_to_uniprot(significant_aptamers)
    print(f"\n{key} (ANML):")
    print("Number of total aptamers: ", len(Protein_ID_commonsample_anml), "Number of total proteins: ", len(Protein_ID_commonsample_anml_uniprot))
    print("Number of lod aptamers: ", apt_lod_count, "Number of lod proteins: ", lod_count)
    print(f"Number of significant aptamers: {len(significant_aptamers)}", "Number of significant proteins: ", len(associated_proteins))

    # Same breakdown, restricted to the significant hits for this phenotype
    (set1_count_associated, set2_count_associated, set3_count_associated, lod_count_associated,
     apt_set1_count_associated, apt_set2_count_associated, apt_set3_count_associated, apt_lod_count_associated) = return_set_breakdown(
        associated_proteins, significant_aptamers
    )
    print("Protein Breakdown")
    print(f"{key}: Set 1: {set1_count_associated}/{set1_count_total}, Set 2: {set2_count_associated}/{set2_count_total}, Set 3: {set3_count_associated}/{set3_count_total}")
    print(f"{key}: Below LoD: {lod_count_associated}/{lod_count}, Above LoD: {len(associated_proteins) - (lod_count_associated)}/{len(Protein_ID_commonsample_anml_uniprot) - lod_count}")
    print("Aptamer Breakdown")
    print(f"{key}: Set 1: {apt_set1_count_associated}/{apt_set1_count_total}, Set 2: {apt_set2_count_associated}/{apt_set2_count_total}, Set 3: {apt_set3_count_associated}/{apt_set3_count_total}")
    print(f"{key}: Below LoD: {apt_lod_count_associated}/{apt_lod_count}, Above LoD: {len(significant_aptamers) - (apt_lod_count_associated)}/{len(Protein_ID_commonsample_anml) - apt_lod_count}")


Number of samples: 46
Number of proteins: 10675
Uncorrected p-value threshold: 0.05
Bonferroni correction threshold: 4.68384074941452e-06
FDR-corrected p-value threshold for Age: 0.0495974181369162
Number of significant results for Age (uncorrected): 1133
Number of significant results for Age (FDR corrected): 68
Number of significant results for Age (Bonferroni corrected): 16
Saved all Age associations to ./output/all/soma_anml_age_associations_all_protein_common_sample_fdr_corrected.csv
Saved significant Age associations to ./output/significant/soma_anml_age_significant_associations_all_protein_common_sample_fdr_corrected.csv
Uncorrected p-value threshold: 0.05
Bonferroni correction threshold: 4.68384074941452e-06
FDR-corrected p-value threshold for Sex: nan
Number of significant results for Sex (uncorrected): 726
Number of significant results for Sex (FDR corrected): 0
Number of significant results for Sex (Bonferroni corrected): 0
Saved all Sex associations to ./output/all/soma_anml

In [20]:
# Collapse the significant aptamers to unique UniProt IDs
collapsed_results_commonsample = collapse_results(significant_results_commonsample_anml)
# Report the number of unique UniProt IDs per phenotype
for key, uniprot_ids in collapsed_results_commonsample.items():
    print(key, len(uniprot_ids))

Age 64
Sex 0
Bmi 413


# preANML common samples, common proteins

In [21]:
# Perform regression analysis on common samples and proteins for pre-ANML data
significant_results_common_preanml, top_10_common_preanml, lookup_result_common_preanml, results_common_preanml = perform_regression_fdr(
    df_pheno_common_preanml, 
    df_proteome_common_preanml, 
    Protein_ID_common_preanml, 
    "soma_preanml", 
    df_proteome_median_preanml, 
    type="common_protein_common_sample"
)

# Totals over all tested proteins/aptamers do not depend on the phenotype, so
# compute them once. Each return_set_breakdown call re-reads the annotation
# files, which made the per-phenotype call needlessly slow.
(set1_count_total, set2_count_total, set3_count_total, lod_count_total,
 apt_set1_count_total, apt_set2_count_total, apt_set3_count_total, apt_lod_count_total) = return_set_breakdown(
    Protein_ID_common_preanml_uniprot, Protein_ID_common_preanml
)

# Display results breakdown for pre-ANML common data
for key in significant_results_common_preanml.keys():
    significant_aptamers = significant_results_common_preanml[key]['Protein_ID'].to_list()
    associated_proteins = somaid_to_uniprot(significant_aptamers)

    print(f"\n{key} (Pre-ANML):")
    print("Number of total aptamers: ", len(Protein_ID_common_preanml), "Number of total proteins: ", len(Protein_ID_common_preanml_uniprot))
    print("Number of lod aptamers: ", apt_lod_count_total, "Number of lod proteins: ", lod_count_total)
    print(f"Number of significant aptamers: {len(significant_aptamers)}", "Number of significant proteins: ", len(associated_proteins))

    # Same breakdown, restricted to the significant hits for this phenotype
    (set1_count_associated, set2_count_associated, set3_count_associated, lod_count_associated,
     apt_set1_count_associated, apt_set2_count_associated, apt_set3_count_associated, apt_lod_count_associated) = return_set_breakdown(
        associated_proteins, significant_aptamers
    )

    print("Protein Breakdown")
    print(f"{key}: Set 1: {set1_count_associated}/{set1_count_total}, Set 2: {set2_count_associated}/{set2_count_total}, Set 3: {set3_count_associated}/{set3_count_total}")
    print(f"{key}: Below LoD: {lod_count_associated}/{lod_count_total}, Above LoD: {len(associated_proteins) - lod_count_associated}/{len(Protein_ID_common_preanml_uniprot) - lod_count_total}")

    print("Aptamer Breakdown")
    print(f"{key}: Set 1: {apt_set1_count_associated}/{apt_set1_count_total}, Set 2: {apt_set2_count_associated}/{apt_set2_count_total}, Set 3: {apt_set3_count_associated}/{apt_set3_count_total}")
    print(f"{key}: Below LoD: {apt_lod_count_associated}/{apt_lod_count_total}, Above LoD: {len(significant_aptamers) - apt_lod_count_associated}/{len(Protein_ID_common_preanml) - apt_lod_count_total}")

Number of samples: 46
Number of proteins: 2136
Uncorrected p-value threshold: 0.05
Bonferroni correction threshold: 2.3408239700374533e-05
FDR-corrected p-value threshold for Age: 0.04469049659356662
Number of significant results for Age (uncorrected): 187
Number of significant results for Age (FDR corrected): 18
Number of significant results for Age (Bonferroni corrected): 7
Saved all Age associations to ./output/all/soma_preanml_age_associations_common_protein_common_sample_fdr_corrected.csv
Saved significant Age associations to ./output/significant/soma_preanml_age_significant_associations_common_protein_common_sample_fdr_corrected.csv
Uncorrected p-value threshold: 0.05
Bonferroni correction threshold: 2.3408239700374533e-05
FDR-corrected p-value threshold for Sex: nan
Number of significant results for Sex (uncorrected): 579
Number of significant results for Sex (FDR corrected): 0
Number of significant results for Sex (Bonferroni corrected): 0
Saved all Sex associations to ./output

# preANML common samples, all proteins

In [22]:
# Process pre-ANML data with common samples and all proteins
df_pheno_commonsample_preanml, df_proteome_commonsample_preanml = common(df_unique_preanml, use_common_samples=True, use_common_proteins=False)
Protein_ID_commonsample_preanml = df_proteome_commonsample_preanml.columns
Protein_ID_commonsample_preanml_uniprot = somaid_to_uniprot(Protein_ID_commonsample_preanml)
# Number of unique UniProt IDs behind the tested aptamers
print(f"Number of proteins in terms on unique UniProt IDs (Pre-ANML): {len(Protein_ID_commonsample_preanml_uniprot)}")

significant_results_commonsample_preanml, top_10_commonsample_preanml, lookup_result_commonsample_preanml, results_commonsample_preanml = perform_regression_fdr(df_pheno_commonsample_preanml, df_proteome_commonsample_preanml, Protein_ID_commonsample_preanml, "soma_preanml", df_proteome_median_preanml, type="all_protein_common_sample")

# Totals over all tested proteins/aptamers do not depend on the phenotype, so
# compute them once. Each return_set_breakdown call re-reads the annotation
# files, which made the per-phenotype call needlessly slow.
(set1_count_total, set2_count_total, set3_count_total, lod_count,
 apt_set1_count_total, apt_set2_count_total, apt_set3_count_total, apt_lod_count) = return_set_breakdown(
    Protein_ID_commonsample_preanml_uniprot, Protein_ID_commonsample_preanml
)

for key in significant_results_commonsample_preanml.keys():
    significant_aptamers = significant_results_commonsample_preanml[key]['Protein_ID'].to_list()
    associated_proteins = somaid_to_uniprot(significant_aptamers)
    print(f"\n{key} (Pre-ANML):")
    print("Number of total aptamers: ", len(Protein_ID_commonsample_preanml), "Number of total proteins: ", len(Protein_ID_commonsample_preanml_uniprot))
    print("Number of lod aptamers: ", apt_lod_count, "Number of lod proteins: ", lod_count)
    print(f"Number of significant aptamers: {len(significant_aptamers)}", "Number of significant proteins: ", len(associated_proteins))

    # Same breakdown, restricted to the significant hits for this phenotype
    (set1_count_associated, set2_count_associated, set3_count_associated, lod_count_associated,
     apt_set1_count_associated, apt_set2_count_associated, apt_set3_count_associated, apt_lod_count_associated) = return_set_breakdown(
        associated_proteins, significant_aptamers
    )
    print("Protein Breakdown")
    print(f"{key}: Set 1: {set1_count_associated}/{set1_count_total}, Set 2: {set2_count_associated}/{set2_count_total}, Set 3: {set3_count_associated}/{set3_count_total}")
    print(f"{key}: Below LoD: {lod_count_associated}/{lod_count}, Above LoD: {len(associated_proteins) - (lod_count_associated)}/{len(Protein_ID_commonsample_preanml_uniprot) - lod_count}")
    print("Aptamer Breakdown")
    print(f"{key}: Set 1: {apt_set1_count_associated}/{apt_set1_count_total}, Set 2: {apt_set2_count_associated}/{apt_set2_count_total}, Set 3: {apt_set3_count_associated}/{apt_set3_count_total}")
    print(f"{key}: Below LoD: {apt_lod_count_associated}/{apt_lod_count}, Above LoD: {len(significant_aptamers) - (apt_lod_count_associated)}/{len(Protein_ID_commonsample_preanml) - apt_lod_count}")

Number of proteins in terms on unique UniProt IDs (Pre-ANML): 9563
Number of samples: 46
Number of proteins: 10675
Uncorrected p-value threshold: 0.05
Bonferroni correction threshold: 4.68384074941452e-06
FDR-corrected p-value threshold for Age: 0.0480891564542309
Number of significant results for Age (uncorrected): 634
Number of significant results for Age (FDR corrected): 31
Number of significant results for Age (Bonferroni corrected): 11
Saved all Age associations to ./output/all/soma_preanml_age_associations_all_protein_common_sample_fdr_corrected.csv
Saved significant Age associations to ./output/significant/soma_preanml_age_significant_associations_all_protein_common_sample_fdr_corrected.csv
Uncorrected p-value threshold: 0.05
Bonferroni correction threshold: 4.68384074941452e-06
FDR-corrected p-value threshold for Sex: nan
Number of significant results for Sex (uncorrected): 3239
Number of significant results for Sex (FDR corrected): 0
Number of significant results for Sex (Bonf

# All results together

In [23]:
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.text import Text
from rich.columns import Columns

# Shared rich console; both reporting functions below print through it.
console = Console()

def create_comprehensive_comparison():
    """
    Print the full association report to the shared rich console.

    The report is made of a banner, the detailed per-analysis breakdown
    (delegated to ``create_detailed_breakdown_tables``) and two summary
    tables: one counting significant aptamers per phenotype and one
    counting the UniProt proteins returned by ``somaid_to_uniprot`` for
    those aptamers.

    Result containers are looked up by name in the module globals
    (``significant_results_common_anml`` and friends). If a ``KeyError``
    is raised while gathering the counts for a dataset, for example
    because that variable does not exist, its row is filled with "N/A"
    in both tables.
    """
    phenotypes = ('Age', 'Sex', 'Bmi')
    rule = "=" * 80

    def build_summary_table(title):
        # Both summary tables share exactly the same column layout.
        table = Table(title=title, title_style="bold bright_magenta")
        table.add_column("Analysis Type", style="cyan", no_wrap=True)
        table.add_column("Dataset", style="white")
        for header, colour in (
            ("Age", "green"),
            ("Sex", "yellow"),
            ("BMI", "red"),
            ("Total Unique", "bright_blue"),
        ):
            table.add_column(header, justify="center", style=colour)
        return table

    def emit_row(label, dataset, aptamer_cells, protein_cells):
        # Keep the two tables in lock-step: one row each, aptamers first.
        overall_table.add_row(label, dataset, *aptamer_cells)
        uniprot_table.add_row(label, dataset, *protein_cells)

    banner = Text("COMPREHENSIVE SOMALOGIC ASSOCIATION RESULTS COMPARISON", style="bold white")
    console.print(Panel(banner, style="bright_magenta"))
    console.print()

    # Detailed per-analysis breakdown comes first ...
    create_detailed_breakdown_tables()

    # ... followed by the overall summary section.
    for line in ("\n" + rule, "SUMMARY TABLES", rule):
        console.print(line)

    overall_table = build_summary_table("OVERALL SUMMARY - All Analyses (Aptamers)")
    uniprot_table = build_summary_table("OVERALL SUMMARY - All Analyses (UniProt Proteins)")

    analyses = [
        ("Common Samples + Common Proteins", "significant_results_common"),
        ("Common Samples + All Proteins", "significant_results_commonsample")
    ]
    datasets = [("ANML", "_anml"), ("Pre-ANML", "_preanml")]
    final_analysis_name = analyses[-1][0]

    for analysis_name, results_var in analyses:
        for dataset_name, suffix in datasets:
            # Only the ANML row of each analysis carries the analysis label.
            row_label = analysis_name if dataset_name == "ANML" else ""
            try:
                results = globals()[results_var + suffix]

                # Significant aptamer IDs per phenotype; None marks an absent phenotype.
                aptamer_lists = [
                    results[pheno]['Protein_ID'].to_list() if pheno in results else None
                    for pheno in phenotypes
                ]
                aptamer_cells = [
                    str(0 if aptamers is None else len(aptamers))
                    for aptamers in aptamer_lists
                ]
                distinct_aptamers = {
                    aptamer
                    for aptamers in aptamer_lists
                    if aptamers is not None
                    for aptamer in aptamers
                }
                aptamer_cells.append(str(len(distinct_aptamers)))

                # Map each phenotype's aptamers to UniProt proteins (Age, Sex, Bmi order).
                protein_lists = [
                    [] if aptamers is None else somaid_to_uniprot(aptamers)
                    for aptamers in aptamer_lists
                ]
                protein_cells = [str(len(proteins)) for proteins in protein_lists]
                distinct_proteins = {
                    protein for proteins in protein_lists for protein in proteins
                }
                protein_cells.append(str(len(distinct_proteins)))

                emit_row(row_label, dataset_name, aptamer_cells, protein_cells)

            except KeyError:
                not_available = ["N/A"] * 4
                emit_row(row_label, dataset_name, not_available, not_available)

        # Blank separator row between analyses (not after the last one).
        if analysis_name != final_analysis_name:
            blank = [""] * 4
            emit_row("", "", blank, blank)

    console.print(overall_table)
    console.print()
    console.print(uniprot_table)

def create_detailed_breakdown_tables():
    """
    Print one set/LoD breakdown table per analysis and dataset.

    For every (analysis, dataset) pair three module globals are looked up
    by name: the significant-results container, the aptamer ID list and the
    matching ``..._uniprot`` protein list. Each table has one row per
    phenotype (Age, Sex, Bmi) plus a TOTAL row; ratio cells are written as
    "aptamers/proteins" using the counts returned by
    ``return_set_breakdown``. If a ``KeyError`` is raised while building a
    table, a "Data not available" line is printed instead of the table.
    """
    phenotypes = ('Age', 'Sex', 'Bmi')
    heavy_rule = "=" * 80

    def format_cells(n_aptamers, n_proteins, protein_counts, aptamer_counts):
        # Both count tuples are ordered (set 1, set 2, set 3, below LoD).
        ratios = [
            f"{apt}/{prot}" for apt, prot in zip(aptamer_counts, protein_counts)
        ]
        # "Above LoD" is whatever is left after removing the below-LoD counts.
        above_lod = f"{n_aptamers - aptamer_counts[-1]}/{n_proteins - protein_counts[-1]}"
        return [str(n_aptamers), str(n_proteins), *ratios, above_lod]

    console.print(heavy_rule)
    console.print("DETAILED BREAKDOWN BY SETS AND LOD (APTAMER FOCUSED)")
    console.print(heavy_rule)

    # (display name, prefix of the results variable, prefix of the ID variables)
    analyses = [
        ("Common Samples + Common Proteins", "significant_results_common", "Protein_ID_common"),
        ("Common Samples + All Proteins", "significant_results_commonsample", "Protein_ID_commonsample")
    ]
    datasets = [("ANML", "_anml"), ("Pre-ANML", "_preanml")]

    for analysis_name, results_prefix, ids_prefix in analyses:
        for dataset_name, dataset_suffix in datasets:
            console.print(f"\n{analysis_name} - {dataset_name}:")
            console.print("-" * 60)

            try:
                results = globals()[results_prefix + dataset_suffix]
                protein_ids = globals()[ids_prefix + dataset_suffix]
                protein_uniprot = globals()[ids_prefix + dataset_suffix + "_uniprot"]

                table = Table(title=f"Set and LoD Breakdown - {analysis_name} ({dataset_name})")
                table.add_column("Phenotype", style="cyan")
                for header, colour in (
                    ("Sig Aptamers", "green"),
                    ("Sig Proteins", "green"),
                    ("Set 1 (A/P)", "yellow"),
                    ("Set 2 (A/P)", "yellow"),
                    ("Set 3 (A/P)", "yellow"),
                    ("Below LoD (A/P)", "red"),
                    ("Above LoD (A/P)", "blue"),
                ):
                    table.add_column(header, justify="center", style=colour)

                # Totals over the whole dataset, fetched once up front.
                (tot_p1, tot_p2, tot_p3, tot_p_lod,
                 tot_a1, tot_a2, tot_a3, tot_a_lod) = return_set_breakdown(protein_uniprot, protein_ids)

                for phenotype in phenotypes:
                    if phenotype not in results:
                        table.add_row(phenotype, "0", "0", *(["0/0"] * 5))
                        continue

                    hit_aptamers = results[phenotype]['Protein_ID'].to_list()
                    hit_proteins = somaid_to_uniprot(hit_aptamers)
                    (p1, p2, p3, p_lod,
                     a1, a2, a3, a_lod) = return_set_breakdown(hit_proteins, hit_aptamers)

                    table.add_row(
                        phenotype,
                        *format_cells(
                            len(hit_aptamers),
                            len(hit_proteins),
                            (p1, p2, p3, p_lod),
                            (a1, a2, a3, a_lod),
                        ),
                    )

                table.add_row(
                    "TOTAL",
                    *format_cells(
                        len(protein_ids),
                        len(protein_uniprot),
                        (tot_p1, tot_p2, tot_p3, tot_p_lod),
                        (tot_a1, tot_a2, tot_a3, tot_a_lod),
                    ),
                )

                console.print(table)

            except KeyError as e:
                console.print(f"Data not available for {analysis_name} - {dataset_name}: {e}")

# Render the full report: the detailed breakdown tables first, then the
# aptamer and UniProt summary tables.
create_comprehensive_comparison()