In [17]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import scipy.stats as stats
from statsmodels.stats.multitest import multipletests

In [None]:
# Load the proteomics matrix and the phenotype table.
# Alternative platform inputs (kept for reference, currently disabled):
#df = pd.read_csv("../data/illumina/Preprocessed_data/Illumina_Merged_Unique.csv")
# df = pd.read_csv("./data/Olink_Merged_Unique.csv")
df = pd.read_csv("../../data/Thermo/Preprocessed _data/Thermo_Fisher_Merged_All_Imputed.csv")
pheno = pd.read_csv("../data/HELIOS_Core_v4.csv")
# NOTE(review): the two reads use different relative roots ("../../data" vs "../data") — confirm both resolve.

# Keep rows that are flagged "N" in BOTH tech_rep_id and bio_rep_id, plus any
# row whose tech_rep equals 1. The parentheses spell out the grouping that
# Python's precedence (& binds tighter than |) already applied.
df_unique = df.loc[
    lambda d: ((d['tech_rep_id'] == "N") & (d['bio_rep_id'] == "N")) | (d['tech_rep'] == 1)
]

display(df_unique.head())

# Row count after the replicate filter
print("Number of unique samples:", len(df_unique))

In [None]:
# Derive the modelling phenotypes, keep only the columns needed downstream and
# drop participants whose FREG5_Ethnic_Group is "O".
pheno = (
    pheno
    .assign(
        # 100*100*weight / height**2 (presumably height is recorded in cm — TODO confirm)
        Bmi=lambda d: 100*100*d['DBI14_Weight'] / (d['DBI13_Height']**2),
        Age=lambda d: d['FREG8_Age'],
        # 1 when FREG7_Gender == "F", otherwise 0 (so a missing gender also becomes 0)
        Sex=lambda d: np.where(d['FREG7_Gender'] == "F", 1, 0),
    )
    .loc[:, ['FREG1_Barcode', 'Age', 'Sex', 'FREG5_Ethnic_Group', 'Bmi']]
    .loc[lambda d: d['FREG5_Ethnic_Group'] != "O"]
    .astype({'Sex': 'category'})
)

# One-hot encode FREG5_Ethnic_Group; drop_first=True leaves out the first level
# so it acts as the reference category.
pheno = pd.get_dummies(pheno, columns=['FREG5_Ethnic_Group'], drop_first=True)
print(pheno.head())



In [None]:
# Join phenotypes onto the proteomics rows by sample barcode (pandas' default
# inner join). The phenotype table is the left operand, so its columns lead
# the merged frame.
df_unique = pheno.merge(df_unique, on='FREG1_Barcode')
df_unique

In [None]:
import scipy.stats as stats

# Split the merged frame by column position: the first 14 columns are treated
# as phenotype/metadata and everything after them as protein measurements.
df_pheno = df_unique.iloc[:, :14]
display(df_pheno.head())
df_proteome = df_unique.iloc[:, 14:]
display(df_proteome.head())

# Drop protein columns that contain exactly one distinct value
df_proteome = df_proteome.loc[:, df_proteome.nunique() != 1]

print(df_proteome.shape)

# Per-protein median as a one-column frame ('Median') indexed by 'Protein'
df_proteome_median = (
    df_proteome
    .median()
    .rename_axis('Protein')
    .to_frame('Median')
)

# Function to apply inverse normal transformation
def inverse_normal_transform(series):
    """Rank-based inverse normal transformation.

    Each observed value is replaced by ``norm.ppf((rank - 0.5) / n)``, where
    ``rank`` is its average rank (ties share the mean rank) and ``n`` is the
    number of observed (non-NaN) values.

    Missing values are left as NaN and do not take part in the ranking.
    Without this, a single NaN either turns the whole output into NaN or is
    ranked as if it were an observation, depending on the SciPy version, and
    it would also inflate the denominator.

    Parameters
    ----------
    series : array-like of numbers
        Values to transform (e.g. a pandas Series or a list).

    Returns
    -------
    numpy.ndarray
        Float array of the same length as ``series``.
    """
    values = np.asarray(series, dtype=float)
    transformed = np.full(values.shape, np.nan)

    observed = ~np.isnan(values)
    n_observed = int(observed.sum())
    if n_observed == 0:
        # Empty or all-missing input: nothing to rank
        return transformed

    ranks = stats.rankdata(values[observed])      # average ranks, 1..n
    percentiles = (ranks - 0.5) / n_observed      # strictly inside (0, 1)
    transformed[observed] = stats.norm.ppf(percentiles)
    return transformed

# Rank-based inverse normal transformation of Age and Bmi.
# df_pheno was obtained by slicing df_unique, so writing columns into it in
# place is a chained assignment (SettingWithCopyWarning). assign() builds a new
# frame with the two columns replaced and rebinds the name instead.
# NOTE(review): in the cells visible in this notebook the regressions take their
# phenotype frame from common(), which re-slices df_unique; the transformed
# columns produced here are therefore not what gets modelled there — confirm
# whether that is intended.
df_pheno = df_pheno.assign(
    Age=inverse_normal_transform(df_pheno['Age']),
    Bmi=inverse_normal_transform(df_pheno['Bmi']),
)

In [8]:
# Read the analyte lists of the different platform versions
def return_protein_list():
    """Load the UniProt lists of the platform versions and the Thermo LOD list.

    Reads the 'UniProt' column from four files (paths are relative to the
    notebook's working directory):

    * ``somalogic_5k.xlsx`` and ``somalogic_7k.xlsx`` (column names on the
      second row, hence ``header=1``),
    * ``Somalogic_Analyte_Annotation_All.csv``,
    * ``Thermo_Fisher_HighProportionProteins_0.2.csv``.

    The SeqId lists, the UniProt->SeqId mapping and the nested converter that
    used to be built here were never used or returned and have been removed.

    Returns
    -------
    tuple of four lists
        ``(set1, set2, set3, uniprot_lod)`` where

        * ``set1`` is the 5k UniProt column as-is (file order, duplicates kept),
        * ``set2`` holds the entries of the 7k list that are not in the 5k list,
        * ``set3`` holds the entries of the annotation file that are not in the
          7k list (the 5k list is not subtracted separately),
        * ``uniprot_lod`` is the UniProt column of the Thermo file as-is.

        ``set2`` and ``set3`` come from set differences, so they are unique and
        in arbitrary order.
    """
    uniprot_list_v1 = pd.read_excel('../../platform_analytes_list/somalogic_5k.xlsx', header=1)['UniProt'].tolist()
    uniprot_list_v2 = pd.read_excel('../../platform_analytes_list/somalogic_7k.xlsx', header=1)['UniProt'].tolist()
    uniprot_list_v3 = pd.read_csv('../../data/somalogic/Preprocessed_data/Somalogic_Analyte_Annotation_All.csv', header=0)['UniProt'].tolist()

    uniprot_lod = pd.read_csv("../data/Thermo_Fisher_HighProportionProteins_0.2.csv")['UniProt'].to_list()

    uniprot_list_set1 = uniprot_list_v1
    uniprot_list_set2 = list(set(uniprot_list_v2) - set(uniprot_list_v1))
    uniprot_list_set3 = list(set(uniprot_list_v3) - set(uniprot_list_v2))

    return uniprot_list_set1, uniprot_list_set2, uniprot_list_set3, uniprot_lod
    
#filter the data to only select the required proteins

def return_set_breakdown(proteinlist):
    """Count how many entries of ``proteinlist`` fall into each reference list.

    The reference lists come from ``return_protein_list()`` (which re-reads its
    input files on every call). Duplicates in ``proteinlist`` are counted once.

    Returns
    -------
    tuple of four ints
        Overlap sizes with set1, set2, set3 and the LOD list, in that order.
    """
    set1, set2, set3, lod = return_protein_list()
    queried = set(proteinlist)
    return tuple(len(queried & set(reference)) for reference in (set1, set2, set3, lod))

In [10]:
def perform_regression_fdr(df_pheno, df_proteome, Protein_ID, platform_name, df_proteome_median, type):
    """Regress Age, Sex and Bmi on every protein and write corrected results.

    For each protein column one GLM is fitted per outcome, with the protein as
    the predictor of interest:

    * Age ~ protein + Sex + Bmi + ethnicity dummies   (Gaussian)
    * Sex ~ protein + Age + Bmi + ethnicity dummies   (Binomial)
    * Bmi ~ protein + Age + Sex + ethnicity dummies   (Gaussian)

    The protein's coefficient, standard error, test statistic and p-value are
    collected. Rows with a NaN p-value are dropped, then Benjamini-Hochberg
    FDR values and a Bonferroni flag are added. The Bonferroni threshold is
    0.05 / len(Protein_ID). Results are annotated by merging 'Protein_ID'
    against the 'Entry' column of the UniProt TSV.

    Side effects: prints progress and summary counts, and writes two files per
    outcome under ./output/all and ./output/significant. The files are
    tab-separated even though their names end in ``.csv``.

    Parameters
    ----------
    df_pheno : pandas.DataFrame
        Needs the columns 'Age', 'Sex', 'Bmi', 'FREG5_Ethnic_Group_I' and
        'FREG5_Ethnic_Group_M'. Rows are matched to ``df_proteome`` by position.
    df_proteome : pandas.DataFrame
        One column per protein.
    Protein_ID : sequence
        Identifier of each ``df_proteome`` column, in column order.
    platform_name : str
        Used in the output file names.
    df_proteome_median : any
        Not used; kept so existing calls keep working.
    type : str
        Label used in the output file names. (Shadows the builtin ``type``;
        the name is kept because callers pass it by keyword.)

    Returns
    -------
    tuple of four dicts, each keyed by 'Age', 'Sex', 'Bmi'
        ``significant_results`` (rows with FDR_P < 0.05, not annotated),
        ``top_10_results`` (first 10 annotated rows sorted by FDR_P),
        ``lookup_results`` (significant rows whose Protein_ID is "P15502"),
        ``allresults`` (all annotated rows sorted by FDR_P).
    """
    import os  # local import: only used to make sure the output folders exist

    def run_regression(y, X, family):
        """Fit a GLM of y on X plus an intercept (Binomial or Gaussian)."""
        # has_constant='add' always prepends the intercept column. The default
        # ('skip') returns X unchanged when it already holds a non-zero constant
        # column (e.g. a covariate with a single value in this subset), which
        # would shift every coefficient and make index 1 below point at a
        # covariate instead of the protein.
        X = sm.add_constant(X, has_constant='add')
        glm_family = sm.families.Binomial() if family == 'binomial' else sm.families.Gaussian()
        return sm.GLM(y, X, family=glm_family).fit()

    # Fail fast: check inputs/outputs that do not depend on the fits before
    # spending time on thousands of regressions.
    os.makedirs("./output/all", exist_ok=True)
    os.makedirs("./output/significant", exist_ok=True)

    # UniProt annotation, read once (it is identical for every outcome).
    # Columns are picked by position; 'Entry' must be among them for the merge.
    uniprot = pd.read_csv("../data/uniprotkb_Human_AND_model_organism_9606_2024_05_20.tsv", sep='\t')
    uniprot = uniprot.iloc[:, [0, 3, 4, 7]]

    protein_matrix = df_proteome.to_numpy()

    # Define the dependent variables and their respective covariates
    regression_config = {
        'Age': (['Sex', 'Bmi'], 'gaussian'),
        'Sex': (['Age', 'Bmi'], 'binomial'),
        'Bmi': (['Age', 'Sex'], 'gaussian')
    }
    # Define common covariates
    common_covariates = ['FREG5_Ethnic_Group_I', 'FREG5_Ethnic_Group_M']

    # The covariate part of each design matrix does not change between
    # proteins, so build it once per outcome.
    covariate_blocks = {
        dep_var: np.column_stack((df_pheno[covariates].values, df_pheno[common_covariates].values))
        for dep_var, (covariates, _family) in regression_config.items()
    }
    results = {dep_var: [] for dep_var in regression_config}

    for i in range(protein_matrix.shape[1]):
        protein_data = protein_matrix[:, i]

        for dep_var, (_covariates, family) in regression_config.items():
            # Column 0 of X is the protein -> coefficient index 1 after the intercept
            X = np.column_stack((protein_data, covariate_blocks[dep_var]))
            model = run_regression(df_pheno[dep_var], X, family)
            results[dep_var].append([
                Protein_ID[i], model.params.iloc[1], model.bse.iloc[1], model.tvalues.iloc[1], model.pvalues.iloc[1]
            ])

    significance_value = 0.05
    bonferroni_threshold = significance_value / len(Protein_ID)
    significant_results = {}
    top_10_results = {}
    allresults = {}

    print(f"Number of samples: {df_pheno.shape[0]}")
    print(f"Number of proteins: {protein_matrix.shape[1]}")

    for dep_var, result_list in results.items():
        columns = ['Protein_ID', 'Est', 'SE', 't_value', 'P']

        # Print uncorrected p-value threshold
        print(f"Uncorrected p-value threshold: {significance_value}")
        print(f"Bonferroni correction threshold: {bonferroni_threshold}")

        # .copy() so the column assignments below act on an independent frame
        res_df = pd.DataFrame(result_list, columns=columns).dropna(subset=['P']).copy()

        # FDR correction (multipletests raises on an empty p-value array)
        if len(res_df):
            res_df['FDR_P'] = multipletests(res_df['P'], method='fdr_bh')[1]
        else:
            res_df['FDR_P'] = np.array([], dtype=float)

        # Bonferroni correction (boolean flag: raw P below the Bonferroni threshold)
        res_df['Bonferroni_P'] = res_df['P'] < bonferroni_threshold

        fdr_significant = res_df['FDR_P'] < significance_value

        # Largest FDR-adjusted p-value that is still significant (NaN if none)
        fdr_threshold = res_df.loc[fdr_significant, 'FDR_P'].max()
        print(f"FDR-corrected p-value threshold for {dep_var}: {fdr_threshold}")

        print(f"Number of significant results for {dep_var} (uncorrected): {res_df[res_df['P'] < significance_value].shape[0]}")
        print(f"Number of significant results for {dep_var} (FDR corrected): {res_df[fdr_significant].shape[0]}")
        print(f"Number of significant results for {dep_var} (Bonferroni corrected): {res_df[res_df['Bonferroni_P']].shape[0]}")

        # Filter significant results using FDR-corrected p-values
        sig_res_df = res_df[fdr_significant]
        significant_results[dep_var] = sig_res_df

        # Annotate with UniProt entry information and order by FDR
        final_out = pd.merge(res_df, uniprot, left_on='Protein_ID', right_on='Entry', how='left')
        final_out = final_out.sort_values(by='FDR_P')

        allresults[dep_var] = final_out
        # Save annotated results file
        output_file = f"./output/all/{platform_name}_{dep_var.lower()}_associations_{type}_fdr_corrected.csv"
        final_out.to_csv(output_file, sep='\t', index=False)
        print(f"Saved all {dep_var} associations to {output_file}")

        # Save significant results
        sig_output_file = f"./output/significant/{platform_name}_{dep_var.lower()}_significant_associations_{type}_fdr_corrected.csv"
        sig_res_df.to_csv(sig_output_file, sep='\t', index=False)
        print(f"Saved significant {dep_var} associations to {sig_output_file}")

        # First 10 rows by FDR for each dependent variable
        top_10_results[dep_var] = final_out.head(10)

    # Lookup known associations by Uniprot ID (example: P15502)
    lookup_results = {
        dep_var: sig_df[sig_df['Protein_ID'] == "P15502"]
        for dep_var, sig_df in significant_results.items()
    }

    return significant_results, top_10_results, lookup_results, allresults


In [13]:
# Load the shared-protein list: a header-less file whose single column is
# labelled 'Proteins' here, then flattened to a plain Python list.
common_proteins = (
    pd.read_csv("common_proteins_soma_olink_thermo.txt", header=None)
    .set_axis(['Proteins'], axis=1)
)
common_proteins_list = common_proteins['Proteins'].to_list()

#convert the common_proteins_list which are uniprot IDs to ALL matching thermo IDs
def convert_uniprot_to_thermo(uniprot_list):
    """Map UniProt IDs to all matching Thermo ProteinIDs.

    Reads ``Thermo_Fisher_Protein_Annotation_All.csv`` from the working
    directory and keeps the rows whose 'UniProt' value is in ``uniprot_list``.

    Each ProteinID is returned once, in order of first appearance. The result
    is used as a column selector, and a repeated label would select the same
    column several times, so the same protein would be tested more than once.

    Parameters
    ----------
    uniprot_list : list-like
        UniProt IDs to look up.

    Returns
    -------
    list
        Unique matching ProteinIDs (empty if nothing matches).
    """
    annotation = pd.read_csv("Thermo_Fisher_Protein_Annotation_All.csv")
    annotation = annotation.loc[:, ['ProteinID', 'UniProt']]

    # Rows whose UniProt ID was requested
    matched_ids = annotation.loc[annotation['UniProt'].isin(uniprot_list), 'ProteinID']

    return matched_ids.drop_duplicates().tolist()

# Thermo ProteinIDs for the shared UniProt IDs
common_proteins_thermo = convert_uniprot_to_thermo(common_proteins_list)

print(f"Number of common proteins (UniProt IDs): {len(common_proteins_list)}")
print(f"Number of corresponding Thermo ProteinIDs: {len(common_proteins_thermo)}")

# Optional sanity check: how many annotation rows each shared UniProt ID has
thermo_annotation = pd.read_csv("Thermo_Fisher_Protein_Annotation_All.csv")
mapping_counts = (
    thermo_annotation
    .loc[lambda d: d['UniProt'].isin(common_proteins_list)]
    .groupby('UniProt')
    .size()
)
print(f"Average Thermo ProteinIDs per UniProt ID: {mapping_counts.mean():.2f}")
print(f"Max Thermo ProteinIDs for a single UniProt ID: {mapping_counts.max()}")

Number of common proteins (UniProt IDs): 1740
Number of corresponding Thermo ProteinIDs: 1779
Average Thermo ProteinIDs per UniProt ID: 1.02
Max Thermo ProteinIDs for a single UniProt ID: 4


In [14]:
def common(df_unique, use_common_samples=True, use_common_proteins=True):
    """Split a merged frame into phenotype and proteome parts, optionally restricted.

    Parameters
    ----------
    df_unique : pandas.DataFrame
        Merged frame; the first 14 columns are taken as phenotype/metadata and
        the remaining columns as protein measurements.
    use_common_samples : bool
        If True, keep only rows whose 'FREG0_PID' is listed in the header-less
        file ``common_samples.csv``.
    use_common_proteins : bool
        If True, keep only the protein columns named in the notebook-level
        ``common_proteins_thermo`` list.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The phenotype frame and the proteome frame. Protein columns holding
        exactly one distinct value are removed from the latter.

    Displays the head of the intermediate frames as a side effect.
    """
    # Row restriction
    df_unique_filter = df_unique
    if use_common_samples:
        common_samples = pd.read_csv("common_samples.csv", header=None)
        common_samples.columns = ['FREG0_PID']
        keep_rows = df_unique['FREG0_PID'].isin(common_samples['FREG0_PID'])
        df_unique_filter = df_unique[keep_rows]

    # Column restriction
    df_unique_proteome = df_unique_filter.iloc[:, 14:]
    if use_common_proteins:
        display(df_unique_proteome.head())
        df_unique_proteome = df_unique_proteome[common_proteins_thermo]

    df_pheno_filter = df_unique_filter.iloc[:, :14]
    display(df_pheno_filter.head())

    # Remove proteins with only one value
    informative = df_unique_proteome.nunique() != 1
    df_proteome_filter = df_unique_proteome.loc[:, informative]

    return df_pheno_filter, df_proteome_filter


In [15]:
# Return the number of unique UniProt IDs for a given list of Thermo protein IDs
def count_unique_uniprot_ids(thermo_protein_ids):
    """Count the distinct UniProt IDs annotated for the given Thermo ProteinIDs.

    Reads ``Thermo_Fisher_Protein_Annotation_All.csv`` from the working
    directory on every call. Missing UniProt values are not counted.

    Parameters
    ----------
    thermo_protein_ids : list-like
        Thermo ProteinIDs to look up.

    Returns
    -------
    int
        Number of distinct 'UniProt' values among the matching rows.
    """
    annotation = pd.read_csv("Thermo_Fisher_Protein_Annotation_All.csv")[['ProteinID', 'UniProt']]
    requested = annotation['ProteinID'].isin(thermo_protein_ids)
    return annotation.loc[requested, 'UniProt'].nunique()


In [None]:
# Scenario: common samples AND common proteins
df_pheno_common, df_proteome_common = common(df_unique, use_common_samples=True, use_common_proteins=True)
# Protein identifiers are taken straight from the proteome column labels
Protein_ID_common = df_proteome_common.columns
significant_results_common, top_10_common, lookup_result_common, _ = perform_regression_fdr(
    df_pheno=df_pheno_common,
    df_proteome=df_proteome_common,
    Protein_ID=Protein_ID_common,
    platform_name="thermo",
    df_proteome_median=df_proteome_median,
    type="common_protein_common_sample",
)

In [19]:
# Per outcome: number of distinct UniProt IDs and number of Thermo IDs among
# the significant results of the common-protein / common-sample run
for key, significant_table in significant_results_common.items():
    significant_ids = significant_table['Protein_ID'].tolist()
    unique_uniprot_count = count_unique_uniprot_ids(significant_ids)
    total_thermo_ids = len(significant_ids)
    print(f"{key}: Unique UniProt IDs: {unique_uniprot_count}, Total Thermo IDs: {total_thermo_ids}")

Age: Unique UniProt IDs: 11, Total Thermo IDs: 12
Sex: Unique UniProt IDs: 0, Total Thermo IDs: 0
Bmi: Unique UniProt IDs: 50, Total Thermo IDs: 51


In [None]:
# Scenario: common samples, ALL proteins
df_pheno_commonsample, df_proteome_commonsample = common(df_unique, use_common_samples=True, use_common_proteins=False)
Protein_ID_commonsample = df_proteome_commonsample.columns

significant_results_commonsample, top_10_commonsample, lookup_result_commonsample, _ = perform_regression_fdr(
    df_pheno=df_pheno_commonsample,
    df_proteome=df_proteome_commonsample,
    Protein_ID=Protein_ID_commonsample,
    platform_name="thermo",
    df_proteome_median=df_proteome_median,
    type="all_protein_common_sample",
)

In [21]:
# Per outcome: number of distinct UniProt IDs and number of Thermo IDs among
# the significant results of the all-protein / common-sample run
for key, significant_table in significant_results_commonsample.items():
    significant_ids = significant_table['Protein_ID'].tolist()
    unique_uniprot_count = count_unique_uniprot_ids(significant_ids)
    total_thermo_ids = len(significant_ids)
    print(f"{key}: Unique UniProt IDs: {unique_uniprot_count}, Total Thermo IDs: {total_thermo_ids}")

Age: Unique UniProt IDs: 14, Total Thermo IDs: 14
Sex: Unique UniProt IDs: 0, Total Thermo IDs: 0
Bmi: Unique UniProt IDs: 135, Total Thermo IDs: 137
