In [1]:
import pandas as pd
import os

# Proteomics

In [3]:
def read_excel_file(file_path, sheet_name=0):
    """
    Read one sheet of an Excel (.xlsx) workbook into a pandas DataFrame.

    Failures are reported on stdout instead of being raised, so callers must
    check for a ``None`` result before using the return value.

    :param file_path: Path to the workbook, as a ``str`` or any
        ``os.PathLike`` object (e.g. ``pathlib.Path``). The ``.xlsx``
        extension check is case-insensitive.
    :param sheet_name: Sheet name or index (default is first sheet); passed
        through unchanged to ``pandas.read_excel``.
    :return: The object returned by ``pandas.read_excel`` (a DataFrame for a
        single sheet), or ``None`` if the file is missing, is not ``.xlsx``,
        or could not be read.
    """
    try:
        # Normalise to a plain path string up front: pathlib.Path has no
        # .lower(), so a Path argument used to fall through to the generic
        # handler with a misleading "unexpected error" message.
        path_str = os.fspath(file_path)

        # Validate file existence
        if not os.path.isfile(path_str):
            raise FileNotFoundError(f"File not found: {path_str}")

        # Validate file extension
        if not path_str.lower().endswith(".xlsx"):
            raise ValueError("Only .xlsx files are supported.")

        # Read Excel file
        df = pd.read_excel(path_str, sheet_name=sheet_name, engine="openpyxl")
        print(f"Successfully loaded sheet '{sheet_name}' from {path_str}")
        return df

    except (FileNotFoundError, ValueError) as err:
        # Covers the validation failures raised above as well as any
        # ValueError raised by pandas while reading; both share one format.
        print(f"Error: {err}")
    except Exception as err:
        # Deliberate best-effort: report anything else and fall through to
        # the None return rather than aborting the notebook cell here.
        print(f"An unexpected error occurred: {err}")

    return None

In [4]:
# Alamar CNS panel workbook: read the chosen worksheet and preview the first rows.
# read_excel_file returns None on failure, in which case .head() below raises.
sheet = 0  # worksheet name or index
file_path = "/home/jamie/Documents/AI4BI/BiomarkersOfAgeChallenge/dataverse_files/BoAC_CNS_Alamar.xlsx"  # point this at your local copy
CNS_Alamar = read_excel_file(file_path=file_path, sheet_name=sheet)
CNS_Alamar.head()

Successfully loaded sheet '0' from /home/jamie/Documents/AI4BI/BiomarkersOfAgeChallenge/dataverse_files/BoAC_CNS_Alamar.xlsx


Unnamed: 0,Panel,PanelLotNumber,PlateID,SampleName,SampleType,Target,AlamarTargetID,UniProtID,ProteinName,SampleQC,LOD,NPQ
0,CNS Disease Panel V1,panelLot021,Plate_01,A_01_BoA_Plasma_P1,Sample,ACHE,t8246,P22303,Acetylcholinesterase,PASS,4.482828,13.182884
1,CNS Disease Panel V1,panelLot021,Plate_01,A_02_BoA_Plasma_P9,Sample,ACHE,t8246,P22303,Acetylcholinesterase,PASS,4.482828,13.887148
2,CNS Disease Panel V1,panelLot021,Plate_01,A_03_BoA_Plasma_P17,Sample,ACHE,t8246,P22303,Acetylcholinesterase,PASS,4.482828,13.21353
3,CNS Disease Panel V1,panelLot021,Plate_01,A_04_BoA_Plasma_P25,Sample,ACHE,t8246,P22303,Acetylcholinesterase,PASS,4.482828,13.665878
4,CNS Disease Panel V1,panelLot021,Plate_01,A_05_BoA_Plasma_P33,Sample,ACHE,t8246,P22303,Acetylcholinesterase,PASS,4.482828,13.840203


In [5]:
# Alamar Inflammation panel workbook: read the chosen worksheet and preview the first rows.
# read_excel_file returns None on failure, in which case .head() below raises.
sheet = 0  # worksheet name (e.g. "Sheet1") or index (e.g. 0)
file_path = "/home/jamie/Documents/AI4BI/BiomarkersOfAgeChallenge/dataverse_files/BoAC_Inflammation_Alamar.xlsx"  # point this at your local copy
Inflammation_Alamar = read_excel_file(file_path=file_path, sheet_name=sheet)
Inflammation_Alamar.head()

Successfully loaded sheet '0' from /home/jamie/Documents/AI4BI/BiomarkersOfAgeChallenge/dataverse_files/BoAC_Inflammation_Alamar.xlsx


Unnamed: 0,Panel,PanelLotNumber,PlateID,SampleName,SampleType,Target,AlamarTargetID,UniProtID,ProteinName,SampleQC,LOD,NPQ
0,200-Plex Inflammation Panel v2,panelLot018,Plate_01,A_01_BoA_Plasma_P1,Sample,AGER,t5521,Q15109,Advanced glycosylation end product-specific re...,PASS,4.174109,12.932788
1,200-Plex Inflammation Panel v2,panelLot018,Plate_01,A_02_BoA_Plasma_P9,Sample,AGER,t5521,Q15109,Advanced glycosylation end product-specific re...,PASS,4.174109,13.594548
2,200-Plex Inflammation Panel v2,panelLot018,Plate_01,A_03_BoA_Plasma_P17,Sample,AGER,t5521,Q15109,Advanced glycosylation end product-specific re...,PASS,4.174109,13.938275
3,200-Plex Inflammation Panel v2,panelLot018,Plate_01,A_04_BoA_Plasma_P25,Sample,AGER,t5521,Q15109,Advanced glycosylation end product-specific re...,PASS,4.174109,12.903652
4,200-Plex Inflammation Panel v2,panelLot018,Plate_01,A_05_BoA_Plasma_P33,Sample,AGER,t5521,Q15109,Advanced glycosylation end product-specific re...,PASS,4.174109,14.069248


In [6]:
# Plasma metadata workbook: read the chosen worksheet and preview the first rows.
# read_excel_file returns None on failure, in which case .head() below raises.
sheet = 0  # worksheet name (e.g. "Sheet1") or index (e.g. 0)
file_path = "/home/jamie/Documents/AI4BI/BiomarkersOfAgeChallenge/dataverse_files/BoAC_plasma_metadata.xlsx"  # point this at your local copy
Plasma_Metadata = read_excel_file(file_path=file_path, sheet_name=sheet)
Plasma_Metadata.head()

Successfully loaded sheet '0' from /home/jamie/Documents/AI4BI/BiomarkersOfAgeChallenge/dataverse_files/BoAC_plasma_metadata.xlsx


Unnamed: 0,Plasma.ID,Sex,Race1,Race2,Ethnicity,Age
0,P1,Male,Black,,Non Hispanic,67.137577
1,P2,Male,White,,Non Hispanic,85.705681
2,P3,Male,White,,Non Hispanic,82.902122
3,P4,Female,White,,DECLINED,22.477755
4,P5,Male,White,,Non Hispanic,70.584531


In [7]:
# Print column dtypes, non-null counts and memory usage for the CNS panel table.
CNS_Alamar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64604 entries, 0 to 64603
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Panel           64604 non-null  object 
 1   PanelLotNumber  64604 non-null  object 
 2   PlateID         64604 non-null  object 
 3   SampleName      64604 non-null  object 
 4   SampleType      64604 non-null  object 
 5   Target          64604 non-null  object 
 6   AlamarTargetID  64604 non-null  object 
 7   UniProtID       64604 non-null  object 
 8   ProteinName     64604 non-null  object 
 9   SampleQC        64604 non-null  object 
 10  LOD             63562 non-null  float64
 11  NPQ             64604 non-null  float64
dtypes: float64(2), object(10)
memory usage: 5.9+ MB


In [8]:
# Print column dtypes, non-null counts and memory usage for the Inflammation panel table.
Inflammation_Alamar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130250 entries, 0 to 130249
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Panel           130250 non-null  object 
 1   PanelLotNumber  130250 non-null  object 
 2   PlateID         130250 non-null  object 
 3   SampleName      130250 non-null  object 
 4   SampleType      130250 non-null  object 
 5   Target          130250 non-null  object 
 6   AlamarTargetID  130250 non-null  object 
 7   UniProtID       130250 non-null  object 
 8   ProteinName     130250 non-null  object 
 9   SampleQC        130250 non-null  object 
 10  LOD             129208 non-null  float64
 11  NPQ             130250 non-null  float64
dtypes: float64(2), object(10)
memory usage: 11.9+ MB


In [None]:
# Print column dtypes, non-null counts and memory usage for the plasma metadata table.
# NOTE(review): the recorded output lists Age as object dtype although the
# previewed values look numeric; presumably some entries are non-numeric.
# Verify and convert (e.g. with pd.to_numeric) before using Age in a model.
Plasma_Metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503 entries, 0 to 502
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Plasma.ID  503 non-null    object
 1   Sex        503 non-null    object
 2   Race1      503 non-null    object
 3   Race2      6 non-null      object
 4   Ethnicity  503 non-null    object
 5   Age        503 non-null    object
dtypes: object(6)
memory usage: 23.7+ KB


# DNA Methylation

In [3]:
# Load the betas table from a CSV resolved relative to the current working
# directory, then preview the first rows.
# NOTE(review): in the recorded preview the identifier column (cg... values)
# arrives as a data column named "Unnamed: 0". Passing index_col=0 would make
# it the index, but that changes betas.shape, so confirm before changing.
betas = pd.read_csv("GSE246337_betas.csv")
betas.head()

Unnamed: 0.1,Unnamed: 0,207686140044_R01C01,207686140044_R02C01,207686140044_R03C01,207686140044_R04C01,207686140044_R05C01,207686140044_R06C01,207686140044_R07C01,207686140044_R08C01,207686140076_R01C01,...,207805820161_R07C01,207805820161_R08C01,207805820171_R01C01,207805820171_R02C01,207805820171_R03C01,207805820171_R04C01,207805820171_R05C01,207805820171_R06C01,207805820171_R07C01,207805820171_R08C01
0,cg00000029_TC21,0.720084,0.453491,0.535564,0.496694,0.525362,0.525319,0.43433,0.408679,0.59424,...,0.485967,0.452223,0.567441,0.473869,0.52324,0.554518,0.446254,0.487361,0.337215,0.062426
1,cg00000109_TC21,0.938044,0.925149,0.949446,0.934988,0.930916,0.949987,0.939432,0.932865,0.943177,...,0.943495,0.915393,0.953727,0.925223,0.943845,0.940539,0.933257,0.938523,0.940187,0.94209
2,cg00000155_BC21,0.957093,0.95903,0.948919,0.954976,0.955666,0.957251,0.955374,0.947911,0.946194,...,0.953201,0.944205,0.962515,0.95749,0.962842,0.956204,0.949011,0.966912,0.968026,0.970274
3,cg00000158_BC21,0.965191,0.957272,0.967278,0.955776,0.97066,0.966632,0.976063,0.971095,0.967079,...,0.961862,0.968069,0.962177,0.969633,0.960911,0.965914,0.973085,0.971039,0.969745,0.96966
4,cg00000165_TC21,0.144061,0.122099,0.167346,0.116861,0.208221,0.198627,0.161761,0.098129,0.166552,...,0.175452,0.354382,0.131584,0.186962,0.16911,0.192639,0.166137,0.156199,0.133038,0.641788


In [None]:
# (rows, columns) of the loaded betas table.
betas.shape

(937690, 501)

In [5]:
# Summary statistics per numeric column; `count` excludes NaN, which is why
# the recorded counts differ from column to column.
betas.describe()

Unnamed: 0,207686140044_R01C01,207686140044_R02C01,207686140044_R03C01,207686140044_R04C01,207686140044_R05C01,207686140044_R06C01,207686140044_R07C01,207686140044_R08C01,207686140076_R01C01,207686140076_R02C01,...,207805820161_R07C01,207805820161_R08C01,207805820171_R01C01,207805820171_R02C01,207805820171_R03C01,207805820171_R04C01,207805820171_R05C01,207805820171_R06C01,207805820171_R07C01,207805820171_R08C01
count,893786.0,887924.0,893019.0,891758.0,894765.0,894557.0,895592.0,898590.0,897491.0,890540.0,...,884190.0,886485.0,888275.0,899585.0,889322.0,891671.0,896899.0,897316.0,896221.0,896947.0
mean,0.632324,0.625617,0.629608,0.628098,0.625219,0.627548,0.618811,0.623185,0.633191,0.625836,...,0.62545,0.626422,0.624777,0.625575,0.633029,0.631592,0.62561,0.631909,0.626946,0.635765
std,0.359507,0.354507,0.362031,0.355753,0.35716,0.354825,0.358204,0.371804,0.358607,0.355006,...,0.361985,0.364694,0.356435,0.361202,0.363336,0.358111,0.372707,0.365578,0.373659,0.380576
min,0.006801,0.007902,0.00704,0.008162,0.007337,0.007154,0.006819,0.006808,0.007673,0.008645,...,0.008035,0.006922,0.0082,0.006482,0.007978,0.007141,0.007517,0.007259,0.006335,0.006734
25%,0.24329,0.244865,0.224893,0.250337,0.241821,0.257019,0.22345,0.17215,0.25447,0.241915,...,0.212634,0.202744,0.237916,0.228941,0.222566,0.253848,0.162606,0.225418,0.17476,0.148654
50%,0.83988,0.821537,0.838462,0.823654,0.819739,0.818399,0.809815,0.844442,0.836568,0.822749,...,0.833279,0.841919,0.819397,0.824052,0.847211,0.830259,0.850107,0.845347,0.854354,0.877883
75%,0.930105,0.921332,0.929748,0.925685,0.926378,0.925436,0.924222,0.935909,0.932077,0.923008,...,0.927183,0.92885,0.923817,0.930379,0.931667,0.929648,0.936141,0.936403,0.93947,0.945145
max,0.994578,0.994257,0.994795,0.994432,0.994581,0.994795,0.994897,0.994929,0.994571,0.993633,...,0.994669,0.995179,0.99461,0.995122,0.994605,0.995008,0.994684,0.995306,0.995394,0.995869


In [8]:
# Tally NaN entries per column, then report only the columns that have any.
missing_values = betas.isna().sum()
print("Missing values in each column:")
print(missing_values.loc[missing_values.gt(0)])

Missing values in each column:
207686140044_R01C01    43904
207686140044_R02C01    49766
207686140044_R03C01    44671
207686140044_R04C01    45932
207686140044_R05C01    42925
                       ...  
207805820171_R04C01    46019
207805820171_R05C01    40791
207805820171_R06C01    40374
207805820171_R07C01    41469
207805820171_R08C01    40743
Length: 500, dtype: int64


In [None]:
# Runs the R package `sesame` through rpy2 on a directory of raw files and
# copies five results back into Python variables.
# TODO(review): these imports sit mid-notebook; consider moving them to the
# import cell at the top so dependencies are visible in one place.
import numpy as np
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import conversion, default_converter

# Directory passed to every openSesame() call below (hard-coded absolute local path).
idat_dir = "/home/jamie/Documents/AI4BI/BiomarkersOfAgeChallenge/GSE246337_RAW/"

# Everything below runs with rpy2's default_converter as the active conversion rules.
with conversion.localconverter(default_converter):
    # Load the R package through rpy2; the returned handle is not used again in this cell.
    sesame = importr('sesame')
    # R snippet, with idat_dir interpolated by the f-string: prints the version
    # check, calls sesameDataCache() (presumably populating the local data
    # cache; confirm), prints the ExperimentHub cache directory, then calls
    # openSesame() five times and stores the results in the R variables
    # qcs, betas, allele_freqs, sdfs and pvals.
    # NOTE(review): the path is pasted into R source as a double-quoted string,
    # so a path containing quotes or backslashes would break the snippet.
    ro.r(f'''
        library(sesame)
        sesame_checkVersion()
      	sesameDataCache()
     	print(tools::R_user_dir("ExperimentHub", which="cache"))
      	qcs = openSesame("{idat_dir}", prep="", func=sesameQC_calcStats, funs="detection")
      	betas = openSesame("{idat_dir}", BPPARAM = BiocParallel::MulticoreParam(2), func = getBetas)
	  	allele_freqs = openSesame("{idat_dir}", func = getAFs)
	  	sdfs = openSesame("{idat_dir}", prep="QCDPB", func = NULL)
   	  	pvals = openSesame("{idat_dir}", func = pOOBAH, return.pval=TRUE)
    ''')
    # Fetch each R variable by name and wrap it as a Python list / NumPy array.
    # NOTE(review): `betas` rebinds the name an earlier cell assigned from
    # pd.read_csv(...), replacing that DataFrame with a NumPy array. Confirm
    # this is intended, or use a distinct name.
    # NOTE(review): presumably np.array() drops any R row/column names, and
    # `sdfs` may not be a rectangular numeric object. Verify the shapes.
    qcs = list(ro.r('qcs'))
    betas = np.array(ro.r('betas'))
    allele_freqs = np.array(ro.r('allele_freqs'))
    sdfs = np.array(ro.r('sdfs'))
    pvals = np.array(ro.r('pvals'))


R callback write-console: SeSAMe requires matched versions of R, sesame, sesameData and ExperimentHub.
Here is the current versions installed:
R: 4.4.3
Bioconductor: 3.20
sesame: 1.24.0
sesameData: 1.24.0
ExperimentHub: 2.14.0

  


[1] "/home/jamie/.cache/R/ExperimentHub"
