In [1]:
import pandas as pd
import biolearn
from biolearn.data_library import DataLibrary
# Root of the project data tree; every path used below is built by appending to it.
data='../data/'
# On-disk cache passed to biolearn's DataLibrary when GEO series are fetched.
# NOTE(review): the second argument (1000) is presumably a size limit - confirm the unit in biolearn's docs.
cache=biolearn.cache.LocalFolderCache(data+'cache',1000)
# NOTE(review): n is not referenced by any cell visible here - confirm it is still needed.
n=6
# Accumulators filled cohort by cohort while loading and cleaning the DNAm data:
#   dts   - per-cohort demographic frames (cohort, sex, age, and race where available)
#   elovs - per-cohort frames holding probe cg16867657 together with age and cohort
dts,elovs=[],[]

In [2]:
##mesa
# Build the MESA sample sheet: one row per methylation sample carrying visit, age,
# subject id and sex, written to clean/cohorts/meta/mesa_dna.csv.
#
# The index columns are selected by name. `usecols` ignores the order of the list it
# is given, so a positional index_col=0 silently depends on the column order of the file.
# NOTE(review): skiprows=10 presumably skips the file's leading header block - confirm.
meta=pd.read_table(
    data+'source/cohorts/mesa/phs001416.v3.pht010510.v1.p1.c1.TOPMed_MESA_Methylomics_Sample_Attributes.HMB.txt',
    skiprows=10,
    index_col='SAMPLE_ID',
    usecols=['SAMPLE_ID','ASSAY_TYPE','HISTOLOGICAL_TYPE','ANALYTE_ISOLATION_LAB','TOPMED_PHASE','SUBJECT_ID','COLLECTION_VISIT','AGE_AT_COLLECTION'],
)
# Keep only the samples matching this assay / tissue / isolation lab / phase combination.
keep=(
    (meta['ASSAY_TYPE']=='BIS')
    &(meta['HISTOLOGICAL_TYPE']=='PBMC')
    &(meta['ANALYTE_ISOLATION_LAB']=='Tracy-UVT')
    &(meta['TOPMED_PHASE']=='TOPMed MULTI-OMICS Pilot')
)
meta=meta[keep].copy()
meta=meta[['COLLECTION_VISIT','AGE_AT_COLLECTION','SUBJECT_ID']].rename(
    columns={'COLLECTION_VISIT':'visit','AGE_AT_COLLECTION':'age','SUBJECT_ID':'sid'}
)
# Collection visit 5 is relabelled as 2; later cells label visits 1 and 2 as MESA1 / MESA2.
meta['visit']=meta['visit'].replace({5:2})
# Subject-level table, indexed by subject id, supplying the sex column.
subj=pd.read_table(
    data+'source/cohorts/mesa/phs000209.v13.pht001116.v10.p3.c1.MESA_Exam1Main.HMB.txt',
    skiprows=10,
    index_col='sidno',
    usecols=['sidno','gender1'],
)
subj.columns=['sex']
# Inner merge: samples whose subject id has no row in the subject table are dropped.
meta=meta.merge(subj[['sex']],left_on='sid',right_index=True,how='inner')
meta.to_csv(data+'clean/cohorts/meta/mesa_dna.csv')
meta.shape

(1774, 4)

In [3]:
%%time
# Load the pickled MESA methylation matrix; the trailing expression displays its shape.
mesa_pickle=data+'pkls/dna/mesa.pkl'
df=pd.read_pickle(mesa_pickle)
df.shape

CPU times: user 84 ms, sys: 9.99 s, total: 10.1 s
Wall time: 10.1 s


(866553, 1955)

In [4]:
%%time
# MESA: attach the age recorded at each visit to the methylation profiles and collect
# probe cg16867657 per visit; the demographics for `dts` come from the sample sheet.
samples=df.T  # transposed once and reused for both visits

# Visit 1 -> cohort label MESA1.
visit1_age=meta.loc[meta['visit']==1,['age']]
dt=samples.join(visit1_age,how='inner')
dt['cohort']='MESA1'
elovs.append(dt[['cg16867657','age','cohort']])

# Visit 2 -> cohort label MESA2.
visit2_age=meta.loc[meta['visit']==2,['age']]
dt=samples.join(visit2_age,how='inner')
# NOTE(review): only the visit-2 frame is de-duplicated on its index - confirm that
# visit 1 cannot contain repeated sample ids.
dt=dt.loc[~dt.index.duplicated(keep='first')].copy()
dt['cohort']='MESA2'
elovs.append(dt[['cg16867657','age','cohort']])

# Demographics are taken from the whole sample sheet, not from the joined frames above.
# This overwrites `meta`, so the cell cannot be re-run without re-running the MESA sheet cell.
meta=meta[['visit','age','sex']].rename(columns={'visit':'cohort'})
meta['cohort']=meta['cohort'].replace({1:'MESA1',2:'MESA2'})
dts.append(meta[['cohort','sex','age']])
del samples
dt.shape

CPU times: user 14 s, sys: 16.5 s, total: 30.5 s
Wall time: 30.6 s


(888, 866555)

In [5]:
%%time
##PPMI
# Array identifiers: "<Sentrix ID>_<Sentrix Position>", one per row of the id table.
ids=pd.read_table(data+'source/cohorts/ppmi/PPMI_Meth_n524_for_LONI030718.txt',index_col=0)
ids['id']=ids['Sentrix ID'].astype(str)+'_'+ids['Sentrix Position']

# Curated clinical table, restricted to rows whose EVENT_ID is 'BL' and indexed by PATNO as a string.
curated=pd.read_excel(data+'source/cohorts/ppmi/PPMI_Curated_Data_Cut_Public_20240129.xlsx',sheet_name='20240129')
baseline=curated.loc[curated['EVENT_ID']=='BL']
baseline.index=baseline['PATNO'].astype(str)

# Attach the array id and drop rows with any missing value. SEX coding: 0 = female.
meta=baseline[['age_at_visit','SEX']].join(ids['id']).dropna()
meta.columns=['age','sex','id']
# Re-index by array id (the 'id' column is kept) so the sheet lines up with the transposed matrix.
meta=meta.set_index('id',drop=False)
meta.to_csv(data+'clean/cohorts/meta/ppmi_dna.csv')

df=pd.read_pickle(data+'pkls/dna/ppmi.pkl')
dt=df.T.join(meta.drop(columns='id'),how='inner')
dt['cohort']='PPMI'
dts.append(dt[['cohort','sex','age']])
elovs.append(dt[['cg16867657','age','cohort']])
dt.shape

CPU times: user 10.2 s, sys: 7.49 s, total: 17.7 s
Wall time: 18.5 s


(510, 865921)

In [6]:
%%time 
##grady
# Grady cohort: sex and age are taken from two rows of the GEO series-matrix file.
# NOTE(review): skiprows=29 and .iloc[8:10] select the rows by position - confirm they
# still point at the gender / age rows if the series-matrix file is ever re-downloaded.
meta=pd.read_table(data+'source/cohorts/grady/GSE132203_series_matrix.txt',skiprows=29,index_col=0).iloc[8:10].T
meta.columns=['sex','age']
# Remove the literal "age: " prefix. str.strip('age: ') treats its argument as a set of
# characters to trim from both ends, which only works by accident on purely numeric values.
meta['age']=meta['age'].str.replace(r'^age:\s*','',regex=True).astype(float)
# Encode sex as 0 = female, 1 = male.
meta['sex']=meta['sex'].replace({'gender: Female':0,'gender: Male':1})
meta.to_csv(data+'clean/cohorts/meta/grady.csv')
df=pd.read_pickle(data+'pkls/dna/grady.pkl')
# Inner join: keep only samples present in both the methylation matrix and the metadata.
dt=df.T
dt=dt.join(meta,how='inner')
dt['cohort']='Grady'
dts.append(dt[['cohort','sex','age']])
elovs.append(dt[['cg16867657','age','cohort']])



CPU times: user 2.67 s, sys: 6.39 s, total: 9.07 s
Wall time: 9.09 s


In [7]:
%%time
##mgb
# MGB cohort: read the cleaned sample sheet and join it to the methylation matrix.
# Columns are selected and renamed by name. `usecols` does not impose the order of its
# list on the result, so relabelling by position could silently swap sex / age / race.
meta=pd.read_csv(data+'clean/cohorts/meta/mgb.csv',index_col='ID',usecols=['ID','sex','age_at_sample','race'])
meta=meta.rename(columns={'age_at_sample':'age'})[['sex','age','race']]
df=pd.read_pickle(data+'pkls/dna/mgb.pkl')
# Inner join: keep only samples present in both the methylation matrix and the metadata.
dt=df.T
dt=dt.join(meta,how='inner')
dt['cohort']='MGB'
# Encode sex as 0 = female, 1 = male.
dt['sex']=dt['sex'].replace({'F':0,'M':1})
dts.append(dt[['cohort','sex','age','race']])
elovs.append(dt[['cg16867657','age','cohort']])

CPU times: user 2.07 s, sys: 4.29 s, total: 6.36 s
Wall time: 6.37 s




In [8]:
%%time
# Load data from GEO using Biolearn.
# Each series id is printed before loading, and the loaded objects are stored in
# `geos` keyed by series id.
library=DataLibrary(cache=cache)  # built once; it does not depend on the series being fetched
geos={}
for geo in ['GSE157131','GSE42861']:
    print(geo)
    data_source=library.get(geo)
    geos[geo]=data_source.load()

GSE157131
GSE42861
CPU times: user 11.5 s, sys: 4.58 s, total: 16.1 s
Wall time: 16.1 s


In [9]:
##geno
# GENOA cohort, taken from the GEO series loaded into `geos`.
geo='GSE157131'
genoa=geos[geo]
df=genoa.dnam
meta=genoa.metadata.copy()
# Shift the sex code down by one.
# NOTE(review): presumably maps a 1/2 coding onto the 0/1 coding used for the other
# cohorts - confirm against biolearn's metadata convention.
meta['sex']=meta['sex'].sub(1)
# Left join (the default): every metadata row is kept, with or without a methylation profile.
dt=meta.loc[:,['sex','age']].join(df.T)
dt['cohort']='GENOA'
dts.append(dt[['cohort','sex','age']])
elovs.append(dt[['cg16867657','age','cohort']])

In [10]:
%%time
## RA
# RA cohort, taken from the GEO series loaded into `geos`; both tables are copied first.
geo='GSE42861'
ra=geos[geo]
df=ra.dnam.copy()
meta=ra.metadata.copy()
# Shift the sex code down by one.
# NOTE(review): presumably maps a 1/2 coding onto the 0/1 coding used for the other
# cohorts - confirm against biolearn's metadata convention.
meta['sex']=meta['sex'].sub(1)
# Left join (the default): every metadata row is kept, with or without a methylation profile.
dt=meta.loc[:,['sex','age']].join(df.T)
dt['cohort']='RA'
dts.append(dt[['cohort','sex','age']])
elovs.append(dt[['cg16867657','age','cohort']])

CPU times: user 2.52 s, sys: 2.5 s, total: 5.02 s
Wall time: 5.03 s


In [11]:
# Stack the per-cohort demographic frames, turn the numeric sex codes into labels,
# save the table, and display the number of rows per cohort.
sex_labels={0:'Female',1:'Male',2:'Unknown'}
ages=pd.concat(dts)
ages['sex']=ages['sex'].replace(sex_labels)
ages.to_csv(data+'results/dna_age.csv')
ages['cohort'].value_counts()

cohort
GENOA    946
MESA2    888
MESA1    886
Grady    795
RA       689
PPMI     510
MGB      461
Name: count, dtype: int64

In [12]:
# Source data for figure 1e: cohort and age, with the sample index discarded.
fig1e=ages.loc[:,['cohort','age']].reset_index(drop=True)
fig1e_path=data+'figs/1e.csv'
fig1e.to_csv(fig1e_path)
fig1e.shape

(5175, 2)

In [14]:
# Source data for figure 3a1: sex and age, with the sample index discarded.
fig3a1=ages.loc[:,['sex','age']].reset_index(drop=True)
fig3a1_path=data+'figs/3a1.csv'
fig3a1.to_csv(fig3a1_path)
fig3a1.shape

(5175, 2)

In [17]:
# Source data for figure 3a2: sex and race, with the sample index discarded.
# Only the MGB frame appended to `dts` carries a race column, so other cohorts are NaN here.
fig3a2=ages.loc[:,['sex','race']].reset_index(drop=True)
fig3a2_path=data+'figs/3a2.csv'
fig3a2.to_csv(fig3a2_path)
fig3a2.shape

(5175, 2)

In [18]:
# Stack the per-cohort cg16867657 frames; the saved copy is rounded to 3 decimals,
# while `elov` itself keeps full precision. Then display the number of rows per cohort.
elov=pd.concat(elovs)
elov_rounded=elov.round(3)
elov_rounded.to_csv(data+'results/elov.csv')
elov['cohort'].value_counts()

cohort
GENOA    946
MESA2    888
MESA1    886
Grady    795
RA       689
PPMI     510
MGB      461
Name: count, dtype: int64

In [19]:
# Source data for figure 1g: the unrounded cg16867657 table with the sample index discarded.
fig1g=elov.reset_index(drop=True)
fig1g_path=data+'figs/1g.csv'
fig1g.to_csv(fig1g_path)
fig1g.shape

(5175, 3)