In [1]:
import pandas as pd
import numpy as np
import glob,os
import xlrd
data='../data/'
n=6

In [2]:
#get gene lists
gen=pd.read_csv(data+'clean/genes.csv',index_col=0)
gen.shape

(18937, 1)

In [3]:
#load and clean RNA data

In [4]:
##mesa
meta_=data+'source/cohorts/mesa/phs001416.v3.pht010512.v1.p1.c1.TOPMed_MESA_RNA_Seq_Expression_Sample_Attributes.HMB.txt'
meta=pd.read_table(meta_,skiprows=10,index_col='SAMPLE_ID')
meta=meta[meta['HISTOLOGICAL_TYPE']=='PBMC']
meta=meta[['AGE_AT_COLLECTION','COLLECTION_VISIT','SUBJECT_ID','ANALYTE_ISOLATION_BATCH_ID']]
meta.columns=['age','cohort','sid','phase']
meta=meta[meta['phase'].isin(['1','2','3','4','5'])].copy()
meta['cohort']=meta['cohort'].replace({1:'MESA1',5:'MESA2'})
meta=meta[['cohort','sid','age']].sort_values(['cohort','sid']).copy()
subj=pd.read_table(data+'source/cohorts/mesa/phs000209.v13.pht001116.v10.p3.c1.MESA_Exam1Main.HMB.txt',skiprows=10,index_col=0,usecols=['sidno','gender1'])
subj.columns=['sex']
subj['sex']=subj['sex'].replace({0:'Female',1:'Male'})
meta=meta.merge(subj[['sex']],left_on='sid',right_index=True,how='inner')
meta.to_csv(data+'clean/cohorts/meta/mesa_rna.csv')
meta.shape

(1861, 4)

In [5]:
%%time
# create data pickles 
mesa=pd.read_pickle(data+'pkls/rna/mesa.pkl')
mesa['id']=mesa.index.str.split('.').str[0]
mesa=mesa.groupby('id').mean()
mesa=mesa[mesa.index.isin(gen.index)]
mesa=mesa.T
mesa=meta.join(mesa,how='inner')
mesa.shape

CPU times: user 3.95 s, sys: 4.11 s, total: 8.06 s
Wall time: 8.08 s


(1861, 18834)

In [6]:
jen=pd.read_pickle(data+'pkls/rna/jena.pkl')
jen.columns=jen.columns.str.strip(' ')
jen=jen[jen.index.isin(gen.index)]
jen=jen.drop(['external_gene_id','description','gene_biotype'],axis=1)
jen=jen[jen.sum(1)>0].T
jen1_meta_='https://ftp.ncbi.nlm.nih.gov/geo/series/GSE103nnn/GSE103232/matrix/GSE103232_series_matrix.txt.gz'
jen1_meta=pd.read_table(jen1_meta_,index_col=0,skiprows=32)
age1=jen1_meta.iloc[8].str[5:7].astype(int)
age1.index=age1.index.str[-3:]
jen2_meta_='https://ftp.ncbi.nlm.nih.gov/geo/series/GSE75nnn/GSE75337/matrix/GSE75337_series_matrix.txt.gz'
jen2_meta=pd.read_table(jen2_meta_,index_col=0,skiprows=34)
age2=jen2_meta.iloc[8].str[5:7].astype(int)
age2=age2[age2.index.str.contains('blood')]
age=pd.concat([age1,age2])
jena=jen.join(age.rename('age'))
jena['cohort']='JenAge'
jena['sex']='Unknown'
jena.shape

(62, 17540)

In [7]:
%%time
##gc6
meta_=data+'source/cohorts/gc6/GSE94438_series_matrix.txt'
meta=pd.read_table(meta_,index_col=0,skiprows=33).iloc[[7,10,11,12]].T
meta.columns=['id','age','sex','group']
meta['sex']=meta['sex'].replace({'gender: F':'Female','gender: M':'Male'})
meta=meta[meta['group']=='group: Control'].copy()
meta['age']=meta['age'].str.strip('age: ')
meta['age']=pd.to_numeric(meta['age'], errors='coerce')
meta.index='X'+meta['id'].str.strip('code: : ')
gc6=pd.read_pickle(data+'pkls/rna/gc6.pkl')
gc6=gc6[gc6.index.isin(gen.index)]
gc6=gc6.drop('symbol',axis=1)
gc6=meta[['age','sex']].join(gc6.T)
gc6['cohort']='GC6'
gc6.shape

CPU times: user 108 ms, sys: 122 ms, total: 230 ms
Wall time: 227 ms


(327, 13513)

In [8]:
%%time
##dutch
meta_='https://ftp.ncbi.nlm.nih.gov/geo/series/GSE134nnn/GSE134080/matrix/GSE134080_series_matrix.txt.gz'
meta=pd.read_table(meta_,skiprows=30).drop('!Sample_title',axis=1).iloc[10:12].T
meta.columns=['age','sex']
meta['age']=meta['age'].str.strip('age: ').astype(int)
meta['sex']=meta['sex'].replace({'Sex: female':'Female','Sex: male':'Male'})
meta.index=meta.index.str.strip('Dutch 500FG cohort, whole blood, individual ')
dutch=pd.read_pickle(data+'pkls/rna/dutch.pkl')
dutch=dutch[dutch.index.isin(gen.index)]
dutch=dutch.T.join(meta,how='inner')
dutch['cohort']='Dutch'
dutch.shape

CPU times: user 72.5 ms, sys: 63.2 ms, total: 136 ms
Wall time: 718 ms


(97, 17418)

In [9]:
%%time
#ppmi
meta=pd.read_excel(data+'source/cohorts/ppmi/PPMI_Curated_Data_Cut_Public_20240129.xlsx',sheet_name='20240129')
meta=meta[meta['EVENT_ID']=='BL']
meta.index=meta['PATNO'].astype(str)
meta=meta[['age_at_visit','SEX','visit_date']]
meta.columns=['age','sex','visit_date']
meta['sex']=meta['sex'].replace({0:'Female',1:'Male'})
meta.to_csv(data+'clean/cohorts/meta/ppmi_rna.csv')
ppmi=pd.read_pickle(data+'pkls/rna/ppmi.pkl')
ppmi['id']=ppmi.index.str.split('.').str[0]
ppmi=ppmi.groupby('id').mean()
ppmi=ppmi[ppmi.index.isin(gen.index)]
ppmi=meta.drop('visit_date',axis=1).join(ppmi.T,how='inner')
ppmi['cohort']='PPMI'
ppmi.shape

CPU times: user 8.82 s, sys: 2.39 s, total: 11.2 s
Wall time: 12.2 s


(1111, 18853)

In [10]:
%%time
rna=pd.concat([mesa,ppmi,gc6,dutch,jena])
rna.to_pickle(data+'results/rna.pkl')
rna[['cohort','age','sex']].reset_index(drop=True).to_csv(data+'figs/1a.csv')    
rna[['cohort','age','ENSG00000174807']].reset_index(drop=True).to_csv(data+'figs/1c.csv')    

CPU times: user 987 ms, sys: 732 ms, total: 1.72 s
Wall time: 2.49 s
