# Preprocessing
- Gene Annotation
- Metabrain liftover gr38->gr37

In [1]:
import glob
import pandas as pd
import biomart as bm
import json
import pyensembl
import sys
import subprocess
import os

In [2]:
def shell_do(command, log=False, return_log=False, make_part=False):
    print(f'Executing: {(" ").join(command.split())}', file=sys.stderr)

    if make_part == False:
        res = subprocess.run(command.split(), stdout=subprocess.PIPE)
    else:
        res = subprocess.run(command, shell=True, stdout=subprocess.PIPE)

    if log:
        print(res.stdout.decode('utf-8'))
    if return_log:
        return (res.stdout.decode('utf-8'))

In [3]:
# Use ensembl id or position to find gene name
def gene_rename(df, release = 75):
    
    ensembl = pyensembl.EnsemblRelease(release)
    
    gene_list = []
    for gene,c,p in zip(df['probeID'], df['ProbeChr'],df['Probe_bp']):
        # try pulling data using probeID
        try:
            gene_list.append(ensembl.gene_name_of_gene_id(gene))
        except:
            opt = ensembl.gene_names_at_locus(c,p)
            if isinstance(opt,list):
                gene_list.append("novel_or_none")
            else:
                gene_list.append(opt)
            
    df['Gene_rename'] = gene_list
    
    return df

## 1. Gene Annotation

### 1.1 eQTL Sources

In [4]:
# load main list of eqtl msmr results
msmr_files_main = glob.glob("/../omicSynth/intermediate_results/intermediate_results/*expression*.msmr")

### 1.2.1 metaBrain - GR38
- need to format probeID to remove .##
- create a list of unique ENSG probeIDs to then get Gene names for through pyensembl or biomart

In [43]:
def gene_annot_meta(paths):
    
    for x in paths:
        # read in msmr file
        init_df = pd.read_csv(x, sep = '\t')

        # clean probeID
        init_df['probeID'] = init_df['probeID'].apply(lambda x: x.split('.')[0])
        
        # create new copy of df
        annot_df = init_df.copy()
        annot_df = gene_rename(annot_df, 100)

        # drop Gene column from msmr and replace with gene_map then rename gene_map to Gene
        #annot_df.drop('Gene', axis = 1, inplace = True)

        annot_df.rename({'Gene_rename': 'Gene'}, axis = 1, inplace = True)

        # reposition Gene column
        final_df = annot_df.iloc[:,[0,1,-1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]]

        # export final_df and replace original file
        final_df.to_csv(x, index = None, sep = '\t')
        print(f'Replaced {x} with new gene annotated file')

In [40]:
# get paths that are metabrain
meta_msmr = [x for x in msmr_files_main if 'metaBrain' in x]

unique_metabrain = []

# loop through GTEx file paths
for path in meta_msmr:
    df = pd.read_csv(path, sep = '\t')
    
    # modify probeID
    df['probeID'] = df['probeID'].apply(lambda x: x.split('.')[0])
    
    # turn column into list
    probe_list = list(df['probeID'].unique())
    
    # only add probes not already in main probelist
    for x in probe_list:
        if x not in unique_metabrain:
            unique_metabrain.append(x)

In [None]:
gene_annot_meta(meta_msmr)

### 1.2.2 eQTLgen - GR37
- created mapping in seperate python file - read in json file

In [7]:
# get paths that are eQTLgen
gen_msmr = [x for x in msmr_files_main if 'eQTLgen' in x]

In [10]:
ndd_list = ['AD','ALS', 'FTD', 'LBD', 'PD', 'PSP']

gen_ndd = []
for x in gen_msmr:
    for dx in ndd_list:
        if dx in x:
            gen_ndd.append(x)

In [9]:
def gene_annot(paths):
    
    for x in paths:
        # read in msmr file
        init_df = pd.read_csv(x, sep = '\t')

        # create new copy of df
        annot_df = init_df.copy()
        annot_df = gene_rename(annot_df)

        # drop Gene column from msmr and replace with gene_map then rename gene_map to Gene
        annot_df.drop('Gene', axis = 1, inplace = True)

        annot_df = annot_df.rename({'Gene_rename': 'Gene'}, axis = 1)

        # reposition Gene column
        final_df = annot_df.iloc[:,[0,1,-1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]]

        # export final_df and replace original file
        final_df.to_csv(x, index = None, sep = '\t')
        print(f'Replaced {x} with new gene annotated file')

In [None]:
gene_annot(gen_ndd)

### 1.2.3 Multi Ancestry - GR37
- Annotate any probes that were missed during inital BESD prep

In [15]:
# load main list of multiancestry eqtl msmr results
msmr_files_ma = glob.glob("/../omicSynth/intermediate_results/intermediate_results/*multiancestry*.msmr")

In [16]:
ndd_list = ['AD','ALS', 'FTD', 'LBD', 'PD', 'PSP']

ma_ndd = []
for x in msmr_files_ma:
    for dx in ndd_list:
        if dx in x:
            ma_ndd.append(x)

In [None]:
gene_annot(ma_ndd)

### 1.2 mQTL

In [13]:
def gene_annot_mqtl(paths, probe_df, output = False):
    
    for x in paths:
        # read in msmr file
        init_df = pd.read_csv(x, sep = '\t')

        # merge msmr file with probe_df
        merged_df = init_df.merge(probe_df[['Name', 'UCSC_RefGene_Name']], left_on = 'probeID', right_on = 'Name', how = 'left')

        # drop Gene column from msmr and replace with Closest_TSS_gene_name then rename Closest_TSS_gene_name to Gene
        merged_df.drop('Gene', axis = 1, inplace = True)

        merged_df.rename({'UCSC_RefGene_Name': 'Gene'}, axis = 1, inplace = True)

        # reposition Gene column
        final_df = merged_df.iloc[:,[0,1,-1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]]

        # export final_df and replace original file
        final_df.to_csv(x, index = None, sep = '\t')
        
        if output:
            print(f'Replaced {x} with new gene annotated file')

In [14]:
# glob for SMR results and filter for mMeta + mcrae
msmr_files = glob.glob("/../omicSynth/intermediate_results/intermediate_results/*.msmr")

# filter for files we care about
mmeta = [x for x in msmr_files if 'mMeta' in x]

mcrae = [x for x in msmr_files if 'mcrae' in x]

In [None]:
# read in methylation probe file
i450k = pd.read_csv('/../omicSynth/illumina_Meth450.csv' ,sep = ',')

In [16]:
# reduce down to relavent oclumns to speed up process
i450k_red = i450k[['chr', 'pos', 'Name','Relation_to_Island', 'UCSC_RefGene_Name', 'UCSC_RefGene_Accession']]

In [18]:
# annotate each path and replace old file with new file 
gene_annot_mqtl(mmeta, i450k_red)
gene_annot_mqtl(mcrae, i450k_red)

### 1.2.2 Fix Gene Names
- some lists in gene name spot

In [19]:
test = pd.read_csv(mmeta[0], sep = '\t')

In [20]:
test

Unnamed: 0,probeID,ProbeChr,Gene,Probe_bp,topSNP,topSNP_chr,topSNP_bp,A1,A2,Freq,...,p_GWAS,b_eQTL,se_eQTL,p_eQTL,b_SMR,se_SMR,p_SMR,p_SMR_multi,p_HEIDI,nsnp_HEIDI
0,cg05505459,1,C1orf170,911995,rs3737728,1,1021415,A,G,0.264826,...,0.080911,0.657000,0.114014,8.291478e-09,0.159665,0.095581,0.094825,0.094825,,
1,cg15899727,1,,1003529,rs3737728,1,1021415,A,G,0.264826,...,0.080911,0.392314,0.069718,1.831792e-08,0.267388,0.160394,0.095500,0.095500,,
2,cg07181843,1,,1009445,rs3737728,1,1021415,A,G,0.264826,...,0.080911,-0.486032,0.068586,1.375945e-12,-0.215830,0.127350,0.090118,0.090118,,
3,cg01912388,1,,1011560,rs3737728,1,1021415,A,G,0.264826,...,0.080911,0.500445,0.068389,2.523275e-13,0.209613,0.123462,0.089546,0.089546,,
4,cg14854474,1,,1012526,rs3737728,1,1021415,A,G,0.264826,...,0.080911,0.510518,0.068247,7.407435e-14,0.205478,0.120886,0.089175,0.089175,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65442,cg00704796,22,ARSA;ARSA;ARSA;ARSA;ARSA,51064137,rs131729,22,51053719,C,T,0.678937,...,0.883685,-0.538020,0.045526,3.155299e-32,-0.015799,0.107997,0.883694,0.646017,0.753268,6.0
65443,cg13909895,22,ARSA;ARSA;ARSA;ARSA;ARSA,51066142,rs2269382,22,51018911,C,T,0.965235,...,0.772923,-0.785456,0.140920,2.492728e-08,0.046215,0.160376,0.773219,0.773219,,
65444,cg05282459,22,SHANK3,51117157,rs6010044,22,51101938,A,C,0.777096,...,0.033955,0.598396,0.055080,1.707439e-27,0.232121,0.111525,0.037404,0.117544,0.487893,3.0
65445,cg02218200,22,SHANK3,51135138,rs5770992,22,51146139,A,G,0.875256,...,0.206316,0.615830,0.096994,2.165383e-10,0.178945,0.144375,0.215180,0.215180,0.438655,3.0


In [None]:
# fix gene names that are a list
tmp = []

for x in test['Gene']:
    if ';' in x:
        res = x.split(';')
        if len(set(res)) > 1:
            tmp_set = ','.join(map(str,list(set(res))))
            tmp.append(tmp_set)
        elif len(set(res)) == 1:
            tmp.append(res[0])
    else:
        tmp.append(x)

## 2. Metabrain Liftover GR38 -> GR37
We need to liftover SMR output files since we don't have a (known) way to liftover the BESD files provided by metabrain

In [19]:
def liftover_stats(path_dict, bash_path):
    # need to set up per-user R library on biowulf and download bigsnpr
    ## (https://hpc.nih.gov/apps/R.html)
    
    with open('liftover_stats.swarm', 'w') as f:
        for cohort in path_dict:
            f.write(f"bash {bash_path} -i {path_dict[cohort]['stats_path']} -c {path_dict[cohort]['chrom']} -p {path_dict[cohort]['pos']} -o {path_dict[cohort]['out_path']}\n")
        f.close()
    
    shell_do('swarm -f liftover_stats.swarm -g 50 --module R/4.1')
    
# create dfs for each omic and concat into one big df
def central_df(omic, df):
    # initialize central df
    print(f'Creating {omic} main df')
    main_df = pd.DataFrame()
    
    # use paths to aggregate data into single df
    for index,row in df.iterrows():
        df = pd.read_csv(row.Path, sep = '\t')
        df.insert(0, 'Omic', omic)
        df.insert(1, 'Disease', row.Disease)
        
        main_df = pd.concat([main_df,df])

    return main_df  

In [20]:
msmr_files_main = glob.glob("/../omicSynth/intermediate_results/*expression*.msmr")
# read in an msmr file paths for a disease and concat into one diseases specific file
meta_msmr = [x for x in msmr_files_main if 'metaBrain' in x]

ndd_list = ['AD','ALS', 'FTD', 'LBD', 'PD', 'PSP']

In [23]:
# pull paths for diseases we want
dx_paths = []

for path in meta_msmr:
    for dx in ndd_list:
        if dx in path:
            dx_paths.append(path)

dx_paths_clean = [x for x in dx_paths if 'FTDold' not in x]
# 660 paths in total expected (110 files per disease)* 6dx
len(dx_paths_clean)

660

In [24]:
meta_df = pd.DataFrame()

for path in dx_paths:
    # load in path
    df = pd.read_csv(path, sep = '\t')
    
    # add columns with path data
    df['path'] = path

    # add onto larger df
    meta_df = pd.concat([meta_df, df])

In [32]:
meta_df

Unnamed: 0,probeID,ProbeChr,nsnp_HEIDI,Gene,Probe_bp,topSNP,topSNP_chr,topSNP_bp,A1,A2,...,b_eQTL,se_eQTL,p_eQTL,b_SMR,se_SMR,p_SMR,p_SMR_multi,p_HEIDI,path,Gene.1
0,ENSG00000164818,7,20.0,DNAAF5,726978,7:732388:rs4072597:G_A,7,732388,G,A,...,-0.938440,0.048177,1.654795e-84,0.025210,0.039244,0.520619,0.833209,0.908062,/data/CARD_AA/projects/omicSynth/v8/intermedia...,
1,ENSG00000164849,7,20.0,GPR146,1044546,7:1052008:rs2363285:C_T,7,1052008,C,T,...,0.663118,0.056660,1.224461e-31,0.041674,0.055954,0.456404,0.424068,0.560253,/data/CARD_AA/projects/omicSynth/v8/intermedia...,
2,ENSG00000164880,7,20.0,INTS1,1504389,7:1479375:rs56975547:G_A,7,1479375,G,A,...,0.608613,0.058551,2.623848e-25,0.022646,0.061732,0.713738,0.772412,0.958307,/data/CARD_AA/projects/omicSynth/v8/intermedia...,
3,ENSG00000122687,7,20.0,MRM2,2242205,7:2238591:rs7799006:C_T,7,2238591,C,T,...,-0.531628,0.061569,5.893306e-18,0.033261,0.073326,0.650108,0.798440,0.577434,/data/CARD_AA/projects/omicSynth/v8/intermedia...,
4,ENSG00000106268,7,20.0,NUDT1,2242222,7:2277217:rs7792045:C_T,7,2277217,C,T,...,0.382054,0.062574,1.024151e-09,-0.072968,0.100865,0.469421,0.627739,0.696662,/data/CARD_AA/projects/omicSynth/v8/intermedia...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16,ENSG00000184206,15,,GOLGA6L4,84235773,15:84599416:rs3762168:A_C,15,84599416,A,C,...,-0.822622,0.125503,5.579110e-11,0.056993,0.125996,0.651025,0.616417,0.382000,/data/CARD_AA/projects/omicSynth/v8/intermedia...,ENSG00000184206.12
17,ENSG00000185043,15,,CIB1,90233703,15:90262908:rs9920862:G_T,15,90262908,G,T,...,-0.628521,0.114715,4.277362e-08,-0.224133,0.174004,0.197714,0.197714,0.329995,/data/CARD_AA/projects/omicSynth/v8/intermedia...,ENSG00000185043.12
18,ENSG00000166965,15,,RCCD1,90955796,15:90977057:rs12594925:A_G,15,90977057,G,A,...,-0.884014,0.152065,6.121658e-09,-0.338168,0.164580,0.039905,0.039905,0.947496,/data/CARD_AA/projects/omicSynth/v8/intermedia...,ENSG00000166965.12
19,ENSG00000103852,15,,TTC23,99251223,15:99227790:rs62025697:G_A,15,99227790,A,G,...,-0.691854,0.095428,4.167719e-13,0.072010,0.122885,0.557875,0.201506,0.363476,/data/CARD_AA/projects/omicSynth/v8/intermedia...,ENSG00000103852.13


In [25]:
meta_df.to_csv('/../omicSynth/metabrain_prelift.csv', index = None)

In [26]:
# GWAS sum stats data path
gwas_path = '/../omicSynth/metabrain_prelift.csv'
# out path
sum_out = "/../omicSynth/metabrain_postlift.csv" # path for after liftover is completed

# liftover stats bash path
liftover_stats_bash_path = '/../omicSynth/liftover_stats.sh'

# name of chromosome/position columns
chrom = 'topSNP_chr'
pos = 'topSNP_bp'

# liftover stats path dict
liftover_stats_path_dict = {'meta_snp':{}}

# F1
liftover_stats_path_dict['meta_snp']['stats_path'] = gwas_path
liftover_stats_path_dict['meta_snp']['out_path'] = sum_out
liftover_stats_path_dict['meta_snp']['chrom'] = chrom
liftover_stats_path_dict['meta_snp']['pos'] = pos

liftover_stats(liftover_stats_path_dict, liftover_stats_bash_path)

Executing: swarm -f liftover_stats.swarm -g 50 --module R/4.1


In [30]:
# read in lifted stats # 102 variants not mapped
metalift = pd.read_csv('/../omicSynth/metabrain_postlift.csv', sep = '\t')
metalift

Unnamed: 0,probeID,ProbeChr,nsnp_HEIDI,Gene,Probe_bp,topSNP,topSNP_chr,topSNP_bp,A1,A2,...,b_eQTL,se_eQTL,p_eQTL,b_SMR,se_SMR,p_SMR,p_SMR_multi,p_HEIDI,path,Gene.1
0,ENSG00000164818,7,20.0,DNAAF5,726978,7:732388:rs4072597:G_A,7,772025,G,A,...,-0.938440,0.048177,1.654795e-84,0.025210,0.039244,0.520619,0.833209,0.908062,/data/CARD_AA/projects/omicSynth/v8/intermedia...,
1,ENSG00000164849,7,20.0,GPR146,1044546,7:1052008:rs2363285:C_T,7,1091644,C,T,...,0.663118,0.056660,1.224461e-31,0.041674,0.055954,0.456404,0.424068,0.560253,/data/CARD_AA/projects/omicSynth/v8/intermedia...,
2,ENSG00000164880,7,20.0,INTS1,1504389,7:1479375:rs56975547:G_A,7,1519011,G,A,...,0.608613,0.058551,2.623848e-25,0.022646,0.061732,0.713738,0.772412,0.958307,/data/CARD_AA/projects/omicSynth/v8/intermedia...,
3,ENSG00000122687,7,20.0,MRM2,2242205,7:2238591:rs7799006:C_T,7,2278226,C,T,...,-0.531628,0.061569,5.893306e-18,0.033261,0.073326,0.650108,0.798440,0.577434,/data/CARD_AA/projects/omicSynth/v8/intermedia...,
4,ENSG00000106268,7,20.0,NUDT1,2242222,7:2277217:rs7792045:C_T,7,2316852,C,T,...,0.382054,0.062574,1.024151e-09,-0.072968,0.100865,0.469421,0.627739,0.696662,/data/CARD_AA/projects/omicSynth/v8/intermedia...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98520,ENSG00000184206,15,,GOLGA6L4,84235773,15:84599416:rs3762168:A_C,15,85142647,A,C,...,-0.822622,0.125503,5.579110e-11,0.056993,0.125996,0.651025,0.616417,0.382000,/data/CARD_AA/projects/omicSynth/v8/intermedia...,ENSG00000184206.12
98521,ENSG00000185043,15,,CIB1,90233703,15:90262908:rs9920862:G_T,15,90806140,G,T,...,-0.628521,0.114715,4.277362e-08,-0.224133,0.174004,0.197714,0.197714,0.329995,/data/CARD_AA/projects/omicSynth/v8/intermedia...,ENSG00000185043.12
98522,ENSG00000166965,15,,RCCD1,90955796,15:90977057:rs12594925:A_G,15,91520287,G,A,...,-0.884014,0.152065,6.121658e-09,-0.338168,0.164580,0.039905,0.039905,0.947496,/data/CARD_AA/projects/omicSynth/v8/intermedia...,ENSG00000166965.12
98523,ENSG00000103852,15,,TTC23,99251223,15:99227790:rs62025697:G_A,15,99767995,A,G,...,-0.691854,0.095428,4.167719e-13,0.072010,0.122885,0.557875,0.201506,0.363476,/data/CARD_AA/projects/omicSynth/v8/intermedia...,ENSG00000103852.13


In [33]:
metalift = metalift[['probeID', 'ProbeChr', 'Gene', 'Probe_bp',
       'topSNP', 'topSNP_chr', 'topSNP_bp', 'A1', 'A2', 'Freq', 'b_GWAS',
       'se_GWAS', 'p_GWAS', 'b_eQTL', 'se_eQTL', 'p_eQTL', 'b_SMR', 'se_SMR',
       'p_SMR', 'p_SMR_multi', 'p_HEIDI', 'nsnp_HEIDI', 'path']]
metalift

Unnamed: 0,probeID,ProbeChr,Gene,Probe_bp,topSNP,topSNP_chr,topSNP_bp,A1,A2,Freq,...,b_eQTL,se_eQTL,p_eQTL,b_SMR,se_SMR,p_SMR,p_SMR_multi,p_HEIDI,nsnp_HEIDI,path
0,ENSG00000164818,7,DNAAF5,726978,7:732388:rs4072597:G_A,7,772025,G,A,0.555215,...,-0.938440,0.048177,1.654795e-84,0.025210,0.039244,0.520619,0.833209,0.908062,20.0,/data/CARD_AA/projects/omicSynth/v8/intermedia...
1,ENSG00000164849,7,GPR146,1044546,7:1052008:rs2363285:C_T,7,1091644,C,T,0.469325,...,0.663118,0.056660,1.224461e-31,0.041674,0.055954,0.456404,0.424068,0.560253,20.0,/data/CARD_AA/projects/omicSynth/v8/intermedia...
2,ENSG00000164880,7,INTS1,1504389,7:1479375:rs56975547:G_A,7,1519011,G,A,0.538855,...,0.608613,0.058551,2.623848e-25,0.022646,0.061732,0.713738,0.772412,0.958307,20.0,/data/CARD_AA/projects/omicSynth/v8/intermedia...
3,ENSG00000122687,7,MRM2,2242205,7:2238591:rs7799006:C_T,7,2278226,C,T,0.659509,...,-0.531628,0.061569,5.893306e-18,0.033261,0.073326,0.650108,0.798440,0.577434,20.0,/data/CARD_AA/projects/omicSynth/v8/intermedia...
4,ENSG00000106268,7,NUDT1,2242222,7:2277217:rs7792045:C_T,7,2316852,C,T,0.615542,...,0.382054,0.062574,1.024151e-09,-0.072968,0.100865,0.469421,0.627739,0.696662,20.0,/data/CARD_AA/projects/omicSynth/v8/intermedia...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98520,ENSG00000184206,15,GOLGA6L4,84235773,15:84599416:rs3762168:A_C,15,85142647,A,C,0.841513,...,-0.822622,0.125503,5.579110e-11,0.056993,0.125996,0.651025,0.616417,0.382000,,/data/CARD_AA/projects/omicSynth/v8/intermedia...
98521,ENSG00000185043,15,CIB1,90233703,15:90262908:rs9920862:G_T,15,90806140,G,T,0.750511,...,-0.628521,0.114715,4.277362e-08,-0.224133,0.174004,0.197714,0.197714,0.329995,,/data/CARD_AA/projects/omicSynth/v8/intermedia...
98522,ENSG00000166965,15,RCCD1,90955796,15:90977057:rs12594925:A_G,15,91520287,G,A,0.861963,...,-0.884014,0.152065,6.121658e-09,-0.338168,0.164580,0.039905,0.039905,0.947496,,/data/CARD_AA/projects/omicSynth/v8/intermedia...
98523,ENSG00000103852,15,TTC23,99251223,15:99227790:rs62025697:G_A,15,99767995,A,G,0.437628,...,-0.691854,0.095428,4.167719e-13,0.072010,0.122885,0.557875,0.201506,0.363476,,/data/CARD_AA/projects/omicSynth/v8/intermedia...


In [35]:
for path in dx_paths:
    # remove msmr file extension
    name = path.split('.msmr')[0]
    
    # query lifted df for any results under the specified path
    tmp_df = metalift.query(f'path == "{path}"')
    
    # drop path column
    tmp_df = tmp_df.drop(columns = ['path'])
    
    # export
    tmp_df.to_csv(f'{name}_lifted.csv', sep = '\t', index = None)