# Load and process fine-mapped GTEx eQTLs (PIP > 0.9) from the eQTL Catalogue

In [1]:
import numpy as np
import pandas as pd
import anndata
import os
import tqdm

from grelu.data.preprocess import filter_blacklist, filter_chromosomes
from grelu.variant import filter_variants

import torch
%matplotlib inline

## Paths

In [2]:
# Dataset metadata table from the eQTL Catalogue resources repository (read over HTTP).
eqtl_cat_path = (
    "https://raw.githubusercontent.com/eQTL-Catalogue/eQTL-Catalogue-resources"
    "/master/data_tables/dataset_metadata.tsv"
)

# Common root of the decima input and output locations used in this notebook.
decima_root = "/gstore/data/resbioai/grelu/decima"
anndata_file = f"{decima_root}/20240823/data.h5ad"  # AnnData file read for gene metadata
susie_dir = f"{decima_root}/QTS000015/susie/"  # one sub-directory of credible sets per dataset ID
out_dir = f"{decima_root}/20240823/bulk_eqtl_results/"  # destination for the saved results

## Get GTEx dataset ID to tissue mapping

In [3]:
# Read the catalogue's dataset table, then keep only the GTEx gene-expression
# ("ge") QTL datasets and the two columns needed for the ID -> tissue mapping.
catalogue = pd.read_table(eqtl_cat_path)
is_gtex_gene_expression = (catalogue['quant_method'] == 'ge') & (catalogue['study_label'] == 'GTEx')
eqtl_meta = catalogue.loc[is_gtex_gene_expression, ['dataset_id', 'sample_group']]

eqtl_meta

Unnamed: 0,dataset_id,sample_group
115,QTD000116,adipose_subcutaneous
120,QTD000121,adipose_visceral
125,QTD000126,adrenal_gland
130,QTD000131,artery_aorta
135,QTD000136,artery_coronary
140,QTD000141,artery_tibial
145,QTD000146,brain_amygdala
150,QTD000151,brain_anterior_cingulate_cortex
155,QTD000156,brain_caudate
160,QTD000161,brain_cerebellar_hemisphere


In [4]:
# dataset ID -> sample group (tissue) lookup, in the row order of eqtl_meta
ct_dict = dict(zip(eqtl_meta['dataset_id'], eqtl_meta['sample_group']))
ct_dict

{'QTD000116': 'adipose_subcutaneous',
 'QTD000121': 'adipose_visceral',
 'QTD000126': 'adrenal_gland',
 'QTD000131': 'artery_aorta',
 'QTD000136': 'artery_coronary',
 'QTD000141': 'artery_tibial',
 'QTD000146': 'brain_amygdala',
 'QTD000151': 'brain_anterior_cingulate_cortex',
 'QTD000156': 'brain_caudate',
 'QTD000161': 'brain_cerebellar_hemisphere',
 'QTD000166': 'brain_cerebellum',
 'QTD000171': 'brain_cortex',
 'QTD000176': 'brain_frontal_cortex',
 'QTD000181': 'brain_hippocampus',
 'QTD000186': 'brain_hypothalamus',
 'QTD000191': 'brain_nucleus_accumbens',
 'QTD000196': 'brain_putamen',
 'QTD000201': 'brain_spinal_cord',
 'QTD000206': 'brain_substantia_nigra',
 'QTD000211': 'breast',
 'QTD000216': 'fibroblast',
 'QTD000221': 'LCL',
 'QTD000226': 'colon_sigmoid',
 'QTD000231': 'colon_transverse',
 'QTD000236': 'esophagus_gej',
 'QTD000241': 'esophagus_mucosa',
 'QTD000246': 'esophagus_muscularis',
 'QTD000251': 'heart_atrial_appendage',
 'QTD000256': 'heart_left_ventricle',
 'QTD00

## Load decima metadata

In [5]:
ad = anndata.read_h5ad(anndata_file)

# Map every `gene_id` value in ad.var to the index label of its row.
# Built directly from the column and the index so that it does not depend on
# the index being unnamed (the reset_index()/['index'] route raises a KeyError
# as soon as ad.var.index carries a name).
ensembl_id_map = dict(zip(ad.var['gene_id'], ad.var.index))

## Load fine-mapping results

In [6]:
# Read the credible-set table of every dataset in ct_dict and annotate each row
# with the parsed variant fields, the mapped gene label and the dataset/tissue.
per_dataset_tables = []

for ds, tissue in ct_dict.items():
    print(ds)
    df = pd.read_table(os.path.join(susie_dir, ds, f'{ds}.credible_sets.tsv.gz'))

    # Variant IDs are underscore-separated (e.g. "chr1_100002416_C_T").
    # Split each ID once and reuse the pieces for all four derived columns.
    variant_fields = df['variant'].str.split('_')
    df['chrom'] = variant_fields.str[0]
    df['pos'] = variant_fields.str[1].astype(int)
    df['ref'] = variant_fields.str[2]
    df['alt'] = variant_fields.str[3]

    # gene IDs absent from ensembl_id_map become NaN here
    df['gene'] = df['gene_id'].map(ensembl_id_map)
    df['dataset'] = ds
    df['celltype'] = tissue
    per_dataset_tables.append(df)

# A separate name is used for the list so that `susie_df` is only ever a DataFrame.
susie_df = pd.concat(per_dataset_tables, axis=0).reset_index(drop=True)
print(len(susie_df))

# make complete list of "credible variants"
cs_vars = set(susie_df['variant'])

QTD000116
QTD000121
QTD000126
QTD000131
QTD000136
QTD000141
QTD000146
QTD000151
QTD000156
QTD000161
QTD000166
QTD000171
QTD000176
QTD000181
QTD000186
QTD000191
QTD000196
QTD000201
QTD000206
QTD000211
QTD000216
QTD000221
QTD000226
QTD000231
QTD000236
QTD000241
QTD000246
QTD000251
QTD000256
QTD000261
QTD000266
QTD000271
QTD000276
QTD000281
QTD000286
QTD000291
QTD000296
QTD000301
QTD000306
QTD000311
QTD000316
QTD000321
QTD000326
QTD000331
QTD000336
QTD000341
QTD000346
QTD000351
QTD000356
5598870


In [8]:
# Number of distinct variant IDs collected across all credible sets
n_credible_variants = len(cs_vars)
print(n_credible_variants)

1043119


In [7]:
# Preview the first three rows of the combined table
susie_df.head(n=3)

Unnamed: 0,molecular_trait_id,gene_id,cs_id,variant,rsid,cs_size,pip,pvalue,beta,se,z,cs_min_r2,region,chrom,pos,ref,alt,gene,dataset,celltype
0,ENSG00000156876,ENSG00000156876,ENSG00000156876_L1,chr1_100002416_C_T,rs12128170,64,0.014681,1.15737e-08,0.681192,0.117618,5.865267,0.60822,chr1:99132955-101132955,chr1,100002416,C,T,SASS6,QTD000116,adipose_subcutaneous
1,ENSG00000156876,ENSG00000156876,ENSG00000156876_L1,chr1_100003083_G_T,rs11801439,64,0.015396,1.19031e-08,0.685405,0.118448,5.860279,0.60822,chr1:99132955-101132955,chr1,100003083,G,T,SASS6,QTD000116,adipose_subcutaneous
2,ENSG00000156876,ENSG00000156876,ENSG00000156876_L1,chr1_100005399_C_T,rs11805704,64,0.014681,1.15737e-08,0.681192,0.117618,5.865267,0.60822,chr1:99132955-101132955,chr1,100005399,C,T,SASS6,QTD000116,adipose_subcutaneous


## Select positives (PIP > 0.9)

In [9]:
# Rows are kept only when their posterior inclusion probability (`pip`)
# is strictly above this cutoff.
pip_threshold = 0.9

print(len(susie_df))  # row count before filtering
susie_df = susie_df.loc[susie_df['pip'] > pip_threshold]
len(susie_df)

5598870


34183

In [12]:
# Ten datasets with the most remaining rows, shown in ascending order
rows_per_dataset = susie_df['dataset'].value_counts()
rows_per_dataset.sort_values().tail(10)

dataset
QTD000356    1085
QTD000241    1117
QTD000246    1197
QTD000141    1275
QTD000216    1318
QTD000316    1339
QTD000116    1386
QTD000336    1589
QTD000286    1813
QTD000341    1828
Name: count, dtype: int64

## Filter fine-mapped variants

In [13]:
# Drop rows whose gene could not be mapped, then apply the gReLU filters in turn.
susie_df = (
    susie_df.loc[susie_df['gene'].notna()]
    .pipe(filter_variants, max_del_len=0, max_insert_len=0, standard_bases=True)  # remove indels
    .pipe(filter_chromosomes, include='autosomesXY')  # keep standard chroms
    .pipe(filter_blacklist, genome="hg38", window=100)  # remove variants in blacklisted regions
)

Initial number of variants: 23244
Final number of variants: 20627
Keeping 20627 intervals
Keeping 19937 intervals


In [14]:
# Ten datasets with the most remaining rows, shown in ascending order
rows_per_dataset = susie_df['dataset'].value_counts()
rows_per_dataset.sort_values().tail(10)

dataset
QTD000281     683
QTD000336     710
QTD000356     717
QTD000246     730
QTD000141     788
QTD000316     788
QTD000116     801
QTD000216     877
QTD000341    1073
QTD000286    1092
Name: count, dtype: int64

In [15]:
# subset to those in decima intervals
# Attach each gene's window information from ad.var. The join key is stated
# explicitly so that it cannot silently change if the two frames ever share
# another column name.
gene_window_cols = ['gene_id', 'start', 'end', 'strand', 'gene_mask_start']
susie_df = susie_df.merge(ad.var[gene_window_cols], on='gene_id', how='inner').rename(
    columns={'start': 'gene_window_start', 'end': 'gene_window_end', 'strand': 'gene_strand'})

# keep variants within the sequence window
# NOTE(review): both bounds are strict, so a variant with pos == gene_window_end
# is dropped; confirm this matches the coordinate convention of ad.var.
inside_window = (susie_df['pos'] > susie_df['gene_window_start']) & (susie_df['pos'] < susie_df['gene_window_end'])
susie_df = susie_df[inside_window]
len(susie_df)

19049

In [16]:
# Ten datasets with the most remaining rows, shown in ascending order
rows_per_dataset = susie_df['dataset'].value_counts()
rows_per_dataset.sort_values().tail(10)

dataset
QTD000281     661
QTD000336     675
QTD000356     687
QTD000246     707
QTD000141     748
QTD000316     759
QTD000116     766
QTD000216     842
QTD000341    1032
QTD000286    1053
Name: count, dtype: int64

## Add relative position to fine-mapped variants

In [17]:
# Offset of each variant inside its gene window:
#   '-' strand genes: gene_window_end - pos
#   all other rows:   pos - gene_window_start - 1
# Computed in a single vectorised step and attached with `assign`, which
# returns a new frame instead of writing into a boolean-filtered slice
# (the pattern that triggers pandas' SettingWithCopyWarning).
on_minus_strand = susie_df['gene_strand'] == '-'
pos_relative = np.where(
    on_minus_strand,
    susie_df['gene_window_end'] - susie_df['pos'],
    susie_df['pos'] - susie_df['gene_window_start'] - 1,
)

# Absolute distance between the relative position and gene_mask_start
# (presumably the TSS offset within the window, going by the column name — confirm).
susie_df = susie_df.assign(
    pos_relative=pos_relative,
    abspos_rel_TSS=np.abs(pos_relative - susie_df['gene_mask_start']),
)

In [18]:
# Preview the first five rows, including the new position columns
susie_df.head(n=5)

Unnamed: 0,molecular_trait_id,gene_id,cs_id,variant,rsid,cs_size,pip,pvalue,beta,se,...,alt,gene,dataset,celltype,gene_window_start,gene_window_end,gene_strand,gene_mask_start,pos_relative,abspos_rel_TSS
0,ENSG00000079335,ENSG00000079335,ENSG00000079335_L1,chr1_100353172_T_G,rs17420882,4,0.928287,1.47253e-07,-0.261039,0.049038,...,G,CDC14A,QTD000116,adipose_subcutaneous,100181161,100705449,+,163840,172010,8170
1,ENSG00000162631,ENSG00000162631,ENSG00000162631_L1,chr1_107135646_G_C,rs115668827,1,1.0,2.96028e-41,1.29766,0.088815,...,C,NTNG1,QTD000116,adipose_subcutaneous,106976167,107500455,+,163840,159478,4362
2,ENSG00000181754,ENSG00000181754,ENSG00000181754_L1,chr1_109509517_A_G,rs2570972,1,1.0,1.70801e-60,0.710806,0.038266,...,G,AMIGO1,QTD000116,adipose_subcutaneous,109149290,109673578,-,163840,164061,221
3,ENSG00000134184,ENSG00000134184,ENSG00000134184_L3,chr1_109671748_C_T,rs72705222,1,0.998111,3.59572e-06,0.523832,0.111933,...,T,GSTM1,QTD000116,adipose_subcutaneous,109523974,110048262,+,163840,147773,16067
4,ENSG00000134184,ENSG00000134184,ENSG00000134184_L1,chr1_109675302_G_A,rs611951,1,1.0,2.97655e-23,-0.803308,0.077334,...,A,GSTM1,QTD000116,adipose_subcutaneous,109523974,110048262,+,163840,151327,12513


## Save

In [13]:
# Write the filtered table as CSV, leaving out the row index
susie_csv_path = os.path.join(out_dir, 'susie_df.csv')
susie_df.to_csv(susie_csv_path, index=False)

In [16]:
# Set iteration order for strings is not stable across interpreter runs, so the
# variant IDs are sorted before saving to make the output file reproducible.
np.save(os.path.join(out_dir, 'cs_vars.npy'), np.array(sorted(cs_vars)))