## Load and process negative GTEx SNPs from the eQTL Catalogue

In [2]:
import numpy as np
import pandas as pd
import polars as pl
import anndata
import os
import tqdm

%matplotlib inline

## Paths

In [8]:
# Dataset metadata table published in the eQTL-Catalogue-resources repository
eqtl_cat_path = "https://raw.githubusercontent.com/eQTL-Catalogue/eQTL-Catalogue-resources/master/data_tables/dataset_metadata.tsv"

# AnnData file read into `ad` below
anndata_file = "/gstore/data/resbioai/grelu/decima/20240823/data.h5ad"

# Positive-variant table and credible-set variant IDs (read into `susie_df` / `cs_vars` below)
susie_file = "/gstore/data/resbioai/grelu/decima/20240823/bulk_eqtl_results/susie_df.csv"
cs_vars_file = "/gstore/data/resbioai/grelu/decima/20240823/bulk_eqtl_results/cs_vars.npy"

# Results directory (note the trailing slash)
out_dir = "/gstore/data/resbioai/grelu/decima/20240823/bulk_eqtl_results/"

# Directory with one sub-folder of summary statistics per dataset ID (note the trailing slash)
sumstats_dir = "/gstore/data/resbioai/grelu/decima/QTS000015/sumstats/"

## Get GTEx dataset ID to tissue mapping

In [4]:
# Read the dataset metadata table and keep only the GTEx gene-expression ('ge') QTL datasets,
# retaining just the two columns needed to map dataset IDs to sample groups.
eqtl_meta = pd.read_table(eqtl_cat_path)
is_gtex_ge = (eqtl_meta['quant_method'] == 'ge') & (eqtl_meta['study_label'] == 'GTEx')
eqtl_meta = eqtl_meta.loc[is_gtex_ge, ['dataset_id', 'sample_group']]

eqtl_meta

Unnamed: 0,dataset_id,sample_group
115,QTD000116,adipose_subcutaneous
120,QTD000121,adipose_visceral
125,QTD000126,adrenal_gland
130,QTD000131,artery_aorta
135,QTD000136,artery_coronary
140,QTD000141,artery_tibial
145,QTD000146,brain_amygdala
150,QTD000151,brain_anterior_cingulate_cortex
155,QTD000156,brain_caudate
160,QTD000161,brain_cerebellar_hemisphere


In [5]:
# Lookup table: dataset ID -> sample group (tissue) label
ct_dict = dict(zip(eqtl_meta['dataset_id'], eqtl_meta['sample_group']))
ct_dict

{'QTD000116': 'adipose_subcutaneous',
 'QTD000121': 'adipose_visceral',
 'QTD000126': 'adrenal_gland',
 'QTD000131': 'artery_aorta',
 'QTD000136': 'artery_coronary',
 'QTD000141': 'artery_tibial',
 'QTD000146': 'brain_amygdala',
 'QTD000151': 'brain_anterior_cingulate_cortex',
 'QTD000156': 'brain_caudate',
 'QTD000161': 'brain_cerebellar_hemisphere',
 'QTD000166': 'brain_cerebellum',
 'QTD000171': 'brain_cortex',
 'QTD000176': 'brain_frontal_cortex',
 'QTD000181': 'brain_hippocampus',
 'QTD000186': 'brain_hypothalamus',
 'QTD000191': 'brain_nucleus_accumbens',
 'QTD000196': 'brain_putamen',
 'QTD000201': 'brain_spinal_cord',
 'QTD000206': 'brain_substantia_nigra',
 'QTD000211': 'breast',
 'QTD000216': 'fibroblast',
 'QTD000221': 'LCL',
 'QTD000226': 'colon_sigmoid',
 'QTD000231': 'colon_transverse',
 'QTD000236': 'esophagus_gej',
 'QTD000241': 'esophagus_mucosa',
 'QTD000246': 'esophagus_muscularis',
 'QTD000251': 'heart_atrial_appendage',
 'QTD000256': 'heart_left_ventricle',
 'QTD00

## Load Decima metadata

In [6]:
# Load the AnnData object; `ad.var` is used below as the gene table
# (columns referenced in this notebook: gene_id, chrom, start, end, strand, gene_mask_start).
ad = anndata.read_h5ad(anndata_file)

# Map `gene_id` -> `ad.var` index label (used later to fill the `gene` column).
# Built directly from the index so it does not depend on `reset_index()` producing a
# column literally named 'index', which only happens while the index is unnamed.
ensembl_id_map = dict(zip(ad.var['gene_id'], ad.var.index))

## Load negative variants

In [13]:
# Per-gene interval taken from `ad.var`, keyed by gene_id.
# Needed to exclude "unseeable" variants, i.e. those falling outside the gene's interval.
gene_to_interval_dict = {
    gene_id: {'chr': chrom, 'start': start, 'end': end}
    for gene_id, chrom, start, end in zip(
        ad.var['gene_id'], ad.var['chrom'], ad.var['start'], ad.var['end']
    )
}

def check_scoreability(row, intervals=None):
    """Return True when a variant lies strictly inside its gene's interval.

    Args:
        row: Mapping with at least the keys ``'gene_id'`` and ``'position'``
            (one row of the summary statistics).
        intervals: Optional mapping ``gene_id -> {'start': ..., 'end': ...}``.
            Defaults to the notebook-level ``gene_to_interval_dict``.

    Returns:
        bool: ``False`` if the gene has no known interval or the position is
        missing; otherwise whether ``start < position < end``.
    """
    if intervals is None:
        intervals = gene_to_interval_dict

    interval = intervals.get(row['gene_id'])
    if interval is None:
        return False

    pos = row['position']
    if pos is None:
        # A null position cannot be placed in any interval
        return False

    # Both bounds are exclusive.
    # NOTE(review): if `position` is 1-based and `end` is an exclusive 0-based bound,
    # `pos == end` would still be inside the window — confirm the coordinate convention.
    # `bool(...)` guarantees a native Python bool even when the bounds are numpy scalars.
    return bool(interval['start'] < pos < interval['end'])

In [15]:
# Positive variants; later cells use its `dataset`, `celltype`, `gene_id`, `gene` and `variant` columns.
susie_df = pd.read_csv(susie_file)
# Variant IDs used below to exclude credible-set members from the negatives.
cs_vars = np.load(cs_vars_file)
len(cs_vars)

In [27]:
negvar_list = []

# Loop-invariant: materialise the credible-set variant IDs as a plain Python list once,
# instead of rebuilding it from the array for every dataset.
cs_var_list = cs_vars.tolist()

for ds in ct_dict.keys():
    print(ds)

    print("Loading sumstats")
    # `sumstats_dir` already ends in '/', so this path contains '//' (harmless on POSIX).
    file = f'{sumstats_dir}/{ds}/{ds}.all.tsv.gz'
    # Only a positional subset of columns is read; the code below relies on it containing
    # chromosome, position, variant, type, maf, pvalue and gene_id.
    negvars = pl.read_csv(file, separator="\t",
                      schema_overrides={'chromosome':pl.String},
                      columns=(1,2,3,4,5,7,8,11,16,18),
                     )
    print(len(negvars))

    print("Filtering negatives")
    # Select common SNPs that are not nominally significant
    negvars = negvars.filter(
        (pl.col('pvalue') > 0.05) & (pl.col('type') == "SNP") & (pl.col('maf') > 0.05))
    # Restrict to genes that have a positive variant in this dataset
    negvars = negvars.filter(pl.col('gene_id').is_in(susie_df[susie_df.dataset == ds].gene_id))
    # Not in any credible set
    negvars = negvars.filter(~pl.col('variant').is_in(cs_var_list))
    # Inside the gene's interval (row-wise Python callback, so run it last on the smallest table)
    negvars = negvars.filter(pl.struct(pl.all()).map_elements(check_scoreability, return_dtype=pl.Boolean))

    negvars = negvars.to_pandas()

    print("Filtering chromosomes")
    negvars = negvars.rename(columns={'chromosome':'chrom', 'position':'pos'})
    negvars = negvars.dropna(subset=['chrom'])
    # Prefix chromosome names with 'chr'
    negvars['chrom'] = 'chr' + negvars['chrom']

    print("Adding metadata")
    negvars['dataset'] = ds
    negvars['celltype'] = ct_dict[ds]

    print(len(negvars))
    negvar_list.append(negvars)

# `ignore_index=True` gives the combined table a unique RangeIndex instead of
# repeating each per-dataset index.
negvars = pd.concat(negvar_list, ignore_index=True)

QTD000116
Loading sumstats
168441427
Filtering negatives
Filtering chromosomes
Adding metadata
455807
QTD000121
Loading sumstats
167019167
Filtering negatives
Filtering chromosomes
Adding metadata
298493
QTD000126
Loading sumstats
155295380
Filtering negatives
Filtering chromosomes
Adding metadata
193833
QTD000131
Loading sumstats
156857426
Filtering negatives
Filtering chromosomes
Adding metadata
302831
QTD000136
Loading sumstats
163011525
Filtering negatives
Filtering chromosomes
Adding metadata
115800
QTD000141
Loading sumstats
156641820
Filtering negatives
Filtering chromosomes
Adding metadata
444148
QTD000146
Loading sumstats
142903869
Filtering negatives
Filtering chromosomes
Adding metadata
59664
QTD000151
Loading sumstats
144577410
Filtering negatives
Filtering chromosomes
Adding metadata
93223
QTD000156
Loading sumstats
170781268
Filtering negatives
Filtering chromosomes
Adding metadata
162185
QTD000161
Loading sumstats
146116003
Filtering negatives
Filtering chromosomes
Addin

## Add gene metadata

In [30]:
# Compute, once per unique (gene, variant) pair, the variant's position relative to the
# gene window and its absolute distance to the TSS.
negvar_dedup = negvars[['gene_id', 'variant', 'pos']].drop_duplicates()

# Attach the gene window, strand and mask start from `ad.var`
gene_info = ad.var[['gene_id', 'start', 'end', 'strand', 'gene_mask_start']]
negvar_dedup = negvar_dedup.merge(gene_info, on='gene_id').rename(
    columns={'start': 'gene_window_start', 'end': 'gene_window_end', 'strand': 'gene_strand'})

# Offset within the window: measured from the window start for all genes,
# except '-' strand genes, which are measured from the window end.
is_minus = negvar_dedup['gene_strand'] == '-'
offset_from_start = negvar_dedup['pos'] - negvar_dedup['gene_window_start'] - 1
offset_from_end = negvar_dedup['gene_window_end'] - negvar_dedup['pos']
negvar_dedup['pos_relative'] = offset_from_start.where(~is_minus, offset_from_end)

negvar_dedup['abspos_rel_TSS'] = (negvar_dedup['pos_relative'] - negvar_dedup['gene_mask_start']).abs()

# Bring the derived columns back onto the full negatives table
derived_cols = ['gene_id', 'variant', 'gene_strand', 'pos_relative', 'abspos_rel_TSS']
negvars = negvars.merge(negvar_dedup[derived_cols], on=['gene_id', 'variant'])

## Add gene name from Decima metadata

In [32]:
# Drop the `type` column and add the gene name looked up from `ensembl_id_map` by gene_id.
negvars = (
    negvars
    .drop(columns=['type'])
    .assign(gene=lambda df: df['gene_id'].map(ensembl_id_map))
)
negvars.head()

Unnamed: 0,chrom,pos,ref,alt,variant,maf,pvalue,gene_id,rsid,dataset,celltype,gene_strand,pos_relative,abspos_rel_TSS,gene
0,chr1,502653,G,T,chr1_502653_G_T,0.065405,0.586796,ENSG00000225880,rs1167381111,QTD000116,adipose_subcutaneous,-,488709,324869,LINC00115
1,chr1,591460,T,C,chr1_591460_T_C,0.060241,0.831717,ENSG00000225880,rs60396226,QTD000116,adipose_subcutaneous,-,399902,236062,LINC00115
2,chr1,625776,T,G,chr1_625776_T_G,0.129088,0.398739,ENSG00000225880,rs150334593,QTD000116,adipose_subcutaneous,-,365586,201746,LINC00115
3,chr1,627983,C,T,chr1_627983_C_T,0.133391,0.339284,ENSG00000225880,rs137905425,QTD000116,adipose_subcutaneous,-,363379,199539,LINC00115
4,chr1,632781,C,T,chr1_632781_C_T,0.149742,0.195945,ENSG00000225880,rs61769296,QTD000116,adipose_subcutaneous,-,358581,194741,LINC00115


## Add credible set columns

In [33]:
# Placeholder credible-set columns for the negatives
negvars = negvars.assign(pip=0, cs_id='negative', cs_size=0)
negvars.head()

Unnamed: 0,chrom,pos,ref,alt,variant,maf,pvalue,gene_id,rsid,dataset,celltype,gene_strand,pos_relative,abspos_rel_TSS,gene,pip,cs_id,cs_size
0,chr1,502653,G,T,chr1_502653_G_T,0.065405,0.586796,ENSG00000225880,rs1167381111,QTD000116,adipose_subcutaneous,-,488709,324869,LINC00115,0,negative,0
1,chr1,591460,T,C,chr1_591460_T_C,0.060241,0.831717,ENSG00000225880,rs60396226,QTD000116,adipose_subcutaneous,-,399902,236062,LINC00115,0,negative,0
2,chr1,625776,T,G,chr1_625776_T_G,0.129088,0.398739,ENSG00000225880,rs150334593,QTD000116,adipose_subcutaneous,-,365586,201746,LINC00115,0,negative,0
3,chr1,627983,C,T,chr1_627983_C_T,0.133391,0.339284,ENSG00000225880,rs137905425,QTD000116,adipose_subcutaneous,-,363379,199539,LINC00115,0,negative,0
4,chr1,632781,C,T,chr1_632781_C_T,0.149742,0.195945,ENSG00000225880,rs61769296,QTD000116,adipose_subcutaneous,-,358581,194741,LINC00115,0,negative,0


## Match to positive variants by distance

In [34]:
# For each positive variant, collect up to `target_negative_n` negatives for the same gene
# which are as close to the TSS as possible and not yet selected for this cell type.
target_negative_n = 20
selected_vars = {}

# `Series.unique()` keeps first-appearance order, so the iteration order (and with it the
# row order of anything built from `selected_vars`) is reproducible across kernels;
# iterating a `set` of strings is not.
for celltype in susie_df['celltype'].unique():
    print(celltype)

    curr_positive_df = susie_df.query('celltype == @celltype')
    curr_negative_df = negvars.query('celltype == @celltype')

    # Split the negatives by gene once per cell type instead of re-filtering the whole
    # table for every positive. Row order within each group is preserved.
    negatives_by_gene = {
        gene_id: gene_negatives
        for gene_id, gene_negatives in curr_negative_df.groupby('gene_id', sort=False)
    }
    # gene_id -> variant IDs ordered by distance to the TSS (computed lazily, once per gene)
    ranked_by_gene = {}

    chosen = selected_vars[celltype] = set()
    for gene_id in curr_positive_df['gene_id']:
        candidates = ranked_by_gene.get(gene_id)
        if candidates is None:
            gene_negatives = negatives_by_gene.get(gene_id)
            if gene_negatives is None:
                candidates = []
            else:
                candidates = gene_negatives.sort_values('abspos_rel_TSS')['variant'].tolist()
            ranked_by_gene[gene_id] = candidates

        n_added = 0
        for variant in candidates:
            # Skip variants already taken in this cell type (by this or another gene)
            if variant not in chosen:
                chosen.add(variant)
                n_added += 1
                if n_added == target_negative_n:
                    break

brain_caudate
adipose_subcutaneous
heart_left_ventricle
artery_tibial
vagina
brain_hypothalamus
brain_nucleus_accumbens
stomach
pituitary
pancreas
breast
uterus
testis
esophagus_mucosa
esophagus_muscularis
brain_amygdala
artery_aorta
brain_cerebellum
colon_sigmoid
kidney_cortex
ovary
fibroblast
brain_frontal_cortex
adipose_visceral
heart_atrial_appendage
nerve_tibial
brain_cerebellar_hemisphere
colon_transverse
lung
artery_coronary
brain_cortex
esophagus_gej
brain_anterior_cingulate_cortex
thyroid
skin_sun_exposed
brain_substantia_nigra
blood
small_intestine
prostate
skin_not_sun_exposed
brain_putamen
brain_spinal_cord
adrenal_gland
minor_salivary_gland
spleen
liver
LCL
brain_hippocampus
muscle


In [36]:
# Keep only the negatives selected for each cell type.
# NOTE(review): selection is keyed on (celltype, variant) only, so rows of a selected
# variant are kept for every gene it is paired with in that cell type — confirm intended.
reduced_parts = []
for celltype, chosen in selected_vars.items():
    keep = negvars.variant.isin(chosen) & (negvars.celltype == celltype)
    reduced_parts.append(negvars[keep])

negvars_reduced = pd.concat(reduced_parts)
len(susie_df), len(negvars_reduced)

(19049, 523679)

## Print summary stats

In [35]:
# Per-cell-type counts of positive genes/variants and of available vs. selected negatives
rows = []
for celltype, chosen in selected_vars.items():
    pos_ct = susie_df.query('celltype == @celltype')
    neg_ct = negvars.query('celltype == @celltype')

    pos_gene_names = set(pos_ct['gene'])
    # Negatives whose gene_id also occurs among this cell type's positives
    matched_neg = neg_ct.merge(pos_ct['gene_id'], on='gene_id')

    rows.append({
        'celltype': celltype,
        'pos_genes': len(pos_gene_names),
        'pos_genes_with_neg_ct': len(set(neg_ct['gene']) & pos_gene_names),
        'total_pos': len(set(pos_ct['variant'])),
        'total_matched_neg': len(set(matched_neg['variant'])),
        'reduced_matched_neg': len(chosen),
    })

pd.DataFrame(rows)

Unnamed: 0,celltype,pos_genes,pos_genes_with_neg_ct,total_pos,total_matched_neg,reduced_matched_neg
0,brain_caudate,237,237,245,145892,4926
1,adipose_subcutaneous,695,695,748,381142,15206
2,heart_left_ventricle,364,364,364,225530,7620
3,artery_tibial,684,684,729,371392,14887
4,vagina,83,83,84,51042,1686
5,brain_hypothalamus,134,134,139,87201,2802
6,brain_nucleus_accumbens,203,203,207,127664,4211
7,stomach,299,299,310,171579,6342
8,pituitary,260,260,264,161558,5374
9,pancreas,421,421,435,264308,8839


## Combine positive and negative variants

In [37]:
# Append the selected negatives to the positives.
# `ignore_index=True` gives the combined table a unique RangeIndex; without it the index
# labels of the two inputs are repeated, which makes label-based `.loc` lookups ambiguous.
# NOTE: this cell reassigns `susie_df`, so re-running it appends the negatives again.
susie_df = pd.concat([susie_df, negvars_reduced], ignore_index=True)
len(susie_df)

542728

In [42]:
# Remove columns that are not needed in the combined table
susie_df = susie_df.drop(columns=['molecular_trait_id', 'maf'])

In [43]:
# Preview the combined positive + negative table
susie_df.head()

Unnamed: 0,gene_id,cs_id,variant,rsid,cs_size,pip,pvalue,beta,se,z,...,alt,gene,dataset,celltype,gene_window_start,gene_window_end,gene_strand,gene_mask_start,pos_relative,abspos_rel_TSS
0,ENSG00000079335,ENSG00000079335_L1,chr1_100353172_T_G,rs17420882,4,0.928287,1.47253e-07,-0.261039,0.049038,-5.419992,...,G,CDC14A,QTD000116,adipose_subcutaneous,100181161.0,100705449.0,+,163840.0,172010,8170
1,ENSG00000162631,ENSG00000162631_L1,chr1_107135646_G_C,rs115668827,1,1.0,2.96028e-41,1.29766,0.088815,14.796299,...,C,NTNG1,QTD000116,adipose_subcutaneous,106976167.0,107500455.0,+,163840.0,159478,4362
2,ENSG00000181754,ENSG00000181754_L1,chr1_109509517_A_G,rs2570972,1,1.0,1.70801e-60,0.710806,0.038266,18.768437,...,G,AMIGO1,QTD000116,adipose_subcutaneous,109149290.0,109673578.0,-,163840.0,164061,221
3,ENSG00000134184,ENSG00000134184_L3,chr1_109671748_C_T,rs72705222,1,0.998111,3.59572e-06,0.523832,0.111933,4.730494,...,T,GSTM1,QTD000116,adipose_subcutaneous,109523974.0,110048262.0,+,163840.0,147773,16067
4,ENSG00000134184,ENSG00000134184_L1,chr1_109675302_G_A,rs611951,1,1.0,2.97655e-23,-0.803308,0.077334,-10.478578,...,A,GSTM1,QTD000116,adipose_subcutaneous,109523974.0,110048262.0,+,163840.0,151327,12513


## Create VEP input file

In [44]:
# One row per unique (gene_id, variant) pair, restricted to the variant/gene description columns
variant_columns = ['chrom', 'pos', 'ref', 'alt', 'variant', 'rsid', 'gene_id', 'gene', 'gene_strand', 'pos_relative']
variant_df = susie_df[variant_columns].drop_duplicates(subset=['gene_id', 'variant'])
len(variant_df)

229828

In [45]:
# Preview the de-duplicated variant table
variant_df.head()

Unnamed: 0,chrom,pos,ref,alt,variant,rsid,gene_id,gene,gene_strand,pos_relative
0,chr1,100353172,T,G,chr1_100353172_T_G,rs17420882,ENSG00000079335,CDC14A,+,172010
1,chr1,107135646,G,C,chr1_107135646_G_C,rs115668827,ENSG00000162631,NTNG1,+,159478
2,chr1,109509517,A,G,chr1_109509517_A_G,rs2570972,ENSG00000181754,AMIGO1,-,164061
3,chr1,109671748,C,T,chr1_109671748_C_T,rs72705222,ENSG00000134184,GSTM1,+,147773
4,chr1,109675302,G,A,chr1_109675302_G_A,rs611951,ENSG00000134184,GSTM1,+,151327


## Save final variants

In [46]:
# Write both tables, without the index, to the current working directory.
# NOTE(review): `out_dir` is defined above but not used here; `susie_file` (the input
# positives table) lives in that directory under the same file name "susie_df.csv" —
# confirm the intended output location before pointing these writes at `out_dir`.
variant_df.to_csv("variants_df.csv", index=False)
susie_df.to_csv("susie_df.csv", index=False)

In [2]:
# Number of unique (gene_id, variant) pairs written to variants_df.csv
len(variant_df)

229828