# Generate VEP scores for GTeX tissues

In [1]:
import pandas as pd
import numpy as np
import scanpy as sc
import anndata
import os

## Paths

In [2]:
out_dir = '/gstore/data/resbioai/grelu/decima/20240823/bulk_eqtl_results'
susie_file = os.path.join(out_dir, "susie_df.csv")
decima_preds_file = os.path.join(out_dir, 'gtex_eqtl_cat_decima.h5ad')
borzoi_preds_file = os.path.join(out_dir, 'gtex_eqtl_cat_borzoi.h5ad')
gtex_sample_file = 'https://raw.githubusercontent.com/broadinstitute/gtex-v8/master/data/GTEx_Analysis_v8_RNAseq_samples.txt'

## Load fine-mapping results

In [3]:
susie_df = pd.read_csv(susie_file)
susie_df=susie_df[['variant', 'pip', 'beta', 'gene', 'celltype', 'abspos_rel_TSS']]
print(len(susie_df))

542728


  susie_df = pd.read_csv(susie_file)


## Label positive variants (PIP > 0.9)

In [4]:
susie_df['label'] = susie_df.pip > 0.9
susie_df = susie_df.drop(columns='pip')
susie_df.head(3)

Unnamed: 0,variant,beta,gene,celltype,abspos_rel_TSS,label
0,chr1_100353172_T_G,-0.261039,CDC14A,adipose_subcutaneous,8170,True
1,chr1_107135646_G_C,1.29766,NTNG1,adipose_subcutaneous,4362,True
2,chr1_109509517_A_G,0.710806,AMIGO1,adipose_subcutaneous,221,True


## Load predictions

In [6]:
decima_preds = sc.read(decima_preds_file)
# Only consider healthy tracks to match Gtex
decima_preds = decima_preds[:, decima_preds.var.disease.isin(['healthy', 'NA'])]
decima_preds

View of AnnData object with n_obs × n_vars = 229828 × 5790
    obs: 'variant', 'gene'
    var: 'cell_type', 'tissue', 'organ', 'disease', 'study', 'dataset', 'region', 'subregion', 'celltype_coarse', 'n_cells', 'total_counts', 'n_genes', 'size_factor', 'train_pearson', 'val_pearson', 'test_pearson'

In [7]:
borzoi_preds = sc.read(borzoi_preds_file)
borzoi_preds

AnnData object with n_obs × n_vars = 229828 × 7611
    obs: 'variant', 'gene'
    var: 'identifier', 'file', 'clip', 'clip_soft', 'scale', 'sum_stat', 'strand_pair', 'description'

## Map GTeX tissues to decima tasks

In [8]:
gtex_map = {
 'adipose_subcutaneous': ('tissue', ['adipose tissue', 'subcutaneous adipose tissue']),
 'adrenal_gland': ('tissue', ['adrenal gland']),
 'artery_aorta': ('tissue', ['aorta']),
 'blood':('tissue', ['blood']),
 'brain_amygdala': ('subregion', ['Amygdala']),
 'brain_anterior_cingulate_cortex': ('subregion', ['Anterior cingulate cortex','Anterior cingulate cortex - ACC']),
 'brain_cerebellar_hemisphere': ('tissue', ['Cerebellum_Lateral hemisphere of cerebellum - CBL']),
 'brain_cerebellum': ('region', ['Cerebellum']),
 'brain_cortex': ('region', ['Cerebral cortex']),
 'brain_frontal_cortex': ('tissue',['Cerebral cortex_Frontal Cortex']),
 'brain_hippocampus': ('region',['Hippocampus']),
 'brain_hypothalamus': ('region', ['Hypothalamus']),
 'brain_nucleus_accumbens': ('subregion', ['Nucleus accumbens', 'Nucleus Accumbens - NAC']),
 'brain_putamen':('subregion', ['Putamen - Pu']),
 'brain_spinal_cord': ('region',['Spinal cord']),
 'brain_substantia_nigra': ('subregion', ['Substantia Nigra - SN','Substantia nigra']),
 'breast': ('tissue', ['breast', 'mammary gland']),
 'colon_sigmoid': ('tissue',['colon', 'sigmoid colon']),
 'colon_transverse': ('tissue',['colon', 'transverse colon']),
 'esophagus_muscularis': ('tissue',['esophagus muscularis mucosa']),
 'heart_left_ventricle': ('tissue', ['heart left ventricle']),
 'kidney_cortex': ('tissue',['cortex of kidney']),
 'liver': ('organ', ['liver']),
 'lung': ('organ', ['lung']),
 'muscle': ('tissue', ['muscle tissue', 'psoas muscle']),
 'ovary': ('tissue', ['ovary']),
 'pancreas': ('tissue', ['pancreas']),
 'prostate': ('tissue', ['prostate gland', 'peripheral zone of prostate', 'transition zone of prostate']),
 'small_intestine': ('tissue',['ileum', 'small intestine']),
 'spleen': ('organ', ['spleen']),
 'stomach': ('tissue', ['stomach']),
 'testis': ('organ', ['testis']),
 'uterus': ('organ', ['uterus']),
}

#'adipose_visceral', 'brain_caudate','esophagus_mucosa',
#'minor_salivary_gland', 'pituitary', 'heart_atrial_appendage', 
#'skin_not_sun_exposed', 'skin_sun_exposed', 'thyroid', 'vagina'

gtex_map = pd.DataFrame(gtex_map, index=['decima_col', 'decima_sel']).T.reset_index(names='tissue')
print(len(gtex_map))
gtex_map.head()

33


Unnamed: 0,tissue,decima_col,decima_sel
0,adipose_subcutaneous,tissue,"[adipose tissue, subcutaneous adipose tissue]"
1,adrenal_gland,tissue,[adrenal gland]
2,artery_aorta,tissue,[aorta]
3,blood,tissue,[blood]
4,brain_amygdala,subregion,[Amygdala]


## Match these GTeX tissues to Borzoi tracks

In [9]:
gtex_tissues = pd.read_table(gtex_sample_file).set_index('sample_id').tissue_id.to_dict()
gtex_tissues

{'GTEX-1117F-0226-SM-5GZZ7': 'Adipose_Subcutaneous',
 'GTEX-1117F-0426-SM-5EGHI': 'Muscle_Skeletal',
 'GTEX-1117F-0526-SM-5EGHJ': 'Artery_Tibial',
 'GTEX-1117F-0626-SM-5N9CS': 'Artery_Coronary',
 'GTEX-1117F-0726-SM-5GIEN': 'Heart_Atrial_Appendage',
 'GTEX-1117F-1326-SM-5EGHH': 'Adipose_Visceral_Omentum',
 'GTEX-1117F-2426-SM-5EGGH': 'Uterus',
 'GTEX-1117F-2526-SM-5GZY6': 'Vagina',
 'GTEX-1117F-2826-SM-5GZXL': 'Breast_Mammary_Tissue',
 'GTEX-1117F-2926-SM-5GZYI': 'Skin_Not_Sun_Exposed_Suprapubic',
 'GTEX-1117F-3026-SM-5GZYU': 'Minor_Salivary_Gland',
 'GTEX-1117F-3226-SM-5N9CT': 'Brain_Cortex',
 'GTEX-111CU-0126-SM-5GZWZ': 'Adrenal_Gland',
 'GTEX-111CU-0226-SM-5GZXC': 'Thyroid',
 'GTEX-111CU-0326-SM-5GZXO': 'Lung',
 'GTEX-111CU-0426-SM-5GZY1': 'Spleen',
 'GTEX-111CU-0526-SM-5EGHK': 'Pancreas',
 'GTEX-111CU-0626-SM-5EGHL': 'Esophagus_Muscularis',
 'GTEX-111CU-0726-SM-5GZYD': 'Esophagus_Mucosa',
 'GTEX-111CU-0826-SM-5EGIJ': 'Esophagus_Gastroesophageal_Junction',
 'GTEX-111CU-0926-SM-5EGIK

In [10]:
borzoi_gtex_samples = borzoi_preds.var[borzoi_preds.var.identifier.str.contains('GTEX')].identifier
borzoi_gtex_samples = borzoi_gtex_samples.str.split('.').apply(lambda x: x[0])
borzoi_gtex_samples.head()

7522    GTEX-132QS-2526-SM-62LFJ
7523    GTEX-1GMR3-0826-SM-9WYT4
7524    GTEX-1HSEH-0226-SM-ACKVV
7525    GTEX-11GSP-0326-SM-5A5KW
7526    GTEX-13PVR-0226-SM-5RQJI
Name: identifier, dtype: object

In [11]:
borzoi_map = borzoi_gtex_samples.map(gtex_tissues).reset_index().dropna().groupby('identifier').index.apply(list)

In [12]:
borzoi_map.index = [x.lower() for x in borzoi_map.index]

In [13]:
borzoi_map.index.difference(gtex_map.tissue)

Index(['adipose_visceral_omentum', 'artery_tibial', 'bladder',
       'breast_mammary_tissue', 'cells_cultured_fibroblasts',
       'cells_ebv-transformed_lymphocytes', 'cervix_ectocervix',
       'cervix_endocervix', 'esophagus_gastroesophageal_junction',
       'esophagus_mucosa', 'fallopian_tube', 'heart_atrial_appendage',
       'minor_salivary_gland', 'muscle_skeletal', 'nerve_tibial', 'pituitary',
       'skin_not_sun_exposed_suprapubic', 'skin_sun_exposed_lower_leg',
       'small_intestine_terminal_ileum', 'thyroid', 'vagina', 'whole_blood'],
      dtype='object')

In [14]:
set(gtex_map.tissue).difference(borzoi_map.index)

{'blood',
 'brain_anterior_cingulate_cortex',
 'brain_cerebellum',
 'brain_frontal_cortex',
 'brain_hippocampus',
 'brain_hypothalamus',
 'brain_nucleus_accumbens',
 'brain_putamen',
 'brain_spinal_cord',
 'brain_substantia_nigra',
 'breast',
 'esophagus_muscularis',
 'muscle',
 'small_intestine'}

In [15]:
gtex_tissue_to_eqtl_cat_mapping = {
    'adipose_visceral_omentum':'adipose_visceral',
    'whole_blood':'blood',
    'small_intestine_terminal_ileum':'small_intestine',
    'breast_mammary_tissue':'breast',
    'muscle_skeletal':'muscle',
}

for k, v in gtex_tissue_to_eqtl_cat_mapping.items():
    borzoi_map.index = [x if x!=k else v for x in borzoi_map.index]

In [16]:
borzoi_map = pd.DataFrame(borzoi_map).reset_index()
borzoi_map.columns=['tissue', 'borzoi_tracks']

In [17]:
borzoi_map

Unnamed: 0,tissue,borzoi_tracks
0,adipose_subcutaneous,"[7522, 7524]"
1,adipose_visceral,[7523]
2,adrenal_gland,"[7525, 7526, 7527]"
3,artery_aorta,[7535]
4,artery_tibial,"[7534, 7536]"
5,bladder,"[7528, 7529, 7530]"
6,brain_amygdala,[7540]
7,brain_cerebellar_hemisphere,[7539]
8,brain_cortex,[7541]
9,breast,"[7542, 7543, 7544]"


## Combine decima and borzoi mappings

In [18]:
gtex_map = gtex_map.merge(borzoi_map, how='left')
gtex_map.head(3)

Unnamed: 0,tissue,decima_col,decima_sel,borzoi_tracks
0,adipose_subcutaneous,tissue,"[adipose tissue, subcutaneous adipose tissue]","[7522, 7524]"
1,adrenal_gland,tissue,[adrenal gland],"[7525, 7526, 7527]"
2,artery_aorta,tissue,[aorta],[7535]


In [19]:
len(gtex_map), len(gtex_map.dropna(subset='borzoi_tracks'))

(33, 23)

## Compute scores

In [20]:
def compute_mean_score(ad):
    return ad.X.mean(1)

def compute_weighted_score(ad):
    return np.matmul(ad.X, ad.var.n_cells)/ad.var.n_cells.sum()

In [21]:
decima_scores = dict()
borzoi_scores = dict()

# For each tissue
for row in gtex_map.itertuples():

    # Compute decima score
    decima_sub = decima_preds[:, decima_preds.var[row.decima_col].isin(row.decima_sel)]
    decima_scores[row.tissue] = compute_weighted_score(decima_sub)
    # Compute Borzoi score
    if isinstance(row.borzoi_tracks, list):
        borzoi_scores[row.tissue] =  compute_mean_score(borzoi_preds[:, row.borzoi_tracks])
    
decima_scores = pd.DataFrame(decima_scores)
borzoi_scores = pd.DataFrame(borzoi_scores)

decima_scores = pd.concat([decima_preds.obs.reset_index(drop=True), decima_scores], axis=1)
borzoi_scores = pd.concat([borzoi_preds.obs.reset_index(drop=True), borzoi_scores], axis=1)

In [22]:
decima_scores.head(2)

Unnamed: 0,variant,gene,adipose_subcutaneous,adrenal_gland,artery_aorta,blood,brain_amygdala,brain_anterior_cingulate_cortex,brain_cerebellar_hemisphere,brain_cerebellum,...,lung,muscle,ovary,pancreas,prostate,small_intestine,spleen,stomach,testis,uterus
0,chr1_100353172_T_G,CDC14A,-0.008337,-0.003642,-0.020992,-0.012648,-0.008689,-0.00595,-0.007069,-0.006343,...,-0.012233,-0.007603,-0.002903,-0.00721,-0.008354,-0.009008,-0.011102,-0.011276,-0.002577,-0.00282
1,chr1_107135646_G_C,NTNG1,0.261189,0.398851,0.086615,0.049476,-0.122944,-0.331547,-0.435692,-0.282026,...,0.190218,0.243493,0.522173,0.247603,0.218538,0.233432,0.055801,0.110799,0.369275,0.364059


## Save scores

In [23]:
decima_scores.to_csv(
    os.path.join(out_dir, 'gtex_eqtl_cat_decima_scores.csv'),
    index=False
)
borzoi_scores.to_csv(
    os.path.join(out_dir, 'gtex_eqtl_cat_borzoi_scores.csv'),
    index=False
)

## Subset susie to represented tissues

In [24]:
print(len(susie_df))
susie_df = susie_df[susie_df.celltype.isin(gtex_map.tissue)]
print(len(susie_df))

542728
303417


## Merge scores with fine-mapping results

In [25]:
decima_scores = decima_scores.melt(
    id_vars=['variant', 'gene'], var_name='celltype', value_name='decima_score'
)
borzoi_scores = borzoi_scores.melt(
    id_vars=['variant', 'gene'], var_name='celltype', value_name='borzoi_score'
)

In [26]:
decima_scores.head(3)

Unnamed: 0,variant,gene,celltype,decima_score
0,chr1_100353172_T_G,CDC14A,adipose_subcutaneous,-0.008337
1,chr1_107135646_G_C,NTNG1,adipose_subcutaneous,0.261189
2,chr1_109509517_A_G,AMIGO1,adipose_subcutaneous,0.06801


In [27]:
print(len(susie_df))
susie_df = susie_df.merge(decima_scores, on=['variant', 'gene', 'celltype'], how='left')
susie_df = susie_df.merge(borzoi_scores, on=['variant', 'gene', 'celltype'], how='left')
print(len(susie_df))

303417
303417


## Compute absolute scores

In [29]:
susie_df['abs_decima_score'] = np.abs(susie_df['decima_score'])
susie_df['abs_borzoi_score'] = np.abs(susie_df['borzoi_score'])

## Save

In [30]:
susie_df.to_csv(os.path.join(out_dir, 'gtex_eqtl_cat_susie_scored.csv'),
               index=False)