# Aggregate Decima VEP results

In [1]:
import pandas as pd
import numpy as np
import scanpy as sc
import anndata
import os

## Paths

In [2]:
# Directory holding the per-replicate prediction tables and the SuSiE table;
# the aggregated AnnData is written back into it at the end of the notebook.
out_dir = '/gstore/data/resbioai/grelu/decima/20240823/bulk_eqtl_results/'
# Decima AnnData file; only its .obs table is read from it.
decima_metadata_file = '/gstore/data/resbioai/grelu/decima/20240823/data.h5ad'

## Load Decima metadata

In [3]:
# Only the .obs table of the Decima AnnData is kept; it is used further down
# as .var of the aggregated predictions.
decima_adata = sc.read(decima_metadata_file)
decima_tasks = decima_adata.obs
del decima_adata  # nothing else from the file is needed

## Load individual replicate predictions

In [4]:
# One parquet table per replicate (rep0 .. rep3), kept in replicate order.
rep_files = [
    os.path.join(out_dir, f'gtex_eqtl_cat_decima_rep{rep}.pq')
    for rep in range(4)
]
preds = [pd.read_parquet(rep_file) for rep_file in rep_files]

## Split variant metadata from predictions

In [5]:
# The 15 leading columns of a replicate table are variant/gene metadata and
# everything after them is predictions. The metadata is taken from replicate 0.
first_rep = preds[0]
obs = first_rep.iloc[:, 0:15]
obs.head()

Unnamed: 0,chrom,pos,ref,alt,gene,start,end,strand,gene_mask_start,gene_mask_end,rel_pos,ref_tx,alt_tx,tss_dist,variant
0,chr1,100353172,T,G,CDC14A,100181161,100705449,+,163840,339116,172010,T,G,8170,chr1_100353172_T_G
1,chr1,107135646,G,C,NTNG1,106976167,107500455,+,163840,507291,159478,G,C,-4362,chr1_107135646_G_C
2,chr1,109509517,A,G,AMIGO1,109149290,109673578,-,163840,169400,164061,T,C,221,chr1_109509517_A_G
3,chr1,109671748,C,T,GSTM1,109523974,110048262,+,163840,185065,147773,C,T,-16067,chr1_109671748_C_T
4,chr1,109675302,G,A,GSTM1,109523974,110048262,+,163840,185065,151327,G,A,-12513,chr1_109675302_G_A


In [6]:
# Drop the 15 leading metadata columns so that each replicate is reduced to its
# bare prediction matrix.
#
# The matrices are averaged position-by-position afterwards and annotated with
# replicate 0's metadata, so a replicate whose rows or prediction columns are
# ordered differently would silently corrupt the average. Check the alignment
# explicitly before the labels are thrown away.
first_rep = preds[0]
key_cols = ['variant', 'gene']
ref_keys = first_rep[key_cols].reset_index(drop=True)
for rep, rep_df in enumerate(preds[1:], start=1):
    if rep_df.shape != first_rep.shape:
        raise ValueError(
            f"replicate {rep} has shape {rep_df.shape}, expected {first_rep.shape}"
        )
    if not rep_df.columns.equals(first_rep.columns):
        raise ValueError(f"replicate {rep} has different columns than replicate 0")
    # Compare on a fresh positional index: only the row order matters here.
    if not rep_df[key_cols].reset_index(drop=True).equals(ref_keys):
        raise ValueError(
            f"replicate {rep} lists variant/gene pairs in a different order than replicate 0"
        )

preds = [rep_df.iloc[:, 15:].values for rep_df in preds]

## Average across replicates

In [7]:
# Stack the replicate matrices along a new leading axis and take the mean over
# that axis, leaving one matrix with the shape of a single replicate.
# Kept as one expression so the stacked copy is released straight away.
preds = np.mean(np.stack(preds, axis=0), axis=0)
preds.shape

(229828, 8856)

## Make anndata

In [8]:
# Rows (.obs) are labelled by variant and gene, columns (.var) by the Decima
# task table loaded earlier.
variant_gene = obs[['variant', 'gene']]
preds = anndata.AnnData(X=preds, obs=variant_gene, var=decima_tasks)
preds.shape



(229828, 8856)

## Minimally annotate .obs

In [9]:
# Columns of the SuSiE table that this notebook uses, in the order they are kept.
susie_cols = ['variant', 'pip', 'beta', 'gene', 'celltype']

susie_file = os.path.join(out_dir, "susie_df.csv")
# Parse only the needed columns instead of the whole file: less memory, and
# unused columns can no longer trigger parser warnings. `usecols` returns the
# columns in file order, so the original order is re-imposed by the selection.
susie_df = pd.read_csv(susie_file, usecols=susie_cols)[susie_cols]
# A row is a positive when its `pip` exceeds 0.9
# (pip is presumably the SuSiE posterior inclusion probability - confirm).
susie_df['label'] = susie_df['pip'] > 0.9
print(len(susie_df))

  susie_df = pd.read_csv(susie_file)


542728


In [17]:
# Distinct (variant, gene) pairs that have at least one row labelled positive
# in susie_df, each tagged with is_eQTL=True.
is_positive = susie_df['label']
pos = (
    susie_df.loc[is_positive, ['variant', 'gene']]
    .drop_duplicates()
    .assign(is_eQTL=True)
)
len(pos)

7183

In [19]:
# Flag every row of preds.obs whose (variant, gene) pair appears in `pos`.
obs_names = preds.obs_names
annotated = (
    preds.obs
    # Drop a flag left over from a previous run so the cell can be re-executed.
    .drop(columns='is_eQTL', errors='ignore')
    # Explicit keys: without `on`, merge() joins on every shared column name.
    # `validate` fails if `pos` repeats a pair, which would duplicate rows.
    .merge(pos, on=['variant', 'gene'], how='left', validate='many_to_one')
)
# Pairs absent from `pos` come out of the left join as NaN; comparing with True
# maps them to False and gives a real boolean column instead of an object one.
annotated['is_eQTL'] = annotated['is_eQTL'].eq(True)
# merge() replaces the index with a fresh 0..n-1 range; restore the row names.
annotated.index = obs_names
preds.obs = annotated

In [25]:
# Store the flag as the strings "True" / "False" instead of booleans.
# NOTE(review): presumably done for the benefit of the h5ad file or downstream
# consumers - confirm before changing the dtype.
preds.obs['is_eQTL'] = preds.obs['is_eQTL'].astype(str)

## Save

In [26]:
# Write the aggregated predictions next to the per-replicate inputs.
out_name = 'gtex_eqtl_cat_decima.h5ad'
out_file = os.path.join(out_dir, out_name)
preds.write_h5ad(out_file)