# Aggregate Borzoi VEP results

In [1]:
import pandas as pd
import numpy as np
import scanpy as sc
import anndata
import os
from tqdm import tqdm

## Paths

In [2]:
# Directory holding the per-chunk prediction arrays and the variant table.
preds_dir = '/gstore/data/resbioai/grelu/decima/20240823/bulk_eqtl_results'
# Variant table stored alongside the predictions.
variants_file = os.path.join(preds_dir, "variants_df.csv")

# Borzoi task table, read directly from the Borzoi GitHub repository.
borzoi_tasks_file = 'https://github.com/calico/borzoi/raw/refs/heads/main/data/targets_human.txt.gz'

## Combine chunked Borzoi results

In [3]:
# Show which chunked Borzoi prediction arrays are present in the results directory.
[
    fname
    for fname in os.listdir(preds_dir)
    if fname.startswith('gtex_eqtl_cat_borzoi') and fname.endswith('npy')
]

['gtex_eqtl_cat_borzoi_2_60000_80000.npy',
 'gtex_eqtl_cat_borzoi_1_200000_220000.npy',
 'gtex_eqtl_cat_borzoi_3_120000_140000.npy',
 'gtex_eqtl_cat_borzoi_0_160000_180000.npy',
 'gtex_eqtl_cat_borzoi_3_80000_100000.npy',
 'gtex_eqtl_cat_borzoi_1_60000_80000.npy',
 'gtex_eqtl_cat_borzoi_3_180000_200000.npy',
 'gtex_eqtl_cat_borzoi_3_60000_80000.npy',
 'gtex_eqtl_cat_borzoi_0_220000_229828.npy',
 'gtex_eqtl_cat_borzoi_1_160000_180000.npy',
 'gtex_eqtl_cat_borzoi_2_120000_140000.npy',
 'gtex_eqtl_cat_borzoi_1_40000_60000.npy',
 'gtex_eqtl_cat_borzoi_3_140000_160000.npy',
 'gtex_eqtl_cat_borzoi_1_100000_120000.npy',
 'gtex_eqtl_cat_borzoi_0_0_20000.npy',
 'gtex_eqtl_cat_borzoi_0_100000_120000.npy',
 'gtex_eqtl_cat_borzoi_2_200000_220000.npy',
 'gtex_eqtl_cat_borzoi_3_200000_220000.npy',
 'gtex_eqtl_cat_borzoi_3_100000_120000.npy',
 'gtex_eqtl_cat_borzoi_1_20000_40000.npy',
 'gtex_eqtl_cat_borzoi_3_0_20000.npy',
 'gtex_eqtl_cat_borzoi_3_220000_229828.npy',
 'gtex_eqtl_cat_borzoi_1_80000_10

In [4]:
# (start, end) row ranges matching the suffixes of the chunked prediction
# file names. The two sizes are named once here instead of being repeated
# as literals in every range() call.
n_variants = 229828  # total number of rows across all chunks
chunk_size = 20000   # rows per chunk; the last chunk is shorter

starts = list(range(0, n_variants, chunk_size))
# Derive each end from its start so the two lists cannot drift apart;
# min() truncates the final chunk at n_variants.
ends = [min(start + chunk_size, n_variants) for start in starts]
chunks = list(zip(starts, ends))
print(chunks)

[(0, 20000), (20000, 40000), (40000, 60000), (60000, 80000), (80000, 100000), (100000, 120000), (120000, 140000), (140000, 160000), (160000, 180000), (180000, 200000), (200000, 220000), (220000, 229828)]


In [5]:
# For every chunk, load the four arrays indexed by `rep` (0-3), stack them
# along a new leading axis and average over it. The results are collected in
# `chunks` order.
preds = []
for chunk in tqdm(chunks):
    rep_files = (
        os.path.join(preds_dir, f'gtex_eqtl_cat_borzoi_{rep}_{chunk[0]}_{chunk[1]}.npy')
        for rep in range(4)
    )
    rep_arrays = [np.load(path) for path in rep_files]
    preds.append(np.mean(np.stack(rep_arrays), axis=0))

100%|██████████| 12/12 [00:30<00:00,  2.56s/it]


In [6]:
# Join the per-chunk arrays along the row axis into a single matrix.
preds = np.concatenate(preds, axis=0)
preds.shape

(229828, 7611)

## Add task and variant metadata

In [7]:
# Tab-separated Borzoi task table; its first column is used as the index.
borzoi_tasks = pd.read_csv(borzoi_tasks_file, sep='\t', index_col=0)

In [8]:
# Load the variant table, report its row count and preview the first rows.
variants = pd.read_csv(filepath_or_buffer=variants_file)
print(variants.shape[0])
variants.head(5)

229828


Unnamed: 0,chrom,pos,ref,alt,variant,rsid,gene_id,gene,gene_strand,pos_relative
0,chr1,100353172,T,G,chr1_100353172_T_G,rs17420882,ENSG00000079335,CDC14A,+,172010
1,chr1,107135646,G,C,chr1_107135646_G_C,rs115668827,ENSG00000162631,NTNG1,+,159478
2,chr1,109509517,A,G,chr1_109509517_A_G,rs2570972,ENSG00000181754,AMIGO1,-,164061
3,chr1,109671748,C,T,chr1_109671748_C_T,rs72705222,ENSG00000134184,GSTM1,+,147773
4,chr1,109675302,G,A,chr1_109675302_G_A,rs611951,ENSG00000134184,GSTM1,+,151327


In [9]:
# Wrap the prediction matrix in an AnnData object: variant name and gene
# annotate the rows (obs), the Borzoi task table annotates the columns (var).
variant_obs = variants[['variant', 'gene']]
preds = anndata.AnnData(X=preds, obs=variant_obs, var=borzoi_tasks)



## Save

In [10]:
# Write the annotated predictions to an .h5ad file in the results directory.
out_file = os.path.join(preds_dir, 'gtex_eqtl_cat_borzoi.h5ad')
preds.write_h5ad(filename=out_file)