In [1]:
import os
import tiledb
import tiledb.cloud
import tiledb.cloud.utilities
import tiledbvcf
import numpy as np
import pandas as pd

print(
    f"tiledb v{tiledb.version.version}\n"
    f"tiledb-vcf v{tiledbvcf.version}\n"
    f"tiledb-cloud v{tiledb.cloud.version.version}\n"
)

# Create a base config, if you have aws credentials configured we can automically use them for ingestion
#config = tiledb.cloud.utilities.read_aws_config()

tiledb v0.23.2
tiledb-vcf v0.26.2
tiledb-cloud v0.11.0



In [2]:
config = tiledb.Config()
config["vfs.s3.aws_access_key_id"] = "####################"
config["vfs.s3.aws_secret_access_key"] = "####################################"
config["vfs.s3.region"] = "us-east-1"
vfs = tiledb.VFS(config=config)

## Query the gene_based array

In [None]:
#Have to first copy the array created locally to s3 by running the following line
#aws s3 --profile insight cp ukb_ancestry_array s3://tak-insight-priv-tiledb-plat/arrays/ukb_ancestry_array --recursive

In [12]:
%%time
array_uri = "s3://tak-insight-open-geneshealthconsort-silv/tiledb_array/GNH_2023_02_44kCallset_qt_gene_based/"
with tiledb.open(array_uri, mode="r", ctx=tiledb.Ctx(config)) as A:
    print(A.schema)

ArraySchema(
  domain=Domain(*[
    Dim(name='Gene ID', domain=('', ''), tile=None, dtype='|S0', var=True, filters=FilterList([ZstdFilter(level=-1), ])),
    Dim(name='Phenotype', domain=('', ''), tile=None, dtype='|S0', var=True, filters=FilterList([ZstdFilter(level=-1), ])),
  ]),
  attrs=[
    Attr(name='CHROM', dtype='int64', var=False, nullable=False, enum_label=None, filters=FilterList([ZstdFilter(level=-1), ])),
    Attr(name='GENPOS', dtype='int64', var=False, nullable=False, enum_label=None, filters=FilterList([ZstdFilter(level=-1), ])),
    Attr(name='ID', dtype='<U0', var=True, nullable=False, enum_label=None, filters=FilterList([ZstdFilter(level=-1), ])),
    Attr(name='ALLELE0', dtype='<U0', var=True, nullable=False, enum_label=None, filters=FilterList([ZstdFilter(level=-1), ])),
    Attr(name='Mask', dtype='<U0', var=True, nullable=False, enum_label=None, filters=FilterList([ZstdFilter(level=-1), ])),
    Attr(name='A1FREQ', dtype='<U0', var=True, nullable=False, enum_lab

### Query for all results for a given gene (TYK2)

In [13]:
%%time
with tiledb.open(array_uri, mode="r", ctx=tiledb.Ctx(config)) as A:
    print(A.df['ENSG00000105397',:])

                                  CHROM    GENPOS  \
Gene ID         Phenotype                           
ENSG00000105397 AFP                  19  10350840   
                AFP                  19  10350840   
                AFP                  19  10350840   
                AFP                  19  10350840   
                AFP                  19  10350840   
...                                 ...       ...   
                urine_creatinine     19  10350840   
                urine_creatinine     19  10350840   
                urine_creatinine     19  10350840   
                urine_creatinine     19  10350840   
                urine_creatinine     19  10350840   

                                                                      ID  \
Gene ID         Phenotype                                                  
ENSG00000105397 AFP                   TYK2(ENSG00000105397).MASK_C.0.001   
                AFP               TYK2(ENSG00000105397).MASK_B.singleton   
      

### Query for all signigicant burden associations for a given trait (Bilirubin)

In [14]:
%%time
with tiledb.open(array_uri, mode="r", ctx=tiledb.Ctx(config)) as A:
    print(A.df[:,'Bilirubin'])

                           CHROM     GENPOS  \
Gene ID         Phenotype                     
ENSG00000000419 Bilirubin     20   50935148   
                Bilirubin     20   50935148   
                Bilirubin     20   50935148   
                Bilirubin     20   50935148   
                Bilirubin     20   50935148   
...                          ...        ...   
ENSG00000288705 Bilirubin      2  233712995   
                Bilirubin      2  233712995   
                Bilirubin      2  233712995   
                Bilirubin      2  233712995   
                Bilirubin      2  233712995   

                                                                 ID ALLELE0  \
Gene ID         Phenotype                                                     
ENSG00000000419 Bilirubin        DPM1(ENSG00000000419).MASK_A.0.001     ref   
                Bilirubin         DPM1(ENSG00000000419).MASK_D.0.01     ref   
                Bilirubin        DPM1(ENSG00000000419).MASK_D.0.001     

## Query the single_variant array

In [4]:
%%time
array_uri = "s3://tak-insight-open-geneshealthconsort-silv/tiledb_array/GNH_2023_02_44kCallset_qt_single_variant"
with tiledb.open(array_uri, mode="r", ctx=tiledb.Ctx(config)) as B:
    print(B.schema)

ArraySchema(
  domain=Domain(*[
    Dim(name='CHROM', domain=(-9223372036854775808, 9223372036854675807), tile=100000, dtype='int64', filters=FilterList([ZstdFilter(level=-1), ])),
    Dim(name='GENPOS', domain=(-9223372036854775808, 9223372036854675807), tile=100000, dtype='int64', filters=FilterList([ZstdFilter(level=-1), ])),
    Dim(name='Phenotype', domain=('', ''), tile=None, dtype='|S0', var=True, filters=FilterList([ZstdFilter(level=-1), ])),
  ]),
  attrs=[
    Attr(name='ID', dtype='<U0', var=True, nullable=False, enum_label=None, filters=FilterList([ZstdFilter(level=-1), ])),
    Attr(name='ALLELE0', dtype='<U0', var=True, nullable=False, enum_label=None, filters=FilterList([ZstdFilter(level=-1), ])),
    Attr(name='ALLELE1', dtype='<U0', var=True, nullable=False, enum_label=None, filters=FilterList([ZstdFilter(level=-1), ])),
    Attr(name='A1FREQ', dtype='<U0', var=True, nullable=False, enum_label=None, filters=FilterList([ZstdFilter(level=-1), ])),
    Attr(name='N', dtyp

### Query for all single variant associations within the genomic region of a given gene (TYK2)

In [5]:
variants = [
    "chr19_10350865_C_A","chr19_10350910_T_C","chr19_10350911_C_T","chr19_10350923_G_A","chr19_10350937_G_A","chr19_10351069_C_T","chr19_10351093_G_A","chr19_10351103_C_G","chr19_10351143_C_T","chr19_10351155_A_C","chr19_10351159_A_G","chr19_10352436_T_TG","chr19_10352439_G_C","chr19_10352442_G_C","chr19_10352487_C_T","chr19_10352518_G_C","chr19_10352523_T_G","chr19_10352948_C_T","chr19_10352957_C_G","chr19_10352960_G_A","chr19_10352962_T_C","chr19_10352970_G_C","chr19_10352972_G_A","chr19_10353043_T_C","chr19_10353052_G_C","chr19_10353062_G_A","chr19_10353072_G_C","chr19_10353080_C_A","chr19_10353545_C_T","chr19_10353553_A_G","chr19_10353554_G_T","chr19_10353572_T_C","chr19_10353574_C_T","chr19_10353581_G_A","chr19_10353620_C_T","chr19_10353644_C_T","chr19_10354077_T_A","chr19_10354092_G_A","chr19_10354141_C_T","chr19_10354167_G_A","chr19_10354210_A_G","chr19_10354222_T_A","chr19_10354511_C_T","chr19_10354517_C_T","chr19_10354520_G_T","chr19_10354559_T_C","chr19_10354563_G_T","chr19_10354565_C_CCGA","chr19_10354579_T_C","chr19_10354593_C_A","chr19_10354598_C_T","chr19_10356582_C_T","chr19_10356583_G_A","chr19_10356606_C_T","chr19_10356607_G_A","chr19_10356612_G_C","chr19_10356627_G_C","chr19_10356669_G_A","chr19_10356670_G_A","chr19_10356690_C_T","chr19_10356691_G_A","chr19_10357766_C_T","chr19_10357799_C_T","chr19_10357868_C_T","chr19_10357904_T_A","chr19_10357915_C_T","chr19_10357916_G_A","chr19_10358018_C_T","chr19_10358030_C_T","chr19_10358032_C_A","chr19_10358083_C_T","chr19_10358084_G_A","chr19_10358087_C_T","chr19_10358101_C_T","chr19_10358102_G_A","chr19_10358134_T_C","chr19_10359179_T_G","chr19_10359189_C_T","chr19_10359207_C_A","chr19_10359242_C_T","chr19_10359243_G_A","chr19_10359248_C_G","chr19_10359251_C_T","chr19_10359252_G_A","chr19_10359299_A_C","chr19_10361520_C_T","chr19_10361522_C_T","chr19_10361532_C_T","chr19_10361541_C_A","chr19_10361541_C_T","chr19_10361592_A_G","chr19_10361593_GAA_G","chr19_10361775_C_G","chr19_10361775_C_T","chr19_10361792_G_A","chr19_10361801_A_G","chr19_10361816_C_T","chr19_10361828_C_T","chr19_10361831_C_T","chr19_10361832_G_A","chr19_10361835_C_T","chr19_10361847_C_T","chr19_10361886_C_G","chr19_10361895_C_T","chr19_10361903_C_A","chr19_10361903_C_T","chr19_10361904_G_A","chr19_10361909_C_T","chr19_10361922_C_T","chr19_10362083_T_TGATC","chr19_10362103_C_G","chr19_10362103_C_T","chr19_10362104_G_A","chr19_10362127_T_A","chr19_10362148_C_T","chr19_10362149_G_A","chr19_10362154_CCCCG_C","chr19_10362159_C_CTT","chr19_10362287_C_T","chr19_10362291_G_A","chr19_10362312_C_T","chr19_10362315_C_A","chr19_10362351_C_T","chr19_10362353_C_T","chr19_10362354_G_C","chr19_10362357_C_T","chr19_10362365_A_C","chr19_10362371_C_T","chr19_10362372_G_A","chr19_10362374_C_A","chr19_10362380_C_T","chr19_10362390_C_T","chr19_10362399_C_T","chr19_10362416_G_A","chr19_10362432_G_A","chr19_10362443_A_G","chr19_10362551_G_C","chr19_10362557_G_A","chr19_10362580_C_T","chr19_10362581_G_A","chr19_10362617_G_C","chr19_10362620_C_T","chr19_10362626_C_T","chr19_10362631_C_T","chr19_10362632_G_A","chr19_10364638_C_T","chr19_10364672_C_T","chr19_10364684_A_G","chr19_10364692_G_A","chr19_10364717_C_T","chr19_10364743_G_A","chr19_10364760_C_G","chr19_10364857_CTTG_C","chr19_10364871_G_A","chr19_10364901_C_G","chr19_10364919_G_A","chr19_10364951_C_T","chr19_10364952_G_A","chr19_10364954_G_A","chr19_10364966_G_A","chr19_10364973_C_T","chr19_10364976_C_A","chr19_10364978_G_A","chr19_10364982_T_C","chr19_10364990_T_C","chr19_10364991_T_C","chr19_10364996_T_A","chr19_10364996_T_G","chr19_10365008_A_G","chr19_10365030_C_T","chr19_10365048_C_G","chr19_10365512_CTCACCTCCTCCTTGT_C","chr19_10365517_C_A","chr19_10365525_T_C","chr19_10365531_C_T","chr19_10365546_G_A","chr19_10365547_C_T","chr19_10365548_C_G","chr19_10365581_A_G","chr19_10365585_C_T","chr19_10365587_T_G","chr19_10365590_G_C","chr19_10365641_C_T","chr19_10365661_C_A","chr19_10365669_C_T","chr19_10365695_C_T","chr19_10365699_C_A","chr19_10365699_C_T","chr19_10365707_C_T","chr19_10365714_T_C","chr19_10365722_C_A","chr19_10365723_G_A","chr19_10365735_G_A","chr19_10365780_G_A","chr19_10365782_C_G","chr19_10365782_C_T","chr19_10365783_G_A","chr19_10365785_C_T","chr19_10365788_G_A","chr19_10365800_C_G","chr19_10365800_C_T","chr19_10365801_G_A","chr19_10365807_A_G","chr19_10365812_C_T","chr19_10365813_G_A","chr19_10365819_C_T","chr19_10365839_G_T","chr19_10365847_G_T","chr19_10365853_C_A","chr19_10365858_G_A","chr19_10365866_C_T","chr19_10365867_G_A","chr19_10365869_C_T","chr19_10365870_G_A","chr19_10365879_G_A","chr19_10365881_G_A","chr19_10365891_C_T","chr19_10366417_C_T","chr19_10366456_C_T","chr19_10366457_G_A","chr19_10366463_C_T","chr19_10366469_G_A","chr19_10366486_G_C","chr19_10366527_C_A","chr19_10366529_C_T","chr19_10366556_C_T","chr19_10366561_T_C","chr19_10366568_A_C","chr19_10366581_C_T","chr19_10368063_A_C","chr19_10368071_T_G","chr19_10368105_G_A","chr19_10368127_C_A","chr19_10368161_G_A","chr19_10368167_C_T","chr19_10368168_G_A","chr19_10368173_T_C","chr19_10368180_C_T","chr19_10368191_C_T","chr19_10368192_G_A","chr19_10368301_C_T","chr19_10368302_G_A","chr19_10368311_G_C","chr19_10368337_T_C","chr19_10368346_T_C","chr19_10368380_C_T","chr19_10368399_GAAGC_G","chr19_10368404_A_G","chr19_10378228_A_G","chr19_10378229_T_C","chr19_10378232_G_A","chr19_10378246_T_C","chr19_10378250_C_T","chr19_10378258_G_A","chr19_10378264_T_A","chr19_10378282_G_A","chr19_10378289_C_T","chr19_10378291_C_A","chr19_10378292_C_T","chr19_10378337_C_T","chr19_10378363_A_G","chr19_10378364_C_T","chr19_10378371_A_C","chr19_10378378_C_T","chr19_10378381_G_A","chr19_10378397_G_A"
]

In [8]:
import re
data = {'chr':{},'pos':{},'ref':{},'alt':{}}
for v in variants:
    v = re.sub(r'^chr','',v)
    parts = v.split('_')
    data['chr'].update({v: int(parts[0])})
    data['pos'].update({v: int(parts[1])})
    data['ref'].update({v: parts[2]})
    data['alt'].update({v: parts[3]})
    
v_df = pd.DataFrame.from_dict(data).dropna(subset=['chr','pos']).drop_duplicates(subset=['chr','pos'])
v_df

Unnamed: 0,chr,pos,ref,alt
19_10350865_C_A,19,10350865,C,A
19_10350910_T_C,19,10350910,T,C
19_10350911_C_T,19,10350911,C,T
19_10350923_G_A,19,10350923,G,A
19_10350937_G_A,19,10350937,G,A
...,...,...,...,...
19_10378364_C_T,19,10378364,C,T
19_10378371_A_C,19,10378371,A,C
19_10378378_C_T,19,10378378,C,T
19_10378381_G_A,19,10378381,G,A


In [10]:
%%time
#query all variants within TYK2
with tiledb.open(array_uri, mode="r", ctx=tiledb.Ctx(config)) as A:
    array = []
    for chrom,group in v_df.groupby('chr'):
        positions = sorted(list(group['pos']))
        print(chrom,len(positions))
        df1 = A.query().df[chrom,positions,:]
        array.append(df1)
            
    res = pd.concat(array,join='inner',axis=0)
    res

19 258
CPU times: user 1.61 s, sys: 828 ms, total: 2.44 s
Wall time: 3.46 s


In [11]:
res

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ID,ALLELE0,ALLELE1,A1FREQ,N,TEST,BETA,SE,CHISQ,LOG10P,Raw Path,Stat,Gene-based,Female-only,Male-only,Label,Bonferroni Sig
CHROM,GENPOS,Phenotype,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
19,10350910,AFP,chr19_10350910_T_C,T,C,0.0219665,478,ADD,0.213802,0.213772,1.00027,0.498606,tak-insight-open-geneshealthconsort-silv/raw/2...,residual,False,False,False,AFP,False
19,10350910,ALP,chr19_10350910_T_C,T,C,0.0205445,31590,ADD,0.00476006,0.0264368,0.0324196,0.0669635,tak-insight-open-geneshealthconsort-silv/raw/2...,residual,False,False,False,ALP,False
19,10350910,ALT,chr19_10350910_T_C,T,C,0.0204732,31700,ADD,-0.0531791,0.0241984,4.82956,1.55322,tak-insight-open-geneshealthconsort-silv/raw/2...,residual,False,False,False,ALT,False
19,10350910,APTT,chr19_10350910_T_C,T,C,0.0193864,2966,ADD,0.126957,0.0912224,1.93692,0.785148,tak-insight-open-geneshealthconsort-silv/raw/2...,residual,False,False,False,APTT,False
19,10350910,AST,chr19_10350910_T_C,T,C,0.0169284,11844,ADD,-0.0733833,0.0474843,2.38833,0.912772,tak-insight-open-geneshealthconsort-silv/raw/2...,residual,False,False,False,AST,False
19,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19,10378397,creatinine,chr19_10378397_G_A,G,A,0.00127144,32247,ADD,0.0908263,0.0780732,1.35338,0.611387,tak-insight-open-geneshealthconsort-silv/raw/2...,residual,False,False,False,creatinine,False
19,10378397,oestradiol,chr19_10378397_G_A,G,A,0.00118437,2533,ADD,0.682535,0.402813,2.87107,1.04487,tak-insight-open-geneshealthconsort-silv/raw/2...,residual,False,False,False,oestradiol,False
19,10378397,progesterone,chr19_10378397_G_A,G,A,0.00128041,2343,ADD,-0.0727316,0.406903,0.0319495,0.0664426,tak-insight-open-geneshealthconsort-silv/raw/2...,residual,False,False,False,progesterone,False
19,10378397,testosterone,chr19_10378397_G_A,G,A,0.000971031,6179,ADD,0.12292,0.1609,0.58362,0.351741,tak-insight-open-geneshealthconsort-silv/raw/2...,residual,False,False,False,testosterone,False


## Query the binary traits single_variant array

In [16]:
%%time
array_uri = "s3://tak-insight-open-geneshealthconsort-silv/tiledb_array/GNH_2023_02_44kCallset_bin_single_variant"
with tiledb.open(array_uri, mode="r", ctx=tiledb.Ctx(config)) as B:
    print(B.schema)

ArraySchema(
  domain=Domain(*[
    Dim(name='CHROM', domain=(-9223372036854775808, 9223372036854675807), tile=100000, dtype='int64', filters=FilterList([ZstdFilter(level=-1), ])),
    Dim(name='GENPOS', domain=(-9223372036854775808, 9223372036854675807), tile=100000, dtype='int64', filters=FilterList([ZstdFilter(level=-1), ])),
    Dim(name='Phenotype', domain=('', ''), tile=None, dtype='|S0', var=True, filters=FilterList([ZstdFilter(level=-1), ])),
  ]),
  attrs=[
    Attr(name='ID', dtype='<U0', var=True, nullable=False, enum_label=None, filters=FilterList([ZstdFilter(level=-1), ])),
    Attr(name='ALLELE0', dtype='<U0', var=True, nullable=False, enum_label=None, filters=FilterList([ZstdFilter(level=-1), ])),
    Attr(name='ALLELE1', dtype='<U0', var=True, nullable=False, enum_label=None, filters=FilterList([ZstdFilter(level=-1), ])),
    Attr(name='A1FREQ', dtype='<U0', var=True, nullable=False, enum_label=None, filters=FilterList([ZstdFilter(level=-1), ])),
    Attr(name='N', dtyp

### Query for all single variant associations within the genomic region of a given gene (TYK2)

In [17]:
import re

variants = [
    "chr19_10350865_C_A","chr19_10350910_T_C","chr19_10350911_C_T","chr19_10350923_G_A","chr19_10350937_G_A","chr19_10351069_C_T","chr19_10351093_G_A","chr19_10351103_C_G","chr19_10351143_C_T","chr19_10351155_A_C","chr19_10351159_A_G","chr19_10352436_T_TG","chr19_10352439_G_C","chr19_10352442_G_C","chr19_10352487_C_T","chr19_10352518_G_C","chr19_10352523_T_G","chr19_10352948_C_T","chr19_10352957_C_G","chr19_10352960_G_A","chr19_10352962_T_C","chr19_10352970_G_C","chr19_10352972_G_A","chr19_10353043_T_C","chr19_10353052_G_C","chr19_10353062_G_A","chr19_10353072_G_C","chr19_10353080_C_A","chr19_10353545_C_T","chr19_10353553_A_G","chr19_10353554_G_T","chr19_10353572_T_C","chr19_10353574_C_T","chr19_10353581_G_A","chr19_10353620_C_T","chr19_10353644_C_T","chr19_10354077_T_A","chr19_10354092_G_A","chr19_10354141_C_T","chr19_10354167_G_A","chr19_10354210_A_G","chr19_10354222_T_A","chr19_10354511_C_T","chr19_10354517_C_T","chr19_10354520_G_T","chr19_10354559_T_C","chr19_10354563_G_T","chr19_10354565_C_CCGA","chr19_10354579_T_C","chr19_10354593_C_A","chr19_10354598_C_T","chr19_10356582_C_T","chr19_10356583_G_A","chr19_10356606_C_T","chr19_10356607_G_A","chr19_10356612_G_C","chr19_10356627_G_C","chr19_10356669_G_A","chr19_10356670_G_A","chr19_10356690_C_T","chr19_10356691_G_A","chr19_10357766_C_T","chr19_10357799_C_T","chr19_10357868_C_T","chr19_10357904_T_A","chr19_10357915_C_T","chr19_10357916_G_A","chr19_10358018_C_T","chr19_10358030_C_T","chr19_10358032_C_A","chr19_10358083_C_T","chr19_10358084_G_A","chr19_10358087_C_T","chr19_10358101_C_T","chr19_10358102_G_A","chr19_10358134_T_C","chr19_10359179_T_G","chr19_10359189_C_T","chr19_10359207_C_A","chr19_10359242_C_T","chr19_10359243_G_A","chr19_10359248_C_G","chr19_10359251_C_T","chr19_10359252_G_A","chr19_10359299_A_C","chr19_10361520_C_T","chr19_10361522_C_T","chr19_10361532_C_T","chr19_10361541_C_A","chr19_10361541_C_T","chr19_10361592_A_G","chr19_10361593_GAA_G","chr19_10361775_C_G","chr19_10361775_C_T","chr19_10361792_G_A","chr19_10361801_A_G","chr19_10361816_C_T","chr19_10361828_C_T","chr19_10361831_C_T","chr19_10361832_G_A","chr19_10361835_C_T","chr19_10361847_C_T","chr19_10361886_C_G","chr19_10361895_C_T","chr19_10361903_C_A","chr19_10361903_C_T","chr19_10361904_G_A","chr19_10361909_C_T","chr19_10361922_C_T","chr19_10362083_T_TGATC","chr19_10362103_C_G","chr19_10362103_C_T","chr19_10362104_G_A","chr19_10362127_T_A","chr19_10362148_C_T","chr19_10362149_G_A","chr19_10362154_CCCCG_C","chr19_10362159_C_CTT","chr19_10362287_C_T","chr19_10362291_G_A","chr19_10362312_C_T","chr19_10362315_C_A","chr19_10362351_C_T","chr19_10362353_C_T","chr19_10362354_G_C","chr19_10362357_C_T","chr19_10362365_A_C","chr19_10362371_C_T","chr19_10362372_G_A","chr19_10362374_C_A","chr19_10362380_C_T","chr19_10362390_C_T","chr19_10362399_C_T","chr19_10362416_G_A","chr19_10362432_G_A","chr19_10362443_A_G","chr19_10362551_G_C","chr19_10362557_G_A","chr19_10362580_C_T","chr19_10362581_G_A","chr19_10362617_G_C","chr19_10362620_C_T","chr19_10362626_C_T","chr19_10362631_C_T","chr19_10362632_G_A","chr19_10364638_C_T","chr19_10364672_C_T","chr19_10364684_A_G","chr19_10364692_G_A","chr19_10364717_C_T","chr19_10364743_G_A","chr19_10364760_C_G","chr19_10364857_CTTG_C","chr19_10364871_G_A","chr19_10364901_C_G","chr19_10364919_G_A","chr19_10364951_C_T","chr19_10364952_G_A","chr19_10364954_G_A","chr19_10364966_G_A","chr19_10364973_C_T","chr19_10364976_C_A","chr19_10364978_G_A","chr19_10364982_T_C","chr19_10364990_T_C","chr19_10364991_T_C","chr19_10364996_T_A","chr19_10364996_T_G","chr19_10365008_A_G","chr19_10365030_C_T","chr19_10365048_C_G","chr19_10365512_CTCACCTCCTCCTTGT_C","chr19_10365517_C_A","chr19_10365525_T_C","chr19_10365531_C_T","chr19_10365546_G_A","chr19_10365547_C_T","chr19_10365548_C_G","chr19_10365581_A_G","chr19_10365585_C_T","chr19_10365587_T_G","chr19_10365590_G_C","chr19_10365641_C_T","chr19_10365661_C_A","chr19_10365669_C_T","chr19_10365695_C_T","chr19_10365699_C_A","chr19_10365699_C_T","chr19_10365707_C_T","chr19_10365714_T_C","chr19_10365722_C_A","chr19_10365723_G_A","chr19_10365735_G_A","chr19_10365780_G_A","chr19_10365782_C_G","chr19_10365782_C_T","chr19_10365783_G_A","chr19_10365785_C_T","chr19_10365788_G_A","chr19_10365800_C_G","chr19_10365800_C_T","chr19_10365801_G_A","chr19_10365807_A_G","chr19_10365812_C_T","chr19_10365813_G_A","chr19_10365819_C_T","chr19_10365839_G_T","chr19_10365847_G_T","chr19_10365853_C_A","chr19_10365858_G_A","chr19_10365866_C_T","chr19_10365867_G_A","chr19_10365869_C_T","chr19_10365870_G_A","chr19_10365879_G_A","chr19_10365881_G_A","chr19_10365891_C_T","chr19_10366417_C_T","chr19_10366456_C_T","chr19_10366457_G_A","chr19_10366463_C_T","chr19_10366469_G_A","chr19_10366486_G_C","chr19_10366527_C_A","chr19_10366529_C_T","chr19_10366556_C_T","chr19_10366561_T_C","chr19_10366568_A_C","chr19_10366581_C_T","chr19_10368063_A_C","chr19_10368071_T_G","chr19_10368105_G_A","chr19_10368127_C_A","chr19_10368161_G_A","chr19_10368167_C_T","chr19_10368168_G_A","chr19_10368173_T_C","chr19_10368180_C_T","chr19_10368191_C_T","chr19_10368192_G_A","chr19_10368301_C_T","chr19_10368302_G_A","chr19_10368311_G_C","chr19_10368337_T_C","chr19_10368346_T_C","chr19_10368380_C_T","chr19_10368399_GAAGC_G","chr19_10368404_A_G","chr19_10378228_A_G","chr19_10378229_T_C","chr19_10378232_G_A","chr19_10378246_T_C","chr19_10378250_C_T","chr19_10378258_G_A","chr19_10378264_T_A","chr19_10378282_G_A","chr19_10378289_C_T","chr19_10378291_C_A","chr19_10378292_C_T","chr19_10378337_C_T","chr19_10378363_A_G","chr19_10378364_C_T","chr19_10378371_A_C","chr19_10378378_C_T","chr19_10378381_G_A","chr19_10378397_G_A"
]

data = {'chr':{},'pos':{},'ref':{},'alt':{}}
for v in variants:
    v = re.sub(r'^chr','',v)
    parts = v.split('_')
    data['chr'].update({v: int(parts[0])})
    data['pos'].update({v: int(parts[1])})
    data['ref'].update({v: parts[2]})
    data['alt'].update({v: parts[3]})
    
v_df = pd.DataFrame.from_dict(data).dropna(subset=['chr','pos']).drop_duplicates(subset=['chr','pos'])
v_df

Unnamed: 0,chr,pos,ref,alt
19_10350865_C_A,19,10350865,C,A
19_10350910_T_C,19,10350910,T,C
19_10350911_C_T,19,10350911,C,T
19_10350923_G_A,19,10350923,G,A
19_10350937_G_A,19,10350937,G,A
...,...,...,...,...
19_10378364_C_T,19,10378364,C,T
19_10378371_A_C,19,10378371,A,C
19_10378378_C_T,19,10378378,C,T
19_10378381_G_A,19,10378381,G,A


In [18]:
%%time
#query all variants within TYK2
with tiledb.open(array_uri, mode="r", ctx=tiledb.Ctx(config)) as A:
    array = []
    for chrom,group in v_df.groupby('chr'):
        positions = sorted(list(group['pos']))
        print(chrom,len(positions))
        df1 = A.query().df[chrom,positions,:]
        array.append(df1)
            
    res = pd.concat(array,join='inner',axis=0)

19 258
CPU times: user 2.07 s, sys: 829 ms, total: 2.89 s
Wall time: 5.24 s


In [19]:
res

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ID,ALLELE0,ALLELE1,A1FREQ,N,TEST,BETA,SE,CHISQ,LOG10P,Raw Path,Gene-based,Phenotype Name,Female-only,Male-only,Controls,Label,Bonferroni Sig
CHROM,GENPOS,Phenotype,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
19,10350910,A01,chr19_10350910_T_C,T,C,0.0209649,44026,ADD,0.584347,0.421951,1.91787,0.779652,tak-insight-open-geneshealthconsort-silv/raw/2...,False,A01 Typhoid and paratyphoid fevers,False,False,All,A01 Typhoid and paratyphoid fevers,False
19,10350910,A04,chr19_10350910_T_C,T,C,0.0209649,44026,ADD,0.150398,0.154348,0.949467,0.481677,tak-insight-open-geneshealthconsort-silv/raw/2...,False,A04 Other bacterial intestinal infections,False,False,All,A04 Other bacterial intestinal infections,False
19,10350910,A05,chr19_10350910_T_C,T,C,0.0209649,44026,ADD,-0.641005,0.433729,2.18417,0.855624,tak-insight-open-geneshealthconsort-silv/raw/2...,False,"A05 Other bacterial foodborne intoxications, n...",False,False,All,"A05 Other bacterial foodborne intoxications, n...",False
19,10350910,A08,chr19_10350910_T_C,T,C,0.0209649,44026,ADD,-0.182752,0.106676,2.93486,1.06205,tak-insight-open-geneshealthconsort-silv/raw/2...,False,A08 Viral and other specified intestinal infec...,False,False,All,A08 Viral and other specified intestinal infec...,False
19,10350910,A09,chr19_10350910_T_C,T,C,0.0209649,44026,ADD,-0.0737161,0.102079,0.521501,0.327715,tak-insight-open-geneshealthconsort-silv/raw/2...,False,A09 Other gastroenteritis and colitis of infec...,False,False,All,A09 Other gastroenteritis and colitis of infec...,False
19,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19,10378397,Venous_thromboembolism,chr19_10378397_G_A,G,A,0.00119248,44026,ADD,0.396223,0.692474,0.327396,0.246266,tak-insight-open-geneshealthconsort-silv/raw/2...,False,Venous thromboembolism,False,False,All,Venous thromboembolism,False
19,10378397,Visual_impairment_and_blindness,chr19_10378397_G_A,G,A,0.00119248,44026,ADD,0.0529151,0.604107,0.00767241,0.0314233,tak-insight-open-geneshealthconsort-silv/raw/2...,False,Visual impairment and blindness,False,False,All,Visual impairment and blindness,False
19,10378397,Vitamin_B12_deficiency_with_and_without_anaemia,chr19_10378397_G_A,G,A,0.00119248,44026,ADD,-0.534968,0.422832,1.60074,0.686557,tak-insight-open-geneshealthconsort-silv/raw/2...,False,Vitamin B12 deficiency with and without anaemia,False,False,All,Vitamin B12 deficiency with and without anaemia,False
19,10378397,Vitiligo,chr19_10378397_G_A,G,A,0.00119248,44026,ADD,-1.01747,0.834045,1.48821,0.652681,tak-insight-open-geneshealthconsort-silv/raw/2...,False,Vitiligo,False,False,All,Vitiligo,False
