In [1]:
import genomicsdb

# Show which GenomicsDB build produced the outputs recorded in this notebook.
installed_version = genomicsdb.version()
print(installed_version)

1.4.4-SNAPSHOT-13db87a


Below we set up some (optional) environment variables in order to connect to Azure Blob Storage. We also support using environment variables to connect to AWS S3 or GCS storage; in each case we use the native cloud SDK, so refer to the appropriate SDK documentation for the supported environment variables. In addition, we support using roles or service principals for access to cloud storage.

Next, we specify the cloud URIs for the GenomicsDB workspace, the callset mapping file, the vid mapping file and the reference file, as well as the name of the GenomicsDB array we wish to query. Lastly, we specify the genomic attributes we wish to query from the workspace.

The environment variables and URIs have all been redacted below.

In [2]:
import os

# Azure Blob Storage credentials are supplied through the process environment.
# The values are redacted in this notebook; supply real ones before running.
os.environ.update({
    'AZURE_STORAGE_ACCOUNT': '',
    'AZURE_STORAGE_KEY': '',
})

# Cloud URIs (redacted placeholders) for the workspace and its supporting files.
workspace = 'az://<container>@<storageaccount>.blob.core.windows.net/<path/to/workspace>'
callset_file = 'az://<container>@<storageaccount>.blob.core.windows.net/<path/to/callset_mapping.json>'
vid_file = 'az://<container>@<storageaccount>.blob.core.windows.net/<path/to/vid_mapping.json>'
reference_file = 'az://<container>@<storageaccount>.blob.core.windows.net/<path/to/reference>'

# Name of the array to query and the genomic fields to fetch for each call.
array = '<array_name>'
attributes = ['REF', 'ALT', 'GT']

In [3]:
# Connect to the workspace using the mapping/reference files and the
# attribute list configured in the previous cell.
gdb = genomicsdb.connect(workspace, callset_file, vid_file, reference_file, attributes)

# Bind the result to a descriptive name rather than `list`, which would shadow
# the built-in `list` type for every cell executed afterwards in this kernel.
# NOTE(review): the two range arguments are presumably column (genomic
# position) ranges and row (sample) ranges, since the 'Col' and 'Row' values in
# the printed output fall inside them. Confirm against the GenomicsDB API docs.
variant_calls = gdb.query_variant_calls(array, [(0,1000000000)], [(0,3)])
print(variant_calls)

[(0, 1000000000, [{'Row': 1, 'Col': 11188011, 'Sample': '0x00922C4598840C041CB1BB19DC75231C969E9057F7E3B9CC04EE8B44E714B793_1xAFO897WL2I', 'CHROM': '1', 'POS': 11188012, 'END': 11188012, 'REF': 'C', 'ALT': '[T]', 'GT': '0/0'}, {'Row': 1, 'Col': 65310488, 'Sample': '0x00922C4598840C041CB1BB19DC75231C969E9057F7E3B9CC04EE8B44E714B793_1xAFO897WL2I', 'CHROM': '1', 'POS': 65310489, 'END': 65310489, 'REF': 'T', 'ALT': '[C]', 'GT': '0/0'}, {'Row': 3, 'Col': 65310488, 'Sample': '0x00AFE0D460A52BC28399A89E68A62D4CF2D279B56F482166B11BA7F29AFB45C9_1xGQPZ8BXWU0', 'CHROM': '1', 'POS': 65310489, 'END': 65310489, 'REF': 'T', 'ALT': '[C]', 'GT': '0/0'}, {'Row': 3, 'Col': 281159492, 'Sample': '0x00AFE0D460A52BC28399A89E68A62D4CF2D279B56F482166B11BA7F29AFB45C9_1xGQPZ8BXWU0', 'CHROM': '2', 'POS': 29416366, 'END': 29416366, 'REF': 'G', 'ALT': '[C]', 'GT': '0/0'}, {'Row': 1, 'Col': 281159698, 'Sample': '0x00922C4598840C041CB1BB19DC75231C969E9057F7E3B9CC04EE8B44E714B793_1xAFO897WL2I', 'CHROM': '2', 'POS': 29

In [4]:
import pandas as pd

# Query two adjacent intervals in a single call. The result is bound to a
# descriptive name rather than `list` so the built-in `list` type is not
# shadowed for the rest of the kernel session.
interval_results = gdb.query_variant_calls(array, [(0,13000), (13000, 1000000000)], [(0,3)])
print(interval_results)

# Each result is a 3-tuple whose last element is a list of per-call dicts
# (see the printed output). Transpose the tuples into three parallel sequences.
interval_starts, interval_ends, calls = zip(*interval_results)

# calls[0] belongs to the first interval *returned*: in the recorded output the
# first tuple is the (13000, 1000000000) interval, not (0, 13000).
# DataFrame.to_markdown() relies on the optional `tabulate` package.
print(pd.DataFrame(calls[0]).head().to_markdown())

[(13000, 1000000000, [{'Row': 1, 'Col': 11188011, 'Sample': '0x00922C4598840C041CB1BB19DC75231C969E9057F7E3B9CC04EE8B44E714B793_1xAFO897WL2I', 'CHROM': '1', 'POS': 11188012, 'END': 11188012, 'REF': 'C', 'ALT': '[T]', 'GT': '0/0'}, {'Row': 1, 'Col': 65310488, 'Sample': '0x00922C4598840C041CB1BB19DC75231C969E9057F7E3B9CC04EE8B44E714B793_1xAFO897WL2I', 'CHROM': '1', 'POS': 65310489, 'END': 65310489, 'REF': 'T', 'ALT': '[C]', 'GT': '0/0'}, {'Row': 3, 'Col': 65310488, 'Sample': '0x00AFE0D460A52BC28399A89E68A62D4CF2D279B56F482166B11BA7F29AFB45C9_1xGQPZ8BXWU0', 'CHROM': '1', 'POS': 65310489, 'END': 65310489, 'REF': 'T', 'ALT': '[C]', 'GT': '0/0'}, {'Row': 3, 'Col': 281159492, 'Sample': '0x00AFE0D460A52BC28399A89E68A62D4CF2D279B56F482166B11BA7F29AFB45C9_1xGQPZ8BXWU0', 'CHROM': '2', 'POS': 29416366, 'END': 29416366, 'REF': 'G', 'ALT': '[C]', 'GT': '0/0'}, {'Row': 1, 'Col': 281159698, 'Sample': '0x00922C4598840C041CB1BB19DC75231C969E9057F7E3B9CC04EE8B44E714B793_1xAFO897WL2I', 'CHROM': '2', 'POS'