In [None]:
from gnomad_db.database import gnomAD_DB
import pandas as pd
import numpy as np

# Download SQLite preprocessed files

I have preprocessed gnomAD v2, v3, and v4 into ready-to-use sqlite3 files that you can download directly. They contain all variants on the 24 standard chromosomes.

The download links for the sqlite3 files are listed in the README (https://github.com/KalinNonchev/gnomAD_DB).

In [None]:
# Optional setup step: uncomment the three lines below if you actually want to
# download the preprocessed database archive and unzip it into output_dir.
# download_link = "https://zenodo.org/records/10066310/files/gnomad_db_wes_v4.0.sqlite3.gz?download=1"
# output_dir = "test_dir" # database_location
# gnomAD_DB.download_and_unzip(download_link, output_dir) 

# Initialize Database

In [None]:
# directory that holds the sqlite3 database (same value as output_dir in the download cell)
database_location = "test_dir"

In [None]:
# create the database handle for database_location; gnomad_version selects the gnomAD release (v4 here)
db = gnomAD_DB(database_location, gnomad_version="v4")

# Insert gnomAD variants into the database from single tsv file
See the insertVariants notebook for how to do this with large VCF files.

In [None]:
# Load the example variants; the columns are named after the database columns.
example_tsv = "data/test_vcf_gnomad_chr21_10000.tsv.gz"
raw_variants = pd.read_csv(example_tsv, sep="\t", names=db.columns, index_col=False)
# A "." in the file marks a missing value, so turn it into NaN before inserting.
var_df = raw_variants.replace(".", np.nan)
# IMPORTANT: the database internally removes the chr prefix (chr1->1)
var_df.head()

In [None]:
# insert the variants from var_df into the database
db.insert_variants(var_df)

# Query MAF

In [None]:
# list the database columns that can be requested in a query
db.columns

In [None]:
# Keep only the columns needed to look a variant up in the database.
variant_key_columns = ["chrom", "pos", "ref", "alt"]
var_df = var_df.loc[:, variant_key_columns]
var_df.head()

## You can pass a dataframe with variants
The dataframe should contain the columns: [chrom, pos, ref, alt]

In [None]:
db.get_info_from_df(var_df, "AF").head() # only one column

In [None]:
db.get_info_from_df(var_df, "AF, AF_popmax").head() # multiple columns, comma-separated

In [None]:
# "*" requests everything (all columns); preview only the first rows, as the
# single- and multi-column query cells do, instead of rendering the whole result.
db.get_info_from_df(var_df, "*").head()

In [None]:
# Two hand-written variants, one per row, in (chrom, pos, ref, alt) order.
dummy_variants = [
    ("1", 21, "T", "G"),
    ("21", 9825790, "C", "T"),
]
dummy_var_df = pd.DataFrame(dummy_variants, columns=["chrom", "pos", "ref", "alt"])
dummy_var_df

In [None]:
# query the AF column for the two hand-written variants in dummy_var_df
db.get_info_from_df(dummy_var_df, "AF")

## You can pass a single string as a variant

In [None]:
# the variant is written as a "chrom:pos:ref>alt" string; "*" requests every column
db.get_info_from_str("10:95606780:A>C", "*")

In [None]:
# same variant as the second row of dummy_var_df (21, 9825790, C, T), written as a string
db.get_info_from_str("21:9825790:C>T", "*")

## You can look for the MAF scores in an interval

In [None]:
# Request every column ("*") for the given position interval on chromosome 21.
db.get_info_for_interval(
    chrom=21,
    interval_start=9825780,
    interval_end=9825799,
    query="*",
)