In [1]:
from gnomad_db.gnomad_db import gnomAD_DB
import pandas as pd
import numpy as np

# Initialize Database

In [2]:
# Directory that is passed to gnomAD_DB below as the database location
database_location = "data"

In [3]:
# Instantiate the database handle used by every insert and query cell below
db = gnomAD_DB(database_location)

# Insert gnomAD variants into the database from a single TSV file
See the insertVariants notebook for how to do this for large VCF files.

In [4]:
# Load the example variants; the column labels are supplied from the
# database schema (db.columns) via `names=`.
var_df = pd.read_csv("data/test_chr10_n100.tsv", sep="\t", names=db.columns, index_col=False)
# Preprocess missing values: the file marks them with ".", convert those to NaN.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
# IMPORTANT: the database internally removes the chr prefix (chr1 -> 1)
var_df = var_df.replace(".", np.nan)
var_df.head()

Unnamed: 0,chrom,pos,ref,alt,AF,AF_afr,AF_eas,AF_fin,AF_nfe,AF_asj,AF_oth,AF_popmax
0,chr10,10265,C,T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
1,chr10,10266,T,A,0.000367,0.0,0.0,0.0,0.000862,0.0,0.0,0.000862069
2,chr10,10266,T,C,0.000367,0.0,0.0,0.002336,0.0,0.0,0.0,
3,chr10,10268,A,C,0.000331,0.0,0.0,0.002415,0.0,0.0,0.0,
4,chr10,10278,T,C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [5]:
# Insert the variants held in the dataframe into the database
db.insert_variants(var_df)

# Query MAF

In [6]:
# List the database columns; these are the names that can be requested in the queries below
db.columns

['chrom',
 'pos',
 'ref',
 'alt',
 'AF',
 'AF_afr',
 'AF_eas',
 'AF_fin',
 'AF_nfe',
 'AF_asj',
 'AF_oth',
 'AF_popmax']

In [7]:
# Keep only the four columns that identify a variant; the frequency
# columns are fetched from the database in the query cells below.
variant_key_columns = ["chrom", "pos", "ref", "alt"]
var_df = var_df[variant_key_columns]
var_df.head()

Unnamed: 0,chrom,pos,ref,alt
0,chr10,10265,C,T
1,chr10,10266,T,A
2,chr10,10266,T,C
3,chr10,10268,A,C
4,chr10,10278,T,C


## You can pass a dataframe with variants
The dataframe must contain the columns `chrom`, `pos`, `ref` and `alt`.

In [8]:
# Query a single column: the second argument names the column to fetch
af_only = db.get_maf_from_df(var_df, "AF")
af_only.head()

Unnamed: 0,AF
0,0.0
1,0.000367
2,0.000367
3,0.000331
4,0.0


In [9]:
# Query multiple columns: give their names as one comma-separated string
af_and_popmax = db.get_maf_from_df(var_df, "AF, AF_popmax")
af_and_popmax.head()

Unnamed: 0,AF,AF_popmax
0,0.0,
1,0.000367,0.000862
2,0.000367,
3,0.000331,
4,0.0,


In [10]:
# "*" requests everything: all database columns are returned
all_columns = db.get_maf_from_df(var_df, "*")
all_columns.head()

Unnamed: 0,chrom,pos,ref,alt,AF,AF_afr,AF_eas,AF_fin,AF_nfe,AF_asj,AF_oth,AF_popmax
0,10,10265,C,T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
1,10,10266,T,A,0.000367,0.0,0.0,0.0,0.000862,0.0,0.0,0.000862
2,10,10266,T,C,0.000367,0.0,0.0,0.002336,0.0,0.0,0.0,
3,10,10268,A,C,0.000331,0.0,0.0,0.002415,0.0,0.0,0.0,
4,10,10278,T,C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [11]:
# Two hand-written variants, one row each, in (chrom, pos, ref, alt) order.
# The query in the next cell returns only NaN for the first one and
# real frequencies for the second.
dummy_variants = [
    ("1", 10, "T", "G"),
    ("10", 10265, "C", "T"),
]
dummy_var_df = pd.DataFrame(dummy_variants, columns=["chrom", "pos", "ref", "alt"])
dummy_var_df

Unnamed: 0,chrom,pos,ref,alt
0,1,10,T,G
1,10,10265,C,T


In [12]:
# A variant with no match in the database comes back with NaN in every frequency column
dummy_result = db.get_maf_from_df(dummy_var_df, "*")
dummy_result.head()

Unnamed: 0,chrom,pos,ref,alt,AF,AF_afr,AF_eas,AF_fin,AF_nfe,AF_asj,AF_oth,AF_popmax
0,1,10,T,G,,,,,,,,
1,10,10265,C,T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


## You can also pass a single variant as a string

In [13]:
# The variant string is laid out as "chrom:pos:ref>alt"; asking for one column returns its value
db.get_maf_from_str("10:10268:A>C", "AF")

0.000331345

In [14]:
# "*" returns every column for the variant (shown below as a pandas Series)
db.get_maf_from_str("10:10268:A>C", "*")

chrom              10
pos             10268
ref                 A
alt                 C
AF           0.000331
AF_afr            0.0
AF_eas            0.0
AF_fin       0.002415
AF_nfe            0.0
AF_asj            0.0
AF_oth            0.0
AF_popmax        None
Name: 0, dtype: object