In [None]:
import os

# Workspace bucket prefix, read from the environment (raises KeyError if unset).
# The input and output paths used later in this notebook are built from it.
bucket = os.environ["WORKSPACE_BUCKET"]


from hail.plot import show
from pprint import pprint
from collections import Counter
from bokeh.plotting import output_file, save
import bokeh.io
from bokeh.io import *
from bokeh.resources import INLINE

In [None]:
bokeh.io.output_notebook(INLINE) 
%matplotlib inline

In [None]:
# Start Hail with GRCh38 as the default reference genome.
import hail as hl

hl.init(default_reference="GRCh38")

In [None]:
# Load the phenotype table, keyed by person_id. Column types are imputed from the
# file, except person_id, which is pinned to a string.
pheno = hl.import_table(
    f'{bucket}/data/gwas_v4/pheno_hail_final.tsv',
    types={'person_id': hl.tstr},
    impute=True,
    key="person_id",
)

In [None]:
# Location of the short-read WGS SNP/indel callset in Hail VariantDataset format.
VDS_PATH = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/vds/hail.vds"
vds = hl.vds.read_vds(VDS_PATH)

In [None]:
# Restrict the callset to the samples listed in the phenotype table and remove
# alleles that are left without any carrier afterwards.
vds = hl.vds.filter_samples(
    vds,
    pheno,
    remove_dead_alleles=True,
    keep=True,
)

In [None]:
# Row (per-variant) table of the variant component of the VDS.
# NOTE(review): `vars` shadows the Python built-in of the same name; the name is
# kept because the next cell reads it.
vars = vds.variant_data.rows()

In [None]:
# Keep rows whose reference allele and FIRST alternate allele form a SNP.
# Only alleles[1] is tested, so additional alternates at a multi-allelic site
# are not inspected here.
ref_allele = vars.alleles[0]
first_alt_allele = vars.alleles[1]
snp_table = vars.filter(hl.is_snp(ref_allele, first_alt_allele))

In [None]:
# Number of rows in the SNP table.
# Author's recorded result: 947621484 variants.
SNPS = snp_table.count()

In [None]:
# Keep only the variants whose keys are present in snp_table.
vds = hl.vds.filter_variants(vds, snp_table, keep=True)

In [None]:
# Split multi-allelic sites so that each row carries a single alternate allele.
split_vds = hl.vds.split_multi(vds)
vds = split_vds

In [None]:
# Densify the sparse VDS into a regular MatrixTable.
mt_full = hl.vds.to_dense_mt(vds)

# Re-apply the SNP filter now that every row is bi-allelic. The pre-split filter
# only tested alleles[1] of each unsplit site, so rows created by split_multi from
# the remaining alternates of a multi-allelic site (e.g. indels) were never
# checked and would otherwise flow into the GWAS.
mt_full = mt_full.filter_rows(hl.is_snp(mt_full.alleles[0], mt_full.alleles[1]))

mt_full.describe()

In [None]:
# Add per-variant QC statistics under the `variant_qc` row field and print the schema.
mt_full = hl.variant_qc(mt_full)
mt_full.describe()

# Keep variants whose alternate-allele frequency (AF[1]) is above 0.005.
# Author's recorded result: 20304183 variants meet the AF threshold.
alt_allele_freq = mt_full.variant_qc.AF[1]
mt_full = mt_full.filter_rows(alt_allele_freq > 0.005)

In [None]:
# Keep variants with call rate above 0.95 and a Hardy-Weinberg p-value above 1e-30.
qc_stats = mt_full.variant_qc
passes_call_rate = qc_stats.call_rate > 0.95
passes_hwe = qc_stats.p_value_hwe > 1e-30
mt_full = mt_full.filter_rows(passes_call_rate & passes_hwe)

In [None]:
# Allele balance: alternate-allele depth (AD[1]) divided by the total depth (sum of AD).
ab = mt_full.AD[1] / hl.sum(mt_full.AD)

# An entry passes when its allele balance agrees with its genotype call:
#   hom-ref: ab <= 0.1    het: 0.25 <= ab <= 0.75    hom-var: ab >= 0.9
# NOTE(review): where AD is missing, `ab` (and presumably this predicate) is
# missing as well -- confirm how the downstream entry filter treats such entries.
hom_ref_ok = mt_full.GT.is_hom_ref() & (ab <= 0.1)
het_ok = mt_full.GT.is_het() & (ab >= 0.25) & (ab <= 0.75)
hom_var_ok = mt_full.GT.is_hom_var() & (ab >= 0.9)
filter_condition_ab = hom_ref_ok | het_ok | hom_var_ok



In [None]:

# Keep only the genotype entries that satisfy the allele-balance condition,
# then print the row schema of the result.
mt4 = mt_full.filter_entries(filter_condition_ab, keep=True)
mt4.row.describe()

In [None]:
# Materialize the filtered matrix table (~32 hrs for this step per the author's
# note) and rebind `mt4` to the copy that was just written. With a plain write(),
# `mt4` kept pointing at the lazy pipeline, so every later action on it (the
# regression in particular) re-ran the whole filtering pipeline from the source VDS.
# NOTE: re-run the cell that builds `mt4` before re-running this one; otherwise
# the overwrite would target the very files `mt4` is reading from.
mt4 = mt4.checkpoint(f'{bucket}/data/gwas_v4/gwas_filtered.mt', overwrite=True)

In [None]:
# Attach each sample's phenotype-table record as the `pheno` column field,
# looked up by the sample ID `s`.
sample_pheno = pheno[mt4.s]
mt4 = mt4.annotate_cols(pheno=sample_pheno)

In [None]:
# Regression covariates: an explicit intercept term followed by phenotype-table fields.
covariates = [
    1.0,  # intercept
    mt4.pheno.age,
    mt4.pheno.PC1,
    mt4.pheno.PC2,
    mt4.pheno.PC3,
    mt4.pheno.is_female,
]


In [None]:
# Per-variant logistic regression (Wald test) of the `label` phenotype on the
# number of alternate alleles, adjusted for the covariates; then print the schema
# of the result table.
gwas = hl.logistic_regression_rows(
    test='wald',
    y=mt4.pheno.label,
    x=mt4.GT.n_alt_alleles(),
    covariates=covariates,
)
gwas.describe()

In [None]:
# Write the association results to a TSV file in the workspace bucket.
gwas_tsv_path = f'{bucket}/data/gwas_v4/gwas_v4.tsv'
gwas.export(gwas_tsv_path)