In [None]:
import os

# Workspace bucket location, read from the WORKSPACE_BUCKET environment
# variable (raises KeyError if the variable is not set).
bucket = os.environ['WORKSPACE_BUCKET']


from collections import Counter
from pprint import pprint

import matplotlib.pyplot as plt
from hail.plot import show
from bokeh.plotting import output_file, save
import bokeh.io
from bokeh.io import *
from bokeh.resources import INLINE

In [None]:
# Send Bokeh plots to the notebook output, using the INLINE resources object.
bokeh.io.output_notebook(INLINE) 
# Render matplotlib figures inline in the notebook.
%matplotlib inline

In [None]:
# Start the Hail session; GRCh38 is the default reference genome for every
# locus parsed or imported below.
import hail as hl

hl.init(default_reference="GRCh38")

In [None]:
# Phenotype table keyed by person_id. Column types are imputed, except
# person_id, which is forced to a string.
pheno = hl.import_table(
    f'{bucket}/data/gwas_v4/pheno_hail_final.tsv',
    key="person_id",
    impute=True,
    types={'person_id': hl.tstr},
)

In [None]:
# Open the short-read WGS SNP/indel VariantDataset.
vds = hl.vds.read_vds(
    "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/vds/hail.vds"
)

In [None]:
# Keep only the samples listed in the phenotype table, and drop alleles that
# are no longer carried by any remaining sample.
vds = hl.vds.filter_samples(
    vds,
    pheno,
    keep=True,
    remove_dead_alleles=True,
)

In [None]:
# Row table of variant sites whose reference allele and first alternate allele
# form a SNP.
# NOTE(review): this runs before split_multi, so only alleles[1] is tested at
# multi-allelic sites -- confirm that is the intended behavior.
snp_table = vds.variant_data.filter_rows(
    hl.is_snp(
        vds.variant_data.alleles[0],
        vds.variant_data.alleles[1],
    )
).rows()

In [None]:
# Restrict the VDS to the SNP sites collected in snp_table.
vds = hl.vds.filter_variants(vds, snp_table, keep=True)

In [None]:
# Restrict the analysis to chr12 only.
vds_cut = hl.vds.filter_chromosomes(vds, keep=["chr12"])

# Optional (disabled): narrow further to a small interval for quick test runs.
# Kept as comments rather than a bare string literal, which Jupyter would echo
# as the cell's output.
# vds_cut = hl.vds.filter_intervals(
#     vds_cut,
#     [hl.parse_locus_interval(x)
#      for x in ['chr12:29.01M-29.02M']])

In [None]:
# Split multi-allelic sites in the chr12 VDS.
vds_cut = hl.vds.split_multi(vds_cut)

In [None]:
# Densify the chr12 VDS into a MatrixTable.
mt_full = hl.vds.to_dense_mt(vds_cut)

# Keep only rows whose two alleles form a SNP.
mt_full = mt_full.filter_rows(
    hl.is_snp(mt_full.alleles[0], mt_full.alleles[1])
)

# Per-entry depth, computed as the sum of the allele depths in AD.
mt_full = mt_full.annotate_entries(DP=hl.sum(mt_full.AD))

mt_full.describe()

In [None]:
# Annotate rows with Hail's variant QC metrics (read below as
# mt_full.variant_qc), then print the resulting schema.
mt_full = hl.variant_qc(mt_full)
mt_full.describe()

In [None]:
# Summarize the per-entry DP field annotated on mt_full.
mt_full.DP.summarize()

In [None]:
# Summarize the variant_qc row annotations.
mt_full.variant_qc.summarize()

In [None]:
# Histogram of per-variant call rate across chr12, over the range [0, 1].
p = hl.plot.histogram(
    mt_full.variant_qc.call_rate,
    range=(0, 1),
    legend='chr 12 variant Call Rate',
    title='chr 12 call rate Histogram',
)
show(p)

In [None]:
# Annotate columns with Hail's sample QC metrics (read below as
# mt_full.sample_qc).
mt_full = hl.sample_qc(mt_full)

In [None]:
# Summarize the sample_qc column annotations.
mt_full.sample_qc.summarize()

In [None]:
# DP histogram: 10 bins spanning 0 to 100. Entries above the last bin are
# tallied separately in n_larger, so they can be shown as one overflow bar.
agg_value_counts = mt_full.aggregate_entries(
    hl.agg.hist(mt_full.DP, 0, 100, 10)
)

In [None]:
# Per-bin frequencies for the regular bins, plus the overflow count.
counts_below_100 = agg_value_counts.bin_freq
count_above_100 = agg_value_counts.n_larger

# Append the overflow count as one extra trailing bar.
counts = [*counts_below_100, count_above_100]

In [None]:
# Plot the DP histogram. Each bar starts at its bin edge; the bar starting at
# the last edge holds the overflow count appended to `counts`.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(agg_value_counts.bin_edges, counts, width=10, align='edge')
ax.set_xlabel('DP')
ax.set_ylabel('Frequency')
# Show plain integers on the y axis instead of scientific notation.
ax.ticklabel_format(style='plain', axis='y')
ax.set_title('chr12 DP histogram')
plt.show()

In [None]:
# Histogram of per-sample call rate, using the sample_qc annotations on
# mt_full. The range is limited to [0.99, 1].
p = hl.plot.histogram(
    mt_full.sample_qc.call_rate,
    range=(0.99, 1),
    legend='chr 12 Call Rate',
    title='chr 12 call rate Histogram',
)
show(p)

In [None]:
# Histogram of per-sample mean GQ, using the sample_qc annotations on mt_full.
# 20 bins over the range [46, 48].
p = hl.plot.histogram(
    mt_full.sample_qc.gq_stats.mean,
    bins=20,
    range=(46, 48),
    legend='chr12 Mean Sample GQ',
    title='chr12 mean gq',
)
show(p)

In [None]:
# Scatter of per-sample mean DP against per-sample call rate, using the
# sample_qc annotations on mt_full.
p = hl.plot.scatter(
    mt_full.sample_qc.dp_stats.mean,
    mt_full.sample_qc.call_rate,
    xlabel='Mean DP',
    ylabel='Call Rate',
    title='chr12 dp vs call_rate',
)
show(p)