In [None]:
# @Peter some of these are probably unused 
from pathlib import Path
import pandas as pd
import pyspark
import dxpy
import hail as hl
from datetime import datetime
from src.matrixtables import import_mt, interval_qc_mt, smart_split_multi_mt
from subprocess import run

from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
# Route Bokeh output to the notebook.
output_notebook()

# Make sure /tmp exists; exist_ok=True turns this into a no-op when it already does.
scratch_dir = Path("/tmp").resolve()
scratch_dir.mkdir(parents=True, exist_ok=True)



In [None]:
# Spark and Hail
# @Peter this can probably stay as is
#
# Project-wide constants, then Spark/Hail start-up and DNAX database creation.

DATABASE = "matrix_tables"
REFERENCE_GENOME = 'GRCh38'
PROJ_NAME = "GLP2R"

# Absolute path of the Hail log file, tagged with the current hour and minute.
# NOTE(review): the tag has no date component, so sessions started at the same
# HHMM on different days end up with the same file name.
LOG_FILE = str(
    Path("../hail_logs", f"{PROJ_NAME}_{datetime.now().strftime('%H%M')}.log").resolve()
)

# Hail init
# getOrCreate() reuses a context that is already running, so executing this
# cell a second time does not fail on a duplicate SparkContext.
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession(sc)

# idempotent=True makes a repeated hl.init call a no-op instead of an error.
hl.init(sc=sc, default_reference=REFERENCE_GENOME, log=LOG_FILE, idempotent=True)

# Create database in DNAX
spark.sql(f"CREATE DATABASE IF NOT EXISTS {DATABASE} LOCATION 'dnax://'")
# Restrict the lookup to database objects so an unrelated data object that
# happens to share the name cannot be returned instead.
mt_database = dxpy.find_one_data_object(name=DATABASE, classname="database")["id"]

In [None]:
# TODO(@Peter): we still need to add the 90% DP > 10 filter ("90pct10dp" QC set). It is described here:
# https://dnanexus.gitbook.io/uk-biobank-rap/science-corner/whole-exome-sequencing-oqfe-protocol/generation-and-utilization-of-quality-control-set-90pct10dp-on-oqfe-data/details-on-processing-the-300k-exome-data-to-generate-the-quality-control-set
# `ukb23145_300k_OQFE.90pct10dp_qc_variants.txt` is a helper file for this filter; its location still needs to be tracked down.

In [None]:
# Load the metadata table that maps genes to blocks and regions,
# indexed by HGNC symbol (the HGNC column itself is kept as well).
GENES = ["MC4R", "GLP2R"]
MAPPING_FILE = Path("../../data/misc/mappings_with_blocks.tsv").resolve()

mapping = pd.read_table(MAPPING_FILE)
mapping = mapping.set_index("HGNC", drop=False)

# Display the rows for the genes of interest.
mapping.loc[GENES]

In [None]:
# Import the exome VCFs (pVCF release, see VCF_DIR) as a MatrixTable and filter to only GENES.
# import_mt is a project helper from src.matrixtables; its exact filtering is not shown here.

VCF_DIR = Path("/mnt/project/Bulk/Exome sequences/Population level exome OQFE variants, pVCF format - final release/")
FIELD_ID = 23157  # UKB field ID for latest exome data

mt = import_mt(
    GENES,
    mapping,
    vcf_dir=VCF_DIR,
    vcf_version="v1",
    field_id=FIELD_ID,
)
# Key the rows by locus and alleles.
mt = mt.key_rows_by("locus", "alleles")

In [None]:

# Filter to only WES target regions
# @Peter if you want to dig up the reference for this that would be cool. Something something WES target region UKB

INTERVAL_FILE = Path("../../data/misc/xgen_plus_spikein.b38.bed").resolve()

# Copy the BED file to /tmp on the Hadoop file system so Hail can read it.
# -f overwrites a copy left behind by an earlier run (a plain -put refuses to
# replace an existing file), and check=True raises CalledProcessError instead
# of silently carrying on when the copy fails.
run(["hadoop", "fs", "-put", "-f", str(INTERVAL_FILE), "/tmp"], check=True)

interval_table = hl.import_bed(
    f"/tmp/{INTERVAL_FILE.name}",
    reference_genome=REFERENCE_GENOME,
)

# Keep only the rows whose locus has a match in the interval table.
mt = mt.filter_rows(hl.is_defined(interval_table[mt.locus]))



In [None]:
# Initial checkpoint
# @Peter might add something about why Hail likes to do checkpoint
stage = "INITIAL"
checkpoint_file = f"/tmp/{PROJ_NAME}.{stage}.cp.mt"

# Write the MatrixTable to the checkpoint path and continue from the returned
# object. overwrite=False: an existing checkpoint at this path is not replaced.
mt = mt.checkpoint(checkpoint_file, overwrite=False)

# mt.count() yields (number of rows, number of columns) = (variants, samples).
n_variants, n_samples = mt.count()
print(f"{n_variants} variants and {n_samples} samples after import and target region filter")

In [None]:
# Split multi alleles
# Sites with more than MAX_ALLELES alleles are dropped before splitting.
# The limit is named once so the filter and the message below cannot drift apart.
MAX_ALLELES = 6

mt = mt.filter_rows(mt.alleles.length() <= MAX_ALLELES)
mt = smart_split_multi_mt(mt)

print(f"{mt.count_rows()} variants with not more than {MAX_ALLELES} alleles after splitting")

In [None]:
# Annotate with VEP and generate protein consequence
VEP_JSON = Path("../../data/misc/GRCh38_VEP.json").resolve()

mt = hl.vep(mt, f"file:{VEP_JSON}")

# Only the first transcript consequence of each variant is used below.
first_tc = mt.vep.transcript_consequences[0]

# Check that this first entry carries a MANE Select annotation for every variant.
# The field has to be tested on the element itself: accessing `.mane_select` on
# the whole `transcript_consequences` array yields an array (one value per
# transcript), and hl.is_defined on an array only tells whether the array is
# missing, not whether the values inside it are.
is_MANE = mt.aggregate_rows(hl.agg.all(hl.is_defined(first_tc.mane_select)))
assert is_MANE, "Selected transcript may not be MANE Select. Check manually."

# protCons = <first part of amino_acids> + <protein_end> + <last part of amino_acids>,
# where amino_acids is split on "/".
amino_acids = first_tc.amino_acids.split("/")
mt = mt.annotate_rows(
    protCons=amino_acids[0] + hl.str(first_tc.protein_end) + amino_acids[-1]
)



In [None]:
# Write final MT to DNAX
# find database ID of newly created database using a dxpy method
# From https://github.com/dnanexus/OpenBio/blob/master/hail_tutorial/MatrixTable_variant_annotation_with_VEP.ipynb
#
# The table needs a non-empty name: with an empty MT_NAME the URL below would
# end at the database ID and not name any table inside the database.
MT_NAME = f"{PROJ_NAME}.final.mt"
db_uri = dxpy.find_one_data_object(name=DATABASE, classname="database")['id']
url = f"dnax://{db_uri}/{MT_NAME}"


# Note: Writing (saving/storing) the MatrixTable to the database can be computationally expensive
# depending on the size of the annotations.
#
# To persist the MatrixTable and be able to access it later, the notebook needs to write it
# into a persistent filesystem (in this case DNAX).
# See https://hail.is/docs/0.2/hail.MatrixTable.html#hail.MatrixTable.write for additional documentation.
mt.write(url) # Note: output should describe size of the MatrixTable (i.e. number of rows, partitions)
