In [1]:
import subprocess
from datetime import datetime
from distutils.version import LooseVersion
from functools import partial
from pathlib import Path
from pprint import pprint

import dxdata
import dxpy
import hail as hl
import pandas as pd
import pyspark
import tomli
from matrixtables import *
from utils import get_stats

# Make sure the local scratch directory ../tmp (relative to the notebook's
# working directory) exists before any cell writes into it; `exist_ok=True`
# keeps this cell safe to re-run.
Path("../tmp").resolve().mkdir(parents=True, exist_ok=True)

In [2]:
# Load the pipeline configuration (tomli requires the file in binary mode).
with open("../config.toml", "rb") as f:
    conf = tomli.load(f)

# BASICS
IMPORT = conf["IMPORT"]
NAME = conf["NAME"]
VCF_VERSION = IMPORT["VCF_VERSION"]
REFERENCE_GENOME = conf["REFERENCE_GENOME"]
DATABASE = IMPORT["DATABASE"]

# GENES
GENE_FILE = str(Path(IMPORT["GENE_FILE"]).resolve())

# One gene symbol per line.
with open(GENE_FILE, "r") as file:
    GENES = file.read().splitlines()

# "NONE" is a placeholder meaning "name this run after the first listed gene".
# Resolve it *before* any path is derived from NAME so that the log file and
# the exported artefacts all share the same prefix.
if NAME == "NONE":
    if not GENES or not GENES[0].strip():
        raise ValueError(
            f"NAME is 'NONE' but {GENE_FILE} does not start with a gene symbol"
        )
    NAME = GENES[0]

# LOGGING
# The time-of-day suffix (HHMM) keeps logs of successive runs apart.
LOG_FILE = str(
    Path(IMPORT["LOG_DIR"], f"{NAME}_{datetime.now().strftime('%H%M')}.log").resolve()
)

# Table of samples to exclude (used by the hard sample filter below).
FILTER_FILE = str(
    Path(
        conf["SAMPLE_QC"]["DATA_DIR"], conf["SAMPLE_QC"]["SAMPLE_FILTER_FILE"]
    ).resolve()
)

# SAMPLE
MIN_CALL_RATE = conf["SAMPLE_QC"]["MIN_CALL_RATE"]
MIN_MEAN_DP = conf["SAMPLE_QC"]["MIN_MEAN_DP"]
MIN_MEAN_GQ = conf["SAMPLE_QC"]["MIN_MEAN_GQ"]

# VARIANT
MIN_P_HWE = conf["VARIANT_QC"]["MIN_P_HWE"]
MIN_VAR_GQ = conf["VARIANT_QC"]["MIN_VAR_GQ"]

# GENOTYPE
MIN_DP = conf["ENTRY_QC"]["MIN_DP"]
MIN_GQ = conf["ENTRY_QC"]["MIN_GQ"]

# EXPORT
TMP_DIR = conf["EXPORT"]["TMP_DIR"]

# Absolute output paths, all prefixed with the (resolved) run name.
BGEN_FILE = str(Path(TMP_DIR, f"{NAME}").resolve())
ANNOTATIONS_FILE = str(Path(TMP_DIR, f"{NAME}.annotations").resolve())
SETLIST_FILE = str(Path(TMP_DIR, f"{NAME}.setlist").resolve())


In [3]:
# Spark and Hail
# Create the Spark context first: Hail is attached to this exact context below.
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)

# Look up the DNAnexus data object named DATABASE and keep its ID; later cells
# build their dnax:// read/write URLs from it.
mt_database = dxpy.find_one_data_object(name=DATABASE)["id"]
# Start Hail on the existing Spark context, using the configured reference
# genome and writing Hail's log to LOG_FILE.
hl.init(sc=sc, default_reference=REFERENCE_GENOME, log=LOG_FILE)

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 2.4.4
SparkUI available at http://ip-10-60-79-22.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.78-b17627756568
LOGGING: writing to /opt/notebooks/gogoGPCR/hail_logs/NONE_1833.log


In [4]:
# Read the MatrixTable written by the labelling stage from the DNAnexus database.
STAGE = "LABELLED"
READ_PATH = f"dnax://{mt_database}/{NAME}.{STAGE}.mt"

mt = hl.read_matrix_table(READ_PATH)

# A variant is counted when it carries a label and at least one sample has a
# non-reference genotype call for it.
has_label = hl.is_defined(mt.labels)
has_carrier = hl.agg.any(mt.GT.is_non_ref())
interesting = mt.filter_rows(has_label & has_carrier).count_rows()

pprint(f"{interesting} annotated variants found before QC")

'15 annotated variants found before QC'


In [5]:
# Withdrawn
# Sample IDs beginning with "W" are treated as withdrawn participants and dropped.
is_withdrawn = mt.s.startswith("W")
mt = mt.filter_cols(~is_withdrawn)

n_samples = mt.count_cols()
pprint(f"Samples remaining after removing withdrawn participants: {n_samples} ")

'Samples remaining after removing withdrawn participants: 200611 '


In [6]:
# Filter samples
# Load the exclusion table from the local filesystem, keyed by sample ID
# ("eid"), and keep only the columns of `mt` that are absent from it.
samples_to_remove = hl.import_table(f"file:{FILTER_FILE}", key="eid")
mt = mt.anti_join_cols(samples_to_remove)

n_samples = mt.count_cols()
pprint(f"Samples remaining after hard filtering samples: {n_samples} ")

2021-12-15 18:34:26 Hail: INFO: Reading table without type imputation
  Loading field 'eid' as type str (not specified)


'Samples remaining after hard filtering samples: 185834 '


In [9]:
# Sample QC
# NOTE: the two thresholds below deliberately replace the MIN_MEAN_DP and
# MIN_MEAN_GQ values loaded from config.toml. They may need adjustment if the
# default settings remove too many samples.
MIN_MEAN_DP, MIN_MEAN_GQ = 15, 48.5

mt = sample_QC_mt(mt, MIN_CALL_RATE, MIN_MEAN_DP, MIN_MEAN_GQ)

n_samples = mt.count_cols()
pprint(f"Samples remaining after QC: {n_samples} ")

'Samples remaining after QC: 185730 '


In [13]:
# Variant QC
mt = variant_QC_mt(mt, MIN_P_HWE, MIN_VAR_GQ)

# Same "annotated" definition as before QC: labelled, and carried by at least
# one sample with a non-reference call.
has_label = hl.is_defined(mt.labels)
has_carrier = hl.agg.any(mt.GT.is_non_ref())
interesting = mt.filter_rows(has_label & has_carrier)

interesting_count = interesting.count_rows()
remaining_count = mt.count_rows()

pprint(
    f"{remaining_count} variants remaining after QC of which {interesting_count} are annotated"
)

'219 variants remaining after QC of which 15 are annotated'


In [15]:
# Genotype GQ
mt = genotype_filter_mt(mt, MIN_DP, MIN_GQ, True)

# Count the entries whose genotype call is missing after the filter.
gt_missing = hl.is_missing(mt.GT)
missing = mt.aggregate_entries(hl.agg.count_where(gt_missing))
pprint(f"{missing} missing or filtered entries after Call QC")

Filtering 0.00% entries out of downstream analysis.
'0 missing or filtered entries after Call QC'


In [16]:
# Checkpoint
# Write the QC'd table out and continue from the written copy, so the cells
# below do not recompute the whole QC chain.
stage = "QC2"
checkpoint_file = f"/tmp/{NAME}.{stage}.cp.mt"

mt = mt.checkpoint(output=checkpoint_file, overwrite=True)

2021-12-15 18:44:04 Hail: INFO: wrote matrix table with 219 rows and 185730 columns in 3 partitions to /tmp/MC4R.QC2.cp.mt


In [19]:
# BGEN
# Export to the local filesystem ("file:" scheme); BGEN_FILE is the output
# path prefix without extension.
write_bgen(mt, f"file:{BGEN_FILE}")

2021-12-15 18:48:41 Hail: INFO: while writing:
    file:/opt/notebooks/gogoGPCR/tmp/MC4R.bgen
  merge time: 73.340ms


In [20]:
# ANNOTATIONS
mt = add_varid(mt)

# One row per variant: its ID, the gene symbol of the first listed transcript
# consequence, and its label.
first_gene_symbol = mt.vep.transcript_consequences.gene_symbol[0]
annotation_rows = mt.select_rows(
    varid=mt.varid,
    gene=first_gene_symbol,
    annotation=mt.labels,
).rows()

# Re-key by variant ID so the original locus/alleles key fields can be dropped.
annotations = annotation_rows.key_by("varid").drop("locus", "alleles")

# Headerless export to the local filesystem.
annotations.export("file:" + ANNOTATIONS_FILE, header=False)

2021-12-15 18:48:41 Hail: INFO: Coerced sorted dataset
2021-12-15 18:48:41 Hail: INFO: merging 2 files totalling 5.8K...
2021-12-15 18:48:41 Hail: INFO: while writing:
    file:/opt/notebooks/gogoGPCR/tmp/MC4R.annotations
  merge time: 15.759ms


In [21]:
# SETLIST
# Builds a single tab-separated line:
#   <gene symbol> <contig> <smallest variant position> <comma-joined variant IDs>
# Gene and contig are taken from the first row, i.e. all variants in `mt` are
# written as one set.
position = mt.aggregate_rows(hl.agg.min(mt.locus.position))
names = mt.varid.collect()
names_str = ",".join(names)

# Only the first row is needed, so fetch one record instead of collecting the
# whole column to the driver.
first_gene = mt.vep.transcript_consequences.gene_symbol[0].take(1)
first_contig = mt.locus.contig.take(1)
if not first_gene or not first_contig:
    raise ValueError("Cannot write set list: no variants remain in the MatrixTable")

line = f"{first_gene[0]}\t{first_contig[0]}\t{position}\t{names_str}"

with open(SETLIST_FILE, "w") as f:
    # Terminate the record so the file is a well-formed text file (safe to
    # concatenate with other set lists).
    f.write(line + "\n")

In [22]:
# Upload the exported files to the DNAnexus project folder /Data/burden/.
bgen_file = BGEN_FILE + ".bgen"
sample_file = BGEN_FILE + ".sample"

upload_cmd = [
    "dx",
    "upload",
    bgen_file,
    sample_file,
    ANNOTATIONS_FILE,
    SETLIST_FILE,
    "--path",
    "/Data/burden/",
]

# Argument list with shell=False: paths are passed verbatim, no shell parsing.
# check=True raises if the upload exits with a non-zero status.
subprocess.run(upload_cmd, check=True, shell=False)

CompletedProcess(args=['dx', 'upload', '/opt/notebooks/gogoGPCR/tmp/MC4R.bgen', '/opt/notebooks/gogoGPCR/tmp/MC4R.sample', '/opt/notebooks/gogoGPCR/tmp/MC4R.annotations', '/opt/notebooks/gogoGPCR/tmp/MC4R.setlist', '--path', '/Data/burden/'], returncode=0)

In [23]:
# Persist the fully QC'd MatrixTable back to the DNAnexus database, replacing
# any previous FINAL table for this run name.
STAGE = "FINAL"
WRITE_PATH = f"dnax://{mt_database}/{NAME}.{STAGE}.mt"

mt.write(WRITE_PATH, overwrite=True)

2021-12-15 18:50:33 Hail: INFO: wrote matrix table with 219 rows and 185730 columns in 3 partitions to dnax://database-G6XB998J860kZy4z59fBqPBV/MC4R.FINAL.mt


In [17]:
# Summarise the table with the project helper `get_stats`, which returns two
# objects: `stats` (displayed here) and `intr` (exported in the next cell).
# NOTE(review): the execution counts show this cell was originally run right
# after the QC2 checkpoint, before the export cells above; in file order it
# now runs on `mt` after `add_varid` — confirm the results are unaffected.
stats, intr = get_stats(mt)
# presumably -1 asks for all rows to be displayed — TODO confirm
stats.show(-1)

2021-12-15 18:45:09 Hail: INFO: Ordering unsorted dataset with network shuffle


labels,n_carriers,n_variants
int32,int64,int64
1,5011,6
2,31,5
3,7312,1
4,347,3


In [18]:
# Export `intr` as a TSV on the cluster's default filesystem, then copy it
# into the notebook's local ../tmp directory.
remote_tsv = f"/tmp/{NAME}_QC2.tsv"
local_tsv = Path(f"../tmp/{NAME}_QC2.tsv")

intr.export(remote_tsv)

# `hadoop fs -get` refuses to overwrite an existing local file, which (with
# check=True) made this cell fail on every re-run. Remove a stale copy first.
if local_tsv.exists():
    local_tsv.unlink()

subprocess.run(
    ["hadoop", "fs", "-get", remote_tsv, str(local_tsv)],
    check=True,
    shell=False,
)

2021-12-15 18:45:15 Hail: INFO: merging 2 files totalling 2.4K...
2021-12-15 18:45:15 Hail: INFO: while writing:
    /tmp/MC4R_QC2.tsv
  merge time: 33.965ms


CompletedProcess(args=['hadoop', 'fs', '-get', '/tmp/MC4R_QC2.tsv', '../tmp/MC4R_QC2.tsv'], returncode=0)