In [1]:
import subprocess
from datetime import datetime
from distutils.version import LooseVersion
from functools import partial
from pathlib import Path
from pprint import pprint

import dxdata
import dxpy
import hail as hl
import pandas as pd
import pyspark
import tomli
from matrixtables import *
from utils import get_stats

# Make sure the local scratch directory next to the notebook folder exists.
tmp_dir = Path("../tmp").resolve()
tmp_dir.mkdir(parents=True, exist_ok=True)

In [2]:
# Load the run configuration; every constant below is derived from it.
with open("../config.toml", "rb") as f:
    conf = tomli.load(f)

# BASICS
IMPORT = conf["IMPORT"]
NAME = conf["NAME"]
VCF_VERSION = IMPORT["VCF_VERSION"]
REFERENCE_GENOME = conf["REFERENCE_GENOME"]
DATABASE = IMPORT["DATABASE"]

# GENES
GENE_FILE = str(Path(IMPORT["GENE_FILE"]).resolve())

# One gene per line.
with open(GENE_FILE, "r") as file:
    GENES = file.read().splitlines()

# When the configured NAME is the literal "NONE", fall back to the first gene
# in the gene file. This must happen before any path is derived from NAME,
# otherwise the log file is named "NONE_*.log" while all other outputs use
# the gene name.
if NAME == "NONE":
    if not GENES:
        raise ValueError(
            f"NAME is 'NONE' and gene file {GENE_FILE} is empty; cannot derive a run name"
        )
    NAME = GENES[0]

# Hail log file; the name carries the run name and the current HHMM time.
LOG_FILE = str(
    Path(IMPORT["LOG_DIR"], f"{NAME}_{datetime.now().strftime('%H%M')}.log").resolve()
)

# Table of samples to exclude (read in the sample hard-filter cell).
FILTER_FILE = str(
    Path(
        conf["SAMPLE_QC"]["DATA_DIR"], conf["SAMPLE_QC"]["SAMPLE_FILTER_FILE"]
    ).resolve()
)

# SAMPLE
MIN_CALL_RATE = conf["SAMPLE_QC"]["MIN_CALL_RATE"]
MIN_MEAN_DP = conf["SAMPLE_QC"]["MIN_MEAN_DP"]
MIN_MEAN_GQ = conf["SAMPLE_QC"]["MIN_MEAN_GQ"]

# VARIANT
MIN_P_HWE = conf["VARIANT_QC"]["MIN_P_HWE"]
MIN_VAR_GQ = conf["VARIANT_QC"]["MIN_VAR_GQ"]

# GENOTYPE
MIN_DP = conf["ENTRY_QC"]["MIN_DP"]
MIN_GQ = conf["ENTRY_QC"]["MIN_GQ"]

# EXPORT
TMP_DIR = conf["EXPORT"]["TMP_DIR"]

# BGEN_FILE is a path prefix without extension; ".bgen" / ".sample" are
# appended to it in the upload cell.
BGEN_FILE = str(Path(TMP_DIR, f"{NAME}").resolve())
ANNOTATIONS_FILE = str(Path(TMP_DIR, f"{NAME}.annotations").resolve())
SETLIST_FILE = str(Path(TMP_DIR, f"{NAME}.setlist").resolve())


In [3]:
# Spark and Hail
# Create the Spark context first; Hail is initialised on top of it below.
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)

# Look up the DNAnexus data object named DATABASE; only its id is kept and
# later used to build dnax:// paths.
database_record = dxpy.find_one_data_object(name=DATABASE)
mt_database = database_record["id"]

hl.init(
    sc=sc,
    default_reference=REFERENCE_GENOME,
    log=LOG_FILE,
)

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 2.4.4
SparkUI available at http://ip-10-60-130-154.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /opt/notebooks/gogoGPCR/hail_logs/NONE_1551.log


In [4]:
# Read the LABELLED-stage matrix table for this run from the DNAnexus database.
STAGE = "LABELLED"
READ_PATH = f"dnax://{mt_database}/{NAME}.{STAGE}.mt"

mt = hl.read_matrix_table(READ_PATH)

# A variant counts as "annotated" when it has a label and at least one
# sample carries a non-reference genotype.
has_label = hl.is_defined(mt.labels)
has_non_ref_call = hl.agg.any(mt.GT.is_non_ref())
interesting = mt.filter_rows(has_label & has_non_ref_call).count_rows()

pprint(f"{interesting} annotated variants found before QC")

'101 annotated variants found before QC'


In [5]:
# Withdrawn
# Drop every sample whose id starts with "W".
is_withdrawn = mt.s.startswith("W")
mt = mt.filter_cols(~is_withdrawn)

n_samples = mt.count_cols()
pprint(f"Samples remaining after removing withdrawn participants: {n_samples} ")

'Samples remaining after removing withdrawn participants: 200611 '


In [6]:
# Filter samples
# The filter file is read from the local filesystem and keyed by "eid";
# every sample whose key matches a row in it is removed.
samples_to_remove = hl.import_table(f"file:{FILTER_FILE}", key="eid")
mt = mt.anti_join_cols(samples_to_remove)

n_samples = mt.count_cols()
pprint(f"Samples remaining after hard filtering samples: {n_samples} ")

2021-11-30 15:52:16 Hail: INFO: Reading table without type imputation
  Loading field 'eid' as type str (not specified)


'Samples remaining after hard filtering samples: 185834 '


In [7]:
# Sample QC
# Thresholds are the SAMPLE_QC values read from the config.
sample_qc_thresholds = (MIN_CALL_RATE, MIN_MEAN_DP, MIN_MEAN_GQ)
mt = sample_QC_mt(mt, *sample_qc_thresholds)

n_samples = mt.count_cols()
pprint(f"Samples remaining after QC: {n_samples} ")

'Samples remaining after QC: 180623 '


In [8]:
# Variant QC
# Thresholds are the VARIANT_QC values read from the config.
mt = variant_QC_mt(mt, MIN_P_HWE, MIN_VAR_GQ)

# Re-count annotated variants: labelled rows with at least one non-ref call.
has_label = hl.is_defined(mt.labels)
has_non_ref_call = hl.agg.any(mt.GT.is_non_ref())
interesting = mt.filter_rows(has_label & has_non_ref_call).count_rows()

n_variants = mt.count_rows()
pprint(
    f"{n_variants} variants remaining after QC of which {interesting} are annotated"
)

'287 variants remaining after QC of which 101 are annotated'


In [9]:
# Genotype GQ
# Entry-level filter using the ENTRY_QC thresholds.
# NOTE(review): the meaning of the final positional True is defined by
# genotype_filter_mt, which is not visible here - confirm.
mt = genotype_filter_mt(mt, MIN_DP, MIN_GQ, True)

# Count entries whose genotype call is not defined after filtering.
gt_is_undefined = ~hl.is_defined(mt.GT)
missing = mt.aggregate_entries(hl.agg.sum(gt_is_undefined))
pprint(f"{missing} missing or filtered entries after Call QC")

Filtering 0.00% entries out of downstream analysis.
'0 missing or filtered entries after Call QC'


In [10]:
# Checkpoint
# Persist the QC'd table and continue from the written copy; an existing
# checkpoint at the same path is overwritten.
stage = "QC2"
checkpoint_file = "/tmp/" + f"{NAME}.{stage}.cp.mt"

mt = mt.checkpoint(checkpoint_file, overwrite=True)

2021-11-30 15:56:53 Hail: INFO: wrote matrix table with 287 rows and 180623 columns in 2 partitions to /tmp/DRD2.QC2.cp.mt
    Total size: 277.98 MiB
    * Rows/entries: 269.22 MiB
    * Columns: 8.76 MiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  287 rows (269.22 MiB)


In [11]:
# BGEN
# BGEN_FILE is an extension-less prefix; the "file:" scheme targets the
# local filesystem.
bgen_target = f"file:{BGEN_FILE}"
write_bgen(mt, bgen_target)

2021-11-30 15:57:17 Hail: INFO: while writing:
    file:/opt/notebooks/gogoGPCR/tmp/DRD2.bgen
  merge time: 44.882ms


In [12]:
# ANNOTATIONS

mt = add_varid(mt)

# Keep three row fields per variant: its id, the first gene symbol among the
# VEP transcript consequences, and its label.
annotated_mt = mt.select_rows(
    varid=mt.varid,
    gene=mt.vep.transcript_consequences.gene_symbol[0],
    annotation=mt.labels,
)

# Re-key the rows table by varid so the original key fields can be dropped.
annotations = annotated_mt.rows().key_by("varid").drop("locus", "alleles")

# Written to the local filesystem without a header line.
annotations.export("file:" + ANNOTATIONS_FILE, header=False)

2021-11-30 15:57:17 Hail: INFO: Coerced sorted dataset
2021-11-30 15:57:17 Hail: INFO: merging 1 files totalling 7.9K...
2021-11-30 15:57:17 Hail: INFO: while writing:
    file:/opt/notebooks/gogoGPCR/tmp/DRD2.annotations
  merge time: 12.507ms


In [13]:
# SETLIST
# Single tab-separated record: gene symbol, contig, smallest variant
# position, and the comma-joined list of all variant ids.
position = mt.aggregate_rows(hl.agg.min(mt.locus.position))
names = mt.varid.collect()
names_str = ",".join(names)

# Gene symbol and contig are taken from the first collected row only.
gene_symbol = mt.vep.transcript_consequences.gene_symbol[0].collect()[0]
contig = mt.locus.contig.collect()[0]

line = f"{gene_symbol}\t{contig}\t{position}\t{names_str}"

with open(SETLIST_FILE, "w") as setlist_handle:
    setlist_handle.write(line)

In [14]:
# Upload the BGEN pair plus the annotation and setlist files to the project
# folder /Data/burden/ with the dx CLI; a non-zero exit status raises.
bgen_file = f"{BGEN_FILE}.bgen"
sample_file = f"{BGEN_FILE}.sample"

upload_cmd = ["dx", "upload"]
upload_cmd += [bgen_file, sample_file, ANNOTATIONS_FILE, SETLIST_FILE]
upload_cmd += ["--path", "/Data/burden/"]

subprocess.run(upload_cmd, check=True, shell=False)

CompletedProcess(args=['dx', 'upload', '/opt/notebooks/gogoGPCR/tmp/DRD2.bgen', '/opt/notebooks/gogoGPCR/tmp/DRD2.sample', '/opt/notebooks/gogoGPCR/tmp/DRD2.annotations', '/opt/notebooks/gogoGPCR/tmp/DRD2.setlist', '--path', '/Data/burden/'], returncode=0)

In [15]:
# Store the fully QC'd table back in the DNAnexus database as the FINAL
# stage, replacing any earlier copy at that path.
STAGE = "FINAL"
WRITE_PATH = f"dnax://{mt_database}/{NAME}.{STAGE}.mt"

mt.write(WRITE_PATH, overwrite=True)

2021-11-30 15:59:25 Hail: INFO: wrote matrix table with 287 rows and 180623 columns in 2 partitions to dnax://database-G6XB998J860kZy4z59fBqPBV/DRD2.FINAL.mt
    Total size: 167.48 MiB
    * Rows/entries: 160.21 MiB
    * Columns: 7.27 MiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  287 rows (160.21 MiB)


In [16]:
# get_stats returns a pair: the first element is displayed here, the second
# (`intr`) is exported in the following cell.
stats_pair = get_stats(mt)
stats, intr = stats_pair

# NOTE(review): presumably a Hail table, where -1 asks show() for all rows - confirm.
stats.show(-1)

2021-11-30 15:59:32 Hail: INFO: Ordering unsorted dataset with network shuffle


labels,n_carriers,n_variants
str,int64,int64
"""Gi1""",138,18
"""Gi1_GoA_Gz""",4,2
"""Gi1_Gz""",2,2
"""Gz""",6,1
"""WT""",9549,78


In [17]:
# Export `intr` (second value returned by get_stats) as a TSV and copy it
# into the notebook's local ../tmp folder.
# NOTE(review): the un-prefixed /tmp path presumably resolves to the
# cluster's default Hadoop filesystem, which is why it is fetched with
# `hadoop fs -get` afterwards - confirm.
remote_tsv = f"/tmp/{NAME}_QC2.tsv"
local_tsv = f"../tmp/{NAME}_QC2.tsv"

intr.export(remote_tsv)

# `hadoop fs -get` refuses to overwrite an existing local file and exits
# non-zero, which (with check=True) would make re-running this cell raise.
# Remove a stale copy from a previous run first.
stale_copy = Path(local_tsv)
if stale_copy.is_file():
    stale_copy.unlink()

subprocess.run(
    ["hadoop", "fs", "-get", remote_tsv, local_tsv],
    check=True,
    shell=False,
)

2021-11-30 15:59:39 Hail: INFO: merging 2 files totalling 15.4K...
2021-11-30 15:59:39 Hail: INFO: while writing:
    /tmp/DRD2_QC2.tsv
  merge time: 51.245ms


CompletedProcess(args=['hadoop', 'fs', '-get', '/tmp/DRD2_QC2.tsv', '../tmp/DRD2_QC2.tsv'], returncode=0)