# Create a MatrixTable and QC the hell out of it
## Import stuff and set your parameters
First, we import the necessary libraries and load the run configuration from `config.toml`. Then we initialise Spark and Hail.

In [2]:
import subprocess
from datetime import datetime
from distutils.version import LooseVersion
from functools import partial
from pathlib import Path
from pprint import pprint

import dxdata
import dxpy
import hail as hl
import pandas as pd
import pyspark
import tomli
from matrixtables import *
from utils import get_stats

# Create the ../tmp directory (and any missing parents); do nothing if it already exists.
Path("../tmp").resolve().mkdir(exist_ok=True, parents=True)

In [4]:
# Parameters
# Every tunable value is read from ../config.toml. File-system paths are
# resolved to absolute paths and stored as plain strings.
with open("../config.toml", "rb") as f:
    conf = tomli.load(f)

IMPORT = conf["IMPORT"]
NAME = conf["NAME"]
VCF_VERSION = IMPORT["VCF_VERSION"]
REFERENCE_GENOME = conf["REFERENCE_GENOME"]
DATABASE = IMPORT["DATABASE"]

MAP_FILE = str(Path(IMPORT["MAPPING_FILE"]).resolve())
INT_FILE = str(Path(IMPORT["INTERVAL_FILE"]).resolve())
GENE_FILE = str(Path(IMPORT["GENE_FILE"]).resolve())

# One gene symbol per line.
with open(GENE_FILE, "r") as file:
    GENES = file.read().splitlines()

# "NONE" is a placeholder meaning "name this run after the first gene".
if NAME == "NONE":
    if not GENES:
        raise ValueError(
            f"NAME is 'NONE' but gene file {GENE_FILE} is empty; cannot derive a name."
        )
    NAME = GENES[0]

# Built only after NAME is final, so the log file carries the same name as
# the checkpoint / export files instead of the "NONE" placeholder.
LOG_FILE = str(
    Path(IMPORT["LOG_DIR"], f"{NAME}_{datetime.now().strftime('%H%M')}.log").resolve()
)

VCF_DIR = str(Path(IMPORT["VCF_DIR"]).resolve())

# Optional; None means "do not downsample".
DOWNSAMPLE_P = IMPORT.get("DOWNSAMPLE_P", None)

SNV_ONLY = conf["ANNOTATE"]["SNV_ONLY"]
USE_VEP = conf["ANNOTATE"]["USE_VEP"]
MISSENSE_ONLY = conf["ANNOTATE"]["MISSENSE_ONLY"]

VEP_JSON = str(Path(conf["ANNOTATE"]["VEP_JSON"]).resolve())

TMP_DIR = conf["EXPORT"]["TMP_DIR"]

# Export targets, all named after the run.
BGEN_FILE = str(Path(TMP_DIR, f"{NAME}").resolve())
ANNOTATIONS_FILE = str(Path(TMP_DIR, f"{NAME}.annotations").resolve())
SETLIST_FILE = str(Path(TMP_DIR, f"{NAME}.setlist").resolve())

In [5]:
# Spark and Hail
# Start Spark first so the same context can be handed to Hail below.
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)

# Look the database object up by name and create it only when the search
# finds nothing. Any other failure (auth, network, ...) is allowed to surface
# instead of being mistaken for "database does not exist yet".
try:
    mt_database = dxpy.find_one_data_object(name=DATABASE)["id"]
except dxpy.exceptions.DXSearchError:
    spark.sql(f"CREATE DATABASE {DATABASE} LOCATION  'dnax://'")
    mt_database = dxpy.find_one_data_object(name=DATABASE)["id"]

# Pointing Hail's tmp_dir at the database breaks export_bgen for now, so the
# variant below stays disabled:
# hl.init(sc=sc, default_reference=REFERENCE_GENOME, log=LOG_FILE, tmp_dir=f'dnax://{mt_database}/tmp/')

hl.init(sc=sc, default_reference=REFERENCE_GENOME, log=LOG_FILE)

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 2.4.4
SparkUI available at http://ip-10-60-25-134.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /opt/notebooks/gogoGPCR/hail_logs/NONE_1947.log


In [6]:
# Import
# The mapping table is indexed by HGNC symbol; the HGNC column itself is kept.
mapping = pd.read_csv(MAP_FILE, sep="\t").set_index("HGNC", drop=False)

# Build the MatrixTable for the requested genes, then key rows by (locus, alleles).
mt = import_mt(GENES, mapping, vcf_dir=VCF_DIR, vcf_version=VCF_VERSION)
mt = mt.key_rows_by("locus", "alleles")

v, s = mt.count()
pprint(f"{v} variants and {s} samples after import")

'260 variants and 200643 samples after import'


In [7]:
# Checkpoint
# Write the freshly imported table to disk and continue from the written copy.
stage = "RAW"
checkpoint_file = "/tmp/{}.{}.cp.mt".format(NAME, stage)

mt = mt.checkpoint(checkpoint_file, overwrite=True)

2021-12-05 19:49:40 Hail: INFO: Coerced sorted dataset
2021-12-05 19:50:30 Hail: INFO: wrote matrix table with 260 rows and 200643 columns in 1 partition to /tmp/MC4R.RAW.cp.mt
    Total size: 246.04 MiB
    * Rows/entries: 244.73 MiB
    * Columns: 1.31 MiB
    * Globals: 11.00 B
    * Smallest partition: 260 rows (244.73 MiB)
    * Largest partition:  260 rows (244.73 MiB)


In [8]:
# Downsample
# Optional step: skipped entirely when DOWNSAMPLE_P is None.
if DOWNSAMPLE_P is not None:
    mt = downsample_mt(mt, DOWNSAMPLE_P)

    n_samples = mt.count_cols()
    pprint(f"{n_samples} samples after downsampling")

In [9]:
# Interval QC
# The interval file is passed with an explicit "file:" scheme.
interval_uri = f"file:{INT_FILE}"
mt = interval_qc_mt(mt, interval_uri)

n_variants = mt.count_rows()
pprint(f"{n_variants} variants after interval filtering")

2021-12-05 19:52:00 Hail: INFO: Reading table without type imputation
  Loading field 'f0' as type str (user-supplied)
  Loading field 'f1' as type int32 (user-supplied)
  Loading field 'f2' as type int32 (user-supplied)
2021-12-05 19:52:03 Hail: INFO: Coerced sorted dataset


'204 variants after interval filtering'


In [10]:
# Split multi
# Keep only rows with at most 6 alleles, then hand the table to smart_split_multi_mt.
within_allele_limit = mt.alleles.length() <= 6
mt = mt.filter_rows(within_allele_limit)
mt = smart_split_multi_mt(mt)

n_variants = mt.count_rows()
pprint(f"{n_variants} variants with not more than 6 alleles after splitting")

2021-12-05 19:52:08 Hail: INFO: Coerced sorted dataset
2021-12-05 19:52:10 Hail: INFO: Coerced sorted dataset
2021-12-05 19:52:13 Hail: INFO: Coerced sorted dataset


'223 variants with not more than 6 alleles after splitting'


In [11]:
# VEP annotation (runs only when USE_VEP is truthy)
if USE_VEP:
    mt = hl.vep(mt, f"file:{VEP_JSON}")
    consequences = mt.vep.transcript_consequences

    # Abort when the aggregated mane_select check does not hold for every row.
    # NOTE(review): if transcript_consequences is an array of structs, is_defined
    # here looks like it tests the array itself rather than each element's
    # mane_select -- confirm against the VEP JSON schema in use.
    mane_defined = hl.is_defined(consequences.mane_select)
    is_MANE = mt.aggregate_rows(hl.agg.all(mane_defined))
    assert is_MANE, "Selected transcript may not be MANE Select. Check manually."

    # protCons = first part of amino_acids + protein_end + last part, all taken
    # from the first transcript consequence (presumably ref residue, position,
    # alt residue -- verify).
    residues = consequences.amino_acids[0].split("/")
    position = hl.str(consequences.protein_end[0])
    mt = mt.annotate_rows(protCons=residues[0] + position + residues[-1])

2021-12-05 19:52:17 Hail: INFO: Coerced sorted dataset
2021-12-05 19:52:19 Hail: INFO: Coerced sorted dataset
2021-12-05 19:52:20 Hail: INFO: Coerced sorted dataset
2021-12-05 19:52:22 Hail: INFO: Coerced sorted dataset
2021-12-05 19:52:23 Hail: INFO: Coerced sorted dataset
2021-12-05 19:52:25 Hail: INFO: Coerced sorted dataset
2021-12-05 19:52:26 Hail: INFO: Coerced sorted dataset
2021-12-05 19:52:27 Hail: INFO: Coerced sorted dataset


In [12]:
# Write the QC1-stage MatrixTable into the dnax:// database, replacing any existing copy.
STAGE = "QC1"
WRITE_PATH = f"dnax://{mt_database}/{NAME}.{STAGE}.mt"

mt.write(WRITE_PATH, overwrite=True)

2021-12-05 19:54:09 Hail: INFO: Coerced sorted dataset
2021-12-05 19:54:11 Hail: INFO: Coerced sorted dataset
2021-12-05 19:54:19 Hail: INFO: Coerced sorted dataset
2021-12-05 19:54:20 Hail: INFO: Coerced sorted dataset
2021-12-05 19:56:03 Hail: INFO: wrote matrix table with 223 rows and 200643 columns in 3 partitions to dnax://database-G6XB998J860kZy4z59fBqPBV/MC4R.QC1.mt
    Total size: 113.68 MiB
    * Rows/entries: 112.55 MiB
    * Columns: 1.13 MiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  210 rows (102.36 MiB)
