# Create a MatrixTable and QC the hell out of it
## Import stuff and set your parameters
First, we import the necessary libraries and load the run configuration from `config.toml`. Then we initialise Spark and Hail.

In [45]:
# Imports
import toml
from pathlib import Path
from datetime import datetime
from pprint import pprint
import sys
from distutils.version import LooseVersion
import subprocess

import pandas as pd
import pyspark
import dxpy
import dxdata
import hail as hl
    
# Put the parent directory on sys.path (once) so the `src` package imported
# below can be resolved from inside the notebook directory.
module_path = str(Path("..").resolve())

if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
from src.utils import get_position, lookup_vcfs
from src.matrixtables import *
from src.resources import lauryns_variants


# Configure Hail's plotting module to render its figures inline in notebook cells.
hl.plot.output_notebook()

In [36]:
# Parameters
# Load the run configuration; every path and QC threshold used below comes from it.
with open("../config.toml") as f:
    conf = toml.load(f)

GENE = conf["GENE"]
imps = conf["IMPORT"]

# Hour+minute stamp (HHMM) used to tag this session's Hail log file.
now = f"{datetime.now():%H%M}"

# Absolute paths, as plain strings, for the inputs and the log file.
vcf_dir = str(Path(imps["VCF_DIR"]).resolve())
map_file = str(Path(imps["MISC_DIR"], imps["MAPPING_FILE"]).resolve())
int_file = str(Path(imps["MISC_DIR"], imps["INTERVAL_FILE"]).resolve())
log_file = str(Path(imps["LOG_DIR"], f"{GENE}_{now}.log").resolve())


In [3]:
# Spark and Hail
# Create the Spark context/session first, then hand the same context to Hail.
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)

hl.init(
    sc=sc,
    log=log_file,
    default_reference="GRCh38",
)

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 2.4.4
SparkUI available at http://ip-10-60-12-7.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /opt/notebooks/gogoGPCR/hail_logs/GCGR_1258.log


In [4]:
# VCFs
# Select the mapping-file row for GENE (indexed by its HGNC column) as a dict.
mapping = (
    pd.read_csv(map_file, sep="\t")
    .set_index("HGNC", drop=False)
    .loc[GENE, :]
    .to_dict()
)

# Resolve the VCF files for this gene via the project helper.
vcf_files = lookup_vcfs(
    mapping=mapping,
    vcfdir=vcf_dir,
    gene=GENE,
    version=imps["VCF_VERSION"],
)

In [6]:
# Import
# Build the MatrixTable from the located VCFs and key rows by (locus, alleles).
mt = import_mt(vcf_files.get("vcfs"), mapping)
mt = mt.key_rows_by("locus", "alleles")

# count() returns (number of rows/variants, number of columns/samples).
v, s = mt.count()
pprint(f"{v} variants and {s} samples after import")

'855 variants and 200643 samples after import'


In [7]:
# Checkpoint
# Persist the freshly imported table so later cells read from disk.
stage = "raw"
checkpoint_path = Path(imps["TMP_DIR"], f"{GENE}.{stage}.cp.mt").resolve()
checkpoint_file = f"file://{checkpoint_path}"

mt = mt.checkpoint(checkpoint_file, overwrite=True)

2021-09-14 13:04:11 Hail: INFO: Coerced sorted dataset
2021-09-14 13:06:52 Hail: INFO: wrote matrix table with 855 rows and 200643 columns in 1 partition to file:///opt/notebooks/gogoGPCR/tmp/GCGR.raw.cp.mt
    Total size: 1005.43 MiB
    * Rows/entries: 1004.12 MiB
    * Columns: 1.31 MiB
    * Globals: 11.00 B
    * Smallest partition: 855 rows (1004.12 MiB)
    * Largest partition:  855 rows (1004.12 MiB)


In [5]:
# Re-load
# Read the "raw" checkpoint back in, e.g. to resume without re-importing the VCFs.
stage = "raw"
checkpoint_path = Path(imps["TMP_DIR"], f"{GENE}.{stage}.cp.mt").resolve()
checkpoint_file = f"file://{checkpoint_path}"

mt = hl.read_matrix_table(checkpoint_file)

In [11]:
# Downsample
# DOWNSAMPLE_P is optional in the config; None is passed through when it is absent.
mt = downsample_mt(mt, imps.get("DOWNSAMPLE_P"))

n_samples = mt.count_cols()
pprint(f"{n_samples} samples after downsampling")

'200643 samples after downsampling'


In [12]:
# Interval QC
# The interval file is passed as a local-filesystem URI.
interval_uri = f"file://{int_file}"
mt = interval_qc_mt(mt, mapping, interval_uri)

n_variants = mt.count_rows()
pprint(f"{n_variants} variants after interval filtering")

2021-09-14 13:09:57 Hail: INFO: Reading table without type imputation
  Loading field 'f0' as type str (user-supplied)
  Loading field 'f1' as type int32 (user-supplied)
  Loading field 'f2' as type int32 (user-supplied)
2021-09-14 13:10:00 Hail: INFO: Coerced sorted dataset


'361 variants after interval filtering'


In [13]:
# Split multi
# Upper bound on the length of a site's `alleles` array before splitting.
# Kept in one place so the filter and the printed message cannot drift apart.
MAX_ALLELES = 6

# Drop sites with more than MAX_ALLELES alleles, then split the remaining
# multi-allelic sites with the project helper.
mt = mt.filter_rows(mt.alleles.length() <= MAX_ALLELES)
mt = smart_split_multi_mt(mt)

pprint(f'{mt.count_rows()} variants not more than {MAX_ALLELES} alleles after split')

2021-09-14 13:10:18 Hail: INFO: Coerced sorted dataset
2021-09-14 13:10:20 Hail: INFO: Coerced sorted dataset
2021-09-14 13:10:22 Hail: INFO: Coerced sorted dataset


'396 variants not more than 6 alleles after split'


In [14]:
# Genotype QC
# Entry-level thresholds come from the ENTRY_QC section of the config.
entry_qc = conf["ENTRY_QC"]
# NOTE(review): the final positional `True` is an option of genotype_filter_mt
# (defined in src.matrixtables) -- confirm its meaning there.
mt = genotype_filter_mt(
    mt,
    entry_qc["MIN_DP"],
    entry_qc["MIN_GQ"],
    entry_qc["MIN_PL"],
    True,
)

v, s = mt.count()
pprint(f"{v} variants and {s} samples after genotype QC")

2021-09-14 13:10:34 Hail: INFO: Coerced sorted dataset
2021-09-14 13:10:35 Hail: INFO: Coerced sorted dataset
2021-09-14 13:10:37 Hail: INFO: Coerced sorted dataset


'381 variants and 200643 samples after genotype QC'


In [15]:
# Checkpoint
# Persist the genotype-filtered table before the variant/sample QC steps.
stage = "GT_QC"
checkpoint_path = Path(imps["TMP_DIR"], f"{GENE}.{stage}.cp.mt").resolve()
checkpoint_file = f"file://{checkpoint_path}"

mt = mt.checkpoint(checkpoint_file, overwrite=True)

2021-09-14 13:10:48 Hail: INFO: Coerced sorted dataset
2021-09-14 13:10:49 Hail: INFO: Coerced sorted dataset
2021-09-14 13:10:50 Hail: INFO: Coerced sorted dataset
2021-09-14 13:11:24 Hail: INFO: Coerced sorted dataset
2021-09-14 13:11:26 Hail: INFO: Coerced sorted dataset
2021-09-14 13:11:27 Hail: INFO: Coerced sorted dataset
2021-09-14 13:11:29 Hail: INFO: Coerced sorted dataset
2021-09-14 13:11:30 Hail: INFO: Coerced sorted dataset
2021-09-14 13:11:55 Hail: INFO: Coerced sorted dataset
2021-09-14 13:11:55 Hail: INFO: Coerced sorted dataset
2021-09-14 13:13:24 Hail: INFO: wrote matrix table with 396 rows and 200643 columns in 3 partitions to file:///opt/notebooks/gogoGPCR/tmp/GCGR.GT_QC.cp.mt
    Total size: 433.30 MiB
    * Rows/entries: 431.70 MiB
    * Columns: 1.60 MiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  377 rows (414.90 MiB)


In [16]:
# Re-load
# Read the "GT_QC" checkpoint back in, e.g. to resume from this stage.
stage = "GT_QC"
checkpoint_path = Path(imps["TMP_DIR"], f"{GENE}.{stage}.cp.mt").resolve()
checkpoint_file = f"file://{checkpoint_path}"

mt = hl.read_matrix_table(checkpoint_file)

In [20]:
# Variant QC
# Thresholds come from the VARIANT_QC section of the config.
variant_qc_conf = conf["VARIANT_QC"]
mt = variant_QC_mt(mt, variant_qc_conf["MIN_P_HWE"], variant_qc_conf["MIN_GQ"])

n_variants = mt.count_rows()
print(f"Variants remaining after QC: {n_variants} ")

Variants remaining after QC: 393 


In [21]:
# Withdrawn
# Keep only samples whose ID does not start with "W".
not_withdrawn = ~mt.s.startswith("W")
mt = mt.filter_cols(not_withdrawn)

n_samples = mt.count_cols()
print(f"Samples remaining after removing withdrawn participants: {n_samples} ")

In [23]:
# Sample QC
# Thresholds come from the SAMPLE_QC section of the config.
sample_qc_conf = conf["SAMPLE_QC"]
mt = sample_QC_mt(
    mt,
    sample_qc_conf["MIN_CALL_RATE"],
    sample_qc_conf["MIN_MEAN_DP"],
    sample_qc_conf["MIN_MEAN_GQ"],
)

n_samples = mt.count_cols()
print(f"Samples remaining after QC: {n_samples} ")

Samples remaining after QC: 200610 


In [33]:
# Related
# REL_FILE is imported as a table keyed by `eid`; the anti-join drops every
# sample (column) whose key appears in that table.
rel_uri = "file://" + conf["SAMPLE_QC"]["REL_FILE"]
rel = hl.import_table(rel_uri, key="eid")
mt = mt.anti_join_cols(rel)

n_samples = mt.count_cols()
print(f"Samples remaining after removing related samples: {n_samples} ")

2021-09-14 13:18:14 Hail: INFO: Reading table without type imputation
  Loading field 'eid' as type str (not specified)


Samples remaining after QC: 195808 


In [37]:
# Hard filters
# SAMP_FILE is imported as a table keyed by `eid`; the semi-join keeps only
# samples (columns) whose key appears in that table.
samp = hl.import_table("file://" + conf["SAMPLE_QC"]["SAMP_FILE"], key = "eid")
mt = mt.semi_join_cols(samp)

# Message fixed: the original printed "after after hard filters".
print(f"Samples remaining after hard filters: {mt.count_cols()} ")

2021-09-14 13:20:24 Hail: INFO: Reading table without type imputation
  Loading field 'eid' as type str (not specified)


Samples remaining after QC: 162483 


In [None]:
# mt2 = hl.variant_qc(mt)
# mt2 = mt2.filter_rows(mt2.is_lauryns)
# mt2.select_rows(mt2.variant_qc, mt2.annotations, mt2.entry_stats_row).rows().show(-1)

In [38]:
# Variant QC again
# Re-run variant QC now that samples have been removed; the (variants, samples)
# counts are printed before and after for comparison.
pprint(mt.count())

variant_qc_conf = conf["VARIANT_QC"]
mt = variant_QC_mt(mt, variant_qc_conf["MIN_P_HWE"], variant_qc_conf["MIN_GQ"])

pprint(mt.count())

(393, 162483)
(297, 162483)


In [41]:
# Write the final, fully filtered MatrixTable to the temporary directory.
stage = "final"
write_file = "file://" + str(Path(imps["TMP_DIR"], f"{GENE}.{stage}.mt").resolve())

# overwrite=True mirrors the checkpoint cells above: without it this cell
# fails on any re-run once the output already exists.
mt.write(write_file, overwrite=True)

2021-09-14 13:28:56 Hail: INFO: wrote matrix table with 297 rows and 162483 columns in 3 partitions to file:///opt/notebooks/gogoGPCR/tmp/GCGR.final.cp.mt
    Total size: 183.36 MiB
    * Rows/entries: 176.79 MiB
    * Columns: 6.57 MiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  282 rows (171.45 MiB)


In [None]:
# Upload the final MatrixTable with the `dx` command-line tool.
# `stage` is the value set in the preceding write cell; check=True raises if
# the upload command exits with a non-zero status.
upload_cmd = [
    "dx",
    "upload",
    Path(imps["TMP_DIR"], f"{GENE}.{stage}.mt"),
    "-r",
    "--path",
    "/data/matrix_tables/",
]
subprocess.run(upload_cmd, check=True, shell=False)