# Create a MatrixTable and QC the hell out of it
## Import stuff and set your parameters
First, we import the necessary libraries and load the configuration from `config.toml`. Then we initialise Spark and Hail.

In [None]:
!pip install toml
!pip install jupytext

In [335]:
# Imports
import toml
from pathlib import Path
from datetime import datetime
from pprint import pprint
import sys
from distutils.version import LooseVersion
import subprocess
import os

import pandas as pd
import pyspark
import dxpy
import dxdata
import hail as hl
    
# Put the parent directory on sys.path so the `src` package imported just
# below can be found when the notebook runs from its own sub-directory.
module_path = str(Path("..").resolve())

if module_path not in sys.path:
    sys.path.append(module_path)

# Create the scratch directory if it is missing. `exist_ok=True` keeps a
# re-run of this cell harmless, while real failures (e.g. a permission error)
# are no longer silently swallowed as they were by a bare `except: pass`.
os.makedirs("../tmp", exist_ok=True)
    
from src.utils import get_position, lookup_vcfs
from src.matrixtables import *
from src.resources import lauryns_variants


hl.plot.output_notebook()

In [359]:
# Parameters
# Everything tunable is read from ../config.toml and unpacked into the
# upper-case constants used by the cells below. File-system locations are
# resolved to absolute path strings.
with open("../config.toml") as config_handle:
    conf = toml.load(config_handle)

GENE = conf["GENE"]
VCF_VERSION = conf["IMPORT"]["VCF_VERSION"]
REFERENCE_GENOME = conf["REFERENCE_GENOME"]

# Log name carries the gene and the hour/minute at which this cell was run.
LOG_FILE = str(
    (Path(conf["IMPORT"]["LOG_DIR"]) / f"{GENE}_{datetime.now().strftime('%H%M')}.log").resolve()
)
MAP_FILE = str(Path(conf["IMPORT"]["MAPPING_FILE"]).resolve())
INT_FILE = str(Path(conf["IMPORT"]["INTERVAL_FILE"]).resolve())

VCF_DIR = str(Path(conf["IMPORT"]["VCF_DIR"]).resolve())

# Optional key: None when the config does not define it.
DOWNSAMPLE_P = conf["IMPORT"].get("DOWNSAMPLE_P")

SNV_ONLY = conf["ANNOTATE"]["SNV_ONLY"]
USE_VEP = conf["ANNOTATE"]["USE_VEP"]
MISSENSE_ONLY = conf["ANNOTATE"]["MISSENSE_ONLY"]

VEP_JSON = str(Path(conf["ANNOTATE"]["VEP_JSON"]).resolve())

# ANNOTATION_FILE (input, <ANNOTATION_DIR>/<GENE>.tsv) is distinct from
# ANNOTATIONS_FILE (output, <TMP_DIR>/<GENE>.annotations) defined further down.
ANNOTATION_DIR = conf["ANNOTATE"]["ANNOTATION_DIR"]
ANNOTATION_FILE = str((Path(ANNOTATION_DIR) / f"{GENE}.tsv").resolve())

# Entry-level thresholds
MIN_DP = conf["ENTRY_QC"]["MIN_DP"]
MIN_GQ = conf["ENTRY_QC"]["MIN_GQ"]
MIN_PL = conf["ENTRY_QC"]["MIN_PL"]

# Variant-level thresholds
MIN_P_HWE = conf["VARIANT_QC"]["MIN_P_HWE"]
MIN_VAR_GQ = conf["VARIANT_QC"]["MIN_VAR_GQ"]

# Sample-level thresholds
MIN_CALL_RATE = conf["SAMPLE_QC"]["MIN_CALL_RATE"]
MIN_MEAN_DP = conf["SAMPLE_QC"]["MIN_MEAN_DP"]
MIN_MEAN_GQ = conf["SAMPLE_QC"]["MIN_MEAN_GQ"]

REL_FILE = conf["SAMPLE_QC"]["REL_FILE"]
SAMP_FILE = conf["SAMPLE_QC"]["SAMP_FILE"]

# Export targets all live under TMP_DIR.
TMP_DIR = conf["EXPORT"]["TMP_DIR"]

BGEN_FILE = str((Path(TMP_DIR) / f"{GENE}").resolve())
ANNOTATIONS_FILE = str((Path(TMP_DIR) / f"{GENE}.annotations").resolve())
SETLIST_FILE = str((Path(TMP_DIR) / f"{GENE}.setlist").resolve())


In [5]:
# Spark and Hail
# Hail is initialised on top of the SparkContext created here, with the
# configured reference genome as default and its log written to LOG_FILE.
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)

hl.init(
    sc=sc,
    default_reference=REFERENCE_GENOME,
    log=LOG_FILE,
)

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 2.4.4
SparkUI available at http://ip-10-60-107-137.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /opt/notebooks/gogoGPCR/hail_logs/GCGR_0914.log


In [6]:
# VCFs
# Select the mapping-table row for GENE (indexed by the HGNC column, which is
# also kept as a regular column) and convert it to a plain dict.
mapping = (
    pd.read_csv(MAP_FILE, sep="\t")
    .set_index("HGNC", drop=False)
    .loc[GENE, :]
    .to_dict()
)
vcf_files = lookup_vcfs(
    mapping=mapping,
    vcfdir=VCF_DIR,
    gene=GENE,
    version=VCF_VERSION,
)

In [7]:
# Import
# Build the MatrixTable with the project's import_mt helper and key its rows
# by (locus, alleles).
mt = import_mt(vcf_files.get("vcfs"), mapping)
mt = mt.key_rows_by("locus", "alleles")

n_variants, n_samples = mt.count()
pprint(f"{n_variants} variants and {n_samples} samples after import")

'855 variants and 200643 samples after import'


In [8]:
# Checkpoint
# Persist the freshly imported table and continue from the written copy;
# an existing checkpoint at the same path is replaced.
stage = "raw"
checkpoint_file = f"/tmp/{GENE}.{stage}.cp.mt"

mt = mt.checkpoint(
    checkpoint_file,
    overwrite=True,
)

2021-09-21 09:19:39 Hail: INFO: Coerced sorted dataset
2021-09-21 09:22:24 Hail: INFO: wrote matrix table with 855 rows and 200643 columns in 1 partition to /tmp/{GENE}.{stage}.cp.mt
    Total size: 1005.43 MiB
    * Rows/entries: 1004.12 MiB
    * Columns: 1.31 MiB
    * Globals: 11.00 B
    * Smallest partition: 855 rows (1004.12 MiB)
    * Largest partition:  855 rows (1004.12 MiB)


In [243]:
#mt = hl.read_matrix_table(checkpoint_file)

In [244]:
# Downsample
# Optional step: runs only when the config supplied DOWNSAMPLE_P.
if DOWNSAMPLE_P is not None:
    mt = downsample_mt(mt, DOWNSAMPLE_P)

    n_samples_kept = mt.count_cols()
    pprint(f"{n_samples_kept} samples after downsampling")

'200643 samples after downsampling'


In [245]:
# Interval QC
# INT_FILE is an absolute path; the "file:" prefix is prepended before it is
# handed to the project's interval_qc_mt helper.
interval_uri = "file:" + INT_FILE
mt = interval_qc_mt(mt, mapping, interval_uri)

n_in_intervals = mt.count_rows()
pprint(f"{n_in_intervals} variants after interval filtering")

2021-09-21 13:40:36 Hail: INFO: Reading table without type imputation
  Loading field 'f0' as type str (user-supplied)
  Loading field 'f1' as type int32 (user-supplied)
  Loading field 'f2' as type int32 (user-supplied)
2021-09-21 13:40:36 Hail: INFO: Coerced sorted dataset


'361 variants after interval filtering'


In [246]:
# Split multi
# Sites listing more than 6 alleles are dropped before the table is passed to
# the project's smart_split_multi_mt helper.
has_at_most_six_alleles = mt.alleles.length() <= 6
mt = mt.filter_rows(has_at_most_six_alleles)
mt = smart_split_multi_mt(mt)

n_after_split = mt.count_rows()
pprint(f'{n_after_split} variants with not more than 6 alleles after splitting')

2021-09-21 13:40:39 Hail: INFO: Coerced sorted dataset
2021-09-21 13:40:40 Hail: INFO: Coerced sorted dataset
2021-09-21 13:40:42 Hail: INFO: Coerced sorted dataset


'396 variants not more than 6 alleles after split'


In [247]:
# SNV only
if SNV_ONLY:
    # Classify on both alleles. Looking only at the ALT allele length would
    # also keep deletions such as AT>A, whose ALT allele is a single base.
    mt = mt.filter_rows(hl.is_snp(mt.alleles[0], mt.alleles[1]))

    pprint(f"{mt.count_rows()} SNVs")

2021-09-21 13:40:43 Hail: INFO: Coerced sorted dataset
2021-09-21 13:40:44 Hail: INFO: Coerced sorted dataset
2021-09-21 13:40:45 Hail: INFO: Coerced sorted dataset


390

In [248]:
# VEP
if USE_VEP:
    # Annotate rows with VEP, configured by the JSON file at VEP_JSON.
    mt = hl.vep(mt, "file:" + VEP_JSON)

    # Later cells read index 0 of transcript_consequences, so verify that this
    # first entry has a defined mane_select value for every row.
    mane_of_first_transcript = mt.vep.transcript_consequences.mane_select[0]
    first_is_MANE = mt.aggregate_rows(
        hl.agg.all(hl.is_defined(mane_of_first_transcript))
    )

    assert first_is_MANE, "Careful! First transcript may not be MANE SELECT. Probably good idea to check manually"
    

2021-09-21 13:40:47 Hail: INFO: Coerced sorted dataset
2021-09-21 13:40:48 Hail: INFO: Coerced sorted dataset
2021-09-21 13:40:49 Hail: INFO: Coerced sorted dataset
2021-09-21 13:40:50 Hail: INFO: Coerced sorted dataset
2021-09-21 13:40:51 Hail: INFO: Coerced sorted dataset
2021-09-21 13:40:52 Hail: INFO: Coerced sorted dataset
2021-09-21 13:40:53 Hail: INFO: Coerced sorted dataset
2021-09-21 13:40:53 Hail: INFO: Coerced sorted dataset


In [254]:
# Missense only
if MISSENSE_ONLY:
    # Filter 1: the overall most severe consequence must be missense.
    most_severe_is_missense = mt.vep.most_severe_consequence == "missense_variant"
    mt = mt.filter_rows(most_severe_is_missense)

    # Filter 2: the first consequence term of the first transcript must be
    # missense as well.
    first_term = mt.vep.transcript_consequences.consequence_terms[0][0]
    mt = mt.filter_rows(first_term == "missense_variant")

    # protCons = <first amino acid><protein_end><second amino acid>, taken
    # from the "x/y" amino_acids string of the first transcript.
    consequences = mt.vep.transcript_consequences
    amino_acid_pair = consequences.amino_acids[0].split("/")
    mt = mt.annotate_rows(
        protCons=amino_acid_pair[0]
        + hl.str(consequences.protein_end[0])
        + amino_acid_pair[1]
    )

    pprint(f"{mt.count_rows()} missense variant")

2021-09-21 13:42:16 Hail: INFO: Coerced sorted dataset
2021-09-21 13:42:17 Hail: INFO: Coerced sorted dataset
2021-09-21 13:42:17 Hail: INFO: Coerced sorted dataset
2021-09-21 13:42:18 Hail: INFO: Coerced sorted dataset


230

In [307]:
# Annotate
annotation_uri = "file:" + ANNOTATION_FILE
mt = annotate_mt(mt=mt, gene=GENE, annotations=annotation_uri)

# Count rows that have an annotation and at least one non-reference call.
# NOTE(review): this cell reads `mt.annotations`, whereas the later QC cells
# read `mt.anno` — confirm which row field annotate_mt actually adds.
has_annotation = hl.is_defined(mt.annotations)
has_carrier = hl.agg.any(mt.GT.is_non_ref())
interesting = mt.filter_rows(has_annotation & has_carrier).count_rows()
pprint(f"{interesting} annotated variants found before QC")

AttributeError: module 'src.annotations' has no attribute 'annotate_GCGR'

In [308]:
# Genotype GQ
# Entry-level filtering through the project's genotype_filter_mt helper.
# NOTE(review): the meaning of the trailing positional `True` is defined by
# that helper and is not visible here.
mt = genotype_filter_mt(mt, MIN_DP, MIN_GQ, MIN_PL, True)

# Count entries whose GT is not defined after filtering.
gt_is_undefined = ~hl.is_defined(mt.GT)
missing = mt.aggregate_entries(hl.agg.sum(gt_is_undefined))
pprint(f"{missing} missing or filtered entries after Call QC")

'21 missing or filtered entries after Call QC'


In [272]:
# Checkpoint
# Persist the table after entry-level QC; an existing checkpoint at the same
# path is replaced.
stage = "QC1"
checkpoint_file = f"/tmp/{GENE}.{stage}.cp.mt"

mt = mt.checkpoint(
    checkpoint_file,
    overwrite=True,
)

2021-09-21 14:49:42 Hail: INFO: Coerced sorted dataset
2021-09-21 14:49:43 Hail: INFO: Coerced sorted dataset
2021-09-21 14:50:08 Hail: INFO: Coerced sorted dataset
2021-09-21 14:50:09 Hail: INFO: Coerced sorted dataset
2021-09-21 14:51:34 Hail: INFO: Coerced sorted dataset
2021-09-21 14:51:35 Hail: INFO: Coerced sorted dataset
2021-09-21 14:51:36 Hail: INFO: Coerced sorted dataset
2021-09-21 14:51:36 Hail: INFO: Coerced sorted dataset
2021-09-21 14:51:39 Hail: INFO: Coerced sorted dataset
2021-09-21 14:51:40 Hail: INFO: Coerced sorted dataset
2021-09-21 14:52:05 Hail: INFO: Coerced sorted dataset
2021-09-21 14:52:05 Hail: INFO: Coerced sorted dataset
2021-09-21 14:52:06 Hail: INFO: Coerced sorted dataset
2021-09-21 14:52:07 Hail: INFO: Coerced sorted dataset
2021-09-21 14:52:08 Hail: INFO: Coerced sorted dataset
2021-09-21 14:52:09 Hail: INFO: Coerced sorted dataset
2021-09-21 14:52:10 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-09-21 14:52:10 Hail: INFO: Ordering 

In [14]:
# Re-load
# Resume from the "QC1" checkpoint without re-running the cells above.
stage = "QC1"
checkpoint_file = f"/tmp/{GENE}.{stage}.cp.mt"

mt = hl.read_matrix_table(
    checkpoint_file,
)

In [309]:
# Variant QC
mt = variant_QC_mt(mt, MIN_P_HWE, MIN_VAR_GQ)

# Annotated rows that still have at least one non-reference call.
annotated_with_carrier = hl.is_defined(mt.anno) & hl.agg.any(mt.GT.is_non_ref())
interesting = mt.filter_rows(annotated_with_carrier).count_rows()
n_variants_left = mt.count_rows()
print(f"{n_variants_left} variants remaining after QC of which {interesting} are annotated")

174 variants remaining after QC of which 23 are annotated


In [279]:
# Withdrawn
# Drop every column whose sample ID starts with "W".
id_starts_with_w = mt.s.startswith("W")
mt = mt.filter_cols(~id_starts_with_w)

n_samples_left = mt.count_cols()
print(f"Samples remaining after removing withdrawn participants: {n_samples_left} ")

Samples remaining after removing withdrawn participants: 200611 


In [280]:
# Sample QC
# Sample-level filtering through the project's sample_QC_mt helper, using the
# call-rate, mean-DP and mean-GQ thresholds from the config.
mt = sample_QC_mt(
    mt,
    MIN_CALL_RATE,
    MIN_MEAN_DP,
    MIN_MEAN_GQ,
)

n_samples_left = mt.count_cols()
print(f"Samples remaining after QC: {n_samples_left} ")

Samples remaining after QC: 200597 


In [281]:
# Related
# REL_FILE is read as a table keyed by "eid"; the anti-join keeps only the
# columns whose key does NOT appear in that table.
rel = hl.import_table("file:" + REL_FILE, key="eid")
mt = mt.anti_join_cols(rel)

n_samples_left = mt.count_cols()
print(f"Samples remaining after removing related samples: {n_samples_left} ")

2021-09-21 14:57:49 Hail: INFO: Reading table without type imputation
  Loading field 'eid' as type str (not specified)


Samples remaining after removing related samples: 195796 


In [283]:
# Hard filters
# SAMP_FILE is read as a table keyed by "eid"; the semi-join keeps only the
# columns whose key appears in that table.
samp = hl.import_table("file:" + SAMP_FILE, key = "eid")
mt = mt.semi_join_cols(samp)

# Message fixed: the original printed "after after hard filters".
print(f"Samples remaining after hard filters: {mt.count_cols()} ")

2021-09-21 14:59:33 Hail: INFO: Reading table without type imputation
  Loading field 'eid' as type str (not specified)


Samples remaining after after hard filters: 162476 


In [297]:
# Variant QC again

# Recompute variant_qc on the remaining samples, then keep only rows whose
# first AF entry lies strictly between 0 and 1.
mt = hl.variant_qc(mt)
first_af = mt.variant_qc.AF[0]
mt = mt.filter_rows((first_af > 0.0) & (first_af < 1.0))

# Annotated rows that still have at least one non-reference call.
annotated_with_carrier = hl.is_defined(mt.anno) & hl.agg.any(mt.GT.is_non_ref())
interesting = mt.filter_rows(annotated_with_carrier).count_rows()
n_variants_left = mt.count_rows()
print(f"{n_variants_left} variants remaining after QC of which {interesting} are annotated")

'174 variants left after all QC'


In [310]:
# Write the fully QC'd table. Unlike the checkpoint cells, no overwrite flag
# is passed here.
stage = "final"
write_file = f"/tmp/{GENE}.{stage}.mt"

mt.write(
    write_file,
)

2021-09-21 15:40:25 Hail: INFO: wrote matrix table with 174 rows and 162476 columns in 3 partitions to /tmp/GCGR.final.mt
    Total size: 110.02 MiB
    * Rows/entries: 104.11 MiB
    * Columns: 5.91 MiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  168 rows (101.97 MiB)


In [321]:
# Copy the written matrix table into TMP_DIR with `hadoop fs -get`;
# check=True raises if the command exits non-zero.
local_tmp_dir = Path(TMP_DIR).resolve()
subprocess.run(["hadoop", "fs", "-get", write_file, local_tmp_dir], check=True, shell=False)

CompletedProcess(args=['hadoop', 'fs', '-get', '/tmp/GCGR.final.mt', PosixPath('/opt/notebooks/gogoGPCR/tmp')], returncode=0)

In [322]:
# Recursively upload the local copy of the matrix table to
# /data/matrix_tables/ with the dx CLI; check=True raises on a non-zero exit.
local_mt_dir = Path(TMP_DIR, f"{GENE}.{stage}.mt").resolve()
subprocess.run(["dx", "upload", local_mt_dir, "-r", "--path", "/data/matrix_tables/"], check=True, shell=False)

CompletedProcess(args=['dx', 'upload', PosixPath('/opt/notebooks/gogoGPCR/tmp/GCGR.final.mt'), '-r', '--path', '/data/matrix_tables/'], returncode=0)

In [326]:
# Display the variant_qc and anno row fields for every remaining variant.
qc_and_anno_rows = mt.select_rows(mt.variant_qc, mt.anno).rows()
qc_and_anno_rows.show(-1)

Unnamed: 0_level_0,Unnamed: 1_level_0,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,Unnamed: 22_level_0
Unnamed: 0_level_1,Unnamed: 1_level_1,dp_stats,dp_stats,dp_stats,dp_stats,gq_stats,gq_stats,gq_stats,gq_stats,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,anno
locus,alleles,mean,stdev,min,max,mean,stdev,min,max,AC,AF,AN,homozygote_count,call_rate,n_called,n_not_called,n_filtered,n_het,n_non_ref,het_freq_hwe,p_value_hwe,Annotation
locus<GRCh38>,array<str>,float64,float64,float64,float64,float64,float64,float64,float64,array<int32>,array<float64>,int32,array<int32>,float64,int64,int64,int64,int64,int64,float64,float64,str
chr17:81809032,"[""A"",""C""]",20.9,5.35,16.0,123.0,49.3,0.951,40.0,54.0,"[324951,1]","[1.00e+00,3.08e-06]",324952,"[162475,0]",1.0,162476,0,0,1,1,6.15e-06,0.5,
chr17:81809034,"[""C"",""A""]",20.9,5.35,16.0,85.0,49.3,0.952,34.0,54.0,"[324949,3]","[1.00e+00,9.23e-06]",324952,"[162473,0]",1.0,162476,0,0,3,3,1.85e-05,0.5,
chr17:81809041,"[""G"",""A""]",20.9,5.39,16.0,113.0,49.3,0.956,30.0,54.0,"[324932,20]","[1.00e+00,6.15e-05]",324952,"[162456,0]",1.0,162476,0,0,20,20,0.000123,0.5,"""WT"""
chr17:81809044,"[""C"",""T""]",20.9,5.35,16.0,105.0,49.3,0.951,37.0,54.0,"[324951,1]","[1.00e+00,3.08e-06]",324952,"[162475,0]",1.0,162476,0,0,1,1,6.15e-06,0.5,
chr17:81809068,"[""T"",""C""]",21.0,5.38,16.0,159.0,49.3,0.946,36.0,54.0,"[324945,7]","[1.00e+00,2.15e-05]",324952,"[162469,0]",1.0,162476,0,0,7,7,4.31e-05,0.5,
chr17:81809071,"[""C"",""A""]",21.0,5.37,16.0,159.0,49.3,0.946,40.0,54.0,"[324951,1]","[1.00e+00,3.08e-06]",324952,"[162475,0]",1.0,162476,0,0,1,1,6.15e-06,0.5,
chr17:81809075,"[""C"",""G""]",21.0,5.44,16.0,159.0,49.3,0.967,23.0,54.0,"[324920,32]","[1.00e+00,9.85e-05]",324952,"[162444,0]",1.0,162476,0,0,32,32,0.000197,0.501,
chr17:81809792,"[""C"",""A""]",17.5,2.67,13.0,69.0,60.9,13.3,0.0,99.0,"[324928,2]","[1.00e+00,6.16e-06]",324930,"[162463,0]",1.0,162465,1,10,2,2,1.23e-05,0.5,
chr17:81809792,"[""C"",""T""]",17.5,2.67,13.0,69.0,60.9,13.3,0.0,99.0,"[324929,1]","[1.00e+00,3.08e-06]",324930,"[162464,0]",1.0,162465,1,10,1,1,6.16e-06,0.5,
chr17:81809806,"[""A"",""G""]",17.5,2.65,14.0,45.0,48.7,0.947,21.0,50.0,"[324945,1]","[1.00e+00,3.08e-06]",324946,"[162472,0]",1.0,162473,0,3,1,1,6.15e-06,0.5,


In [None]:
# Re-load the final matrix table from the path it was written to.
stage = "final"
write_file = f"/tmp/{GENE}.{stage}.mt"

mt = hl.read_matrix_table(
    write_file,
)

In [336]:
# REMEMBER TO DELETE CELL

def recode_GT_to_GP(
    mt: hl.matrixtable.MatrixTable,
) -> hl.matrixtable.MatrixTable:
    """Add a ``GP`` entry field derived from the hard call in ``GT``.

    ``GP`` is a three-element one-hot array selected by the number of
    alternate alleles in the call: 0 -> [1, 0, 0], 1 -> [0, 1, 0],
    2 -> [0, 0, 1].

    Args:
        mt: MatrixTable with a ``GT`` entry field.

    Returns:
        The MatrixTable with the additional ``GP`` entry field.
    """
    one_hot_by_alt_count = hl.literal(
        [
            [1.0, 0.0, 0.0],
            [0.0, 1.0, 0.0],
            [0.0, 0.0, 1.0],
        ]
    )
    return mt.annotate_entries(GP=one_hot_by_alt_count[mt.GT.n_alt_alleles()])


def write_bgen(mt: hl.matrixtable.MatrixTable, output: str) -> None:
    """Export ``mt`` to BGEN under ``output``.

    The table first gets a ``varid`` field (via ``add_varid``) and a ``GP``
    entry field (via ``recode_GT_to_GP``); ``varid`` is written as both the
    variant ID and the rsID.

    Args:
        mt: MatrixTable to export.
        output: Output location passed straight to ``hl.export_bgen``.
    """
    prepared = recode_GT_to_GP(add_varid(mt))

    hl.export_bgen(
        mt=prepared,
        varid=prepared.varid,
        rsid=prepared.varid,
        gp=prepared.GP,
        output=output,
    )

In [340]:
# BGEN
# Export the final table to BGEN at BGEN_FILE, given as a "file:" URI.
bgen_uri = "file:" + BGEN_FILE
write_bgen(mt, bgen_uri)

2021-09-21 16:21:57 Hail: INFO: while writing:
    file:/opt/notebooks/gogoGPCR/tmp/GCGR.bgen
  merge time: 137.636ms


In [357]:
# ANNOTATIONS

mt = add_varid(mt)

# One row per variant: varid, gene symbol of the first transcript, annotation.
# NOTE(review): this cell reads `mt.annotation`, whereas the QC cells read
# `mt.anno` — confirm the row field name.
per_variant = mt.select_rows(
    varid=mt.varid,
    gene=mt.vep.transcript_consequences.gene_symbol[0],
    annotation=mt.annotation,
).rows()

# Re-key by varid so the original key fields can be dropped, then export
# without a header line.
annotations = per_variant.key_by("varid").drop("locus", "alleles")
annotations.export("file:" + ANNOTATIONS_FILE, header=False)

2021-09-21 16:32:15 Hail: INFO: Coerced sorted dataset
2021-09-21 16:32:24 Hail: INFO: merging 2 files totalling 4.7K...
2021-09-21 16:32:24 Hail: INFO: while writing:
    file:/opt/notebooks/gogoGPCR/tmp/GCGR.annotations
  merge time: 16.836ms


In [360]:
# SETLIST
# A single tab-separated line: gene symbol, contig, smallest position, and a
# comma-separated list of all variant IDs.
position = mt.aggregate_rows(hl.agg.min(mt.locus.position))
names = mt.varid.collect()
names_str = ",".join(names)

# Gene symbol and contig are taken from the first collected row.
gene_symbol = mt.vep.transcript_consequences.gene_symbol[0].collect()[0]
contig = mt.locus.contig.collect()[0]
line = f"{gene_symbol}\t{contig}\t{position}\t{names_str}"

with open(SETLIST_FILE, "w") as setlist_handle:
    setlist_handle.write(line)

In [361]:
# Upload the BGEN/sample pair plus the annotation and setlist files to
# /data/burden/ with the dx CLI; check=True raises on a non-zero exit.
bgen_file = BGEN_FILE + ".bgen"
sample_file = BGEN_FILE + ".sample"

upload_command = [
    "dx",
    "upload",
    bgen_file,
    sample_file,
    ANNOTATIONS_FILE,
    SETLIST_FILE,
    "--path",
    "/data/burden/",
]
subprocess.run(upload_command, check=True, shell=False)

CompletedProcess(args=['dx', 'upload', '/opt/notebooks/gogoGPCR/tmp/GCGR.bgen', '/opt/notebooks/gogoGPCR/tmp/GCGR.sample', '/opt/notebooks/gogoGPCR/tmp/GCGR.annotations', '/opt/notebooks/gogoGPCR/tmp/GCGR.setlist', '--path', '/data/burden/'], returncode=0)