# Create a MatrixTable and QC the hell out of it
## Import stuff and set your parameters
First, we import necessary libraries and configurations from config.toml. Then we initialise Spark and Hail. 

In [None]:
!pip install toml
!pip install jupytext

In [2]:
# Imports
import toml
from pathlib import Path
from datetime import datetime
from pprint import pprint
import sys
from distutils.version import LooseVersion
import subprocess
import os

import pandas as pd
import pyspark
import dxpy
import dxdata
import hail as hl
    
module_path = Path('..').resolve().__str__()

if module_path not in sys.path:
    sys.path.append(module_path)

try:
    os.mkdir("../tmp")
except:
    pass
    
from src.utils import get_position, lookup_vcfs
from src.matrixtables import *
from src.resources import lauryns_variants


hl.plot.output_notebook()

In [3]:
# Parameters
with open("../config.toml") as f:
    conf = toml.load(f)

IMPORT = conf["IMPORT"]
GENE = conf["GENE"]
VCF_VERSION = IMPORT["VCF_VERSION"]
REFERENCE_GENOME = conf["REFERENCE_GENOME"]
DATABASE = IMPORT["DATABASE"]

LOG_FILE = Path(IMPORT["LOG_DIR"],f"{GENE}_{datetime.now().strftime('%H%M')}.log").resolve().__str__()
MAP_FILE = Path(IMPORT["MAPPING_FILE"]).resolve().__str__()
INT_FILE = Path(IMPORT["INTERVAL_FILE"]).resolve().__str__()

VCF_DIR = Path(IMPORT["VCF_DIR"]).resolve().__str__()

DOWNSAMPLE_P = IMPORT.get("DOWNSAMPLE_P", None)

SNV_ONLY = conf["ANNOTATE"]["SNV_ONLY"]
USE_VEP = conf["ANNOTATE"]["USE_VEP"]
MISSENSE_ONLY = conf["ANNOTATE"]["MISSENSE_ONLY"]

VEP_JSON = Path(conf["ANNOTATE"]["VEP_JSON"]).resolve().__str__()

ANNOTATION_DIR = conf["ANNOTATE"]["ANNOTATION_DIR"]
ANNOTATION_FILE = Path(ANNOTATION_DIR, f"{GENE}.tsv").resolve().__str__()

MIN_DP = conf["ENTRY_QC"]["MIN_DP"]
MIN_GQ = conf["ENTRY_QC"]["MIN_GQ"]
MIN_PL = conf["ENTRY_QC"]["MIN_PL"]

MIN_P_HWE = conf["VARIANT_QC"]["MIN_P_HWE"]
MIN_VAR_GQ = conf["VARIANT_QC"]["MIN_VAR_GQ"]

MIN_CALL_RATE = conf["SAMPLE_QC"]["MIN_CALL_RATE"]
MIN_MEAN_DP = conf["SAMPLE_QC"]["MIN_MEAN_DP"]
MIN_MEAN_GQ = conf["SAMPLE_QC"]["MIN_MEAN_GQ"]

REL_PATH = Path(conf["SAMPLE_QC"]["DATA_DIR"], conf["SAMPLE_QC"]["REL_FILE"]).resolve().__str__()
SAMP_PATH = Path(conf["SAMPLE_QC"]["DATA_DIR"], conf["SAMPLE_QC"]["SAMP_FILE"]).resolve().__str__()

TMP_DIR = conf["EXPORT"]["TMP_DIR"]

BGEN_FILE = Path(TMP_DIR, f"{GENE}").resolve().__str__()
ANNOTATIONS_FILE = Path(TMP_DIR, f"{GENE}.annotations").resolve().__str__()
SETLIST_FILE = Path(TMP_DIR, f"{GENE}.setlist").resolve().__str__()


In [4]:
# Spark and Hail

sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)

try:
    mt_database = dxpy.find_one_data_object(name=DATABASE)["id"]
except DXSearchError:
    spark.sql(f"CREATE DATABASE {DATABASE} LOCATION  'dnax://'")
    mt_database = dxpy.find_one_data_object(name=DATABASE)["id"]

hl.init(sc=sc, default_reference=REFERENCE_GENOME, log=LOG_FILE, tmp_dir=f'dnax://{mt_database}/tmp/')

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 2.4.4
SparkUI available at http://ip-10-60-30-63.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /opt/notebooks/gogoGPCR/hail_logs/DRD2_1347.log


In [5]:
# VCFs
mapping = pd.read_csv(MAP_FILE, sep = "\t").set_index("HGNC", drop=False).loc[GENE,:].to_dict()
vcf_files = lookup_vcfs(mapping = mapping, vcfdir = VCF_DIR, gene = GENE, version = VCF_VERSION)

In [6]:
# Import
mt = import_mt(vcf_files.get("vcfs"), mapping).key_rows_by("locus", "alleles")#.checkpoint(checkpoint_file)

v, s = mt.count()
pprint(f"{v} variants and {s} samples after import")

'582 variants and 200643 samples after import'


In [7]:
# Checkpoint
stage = "raw"
checkpoint_file = f"/tmp/{GENE}.{stage}.cp.mt"

mt = mt.checkpoint(checkpoint_file, overwrite = True)

2021-10-26 13:52:09 Hail: INFO: Coerced sorted dataset
2021-10-26 13:54:08 Hail: INFO: wrote matrix table with 582 rows and 200643 columns in 1 partition to /tmp/DRD2.raw.cp.mt
    Total size: 602.72 MiB
    * Rows/entries: 601.41 MiB
    * Columns: 1.31 MiB
    * Globals: 11.00 B
    * Smallest partition: 582 rows (601.41 MiB)
    * Largest partition:  582 rows (601.41 MiB)


In [8]:
#mt = hl.read_matrix_table(checkpoint_file)

In [9]:
# Downsample
if DOWNSAMPLE_P is not None:
    mt = downsample_mt(mt, DOWNSAMPLE_P)

    pprint(f"{mt.count_cols()} samples after downsampling")

In [10]:
# Interval QC
mt = interval_qc_mt(mt, mapping, "file:" + INT_FILE)

pprint(f"{mt.count_rows()} variants after interval filtering")

2021-10-26 13:54:10 Hail: INFO: Reading table without type imputation
  Loading field 'f0' as type str (user-supplied)
  Loading field 'f1' as type int32 (user-supplied)
  Loading field 'f2' as type int32 (user-supplied)
2021-10-26 13:54:12 Hail: INFO: Coerced sorted dataset


'276 variants after interval filtering'


In [11]:
# Split multi
mt = mt.filter_rows(mt.alleles.length() <= 6)
mt = smart_split_multi_mt(mt)

pprint(f'{mt.count_rows()} variants with not more than 6 alleles after splitting')

2021-10-26 13:54:16 Hail: INFO: Coerced sorted dataset
2021-10-26 13:54:18 Hail: INFO: Coerced sorted dataset
2021-10-26 13:54:19 Hail: INFO: Coerced sorted dataset


'297 variants with not more than 6 alleles after splitting'


In [12]:
if USE_VEP:
    mt = hl.vep(mt, "file:" + VEP_JSON)
    
    is_MANE = mt.aggregate_rows(hl.agg.all(hl.is_defined(mt.vep.transcript_consequences.mane_select)))
    assert is_MANE, "Selected transcript may not be MANE Select. Check manually."
    
    mt = mt.annotate_rows(protCons = mt.vep.transcript_consequences.amino_acids[0].split("/")[0] +
                       hl.str(mt.vep.transcript_consequences.protein_end[0]) +
                       mt.vep.transcript_consequences.amino_acids[0].split("/")[-1])
    

2021-10-26 13:54:22 Hail: INFO: Coerced sorted dataset
2021-10-26 13:54:23 Hail: INFO: Coerced sorted dataset
2021-10-26 13:54:25 Hail: INFO: Coerced sorted dataset
2021-10-26 13:54:26 Hail: INFO: Coerced sorted dataset
2021-10-26 13:54:27 Hail: INFO: Coerced sorted dataset
2021-10-26 13:54:28 Hail: INFO: Coerced sorted dataset


In [16]:
ht = hl.import_table("/tmp/211004_drd2_variants_aggregation.tsv", impute = True).key_by("AA consequence")
ht.show()

mt = mt.annotate_rows(annotations = ht[mt.protCons])

mt = mt.annotate_rows(Gi1 = mt.annotations.number_of_impairments_Gi1 > 0,
                      GoA = mt.annotations.number_of_impairments_GoA > 0,
                      Gz = mt.annotations.number_of_impairments_Gz > 0)

mt = mt.annotate_rows(annotation = hl.case()
                      .when(~mt.Gi1 & ~mt.GoA & ~mt.Gz, "WT")
                      .when(mt.Gi1 & ~mt.GoA & ~mt.Gz, "Gi1")
                      .when(~mt.Gi1 & mt.GoA & ~mt.Gz, "GoA")
                      .when(~mt.Gi1 & ~mt.GoA & mt.Gz, "Gz")
                      .when(mt.Gi1 & mt.GoA & ~mt.Gz, "Gi1_GoA")
                      .when(mt.Gi1 & ~mt.GoA & mt.Gz, "Gi1_Gz")
                      .when(~mt.Gi1 & mt.GoA & mt.Gz, "GoA_Gz")
                      .when(mt.Gi1 & mt.GoA & mt.Gz, "Gi1_GoA_Gz")
                     .or_missing())

2021-10-26 13:57:39 Hail: INFO: Reading table to impute column types
2021-10-26 13:57:39 Hail: INFO: Loading 50 fields. Counts by type:
  str: 41
  int32: 9


Var,Gi1_activation rate_mean_100qm,Gi1_activation rate_mean_10qm,Gi1_activation rate_pval_100qm,Gi1_activation rate_pval_10qm,Gi1_amplitude_mean_100qm,Gi1_amplitude_mean_10qm,Gi1_amplitude_pval_100qm,Gi1_amplitude_pval_10qm,GoA_activation rate_mean_100qm,GoA_activation rate_mean_10qm,GoA_activation rate_pval_100qm,GoA_activation rate_pval_10qm,GoA_amplitude_mean_100qm,GoA_amplitude_mean_10qm,GoA_amplitude_pval_100qm,GoA_amplitude_pval_10qm,Gz_activation rate_mean_100qm,Gz_activation rate_mean_10qm,Gz_activation rate_pval_100qm,Gz_activation rate_pval_10qm,Gz_amplitude_mean_100qm,Gz_amplitude_mean_10qm,Gz_amplitude_pval_100qm,Gz_amplitude_pval_10qm,Labcode,Variant,SurfaceExpression,SurfaceExpression_sem,SurfaceExpression_n,SurfaceExpression_p,TotalExpression,TotalExpression_sem,TotalExpression_n,TotalExpression_p,AA consequence,Transcript Consequence,sequence number for sorting,SequenceNumber,GPCRdb,Segment,WTaa,NMaa,GPCRdb_short,number_of_impairments,number_of_impairments_Gi1,number_of_impairments_GoA,number_of_impairments_Gz,number_of_across_pathways,mean_amplitudes
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,int32,str,str,str,int32,str,str,str,int32,int32,str,str,str,str,str,int32,int32,int32,int32,int32,str
"""A127V 3x45""","""0,597015370283226""","""0,643574119127666""","""0,1727""","""0,9911""","""69,5316220360144""","""63,1300660485471""","""0,0924""","""0,4849""","""1,58441401105866""","""1,80985162070555""","""0,5239""","""0,9911""","""96,103788315811""","""101,594723211329""","""0,9995""","""0,9999""","""0,0233778347710404""","""0,020530791018249""","""0,9521""","""852""","""90,7583896418041""","""88,0322811714467""","""0,9907""","""999""","""DRD2MT123""","""A127V""","""43,3395618160454""","""10,0578815197535""",5,"""0,1592""","""51,7147173783332""","""12,2691063843954""",5,"""0,3719""","""A127V""","""c.380C>T""",127,127,"""3.45x45""","""TM3""","""A""","""V""","""3x45""",0,0,0,0,0,"""84,8584784041587"""
"""A177T ""","""0,910903915387408""","""0,769881694191999""","""0,9985""","""0,9989""","""112,222738456505""","""98,3280001795086""","""0,9998""","""0,9996""","""1,908217378732""","""1,97519010909049""","""0,9645""","""0,9989""","""74,1389680451515""","""65,8438635251985""","""0,9917""","""0,9731""","""0,043365611478242""","""0,0363208119655175""","""0,9999""","""0,9997""","""89,6257054469003""","""89,7260401721463""","""0,9823""","""0,9991""","""DRD2MT26""","""A177T""","""99,7637678729382""","""8,3466360712182""",5,"""0,9995""","""90,2048452624591""","""8,40297787331943""",5,"""0,9999""","""A177T""","""c.529G>A""",177,177,"""""","""ECL2""","""A""","""T""","""""",0,0,0,0,0,"""88,314219304235"""
"""A273V ""","""1,18461745863355""","""1,03470264977696""","""0,9993""","""0,9996""","""83,0794661236873""","""79,6397858275958""","""0,7158""","""991""","""2,50006045958493""","""2,38950668556777""","""0,9994""","""0,9996""","""79,2166969786639""","""94,3691372591675""","""0,9983""","""0,9994""","""0,0485121486190895""","""0,0409948056232525""","""0,9994""","""0,9997""","""101,108286184116""","""91,6896599288242""","""0,9988""","""0,9993""","""DRD2MT59""","""A273V""","""71,4000533739933""","""5,02774873947585""",4,"""0,9982""","""64,6226063570699""","""12,5391044137044""",4,"""0,9784""","""A273V""","""c.818C>T""",273,273,"""""","""ICL3""","""A""","""V""","""""",0,0,0,0,0,"""88,1838387170091"""
"""A29V ""","""1,19648135362991""","""0,687120249512224""","""0,9992""","""0,9982""","""71,9837467989575""","""67,1176980248258""","""0,1488""","""0,6821""","""1,94605727233454""","""2,11776576694169""","""0,9917""","""0,9982""","""61,9164152821397""","""67,6666142497236""","""0,9556""","""978""","""0,0385379042494758""","""0,0356496203232574""","""0,9995""","""0,9997""","""100,074058345824""","""103,838894340631""","""0,9986""","""0,9999""","""DRD2MT11""","""A29V""","""75,2243139899891""","""8,67074687167099""",5,"""0,9985""","""88,2574043404188""","""3,29478826278041""",5,"""0,9997""","""A29V""","""c.86C>T""",29,29,"""""","""N-term""","""A""","""V""","""""",0,0,0,0,0,"""78,7662378403503"""
"""A323T ""","""0,984610451650778""","""0,906886826762193""","""0,9993""","""0,9998""","""70,2952045173225""","""68,7992656005877""","""0,1076""","""0,7661""","""2,34649853196804""","""2,55937454415182""","""0,9999""","""0,9998""","""62,2744980962065""","""59,6989659613797""","""0,9135""","""0,8261""","""0,066574117248078""","""0,0698398162224449""","""0,7705""","""0,0464""","""78,6924810050178""","""76,7326270564095""","""0,7661""","""0,9908""","""DRD2MT64""","""A323T""","""100,618766436387""","""13,4481101309862""",5,"""0,9994""","""90,7852838721204""","""12,2022024724627""",5,"""0,9999""","""A323T""","""c.967G>A""",323,323,"""""","""ICL3""","""A""","""T""","""""",1,0,0,1,1,"""69,4155070394873"""
"""A376T 6x38""","""0,765220828736293""","""0,749014721324006""","""0,8664""","""0,9987""","""54,7135420629383""","""56,8707660632556""","""2""","""0,2432""","""2,22288556787288""","""2,15192661667652""","""0,9982""","""0,9987""","""53,5958585968642""","""46,5928447840417""","""0,6338""","""0,3418""","""0,0456956408810725""","""0,0422461979756721""","""0,9997""","""0,9995""","""93,836389396777""","""82,084580849391""","""998""","""0,9983""","""DRD2MT67""","""A376T""","""77,717697185568""","""10,8371595227965""",5,"""0,9988""","""89,6118853886364""","""10,18054673525""",5,"""0,9999""","""A376T""","""c.1126G>A""",376,376,"""6.38x38""","""TM6""","""A""","""T""","""6x38""",1,1,0,0,1,"""64,6156636255446"""
"""A410T 7x36""","""0,854676162199521""","""0,776877616118393""","""0,9913""","""999""","""71,5785122617385""","""70,5140697127194""","""0,1379""","""0,8447""","""2,04286088540527""","""2,12729650138199""","""0,9921""","""999""","""73,5022000689716""","""59,7527823572074""","""0,9914""","""0,9106""","""0,0402846436161466""","""0,0378018331854902""","""0,9997""","""0,9999""","""108,37767592486""","""117,847298338709""","""0,9993""","""0,9989""","""DRD2MT04""","""A410T""","""101,800228775251""","""5,11256684684974""",4,"""0,9994""","""103,978495969563""","""10,4100399865288""",4,"""0,9991""","""A410T""","""c.1228G>A""",410,410,"""7.37x36""","""TM7""","""A""","""T""","""7x36""",0,0,0,0,0,"""83,595423110701"""
"""A46T 1x44""","""0,894643945182611""","""0,79949765268863""","""0,9983""","""0,9991""","""83,2597632584993""","""81,1730814847358""","""728""","""0,9979""","""2,00113477075791""","""2,19808427981123""","""0,9984""","""0,9991""","""156,394857395663""","""132,396439723518""","""0,8554""","""0,9981""","""0,0333523272211432""","""0,030560051568725""","""0,9988""","""0,9989""","""78,599432635157""","""84,2644377296527""","""0,7622""","""0,9986""","""DRD2MT112""","""A46T""","""49,3122488605849""","""6,57457849052052""",5,"""0,3421""","""55,3602713857521""","""5,29295772603863""",5,"""0,5503""","""A46T""","""c.136G>A""",46,46,"""1.44x44""","""TM1""","""A""","""T""","""1x44""",0,0,0,0,0,"""102,681335371204"""
"""A46V 1x44""","""1,02409206292101""","""0,889096084076659""","""0,9996""","""0,9997""","""93,4731988347069""","""91,2020182563606""","""0,9979""","""0,9991""","""2,42892718776729""","""2,46752412013574""","""0,9997""","""0,9997""","""82,2406162440736""","""97,9405007712161""","""0,9985""","""0,9997""","""0,0408298747239261""","""0,0372066672671375""","""0,9997""","""0,9999""","""98,3760607973331""","""89,9461424365237""","""0,9985""","""0,9991""","""DRD2MT81""","""A46V""","""76,5517175172997""","""12,9306261006175""",5,"""0,9986""","""76,2557966867227""","""6,28155056604264""",5,"""0,9986""","""A46V""","""c.137C>T""",46,46,"""1.44x44""","""TM1""","""A""","""V""","""1x44""",0,0,0,0,0,"""92,1964228900357"""
"""A64V 12x49""","""1,14324024807774""","""0,811946186581954""","""0,9996""","""0,9992""","""68,9230075538355""","""59,6840884859814""","""0,0814""","""0,3389""","""2,515097643418""","""2,21501583850161""","""0,9985""","""0,9992""","""54,7551429905173""","""66,0670646334938""","""0,7822""","""0,9793""","""0,0545743481884731""","""0,0396843067779764""","""0,9986""","""0,9999""","""83,2069883593959""","""73,849225645396""","""923""","""0,9794""","""DRD2MT06""","""A64V""","""114,532704461329""","""9,17954064981459""",5,"""998""","""136,445740677416""","""24,5181556416807""",5,"""0,2117""","""A64V""","""c.191C>T""",64,64,"""12.49x49""","""ICL1""","""A""","""V""","""12x49""",0,0,0,0,0,"""67,7475862781033"""


In [17]:
def show_stats(mt):
    intr = mt.filter_rows((hl.is_defined(mt.annotations)))
    intr = hl.variant_qc(intr)
    intr = intr.rows() #intr = intr.select_rows(intr.variant_qc, intr.protCons, intr.annotations, intr.annotation).rows()
    #intr = intr.annotate(**intr.variant_qc)
    #intr = intr.annotate(**intr.annotations)
    #intr = intr.drop("variant_qc", "gq_stats", "dp_stats", "annotations")
    stats = intr.group_by(intr.annotation).aggregate(n_carriers = hl.agg.sum(intr.variant_qc.n_het),
                                                    n_variants = hl.agg.count())
    
    pprint("Stats before QC:")
    stats.show(-1)



In [18]:
#mt = annotate_mt(mt = mt, gene = GENE, annotations = "file:" + ANNOTATION_FILE)
#
#interesting = mt.filter_rows((hl.is_defined(mt.annotations)) & (hl.agg.any(mt.GT.is_non_ref()))).count_rows()
#pprint(f"{interesting} annotated variants found before QC")

In [19]:
# Checkpoint
stage = "QC1"
checkpoint_file = f"/tmp/{GENE}.{stage}.cp.mt"

mt = mt.checkpoint(checkpoint_file, overwrite = True)
show_stats(mt)

2021-10-26 13:57:47 Hail: INFO: Coerced sorted dataset
2021-10-26 13:57:48 Hail: INFO: Coerced sorted dataset
2021-10-26 13:58:05 Hail: INFO: Coerced sorted dataset
2021-10-26 13:58:06 Hail: INFO: Coerced sorted dataset
2021-10-26 13:58:07 Hail: INFO: Coerced sorted dataset
2021-10-26 13:58:09 Hail: INFO: Coerced sorted dataset
2021-10-26 13:58:10 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-10-26 13:58:10 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-10-26 13:58:11 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-10-26 13:58:56 Hail: INFO: wrote matrix table with 297 rows and 200643 columns in 2 partitions to /tmp/DRD2.QC1.cp.mt
    Total size: 306.38 MiB
    * Rows/entries: 305.07 MiB
    * Columns: 1.31 MiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  297 rows (305.07 MiB)


'Stats before QC:'


2021-10-26 13:59:03 Hail: INFO: Ordering unsorted dataset with network shuffle


annotation,n_carriers,n_variants
str,int64,int64
"""Gi1""",154,18
"""Gi1_GoA_Gz""",4,2
"""Gi1_Gz""",2,2
"""Gz""",6,1
"""WT""",10425,78


In [None]:
# Re-load
#stage = "QC1"
#checkpoint_file = f"/tmp/{GENE}.{stage}.cp.mt"

#mt = hl.read_matrix_table(checkpoint_file)

In [20]:
# Withdrawn
mt = mt.filter_cols(~mt.s.startswith("W"))

print(f"Samples remaining after removing withdrawn participants: {mt.count_cols()} ")

Samples remaining after removing withdrawn participants: 200611 


In [21]:
# Related
rel = hl.import_table("file:" + REL_PATH, key = "eid")
mt = mt.anti_join_cols(rel)

print(f"Samples remaining after removing related samples: {mt.count_cols()} ")

2021-10-26 13:59:30 Hail: INFO: Reading table without type imputation
  Loading field 'eid' as type str (not specified)


Samples remaining after removing related samples: 186490 


In [22]:
# Sample QC
mt = sample_QC_mt(mt, MIN_CALL_RATE, MIN_MEAN_DP, MIN_MEAN_GQ)

print(f"Samples remaining after QC: {mt.count_cols()} ")

Samples remaining after QC: 181265 


In [23]:
# Hard filters
samp = hl.import_table("file:" + SAMP_PATH, key = "eid")
mt = mt.semi_join_cols(samp)

print(f"Samples remaining after after hard filters: {mt.count_cols()} ")

2021-10-26 13:59:46 Hail: INFO: Reading table without type imputation
  Loading field 'eid' as type str (not specified)


Samples remaining after after hard filters: 150347 


In [24]:
# Variant QC
mt = variant_QC_mt(mt, MIN_P_HWE, MIN_VAR_GQ)

interesting = mt.filter_rows((hl.is_defined(mt.annotations)) & (hl.agg.any(mt.GT.is_non_ref()))).count_rows()
print(f"{mt.count_rows()} variants remaining after QC of which {interesting} are annotated")

211 variants remaining after QC of which 76 are annotated


In [25]:
# Genotype GQ
mt = genotype_filter_mt(mt, MIN_DP, MIN_GQ, True)

missing = mt.aggregate_entries(hl.agg.sum(~hl.is_defined(mt.GT)))
pprint(f"{missing} missing or filtered entries after Call QC")

Filtering 0.00% entries out of downstream analysis.
'0 missing or filtered entries after Call QC'


In [26]:
# Checkpoint
stage = "QC2"
checkpoint_file = f"/tmp/{GENE}.{stage}.cp.mt"

mt = mt.checkpoint(checkpoint_file, overwrite = True)
show_stats(mt)

2021-10-26 14:03:51 Hail: INFO: wrote matrix table with 211 rows and 150347 columns in 2 partitions to /tmp/DRD2.QC2.cp.mt
    Total size: 171.36 MiB
    * Rows/entries: 164.09 MiB
    * Columns: 7.28 MiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  211 rows (164.09 MiB)


'Stats before QC:'


2021-10-26 14:03:55 Hail: INFO: Ordering unsorted dataset with network shuffle


annotation,n_carriers,n_variants
str,int64,int64
"""Gi1""",110,14
"""Gi1_GoA_Gz""",2,1
"""Gi1_Gz""",2,2
"""Gz""",5,1
"""WT""",7496,58


In [40]:
# BGEN
write_bgen(mt, "file:" + BGEN_FILE)

FatalError: IllegalArgumentException: RangeEnd < RangeStart: rangeEnd = -1, rangeStart = 0, url http://10.0.3.1:8090/DB/R/database-G5B4q0jJ860jjqP2Jq6pfvF5/tmp/export-bgen-concatenated-pSOhNdh8WnCsgwyZRcOUsT/part-1-149-1-0-a1c15e85-4cc8-b5c7-98c1-49df2ce5806c

Java stack trace:
java.lang.IllegalArgumentException: RangeEnd < RangeStart: rangeEnd = -1, rangeStart = 0, url http://10.0.3.1:8090/DB/R/database-G5B4q0jJ860jjqP2Jq6pfvF5/tmp/export-bgen-concatenated-pSOhNdh8WnCsgwyZRcOUsT/part-1-149-1-0-a1c15e85-4cc8-b5c7-98c1-49df2ce5806c
	at com.google.common.base.Preconditions.checkArgument(Preconditions.java:122)
	at com.dnanexus.hadoop.fs.DNAxFileSystem.createConnection(DNAxFileSystem.java:751)
	at com.dnanexus.hadoop.fs.DNAxFileSystem.open(DNAxFileSystem.java:608)
	at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:769)
	at is.hail.io.fs.HadoopFS.openNoCompression(HadoopFS.scala:83)
	at is.hail.io.fs.FS$class.open(FS.scala:139)
	at is.hail.io.fs.HadoopFS.open(HadoopFS.scala:70)
	at is.hail.io.fs.FS$class.open(FS.scala:148)
	at is.hail.io.fs.HadoopFS.open(HadoopFS.scala:70)
	at is.hail.io.gen.ExportBGEN$$anonfun$1$$anonfun$apply$mcV$sp$1$$anonfun$apply$4.apply(ExportBGEN.scala:377)
	at is.hail.io.gen.ExportBGEN$$anonfun$1$$anonfun$apply$mcV$sp$1$$anonfun$apply$4.apply(ExportBGEN.scala:376)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
	at is.hail.io.gen.ExportBGEN$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ExportBGEN.scala:376)
	at is.hail.io.gen.ExportBGEN$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ExportBGEN.scala:372)
	at is.hail.utils.package$.using(package.scala:618)
	at is.hail.io.gen.ExportBGEN$$anonfun$1.apply$mcV$sp(ExportBGEN.scala:372)
	at is.hail.io.gen.ExportBGEN$$anonfun$1.apply(ExportBGEN.scala:372)
	at is.hail.io.gen.ExportBGEN$$anonfun$1.apply(ExportBGEN.scala:372)
	at is.hail.utils.package$.time(package.scala:158)
	at is.hail.io.gen.ExportBGEN$.apply(ExportBGEN.scala:371)
	at is.hail.expr.ir.MatrixBGENWriter.apply(MatrixWriter.scala:335)
	at is.hail.expr.ir.WrappedMatrixWriter.apply(MatrixWriter.scala:40)
	at is.hail.expr.ir.Interpret$.run(Interpret.scala:825)
	at is.hail.expr.ir.Interpret$.alreadyLowered(Interpret.scala:53)
	at is.hail.expr.ir.InterpretNonCompilable$.interpretAndCoerce$1(InterpretNonCompilable.scala:16)
	at is.hail.expr.ir.InterpretNonCompilable$.is$hail$expr$ir$InterpretNonCompilable$$rewrite$1(InterpretNonCompilable.scala:53)
	at is.hail.expr.ir.InterpretNonCompilable$.apply(InterpretNonCompilable.scala:58)
	at is.hail.expr.ir.lowering.InterpretNonCompilablePass$.transform(LoweringPass.scala:67)
	at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3$$anonfun$1.apply(LoweringPass.scala:15)
	at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3$$anonfun$1.apply(LoweringPass.scala:15)
	at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
	at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3.apply(LoweringPass.scala:15)
	at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3.apply(LoweringPass.scala:13)
	at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
	at is.hail.expr.ir.lowering.LoweringPass$class.apply(LoweringPass.scala:13)
	at is.hail.expr.ir.lowering.InterpretNonCompilablePass$.apply(LoweringPass.scala:62)
	at is.hail.expr.ir.lowering.LoweringPipeline$$anonfun$apply$1.apply(LoweringPipeline.scala:14)
	at is.hail.expr.ir.lowering.LoweringPipeline$$anonfun$apply$1.apply(LoweringPipeline.scala:12)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:35)
	at is.hail.expr.ir.lowering.LoweringPipeline.apply(LoweringPipeline.scala:12)
	at is.hail.expr.ir.CompileAndEvaluate$._apply(CompileAndEvaluate.scala:28)
	at is.hail.backend.spark.SparkBackend.is$hail$backend$spark$SparkBackend$$_execute(SparkBackend.scala:354)
	at is.hail.backend.spark.SparkBackend$$anonfun$execute$1.apply(SparkBackend.scala:338)
	at is.hail.backend.spark.SparkBackend$$anonfun$execute$1.apply(SparkBackend.scala:335)
	at is.hail.expr.ir.ExecuteContext$$anonfun$scoped$1.apply(ExecuteContext.scala:25)
	at is.hail.expr.ir.ExecuteContext$$anonfun$scoped$1.apply(ExecuteContext.scala:23)
	at is.hail.utils.package$.using(package.scala:618)
	at is.hail.annotations.Region$.scoped(Region.scala:18)
	at is.hail.expr.ir.ExecuteContext$.scoped(ExecuteContext.scala:23)
	at is.hail.backend.spark.SparkBackend.withExecuteContext(SparkBackend.scala:247)
	at is.hail.backend.spark.SparkBackend.execute(SparkBackend.scala:335)
	at is.hail.backend.spark.SparkBackend$$anonfun$7.apply(SparkBackend.scala:379)
	at is.hail.backend.spark.SparkBackend$$anonfun$7.apply(SparkBackend.scala:377)
	at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:52)
	at is.hail.backend.spark.SparkBackend.executeJSON(SparkBackend.scala:377)
	at sun.reflect.GeneratedMethodAccessor52.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)



Hail version: 0.2.61-3c86d3ba497a
Error summary: IllegalArgumentException: RangeEnd < RangeStart: rangeEnd = -1, rangeStart = 0, url http://10.0.3.1:8090/DB/R/database-G5B4q0jJ860jjqP2Jq6pfvF5/tmp/export-bgen-concatenated-pSOhNdh8WnCsgwyZRcOUsT/part-1-149-1-0-a1c15e85-4cc8-b5c7-98c1-49df2ce5806c

In [41]:
# ANNOTATIONS

mt = add_varid(mt)

annotations = (
    mt.select_rows(
        varid = mt.varid,
        gene = mt.vep.transcript_consequences.gene_symbol[0],
        annotation = mt.annotation
    )
    .rows()
    .key_by("varid")
    .drop("locus")
    .drop("alleles")
)
annotations.export("file:" + ANNOTATIONS_FILE, header=False)

2021-10-26 14:22:43 Hail: INFO: Coerced sorted dataset
2021-10-26 14:22:54 Hail: INFO: merging 1 files totalling 5.8K...
2021-10-26 14:22:56 Hail: INFO: while writing:
    file:/opt/notebooks/gogoGPCR/tmp/DRD2.annotations
  merge time: 1.036s


In [42]:
# SETLIST
position = mt.aggregate_rows(hl.agg.min(mt.locus.position))
names = mt.varid.collect()
names_str = ",".join(names)

line = f"{mt.vep.transcript_consequences.gene_symbol[0].collect()[0]}\t{mt.locus.contig.collect()[0]}\t{position}\t{names_str}"

with open(SETLIST_FILE, "w") as f:
    f.write(line)

In [None]:
bgen_file = BGEN_FILE + ".bgen"
sample_file = BGEN_FILE + ".sample"

subprocess.run(["dx", "upload", bgen_file, sample_file, ANNOTATIONS_FILE, SETLIST_FILE, "--path", "/data/burden/"], check = True, shell = False)

In [45]:
sample = mt.select_cols(ID_1 = mt.s, ID_2 = mt.s, missing = 0)

In [47]:
sample.cols().show()

2021-10-26 14:25:18 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'
2021-10-26 14:25:19 Hail: INFO: Coerced sorted dataset


s,ID_1,ID_2,missing
str,str,str,int32
"""1000030""","""1000030""","""1000030""",0
"""1000059""","""1000059""","""1000059""",0
"""1000062""","""1000062""","""1000062""",0
"""1000077""","""1000077""","""1000077""",0
"""1000086""","""1000086""","""1000086""",0
"""1000100""","""1000100""","""1000100""",0
"""1000229""","""1000229""","""1000229""",0
"""1000250""","""1000250""","""1000250""",0
"""1000264""","""1000264""","""1000264""",0
"""1000296""","""1000296""","""1000296""",0


In [None]:
#STAGE = "final"
#WRITE_PATH = "dnax://" + mt_database + f"/{GENE}.{STAGE}.mt"

#mt.write(WRITE_PATH, overwrite = True)
show_stats(mt)

#STAGE = "final"
#WRITE_PATH = "dnax://" + mt_database + f"/{GENE}.{STAGE}.mt"

#mt = hl.read_matrix_table(WRITE_PATH)