In [1]:
import hail as hl
from pathlib import Path
from datetime import datetime
import pyspark
import dxpy
import pandas as pd
import subprocess
from matrixtables import import_mt
import requests

In [2]:
# Constants
DATABASE = "matrix_tables"
REFERENCE_GENOME = "GRCh38"
PROJ_NAME = "DRD2"

# RAP
VCF_VERSION = "v1"
FIELD_ID = 23157

# Paths
BULK_DIR = Path("/mnt/project/Bulk")
VCF_DIR = Path("Exome sequences/Population level exome OQFE variants, pVCF format - final release")
MISC_DIR = Path("/mnt/project/gogoGPCR2/")

# Remotes
GENCODE_GTF = "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/gencode.v29.annotation.gtf.gz"
PVCF_BLOCKS = "biobank.ndph.ox.ac.uk/ukb/ukb/auxdata/pvcf_blocks.txt"
INTERVAL_FILE = Path("Exome sequences/Exome OQFE CRAM files/helper_files/xgen_plus_spikein.GRCh38.bed")

# Genes
GENES = ["DRD2", "GCGR"]

In [None]:
import os
os.environ["DNAX_INSTANCE_COUNT"]

In [3]:
Path("/tmp").resolve().mkdir(parents=True, exist_ok=True)

LOG_FILE = (
    Path("../hail_logs", f"{PROJ_NAME}_{datetime.now().strftime('%H%M')}.log")
    .resolve()
    .__str__()
)

# Spark init
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)

# Create database in DNAX
spark.sql(f"CREATE DATABASE IF NOT EXISTS {DATABASE} LOCATION 'dnax://'")
mt_database = dxpy.find_one_data_object(name=DATABASE)["id"]


# Hail init
hl.init(sc=sc, default_reference=REFERENCE_GENOME, log=LOG_FILE, tmp_dir=f'dnax://{mt_database}/tmp/')

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/cluster/dnax/jars/dnanexus-api-0.1.0-SNAPSHOT-jar-with-dependencies.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/cluster/spark/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


2023-09-06 09:54:34.264 WARN  NativeCodeLoader:60 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2023-09-06 09:54:35.186 WARN  Utils:69 - Service 'org.apache.spark.network.netty.NettyBlockTransferService' could not bind on port 43000. Attempting port 43001.
2023-09-06 09:54:35.416 WARN  MetricsReporter:84 - No metrics configured for reporting
2023-09-06 09:54:35.418 WARN  LineProtoUsageReporter:48 - Telegraf configurations: url [metrics.push.telegraf.hostport], user [metrics.push.telegraf.user] or password [metrics.push.telegraf.password] missing.
2023-09-06 09:54:35.418 WARN  MetricsReporter:117 - metrics.scraping.httpserver.port


pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/backend/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/backend/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jar

log4j: Parsing for [root] with value=[INFO, logfile].
log4j: Level token is [INFO].
log4j: Category root set to INFO
log4j: Parsing appender named "logfile".
log4j: Parsing layout options for "logfile".
log4j: Setting property [conversionPattern] to [%d{yyyy-MM-dd HH:mm:ss.SSS} %c{1}: %p: %m%n].
log4j: End of parsing for "logfile".
log4j: Setting property [append] to [false].
log4j: Setting property [threshold] to [INFO].
log4j: Setting property [file] to [/opt/notebooks/gogoGPCR2/hail_logs/DRD2_0954.log].
log4j: setFile called: /opt/notebooks/gogoGPCR2/hail_logs/DRD2_0954.log, false
log4j: setFile ended
log4j: Parsed "logfile" options.
log4j: Parsing for [Hail] with value=[INFO, HailSocketAppender].
log4j: Level token is [INFO].
log4j: Category Hail set to INFO
log4j: Parsing appender named "HailSocketAppender".
log4j: Parsed "HailSocketAppender" options.
log4j: Handling log4j.additivity.Hail=[null]
log4j: Finished configuring.


Running on Apache Spark version 3.2.3
SparkUI available at http://ip-10-60-42-192.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.116-cd64e0876c94
LOGGING: writing to /opt/notebooks/gogoGPCR2/hail_logs/DRD2_0954.log


In [4]:
# if not Path("../data/gencode.v29.annotation.gtf.bgz").exists():
#     response = requests.get(GENCODE_GTF)
#     if response.status_code == 200:
#         with open(Path("../data/gencode.v29.annotation.gtf.bgz"), "wb") as file:
#             file.write(response.content)
            
# # Resource https://biobank.ndph.ox.ac.uk/ukb/refer.cgi?id=837
# if not Path("../data/pvcf_blocks.txt").exists():
#     response = requests.get(GENCODE_GTF)
#     if response.status_code == 200:
#         with open(Path("../data/pvcf_blocks.txt"), "wb") as file:
#             file.write(response.content)

In [5]:
# Read in metadata and region
# Map genes to blocks and regions
# The mapping file must contain the columns: HGNC, VCF_block, GRCh38_region, GRCh38_start, GRCh38_end. See GPCR_blocks.tsv for example
# MAPPING_FILE = Path("../data/misc/GPCR_blocks.tsv").resolve()
# mapping = pd.read_csv(MAPPING_FILE, sep="\t").set_index("HGNC", drop=False)
# mapping.loc[GENES, :]

In [6]:
# Get gene intervals
gene_interval = hl.experimental.get_gene_intervals(gene_symbols=GENES, reference_genome='GRCh38', gtf_file = f"file://{MISC_DIR / Path(GENCODE_GTF).with_suffix('.bgz').name}")
gene_interval

2023-09-06 09:54:52.807 Hail: INFO: Reading table without type imputation       
  Loading field 'f0' as type str (not specified)
  Loading field 'f1' as type str (not specified)
  Loading field 'f2' as type str (not specified)
  Loading field 'f3' as type int32 (user-supplied)
  Loading field 'f4' as type int32 (user-supplied)
  Loading field 'f5' as type float64 (user-supplied)
  Loading field 'f6' as type str (not specified)
  Loading field 'f7' as type int32 (user-supplied)
  Loading field 'f8' as type str (not specified)
2023-09-06 09:55:07.865 Hail: INFO: wrote table with 2742017 rows in 12 partitions to /tmp/P3ULsUNhqsAzH1kTQ6z9WB
2023-09-06 09:55:11.577 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-06 09:55:15.234 Hail: INFO: get_gene_intervals found 2 entries:         
gene: DRD2 (ENSG00000149295)
gene: GCGR (ENSG00000215644)


[Interval(start=Locus(contig=chr11, position=113409615, reference_genome=GRCh38), end=Locus(contig=chr11, position=113475691, reference_genome=GRCh38), includes_start=True, includes_end=True),
 Interval(start=Locus(contig=chr17, position=81804132, reference_genome=GRCh38), end=Locus(contig=chr17, position=81814013, reference_genome=GRCh38), includes_start=True, includes_end=True)]

In [7]:
blocks = hl.import_table(f"file://{MISC_DIR / Path(PVCF_BLOCKS).name}" , no_header = True)
blocks = blocks.annotate(f1 = blocks.f1.replace("23", "X").replace("24", "Y"))
# blocks.show()
blocks = blocks.annotate(region = hl.str("").join([hl.str("chr"), blocks.f1]))
blocks = blocks.annotate(interval = hl.locus_interval(blocks.region, hl.int32(blocks.f3), hl.int32(blocks.f4), reference_genome='GRCh38')).key_by("interval")
blocks.show(5) 

2023-09-06 09:55:16.872 Hail: INFO: Reading table without type imputation       
  Loading field 'f0' as type str (not specified)
  Loading field 'f1' as type str (not specified)
  Loading field 'f2' as type str (not specified)
  Loading field 'f3' as type str (not specified)
  Loading field 'f4' as type str (not specified)


f0,f1,f2,f3,f4,region,interval
str,str,str,str,str,str,interval<locus<GRCh38>>
"""1""","""1""","""0""","""1""","""1218130""","""chr1""",[chr1:1-chr1:1218130)
"""2""","""1""","""1""","""1218131""","""1426969""","""chr1""",[chr1:1218131-chr1:1426969)
"""3""","""1""","""2""","""1426970""","""1758871""","""chr1""",[chr1:1426970-chr1:1758871)
"""4""","""1""","""3""","""1758872""","""2514221""","""chr1""",[chr1:1758872-chr1:2514221)
"""5""","""1""","""4""","""2514222""","""3782130""","""chr1""",[chr1:2514222-chr1:3782130)


In [8]:
gene_blocks = [blocks.filter(blocks.interval.overlaps(inter)).annotate(gene = gene) for inter, gene in zip(gene_interval, GENES)]

gb = hl.Table.union(*gene_blocks)

gb.show()

2023-09-06 09:55:19.158 Hail: INFO: Coerced sorted dataset
2023-09-06 09:55:19.669 Hail: INFO: Coerced sorted dataset


f0,f1,f2,f3,f4,region,interval,gene
str,str,str,str,str,str,interval<locus<GRCh38>>,str
"""566""","""11""","""47""","""112186478""","""116126491""","""chr11""",[chr11:112186478-chr11:116126491),"""DRD2"""
"""810""","""17""","""53""","""81251904""","""81915070""","""chr17""",[chr17:81251904-chr17:81915070),"""GCGR"""


In [9]:
vcf_files = [
    f"file://{BULK_DIR / VCF_DIR}/ukb{FIELD_ID}_c{chromosome.replace('chr', '')}_b{block}_{VCF_VERSION}.vcf.gz"
    for block, chromosome in zip(gb.f2.collect(), gb.region.collect())
]

mt = hl.import_vcf(
    vcf_files,
    drop_samples = True,
    reference_genome="GRCh38",
    array_elements_required=False,
    force_bgz = True
)

2023-09-06 09:55:21.944 Hail: INFO: Coerced sorted dataset
2023-09-06 09:55:22.462 Hail: INFO: Coerced sorted dataset
2023-09-06 09:55:23.794 Hail: INFO: Coerced sorted dataset
2023-09-06 09:55:24.312 Hail: INFO: Coerced sorted dataset
                                                                                

In [10]:
# Only genes of interest
mt = mt.filter_rows(hl.is_defined(gb[mt.locus]))

In [19]:
# # Only exome capture region

# interval_table = hl.import_bed(
#     f"file://{BULK_DIR / INTERVAL_FILE}",
#     reference_genome="GRCh38",
# )

# mt = mt.filter_rows(hl.is_defined(interval_table[mt.locus]))

2023-09-06 09:46:01.508 Hail: INFO: Reading table without type imputation       
  Loading field 'f0' as type str (user-supplied)
  Loading field 'f1' as type int32 (user-supplied)
  Loading field 'f2' as type int32 (user-supplied)


In [None]:
stage = "FIRST"
checkpoint_file = f"/tmp/{PROJ_NAME}.{stage}.cp.mt"

mt = mt.checkpoint(checkpoint_file, overwrite=True)

2023-09-06 09:55:56.153 Hail: INFO: Coerced sorted dataset
2023-09-06 09:55:56.646 Hail: INFO: Coerced sorted dataset
2023-09-06 09:55:56.813 Hail: INFO: scanning VCF for sortedness...
2023-09-06 10:00:05.879 Hail: INFO: Coerced sorted VCF - no additional import work to do
2023-09-06 10:00:06.842 Hail: INFO: Coerced sorted dataset                      
2023-09-06 10:00:07.783 Hail: INFO: wrote table with 380 rows in 2 partitions to /tmp/__iruid_12949-t6OREPqPiTLUOTjfKE68tb

In [10]:
# Not with 90% DP<10

PCT10DP_FILE = Path("Exome sequences/Population level exome OQFE variants, PLINK format - final release/helper_files/ukb23158_500k_OQFE.90pct10dp_qc_variants.txt")

pctdp = hl.import_table(f"file://{BULK_DIR / PCT10DP_FILE}", no_header = True)
pctdp = pctdp.annotate(**hl.parse_variant(hl.str("").join([hl.str("chr"), pctdp.f0]), reference_genome = "GRCh38")).key_by("locus", "alleles")
pctdp = pctdp.filter(hl.is_defined(gb[pctdp.locus]))
pctdp.count()

2023-09-06 09:24:06.906 Hail: INFO: Reading table without type imputation       
  Loading field 'f0' as type str (not specified)
2023-09-06 09:24:08.278 Hail: INFO: Reading table without type imputation       
  Loading field 'f0' as type str (not specified)
                                                                                

f0,locus,alleles
str,locus<GRCh38>,array<str>
"""1:69026:T:G""",chr1:69026,"[""T"",""G""]"
"""1:69081:G:C""",chr1:69081,"[""G"",""C""]"
"""1:69095:T:G""",chr1:69095,"[""T"",""G""]"
"""1:69134:A:G""",chr1:69134,"[""A"",""G""]"
"""1:69144:C:T""",chr1:69144,"[""C"",""T""]"
"""1:69149:T:A""",chr1:69149,"[""T"",""A""]"
"""1:69173:A:T""",chr1:69173,"[""A"",""T""]"
"""1:69202:A:G""",chr1:69202,"[""A"",""G""]"
"""1:69217:G:A""",chr1:69217,"[""G"",""A""]"
"""1:69224:A:T""",chr1:69224,"[""A"",""T""]"


In [11]:
# mt = mt.filter_rows(~hl.is_defined(ht2[mt.locus, mt.alleles]))
mt = mt.anti_join_rows(pctdp)

In [None]:
stage = "SECOND"
checkpoint_file = f"/tmp/{PROJ_NAME}.{stage}.cp.mt"

mt = mt.checkpoint(checkpoint_file, overwrite=True)

2023-09-06 09:24:23.577 Hail: INFO: Coerced sorted dataset
2023-09-06 09:24:24.048 Hail: INFO: Coerced sorted dataset
2023-09-06 09:24:29.897 Hail: INFO: Coerced sorted dataset                      
2023-09-06 09:25:27.826 Hail: INFO: Coerced sorted dataset                      
2023-09-06 09:25:28.005 Hail: INFO: scanning VCF for sortedness...
2023-09-06 09:29:19.019 Hail: INFO: Coerced sorted VCF - no additional import work to do
2023-09-06 09:29:19.613 Hail: INFO: Coerced sorted dataset
2023-09-06 09:29:23.618 Hail: INFO: wrote table with 380 rows in 2 partitions to /tmp/__iruid_18098-8LjuMVSKaUW2UAAYHug2ar
2023-09-06 09:29:31.319 Hail: INFO: Coerced sorted dataset                      
2023-09-06 09:29:37.788 Hail: INFO: wrote table with 570 rows in 1 partition to /tmp/__iruid_18513-wEN0Ky9YZYnpZhsRP2RwRo
2023-09-06 09:30:32.781 Hail: INFO: wrote table with 5798366 rows in 1 partition to /tmp/__iruid_18827-iIRi5XvsnqUOH6qiBmn1j4