In [1]:
import hail as hl
from pathlib import Path
from datetime import datetime
import pyspark
import dxpy
import pandas as pd
import subprocess
from matrixtables import import_mt
import requests

In [2]:
# Constants
DATABASE = "matrix_tables"
REFERENCE_GENOME = "GRCh38"
PROJ_NAME = "DRD2"

# RAP
VCF_VERSION = "v1"
FIELD_ID = 23157

# Paths
BULK_DIR = Path("/mnt/project/Bulk")
VCF_DIR = Path(
    "Exome sequences/Population level exome OQFE variants, pVCF format - final release/"
)

# Remotes
GENCODE_GTF = "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/gencode.v29.annotation.gtf.gz"
PVCF_BLOCKS = "biobank.ndph.ox.ac.uk/ukb/ukb/auxdata/pvcf_blocks.txt"

# Genes
GENES = ["DRD2", "GCGR"]

In [3]:
Path("/tmp").resolve().mkdir(parents=True, exist_ok=True)

LOG_FILE = (
    Path("../hail_logs", f"{PROJ_NAME}_{datetime.now().strftime('%H%M')}.log")
    .resolve()
    .__str__()
)

# Hail init
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)

hl.init(sc=sc, default_reference=REFERENCE_GENOME, log=LOG_FILE)

# Create database in DNAX
spark.sql(f"CREATE DATABASE IF NOT EXISTS {DATABASE} LOCATION 'dnax://'")
mt_database = dxpy.find_one_data_object(name=DATABASE)["id"]

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/cluster/dnax/jars/dnanexus-api-0.1.0-SNAPSHOT-jar-with-dependencies.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/cluster/spark/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


2023-09-05 12:30:11.217 WARN  NativeCodeLoader:60 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2023-09-05 12:30:12.147 WARN  Utils:69 - Service 'org.apache.spark.network.netty.NettyBlockTransferService' could not bind on port 43000. Attempting port 43001.
2023-09-05 12:30:12.360 WARN  MetricsReporter:84 - No metrics configured for reporting
2023-09-05 12:30:12.361 WARN  LineProtoUsageReporter:48 - Telegraf configurations: url [metrics.push.telegraf.hostport], user [metrics.push.telegraf.user] or password [metrics.push.telegraf.password] missing.
2023-09-05 12:30:12.362 WARN  MetricsReporter:117 - metrics.scraping.httpserver.port


pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/backend/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/backend/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jar

log4j: Parsing for [root] with value=[INFO, logfile].
log4j: Level token is [INFO].
log4j: Category root set to INFO
log4j: Parsing appender named "logfile".
log4j: Parsing layout options for "logfile".
log4j: Setting property [conversionPattern] to [%d{yyyy-MM-dd HH:mm:ss.SSS} %c{1}: %p: %m%n].
log4j: End of parsing for "logfile".
log4j: Setting property [append] to [false].
log4j: Setting property [threshold] to [INFO].
log4j: Setting property [file] to [/opt/notebooks/gogoGPCR2/hail_logs/DRD2_1230.log].
log4j: setFile called: /opt/notebooks/gogoGPCR2/hail_logs/DRD2_1230.log, false
log4j: setFile ended
log4j: Parsed "logfile" options.
log4j: Parsing for [Hail] with value=[INFO, HailSocketAppender].
log4j: Level token is [INFO].
log4j: Category Hail set to INFO
log4j: Parsing appender named "HailSocketAppender".
log4j: Parsed "HailSocketAppender" options.
log4j: Handling log4j.additivity.Hail=[null]
log4j: Finished configuring.


Running on Apache Spark version 3.2.3
SparkUI available at http://ip-10-60-170-194.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.116-cd64e0876c94
LOGGING: writing to /opt/notebooks/gogoGPCR2/hail_logs/DRD2_1230.log


In [4]:
if not Path("../data/gencode.v29.annotation.gtf.bgz").exists():
    response = requests.get(GENCODE_GTF)
    if response.status_code == 200:
        with open(Path("../data/gencode.v29.annotation.gtf.bgz"), "wb") as file:
            file.write(response.content)

# Resource https://biobank.ndph.ox.ac.uk/ukb/refer.cgi?id=837
if not Path("../data/pvcf_blocks.txt").exists():
    response = requests.get(GENCODE_GTF)
    if response.status_code == 200:
        with open(Path("../data/pvcf_blocks.txt"), "wb") as file:
            file.write(response.content)

In [5]:
# Read in metadata and region
# Map genes to blocks and regions
# The mapping file must contain the columns: HGNC, VCF_block, GRCh38_region, GRCh38_start, GRCh38_end. See GPCR_blocks.tsv for example
# MAPPING_FILE = Path("../data/misc/GPCR_blocks.tsv").resolve()
# mapping = pd.read_csv(MAPPING_FILE, sep="\t").set_index("HGNC", drop=False)
# mapping.loc[GENES, :]

In [6]:
# Get gene intervals
gene_interval = hl.experimental.get_gene_intervals(
    gene_symbols=GENES,
    reference_genome="GRCh38",
    gtf_file="/tmp/gencode.v29.annotation.gtf.bgz",
)
gene_interval

2023-09-05 12:30:26.412 Hail: INFO: Reading table without type imputation       
  Loading field 'f0' as type str (not specified)
  Loading field 'f1' as type str (not specified)
  Loading field 'f2' as type str (not specified)
  Loading field 'f3' as type int32 (user-supplied)
  Loading field 'f4' as type int32 (user-supplied)
  Loading field 'f5' as type float64 (user-supplied)
  Loading field 'f6' as type str (not specified)
  Loading field 'f7' as type int32 (user-supplied)
  Loading field 'f8' as type str (not specified)
2023-09-05 12:30:36.429 Hail: INFO: wrote table with 2742017 rows in 12 partitions to /tmp/XsaTjz6V0sxQs09AJDKMtZ
2023-09-05 12:30:40.113 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-05 12:30:44.556 Hail: INFO: get_gene_intervals found 2 entries:         
gene: DRD2 (ENSG00000149295)
gene: GCGR (ENSG00000215644)


[Interval(start=Locus(contig=chr11, position=113409615, reference_genome=GRCh38), end=Locus(contig=chr11, position=113475691, reference_genome=GRCh38), includes_start=True, includes_end=True),
 Interval(start=Locus(contig=chr17, position=81804132, reference_genome=GRCh38), end=Locus(contig=chr17, position=81814013, reference_genome=GRCh38), includes_start=True, includes_end=True)]

In [7]:
blocks = hl.import_table("/tmp/pvcf_blocks.txt", no_header=True)
blocks = blocks.annotate(f1=blocks.f1.replace("23", "X").replace("24", "Y"))
# blocks.show()
blocks = blocks.annotate(region=hl.str("").join([hl.str("chr"), blocks.f1]))
blocks = blocks.annotate(
    interval=hl.locus_interval(
        blocks.region,
        hl.int32(blocks.f3),
        hl.int32(blocks.f4),
        reference_genome="GRCh38",
    )
).key_by("interval")
blocks.show()

2023-09-05 12:30:45.321 Hail: INFO: Reading table without type imputation
  Loading field 'f0' as type str (not specified)
  Loading field 'f1' as type str (not specified)
  Loading field 'f2' as type str (not specified)
  Loading field 'f3' as type str (not specified)
  Loading field 'f4' as type str (not specified)


f0,f1,f2,f3,f4,region,interval
str,str,str,str,str,str,interval<locus<GRCh38>>
"""1""","""1""","""0""","""1""","""1218130""","""chr1""",[chr1:1-chr1:1218130)
"""2""","""1""","""1""","""1218131""","""1426969""","""chr1""",[chr1:1218131-chr1:1426969)
"""3""","""1""","""2""","""1426970""","""1758871""","""chr1""",[chr1:1426970-chr1:1758871)
"""4""","""1""","""3""","""1758872""","""2514221""","""chr1""",[chr1:1758872-chr1:2514221)
"""5""","""1""","""4""","""2514222""","""3782130""","""chr1""",[chr1:2514222-chr1:3782130)
"""6""","""1""","""5""","""3782131""","""6445172""","""chr1""",[chr1:3782131-chr1:6445172)
"""7""","""1""","""6""","""6445173""","""8357220""","""chr1""",[chr1:6445173-chr1:8357220)
"""8""","""1""","""7""","""8357221""","""10269882""","""chr1""",[chr1:8357221-chr1:10269882)
"""9""","""1""","""8""","""10269883""","""11232962""","""chr1""",[chr1:10269883-chr1:11232962)
"""10""","""1""","""9""","""11232963""","""12243417""","""chr1""",[chr1:11232963-chr1:12243417)


In [8]:
gene_blocks = [
    blocks.filter(blocks.interval.overlaps(inter)).annotate(gene=gene)
    for inter, gene in zip(gene_interval, GENES)
]
if len(gene_blocks) == 1:
    ht = gene_blocks[0]
else:
    ht = hl.Table.union(*gene_blocks)

ht.show()

2023-09-05 12:30:47.085 Hail: INFO: Coerced sorted dataset
2023-09-05 12:30:47.375 Hail: INFO: Coerced sorted dataset


f0,f1,f2,f3,f4,region,interval,gene
str,str,str,str,str,str,interval<locus<GRCh38>>,str
"""566""","""11""","""47""","""112186478""","""116126491""","""chr11""",[chr11:112186478-chr11:116126491),"""DRD2"""
"""810""","""17""","""53""","""81251904""","""81915070""","""chr17""",[chr17:81251904-chr17:81915070),"""GCGR"""


In [9]:
vcf_files = [
    f"file://{BULK_DIR / VCF_DIR}/ukb{FIELD_ID}_c{chromosome.replace('chr', '')}_b{block}_{VCF_VERSION}.vcf.gz"
    for block, chromosome in zip(ht.f2.collect(), ht.region.collect())
]

vcf_files

mts = hl.import_vcf(
    vcf_files, reference_genome="GRCh38", array_elements_required=True, force_bgz=True
)

if len(mts) == 1:
    mt = mts[0]
else:
    mt = hl.MatrixTable.union_rows(*mts)

2023-09-05 12:30:48.904 Hail: INFO: Coerced sorted dataset
2023-09-05 12:30:49.164 Hail: INFO: Coerced sorted dataset
2023-09-05 12:30:50.033 Hail: INFO: Coerced sorted dataset
2023-09-05 12:30:50.253 Hail: INFO: Coerced sorted dataset
                                                                                

In [10]:
# Only genes of interest
mt = mt.filter_rows(hl.is_defined(ht[mt.locus]))

NameError: name 'mt' is not defined

In [None]:
# Only exome capture region

INTERVAL_FILE = Path(
    "Exome sequences/Exome OQFE CRAM files/helper_files/xgen_plus_spikein.GRCh38.bed"
)
# subprocess.run(["hadoop", "fs", "-put", str(INTERVAL_FILE), "/tmp"])

interval_table = hl.import_bed(
    f"file://{BULK_DIR / INTERVAL_FILE}",
    reference_genome="GRCh38",
)

mt = mt.filter_rows(hl.is_defined(interval_table[mt.locus]))

In [None]:
# Not with 90% DP<10

PCT10DP_FILE = Path(
    "Exome sequences/Population level exome OQFE variants, PLINK format - final release/helper_files/ukb23158_500k_OQFE.90pct10dp_qc_variants.txt"
)

ht2 = hl.import_table(f"file://{BULK_DIR / PCT10DP_FILE}", no_header=True)
ht2 = hl.import_table(f"file://{BULK_DIR / PCT10DP_FILE}", no_header=True)
ht2 = ht2.annotate(
    **hl.parse_variant(
        hl.str("").join([hl.str("chr"), ht2.f0]), reference_genome="GRCh38"
    )
).key_by("locus", "alleles")
ht2.show()

In [None]:
mt = mt.filter_rows(~hl.is_defined(ht2[mt.locus, mt.alleles]))
mt.count()