In [1]:
import hail as hl
from pathlib import Path
from datetime import datetime
import pyspark
import dxpy
import pandas as pd
import subprocess
import requests
import os

In [2]:
# Constants
DATABASE = "matrix_tables"
REFERENCE_GENOME = "GRCh38"
PROJ_NAME = "DRD2"

# RAP
VCF_VERSION = "v1"
FIELD_ID = 23157

# Paths
BULK_DIR = Path("/mnt/project/Bulk")
VCF_DIR = Path("Exome sequences/Population level exome OQFE variants, pVCF format - final release")
MISC_DIR = Path("/mnt/project/gogoGPCR2/")

# Remotes
GENCODE_GTF = "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/gencode.v29.annotation.gtf.gz"
PVCF_BLOCKS = "biobank.ndph.ox.ac.uk/ukb/ukb/auxdata/pvcf_blocks.txt"
INTERVAL_FILE = Path("Exome sequences/Exome OQFE CRAM files/helper_files/xgen_plus_spikein.GRCh38.bed")

# Genes
GENES = ["DRD2", ]

In [3]:
Path("/tmp").resolve().mkdir(parents=True, exist_ok=True)

LOG_FILE = (
    Path("../hail_logs", f"{PROJ_NAME}_{datetime.now().strftime('%H%M')}.log")
    .resolve()
    .__str__()
)

# Spark init
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)

# Create database in DNAX
spark.sql(f"CREATE DATABASE IF NOT EXISTS {DATABASE} LOCATION 'dnax://'")
mt_database = dxpy.find_one_data_object(name=DATABASE)["id"]


# Hail init
hl.init(sc=sc, default_reference=REFERENCE_GENOME, log=LOG_FILE)

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/cluster/dnax/jars/dnanexus-api-0.1.0-SNAPSHOT-jar-with-dependencies.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/cluster/spark/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


2023-09-13 11:51:55.200 WARN  NativeCodeLoader:60 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2023-09-13 11:51:56.211 WARN  Utils:69 - Service 'org.apache.spark.network.netty.NettyBlockTransferService' could not bind on port 43000. Attempting port 43001.
2023-09-13 11:51:56.450 WARN  MetricsReporter:84 - No metrics configured for reporting
2023-09-13 11:51:56.452 WARN  LineProtoUsageReporter:48 - Telegraf configurations: url [metrics.push.telegraf.hostport], user [metrics.push.telegraf.user] or password [metrics.push.telegraf.password] missing.
2023-09-13 11:51:56.452 WARN  MetricsReporter:117 - metrics.scraping.httpserver.port
2023-09-13 11:52:00.789 WARN  ShellBasedUnixGroupsMapping:210 - unable to return groups for user yXZGx11fq6yj9Kf8Qb4Qpfq3Qpb2GzpB0vVQ9BF0__project-G7qz02QJj12zJb35569kyYk7
PartialGroupNameException The user name 'yXZGx11fq6yj9Kf8Qb4Qpfq3Qpb2GzpB0vVQ9BF0__project-G7qz02QJj12zJb35569kyYk7' is not found. i

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/backend/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/backend/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jar

log4j: Parsing for [root] with value=[INFO, logfile].
log4j: Level token is [INFO].
log4j: Category root set to INFO
log4j: Parsing appender named "logfile".
log4j: Parsing layout options for "logfile".
log4j: Setting property [conversionPattern] to [%d{yyyy-MM-dd HH:mm:ss.SSS} %c{1}: %p: %m%n].
log4j: End of parsing for "logfile".
log4j: Setting property [append] to [false].
log4j: Setting property [threshold] to [INFO].
log4j: Setting property [file] to [/opt/notebooks/gogoGPCR2/hail_logs/DRD2_1151.log].
log4j: setFile called: /opt/notebooks/gogoGPCR2/hail_logs/DRD2_1151.log, false
log4j: setFile ended
log4j: Parsed "logfile" options.
log4j: Parsing for [Hail] with value=[INFO, HailSocketAppender].
log4j: Level token is [INFO].
log4j: Category Hail set to INFO
log4j: Parsing appender named "HailSocketAppender".
log4j: Parsed "HailSocketAppender" options.
log4j: Handling log4j.additivity.Hail=[null]
log4j: Finished configuring.


Running on Apache Spark version 3.2.3
SparkUI available at http://ip-10-60-22-64.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.116-cd64e0876c94
LOGGING: writing to /opt/notebooks/gogoGPCR2/hail_logs/DRD2_1151.log


In [4]:
# if not Path("../data/gencode.v29.annotation.gtf.bgz").exists():
#     response = requests.get(GENCODE_GTF)
#     if response.status_code == 200:
#         with open(Path("../data/gencode.v29.annotation.gtf.bgz"), "wb") as file:
#             file.write(response.content)
            
# # Resource https://biobank.ndph.ox.ac.uk/ukb/refer.cgi?id=837
# if not Path("../data/pvcf_blocks.txt").exists():
#     response = requests.get(GENCODE_GTF)
#     if response.status_code == 200:
#         with open(Path("../data/pvcf_blocks.txt"), "wb") as file:
#             file.write(response.content)

In [5]:
# Get gene intervals
gene_interval = hl.experimental.get_gene_intervals(gene_symbols=GENES, reference_genome='GRCh38', gtf_file = f"file://{MISC_DIR / Path(GENCODE_GTF).with_suffix('.bgz').name}")
gene_interval

2023-09-13 11:52:11.799 Hail: INFO: Reading table without type imputation       
  Loading field 'f0' as type str (not specified)
  Loading field 'f1' as type str (not specified)
  Loading field 'f2' as type str (not specified)
  Loading field 'f3' as type int32 (user-supplied)
  Loading field 'f4' as type int32 (user-supplied)
  Loading field 'f5' as type float64 (user-supplied)
  Loading field 'f6' as type str (not specified)
  Loading field 'f7' as type int32 (user-supplied)
  Loading field 'f8' as type str (not specified)
2023-09-13 11:52:24.694 Hail: INFO: wrote table with 2742017 rows in 12 partitions to /tmp/PZ80Ps0Wt2vx7INxH6NKHI
2023-09-13 11:52:28.991 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-13 11:52:33.078 Hail: INFO: get_gene_intervals found 1 entries:         
gene: DRD2 (ENSG00000149295)


[Interval(start=Locus(contig=chr11, position=113409615, reference_genome=GRCh38), end=Locus(contig=chr11, position=113475691, reference_genome=GRCh38), includes_start=True, includes_end=True)]

In [6]:
blocks = hl.import_table(f"file://{MISC_DIR / Path(PVCF_BLOCKS).name}" , no_header = True)
blocks = blocks.annotate(f1 = blocks.f1.replace("23", "X").replace("24", "Y"))
# blocks.show()
blocks = blocks.annotate(region = hl.str("").join([hl.str("chr"), blocks.f1]))
blocks = blocks.annotate(interval = hl.locus_interval(blocks.region, hl.int32(blocks.f3), hl.int32(blocks.f4), reference_genome='GRCh38')).key_by("interval")
blocks.show(5) 

2023-09-13 11:52:35.370 Hail: INFO: Reading table without type imputation       
  Loading field 'f0' as type str (not specified)
  Loading field 'f1' as type str (not specified)
  Loading field 'f2' as type str (not specified)
  Loading field 'f3' as type str (not specified)
  Loading field 'f4' as type str (not specified)
                                                                                

f0,f1,f2,f3,f4,region,interval
str,str,str,str,str,str,interval<locus<GRCh38>>
"""1""","""1""","""0""","""1""","""1218130""","""chr1""",[chr1:1-chr1:1218130)
"""2""","""1""","""1""","""1218131""","""1426969""","""chr1""",[chr1:1218131-chr1:1426969)
"""3""","""1""","""2""","""1426970""","""1758871""","""chr1""",[chr1:1426970-chr1:1758871)
"""4""","""1""","""3""","""1758872""","""2514221""","""chr1""",[chr1:1758872-chr1:2514221)
"""5""","""1""","""4""","""2514222""","""3782130""","""chr1""",[chr1:2514222-chr1:3782130)


In [7]:
gb = blocks.filter(hl.any(lambda inter: blocks.interval.overlaps(inter), gene_interval))

gb.show()

f0,f1,f2,f3,f4,region,interval
str,str,str,str,str,str,interval<locus<GRCh38>>
"""566""","""11""","""47""","""112186478""","""116126491""","""chr11""",[chr11:112186478-chr11:116126491)


In [8]:
vcf_files = [
    f"file://{BULK_DIR / VCF_DIR}/ukb{FIELD_ID}_c{chromosome.replace('chr', '')}_b{block}_{VCF_VERSION}.vcf.gz"
    for block, chromosome in zip(gb.f2.collect(), gb.region.collect())
]

# vcf_files = ['file:///mnt/project/Bulk/Exome sequences/Population level exome OQFE variants, pVCF format - final release/ukb23157_c11_b47_v1.vcf.gz',
#              'file:///mnt/project/Bulk/Exome sequences/Population level exome OQFE variants, pVCF format - final release/ukb23157_c17_b53_v1.vcf.gz']

mt = hl.import_vcf(
    vcf_files,
    drop_samples = False,
    reference_genome="GRCh38",
    array_elements_required=False,
    force_bgz = True,
    # n_partitions = 1280,
    # block_size = 1024,
)

2023-09-13 11:52:39.878 Hail: INFO: Coerced sorted dataset
2023-09-13 11:52:41.233 Hail: INFO: Coerced sorted dataset


In [9]:
# Only genes of interest
mt = hl.filter_intervals(mt, gene_interval)


In [10]:
# Only exome capture region

interval_table = hl.import_bed(
    f"file://{BULK_DIR / INTERVAL_FILE}",
    reference_genome="GRCh38",
)

mt = mt.filter_rows(hl.is_defined(interval_table[mt.locus]))

2023-09-13 11:53:19.204 Hail: INFO: Reading table without type imputation       
  Loading field 'f0' as type str (user-supplied)
  Loading field 'f1' as type int32 (user-supplied)
  Loading field 'f2' as type int32 (user-supplied)


In [11]:
# Not with 90% DP<10

PCT10DP_FILE = Path("Exome sequences/Population level exome OQFE variants, PLINK format - final release/helper_files/ukb23158_500k_OQFE.90pct10dp_qc_variants.txt")

pctdp = hl.import_table(f"file://{BULK_DIR / PCT10DP_FILE}", no_header = True)
pctdp = pctdp.annotate(**hl.parse_variant(hl.str("").join([hl.str("chr"), pctdp.f0]), reference_genome = "GRCh38")).key_by("locus", "alleles")
# pctdp = hl.filter_intervals(pctdp, gene_interval)

mt = mt.anti_join_rows(pctdp)

2023-09-13 11:53:27.936 Hail: INFO: Reading table without type imputation       
  Loading field 'f0' as type str (not specified)


In [116]:
stage = "FIRST"
checkpoint_file = f"/tmp/{PROJ_NAME}.{stage}.cp.mt"

mt = hl.read_matrix_table(checkpoint_file)
# mt = mt.checkpoint(checkpoint_file, overwrite=True)

In [117]:
def smart_split_multi_mt(
    mt: hl.matrixtable.MatrixTable, left_aligned=False
) -> hl.matrixtable.MatrixTable:
    """Split multiple alleles into bi-allelic in a clever way
    Parameters
    ----------
    mt : hl.matrixtable.MatrixTable
        MT with non-bi-allelic sites
    left_aligned : bool, optional
        Assume that alleles are left-aligned for faster splitting, by default False
    Returns
    -------
    hl.matrixtable.MatrixTable
        MT with only bi-allelic sites
    """

    mt = mt.key_rows_by("locus", "alleles")

    # Only split relevant alleles as suggested by Hail docs

    bi = mt.filter_rows(hl.len(mt.alleles) == 2)
    bi = bi.annotate_rows(a_index=1, was_split=False)
    multi = mt.filter_rows(hl.len(mt.alleles) > 2)
    split = hl.split_multi_hts(multi, left_aligned=left_aligned)
    mt = split.union_rows(bi)

    return mt

mt = smart_split_multi_mt(mt)


In [119]:
VEP_JSON = "/mnt/project/gogoGPCR2/vep-GRCh38.json"

mt = hl.vep(mt, "file:" + VEP_JSON)

mt = mt.annotate_rows(**mt.vep)
mt = mt.annotate_rows(**mt.transcript_consequences[0])
mt = mt.annotate_rows(
        protCons=mt.amino_acids.split("/")[0]
        + hl.str(mt.protein_end)
        + mt.amino_acids.split("/")[-1],
        varid = hl.variant_str(mt.locus, mt.alleles)
    )
mt = mt.drop("vep", "transcript_consequences", "vep_proc_id")

2023-09-13 13:06:16.475 Hail: INFO: wrote table with 421 rows in 16 partitions to /tmp/persist_TableQG6sf4SfZG


In [120]:
stage = "SECOND"
checkpoint_file = f"/tmp/{PROJ_NAME}.{stage}.cp.mt"

mt = mt.checkpoint(checkpoint_file, overwrite=True)
# mt = hl.read_matrix_table(checkpoint_file)

2023-09-13 13:06:54.419 Hail: INFO: wrote matrix table with 421 rows and 469835 columns in 16 partitions to /tmp/DRD2.SECOND.cp.mt


In [121]:
SAMPLES_TO_REMOVE_FILE = "/tmp/samples_to_remove.tsv"
samples_to_remove = hl.import_table(SAMPLES_TO_REMOVE_FILE).key_by("eid")
samples_to_remove.count()

2023-09-13 13:06:55.099 Hail: INFO: Reading table without type imputation
  Loading field 'eid' as type str (not specified)


59423

In [122]:
mt = mt.anti_join_cols(samples_to_remove)

In [123]:
mt = mt.annotate_entries(AB=(mt.AD[1] / hl.sum(mt.AD)))

filter_condition_ab = (
        (mt.GT.is_hom_ref() & (mt.AB <= 0.1))
        | (mt.GT.is_het() & (mt.AB >= 0.25) & (mt.AB <= 0.75))
        | (mt.GT.is_hom_var() & (mt.AB >= 0.9))
    )

mt = mt.filter_entries(filter_condition_ab)

In [124]:
mt = hl.variant_qc(mt)
mt = mt.filter_rows(mt.variant_qc.gq_stats.mean >= 20)
mt = mt.filter_rows(mt.variant_qc.dp_stats.mean >= 12)
mt = mt.filter_rows(mt.variant_qc.call_rate >= 0.95)
mt = mt.filter_rows(mt.variant_qc.n_non_ref > 0)

In [125]:
stage = "THIRD"
checkpoint_file = f"/tmp/{PROJ_NAME}.{stage}.cp.mt"

mt = mt.checkpoint(checkpoint_file, overwrite=True)
# mt = hl.read_matrix_table(checkpoint_file)

2023-09-13 13:07:15.835 Hail: INFO: wrote matrix table with 360 rows and 413290 columns in 16 partitions to /tmp/DRD2.THIRD.cp.mt


In [127]:
qt = mt.rows()

qt = qt.select(qt.varid, qt.protCons, qt.most_severe_consequence, qt.protein_end, qt.protein_start, qt.amino_acids, qt.gene_id, qt.transcript_id, **qt.variant_qc.flatten())
qt = qt.annotate(AC = qt.AC[1], AF = qt.AF[1], homozygote_count = qt.homozygote_count[1])
qt = qt.key_by().drop("locus", "alleles")

qt.show(5)
qt.export("/tmp/variant_qc.tsv")

varid,protCons,most_severe_consequence,protein_end,protein_start,amino_acids,gene_id,transcript_id,dp_stats.mean,dp_stats.stdev,dp_stats.min,dp_stats.max,gq_stats.mean,gq_stats.stdev,gq_stats.min,gq_stats.max,AC,AF,AN,homozygote_count,call_rate,n_called,n_not_called,n_filtered,n_het,n_non_ref,het_freq_hwe,p_value_hwe,p_value_excess_het
str,str,str,int32,int32,str,str,str,float64,float64,float64,float64,float64,float64,float64,float64,int32,float64,int32,int32,float64,int64,int64,int64,int64,int64,float64,float64,float64
"""chr11:113410731:C:T""","""C443Y""","""missense_variant""",443,443,"""C/Y""","""ENSG00000149295""","""ENST00000362072""",18.8,3.68,12.0,87.0,49.1,1.01,3.0,54.0,1,1.21e-06,826578,0,1.0,413289,0,1,1,1,2.42e-06,0.5,0.5
"""chr11:113410735:G:A""","""H442Y""","""missense_variant""",442,442,"""H/Y""","""ENSG00000149295""","""ENST00000362072""",18.8,3.69,12.0,87.0,49.1,1.01,18.0,54.0,7,8.47e-06,826576,0,1.0,413288,0,2,7,7,1.69e-05,0.5,0.5
"""chr11:113410736:G:A""","""L441L""","""synonymous_variant""",441,441,"""L""","""ENSG00000149295""","""ENST00000362072""",18.8,3.69,12.0,87.0,84.3,16.2,0.0,99.0,16,1.94e-05,826578,0,1.0,413289,0,1,16,16,3.87e-05,0.5,0.5
"""chr11:113410736:G:T""","""L441L""","""synonymous_variant""",441,441,"""L""","""ENSG00000149295""","""ENST00000362072""",18.8,3.69,12.0,87.0,84.3,16.2,0.0,99.0,1,1.21e-06,826578,0,1.0,413289,0,1,1,1,2.42e-06,0.5,0.5
"""chr11:113410754:C:T""","""K435K""","""synonymous_variant""",435,435,"""K""","""ENSG00000149295""","""ENST00000362072""",18.8,4.04,15.0,155.0,49.1,1.03,19.0,54.0,262,0.000317,826578,2,1.0,413289,0,1,258,260,0.000634,0.000416,1.0


2023-09-13 13:07:34.189 Hail: INFO: merging 17 files totalling 90.6K...
2023-09-13 13:07:34.214 Hail: INFO: while writing:
    /tmp/variant_qc.tsv
  merge time: 24.156ms


In [128]:
!hadoop fs -getmerge /tmp/variant_qc.tsv ,,/variant_qc.tsv

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/cluster/hadoop/share/hadoop/common/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/cluster/dnax/jars/dnanexus-api-0.1.0-SNAPSHOT-jar-with-dependencies.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Reload4jLoggerFactory]
2023-09-13 13:07:39,313 WARN metrics.MetricsReporter: Unable to initialize metrics scraping configurations from hive-site.xml. Message:InputStream cannot be null
2023-09-13 13:07:39,432 WARN service.DNAxApiSvc: Using default configurations. Unable to find dnanexus.conf.location=null
2023-09-13 13:07:39,432 INFO service.DNAxApiSvc: apiserver connection-pool config. MaxPoolSize=10, MaxPoolPerRoute=10,MaxWaitTimeout=60000
2023-09-13 13:07:39,432 INFO service.DNAxApiSvc: initializing http connection man

In [129]:
# BGEN
BGEN_FILE = "/tmp/DRD2"
GPs = hl.literal([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]])

mt = mt.annotate_entries(GP=GPs[mt.GT.n_alt_alleles()])

hl.export_bgen(mt=mt, varid=mt.varid, rsid=mt.varid, gp=mt.GP, output="file:" + BGEN_FILE)


                                                                                

In [132]:
# .annotations file
ANNOTATIONS_FILE = "/tmp/DRD2.annotations"

annotations = (
    mt.select_rows(
        varid=mt.varid,
        gene=mt.gene_id,
        annotation=mt.protCons,
    )
    .rows()
    .key_by("varid")
    .drop("locus")
    .drop("alleles")
)

annotations.export("file:" + ANNOTATIONS_FILE, header=False)

2023-09-13 13:10:18.644 Hail: INFO: Coerced sorted dataset                      
2023-09-13 13:10:23.892 Hail: INFO: merging 8 files totalling 14.6K...          
2023-09-13 13:10:23.908 Hail: INFO: while writing:
    file:/tmp/DRD2.annotations
  merge time: 16.292ms


In [136]:
# .setlist file
SETLIST_FILE = "/tmp/DRD2.setlist"
position = mt.aggregate_rows(hl.agg.min(mt.locus.position))
names = mt.varid.collect()
names_str = ",".join(names)

line = f"{mt.gene_id.collect()[0]}\t{mt.locus.contig.collect()[0]}\t{position}\t{names_str}"

with open(SETLIST_FILE, "w") as f:
    f.write(line)


                                                                                

In [None]:
# Dx upload
bgen_file = BGEN_FILE + ".bgen"
sample_file = BGEN_FILE + ".sample"

subprocess.run(
    [
        "dx",
        "upload",
        bgen_file,
        sample_file,
        ANNOTATIONS_FILE,
        SETLIST_FILE,
        "--path",
        "/Data/burden/",
    ],
    check=True,
    shell=False,
)


In [139]:
mt.count()

(360, 413290)