In [1]:
import hail as hl
from pathlib import Path
from datetime import datetime
import pyspark
import dxpy
import pandas as pd
import subprocess

In [2]:
# Spark and Hail
# @Peter this can probably stay as is
#
# Session setup: project constants, Hail log location, Spark + Hail
# initialisation, and the DNAX database that backs the stored matrix tables.

DATABASE = "matrix_tables"
REFERENCE_GENOME = "GRCh38"
PROJ_NAME = "DRD2"

Path("/tmp").resolve().mkdir(parents=True, exist_ok=True)

# Hail log file: ../hail_logs/<PROJ_NAME>_<HHMM>.log as an absolute path string.
# The directory is created up front so logging does not depend on it already
# being present in a fresh checkout.
# NOTE(review): the stamp carries no date, so sessions started at the same
# HH:MM on different days share a file name.
LOG_DIR = Path("../hail_logs").resolve()
LOG_DIR.mkdir(parents=True, exist_ok=True)
LOG_FILE = str(LOG_DIR / f"{PROJ_NAME}_{datetime.now().strftime('%H%M')}.log")

# Hail init
# getOrCreate() reuses a live SparkContext, so re-running this cell in the same
# kernel does not raise "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession(sc)

# idempotent=True makes a repeated hl.init() a no-op instead of an error.
hl.init(
    sc=sc,
    default_reference=REFERENCE_GENOME,
    log=LOG_FILE,
    idempotent=True,
)

# Create database in DNAX
spark.sql(f"CREATE DATABASE IF NOT EXISTS {DATABASE} LOCATION 'dnax://'")

# Restrict the lookup to database objects so that a file or record that happens
# to carry the same name cannot be returned in place of the database.
mt_database = dxpy.find_one_data_object(name=DATABASE, classname="database")["id"]

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/cluster/dnax/jars/dnanexus-api-0.1.0-SNAPSHOT-jar-with-dependencies.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/cluster/spark/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


2023-09-05 07:46:13.074 WARN  NativeCodeLoader:60 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2023-09-05 07:46:14.362 WARN  Utils:69 - Service 'org.apache.spark.network.netty.NettyBlockTransferService' could not bind on port 43000. Attempting port 43001.
2023-09-05 07:46:14.731 WARN  MetricsReporter:84 - No metrics configured for reporting
2023-09-05 07:46:14.732 WARN  LineProtoUsageReporter:48 - Telegraf configurations: url [metrics.push.telegraf.hostport], user [metrics.push.telegraf.user] or password [metrics.push.telegraf.password] missing.
2023-09-05 07:46:14.732 WARN  MetricsReporter:117 - metrics.scraping.httpserver.port


pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/backend/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/backend/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jar

log4j: Parsing for [root] with value=[INFO, logfile].
log4j: Level token is [INFO].
log4j: Category root set to INFO
log4j: Parsing appender named "logfile".
log4j: Parsing layout options for "logfile".
log4j: Setting property [conversionPattern] to [%d{yyyy-MM-dd HH:mm:ss.SSS} %c{1}: %p: %m%n].
log4j: End of parsing for "logfile".
log4j: Setting property [append] to [false].
log4j: Setting property [threshold] to [INFO].
log4j: Setting property [file] to [/opt/notebooks/gogoGPCR2/hail_logs/DRD2_0746.log].
log4j: setFile called: /opt/notebooks/gogoGPCR2/hail_logs/DRD2_0746.log, false
log4j: setFile ended
log4j: Parsed "logfile" options.
log4j: Parsing for [Hail] with value=[INFO, HailSocketAppender].
log4j: Level token is [INFO].
log4j: Category Hail set to INFO
log4j: Parsing appender named "HailSocketAppender".
log4j: Parsed "HailSocketAppender" options.
log4j: Handling log4j.additivity.Hail=[null]
log4j: Finished configuring.


Running on Apache Spark version 3.2.3
SparkUI available at http://ip-10-60-170-194.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.116-cd64e0876c94
LOGGING: writing to /opt/notebooks/gogoGPCR2/hail_logs/DRD2_0746.log


In [20]:
# Load the gene metadata table (tab-separated) and key it by HGNC symbol so
# that genes can be looked up by name; the HGNC column is kept as a regular
# column as well. The final expression displays the rows for the genes of
# interest.
GENES = ["DRD2"]
MAPPING_FILE = Path("../data/misc/mappings_with_blocks.tsv").resolve()

mapping_raw = pd.read_table(MAPPING_FILE)
mapping = mapping_raw.set_index("HGNC", drop=False)
mapping.loc[GENES]

Unnamed: 0_level_0,HGNC,entry_name,name,accession,family,species,residue_numbering_scheme,sequence,genes,ENSG,...,type,GRCh37_start,GRCh37_end,GRCh37_strand,GRCh38_start,GRCh38_end,GRCh38_strand,GRCh38_region,Notes,VCF_block
HGNC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DRD2,DRD2,drd2_human,D<sub>2</sub> receptor,P14416,001_001_004_002,Homo sapiens,GPCRdb(A),MDPLNLSWYDDDLERQNWSRPFNGSDGKADRPHYNYYATLLTLLIA...,['DRD2'],ENSG00000149295,...,GPCR,113280318.0,113346413.0,-1.0,113409605,113475691,-1,11,,47


In [56]:
# Copy the interval BED file from the local notebook filesystem to /tmp on the
# Hadoop filesystem, then load it from there as a Hail interval table.
INTERVAL_FILE = Path("../data/misc/xgen_plus_spikein.b38.bed").resolve()

# -f overwrites a copy left behind by an earlier run (a plain -put refuses to
# replace an existing target); check=True raises CalledProcessError on a failed
# upload instead of letting import_bed run against a missing or stale file.
subprocess.run(
    ["hadoop", "fs", "-put", "-f", str(INTERVAL_FILE), "/tmp"],
    check=True,
)

interval_table = hl.import_bed(
    f"/tmp/{INTERVAL_FILE.name}",
    reference_genome="GRCh38",
)

# mt = mt.filter_rows(hl.is_defined(interval_table[mt.locus]))

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/cluster/hadoop/share/hadoop/common/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/cluster/dnax/jars/dnanexus-api-0.1.0-SNAPSHOT-jar-with-dependencies.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Reload4jLoggerFactory]
2023-09-01 13:06:48,956 WARN metrics.MetricsReporter: Unable to initialize metrics scraping configurations from hive-site.xml. Message:InputStream cannot be null
2023-09-01 13:06:49,062 WARN service.DNAxApiSvc: Using default configurations. Unable to find dnanexus.conf.location=null
2023-09-01 13:06:49,062 INFO service.DNAxApiSvc: apiserver connection-pool config. MaxPoolSize=10, MaxPoolPerRoute=10,MaxWaitTimeout=60000
2023-09-01 13:06:49,062 INFO service.DNAxApiSvc: initializing http connection man

In [40]:
# Disabled cell: import of a single bgzipped exome pVCF file with hl.import_vcf.
# NOTE(review): the path string below is wrapped across two comment lines;
# rejoin it onto one line before uncommenting, otherwise the string literal is
# unterminated.
# vcffile = "file:///mnt/project/Bulk/Exome sequences/Population level exome OQFE
# variants, pVCF format - final release/ukb23157_c1_b43_v1.vcf.gz"
# mt = hl.import_vcf(vcffile, force_bgz=True)

In [57]:
# Disabled: count of `mt`.
# NOTE(review): in the cells visible here `mt` is only assigned in commented-out
# code, so the output stored under this cell presumably comes from an earlier
# run with both lines live; confirm before relying on it.
# mt.count()

2023-09-01 13:07:00.570 Hail: INFO: Coerced sorted dataset                      
2023-09-01 13:07:01.957 Hail: INFO: Coerced sorted dataset                      
2023-09-01 13:07:03.961 Hail: INFO: wrote table with 15768 rows in 1 partition to /tmp/__iruid_9055-b6EcC8lCrQHrx4YEIEurzY
                                                                                

(937947, 469835)