In [1]:
from pyspark.sql import SparkSession
import hail as hl

# Configure a Spark session builder with Hive support enabled.
builder = SparkSession.builder.enableHiveSupport()

# Reuse the running session if one exists, otherwise create it.
spark = builder.getOrCreate()
# Start Hail on that session's SparkContext; GRCh38 is set as Hail's default
# reference genome.
hl.init(sc=spark.sparkContext, default_reference="GRCh38")

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/backend/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/backend/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jar
Running on Apache Spark version 3.2.3
SparkUI available at http://ip-10-60-37-64.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.116-cd64e0876c94
LOGGING: writing to /opt/notebooks/hail-20240415-2003-0.2.116-cd64e0876c94.log


In [None]:
# --- Region of interest -------------------------------------------------------
CHR = "17"  # chromosome label without the "chr" prefix
START = 4710596  # region start position
END = 4721499  # region end position
LENGTH = 1500000  # window width used to derive a pVCF block number from a position
REF = "GRCh38"

if START > END:
    raise ValueError(f"START ({START}) must not be greater than END ({END})")

# --- Input locations ----------------------------------------------------------
EXOME_DIR = "file:///mnt/project/Bulk/Exome sequences/"
PVCF_TEMPLATE = (
    EXOME_DIR
    + "Population level exome OQFE variants, pVCF format - final release/"
    + "ukb23157_c{chrom}_b{block}_v1.vcf.gz"
)

# NOTE(review): block numbers are derived as position // LENGTH + 1, i.e. this
# assumes fixed-width blocks numbered from 1 - confirm against the release's
# block layout before relying on it for other regions.
START_BLOCK = START // LENGTH + 1
END_BLOCK = END // LENGTH + 1

START_VCF = PVCF_TEMPLATE.format(chrom=CHR, block=START_BLOCK)
END_VCF = PVCF_TEMPLATE.format(chrom=CHR, block=END_BLOCK)
INT_TABLE = (
    EXOME_DIR + "Exome OQFE CRAM files/helper_files/xgen_plus_spikein.GRCh38.bed"
)
DP10_FILE = (
    EXOME_DIR
    + "Population level exome OQFE variants, PLINK format - final release/helper_files/ukb23158_500k_OQFE.90pct10dp_qc_variants.txt"
)

# Alternative inputs (currently unused):
# f"file:///mnt/project/Bulk/Exome sequences/Population level exome OQFE variants, pVCF format - final release/ukb23157_c3_b13_v1.vcf.gz",
# f"file:///mnt/project/Bulk/DRAGEN WGS/DRAGEN population level WGS variants, pVCF format 500k release/chr{CHR}/ukb24310_c{CHR}_b{START // LENGTH}_v1.vcf.gz",
# f"file:///mnt/project/Bulk/Exome sequences/Population level exome OQFE variants, pVCF format - final release/ukb23157_c11_b10_v1.vcf.gz",
# "file:///mnt/project/Bulk/Exome sequences/Population level exome OQFE variants, pVCF format - final release/ukb23157_c17_b0_v1.vcf.gz",

# Every block from the one holding START to the one holding END, in ascending
# order. A region spanning more than two blocks therefore keeps its middle
# blocks, and the list order is the same on every run.
VCF_FILES = [
    PVCF_TEMPLATE.format(chrom=CHR, block=block)
    for block in range(START_BLOCK, END_BLOCK + 1)
]

print(f"Number of VCFs: {len(VCF_FILES)}")
print("\n".join(VCF_FILES))


# Import the selected pVCF block(s) as a single MatrixTable.
mt = hl.import_vcf(
    VCF_FILES,
    # min_partitions = 2,
    force_bgz=True,
    # Use the shared REF constant so the import cannot drift from the
    # reference genome used by the interval filter below.
    reference_genome=REF,
    array_elements_required=False,
    drop_samples=False,
    # n_partitions=3,  # fit number of workers
)


# Restrict the MatrixTable to the region of interest.
# The endpoint flags below are locus_interval's defaults written out: START is
# included and END is excluded.
# NOTE(review): confirm that leaving position END out of the region is intended.
gene_intervals = [
    hl.locus_interval(
        contig=f"chr{CHR}",
        start=START,
        end=END,
        includes_start=True,
        includes_end=False,
        reference_genome=REF,
    )
]

mt = hl.filter_intervals(mt, gene_intervals, keep=True)


# Keep only rows whose locus falls inside an interval of the BED file.
# REF is used instead of a repeated literal so this step stays on the same
# reference genome as the gene-interval filter.
interval_table = hl.import_bed(
    INT_TABLE,
    reference_genome=REF,
)

mt = mt.filter_rows(hl.is_defined(interval_table[mt.locus]))

# Filter 90PCT_10DP


def load_90pct10dp(
    dp10: str, chrom: "int | str" = 17, reference_genome: str = "default"
):
    """Load one chromosome of the 90pct10dp variant list as a keyed Hail table.

    Parameters
    ----------
    dp10 : str
        Path of the header-less text file. Its first column (``f0``) is parsed
        as a variant string after "chr" is prepended.
    chrom : int or str, optional
        Chromosome label as written in the file (no "chr" prefix); only lines
        starting with ``"<chrom>:"`` are kept, by default 17
    reference_genome : str, optional
        Reference genome handed to ``hl.parse_variant``, by default "default"
        (Hail's current default reference)

    Returns
    -------
    hl.Table
        Table keyed by ``locus`` and ``alleles`` with the raw ``f0`` column
        removed
    """
    ht = hl.import_table(dp10, no_header=True)
    # Narrow to one chromosome before parsing. The trailing ":" stops "1"
    # from also matching "10:", "11:", ...
    ht = ht.filter(ht.f0.startswith(f"{chrom}:"))
    ht = ht.annotate(
        **hl.parse_variant("chr" + ht.f0, reference_genome=reference_genome)
    )
    ht = ht.key_by(ht.locus, ht.alleles)
    ht = ht.drop("f0")

    return ht


# Pass CHR explicitly: relying on the function's default chromosome would
# silently filter against the wrong chromosome as soon as CHR is changed.
dp10_table = load_90pct10dp(DP10_FILE, chrom=CHR)

# Remove every row whose (locus, alleles) key appears in the list.
mt = mt.filter_rows(~hl.is_defined(dp10_table[mt.locus, mt.alleles]))


# Split multi
def smart_split_multi_mt(
    mt: hl.matrixtable.MatrixTable, left_aligned=False
) -> hl.matrixtable.MatrixTable:
    """Split multi-allelic rows while leaving bi-allelic rows untouched.

    Only rows with more than two alleles are sent through
    ``hl.split_multi_hts``. Rows with exactly two alleles skip the split and
    are tagged with ``a_index=1`` and ``was_split=False`` before being
    unioned with the split rows. Rows with fewer than two alleles match
    neither subset and are therefore absent from the result.

    Parameters
    ----------
    mt : hl.matrixtable.MatrixTable
        MatrixTable that may contain multi-allelic rows
    left_aligned : bool, optional
        Forwarded to ``hl.split_multi_hts``, by default False

    Returns
    -------
    hl.matrixtable.MatrixTable
        Union of the split multi-allelic rows and the annotated bi-allelic rows
    """
    n_alleles = hl.len(mt.alleles)

    # Bi-allelic rows: no split needed, just add the two bookkeeping fields.
    biallelic = mt.filter_rows(n_alleles == 2).annotate_rows(
        a_index=1, was_split=False
    )

    # Multi-allelic rows: the only ones that go through the split.
    multiallelic = mt.filter_rows(n_alleles > 2)
    split_rows = hl.split_multi_hts(multiallelic, left_aligned=left_aligned)

    return split_rows.union_rows(biallelic)


mt = smart_split_multi_mt(mt)

# checkpoint() writes the MatrixTable and returns one that reads from the
# written files. Rebind mt to that result; otherwise show() re-runs the whole
# import/filter/split pipeline instead of reading the checkpoint.
mt = mt.checkpoint("CP1.mt", overwrite=True)
mt.show()

In [2]:
# Reload the checkpoint, number the columns in an "idx" field, and key the
# columns by both the sample ID "s" and that index.
mt = (
    hl.read_matrix_table("CP1.mt")
    .add_col_index("idx")
    .key_cols_by("s", "idx")
)
# hl.export_vcf(mt.rows().naive_coalesce(1), "Filtered.vcf")

In [3]:
# Print the MatrixTable schema (global, column, row and entry fields plus keys).
mt.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'idx': int64
----------------------------------------
Row fields:
    'locus': locus<GRCh38>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AF: array<float64>, 
        AQ: array<int32>, 
        AC: array<int32>, 
        AN: int32
    }
    'a_index': int32
    'was_split': bool
----------------------------------------
Entry fields:
    'GT': call
    'RNC': array<str>
    'DP': int32
    'AD': array<int32>
    'GQ': int32
    'PL': array<int32>
----------------------------------------
Column key: ['s', 'idx']
Row key: ['locus', 'alleles']
----------------------------------------


In [9]:
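# Tally of RNC values over all entries. The statement is commented out; the
# output shown below was retained from an earlier run of it.
# mt.aggregate_entries(hl.agg.counter(mt.RNC))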

{<FrozenList(frozen=True, ['-.'])>: 5,
 <FrozenList(frozen=True, ['..'])>: 185102255,
 <FrozenList(frozen=True, ['II'])>: 482555,
 <FrozenList(frozen=True, ['L.'])>: 10}

In [19]:
# Keep rows with more than one non-reference genotype call, then report the
# resulting (row count, column count).
mt.filter_rows(
    hl.agg.count_where(mt.GT.is_non_ref()) > 1
).count()

(241, 469835)

In [36]:
# Write the column table (keyed by "s" and "idx") to a TSV, coalesced to a
# single partition first.
(
    mt.cols()
    .naive_coalesce(1)
    .export("/tmp/cols.tsv")
)

2024-04-15 20:00:32.913 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'
2024-04-15 20:00:34.471 Hail: INFO: Coerced sorted dataset
2024-04-15 20:00:35.137 Hail: INFO: merging 2 files totalling 6.6M...
2024-04-15 20:00:35.186 Hail: INFO: while writing:
    /tmp/cols.tsv
  merge time: 48.402ms


In [4]:
# Export entries in long format: drop all non-key row fields, key columns by
# the numeric "idx" only and drop the sample ID "s", then write the entries
# table as a block-gzipped TSV.
(
    mt.select_rows()
    .key_cols_by("idx")
    .drop("s")
    .entries()
    .export("/tmp/Entries.tsv.bgz")
)

2024-04-15 20:03:20.825 Hail: WARN: entries(): Resulting entries table is sorted by '(row_key, col_key)'.
    To preserve row-major matrix table order, first unkey columns with 'key_cols_by()'
2024-04-15 20:06:09.345 Hail: INFO: merging 28 files totalling 1.1G...
2024-04-15 20:06:12.099 Hail: INFO: while writing:
    /tmp/Entries.tsv.bgz
  merge time: 2.753s


In [5]:
# Copy the exported entries file out of the Hadoop filesystem into the current
# local working directory.
!hadoop fs -getmerge /tmp/Entries.tsv.bgz Entries.tsv.bgz

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/cluster/hadoop/share/hadoop/common/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/cluster/dnax/jars/dnanexus-api-0.1.0-SNAPSHOT-jar-with-dependencies.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Reload4jLoggerFactory]
2024-04-15 20:08:17,147 WARN metrics.MetricsReporter: Unable to initialize metrics scraping configurations from hive-site.xml. Message:InputStream cannot be null
2024-04-15 20:08:17,261 WARN service.DNAxApiSvc: Using default configurations. Unable to find dnanexus.conf.location=null
2024-04-15 20:08:17,261 INFO service.DNAxApiSvc: apiserver connection-pool config. MaxPoolSize=10, MaxPoolPerRoute=10,MaxWaitTimeout=60000
2024-04-15 20:08:17,261 INFO service.DNAxApiSvc: initializing http connection man

In [11]:
# Copy the gnomAD QC summary markdown from the mounted project into the current
# directory.
# NOTE(review): the recorded output is "^C", so this run looks interrupted -
# verify the file was actually copied.
!cp '/mnt/project/Bulk/Exome sequences_Alternative exome processing/Exome variant call files (gnomAD) (VCFs)/helper_files/Broad_455k_exome_gnomAD_QC_summary.md' .

^C
