In [1]:
import pyspark
import dxpy
import dxdata
import toml
from pathlib import Path

import hail as hl

In [2]:
# Reuse the kernel's SparkContext when this cell is re-executed: a bare
# SparkContext() raises if a context is already running, whereas getOrCreate()
# returns the active one (and creates a default one on a fresh kernel).
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession(sc)

In [3]:
# Load the project configuration that sits one directory above the notebook.
with open("../config.toml") as f:
    conf = toml.load(f)

# Absolute path of the Hail log for this notebook, under the configured
# IMPORT.LOG_DIR. str() is used rather than calling the __str__ dunder directly,
# and the file name is a plain literal since it has no placeholders.
log_file = str(Path(conf["IMPORT"]["LOG_DIR"], "annotate.log").resolve())
GENE = "GCGR"

In [4]:
# Start Hail on the SparkContext created above, with GRCh38 as the default
# reference genome and the log written to the path derived from the config.
hl.init(sc=sc, default_reference='GRCh38', log=log_file)

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 2.4.4
SparkUI available at http://ip-10-60-85-84.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /opt/notebooks/gogoGPCR/hail_logs/annotate.log


In [5]:
# NOTE(review): `stage` is not referenced again in the visible cells;
# presumably it records which pipeline stage the table below comes from.
stage = "final"
# Alternative path on the project mount (presumably the same table — confirm):
#write_file = "file:" + "/mnt/project/data/matrix_tables/GCGR.final.mt"
write_file = "file:/opt/notebooks/gogoGPCR/tmp/GCGR.final.mt"

# Despite its name, `write_file` is only read from here.
mt = hl.read_matrix_table(write_file)
# (number of rows, number of columns) of the loaded matrix table.
mt.count()


(297, 162483)

In [6]:
# Block-gzipped, tab-separated CADD annotations for the gene region.
cadd_file = "file:" + "/mnt/project/data/GCGR_CADD.tsv.bgz"

# impute=True lets Hail infer column types from the data.
# find_replace removes one leading "#" from each line and comment="#" skips
# lines beginning with "#".
# NOTE(review): presumably the substitution is applied before the comment
# filter, so a "#Chrom..." header line survives as the header while "##"
# metadata lines are still skipped — confirm against the file.
ht = hl.import_table(
        cadd_file,
        impute=True,
        comment="#",
        find_replace=("^#", ""),
    )

# Number of CADD records loaded.
ht.count()

2021-09-15 15:44:25 Hail: INFO: Reading table to impute column types
2021-09-15 15:44:30 Hail: INFO: Loading 134 fields. Counts by type:
  float64: 60
  int32: 52
  str: 22


41265

In [7]:
# Prefix the contig with "chr" so it can be parsed as a GRCh38 locus.
cadd_chr = ht.annotate(Chrom="chr" + hl.str(ht.Chrom))

# Derive the (locus, alleles) pair from the CADD columns and key the table by
# it, so it can be joined against the matrix table's row key.
cadd_keyed = cadd_chr.annotate(
    locus=hl.locus(cadd_chr.Chrom, cadd_chr.Pos, reference_genome="GRCh38"),
    alleles=hl.array([cadd_chr.Ref, cadd_chr.Alt]),
).key_by("locus", "alleles")

# Keep only records whose consequence detail is exactly "missense".
ht = cadd_keyed.filter(cadd_keyed.ConsDetail == "missense")

# Attach the matching CADD record (missing where there is no match) to each row.
mt = mt.annotate_rows(cadd=ht[mt.locus, mt.alleles])

In [8]:
# Keep only variants that received a CADD record from the join on `ht`.
mt = mt.filter_rows(hl.is_defined(mt.cadd))

In [9]:
# (number of rows, number of columns) remaining after the CADD filter.
mt.count()

2021-09-15 15:44:33 Hail: INFO: Coerced sorted dataset


(170, 162483)

In [10]:
# LoF annotation table keyed by its `Variant` column; both columns load as
# strings because no types are specified or imputed.
anno = hl.import_table("file:/mnt/project/data/annotations/GCGR_LoF.txt", key = "Variant")

2021-09-15 15:44:35 Hail: INFO: Reading table without type imputation
  Loading field 'Variant' as type str (not specified)
  Loading field 'LoF' as type str (not specified)


In [11]:
# Build a protein-change label from the CADD fields oAA + protPos + nAA
# (e.g. "D63N"); it is used to look variants up in `anno`.
mt = mt.annotate_rows(protCons = mt.cadd.oAA + hl.str(mt.cadd.protPos) + mt.cadd.nAA)

In [12]:
# Look up each variant's protein-change label in `anno` (keyed by Variant) and
# copy its LoF value onto the row; unmatched variants get a missing value.
mt = mt.annotate_rows(LoF = anno[mt.protCons].LoF)

In [13]:
# Add the per-variant `variant_qc` row struct (AC, AF, call_rate, ...).
mt = hl.variant_qc(mt)

In [32]:
# NOTE(review): this cell's execution counter (In [32]) is out of sequence with
# its neighbours; it only needs `mt` with variant_qc, protCons and LoF present.
# Restrict to variants labelled "LoF" and display their QC summary, protein
# change and label; a negative row limit shows every row.
mt2 = mt.filter_rows(mt.LoF == "LoF")
lof_rows = mt2.select_rows(mt2.variant_qc, mt2.protCons, mt2.LoF).rows()
lof_rows.show(-1)

2021-09-15 15:59:33 Hail: INFO: Coerced sorted dataset
2021-09-15 15:59:34 Hail: INFO: Coerced sorted dataset
2021-09-15 15:59:36 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-09-15 15:59:36 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-09-15 15:59:38 Hail: INFO: Ordering unsorted dataset with network shuffle


Unnamed: 0_level_0,Unnamed: 1_level_0,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,variant_qc,Unnamed: 22_level_0,Unnamed: 23_level_0
Unnamed: 0_level_1,Unnamed: 1_level_1,dp_stats,dp_stats,dp_stats,dp_stats,gq_stats,gq_stats,gq_stats,gq_stats,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
locus,alleles,mean,stdev,min,max,mean,stdev,min,max,AC,AF,AN,homozygote_count,call_rate,n_called,n_not_called,n_filtered,n_het,n_non_ref,het_freq_hwe,p_value_hwe,protCons,LoF
locus<GRCh38>,array<str>,float64,float64,float64,float64,float64,float64,float64,float64,array<int32>,array<float64>,int32,array<int32>,float64,int64,int64,int64,int64,int64,float64,float64,str,str
chr17:81810848,"[""G"",""A""]",38.8,10.7,11.0,86.0,49.7,0.671,30.0,50.0,"[324964,2]","[1.00e+00,6.15e-06]",324966,"[162481,0]",1.0,162483,0,0,2,2,1.23e-05,0.5,"""D63N""","""LoF"""
chr17:81810923,"[""C"",""A""]",39.0,10.8,16.0,104.0,49.7,0.785,21.0,50.0,"[324922,30]","[1.00e+00,9.23e-05]",324952,"[162446,0]",1.0,162476,0,7,30,30,0.000185,0.501,"""H88N""","""LoF"""
chr17:81811111,"[""G"",""T""]",63.3,13.2,16.0,127.0,50.0,0.215,31.0,55.0,"[324965,1]","[1.00e+00,3.08e-06]",324966,"[162482,0]",1.0,162483,0,0,1,1,6.15e-06,0.5,"""G125C""","""LoF"""
chr17:81811122,"[""T"",""G""]",63.3,13.2,16.0,149.0,50.0,0.216,36.0,55.0,"[324963,3]","[1.00e+00,9.23e-06]",324966,"[162480,0]",1.0,162483,0,0,3,3,1.85e-05,0.5,"""I128M""","""LoF"""
chr17:81811667,"[""G"",""A""]",70.0,16.9,10.0,165.0,50.0,0.165,30.0,57.0,"[324964,2]","[1.00e+00,6.15e-06]",324966,"[162481,0]",1.0,162483,0,0,2,2,1.23e-05,0.5,"""R225H""","""LoF"""
chr17:81812226,"[""C"",""T""]",19.3,3.75,16.0,89.0,49.1,0.999,33.0,50.0,"[324964,2]","[1.00e+00,6.15e-06]",324966,"[162481,0]",1.0,162483,0,0,2,2,1.23e-05,0.5,"""R308W""","""LoF"""
chr17:81812853,"[""G"",""A""]",41.2,25.6,16.0,215.0,49.8,0.656,32.0,50.0,"[324948,18]","[1.00e+00,5.54e-05]",324966,"[162465,0]",1.0,162483,0,0,18,18,0.000111,0.5,"""E362K""","""LoF"""
chr17:81812871,"[""G"",""A""]",26.0,11.3,16.0,209.0,49.7,0.741,40.0,51.0,"[324917,49]","[1.00e+00,1.51e-04]",324966,"[162434,0]",1.0,162483,0,0,49,49,0.000302,0.502,"""V368M""","""LoF"""


In [15]:
def add_varid(mt: hl.matrixtable.MatrixTable) -> hl.matrixtable.MatrixTable:
    """Annotate each row with a colon-delimited variant ID.

    The `varid` row field is built as
    ``<contig>:<position>:<alleles[0]>:<alleles[1]>``.

    Args:
        mt: Matrix table with `locus` and `alleles` row fields.

    Returns:
        The matrix table with the additional `varid` row field.
    """
    locus = mt.locus
    varid_parts = [
        locus.contig,
        hl.str(locus.position),
        mt.alleles[0],
        mt.alleles[1],
    ]
    return mt.annotate_rows(varid=hl.delimit(varid_parts, ":"))


def recode_GT_to_GP(
    mt: hl.matrixtable.MatrixTable,
) -> hl.matrixtable.MatrixTable:
    """Add a `GP` entry field derived from the hard call in `GT`.

    `GP` is a three-element one-hot array selected by the number of alternate
    alleles in `GT`: 0 -> [1, 0, 0], 1 -> [0, 1, 0], 2 -> [0, 0, 1].

    Args:
        mt: Matrix table with a `GT` entry field.

    Returns:
        The matrix table with the additional `GP` entry field.
    """
    # Row i is the probability vector for a call with i alternate alleles.
    # NOTE(review): assumes the alt-allele count never exceeds 2 — confirm.
    one_hot_by_alt_count = hl.literal(
        [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]
    )
    alt_count = mt.GT.n_alt_alleles()
    return mt.annotate_entries(GP=one_hot_by_alt_count[alt_count])


def write_bgen(mt: hl.matrixtable.MatrixTable, output: str) -> None:
    """Export a matrix table to BGEN.

    Adds the `varid` row field and the `GP` entry field, then exports with
    `varid` used as both the variant ID and the rsID.

    Args:
        mt: Matrix table with `locus`, `alleles` and `GT`.
        output: Destination path passed to `hl.export_bgen`.
    """
    prepared = recode_GT_to_GP(add_varid(mt))

    hl.export_bgen(
        mt=prepared,
        output=output,
        gp=prepared.GP,
        varid=prepared.varid,
        rsid=prepared.varid,
    )

In [17]:
# Export the annotated, CADD-filtered matrix table as BGEN; the logged output
# path shows Hail writing to this prefix with a ".bgen" extension.
write_bgen(mt, "file:/opt/notebooks/gogoGPCR/tmp/GCGR")

2021-09-15 15:48:51 Hail: INFO: Coerced sorted dataset
2021-09-15 15:49:08 Hail: INFO: while writing:
    file:/opt/notebooks/gogoGPCR/tmp/GCGR.bgen
  merge time: 32.451ms


In [19]:
# Two-level label: "LoF" where LoF == "LoF", otherwise "WT". With
# missing_false set, a missing LoF value counts as False and so is labelled "WT".
mt = mt.annotate_rows(LoF2 = hl.if_else(mt.LoF == "LoF", "LoF", "WT", missing_false = True))

In [25]:
mt = add_varid(mt)

# Per-variant annotation table: variant ID, gene name and LoF/WT label.
annotation_rows = mt.select_rows(mt.varid, mt.cadd.GeneName, mt.LoF2).rows()

# Re-key by the variant ID so the original (locus, alleles) key fields can be
# dropped, leaving only the three exported columns.
annotations = annotation_rows.key_by("varid").drop("locus", "alleles")

# Written without a header row.
annotations.export("file:/opt/notebooks/gogoGPCR/tmp/GCGR.annotations", header=False)

2021-09-15 15:56:14 Hail: INFO: Coerced sorted dataset
2021-09-15 15:56:15 Hail: INFO: Coerced sorted dataset
2021-09-15 15:56:17 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-09-15 15:56:17 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-09-15 15:56:19 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-09-15 15:56:20 Hail: INFO: Coerced sorted dataset
2021-09-15 15:56:22 Hail: INFO: merging 2 files totalling 4.5K...
2021-09-15 15:56:22 Hail: INFO: while writing:
    file:/opt/notebooks/gogoGPCR/tmp/GCGR.annotations
  merge time: 25.268ms


In [29]:
# Build the single-line set list for this gene: gene name, contig, smallest
# variant position and the comma-separated variant IDs, separated by tabs.
# Requires the `varid` row field added by add_varid().
position = mt.aggregate_rows(hl.agg.min(mt.locus.position))
names = mt.varid.collect()
names_str = ",".join(names)

# take(1) fetches only the first row's value instead of collecting the whole
# column just to read element 0.
gene_name = mt.cadd.GeneName.take(1)[0]
contig = mt.locus.contig.take(1)[0]

line = f"{gene_name}\t{contig}\t{position}\t{names_str}"

# Name the file after GENE so it sits alongside GCGR.bgen and GCGR.annotations.
# NOTE(review): this path used to be the literal "GCG.setlist", which did not
# match the gene being processed — update any consumer of the old name.
with open(f"/opt/notebooks/gogoGPCR/tmp/{GENE}.setlist", "w") as f:
    f.write(line)

2021-09-15 15:58:10 Hail: INFO: Coerced sorted dataset
2021-09-15 15:58:14 Hail: INFO: Coerced sorted dataset
2021-09-15 15:58:17 Hail: INFO: Coerced sorted dataset
2021-09-15 15:58:20 Hail: INFO: Coerced sorted dataset
