In [61]:
import subprocess
from datetime import datetime
from pathlib import Path
from pprint import pprint

import dxdata
import dxpy
import hail as hl
import pyspark
import tomli

from matrixtables import *
from utils import get_stats

# Make sure the local scratch directory exists before any cell writes to it.
tmp_dir = Path("../tmp").resolve()
tmp_dir.mkdir(parents=True, exist_ok=True)

In [7]:
# Parameters
# Everything configurable is read from ../config.toml; the [IMPORT] table
# holds the import-stage settings (VCF version, database, log dir, gene file).
with open("../config.toml", "rb") as f:
    conf = tomli.load(f)

IMPORT = conf["IMPORT"]
NAME = conf["NAME"]
VCF_VERSION = IMPORT["VCF_VERSION"]
REFERENCE_GENOME = conf["REFERENCE_GENOME"]
DATABASE = IMPORT["DATABASE"]

GENE_FILE = str(Path(IMPORT["GENE_FILE"]).resolve())

# One gene symbol per line.
with open(GENE_FILE, "r") as file:
    genes = file.read().splitlines()

# "NONE" is a placeholder meaning "name the run after the first gene".
# Resolve it BEFORE deriving anything from NAME; otherwise the log file
# ends up being called NONE_<time>.log instead of <gene>_<time>.log.
if NAME == "NONE":
    if not genes:
        raise ValueError(
            f"NAME is 'NONE' but gene file {GENE_FILE} is empty; cannot infer a name"
        )
    NAME = genes[0]

# Hail log path: <LOG_DIR>/<NAME>_<HHMM>.log (time of cell execution).
LOG_FILE = str(
    Path(IMPORT["LOG_DIR"], f"{NAME}_{datetime.now().strftime('%H%M')}.log").resolve()
)

In [6]:
# Spark and Hail

# Start the Spark context and wrap it in a SQL session.
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sparkContext=sc)

# Look up the DNAnexus data object named DATABASE and keep its ID;
# later cells use it to build dnax:// paths.
database_object = dxpy.find_one_data_object(name=DATABASE)
mt_database = database_object["id"]

# Attach Hail to the existing Spark context, using the configured
# reference genome and writing Hail's log to LOG_FILE.
hl.init(
    sc=sc,
    default_reference=REFERENCE_GENOME,
    log=LOG_FILE,
)

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 2.4.4
SparkUI available at http://ip-10-60-116-223.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /opt/notebooks/gogoGPCR/hail_logs/NONE_1018.log


In [13]:
# Load the QC1-stage MatrixTable for this run from the DNAnexus database.
STAGE = "QC1"
READ_PATH = f"dnax://{mt_database}/{NAME}.{STAGE}.mt"

mt = hl.read_matrix_table(READ_PATH)

# count() gives (rows, columns); the message below reports them as
# variants and samples respectively.
v, s = mt.count()
count_message = f"{v} variants and {s} samples after reading matrixtable"
pprint(count_message)

'297 variants and 200643 samples after reading matrixtable'


In [18]:
# Read the annotation sheet from the mounted project directory, letting Hail
# impute column types, then key it by the "AA consequence" column so it can
# be joined onto the MatrixTable rows.
# NOTE(review): the gene (DRD2) is hard-coded here while other paths in this
# notebook are built from NAME — confirm whether this should follow NAME.
annotation_path = "/mnt/project/data/annotations/DRD2.tsv"
annotation_table = hl.import_table("file:" + annotation_path, impute=True)
ht = annotation_table.key_by("AA consequence")

2021-11-29 10:23:20 Hail: INFO: Reading table to impute column types
2021-11-29 10:23:23 Hail: INFO: Loading 50 fields. Counts by type:
  str: 41
  int32: 9


In [21]:
# Join the annotation table onto the variants: look up each row's `protCons`
# value in `ht` (keyed by "AA consequence") and store the matching record
# under a new row field called `annotations`.
matched_annotation = ht[mt.protCons]
mt = mt.annotate_rows(annotations=matched_annotation)

In [23]:
# Step 1: one boolean row field per pathway, true when the annotation sheet
# records at least one impairment for that pathway.
mt = mt.annotate_rows(
    **{
        pathway: mt.annotations[f"number_of_impairments_{pathway}"] > 0
        for pathway in ("Gi1", "GoA", "Gz")
    }
)


def _impairment_label(gi1, goa, gz):
    """Build the case expression mapping the three flags to a label.

    Each truth-table row lists the required state of (Gi1, GoA, Gz) and the
    label to emit; rows are tested in the order given. Anything that matches
    no row yields a missing value.
    """
    truth_table = (
        (False, False, False, "WT"),
        (True, False, False, "Gi1"),
        (False, True, False, "GoA"),
        (False, False, True, "Gz"),
        (True, True, False, "Gi1_GoA"),
        (True, False, True, "Gi1_Gz"),
        (False, True, True, "GoA_Gz"),
        (True, True, True, "Gi1_GoA_Gz"),
    )
    builder = hl.case()
    for want_gi1, want_goa, want_gz, label in truth_table:
        condition = (
            (gi1 if want_gi1 else ~gi1)
            & (goa if want_goa else ~goa)
            & (gz if want_gz else ~gz)
        )
        builder = builder.when(condition, label)
    return builder.or_missing()


# Step 2: collapse the three flags into a single categorical `labels` field.
mt = mt.annotate_rows(labels=_impairment_label(mt.Gi1, mt.GoA, mt.Gz))

In [56]:
# Summarise the labelled MatrixTable with the project helper `get_stats`,
# which returns two objects: `stats` (displayed here) and `intr` (exported
# to TSV by a later cell).
# NOTE(review): judging by the displayed output, `stats` holds carrier and
# variant counts per label — confirm against utils.get_stats.
stats, intr = get_stats(mt)
# NOTE(review): -1 presumably asks show() to print every row rather than a
# truncated preview — confirm for the type get_stats returns.
stats.show(-1)

2021-11-29 10:39:46 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-11-29 10:39:47 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-11-29 10:39:49 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-11-29 10:40:01 Hail: INFO: Ordering unsorted dataset with network shuffle


labels,n_carriers,n_variants
str,int64,int64
"""Gi1""",154,18
"""Gi1_GoA_Gz""",4,2
"""Gi1_Gz""",2,2
"""Gz""",6,1
"""WT""",10425,78


In [63]:
# Export `intr` as TSV to the cluster filesystem, then copy it into the
# notebook's local ../tmp directory with `hadoop fs -get`.
export_path = "/tmp/DRD2_QC1.tsv"
local_copy = Path("../tmp/DRD2_QC1.tsv")

intr.export(export_path)

# `hadoop fs -get` refuses to overwrite an existing local file and, with
# check=True, that turns every re-run of this cell into a
# CalledProcessError. Drop any copy left by a previous run first.
if local_copy.exists():
    local_copy.unlink()

subprocess.run(
    ["hadoop", "fs", "-get", export_path, str(local_copy)],
    check=True,
    shell=False,
)

2021-11-29 10:47:18 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-11-29 10:47:18 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-11-29 10:47:19 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-11-29 10:47:31 Hail: INFO: merging 2 files totalling 15.5K...
2021-11-29 10:47:31 Hail: INFO: while writing:
    /tmp/DRD2_QC1.tsv
  merge time: 52.124ms


CompletedProcess(args=['hadoop', 'fs', '-get', '/tmp/DRD2_QC1.tsv', '../tmp/DRD2_QC1.tsv'], returncode=0)

In [64]:
# Persist the labelled MatrixTable back to the DNAnexus database under the
# LABELLED stage name, replacing any previous copy at that path.
STAGE = "LABELLED"
WRITE_PATH = f"dnax://{mt_database}/{NAME}.{STAGE}.mt"

mt.write(WRITE_PATH, overwrite=True)

2021-11-29 10:52:01 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-11-29 10:52:02 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-11-29 10:52:05 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-11-29 10:54:09 Hail: INFO: wrote matrix table with 297 rows and 200643 columns in 2 partitions to dnax://database-G6XB998J860kZy4z59fBqPBV/DRD2.LABELLED.mt
    Total size: 174.16 MiB
    * Rows/entries: 173.03 MiB
    * Columns: 1.13 MiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  297 rows (173.03 MiB)
