In [1]:
import dxdata
import dxpy
import hail as hl

import pyspark
import tomli
import subprocess
from matrixtables import *
# Explicit imports stay below the star import so they take precedence over
# any same-named objects it re-exports.
from pathlib import Path
from utils import get_stats
from datetime import datetime
from pprint import pprint

# Make sure the local scratch directory exists (no error if it already does).
tmp_dir = Path("../tmp").resolve()
tmp_dir.mkdir(parents=True, exist_ok=True)

In [2]:
# Parameters
# Read the run configuration; tomli requires the file to be opened in binary mode.
with open("../config.toml", "rb") as f:
    conf = tomli.load(f)

IMPORT = conf["IMPORT"]
NAME = conf["NAME"]
VCF_VERSION = IMPORT["VCF_VERSION"]
REFERENCE_GENOME = conf["REFERENCE_GENOME"]
DATABASE = IMPORT["DATABASE"]

GENE_FILE = str(Path(IMPORT["GENE_FILE"]).resolve())

# One gene per line.
with open(GENE_FILE, "r") as file:
    genes = file.read().splitlines()

# When the config leaves NAME as the literal "NONE", fall back to the first
# gene listed in GENE_FILE.
if NAME == "NONE":
    if not genes:
        raise ValueError(
            f"NAME is 'NONE' but gene file {GENE_FILE} is empty; cannot derive a name"
        )
    NAME = genes[0]

# Build the log path only after NAME has been resolved, so the log file is
# named after the actual run rather than the "NONE" placeholder.
LOG_FILE = str(
    Path(IMPORT["LOG_DIR"], f"{NAME}_{datetime.now().strftime('%H%M')}.log").resolve()
)

In [3]:
# Spark and Hail
# Start Spark first: Hail is initialised on top of the existing context.
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)

# Resolve the DNAnexus database name from the config to its object id,
# which is used to build dnax:// paths.
database_record = dxpy.find_one_data_object(name=DATABASE)
mt_database = database_record["id"]

hl.init(
    sc=sc,
    default_reference=REFERENCE_GENOME,
    log=LOG_FILE,
)

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jar
Running on Apache Spark version 2.4.4
SparkUI available at http://ip-10-60-79-22.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.78-b17627756568
LOGGING: writing to /opt/notebooks/gogoGPCR/hail_logs/NONE_1830.log


In [4]:
# Read the `{NAME}.QC1.mt` matrix table from the DNAnexus database.
STAGE = "QC1"
READ_PATH = f"dnax://{mt_database}/{NAME}.{STAGE}.mt"

mt = hl.read_matrix_table(READ_PATH)

# count() yields (number of rows, number of columns); reported below as
# variants and samples respectively.
v, s = mt.count()
count_message = f"{v} variants and {s} samples after reading matrixtable"
pprint(count_message)

'223 variants and 200643 samples after reading matrixtable'


In [5]:
# Variant annotation table with the columns `Variants` and `Category`.
# The path defaults to the MC4R annotations; set IMPORT.ANNOTATION_FILE in
# config.toml to use a different file without editing the notebook.
ANNOTATION_FILE = IMPORT.get(
    "ANNOTATION_FILE", "/mnt/project/Data/annotations/MC4R.tsv"
)

# impute=True lets Hail infer the column types from the file contents.
annotations_raw = hl.import_table(f"file:{ANNOTATION_FILE}", impute=True, quote='"')

# Strip surrounding whitespace from the variant names before keying on them,
# so lookups by variant name are not defeated by stray spaces.
ht = annotations_raw.annotate(Variants=annotations_raw.Variants.strip()).key_by(
    "Variants"
)

2021-12-15 18:30:32 Hail: INFO: Reading table to impute column types
2021-12-15 18:30:37 Hail: INFO: Finished type imputation
  Loading field 'Variants' as type str (imputed)
  Loading field 'Category' as type int32 (imputed)


In [6]:
# Preview the first rows of the annotation table as a sanity check.
ht.show()

Variants,Category
str,int32
"""5'UTR 32bp vor A+G:A>C""",4
"""A135P""",2
"""A175T""",1
"""C271R""",2
"""D90N""",2
"""E61*""",2
"""G181D""",2
"""I170V""",1
"""I198I""",4
"""I251L""",1


In [7]:
# Look up each row's `protCons` value in the annotation table (keyed by
# "Variants") and store the matching `Category` as the row field `labels`.
# NOTE(review): rows whose `protCons` has no match presumably get a missing
# label — confirm this is intended.
mt = mt.annotate_rows(labels = ht[mt.protCons].Category)

In [8]:
# get_stats (from utils) returns a pair: `stats` is displayed here and `intr`
# is exported in a later cell.
stats, intr = get_stats(mt)
# A negative row count asks Hail to show all rows — see the Table.show docs.
stats.show(-1)

2021-12-15 18:30:46 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-15 18:30:46 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-15 18:30:49 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-15 18:31:01 Hail: INFO: Ordering unsorted dataset with network shuffle


labels,n_carriers,n_variants
int32,int64,int64
1,5505,6
2,35,5
3,7873,1
4,372,3


In [9]:
# Export `intr` as a TSV, then copy it into the local ../tmp directory with
# `hadoop fs -get`.
remote_tsv = f"/tmp/{NAME}_QC1.tsv"
local_tsv = Path(f"../tmp/{NAME}_QC1.tsv")

intr.export(remote_tsv)

# `hadoop fs -get` refuses to overwrite an existing local file, and with
# check=True that would raise on every re-run of this cell. Remove any stale
# copy first so the cell can be re-run safely.
if local_tsv.exists():
    local_tsv.unlink()

subprocess.run(
    ["hadoop", "fs", "-get", remote_tsv, str(local_tsv)],
    check=True,  # raise CalledProcessError if the copy fails
    shell=False,
)

2021-12-15 18:31:20 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-15 18:31:20 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-15 18:31:22 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-15 18:31:32 Hail: INFO: merging 2 files totalling 2.4K...
2021-12-15 18:31:32 Hail: INFO: while writing:
    /tmp/MC4R_QC1.tsv
  merge time: 39.455ms


CompletedProcess(args=['hadoop', 'fs', '-get', '/tmp/MC4R_QC1.tsv', '../tmp/MC4R_QC1.tsv'], returncode=0)

In [16]:
# Write the labelled matrix table to the database as `{NAME}.LABELLED.mt`,
# replacing any copy already stored at that path.
STAGE = "LABELLED"
WRITE_PATH = f"dnax://{mt_database}/{NAME}.{STAGE}.mt"

mt.write(WRITE_PATH, overwrite=True)

2021-12-06 15:34:21 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:34:22 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:34:23 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:35:49 Hail: INFO: wrote matrix table with 223 rows and 200643 columns in 3 partitions to dnax://database-G6XB998J860kZy4z59fBqPBV/MC4R.LABELLED.mt
    Total size: 113.68 MiB
    * Rows/entries: 112.55 MiB
    * Columns: 1.13 MiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  210 rows (102.36 MiB)
