# Create a MatrixTable and QC the hell out of it
## 0 Import stuff and set your parameters

In [1]:
import toml
from pathlib import Path
from datetime import datetime
import sys

import pandas as pd
import pyspark
import dxpy
import dxdata
import hail as hl
    
# Put the parent of the current working directory on the import path so the
# project-local `src` package imported below can be found.
module_path = str(Path("..").resolve())

# Append only once, so re-running this cell does not grow sys.path.
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
from src.utils import get_position, lookup_vcfs
from src.matrixtables import *

In [2]:
# Load the run configuration from the TOML file one directory up.
with open("../config.toml") as config_fh:
    conf = toml.load(config_fh)

imps = conf["IMPORT"]
GENE = conf["GENE"]

# Time-of-day stamp (HHMMSS) used in the log file name below.
now = datetime.now().strftime("%H%M%S")


def _resolved(*parts):
    """Join ``parts`` into a single path and return its resolved form as a string."""
    return str(Path(*parts).resolve())


# Resolved string paths for every input, output and scratch location used later.
map_file = _resolved(imps["DATA_DIR"], imps["MAPPING_FILE"])
log_file = _resolved(imps["LOG_DIR"], f"{GENE}_{now}.log")
int_file = _resolved(imps["DATA_DIR"], imps["INTERVAL_FILE"])
vcf_dir = _resolved(imps["VCF_DIR"])
checkpoint_file = _resolved(imps["TMP_DIR"], f"{GENE}.cp.mt")

In [4]:
# Create the Spark context and session first; the same context is then handed
# to Hail so that Hail runs on it.
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)

# GRCh38 is the default reference genome; Hail writes its log to `log_file`.
hl.init(
    sc=sc,
    default_reference="GRCh38",
    log=log_file,
)

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jar
Running on Apache Spark version 2.4.4
SparkUI available at http://ip-10-60-110-165.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /opt/notebooks/gogoGPCR/hail_logs/OPRM1_141824.log


In [5]:
# Read the tab-separated mapping file, index it by the HGNC column (kept as a
# regular column too via drop=False), select the entry for GENE and turn it into a dict.
# NOTE(review): if HGNC values are not unique, `.loc[GENE, :]` returns a DataFrame
# and `to_dict()` yields a nested dict instead of a flat one — confirm uniqueness.
mapping = (
    pd.read_csv(map_file, sep="\t")
    .set_index("HGNC", drop=False)
    .loc[GENE, :]
    .to_dict()
)

In [6]:
# Look up the VCF files for the configured gene. The gene symbol is taken from
# GENE (read from config.toml) instead of a hard-coded "OPRM1" literal, so the
# lookup stays consistent with `mapping`, the log file name and the checkpoint
# path when the configuration changes.
vcf_files = lookup_vcfs(
    mapping=mapping,
    vcfdir=vcf_dir,
    gene=GENE,
    version=imps["VCF_VERSION"],
)

In [None]:
# Build the table from the looked-up VCFs via the project helper `import_mt`,
# then checkpoint the result at `checkpoint_file`.
# NOTE(review): if this is Hail's MatrixTable.checkpoint, re-running the cell
# fails once the checkpoint path exists unless overwrite=True is passed — confirm.
mt = (
    import_mt(vcf_files.get("vcfs"), mapping)
    .checkpoint(checkpoint_file)
)

In [15]:
# Apply the project helper `downsample_mt` to `mt` (presumably keeps a fraction
# of the data given by DOWNSAMPLE_P — confirm in src.matrixtables).
# NOTE(review): DOWNSAMPLE_P is not assigned in any cell visible here; it must
# come from the `src.matrixtables` star import or be defined before this cell,
# otherwise a fresh-kernel run raises NameError.
# NOTE(review): `mt` is rebound to the helper's output, so re-running this cell
# alone applies `downsample_mt` to its own previous result.
mt = downsample_mt(mt, DOWNSAMPLE_P)

In [None]:
# Run the project helper `interval_qc_mt` on `mt`, passing the interval-file
# path built in the configuration cell (`int_file`, from
# IMPORT.DATA_DIR / IMPORT.INTERVAL_FILE). The name previously used here,
# `interval_path`, is not defined in any visible cell and would raise NameError
# on a fresh kernel.
mt = interval_qc_mt(mt, mapping, int_file)

In [None]:
# Apply the project helper `add_varid` to `mt` and rebind `mt` to its result
# (presumably annotates each variant with an identifier — confirm in src.matrixtables).
mt = add_varid(mt)