In [1]:
from pathlib import Path
import pandas as pd
import pyspark
import dxpy
import hail as hl
from datetime import datetime
from matrixtables import import_mt, interval_qc_mt, smart_split_multi_mt
from subprocess import run

from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
# Render Bokeh figures inline in the notebook.
output_notebook()

# Make sure the /tmp scratch directory exists before anything writes to it.
_tmp_dir = Path("/tmp").resolve()
_tmp_dir.mkdir(parents=True, exist_ok=True)


In [2]:
# Spark and Hail
# Directory on the mounted project that holds the exome pVCF files.
VCF_DIR = Path("/mnt/project/Bulk/Exome sequences/Population level exome OQFE variants, pVCF format - final release/")
DATABASE = "matrix_tables"
REFERENCE_GENOME = 'GRCh38'

# Hail log file, stamped with the hour and minute at which this cell ran.
# The log directory is created up front so Hail never has to write into a
# directory that does not exist.
LOG_DIR = Path("../hail_logs").resolve()
LOG_DIR.mkdir(parents=True, exist_ok=True)
LOG_FILE = str(LOG_DIR / f"GIPR_{datetime.now().strftime('%H%M')}.log")

# getOrCreate() reuses a live context, so re-running this cell does not fail
# with "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession(sc)

# Look up the database object; create it only when the search really came back
# empty. Any other failure (authentication, network, ...) now propagates
# instead of being mistaken for "database missing".
database_object = dxpy.find_one_data_object(name=DATABASE, zero_ok=True)
if database_object is None:
    spark.sql(f"CREATE DATABASE {DATABASE} LOCATION  'dnax://'")
    database_object = dxpy.find_one_data_object(name=DATABASE)
mt_database = database_object["id"]

# idempotent=True turns a repeated call into a no-op rather than an error.
hl.init(sc=sc, default_reference=REFERENCE_GENOME, log=LOG_FILE, idempotent=True)

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 2.4.4
SparkUI available at http://ip-10-60-67-59.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.78-b17627756568
LOGGING: writing to /opt/notebooks/gogoGPCR/notebooks/hail_logs/GIPR_1037.log


In [3]:
# Load the per-gene metadata table, index it by HGNC symbol (keeping the
# column as well), and display the GIPR record.
MAPPING_FILE = Path("../../data/misc/mappings_with_blocks.tsv").resolve()
mapping = pd.read_csv(MAPPING_FILE, sep="\t")
mapping = mapping.set_index("HGNC", drop=False)
mapping.loc["GIPR", :]

HGNC                                                                     GIPR
entry_name                                                         gipr_human
name                                                             GIP receptor
accession                                                              P48546
family                                                        002_001_003_002
species                                                          Homo sapiens
residue_numbering_scheme                                            GPCRdb(B)
sequence                    MTTSPILQLLLRLSLCGLLLQRAETGSKGQTAGELYQRWERYRREC...
genes                                                                ['GIPR']
ENSG                                                          ENSG00000010310
ENST                                                          ENST00000590918
type                                                                     GPCR
GRCh37_start                                                    

In [4]:
# Build the GIPR matrix table from the v1 pVCFs, then key rows by (locus, alleles).
mt = import_mt(["GIPR"], mapping, vcf_dir=VCF_DIR, vcf_version="v1")
mt = mt.key_rows_by("locus", "alleles")

In [5]:
# Persist the freshly imported matrix table and continue from the on-disk copy.
stage = "RAW"
checkpoint_file = f"/tmp/GIPR.{stage}.cp.mt"
mt = mt.checkpoint(checkpoint_file, overwrite=True)

# Report the dimensions (variants x samples) of the imported data.
v, s = mt.count()
print(f"{v} variants and {s} samples after import")

2023-01-26 10:46:17 Hail: INFO: Coerced sorted dataset
2023-01-26 10:54:48 Hail: INFO: wrote matrix table with 1214 rows and 469835 columns in 1 partition to /tmp/GIPR.RAW.cp.mt


1214 variants and 469835 samples after import


In [6]:
# Filter to only WES target regions
INTERVAL_FILE = Path("../../data/misc/xgen_plus_spikein.b38.bed").resolve()

# Upload the BED file to the cluster file system.
#   -f          overwrite an existing copy, so re-running the cell works and
#               never leaves a stale file in place
#   check=True  fail loudly here instead of later inside hl.import_bed
run(["hadoop", "fs", "-put", "-f", str(INTERVAL_FILE), "/tmp"], check=True)

# Use the same reference genome that Hail was initialised with.
interval_table = hl.import_bed(
    f"/tmp/{INTERVAL_FILE.name}",
    reference_genome=REFERENCE_GENOME,
)

# Keep only rows whose locus falls inside one of the target intervals.
mt = mt.filter_rows(hl.is_defined(interval_table[mt.locus]))
print(f"{mt.count_rows()} variants after interval filtering")

2023-01-26 10:54:51 Hail: INFO: Reading table without type imputation
  Loading field 'f0' as type str (user-supplied)
  Loading field 'f1' as type int32 (user-supplied)
  Loading field 'f2' as type int32 (user-supplied)
2023-01-26 10:54:53 Hail: INFO: Coerced sorted dataset


489 variants after interval filtering


In [7]:
# Discard sites carrying more than 6 alleles, then split the remaining
# multi-allelic sites.
mt = smart_split_multi_mt(mt.filter_rows(hl.len(mt.alleles) <= 6))

print(
    f"{mt.count_rows()} variants with not more than 6 alleles after splitting"
)

2023-01-26 10:54:57 Hail: INFO: Coerced sorted dataset
2023-01-26 10:54:59 Hail: INFO: Coerced sorted dataset
2023-01-26 10:55:01 Hail: INFO: Coerced sorted dataset


555 variants with not more than 6 alleles after splitting


In [8]:
# Annotate with VEP and generate protein consequence
VEP_JSON = Path("../../data/misc/GRCh38_VEP.json").resolve()

mt = hl.vep(mt, f"file:{VEP_JSON}")

# protCons below is built from the FIRST transcript consequence, so that is the
# entry that must carry the MANE Select flag.
# `transcript_consequences.mane_select` is an array projection, and
# hl.is_defined() on it only says whether the array itself is present, not
# whether any element has the flag. Test element [0] explicitly; a missing or
# empty array counts as "not MANE".
first_tc_is_mane = hl.coalesce(
    hl.if_else(
        hl.len(mt.vep.transcript_consequences) > 0,
        hl.is_defined(mt.vep.transcript_consequences[0].mane_select),
        False,
    ),
    False,
)
is_MANE = mt.aggregate_rows(hl.agg.all(first_tc_is_mane))
assert is_MANE, "Selected transcript may not be MANE Select. Check manually."

# Protein consequence string: <ref amino acid><protein_end><alt amino acid>,
# taken from the "ref/alt" amino_acids field of the first transcript.
first_tc = mt.vep.transcript_consequences[0]
amino_acids = first_tc.amino_acids.split("/")
mt = mt.annotate_rows(
    protCons=amino_acids[0] + hl.str(first_tc.protein_end) + amino_acids[-1]
)

2023-01-26 10:55:03 Hail: INFO: Coerced sorted dataset
2023-01-26 10:55:05 Hail: INFO: Coerced sorted dataset
2023-01-26 10:55:06 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-01-26 10:55:07 Hail: INFO: Coerced sorted dataset
2023-01-26 10:55:09 Hail: INFO: Coerced sorted dataset
2023-01-26 10:55:10 Hail: INFO: Coerced sorted dataset
2023-01-26 10:55:11 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-01-26 10:55:12 Hail: INFO: Coerced sorted dataset


In [10]:
# Flag rows whose most severe VEP consequence is one of the classes of interest.
interesting_variants = hl.literal(
    [
        "missense_variant",
        "stop_gained",
        "frameshift_variant",
        "inframe_deletion",
        "start_lost",
    ]
)

mt = mt.annotate_rows(
    is_interesting_var=interesting_variants.contains(
        mt.vep.most_severe_consequence
    )
)

In [23]:
# Load in vitro data
INVITRO_FILE="/opt/notebooks/gogoGPCR/data/GIPR/GIPR.invitro.csv"

# Upload the CSV to the cluster file system.
#   -f          overwrite a previous upload so a re-run never reads stale data
#   check=True  stop here if the upload fails
run(["hadoop", "fs", "-put", "-f", str(INVITRO_FILE), "/tmp"], check=True)

ht = hl.import_table("/tmp/GIPR.invitro.csv", delimiter = ",", impute = True)
# Drop records without a GRCh38 location; they cannot be keyed by variant.
ht = ht.filter(ht.GRCh_38_location != "")
# Turn the dash-separated location into the colon-separated, "chr"-prefixed
# string that hl.parse_variant expects, then key by the parsed (locus, alleles).
ht = ht.annotate(variant = "chr" + ht.GRCh_38_location.replace("-", ":"))
ht = ht.annotate(**hl.parse_variant(ht.variant))
ht = ht.key_by(ht.locus, ht.alleles)
ht.export("/tmp/GIPR/GIPR.invitro.tsv")
# overwrite=True: without it the write raises as soon as the cell is run twice.
ht.write("/tmp/GIPR.invitro.ht", overwrite=True)

# Copy the exported TSV back next to the source CSV. `hadoop fs -get` refuses
# to overwrite an existing local file, so remove any previous export first.
invitro_tsv = Path(INVITRO_FILE).with_suffix(".tsv")
if invitro_tsv.exists():
    invitro_tsv.unlink()
run(["hadoop", "fs", "-get", "/tmp/GIPR/GIPR.invitro.tsv", str(invitro_tsv)], check=True)

CompletedProcess(args=['hadoop', 'fs', '-put', '/opt/notebooks/gogoGPCR/data/GIPR/GIPR.invitro.csv', '/tmp'], returncode=0)

In [66]:
# Join the in vitro table onto the rows by (locus, alleles) and spread its
# fields out as row annotations.
mt = mt.annotate_rows(**ht.index(mt.locus, mt.alleles))

In [None]:
# Shell recipe (kept commented out, not executed by this notebook) for producing
# ../../data/GIPR/GIPR.CADD.tsv: fetch the tabix index of the CADD v1.6 GRCh38
# whole-genome SNV score file, query region 19:45668221-45683722 with the header,
# and drop the first output line.
# NOTE(review): presumably this region is the GIPR locus — confirm.
# SCORE_FILE=https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/whole_genome_SNVs.tsv.gz
# INDEX=IndexFile
# curl -o $INDEX $SCORE_FILE.tbi
# tabix $SCORE_FILE $INDEX --print-header 19:45668221-45683722 | tail -n +2 > ../../data/GIPR/GIPR.CADD.tsv

In [74]:
# Import CADD scores
# Refresh the cluster copy from the local extract whenever that file is present,
# so this cell no longer relies on an upload left over from an earlier session.
# If the local file is absent, whatever already sits at /tmp/GIPR.CADD.tsv is
# used, exactly as before.
CADD_FILE = Path("../../data/GIPR/GIPR.CADD.tsv").resolve()
if CADD_FILE.exists():
    run(["hadoop", "fs", "-put", "-f", str(CADD_FILE), "/tmp/GIPR.CADD.tsv"], check=True)

cadd = hl.import_table("/tmp/GIPR.CADD.tsv", impute = True)
# Build a "chr<Chrom>:<Pos>:<Ref>:<Alt>" string, parse it into locus/alleles and
# key the table by those so it can be joined onto the matrix table rows.
cadd = cadd.annotate(variant_string = "chr" + hl.str(cadd.Chrom) + ":" + hl.str(cadd.Pos) + ":" + cadd.Ref + ":" + cadd.Alt)
cadd = cadd.annotate(**hl.parse_variant(cadd.variant_string))
cadd = cadd.key_by(cadd.locus, cadd.alleles)

2023-01-26 12:20:28 Hail: INFO: Reading table to impute column types
2023-01-26 12:20:29 Hail: INFO: Finished type imputation
  Loading field 'Chrom' as type int32 (imputed)
  Loading field 'Pos' as type int32 (imputed)
  Loading field 'Ref' as type str (imputed)
  Loading field 'Alt' as type str (imputed)
  Loading field 'RawScore' as type float64 (imputed)
  Loading field 'PHRED' as type float64 (imputed)


In [76]:
# Join the CADD table onto the rows by (locus, alleles) and add its fields as
# row annotations.
mt = mt.annotate_rows(**cadd.index(mt.locus, mt.alleles))

In [None]:
# Checkpoint because Hail likes checkpointing
stage = "ANNOTATED"
checkpoint_file = f"/tmp/GIPR.{stage}.cp.mt"

mt = mt.checkpoint(checkpoint_file, overwrite=True)

# Hail transformations return a new MatrixTable and leave `mt` untouched, so
# the filtered result has to be bound to a name to be usable afterwards.
# Keeps only rows with a 'Bmax (Binding)' value.
# NOTE(review): presumably that field comes from the in vitro table — confirm.
mt_invitro = mt.filter_rows(hl.is_defined(mt['Bmax (Binding)']))
mt_invitro

2023-01-26 12:22:08 Hail: INFO: Coerced sorted dataset
2023-01-26 12:22:09 Hail: INFO: Coerced sorted dataset
2023-01-26 12:23:34 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-01-26 12:23:35 Hail: INFO: Coerced sorted dataset
2023-01-26 12:23:36 Hail: INFO: Coerced sorted dataset
2023-01-26 12:23:36 Hail: INFO: Coerced sorted dataset
