In [1]:
from pathlib import Path
import pandas as pd
import pyspark
import dxpy
import hail as hl
from datetime import datetime
from matrixtables import import_mt, interval_qc_mt, smart_split_multi_mt
from subprocess import run

from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
output_notebook()

Path("../tmp").resolve().mkdir(parents=True, exist_ok=True)


In [2]:
# Spark and Hail
VCF_DIR = Path("/mnt/project/Bulk/Exome sequences/Population level exome OQFE variants, pVCF format - interim 450k release/")
DATABASE = "matrix_tables"
REFERENCE_GENOME = 'GRCh38'

LOG_FILE = (
    Path("../hail_logs", f"MC4R_{datetime.now().strftime('%H%M')}.log")
    .resolve()
    .__str__()
)

sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)

try:
    mt_database = dxpy.find_one_data_object(name=DATABASE)["id"]
except Exception as e:
    spark.sql(f"CREATE DATABASE {DATABASE} LOCATION  'dnax://'")
    mt_database = dxpy.find_one_data_object(name=DATABASE)["id"]

hl.init(sc=sc, default_reference=REFERENCE_GENOME, log=LOG_FILE)

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 2.4.4
SparkUI available at http://ip-10-60-72-227.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.78-b17627756568
LOGGING: writing to /opt/notebooks/gogoGPCR/hail_logs/MC4R_1902.log


In [3]:
MAPPING_FILE = Path("../data/misc/mappings_with_blocks.tsv").resolve()
mapping = pd.read_csv(MAPPING_FILE, sep="\t").set_index("HGNC", drop=False)
mapping.loc["GIPR",:]

HGNC                                                                     GIPR
entry_name                                                         gipr_human
name                                                             GIP receptor
accession                                                              P48546
family                                                        002_001_003_002
species                                                          Homo sapiens
residue_numbering_scheme                                            GPCRdb(B)
sequence                    MTTSPILQLLLRLSLCGLLLQRAETGSKGQTAGELYQRWERYRREC...
genes                                                                ['GIPR']
ENSG                                                          ENSG00000010310
ENST                                                          ENST00000590918
type                                                                     GPCR
GRCh37_start                                                    

In [4]:
mt = import_mt(["GIPR"], mapping, vcf_dir=VCF_DIR, vcf_version="v1").key_rows_by(
    "locus", "alleles"
)

v, s = mt.count()
print(f"{v} variants and {s} samples after import")

1200 variants and 454756 samples after import


In [5]:
stage = "RAW"
checkpoint_file = f"/tmp/GIPR.{stage}.cp.mt"

mt = mt.checkpoint(checkpoint_file, overwrite=True)

# mt = hl.read_matrix_table(f"/tmp/MC4R.{stage}.cp.mt")

2022-07-23 19:23:47 Hail: INFO: Coerced sorted dataset
2022-07-23 19:31:45 Hail: INFO: wrote matrix table with 1200 rows and 454756 columns in 1 partition to /tmp/GIPR.RAW.cp.mt


In [6]:
INTERVAL_FILE=Path("../data/misc/xgen_plus_spikein.b38.bed").resolve()
run(["hadoop", "fs", "-put", str(INTERVAL_FILE), "/tmp"])

interval_table = hl.import_bed(
        f"/tmp/{INTERVAL_FILE.name}",
        reference_genome="GRCh38",
    )

mt = mt.filter_rows(hl.is_defined(interval_table[mt.locus]))
print(f"{mt.count_rows()} variants after interval filtering")

2022-07-23 19:31:48 Hail: INFO: Reading table without type imputation
  Loading field 'f0' as type str (user-supplied)
  Loading field 'f1' as type int32 (user-supplied)
  Loading field 'f2' as type int32 (user-supplied)
2022-07-23 19:31:50 Hail: INFO: Coerced sorted dataset


487 variants after interval filtering


In [7]:
mt = mt.filter_rows(mt.alleles.length() <= 6)
mt = smart_split_multi_mt(mt)

print(f"{mt.count_rows()} variants with not more than 6 alleles after splitting")

2022-07-23 19:31:54 Hail: INFO: Coerced sorted dataset
2022-07-23 19:31:56 Hail: INFO: Coerced sorted dataset
2022-07-23 19:31:58 Hail: INFO: Coerced sorted dataset


552 variants with not more than 6 alleles after splitting


In [8]:
VEP_JSON = Path("../data/misc/GRCh38_VEP.json").resolve()

mt = hl.vep(mt, f"file:{VEP_JSON}")

is_MANE = mt.aggregate_rows(
    hl.agg.all(hl.is_defined(mt.vep.transcript_consequences.mane_select))
)
assert is_MANE, "Selected transcript may not be MANE Select. Check manually."

mt = mt.annotate_rows(
    protCons=mt.vep.transcript_consequences.amino_acids[0].split("/")[0]
    + hl.str(mt.vep.transcript_consequences.protein_end[0])
    + mt.vep.transcript_consequences.amino_acids[0].split("/")[-1]
)

2022-07-23 19:32:00 Hail: INFO: Coerced sorted dataset
2022-07-23 19:32:02 Hail: INFO: Coerced sorted dataset
2022-07-23 19:32:03 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-07-23 19:32:04 Hail: INFO: Coerced sorted dataset
2022-07-23 19:32:05 Hail: INFO: Coerced sorted dataset
2022-07-23 19:32:06 Hail: INFO: Coerced sorted dataset
2022-07-23 19:32:07 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-07-23 19:32:08 Hail: INFO: Coerced sorted dataset


In [9]:
STAGE = "QC1"
WRITE_PATH = f"/tmp/GIPR.{STAGE}.mt"

mt.write(WRITE_PATH, overwrite=True)

2022-07-23 19:32:27 Hail: INFO: Coerced sorted dataset
2022-07-23 19:32:28 Hail: INFO: Coerced sorted dataset
2022-07-23 19:33:43 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-07-23 19:33:44 Hail: INFO: Coerced sorted dataset
2022-07-23 19:41:10 Hail: INFO: wrote matrix table with 552 rows and 454756 columns in 3 partitions to /tmp/GIPR.QC1.mt
    Total size: 699.32 MiB
    * Rows/entries: 696.75 MiB
    * Columns: 2.56 MiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  480 rows (606.49 MiB)


In [16]:
interesting_variants = hl.literal(["missense_variant", "stop_gained", "frameshift_variant", "inframe_deletion", "start_lost"])

mt = mt.annotate_rows(is_interesting_var = interesting_variants.contains(mt.vep.most_severe_consequence))
mt.is_interesting_var.show(10)

2022-06-25 20:15:11 Hail: INFO: Coerced sorted dataset
2022-06-25 20:15:13 Hail: INFO: Coerced sorted dataset
2022-06-25 20:15:14 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-06-25 20:15:15 Hail: INFO: Coerced sorted dataset


locus,alleles,is_interesting_var
locus<GRCh38>,array<str>,bool
chr18:60371355,"[""T"",""C""]",True
chr18:60371356,"[""A"",""G""]",True
chr18:60371358,"[""C"",""T""]",True
chr18:60371359,"[""T"",""G""]",False
chr18:60371365,"[""A"",""C""]",True
chr18:60371366,"[""C"",""T""]",False
chr18:60371367,"[""A"",""T""]",True
chr18:60371367,"[""AAGTC"",""A""]",True
chr18:60371369,"[""G"",""C""]",True
chr18:60371374,"[""A"",""G""]",True


In [17]:
mt = mt.annotate_cols(is_interesting_sample = hl.agg.any(mt.is_interesting_var & mt.GT.is_non_ref()))
mt.aggregate_cols(hl.agg.sum(mt.is_interesting_sample))

2022-06-25 20:15:17 Hail: INFO: Coerced sorted dataset
2022-06-25 20:15:18 Hail: INFO: Coerced sorted dataset
2022-06-25 20:15:36 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-06-25 20:15:37 Hail: INFO: Coerced sorted dataset


32611

In [135]:
mt.is_interesting_sample.export(str(Path("../data/misc/interesting_samples.tsv").resolve()))


2022-06-20 23:44:15 Hail: INFO: Coerced sorted dataset
2022-06-20 23:44:16 Hail: INFO: Coerced sorted dataset
2022-06-20 23:44:17 Hail: INFO: Coerced sorted dataset
2022-06-20 23:44:18 Hail: INFO: Coerced sorted dataset
2022-06-20 23:44:34 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-06-20 23:44:35 Hail: INFO: Coerced sorted dataset
2022-06-20 23:44:36 Hail: INFO: Coerced sorted dataset
2022-06-20 23:45:26 Hail: INFO: merging 16 files totalling 6.0M...
2022-06-20 23:45:26 Hail: INFO: while writing:
    /opt/notebooks/gogoGPCR/data/misc/interesting_samples.tsv
  merge time: 192.768ms


In [19]:
STAGE = "MISC1"
WRITE_PATH = f"/tmp/MC4R.{STAGE}.mt"

mt.write(WRITE_PATH, overwrite=True)
# mt = hl.read_matrix_table(WRITE_PATH)

2022-06-25 20:16:54 Hail: INFO: Coerced sorted dataset
2022-06-25 20:16:55 Hail: INFO: Coerced sorted dataset
2022-06-25 20:17:19 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-06-25 20:17:20 Hail: INFO: Coerced sorted dataset
2022-06-25 20:21:22 Hail: INFO: Coerced sorted dataset
2022-06-25 20:21:23 Hail: INFO: Coerced sorted dataset
2022-06-25 20:21:46 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-06-25 20:21:47 Hail: INFO: Coerced sorted dataset
2022-06-25 20:21:49 Hail: INFO: Coerced sorted dataset
2022-06-25 20:21:50 Hail: INFO: Coerced sorted dataset
2022-06-25 20:21:50 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-06-25 20:21:51 Hail: INFO: Coerced sorted dataset
2022-06-25 20:21:53 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-06-25 20:21:53 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-06-25 20:21:53 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-06-25 20:25:09 Hail: INFO:

In [21]:
SAMPREMOVE_FILE=Path('../data/misc/samples_to_remove.tsv').resolve()
run(["hadoop", "fs", "-put", str(SAMPREMOVE_FILE), "/tmp"])
samples_to_remove = hl.import_table(f"/tmp/{SAMPREMOVE_FILE.name}", key="eid")
mt = mt.anti_join_cols(samples_to_remove)

print(f"Samples remaining after hard filtering samples: {mt.count_cols()} ")

2022-06-25 20:26:00 Hail: INFO: Reading table without type imputation
  Loading field 'eid' as type str (not specified)


Samples remaining after hard filtering samples: 427247 


In [22]:
mt = mt.filter_cols(~mt.s.startswith("W"))

print(f"Samples remaining after removing withdrawn participants: {mt.count_cols()} ")

Samples remaining after removing withdrawn participants: 427199 


In [23]:
mt = hl.variant_qc(mt)
mt = mt.filter_rows(mt.variant_qc.p_value_hwe <= 10e-6, keep = False)
mt = mt.filter_rows((mt.variant_qc.AF[0] > 0.0) & (mt.variant_qc.AF[0] < 1.0))
mt.count_rows()

2022-06-25 20:27:13 Hail: INFO: Coerced sorted dataset
2022-06-25 20:27:14 Hail: INFO: Coerced sorted dataset
2022-06-25 20:27:15 Hail: INFO: Coerced sorted dataset


324

In [26]:
mt = mt.annotate_rows(label = hl.if_else(mt.vep.most_severe_consequence != "missense_variant",
                                         mt.vep.most_severe_consequence,
                                         mt.EVE_class))

2022-06-25 20:32:30 Hail: INFO: Ordering unsorted dataset with network shuffle


In [27]:
STAGE = "FINAL"
WRITE_PATH = f"/tmp/MC4R.{STAGE}.mt"

mt.write(WRITE_PATH, overwrite=True)

2022-06-25 20:32:31 Hail: INFO: Coerced sorted dataset
2022-06-25 20:32:32 Hail: INFO: Coerced sorted dataset
2022-06-25 20:32:32 Hail: INFO: Coerced sorted dataset
2022-06-25 20:32:33 Hail: INFO: Coerced sorted dataset
2022-06-25 20:32:33 Hail: INFO: Coerced sorted dataset
2022-06-25 20:32:34 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-06-25 20:32:35 Hail: INFO: Coerced sorted dataset
2022-06-25 20:32:36 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-06-25 20:32:36 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-06-25 20:32:36 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-06-25 20:32:57 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-06-25 20:32:58 Hail: INFO: Coerced sorted dataset
2022-06-25 20:35:55 Hail: INFO: wrote matrix table with 324 rows and 427199 columns in 3 partitions to /tmp/MC4R.FINAL.mt
    Total size: 354.84 MiB
    * Rows/entries: 352.37 MiB
    * Columns: 2.47 MiB
    * Globals: 1

In [28]:
intr = mt.rows()
intr = intr.select(intr.variant_qc, intr.protCons, intr.label)
intr = intr.annotate(**intr.variant_qc)
intr = intr.drop(
    "variant_qc",
    "gq_stats",
    "dp_stats",
)

In [45]:
intr.export("/tmp/MC4R_variants_full.tsv")

2022-06-21 02:15:45 Hail: INFO: merging 2 files totalling 51.7K...
2022-06-21 02:15:45 Hail: INFO: while writing:
    /tmp/MC4R_variants_full.tsv
  merge time: 48.332ms


In [34]:
from matrixtables import write_bgen, add_varid

In [40]:
BGEN_FILE = Path("/opt/notebooks/gogoGPCR/tmp/MC4R").resolve().__str__()
ANNOTATIONS_FILE = Path("/opt/notebooks/gogoGPCR/tmp/MC4R.annotations").resolve().__str__()
SETLIST_FILE = Path("/opt/notebooks/gogoGPCR/tmp/MC4R.setlist").resolve().__str__()

In [32]:
write_bgen(mt, "file:" + BGEN_FILE)

2022-06-25 20:42:24 Hail: INFO: Coerced sorted dataset
2022-06-25 20:42:26 Hail: INFO: Coerced sorted dataset
2022-06-25 20:42:42 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-06-25 20:42:43 Hail: INFO: Coerced sorted dataset
2022-06-25 20:44:23 Hail: INFO: while writing:
    file:/opt/notebooks/gogoGPCR/tmp/MC4R.bgen
  merge time: 204.840ms


In [36]:
# %%
# ANNOTATIONS

mt = add_varid(mt)

annotations = (
    mt.select_rows(
        varid=mt.varid,
        gene=mt.vep.transcript_consequences.gene_symbol[0],
        annotation=mt.label,
    )
    .rows()
    .key_by("varid")
    .drop("locus")
    .drop("alleles")
)

In [39]:
annotations.export("file:" + ANNOTATIONS_FILE, header=False)

2022-06-25 20:46:15 Hail: INFO: Coerced sorted dataset
2022-06-25 20:46:16 Hail: INFO: Coerced sorted dataset
2022-06-25 20:46:34 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-06-25 20:46:35 Hail: INFO: Coerced sorted dataset
2022-06-25 20:46:36 Hail: INFO: Coerced sorted dataset
2022-06-25 20:46:37 Hail: INFO: Coerced sorted dataset
2022-06-25 20:46:38 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-06-25 20:46:38 Hail: INFO: Coerced sorted dataset
2022-06-25 20:46:40 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-06-25 20:46:40 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-06-25 20:46:40 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-06-25 20:47:30 Hail: INFO: Coerced sorted dataset
2022-06-25 20:48:03 Hail: INFO: merging 2 files totalling 11.5K...
2022-06-25 20:48:03 Hail: INFO: while writing:
    file:/opt/notebooks/gogoGPCR/tmp/MC4R.annotations
  merge time: 55.485ms


In [41]:
position = mt.aggregate_rows(hl.agg.min(mt.locus.position))
names = mt.varid.collect()
names_str = ",".join(names)

line = f"{mt.vep.transcript_consequences.gene_symbol[0].collect()[0]}\t{mt.locus.contig.collect()[0]}\t{position}\t{names_str}"

with open(SETLIST_FILE, "w") as f:
    f.write(line)


2022-06-25 20:48:04 Hail: INFO: Coerced sorted dataset
2022-06-25 20:48:05 Hail: INFO: Coerced sorted dataset
2022-06-25 20:48:06 Hail: INFO: Coerced sorted dataset
2022-06-25 20:48:28 Hail: INFO: Coerced sorted dataset
2022-06-25 20:48:29 Hail: INFO: Coerced sorted dataset
2022-06-25 20:48:46 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-06-25 20:48:47 Hail: INFO: Coerced sorted dataset
2022-06-25 20:49:37 Hail: INFO: Coerced sorted dataset
2022-06-25 20:49:39 Hail: INFO: Coerced sorted dataset
2022-06-25 20:49:55 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-06-25 20:49:56 Hail: INFO: Coerced sorted dataset
2022-06-25 20:50:47 Hail: INFO: Coerced sorted dataset
2022-06-25 20:50:48 Hail: INFO: Coerced sorted dataset
2022-06-25 20:51:04 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-06-25 20:51:05 Hail: INFO: Coerced sorted dataset


In [43]:
bgen_file = BGEN_FILE + ".bgen"
sample_file = BGEN_FILE + ".sample"

run(
    [
        "dx",
        "upload",
        bgen_file,
        sample_file,
        ANNOTATIONS_FILE,
        SETLIST_FILE,
        "--path",
        "/Data/burden/",
    ],
    check=True,
    shell=False,
)

CompletedProcess(args=['dx', 'upload', '/opt/notebooks/gogoGPCR/tmp/MC4R.bgen', '/opt/notebooks/gogoGPCR/tmp/MC4R.sample', '/opt/notebooks/gogoGPCR/tmp/MC4R.annotations', '/opt/notebooks/gogoGPCR/tmp/MC4R.setlist', '--path', '/Data/burden/'], returncode=0)

In [44]:
mt.show()

2022-06-25 22:09:31 Hail: INFO: Coerced sorted dataset
2022-06-25 22:09:32 Hail: INFO: Coerced sorted dataset
2022-06-25 22:09:48 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-06-25 22:09:49 Hail: INFO: Coerced sorted dataset


locus,alleles
locus<GRCh38>,array<str>
chr18:60371355,"[""T"",""C""]"
chr18:60371356,"[""A"",""G""]"
chr18:60371358,"[""C"",""T""]"
chr18:60371359,"[""T"",""G""]"
chr18:60371365,"[""A"",""C""]"
chr18:60371366,"[""C"",""T""]"
chr18:60371367,"[""A"",""T""]"
chr18:60371367,"[""AAGTC"",""A""]"
chr18:60371369,"[""G"",""C""]"
chr18:60371374,"[""A"",""G""]"


In [47]:
mt.filter_rows(mt.protCons == "V103I").info.AF.show()

2022-06-25 22:11:31 Hail: INFO: Coerced sorted dataset
2022-06-25 22:11:32 Hail: INFO: Coerced sorted dataset
2022-06-25 22:11:49 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-06-25 22:11:50 Hail: INFO: Coerced sorted dataset


locus,alleles,Unnamed: 2_level_0
locus<GRCh38>,array<str>,array<float64>
chr18:60372043,"[""C"",""T""]",[1.98e-02]


In [46]:
mt.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'is_interesting_sample': bool
----------------------------------------
Row fields:
    'locus': locus<GRCh38>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AF: array<float64>, 
        AQ: array<int32>, 
        AC: array<int32>, 
        AN: int32
    }
    'a_index': int32
    'was_split': bool
    'vep': struct {
        assembly_name: str, 
        allele_string: str, 
        ancestral: str, 
        colocated_variants: array<struct {
            aa_allele: str, 
            aa_maf: float64, 
            afr_allele: str, 
            afr_maf: float64, 
            allele_string: str, 
            amr_allele: str, 
            amr_maf: float64, 
            clin_sig: array<str>, 
            end: int32, 
            eas_allele: str, 
            eas_maf: float64, 
            ea