In [1]:
from pathlib import Path
import pandas as pd
import pyspark
import dxpy
import hail as hl
from datetime import datetime
from matrixtables import import_mt, interval_qc_mt, smart_split_multi_mt
from subprocess import run

from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
output_notebook()

Path("/tmp").resolve().mkdir(parents=True, exist_ok=True)


In [2]:
# Spark and Hail
VCF_DIR = Path("/mnt/project/Bulk/Exome sequences/Population level exome OQFE variants, pVCF format - final release/")
DATABASE = "matrix_tables"
REFERENCE_GENOME = 'GRCh38'

LOG_FILE = (
    Path("../hail_logs", f"GIPR_{datetime.now().strftime('%H%M')}.log")
    .resolve()
    .__str__()
)

sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)

try:
    mt_database = dxpy.find_one_data_object(name=DATABASE)["id"]
except Exception as e:
    spark.sql(f"CREATE DATABASE {DATABASE} LOCATION  'dnax://'")
    mt_database = dxpy.find_one_data_object(name=DATABASE)["id"]

hl.init(sc=sc, default_reference=REFERENCE_GENOME, log=LOG_FILE)

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 2.4.4
SparkUI available at http://ip-10-60-93-193.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.78-b17627756568
LOGGING: writing to /opt/notebooks/gogoGPCR/notebooks/hail_logs/GIPR_0851.log


In [3]:
# Read in metadata and region
MAPPING_FILE = Path("../../data/misc/mappings_with_blocks.tsv").resolve()
mapping = pd.read_csv(MAPPING_FILE, sep="\t").set_index("HGNC", drop=False)
mapping.loc["GIPR",:]

HGNC                                                                     GIPR
entry_name                                                         gipr_human
name                                                             GIP receptor
accession                                                              P48546
family                                                        002_001_003_002
species                                                          Homo sapiens
residue_numbering_scheme                                            GPCRdb(B)
sequence                    MTTSPILQLLLRLSLCGLLLQRAETGSKGQTAGELYQRWERYRREC...
genes                                                                ['GIPR']
ENSG                                                          ENSG00000010310
ENST                                                          ENST00000590918
type                                                                     GPCR
GRCh37_start                                                    

In [4]:
# Import VCF as matrix table and filter to only GIPR
mt = import_mt(["GIPR"], mapping, vcf_dir=VCF_DIR, vcf_version="v1").key_rows_by(
    "locus", "alleles"
)

In [5]:
# Checkpoint because Hail likes checkpointing
stage = "RAW"
checkpoint_file = f"/tmp/GIPR.{stage}.cp.mt"

mt = mt.checkpoint(checkpoint_file, overwrite=True)

v, s = mt.count()
print(f"{v} variants and {s} samples after import")

2023-01-30 08:59:47 Hail: INFO: Coerced sorted dataset
2023-01-30 09:08:03 Hail: INFO: wrote matrix table with 1214 rows and 469835 columns in 1 partition to /tmp/GIPR.RAW.cp.mt


1214 variants and 469835 samples after import


In [6]:
# Filter to only WES target regions 
INTERVAL_FILE=Path("../../data/misc/xgen_plus_spikein.b38.bed").resolve()
run(["hadoop", "fs", "-put", str(INTERVAL_FILE), "/tmp"])

interval_table = hl.import_bed(
        f"/tmp/{INTERVAL_FILE.name}",
        reference_genome="GRCh38",
    )

mt = mt.filter_rows(hl.is_defined(interval_table[mt.locus]))
print(f"{mt.count_rows()} variants after interval filtering")

2023-01-30 09:08:06 Hail: INFO: Reading table without type imputation
  Loading field 'f0' as type str (user-supplied)
  Loading field 'f1' as type int32 (user-supplied)
  Loading field 'f2' as type int32 (user-supplied)
2023-01-30 09:08:07 Hail: INFO: Coerced sorted dataset


489 variants after interval filtering


In [7]:
# Split multi alleles 
mt = mt.filter_rows(mt.alleles.length() <= 6)
mt = smart_split_multi_mt(mt)

print(f"{mt.count_rows()} variants with not more than 6 alleles after splitting")

2023-01-30 09:08:11 Hail: INFO: Coerced sorted dataset
2023-01-30 09:08:13 Hail: INFO: Coerced sorted dataset
2023-01-30 09:08:15 Hail: INFO: Coerced sorted dataset


555 variants with not more than 6 alleles after splitting


In [8]:
# Annotate with VEP and generate protein consequence
VEP_JSON = Path("../../data/misc/GRCh38_VEP.json").resolve()

mt = hl.vep(mt, f"file:{VEP_JSON}")

is_MANE = mt.aggregate_rows(
    hl.agg.all(hl.is_defined(mt.vep.transcript_consequences.mane_select))
)
assert is_MANE, "Selected transcript may not be MANE Select. Check manually."

mt = mt.annotate_rows(
    protCons=mt.vep.transcript_consequences.amino_acids[0].split("/")[0]
    + hl.str(mt.vep.transcript_consequences.protein_end[0])
    + mt.vep.transcript_consequences.amino_acids[0].split("/")[-1]
)

2023-01-30 09:08:17 Hail: INFO: Coerced sorted dataset
2023-01-30 09:08:18 Hail: INFO: Coerced sorted dataset
2023-01-30 09:08:19 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-01-30 09:08:20 Hail: INFO: Coerced sorted dataset
2023-01-30 09:08:21 Hail: INFO: Coerced sorted dataset
2023-01-30 09:08:23 Hail: INFO: Coerced sorted dataset
2023-01-30 09:08:23 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-01-30 09:08:24 Hail: INFO: Coerced sorted dataset


In [9]:
# Annotate variants of interest
interesting_variants = hl.literal(["missense_variant", "stop_gained", "frameshift_variant", "inframe_deletion", "start_lost"])

mt = mt.annotate_rows(is_interesting_var = interesting_variants.contains(mt.vep.most_severe_consequence))

In [10]:
# Load in vitro data
INVITRO_FILE="/opt/notebooks/gogoGPCR/data/GIPR/GIPR.invitro.csv"
run(["hadoop", "fs", "-put", str(INVITRO_FILE), "/tmp"])

ht = hl.import_table("/tmp/GIPR.invitro.csv", delimiter = ",", impute = True)
ht = ht.filter(ht.GRCh_38_location != "")
ht = ht.annotate(variant = "chr" + ht.GRCh_38_location.replace("-", ":"))
ht = ht.annotate(**hl.parse_variant(ht.variant))
ht = ht.key_by(ht.locus, ht.alleles)
ht.export("/tmp/GIPR/GIPR.invitro.tsv")
ht.write("/tmp/GIPR.invitro.ht")

run(["hadoop", "fs", "-get", "/tmp/GIPR/GIPR.invitro.tsv", "/opt/notebooks/gogoGPCR/data/GIPR/GIPR.invitro.tsv"])

2023-01-30 09:08:45 Hail: WARN: Found 1 duplicate column. Mangled columns follows:
  'Fmut(EC50)' -> 'Fmut(EC50)_1'
2023-01-30 09:08:45 Hail: INFO: Reading table to impute column types
2023-01-30 09:08:45 Hail: WARN: Found 1 duplicate column. Mangled columns follows:
  'Fmut(EC50)' -> 'Fmut(EC50)_1'
2023-01-30 09:08:45 Hail: INFO: Finished type imputation
  Loading field '\ufeffAposA' as type str (imputed)
  Loading field 'Location' as type str (imputed)
  Loading field 'Bmax (Binding)' as type str (imputed)
  Loading field 'sem (Bmax)' as type str (imputed)
  Loading field 'logIC50 (Binding)' as type str (imputed)
  Loading field 'sem (IC50)' as type str (imputed)
  Loading field 'n (binding)' as type str (imputed)
  Loading field 'Fmut (IC50)' as type str (imputed)
  Loading field 'Emax (cAMP)' as type str (imputed)
  Loading field 'sem (EC50)' as type str (imputed)
  Loading field 'logEC50 (cAMP)' as type str (imputed)
  Loading field 'sem (cAMP)' as type str (imputed)
  Loading fie

CompletedProcess(args=['hadoop', 'fs', '-get', '/tmp/GIPR/GIPR.invitro.tsv', '/opt/notebooks/gogoGPCR/data/GIPR/GIPR.invitro.tsv'], returncode=1)

In [11]:
# Annotate with in vitro data
mt = mt.annotate_rows(**ht[mt.locus, mt.alleles])

In [None]:
# SCORE_FILE=https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/whole_genome_SNVs.tsv.gz
# INDEX=IndexFile
# curl -o $INDEX $SCORE_FILE.tbi
# tabix $SCORE_FILE $INDEX --print-header 19:45668221-45683722 | tail -n +2 > ../../data/GIPR/GIPR.CADD.tsv

In [13]:
# Import CADD scores
run(["hadoop", "fs", "-put", "../../data/GIPR/GIPR.CADD.tsv", "/tmp/GIPR.CADD.tsv"])
cadd = hl.import_table("/tmp/GIPR.CADD.tsv", impute = True)
cadd = cadd.annotate(variant_string = "chr" + hl.str(cadd.Chrom) + ":" + hl.str(cadd.Pos) + ":" + cadd.Ref + ":" + cadd.Alt)
cadd = cadd.annotate(**hl.parse_variant(cadd.variant_string))
cadd = cadd.key_by(cadd.locus, cadd.alleles)

2023-01-30 09:10:55 Hail: INFO: Reading table to impute column types
2023-01-30 09:10:55 Hail: INFO: Finished type imputation
  Loading field 'Chrom' as type int32 (imputed)
  Loading field 'Pos' as type int32 (imputed)
  Loading field 'Ref' as type str (imputed)
  Loading field 'Alt' as type str (imputed)
  Loading field 'RawScore' as type float64 (imputed)
  Loading field 'PHRED' as type float64 (imputed)


In [14]:
# Annotate with CADD data
mt = mt.annotate_rows(**cadd[mt.locus, mt.alleles])

In [16]:
# Checkpoint because Hail likes checkpointing
stage = "ANNOTATED"
checkpoint_file = f"/tmp/GIPR.{stage}.cp.mt"

mt = mt.checkpoint(checkpoint_file, overwrite=True)
# mt = hl.read_matrix_table(checkpoint_file)

mt.filter_rows(~hl.is_missing(mt['Bmax (Binding)'])).count_rows() # 32

2023-01-30 09:11:10 Hail: INFO: Coerced sorted dataset
2023-01-30 09:11:12 Hail: INFO: Coerced sorted dataset
2023-01-30 09:12:30 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-01-30 09:12:31 Hail: INFO: Coerced sorted dataset
2023-01-30 09:12:32 Hail: INFO: Coerced sorted dataset
2023-01-30 09:12:33 Hail: INFO: Coerced sorted dataset
2023-01-30 09:17:00 Hail: INFO: wrote matrix table with 555 rows and 469835 columns in 3 partitions to /tmp/GIPR.ANNOTATED.cp.mt
    Total size: 1.32 GiB
    * Rows/entries: 1.32 GiB
    * Columns: 3.07 MiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  483 rows (1.14 GiB)


32

In [17]:
# Filter oligo lots
mt.count_rows()
mt = mt.annotate_rows(DP10 = hl.agg.mean(mt.DP < 10)) # UKB recommend
mt = mt.filter_rows(mt.DP10 < 0.90) # No variants
mt.count_rows()

555

In [18]:
# Entries QC
mt = mt.annotate_entries(AB=(mt.AD[1] / hl.sum(mt.AD)))

mt = mt.filter_entries(
    (mt.GQ >= 20) &
    (
        hl.is_indel(mt.alleles[0], mt.alleles[1]) & (mt.DP >= 10)) |
        (hl.is_snp(mt.alleles[0], mt.alleles[1]) & (mt.DP >= 7)
    ) &
    (
        (mt.GT.is_hom_ref() & (mt.AB <= 0.1)) |
        (mt.GT.is_het() & (mt.AB >= 0.2) & (mt.AB <= 0.8)) |
        (mt.GT.is_hom_var() & (mt.AB >= 0.9))
    )
) # Combine Backman and Pedersen

mt = mt.compute_entry_filter_stats()

555

In [19]:
# Time for variant and sample QC
mt = hl.variant_qc(mt)
mt = hl.sample_qc(mt)

In [20]:
# Checkpoint QC
stage = "QC0"
checkpoint_file = f"/tmp/GIPR.{stage}.cp.mt"

mt = mt.checkpoint(checkpoint_file, overwrite=True)
# mt = hl.read_matrix_table(checkpoint_file)

2023-01-30 09:46:58 Hail: INFO: wrote matrix table with 555 rows and 469835 columns in 3 partitions to /tmp/GIPR.QC0.cp.mt


In [21]:
# Variant QC
mt = mt.filter_rows(~(mt.variant_qc.p_value_hwe < 10**-15)) # Backman, 2 variants, chr19:45674703 ["C","T"] "N170N" and chr19:45677071 ["G","C"]"E252D"
mt = mt.filter_rows(mt.variant_qc.call_rate > 0.99) # Backman
mt = mt.filter_rows(~hl.is_missing(mt.variant_qc.AF)) # 10 variants, no missense
mt.count_rows()

542

In [22]:
# Sample QC
mt = mt.filter_cols(mt.sample_qc. call_rate > 0.9) # 1 sample
mt = mt.filter_cols(~mt.s.startswith("W"))

print(f"Samples remaining after removing withdrawn participants: {mt.count_cols()} ")

Samples remaining after removing withdrawn participants: 469817 


In [23]:
# Ancestry check
run(["hadoop", "fs", "-put", "ancestry.csv", "/tmp"]) # from GIPR_ancestry.ipynb
ht = hl.import_table("/tmp/ancestry.csv", delimiter = ",", quote='"', missing="foo").select("PC_UKBB.eid", "group").key_by("PC_UKBB.eid")

2023-01-30 09:51:43 Hail: INFO: Reading table without type imputation
  Loading field '' as type str (not specified)
  Loading field 'PC_UKBB.eid' as type str (not specified)
  Loading field 'group' as type str (not specified)


In [24]:
mt = mt.annotate_cols(**ht[mt.s])

In [33]:
mt2 = mt.filter_rows(~hl.is_missing(mt['Bmax (Binding)']))
mt2 = mt2.annotate_cols(carrier = hl.agg.any(mt2.GT.is_non_ref()))
grp = mt2.select_cols(mt2.group, mt2.carrier).cols()
grp = grp.group_by(grp.group).aggregate(num_carriers = hl.agg.sum(grp.carrier))
grp.show(-1)
grp.export('/tmp/n_ancestries.csv')
run(["hadoop", "fs", "-get", "/tmp/n_ancestries.csv"])

2023-01-30 09:58:04 Hail: INFO: Ordering unsorted dataset with network shuffle


group,num_carriers
str,int64
"""Ashkenazi""",22
"""Caribbean""",108
"""China""",34
"""India""",110
"""Iran""",18
"""Italy""",111
"""NA""",206
"""Nigeria""",155
"""Poland""",68
"""United Kingdom""",10638


2023-01-30 09:58:33 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-01-30 09:58:33 Hail: INFO: merging 11 files totalling 139...
2023-01-30 09:58:33 Hail: INFO: while writing:
    /tmp/n_ancestries.csv
  merge time: 87.281ms


CompletedProcess(args=['hadoop', 'fs', '-get', '/tmp/n_ancestries.csv'], returncode=0)

In [34]:
# Filter only United Kingdom
mt = mt.filter_cols(mt.group == "United Kingdom")
mt.count_cols()

429597

In [35]:
# Checkpoint QC
stage = "QC1"
checkpoint_file = f"/tmp/GIPR.{stage}.cp.mt"

mt = mt.checkpoint(checkpoint_file, overwrite=True)
# mt = hl.read_matrix_table(checkpoint_file)

2023-01-30 10:01:41 Hail: INFO: wrote matrix table with 542 rows and 429597 columns in 3 partitions to /tmp/GIPR.QC1.cp.mt
