In [1]:
from pathlib import Path
import pandas as pd
import pyspark
import dxpy
import hail as hl
from datetime import datetime
from matrixtables import import_mt, interval_qc_mt, smart_split_multi_mt
from subprocess import run

from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
# Send bokeh plots to inline notebook output.
output_notebook()

# Make sure the local scratch directory is present before later cells use it.
scratch_dir = Path("/tmp").resolve()
scratch_dir.mkdir(parents=True, exist_ok=True)

In [2]:
# Session-wide configuration for Spark, Hail and DNAnexus

DATABASE = "matrix_tables"
REFERENCE_GENOME = 'GRCh38'
PROJ_NAME = "TASR"

# Absolute path of this session's Hail log; the HHMM suffix comes from the
# wall-clock time at which this cell is executed.
log_name = f"{PROJ_NAME}_{datetime.now().strftime('%H%M')}.log"
LOG_FILE = str((Path("../hail_logs") / log_name).resolve())

# Start Spark first, then hand the context to Hail
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)
hl.init(sc=sc, default_reference=REFERENCE_GENOME, log=LOG_FILE)

# Create the database in DNAX (no-op if it already exists) and look up its id
spark.sql(f"CREATE DATABASE IF NOT EXISTS {DATABASE} LOCATION 'dnax://'")
database_record = dxpy.find_one_data_object(name=DATABASE)
mt_database = database_record["id"]

KeyboardInterrupt: 

In [6]:
# Load the gene metadata table (one row per gene, indexed by HGNC symbol).
# The HGNC column is kept as a regular column as well as being the index.
R2 = ["TAS1R2"]
R3 = ["TAS1R3"]
MAPPING_FILE = Path("../data/misc/mappings_with_blocks.tsv").resolve()

mapping_raw = pd.read_csv(MAPPING_FILE, sep="\t")
mapping = mapping_raw.set_index("HGNC", drop=False)

# Show the TAS1R2 row as a sanity check
mapping.loc[R2]

Unnamed: 0_level_0,HGNC,entry_name,name,accession,family,species,residue_numbering_scheme,sequence,genes,ENSG,...,type,GRCh37_start,GRCh37_end,GRCh37_strand,GRCh38_start,GRCh38_end,GRCh38_strand,GRCh38_region,Notes,VCF_block
HGNC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TAS1R2,TAS1R2,ts1r2_human,<i>TAS1R2</i>,Q8TE23,004_003_001_002,Homo sapiens,GPCRdb(C),MGPRAKTISSLFFLLWVLAEPAENSDFYLPGDYLLGGLFSLHANMK...,"['TAS1R2', 'GPR71', 'T1R2', 'TR2']",ENSG00000179002,...,GPCR,19166093.0,19186176.0,-1.0,18839599,18859682,-1,1,,14


In [16]:
# Import the pVCF blocks covering each gene as a MatrixTable, then stack them

VCF_DIR = Path("/mnt/project/Bulk/Exome sequences/Population level exome OQFE variants, pVCF format - final release/")
FIELD_ID = 23157 # UKB field ID for latest exome data

# Both imports share the same source directory, VCF version and field id
import_options = dict(vcf_dir=VCF_DIR, vcf_version="v1", field_id=FIELD_ID)

mtR2 = import_mt(R2, mapping, **import_options).key_rows_by("locus", "alleles")
mtR3 = import_mt(R3, mapping, **import_options).key_rows_by("locus", "alleles")

# Workaround since this Hail is ancient and broken: the two genes are imported
# separately and concatenated row-wise.
mt = hl.MatrixTable.union_rows(mtR2, mtR3)

In [17]:
# Filter to only WES target regions
INTERVAL_FILE=Path("../data/misc/xgen_plus_spikein.b38.bed").resolve()

# Copy the BED file to the cluster file system so Hail can read it.
# `-f` overwrites a copy left by a previous run, and `check=True` raises
# CalledProcessError if the upload fails, instead of letting the failure
# surface later as a confusing import_bed error.
run(["hadoop", "fs", "-put", "-f", str(INTERVAL_FILE), "/tmp"], check=True)

interval_table = hl.import_bed(
        f"/tmp/{INTERVAL_FILE.name}",
        reference_genome="GRCh38",
    )

# Keep only rows whose locus falls inside one of the target intervals
mt = mt.filter_rows(hl.is_defined(interval_table[mt.locus]))


2023-03-23 09:18:27 Hail: INFO: Reading table without type imputation
  Loading field 'f0' as type str (user-supplied)
  Loading field 'f1' as type int32 (user-supplied)
  Loading field 'f2' as type int32 (user-supplied)


In [18]:
# Initial checkpoint: persist the imported, target-region-filtered matrix table
# so later cells read it from disk instead of recomputing the import.
stage = "INITIAL"
checkpoint_file = f"/tmp/{PROJ_NAME}.{stage}.cp.mt"

# NOTE(review): overwrite=False presumably makes Hail refuse to write when the
# path already exists, so re-running this cell may require removing the old
# checkpoint first -- confirm against the Hail version in use.
mt = mt.checkpoint(checkpoint_file, overwrite=False)

# Report the table dimensions (rows = variants, columns = samples)
v, s = mt.count()
print(f"{v} variants and {s} samples after import and target region filter")

2023-03-23 09:25:32 Hail: INFO: Coerced sorted dataset
2023-03-23 09:35:59 Hail: INFO: Coerced sorted dataset
2023-03-23 09:36:02 Hail: INFO: Coerced sorted dataset
2023-03-23 09:47:30 Hail: INFO: wrote matrix table with 2032 rows and 469835 columns in 2 partitions to /tmp/TASR.INITIAL.cp.mt
    Total size: 5.75 GiB
    * Rows/entries: 5.75 GiB
    * Columns: 3.07 MiB
    * Globals: 11.00 B
    * Smallest partition: 861 rows (2.37 GiB)
    * Largest partition:  1171 rows (3.38 GiB)


2032 variants and 469835 samples after import and target region filter


In [19]:
# Drop sites with more than 6 alleles, then split the remaining
# multi-allelic sites with the project helper.
mt = smart_split_multi_mt(mt.filter_rows(mt.alleles.length() <= 6))

n_split_rows = mt.count_rows()
print(f"{n_split_rows} variants with not more than 6 alleles after splitting")

2448 variants with not more than 6 alleles after splitting


In [21]:
# Run VEP and derive a short protein-consequence label per variant
VEP_JSON = Path("../data/misc/GRCh38_VEP.json").resolve()

mt = hl.vep(mt, f"file:{VEP_JSON}")

# All per-transcript annotations are read from this array field
consequences = mt.vep.transcript_consequences

# Guard: every row must have a defined mane_select projection.
# NOTE(review): this tests definedness of the projected array as a whole;
# confirm that is strict enough for the VEP configuration in use.
mane_defined = hl.is_defined(consequences.mane_select)
is_MANE = mt.aggregate_rows(hl.agg.all(mane_defined))
assert is_MANE, "Selected transcript may not be MANE Select. Check manually."

# protCons = <first amino-acid field><protein end position><last amino-acid field>,
# built from the first transcript consequence; gene is that transcript's symbol.
amino_acids = consequences.amino_acids[0].split("/")
mt = mt.annotate_rows(
    protCons=amino_acids[0] + hl.str(consequences.protein_end[0]) + amino_acids[-1],
    gene=consequences.gene_symbol[0],
)


2023-03-23 09:51:34 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-03-23 09:52:28 Hail: INFO: Ordering unsorted dataset with network shuffle


In [49]:
# Checkpoint after multi-allelic splitting and VEP annotation

# Make sure the per-row gene symbol is present before persisting. This repeats
# the `gene` annotation of the VEP cell and simply overwrites it with the same
# value if it already exists.
mt = mt.annotate_rows(gene = mt.vep.transcript_consequences.gene_symbol[0])

stage = "ANNOTATED"
checkpoint_file = f"/tmp/{PROJ_NAME}.{stage}.cp.mt"


mt = mt.checkpoint(checkpoint_file, overwrite=True)

# Report the table dimensions for this stage (the previous message was
# copy-pasted from the INITIAL checkpoint and named the wrong stage).
v, s = mt.count()
print(f"{v} variants and {s} samples after multi-allelic split and VEP annotation")

2023-03-23 11:08:47 Hail: INFO: wrote matrix table with 2448 rows and 469835 columns in 6 partitions to /tmp/TASR.ANNOTATED.cp.mt
2023-03-23 11:08:48 Hail: INFO: wrote matrix table with 2448 rows and 469835 columns in 6 partitions to /tmp/TASR.ANNOTATED.cp.mt


2448 variants and 469835 samples after import and target region filter


In [47]:
# Reload the ANNOTATED checkpoint from disk (lets a session resume here
# without re-running the import, split and VEP cells).
stage = "ANNOTATED"
checkpoint_file = f"/tmp/{PROJ_NAME}.{stage}.cp.mt"

mt = hl.read_matrix_table(checkpoint_file)

In [61]:
# Upload the ancestry assignments (from Ancestry.ipynb) to the cluster file system.
# `-f` overwrites a copy left by a previous run, and `check=True` raises
# CalledProcessError if the upload fails instead of silently continuing.
run(["hadoop", "fs", "-put", "-f", "ancestry.csv", "/tmp"], check=True)

# Keep only the sample id and ancestry group, keyed by sample id.
# NOTE(review): missing="foo" presumably stops any real value from being
# treated as missing -- confirm this is intended.
ht = (
    hl.import_table("/tmp/ancestry.csv", delimiter = ",", quote='"', missing="foo")
    .select("PC_UKBB.eid", "group")
    .key_by("PC_UKBB.eid")
)
# mt = mt.annotate_cols(**ht[mt.s])


2023-03-23 11:28:37 Hail: INFO: Reading table without type imputation
  Loading field '' as type str (not specified)
  Loading field 'PC_UKBB.eid' as type str (not specified)
  Loading field 'group' as type str (not specified)


In [56]:
# For every gene and zygosity, collect per sample the list of missense protein
# consequences carried, and checkpoint each result as its own column table.
#
# Previously this cell produced only the TAS1R3 / HOM table and had to be
# edited and re-run by hand for the other three combinations, so a fresh
# "Restart & Run All" could not create the four tables that the join cell reads.

# Genotype predicate per zygosity label
ZYGOSITY_FILTERS = {
    "HET": lambda gt: gt.is_het_ref(),
    "HOM": lambda gt: gt.is_hom_var(),
}

stage = "COLS"
for GENE in ("TAS1R2", "TAS1R3"):
    # Missense variants of the current gene only
    gene_mt = mt.filter_rows(
        (mt.gene == GENE) & (mt.vep.most_severe_consequence == "missense_variant")
    )
    for ZYG, is_carrier in ZYGOSITY_FILTERS.items():
        # Column names follow the tas1r2_het / tas1r2_hom / tas1r3_het / tas1r3_hom scheme
        column_name = f"{GENE.lower()}_{ZYG.lower()}"
        mtr2 = gene_mt.annotate_cols(
            **{
                column_name: hl.agg.filter(
                    is_carrier(gene_mt.GT), hl.agg.collect(gene_mt.protCons)
                )
            }
        )

        checkpoint_file = f"/tmp/{PROJ_NAME}.{stage}.{GENE}.{ZYG}.cp.ht"
        htr2 = mtr2.cols()
        htr2 = htr2.checkpoint(checkpoint_file, overwrite=True)

# After the loop GENE, ZYG, mtr2, htr2 and checkpoint_file hold the
# TAS1R3 / HOM values, exactly as the single-combination version left them.

2023-03-23 11:24:27 Hail: INFO: Coerced sorted dataset
2023-03-23 11:24:27 Hail: INFO: wrote table with 469835 rows in 16 partitions to /tmp/TASR.COLS.TAS1R3.HOM.cp.ht


In [67]:
# Read the four per-sample carrier tables (gene x zygosity), key each by
# sample id, and join them onto the ancestry table.
stage = "COLS"

carrier_tables = []
for GENE, ZYG in [
    ("TAS1R2", "HET"),
    ("TAS1R2", "HOM"),
    ("TAS1R3", "HET"),
    ("TAS1R3", "HOM"),
]:
    carrier_tables.append(
        hl.read_table(f"/tmp/{PROJ_NAME}.{stage}.{GENE}.{ZYG}.cp.ht").key_by("s")
    )

ht1, ht2, ht3, ht4 = carrier_tables

# Joins are applied in the same order as the tables were read
for carrier_table in carrier_tables:
    ht = ht.join(carrier_table)

In [71]:
# Add a row index, then unkey the table and drop the sample id column
# before exporting.
ht = ht.add_index().key_by().drop('PC_UKBB.eid')

ht.export("TASR_Haplotypes.tsv")

2023-03-23 11:33:54 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-03-23 11:34:13 Hail: INFO: merging 16 files totalling 26.3M...
2023-03-23 11:34:13 Hail: INFO: while writing:
    TASR_Haplotypes.tsv
  merge time: 282.482ms


In [72]:
# Copy the exported haplotype table from the cluster file system to the local
# working directory.
# NOTE(review): the exit status is not checked here (no check=True), so a
# failed copy only shows up as a non-zero returncode in the cell output.
run(["hadoop", "fs", "-get", "TASR_Haplotypes.tsv"]) 

CompletedProcess(args=['hadoop', 'fs', '-get', 'TASR_Haplotypes.tsv'], returncode=0)

In [74]:
# Summarise every variant: gene, protein consequence, most severe consequence
# and the flattened variant-QC statistics.
mt = hl.variant_qc(mt)

variant_rows = mt.rows()
intr = variant_rows.select(
    variant_rows.gene,
    variant_rows.variant_qc,
    variant_rows.protCons,
    variant_rows.vep.most_severe_consequence,
)

# Promote the variant_qc sub-fields to top-level columns, then drop the
# original struct and the two nested *_stats structs.
intr = intr.annotate(**intr.variant_qc).drop("variant_qc", "gq_stats", "dp_stats")

# Export on the cluster file system, then copy to the local working directory.
# NOTE(review): the file name says "TARS" while the project is "TASR" --
# possibly a typo; left unchanged.
variants_path = '/tmp/TARS_Variants.tsv'
intr.export(variants_path)
run(["hadoop", "fs", "-get", variants_path])

2023-03-23 11:40:31 Hail: INFO: merging 4 files totalling 415.6K...
2023-03-23 11:40:31 Hail: INFO: while writing:
    /tmp/TARS_Variants.tsv
  merge time: 41.434ms


CompletedProcess(args=['hadoop', 'fs', '-get', '/tmp/TARS_Variants.tsv'], returncode=0)