In [1]:
import dxdata
import dxpy
import hail as hl

import pyspark
import tomli
import subprocess
from matrixtables import *
from utils import get_stats
from datetime import datetime
from pprint import pprint
from pathlib import Path

# Make sure the scratch directory next to the notebook folder exists;
# creating it is a no-op when it is already there.
Path("../tmp").resolve().mkdir(
    exist_ok=True,
    parents=True,
)

In [2]:
# Parameters
# Read the run configuration from the TOML file one level above the notebook.
with open("../config.toml", "rb") as f:
    conf = tomli.load(f)

IMPORT = conf["IMPORT"]
NAME = conf["NAME"]
VCF_VERSION = IMPORT["VCF_VERSION"]
REFERENCE_GENOME = conf["REFERENCE_GENOME"]
DATABASE = IMPORT["DATABASE"]

GENE_FILE = str(Path(IMPORT["GENE_FILE"]).resolve())

# The gene file is read as one entry per line.
with open(GENE_FILE, "r") as file:
    genes = file.read().splitlines()

# "NONE" is a placeholder: fall back to the first gene in the gene file.
# This has to happen BEFORE the log file name is built, otherwise the log
# ends up being called "NONE_<time>.log".
if NAME == "NONE":
    if not genes:
        raise ValueError(
            f"NAME is 'NONE' but the gene file {GENE_FILE} contains no genes"
        )
    NAME = genes[0]

# Log file name is "<NAME>_<HHMM>.log" inside the configured log directory.
LOG_FILE = str(
    Path(IMPORT["LOG_DIR"], f"{NAME}_{datetime.now().strftime('%H%M')}.log").resolve()
)

In [3]:
# Spark and Hail

# getOrCreate() reuses a SparkContext that is already running, so re-executing
# this cell in a live kernel no longer fails with
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession(sc)

# ID of the DNAnexus data object whose name is DATABASE.
mt_database = dxpy.find_one_data_object(name=DATABASE)["id"]

# idempotent=True turns a repeated hl.init() into a no-op instead of an error,
# keeping the cell safe to re-run.
hl.init(
    sc=sc,
    default_reference=REFERENCE_GENOME,
    log=LOG_FILE,
    idempotent=True,
)

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 2.4.4
SparkUI available at http://ip-10-60-12-76.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /opt/notebooks/gogoGPCR/hail_logs/NONE_1433.log


In [4]:
# Pipeline stage whose MatrixTable is loaded in this notebook.
STAGE = "QC1"
# dnax:// path of the form <database id>/<NAME>.<STAGE>.mt
READ_PATH = f"dnax://{mt_database}/{NAME}.{STAGE}.mt"

mt = hl.read_matrix_table(READ_PATH)

# count() returns (number of rows, number of columns) = (variants, samples).
v, s = mt.count()
pprint(f"{v} variants and {s} samples after reading matrixtable")

'223 variants and 200643 samples after reading matrixtable'


In [97]:
# Load the variant -> Category annotation table from a local TSV file.
# NOTE(review): the path is hard-coded to the MC4R table while NAME comes from
# the config file -- confirm this cell is only meant to be run for MC4R.
ht = hl.import_table(
    "file:/mnt/project/Data/annotations/MC4R_annotations.tsv",
    quote='"',
    impute=True,
)
# Strip surrounding whitespace from the variant names, then key the table by
# them so it can be joined against the MatrixTable rows.
ht = ht.annotate(Variants=ht.Variants.strip()).key_by("Variants")

2021-12-06 15:15:07 Hail: INFO: Reading table to impute column types
2021-12-06 15:15:08 Hail: INFO: Finished type imputation
  Loading field 'Variants' as type str (imputed)
  Loading field 'Category' as type int32 (imputed)


In [96]:
# Pull every annotated variant name to the driver for a quick visual check.
ht["Variants"].collect()

["5'UTR 32bp vor A+G:A>C",
 'A135P',
 'A175T',
 'C271R',
 'D90N',
 'E61*',
 'G181D',
 'I170V',
 'I198I',
 'I251L',
 'N274S',
 'P272L',
 'P275S',
 'P48S',
 'S127L',
 'S77L',
 'S77L',
 'S85G+Y268H',
 'T112K',
 'T112M',
 'T11l',
 'T122T',
 'T178M',
 'V103I',
 'V103l+S127L',
 'V166I',
 'V166L',
 'Y35X+D37V',
 'Y80C',
 'c.1000+16 G>A',
 'c.811insT, p.C271fsX285']

In [82]:
# Look up each row's protCons in the annotation table and store the matching
# Category as `labels`.
mt = mt.annotate_rows(
    labels=ht[mt.protCons]["Category"],
)

In [83]:
# Number of variants (rows) that received a label from the annotation table.
mt.aggregate_rows(
    hl.agg.count_where(hl.is_defined(mt.labels))
)

2021-12-06 15:08:08 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:08:09 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:08:09 Hail: INFO: Ordering unsorted dataset with network shuffle


14

In [85]:
# Row table restricted to the variants that received a label.
found = mt.rows()
found = found.filter(hl.is_defined(found.labels))
found.show(n=-1)

2021-12-06 15:09:27 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:09:27 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:09:28 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:09:28 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:09:29 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:09:29 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:09:31 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:09:31 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:09:32 Hail: INFO: Ordering unsorted dataset with network shuffle


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,info,info,info,info,Unnamed: 9_level_0,Unnamed: 10_level_0,vep,vep,vep,vep,vep,vep,vep,vep,vep,vep,vep,vep,vep,vep,vep,vep,vep,Unnamed: 28_level_0,Unnamed: 29_level_0
locus,alleles,rsid,qual,filters,AF,AQ,AC,AN,a_index,was_split,assembly_name,allele_string,ancestral,colocated_variants,context,end,id,input,intergenic_consequences,most_severe_consequence,motif_feature_consequences,regulatory_feature_consequences,seq_region_name,start,strand,transcript_consequences,variant_class,protCons,labels
locus<GRCh38>,array<str>,str,float64,set<str>,array<float64>,array<int32>,array<int32>,int32,int32,bool,str,str,str,"array<struct{aa_allele: str, aa_maf: float64, afr_allele: str, afr_maf: float64, allele_string: str, amr_allele: str, amr_maf: float64, clin_sig: array<str>, end: int32, eas_allele: str, eas_maf: float64, ea_allele: str, ea_maf: float64, eur_allele: str, eur_maf: float64, exac_adj_allele: str, exac_adj_maf: float64, exac_allele: str, exac_afr_allele: str, exac_afr_maf: float64, exac_amr_allele: str, exac_amr_maf: float64, exac_eas_allele: str, exac_eas_maf: float64, exac_fin_allele: str, exac_fin_maf: float64, exac_maf: float64, exac_nfe_allele: str, exac_nfe_maf: float64, exac_oth_allele: str, exac_oth_maf: float64, exac_sas_allele: str, exac_sas_maf: float64, id: str, minor_allele: str, minor_allele_freq: float64, phenotype_or_disease: int32, pubmed: array<int32>, sas_allele: str, sas_maf: float64, somatic: int32, start: int32, strand: int32}>",str,int32,str,str,"array<struct{allele_num: int32, consequence_terms: array<str>, impact: str, minimised: int32, variant_allele: str}>",str,"array<struct{allele_num: int32, consequence_terms: array<str>, high_inf_pos: str, impact: str, minimised: int32, motif_feature_id: str, motif_name: str, motif_pos: int32, motif_score_change: float64, strand: int32, variant_allele: str}>","array<struct{allele_num: int32, biotype: str, consequence_terms: array<str>, impact: str, minimised: int32, regulatory_feature_id: str, variant_allele: str}>",str,int32,int32,"array<struct{allele_num: int32, amino_acids: str, appris: str, biotype: str, canonical: int32, ccds: str, cdna_start: int32, cdna_end: int32, cds_end: int32, cds_start: int32, codons: str, consequence_terms: array<str>, distance: int32, domains: array<struct{db: str, name: str}>, exon: str, gene_id: str, gene_pheno: int32, gene_symbol: str, gene_symbol_source: str, hgnc_id: str, hgvsc: str, hgvsp: str, hgvs_offset: int32, impact: str, intron: str, lof: str, 
lof_flags: str, lof_filter: str, lof_info: str, minimised: int32, polyphen_prediction: str, polyphen_score: float64, protein_end: int32, protein_start: int32, protein_id: str, sift_prediction: str, sift_score: float64, strand: int32, swissprot: str, transcript_id: str, trembl: str, tsl: int32, uniparc: str, mane_select: str, variant_allele: str}>",str,str,int32
chr18:60371527,"[""G"",""A""]","""chr18_60371527_G_A""",41.0,,[5.00e-06],[41],,,1,False,"""GRCh38""","""G/A""",,"[(NA,NA,NA,NA,""HGMD_MUTATION"",NA,NA,NA,60371527,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,""CM061846"",NA,NA,1,NA,NA,NA,NA,60371527,1),(NA,NA,NA,NA,""G/A"",NA,NA,NA,60371527,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,""rs201813179"",NA,NA,NA,NA,NA,NA,NA,60371527,1)]",,60371527,""".""","""chr18	60371527	.	G	A	.	.	GT""",,"""missense_variant""",,,"""chr18""",60371527,1,"[(1,""P/S"",""P1"",""protein_coding"",1,""CCDS11976.1"",1249,1249,823,823,""Cca/Tca"",[""missense_variant""],NA,[(""Gene3D"",""1""),(""ENSP_mappings"",""6w25""),(""Pfam"",""PF00001""),(""Prints"",""PR00534""),(""PROSITE_profiles"",""PS50262""),(""PANTHER"",""PTHR22750""),(""PANTHER"",""PTHR22750""),(""SMART"",""SM01381""),(""Superfamily"",""SSF81321""),(""CDD"",""cd15353"")],""1/1"",""ENSG00000166603"",1,""MC4R"",""HGNC"",""HGNC:6932"",""ENST00000299766.5:c.823C>T"",""ENSP00000299766.3:p.Pro275Ser"",NA,""MODERATE"",NA,NA,NA,NA,NA,NA,""benign"",3.00e-01,275,275,""ENSP00000299766"",""deleterious"",2.00e-02,-1,NA,""ENST00000299766"",NA,NA,NA,""NM_005912.3"",""A"")]","""SNV""","""P275S""",4
chr18:60371529,"[""T"",""C""]","""chr18_60371529_T_C""",45.0,,[2.00e-06],[45],,,1,False,"""GRCh38""","""T/C""",,"[(NA,NA,NA,NA,""HGMD_MUTATION"",NA,NA,NA,60371529,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,""CM012148"",NA,NA,1,NA,NA,NA,NA,60371529,1),(NA,NA,NA,NA,""T/C"",NA,NA,[""uncertain_significance"",""pathogenic""],60371529,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,""rs121913561"",NA,NA,1,[11443223],NA,NA,NA,60371529,1)]",,60371529,""".""","""chr18	60371529	.	T	C	.	.	GT""",,"""missense_variant""",,,"""chr18""",60371529,1,"[(1,""N/S"",""P1"",""protein_coding"",1,""CCDS11976.1"",1247,1247,821,821,""aAt/aGt"",[""missense_variant""],NA,[(""Gene3D"",""1""),(""ENSP_mappings"",""6w25""),(""Pfam"",""PF00001""),(""Prints"",""PR00534""),(""PROSITE_profiles"",""PS50262""),(""PANTHER"",""PTHR22750""),(""PANTHER"",""PTHR22750""),(""SMART"",""SM01381""),(""Superfamily"",""SSF81321""),(""CDD"",""cd15353"")],""1/1"",""ENSG00000166603"",1,""MC4R"",""HGNC"",""HGNC:6932"",""ENST00000299766.5:c.821A>G"",""ENSP00000299766.3:p.Asn274Ser"",NA,""MODERATE"",NA,NA,NA,NA,NA,NA,""possibly_damaging"",8.59e-01,274,274,""ENSP00000299766"",""tolerated"",9.00e-02,-1,NA,""ENST00000299766"",NA,NA,NA,""NM_005912.3"",""C"")]","""SNV""","""N274S""",1
chr18:60371539,"[""A"",""G""]","""chr18_60371539_A_G""",46.0,,[2.00e-06],[46],,,1,False,"""GRCh38""","""A/G""",,"[(NA,NA,NA,NA,""HGMD_MUTATION"",NA,NA,NA,60371539,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,""CM035931"",NA,NA,1,NA,NA,NA,NA,60371539,1),(NA,NA,NA,NA,""A/G"",NA,NA,[""pathogenic""],60371539,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,""rs1057517991"",NA,NA,1,NA,NA,NA,NA,60371539,1)]",,60371539,""".""","""chr18	60371539	.	A	G	.	.	GT""",,"""missense_variant""",,,"""chr18""",60371539,1,"[(1,""C/R"",""P1"",""protein_coding"",1,""CCDS11976.1"",1237,1237,811,811,""Tgt/Cgt"",[""missense_variant""],NA,[(""Gene3D"",""1""),(""ENSP_mappings"",""6w25""),(""Pfam"",""PF00001""),(""Prints"",""PR00534""),(""Prints"",""PR00535""),(""PROSITE_profiles"",""PS50262""),(""PANTHER"",""PTHR22750""),(""PANTHER"",""PTHR22750""),(""SMART"",""SM01381""),(""Superfamily"",""SSF81321""),(""CDD"",""cd15353"")],""1/1"",""ENSG00000166603"",1,""MC4R"",""HGNC"",""HGNC:6932"",""ENST00000299766.5:c.811T>C"",""ENSP00000299766.3:p.Cys271Arg"",NA,""MODERATE"",NA,NA,NA,NA,NA,NA,""probably_damaging"",1.00e+00,271,271,""ENSP00000299766"",""deleterious"",0.00e+00,-1,NA,""ENST00000299766"",NA,NA,NA,""NM_005912.3"",""G"")]","""SNV""","""C271R""",2
chr18:60371599,"[""T"",""G""]","""chr18_60371599_T_G;chr18_60371598_ATC_A""",53.0,,"[1.30e-02,3.50e-05]","[53,51]",,,1,True,"""GRCh38""","""T/G""",,"[(NA,NA,NA,NA,""HGMD_MUTATION"",NA,NA,NA,60371599,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,""CD032475"",NA,NA,1,NA,NA,NA,NA,60371599,1),(NA,NA,NA,NA,""HGMD_MUTATION"",NA,NA,NA,60371599,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,""CM030483"",NA,NA,1,NA,NA,NA,NA,60371599,1),(NA,NA,NA,NA,""COSMIC_MUTATION"",NA,NA,NA,60371599,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,""COSV99079230"",NA,NA,1,NA,NA,NA,1,60371599,1),(NA,NA,NA,NA,""T/G"",NA,NA,[""benign"",""likely_benign""],60371599,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,""rs52820871"",""G"",2.60e-03,1,[25741868,20127379,26474449,23049848,24820477,12646666,26788538,21085626,10199800,15037865,23251400,30134862,30121879,31035493],NA,NA,NA,60371599,1)]",,60371599,""".""","""chr18	60371599	.	T	G	.	.	GT""",,"""missense_variant""",,,"""chr18""",60371599,1,"[(1,""I/L"",""P1"",""protein_coding"",1,""CCDS11976.1"",1177,1177,751,751,""Att/Ctt"",[""missense_variant""],NA,[(""Gene3D"",""1""),(""ENSP_mappings"",""6w25""),(""Pfam"",""PF00001""),(""Prints"",""PR00237""),(""Prints"",""PR00534""),(""PROSITE_profiles"",""PS50262""),(""PANTHER"",""PTHR22750""),(""PANTHER"",""PTHR22750""),(""SMART"",""SM01381""),(""Superfamily"",""SSF81321""),(""Transmembrane_helices"",""TMhelix""),(""CDD"",""cd15353"")],""1/1"",""ENSG00000166603"",1,""MC4R"",""HGNC"",""HGNC:6932"",""ENST00000299766.5:c.751A>C"",""ENSP00000299766.3:p.Ile251Leu"",NA,""MODERATE"",NA,NA,NA,NA,NA,NA,""benign"",5.00e-03,251,251,""ENSP00000299766"",""tolerated"",1.00e+00,-1,NA,""ENST00000299766"",NA,NA,NA,""NM_005912.3"",""G"")]","""SNV""","""I251L""",1
chr18:60371756,"[""G"",""A""]","""chr18_60371756_G_A""",55.0,,[9.57e-04],[55],,,1,False,"""GRCh38""","""G/A""",,"[(NA,NA,NA,NA,""G/A"",NA,NA,[""benign""],60371756,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,""rs61741819"",""A"",1.28e-02,1,[25741868,26788538],NA,NA,NA,60371756,1)]",,60371756,""".""","""chr18	60371756	.	G	A	.	.	GT""",,"""synonymous_variant""",,,"""chr18""",60371756,1,"[(1,""I"",""P1"",""protein_coding"",1,""CCDS11976.1"",1020,1020,594,594,""atC/atT"",[""synonymous_variant""],NA,[(""Gene3D"",""1""),(""ENSP_mappings"",""6w25""),(""Pfam"",""PF00001""),(""Prints"",""PR00237""),(""Prints"",""PR00535""),(""PROSITE_profiles"",""PS50262""),(""PANTHER"",""PTHR22750""),(""PANTHER"",""PTHR22750""),(""SMART"",""SM01381""),(""Superfamily"",""SSF81321""),(""Transmembrane_helices"",""TMhelix""),(""CDD"",""cd15353"")],""1/1"",""ENSG00000166603"",1,""MC4R"",""HGNC"",""HGNC:6932"",""ENST00000299766.5:c.594C>T"",""ENSP00000299766.3:p.Ile198="",NA,""LOW"",NA,NA,NA,NA,NA,NA,NA,NA,198,198,""ENSP00000299766"",NA,NA,-1,NA,""ENST00000299766"",NA,NA,NA,""NM_005912.3"",""A"")]","""SNV""","""I198I""",4
chr18:60371808,"[""C"",""T""]","""chr18_60371808_C_T""",47.0,,[2.00e-06],[47],,,1,False,"""GRCh38""","""C/T""",,"[(NA,NA,NA,NA,""HGMD_MUTATION"",NA,NA,NA,60371808,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,""CM032287"",NA,NA,1,NA,NA,NA,NA,60371808,1),(NA,NA,NA,NA,""C/T"",NA,NA,[""pathogenic""],60371808,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,""rs13447333"",NA,NA,1,NA,NA,NA,NA,60371808,1)]",,60371808,""".""","""chr18	60371808	.	C	T	.	.	GT""",,"""missense_variant""",,,"""chr18""",60371808,1,"[(1,""G/D"",""P1"",""protein_coding"",1,""CCDS11976.1"",968,968,542,542,""gGc/gAc"",[""missense_variant""],NA,[(""Gene3D"",""1""),(""ENSP_mappings"",""6w25""),(""Pfam"",""PF00001""),(""PROSITE_profiles"",""PS50262""),(""PANTHER"",""PTHR22750""),(""PANTHER"",""PTHR22750""),(""SMART"",""SM01381""),(""Superfamily"",""SSF81321""),(""Transmembrane_helices"",""TMhelix""),(""CDD"",""cd15353"")],""1/1"",""ENSG00000166603"",1,""MC4R"",""HGNC"",""HGNC:6932"",""ENST00000299766.5:c.542G>A"",""ENSP00000299766.3:p.Gly181Asp"",NA,""MODERATE"",NA,NA,NA,NA,NA,NA,""probably_damaging"",1.00e+00,181,181,""ENSP00000299766"",""deleterious"",0.00e+00,-1,NA,""ENST00000299766"",NA,NA,NA,""NM_005912.3"",""T"")]","""SNV""","""G181D""",2
chr18:60371817,"[""G"",""A""]","""chr18_60371817_G_A""",50.0,,[7.00e-06],[50],,,1,False,"""GRCh38""","""G/A""",,"[(NA,NA,NA,NA,""COSMIC_MUTATION"",NA,NA,NA,60371817,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,""COSV55352328"",NA,NA,1,NA,NA,NA,1,60371817,1),(NA,NA,NA,NA,""G/A"",NA,NA,NA,60371817,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,""rs140040360"",NA,NA,NA,NA,NA,NA,NA,60371817,1)]",,60371817,""".""","""chr18	60371817	.	G	A	.	.	GT""",,"""missense_variant""",,,"""chr18""",60371817,1,"[(1,""T/M"",""P1"",""protein_coding"",1,""CCDS11976.1"",959,959,533,533,""aCg/aTg"",[""missense_variant""],NA,[(""Gene3D"",""1""),(""ENSP_mappings"",""6w25""),(""Pfam"",""PF00001""),(""PROSITE_profiles"",""PS50262""),(""PANTHER"",""PTHR22750""),(""PANTHER"",""PTHR22750""),(""SMART"",""SM01381""),(""Superfamily"",""SSF81321""),(""Transmembrane_helices"",""TMhelix""),(""CDD"",""cd15353"")],""1/1"",""ENSG00000166603"",1,""MC4R"",""HGNC"",""HGNC:6932"",""ENST00000299766.5:c.533C>T"",""ENSP00000299766.3:p.Thr178Met"",NA,""MODERATE"",NA,NA,NA,NA,NA,NA,""possibly_damaging"",9.08e-01,178,178,""ENSP00000299766"",""tolerated"",8.00e-02,-1,NA,""ENST00000299766"",NA,NA,NA,""NM_005912.3"",""A"")]","""SNV""","""T178M""",1
chr18:60371827,"[""C"",""T""]","""chr18_60371827_C_T""",48.0,,[1.37e-04],[48],,,1,False,"""GRCh38""","""C/T""",,"[(NA,NA,NA,NA,""HGMD_MUTATION"",NA,NA,NA,60371827,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,""CM030690"",NA,NA,1,NA,NA,NA,NA,60371827,1),(NA,NA,NA,NA,""C/T"",NA,NA,[""uncertain_significance"",""pathogenic""],60371827,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,""rs121913563"",NA,NA,1,[25333069,12646665],NA,NA,NA,60371827,1)]",,60371827,""".""","""chr18	60371827	.	C	T	.	.	GT""",,"""missense_variant""",,,"""chr18""",60371827,1,"[(1,""A/T"",""P1"",""protein_coding"",1,""CCDS11976.1"",949,949,523,523,""Gca/Aca"",[""missense_variant""],NA,[(""Gene3D"",""1""),(""ENSP_mappings"",""6w25""),(""Pfam"",""PF00001""),(""PROSITE_profiles"",""PS50262""),(""PANTHER"",""PTHR22750""),(""PANTHER"",""PTHR22750""),(""SMART"",""SM01381""),(""Superfamily"",""SSF81321""),(""Transmembrane_helices"",""TMhelix""),(""CDD"",""cd15353"")],""1/1"",""ENSG00000166603"",1,""MC4R"",""HGNC"",""HGNC:6932"",""ENST00000299766.5:c.523G>A"",""ENSP00000299766.3:p.Ala175Thr"",NA,""MODERATE"",NA,NA,NA,NA,NA,NA,""benign"",2.30e-02,175,175,""ENSP00000299766"",""tolerated"",1.00e+00,-1,NA,""ENST00000299766"",NA,NA,NA,""NM_005912.3"",""T"")]","""SNV""","""A175T""",1
chr18:60371842,"[""T"",""C""]","""chr18_60371842_T_C""",54.0,,[1.10e-04],[54],,,1,False,"""GRCh38""","""T/C""",,"[(NA,NA,NA,NA,""HGMD_MUTATION"",NA,NA,NA,60371842,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,""CM003761"",NA,NA,1,NA,NA,NA,NA,60371842,1),(NA,NA,NA,NA,""T/C"",NA,NA,[""pathogenic""],60371842,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,""rs121913560"",""C"",8.00e-04,1,[11487744],NA,NA,NA,60371842,1)]",,60371842,""".""","""chr18	60371842	.	T	C	.	.	GT""",,"""missense_variant""",,,"""chr18""",60371842,1,"[(1,""I/V"",""P1"",""protein_coding"",1,""CCDS11976.1"",934,934,508,508,""Ata/Gta"",[""missense_variant""],NA,[(""Gene3D"",""1""),(""ENSP_mappings"",""6w25""),(""Pfam"",""PF00001""),(""PROSITE_profiles"",""PS50262""),(""PANTHER"",""PTHR22750""),(""PANTHER"",""PTHR22750""),(""SMART"",""SM01381""),(""Superfamily"",""SSF81321""),(""Transmembrane_helices"",""TMhelix""),(""CDD"",""cd15353"")],""1/1"",""ENSG00000166603"",1,""MC4R"",""HGNC"",""HGNC:6932"",""ENST00000299766.5:c.508A>G"",""ENSP00000299766.3:p.Ile170Val"",NA,""MODERATE"",NA,NA,NA,NA,NA,NA,""possibly_damaging"",6.39e-01,170,170,""ENSP00000299766"",""tolerated"",6.00e-02,-1,NA,""ENST00000299766"",NA,NA,NA,""NM_005912.3"",""C"")]","""SNV""","""I170V""",1
chr18:60371854,"[""C"",""T""]","""chr18_60371854_C_T""",45.0,,[2.00e-06],[45],,,1,False,"""GRCh38""","""C/T""",,"[(NA,NA,NA,NA,""HGMD_MUTATION"",NA,NA,NA,60371854,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,""CM061845"",NA,NA,1,NA,NA,NA,NA,60371854,1),(NA,NA,NA,NA,""C/T"",NA,NA,[""pathogenic""],60371854,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,""rs942758928"",NA,NA,1,NA,NA,NA,NA,60371854,1)]",,60371854,""".""","""chr18	60371854	.	C	T	.	.	GT""",,"""missense_variant""",,,"""chr18""",60371854,1,"[(1,""V/I"",""P1"",""protein_coding"",1,""CCDS11976.1"",922,922,496,496,""Gtt/Att"",[""missense_variant""],NA,[(""Gene3D"",""1""),(""ENSP_mappings"",""6w25""),(""Pfam"",""PF00001""),(""PROSITE_profiles"",""PS50262""),(""PANTHER"",""PTHR22750""),(""PANTHER"",""PTHR22750""),(""SMART"",""SM01381""),(""Superfamily"",""SSF81321""),(""Transmembrane_helices"",""TMhelix""),(""CDD"",""cd15353"")],""1/1"",""ENSG00000166603"",1,""MC4R"",""HGNC"",""HGNC:6932"",""ENST00000299766.5:c.496G>A"",""ENSP00000299766.3:p.Val166Ile"",NA,""MODERATE"",NA,NA,NA,NA,NA,NA,""benign"",2.81e-01,166,166,""ENSP00000299766"",""deleterious"",0.00e+00,-1,NA,""ENST00000299766"",NA,NA,NA,""NM_005912.3"",""T"")]","""SNV""","""V166I""",2


In [87]:
# Annotation entries whose variant name matches no labelled row: re-key the
# labelled rows by protCons and keep only the annotation rows without a match.
not_found = ht.anti_join(
    found.key_by(found.protCons)
)

In [88]:
# Display the annotation entries that could not be matched.
not_found.show(n=-1)

2021-12-06 15:10:05 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:10:07 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:10:08 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:10:11 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:10:13 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:10:14 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:10:15 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:10:17 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:10:18 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:10:19 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:10:22 Hail: INFO: Ordering unsorted dataset with network shuffle


Variants,Category
str,int32
"""5'UTR 32bp vor A+G:A>C""",4
"""A135P""",2
"""D90N""",2
"""E61*""",2
"""P272L""",2
"""P48S""",4
"""S77L""",1
"""S77L""",1
"""S85G+Y268H""",2
"""T112K""",4


In [89]:
# Row table restricted to the variants that did NOT receive a label.
no_label = mt.rows()
no_label = no_label.filter(hl.is_missing(no_label.labels))
# Promote the fields of the first VEP transcript consequence to top-level
# columns, then drop the bulky nested `vep` struct to keep the display readable.
no_label = no_label.annotate(
    **no_label.vep.transcript_consequences[0]
).drop("vep")
no_label.show(n=-1)

2021-12-06 15:10:25 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:10:26 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:10:28 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:10:29 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:10:29 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:10:30 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:10:31 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:10:32 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-12-06 15:10:32 Hail: INFO: Ordering unsorted dataset with network shuffle


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,info,info,info,info,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0,Unnamed: 18_level_0,Unnamed: 19_level_0,Unnamed: 20_level_0,Unnamed: 21_level_0,Unnamed: 22_level_0,Unnamed: 23_level_0,Unnamed: 24_level_0,Unnamed: 25_level_0,Unnamed: 26_level_0,Unnamed: 27_level_0,Unnamed: 28_level_0,Unnamed: 29_level_0,Unnamed: 30_level_0,Unnamed: 31_level_0,Unnamed: 32_level_0,Unnamed: 33_level_0,Unnamed: 34_level_0,Unnamed: 35_level_0,Unnamed: 36_level_0,Unnamed: 37_level_0,Unnamed: 38_level_0,Unnamed: 39_level_0,Unnamed: 40_level_0,Unnamed: 41_level_0,Unnamed: 42_level_0,Unnamed: 43_level_0,Unnamed: 44_level_0,Unnamed: 45_level_0,Unnamed: 46_level_0,Unnamed: 47_level_0,Unnamed: 48_level_0,Unnamed: 49_level_0,Unnamed: 50_level_0,Unnamed: 51_level_0,Unnamed: 52_level_0,Unnamed: 53_level_0,Unnamed: 54_level_0,Unnamed: 55_level_0,Unnamed: 56_level_0,Unnamed: 57_level_0
locus,alleles,rsid,qual,filters,AF,AQ,AC,AN,a_index,was_split,protCons,labels,allele_num,amino_acids,appris,biotype,canonical,ccds,cdna_start,cdna_end,cds_end,cds_start,codons,consequence_terms,distance,domains,exon,gene_id,gene_pheno,gene_symbol,gene_symbol_source,hgnc_id,hgvsc,hgvsp,hgvs_offset,impact,intron,lof,lof_flags,lof_filter,lof_info,minimised,polyphen_prediction,polyphen_score,protein_end,protein_start,protein_id,sift_prediction,sift_score,strand,swissprot,transcript_id,trembl,tsl,uniparc,mane_select,variant_allele
locus<GRCh38>,array<str>,str,float64,set<str>,array<float64>,array<int32>,array<int32>,int32,int32,bool,str,int32,int32,str,str,str,int32,str,int32,int32,int32,int32,str,array<str>,int32,"array<struct{db: str, name: str}>",str,str,int32,str,str,str,str,str,int32,str,str,str,str,str,str,int32,str,float64,int32,int32,str,str,float64,int32,str,str,str,int32,str,str,str
chr18:60371355,"[""T"",""C""]","""chr18_60371355_T_C""",49.0,,[1.70e-05],[49],,,1,False,"""Y332C""",,1,"""Y/C""","""P1""","""protein_coding""",1,"""CCDS11976.1""",1421,1421,995,995,"""tAt/tGt""","[""missense_variant""]",,,"""1/1""","""ENSG00000166603""",1,"""MC4R""","""HGNC""","""HGNC:6932""","""ENST00000299766.5:c.995A>G""","""ENSP00000299766.3:p.Tyr332Cys""",,"""MODERATE""",,,,,,,"""probably_damaging""",0.976,332,332,"""ENSP00000299766""","""deleterious_low_confidence""",0.0,-1,,"""ENST00000299766""",,,,"""NM_005912.3""","""C"""
chr18:60371356,"[""A"",""G""]","""chr18_60371356_A_G""",43.0,,[7.00e-06],[43],,,1,False,"""Y332H""",,1,"""Y/H""","""P1""","""protein_coding""",1,"""CCDS11976.1""",1420,1420,994,994,"""Tat/Cat""","[""missense_variant""]",,,"""1/1""","""ENSG00000166603""",1,"""MC4R""","""HGNC""","""HGNC:6932""","""ENST00000299766.5:c.994T>C""","""ENSP00000299766.3:p.Tyr332His""",,"""MODERATE""",,,,,,,"""probably_damaging""",0.968,332,332,"""ENSP00000299766""","""deleterious_low_confidence""",0.0,-1,,"""ENST00000299766""",,,,"""NM_005912.3""","""G"""
chr18:60371358,"[""C"",""T""]","""chr18_60371358_C_T""",46.0,,[2.00e-06],[46],,,1,False,"""R331K""",,1,"""R/K""","""P1""","""protein_coding""",1,"""CCDS11976.1""",1418,1418,992,992,"""aGa/aAa""","[""missense_variant""]",,"[(""Prints"",""PR01062""),(""PANTHER"",""PTHR22750""),(""PANTHER"",""PTHR22750"")]","""1/1""","""ENSG00000166603""",1,"""MC4R""","""HGNC""","""HGNC:6932""","""ENST00000299766.5:c.992G>A""","""ENSP00000299766.3:p.Arg331Lys""",,"""MODERATE""",,,,,,,"""benign""",0.0,331,331,"""ENSP00000299766""","""tolerated_low_confidence""",0.84,-1,,"""ENST00000299766""",,,,"""NM_005912.3""","""T"""
chr18:60371359,"[""T"",""G""]","""chr18_60371359_T_G""",44.0,,[2.00e-06],[44],,,1,False,"""R331R""",,1,"""R""","""P1""","""protein_coding""",1,"""CCDS11976.1""",1417,1417,991,991,"""Aga/Cga""","[""synonymous_variant""]",,"[(""Prints"",""PR01062""),(""PANTHER"",""PTHR22750""),(""PANTHER"",""PTHR22750"")]","""1/1""","""ENSG00000166603""",1,"""MC4R""","""HGNC""","""HGNC:6932""","""ENST00000299766.5:c.991A>C""","""ENSP00000299766.3:p.Arg331=""",,"""LOW""",,,,,,,,,331,331,"""ENSP00000299766""",,,-1,,"""ENST00000299766""",,,,"""NM_005912.3""","""G"""
chr18:60371365,"[""A"",""C""]","""chr18_60371365_A_C""",40.0,,[2.00e-06],[40],,,1,False,"""S329A""",,1,"""S/A""","""P1""","""protein_coding""",1,"""CCDS11976.1""",1411,1411,985,985,"""Tct/Gct""","[""missense_variant""]",,"[(""Prints"",""PR01062""),(""PANTHER"",""PTHR22750""),(""PANTHER"",""PTHR22750"")]","""1/1""","""ENSG00000166603""",1,"""MC4R""","""HGNC""","""HGNC:6932""","""ENST00000299766.5:c.985T>G""","""ENSP00000299766.3:p.Ser329Ala""",,"""MODERATE""",,,,,,,"""benign""",0.0,329,329,"""ENSP00000299766""","""tolerated_low_confidence""",0.16,-1,,"""ENST00000299766""",,,,"""NM_005912.3""","""C"""
chr18:60371374,"[""A"",""G""]","""chr18_60371374_A_G""",41.0,,[5.00e-06],[41],,,1,False,"""C326R""",,1,"""C/R""","""P1""","""protein_coding""",1,"""CCDS11976.1""",1402,1402,976,976,"""Tgt/Cgt""","[""missense_variant""]",,"[(""Prints"",""PR01062""),(""PANTHER"",""PTHR22750""),(""PANTHER"",""PTHR22750"")]","""1/1""","""ENSG00000166603""",1,"""MC4R""","""HGNC""","""HGNC:6932""","""ENST00000299766.5:c.976T>C""","""ENSP00000299766.3:p.Cys326Arg""",,"""MODERATE""",,,,,,,"""benign""",0.062,326,326,"""ENSP00000299766""","""deleterious_low_confidence""",0.02,-1,,"""ENST00000299766""",,,,"""NM_005912.3""","""G"""
chr18:60371377,"[""G"",""A""]","""chr18_60371377_G_A""",35.0,,[2.00e-06],[35],,,1,False,"""L325F""",,1,"""L/F""","""P1""","""protein_coding""",1,"""CCDS11976.1""",1399,1399,973,973,"""Ctt/Ttt""","[""missense_variant""]",,"[(""Prints"",""PR01062""),(""PANTHER"",""PTHR22750""),(""PANTHER"",""PTHR22750"")]","""1/1""","""ENSG00000166603""",1,"""MC4R""","""HGNC""","""HGNC:6932""","""ENST00000299766.5:c.973C>T""","""ENSP00000299766.3:p.Leu325Phe""",,"""MODERATE""",,,,,,,"""benign""",0.0,325,325,"""ENSP00000299766""","""tolerated_low_confidence""",0.71,-1,,"""ENST00000299766""",,,,"""NM_005912.3""","""A"""
chr18:60371378,"[""G"",""A""]","""chr18_60371378_G_A""",49.0,,[1.47e-04],[49],,,1,False,"""G324G""",,1,"""G""","""P1""","""protein_coding""",1,"""CCDS11976.1""",1398,1398,972,972,"""ggC/ggT""","[""synonymous_variant""]",,"[(""Prints"",""PR01062""),(""PANTHER"",""PTHR22750""),(""PANTHER"",""PTHR22750"")]","""1/1""","""ENSG00000166603""",1,"""MC4R""","""HGNC""","""HGNC:6932""","""ENST00000299766.5:c.972C>T""","""ENSP00000299766.3:p.Gly324=""",,"""LOW""",,,,,,,,,324,324,"""ENSP00000299766""",,,-1,,"""ENST00000299766""",,,,"""NM_005912.3""","""A"""
chr18:60371382,"[""C"",""A""]","""chr18_60371382_C_A""",38.0,,[2.00e-06],[38],,,1,False,"""G323V""",,1,"""G/V""","""P1""","""protein_coding""",1,"""CCDS11976.1""",1394,1394,968,968,"""gGa/gTa""","[""missense_variant""]",,"[(""Prints"",""PR01062""),(""PANTHER"",""PTHR22750""),(""PANTHER"",""PTHR22750"")]","""1/1""","""ENSG00000166603""",1,"""MC4R""","""HGNC""","""HGNC:6932""","""ENST00000299766.5:c.968G>T""","""ENSP00000299766.3:p.Gly323Val""",,"""MODERATE""",,,,,,,"""benign""",0.0,323,323,"""ENSP00000299766""","""tolerated_low_confidence""",0.35,-1,,"""ENST00000299766""",,,,"""NM_005912.3""","""A"""
chr18:60371386,"[""G"",""C""]","""chr18_60371386_G_C""",42.0,,[5.00e-06],[42],,,1,False,"""L322V""",,1,"""L/V""","""P1""","""protein_coding""",1,"""CCDS11976.1""",1390,1390,964,964,"""Ctg/Gtg""","[""missense_variant""]",,"[(""Prints"",""PR01062""),(""PANTHER"",""PTHR22750""),(""PANTHER"",""PTHR22750"")]","""1/1""","""ENSG00000166603""",1,"""MC4R""","""HGNC""","""HGNC:6932""","""ENST00000299766.5:c.964C>G""","""ENSP00000299766.3:p.Leu322Val""",,"""MODERATE""",,,,,,,"""benign""",0.007,322,322,"""ENSP00000299766""","""tolerated_low_confidence""",0.18,-1,,"""ENST00000299766""",,,,"""NM_005912.3""","""C"""
