In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from scipy.stats import binom_test

pd.set_option('display.max_columns', None)

# GT:AD:DP:GQ:JL:JP:PGT:PID:PL:PP:PS
# Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|
# BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Ami
# no_acids|Codons|Existing_variation|DISTANCE|STRAND|FLAGS|VARIANT_CLASS|SYMBOL_S
# OURCE|HGNC_ID|CANONICAL|MANE_SELECT|MANE_PLUS_CLINICAL|TSL|APPRIS|CCDS|ENSP|SWI
# SSPROT|TREMBL|UNIPARC|UNIPROT_ISOFORM|GENE_PHENO|SIFT|PolyPhen|DOMAINS|miRNA|HG
# VS_OFFSET|AF|AFR_AF|AMR_AF|EAS_AF|EUR_AF|SAS_AF|AA_AF|EA_AF|gnomAD_AF|gnomAD_AF
# R_AF|gnomAD_AMR_AF|gnomAD_ASJ_AF|gnomAD_EAS_AF|gnomAD_FIN_AF|gnomAD_NFE_AF|gnom
# AD_OTH_AF|gnomAD_SAS_AF|MAX_AF|MAX_AF_POPS|CLIN_SIG|SOMATIC|PHENO|PUBMED|MOTIF_
# NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|TRANSCRIPTION_FACTORS|CADD_phred
# |MPC_rankscore|MPC_score|MetaLR_pred|MetaLR_rankscore|MetaLR_score|MetaSVM_pred
# |MetaSVM_rankscore|Polyphen2_HDIV_score|REVEL_rankscore|REVEL_score|SIFT_pred|g
# nomAD_exomes_AC|gnomAD_exomes_AF|gnomAD_exomes_AN



In [None]:
annotated_vcf = '../output/merged_vcfs/annovar/joint.multisplit.VQSR.CGP.ann.vep.DNM.spliceAI.annotated.hg38_multianno.txt.reorder.vcf'

# Loading in DeNovos

In [2]:
families = {}
sample_lines = {}
de_novo_info = []
vep_header = []
spliceAI_header = []
greenvaran_header = [
    'greendb_id',
    'greendb_stdtype',
    'greendb_dbsource',
    'greendb_genes',
    'greendb_constraint',
    'greendb_level',
    'greendb_more_support'
]

with open('../1.Processing/input/trios.ped') as ped:
    for line in ped:
        info = line[:-1].split('\t')
        if info[-1] == '2':
            proband = info[1]
            father = info[2]
            mother = info[3]
            families[proband] = [father,mother,info[0]]


with open(annotated_vcf) as vcf:
    for line in vcf:
        if line[0:2] != '##' or line[0:14] == '##INFO=<ID=CSQ' or line.startswith('##INFO=<ID=SpliceAI'):
            if line[0:14] == '##INFO=<ID=CSQ':
                vep_header=line.split(': ')[1][:-3].split('|')
            elif line.startswith('##INFO=<ID=SpliceAI'):
                spliceAI_header=line.split(': ')[-1][:-3].split('|')[2:]
            if line[0] == '#':
                head = line[:-1].split('\t')
                for i in np.arange(9,len(head)):
                    sample_lines[head[i]] = i
            else:
                record = line[:-1].split('\t')
                locus = record[0] + ':' + record[1]
                ref = record[3]
                alt = record[4]
                PASS = record[6]
                variant_splice_AI = ['','','','','','','','']
                green_db = []
                for i in record[7].split(';'):
                    if i.split('=')[0] == "MQ":
                        variant_MQ = i.split('=')[1]
                    elif i.split('=')[0] == 'CSQ':
                        variant_vep = i.split('=')[1].split(',')[0].split('|')
                    elif i.split('=')[0] == 'SpliceAI':
                        variant_splice_AI = i.split('=')[1].split(',')[0].split('|')[2:]
                    elif i.split('=')[0].startswith('greendb_'):
                        green_db.append(i.split('=')[1])
                if green_db == []:
                    green_db = ['','','','','','','']
                for i in record[7].split(';'):
                    if "ConfDeNovo" in i:
                        probands = i.split('=')[1].split(',')
                        conf = i.split('=')[0]
                        for proband in probands:
                            proband_info = record[sample_lines[proband]]
                            family = families[proband][2]
                            proband_DP = proband_info.split(':')[2]
                            proband_GQ = proband_info.split(':')[3]
                            father_info = record[sample_lines[families[proband][0]]]
                            mother_info = record[sample_lines[families[proband][1]]]
                            father_DP = father_info.split(':')[2]
                            mother_DP = mother_info.split(':')[2]
                            de_novo_info.append([locus,ref,alt,PASS,proband,family,conf,proband_info,father_info,mother_info,proband_GQ,proband_DP,father_DP,mother_DP,variant_MQ] + variant_vep + variant_splice_AI + green_db + [record[-1]])

In [3]:
df = pd.DataFrame(de_novo_info,columns=['locus','ref','alt','filter','proband','family','conf','proband_info','father_info','mother_info','proband_GQ','proband_DP','father_DP','mother_DP','variant_MQ'] + vep_header + spliceAI_header + greenvaran_header + ['gnomad_genomes_312_AF'])

In [4]:
convert_dict = {
    'locus': str,
    'ref': str,
    'alt': str,
    'filter': str,
    'proband': str,
    'family': str,
    'conf': str,
    'proband_info': str,
    'father_info': str,
    'mother_info': str,
    'proband_GQ': int,
    'proband_DP': int,
    'father_DP': int,
    'mother_DP': int,
    'variant_MQ': float,
    'gnomad_genomes_312_AF': float
}

for i in vep_header:
    convert_dict[i] = str

for i in spliceAI_header:
    convert_dict[i] = str
    
for i in greenvaran_header:
    convert_dict[i] = str

df['gnomad_genomes_312_AF'] = df['gnomad_genomes_312_AF'].replace('.','0')
df = df[df['filter'] == 'PASS']
df = df[df['family'] != '8463'] # remove because of problems with this family
df = df[df['father_DP'] != '.']
df = df[df['mother_DP'] != '.']
df = df[df['proband_DP'] != 0]
df = df[df['mother_DP'] != 0]
df = df[df['father_DP'] != 0]
df = df.astype(convert_dict)
# df = df[df['gnomad_genomes_312_AF'] <= 0.001]
df = df[df['conf'] == 'hiConfDeNovo']

In [5]:
df['DS_AG'] = df['DS_AG'].replace('',0).replace('.',0)
df['DS_AL'] = df['DS_AL'].replace('',0).replace('.',0)
df['DS_DG'] = df['DS_DG'].replace('',0).replace('.',0)
df['DS_DL'] = df['DS_DL'].replace('',0).replace('.',0)
df = df.astype({'DS_AG': float, 'DS_AL': float, 'DS_DG': float, 'DS_DL': float})
df['SpliceAI_max'] = df[['DS_AG','DS_AL','DS_DG','DS_DL']].max(axis=1).astype(float)

In [6]:
# binomial test?
# Binomial test was conducted for each de novo variant by utilizing read counts of total and alternative alleles to remove calls that deviate from theoretical heterozygous state and both of the parents having reference homozygous genotype (GT=0/0) from HaplotypeCaller. 
def binom_het_test(s):
    ref_count = int(s.split(':')[1].split(',')[0])
    alt_count = int(s.split(':')[1].split(',')[1])
    return binom_test(alt_count, alt_count + ref_count, p=0.5)

df['binom_p_val'] = df['proband_info'].apply(binom_het_test)

In [7]:
# df = df[df['binom_p_val'] > 0.01]

In [8]:
len(de_novo_info)
#120685

751845

In [9]:
def sort_Consequences(s):
    return '&'.join(sorted(s.split('&')))

df['Consequence'] = df['Consequence'].apply(sort_Consequences)

In [10]:
# df['MAX_AF'] = df['MAX_AF'].replace('',0)
# df = df.astype({'MAX_AF': float})
# df = df[df['MAX_AF'] <= 0.01]

In [11]:
df

Unnamed: 0,locus,ref,alt,filter,proband,family,conf,proband_info,father_info,mother_info,proband_GQ,proband_DP,father_DP,mother_DP,variant_MQ,Allele,Consequence,IMPACT,SYMBOL,Gene,Feature_type,Feature,BIOTYPE,EXON,INTRON,HGVSc,HGVSp,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,DISTANCE,STRAND,FLAGS,VARIANT_CLASS,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,TSL,APPRIS,CCDS,ENSP,SWISSPROT,TREMBL,UNIPARC,UNIPROT_ISOFORM,GENE_PHENO,SIFT,PolyPhen,DOMAINS,miRNA,HGVS_OFFSET,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,AA_AF,EA_AF,gnomAD_AF,gnomAD_AFR_AF,gnomAD_AMR_AF,gnomAD_ASJ_AF,gnomAD_EAS_AF,gnomAD_FIN_AF,gnomAD_NFE_AF,gnomAD_OTH_AF,gnomAD_SAS_AF,MAX_AF,MAX_AF_POPS,CLIN_SIG,SOMATIC,PHENO,PUBMED,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,TRANSCRIPTION_FACTORS,CADD_phred,DisGeNET,MPC,MTR,Mastermind,MetaSVM_pred,MetaSVM_rankscore,Phenotypes,Polyphen2_HDIV_score,SIFT_pred,SplicAI,SpliceRegion,gnomAD_exomes_AC,gnomAD_exomes_AF,gnomAD_exomes_AN,pLI,pLI_values,DS_AG,DS_AL,DS_DG,DS_DL,DP_AG,DP_AL,DP_DG,DP_DL,greendb_id,greendb_stdtype,greendb_dbsource,greendb_genes,greendb_constraint,greendb_level,greendb_more_support,gnomad_genomes_312_AF,SpliceAI_max,binom_p_val
48,chr1:28694,A,AGAAT,PASS,GLE_6560019403,8063,hiConfDeNovo,"0/1:16,4:20:99:-1:-1:120,0,855:100,0,894","0/0:1,0:1:20:.:.:0,0,0:0,20,44","0/0:5,0:5:33:-1:-1:0,15,174:0,33,218",99,20,1,5,27.00,GAAT,upstream_gene_variant,MODIFIER,MIR1302-2HG,ENSG00000243485,Transcript,ENST00000469289,lncRNA,,,,,,,,,,rs1157902025,1572,1,,insertion,HGNC,HGNC:52482,,,,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,,,,,32562_pro,promoter,EnsemblRegBuild,"MIR1302-2HG,OR4F5,WASH7P",1,0,0,,0.0,1.181793e-02
67,chr1:50299,CA,C,PASS,GLE_2932131647,8345,hiConfDeNovo,"0/1:1,2:3:51:-1:-1:72,0,31:51,0,68","0/0:6,0:6:31:-1:-1:0,12,180:0,31,226","0/0:0,0:0:21:.:.:0,0,0:0,21,46",51,3,6,0,29.84,-,upstream_gene_variant,MODIFIER,OR4G4P,ENSG00000268020,Transcript,ENST00000606857,unprocessed_pseudogene,,,,,,,,,,rs769584624,2173,1,,deletion,HGNC,HGNC:14822,YES,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0014,0.0,1.000000e+00
79,chr1:66244,ATTATATAATATATAATATAAATATAATATAAATTATAT,A,PASS,GLE_6577308379,8413,hiConfDeNovo,"0/1:2,4:6:99:-1:-1:.:.:162,0,72:143,0,102","0/0:1,0:1:20:.:.:.:.:0,0,0:0,20,45","0/0:1,0:1:21:-1:-1:.:.:0,3,14:0,21,59",99,6,1,1,56.85,-,downstream_gene_variant,MODIFIER,OR4G11P,ENSG00000240361,Transcript,ENST00000492842,transcribed_unprocessed_pseudogene,,,,,,,,,,rs1273585901,2358,1,,deletion,HGNC,HGNC:31276,YES,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0021,0.0,6.875000e-01
103,chr1:83886,A,G,PASS,GLE_6101591489,8088,hiConfDeNovo,"0/1:31,5:36:35:.:.:0|1:83886_A_G:59,0,1276:35,...","0/0:44,0:44:99:.:.:.:.:0,101,1196:0,125,1247","0/0:41,0:41:99:.:.:.:.:0,111,1653:0,135,1704",35,36,44,41,57.39,G,intergenic_variant,MODIFIER,,,,,,,,,,,,,,,rs1315695765,,,,SNV,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0006,0.0,1.291349e-05
104,chr1:83919,G,A,PASS,GLE_6101591489,8088,hiConfDeNovo,"0/1:36,10:46:99:.:.:.:.:306,0,1566:287,0,1590","0/0:44,0:44:99:.:.:.:.:0,101,1196:0,120,1239","0/0:50,0:50:99:.:.:.:.:0,117,1755:0,136,1798",99,46,44,50,57.17,A,intergenic_variant,MODIFIER,,,,,,,,,,,,,,,rs992179951,,,,SNV,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0056,0.0,1.564172e-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
751836,chrY:56884359,A,*,PASS,GLE_7679059484,8341,hiConfDeNovo,"0/1:26,17:45:99:59:3:.:.:442,0,1502:360,0,1590","0/0:20,0:23:25:59:3:0|1:56884359_A_C:66,126,96...","0/0:28,0:28:43:59:3:.:.:0,75,1125:0,43,1120",99,45,23,28,51.98,-,intergenic_variant,MODIFIER,,,,,,,,,,,,,,,,,,,deletion,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,,,,,1462664_enh,enhancer,HACER,"SPRY3,CTBP2P1",1,0,.,,0.0,2.220528e-01
751837,chrY:56885233,C,T,PASS,GLE_5568105601,6675,hiConfDeNovo,"0/1:87,14:101:54:116:56:134,0,2842:54,0,2927","0/0:190,0:190:80:116:56:0,120,1800:0,80,1785","0/0:103,0:103:80:116:56:0,120,1800:0,80,1785",54,101,190,103,50.00,T,intergenic_variant,MODIFIER,,,,,,,,,,,,,,,rs3866410,,,,SNV,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,,,,,"1462671_enh,1462664_enh",enhancer,HACER,"SPRY3,CTBP2P1",1,0,.,,0.0,4.798717e-14
751840,chrY:56886110,G,*,PASS,GLE_8438071884,6411,hiConfDeNovo,"0/1:9,3:12:97:-1:-1:0|1:56886087_C_T:121,0,366...","0/0:21,0:21:82:-1:-1:.:.:0,60,649:0,82,700","0/0:0,0:0:24:.:.:.:.:0,0,0:0,24,51",97,12,21,0,54.88,T,intergenic_variant,MODIFIER,,,,,,,,,,,,,,,rs3866410,,,,SNV,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,,,,,"1462671_enh,1462664_enh",enhancer,HACER,"SPRY3,CTBP2P1",1,0,.,,0.0,1.459961e-01
751841,chrY:56886112,T,*,PASS,GLE_8438071884,6411,hiConfDeNovo,"0/1:9,3:12:97:-1:-1:0|1:56886087_C_T:121,0,366...","0/0:21,0:21:82:-1:-1:.:.:0,60,649:0,82,700","0/0:0,0:0:24:.:.:.:.:0,0,0:0,24,51",97,12,21,0,55.02,T,intergenic_variant,MODIFIER,,,,,,,,,,,,,,,rs3866410,,,,SNV,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,,,,,"1462671_enh,1462664_enh",enhancer,HACER,"SPRY3,CTBP2P1",1,0,.,,0.0,1.459961e-01


# Comparing with Jiny's data

In [12]:
exonic = df[df['IMPACT'] != 'MODIFIER']

In [13]:
loci_list = []
with open('old_WES_damaing_locus.txt') as loci:
    for line in loci:
        loci_list.append(line[:-1])
        
exonic[exonic['locus'].isin(loci_list)]

Unnamed: 0,locus,ref,alt,filter,proband,family,conf,proband_info,father_info,mother_info,proband_GQ,proband_DP,father_DP,mother_DP,variant_MQ,Allele,Consequence,IMPACT,SYMBOL,Gene,Feature_type,Feature,BIOTYPE,EXON,INTRON,HGVSc,HGVSp,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,DISTANCE,STRAND,FLAGS,VARIANT_CLASS,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,TSL,APPRIS,CCDS,ENSP,SWISSPROT,TREMBL,UNIPARC,UNIPROT_ISOFORM,GENE_PHENO,SIFT,PolyPhen,DOMAINS,miRNA,HGVS_OFFSET,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,AA_AF,EA_AF,gnomAD_AF,gnomAD_AFR_AF,gnomAD_AMR_AF,gnomAD_ASJ_AF,gnomAD_EAS_AF,gnomAD_FIN_AF,gnomAD_NFE_AF,gnomAD_OTH_AF,gnomAD_SAS_AF,MAX_AF,MAX_AF_POPS,CLIN_SIG,SOMATIC,PHENO,PUBMED,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,TRANSCRIPTION_FACTORS,CADD_phred,DisGeNET,MPC,MTR,Mastermind,MetaSVM_pred,MetaSVM_rankscore,Phenotypes,Polyphen2_HDIV_score,SIFT_pred,SplicAI,SpliceRegion,gnomAD_exomes_AC,gnomAD_exomes_AF,gnomAD_exomes_AN,pLI,pLI_values,DS_AG,DS_AL,DS_DG,DS_DL,DP_AG,DP_AL,DP_DG,DP_DL,greendb_id,greendb_stdtype,greendb_dbsource,greendb_genes,greendb_constraint,greendb_level,greendb_more_support,gnomad_genomes_312_AF,SpliceAI_max,binom_p_val


In [14]:
WES_gene_list = []
with open('/projects/ps-gleesonlab8/User/hiyoothere/NTD/15.Final/221020_DNM_v9/DNM.final.PoiFil.den_input.NTD_gene_list') as WES_gene:
    for line in WES_gene:
        WES_gene_list.append(line[:-1])
        
gene_matched = exonic[exonic['SYMBOL'].isin(WES_gene_list)]
# gene_matched.to_csv('varinats_in_same_genes_as_WES_MM.tsv',sep='\t',index=False)

In [15]:
gene_matched

Unnamed: 0,locus,ref,alt,filter,proband,family,conf,proband_info,father_info,mother_info,proband_GQ,proband_DP,father_DP,mother_DP,variant_MQ,Allele,Consequence,IMPACT,SYMBOL,Gene,Feature_type,Feature,BIOTYPE,EXON,INTRON,HGVSc,HGVSp,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,DISTANCE,STRAND,FLAGS,VARIANT_CLASS,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,TSL,APPRIS,CCDS,ENSP,SWISSPROT,TREMBL,UNIPARC,UNIPROT_ISOFORM,GENE_PHENO,SIFT,PolyPhen,DOMAINS,miRNA,HGVS_OFFSET,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,AA_AF,EA_AF,gnomAD_AF,gnomAD_AFR_AF,gnomAD_AMR_AF,gnomAD_ASJ_AF,gnomAD_EAS_AF,gnomAD_FIN_AF,gnomAD_NFE_AF,gnomAD_OTH_AF,gnomAD_SAS_AF,MAX_AF,MAX_AF_POPS,CLIN_SIG,SOMATIC,PHENO,PUBMED,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,TRANSCRIPTION_FACTORS,CADD_phred,DisGeNET,MPC,MTR,Mastermind,MetaSVM_pred,MetaSVM_rankscore,Phenotypes,Polyphen2_HDIV_score,SIFT_pred,SplicAI,SpliceRegion,gnomAD_exomes_AC,gnomAD_exomes_AF,gnomAD_exomes_AN,pLI,pLI_values,DS_AG,DS_AL,DS_DG,DS_DL,DP_AG,DP_AL,DP_DG,DP_DL,greendb_id,greendb_stdtype,greendb_dbsource,greendb_genes,greendb_constraint,greendb_level,greendb_more_support,gnomad_genomes_312_AF,SpliceAI_max,binom_p_val
7386,chr1:23030585,T,A,PASS,GLE_2932131647,8345,hiConfDeNovo,"0/1:22,10:32:35:0:3:80,0,553:35,0,607","0/0:38,0:38:21:0:3:0,21,934:0,21,961","0/0:39,0:39:29:0:3:0,25,970:0,29,1001",35,32,38,39,60.0,A,synonymous_variant,LOW,KDM1A,ENSG00000004487,Transcript,ENST00000356634,protein_coding,2/19,,ENST00000356634.7:c.468T>A,ENSP00000349049.3:p.Pro156%3D,617,468,156,P,ccT/ccA,rs1569640363,,1,,SNV,HGNC,HGNC:29079,,,,1.0,,CCDS30627.1,ENSP00000349049,O60341.213,,UPI000020466D,O60341-1,1.0,,,PDB-ENSP_mappings:2v1d.A&PDB-ENSP_mappings:2x0...,,,,,,,,,,,,,,,,,,,,,,likely_benign,,1.0,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,2,5,49,28,"27972_pro,71793_enh","enhancer,promoter","JungEtAl2019,ENCODE-HMM","KDM1A,MIR3115",0.467773,3.0,0.0,0.0,0.0,0.050102
7396,chr1:23077336,C,T,PASS,GLE_1567252581,8078,hiConfDeNovo,"0/1:23,11:34:99:81:21:280,0,697:195,0,785","0/0:37,0:37:65:81:21:0,100,1252:0,65,1244","0/0:32,0:32:47:81:21:0,82,1013:0,47,1005",99,34,37,32,60.0,T,stop_gained,HIGH,KDM1A,ENSG00000004487,Transcript,ENST00000356634,protein_coding,14/19,,ENST00000356634.7:c.1771C>T,ENSP00000349049.3:p.Arg591Ter,1920,1771,591,R/*,Cga/Tga,COSV63088329,,1,,SNV,HGNC,HGNC:29079,,,,1.0,,CCDS30627.1,ENSP00000349049,O60341.213,,UPI000020466D,O60341-1,1.0,,,PDB-ENSP_mappings:2dw4.A&PDB-ENSP_mappings:2ej...,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,,,,,,38.0,invalid_field,invalid_field,invalid_field,invalid_field,,,invalid_field,.&.&.&.,.&.&.&.,invalid_field,invalid_field,,,,invalid_field,invalid_field,0.0,0.0,0.08,0.01,25,-5,46,24,72086_enh,enhancer,JungEtAl2019,"KDM1A,AL031428.1",0.72106,4.0,0.0,0.0,0.08,0.057613
255889,chr6:33670584,G,A,PASS,GLE_7507632649,8336,hiConfDeNovo,"0/1:16,8:24:99:77:18:196,0,454:111,0,542","0/0:29,0:29:43:77:18:0,78,1170:0,43,1162","0/0:40,0:40:79:77:18:0,114,1710:0,79,1702",99,24,29,40,60.0,A,intron_variant&splice_region_variant,LOW,ITPR3,ENSG00000096433,Transcript,ENST00000374316,protein_coding,,20/58,ENST00000374316.9:c.2441+8G>A,,,,,,,,,1,,SNV,HGNC,HGNC:6182,,,,5.0,P1,CCDS4783.1,ENSP00000363435,Q14573.204,,UPI000013CB74,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.9,0.0,1,26,1,-11,1174714_enh,enhancer,BENGI,"MIR3934,ITPR3",0.913404,2.0,0.0,0.0,0.9,0.15159
542191,chr14:24099236,G,GATCCTGGGCGTCACGTCGCCCAAGGGCCGCAAGTACCAC,PASS,GLE_8240152843,8252,hiConfDeNovo,"0/1:47,15:62:99:110:50:0|1:24099224_A_C:527,0,...","0/0:82,0:82:82:110:50:.:.:0,120,1800:0,82,1787","0/0:55,0:55:73:110:50:.:.:0,111,1665:0,73,1652",99,62,82,55,59.45,ATCCTGGGCGTCACGTCGCCCAAGGGCCGCAAGTACCAC,inframe_insertion&splice_region_variant,MODERATE,PCK2,ENSG00000100889,Transcript,ENST00000216780,protein_coding,,,ENST00000216780.9:c.852_852+1insATCCTGGGCGTCAC...,ENSP00000216780.4:p.Gly287_Ile288insValThrSerP...,947-948,852-853,284-285,-/ILGVTSPKGRKYH,-/ATCCTGGGCGTCACGTCGCCCAAGGGCCGCAAGTACCAC,,,1,,insertion,HGNC,HGNC:8725,YES,NM_004563.4,,1.0,P1,CCDS9609.1,ENSP00000216780,Q16822.207,A0A384MTT2.16,UPI0000169DF3,Q16822-1,1.0,,,AFDB-ENSP_mappings:AF-Q16822-F1.A,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.06,0.53,-23,11,11,0,"394051_enh,394052_enh",enhancer,"JungEtAl2019,BENGI",NRL,0.843415,4.0,0.0,0.0,0.53,5.8e-05
545505,chr14:59464460,G,A,PASS,GLE_7269173106,7662,hiConfDeNovo,"0/1:27,11:38:99:83:23:248,0,804:163,0,892","0/0:40,0:40:64:83:23:0,99,1296:0,64,1288","0/0:29,0:29:49:83:23:0,84,1260:0,49,1252",99,38,40,29,60.0,A,missense_variant,MODERATE,GPR135,ENSG00000181619,Transcript,ENST00000395116,protein_coding,1/1,,ENST00000395116.2:c.767C>T,ENSP00000378548.1:p.Ala256Val,921,767,256,A/V,gCg/gTg,,,-1,,SNV,HGNC,HGNC:19991,YES,NM_022571.6,,,P1,CCDS9738.1,ENSP00000378548,Q8IZ08.138,,UPI0000046D5B,,,tolerated(0.22),benign(0.079),AFDB-ENSP_mappings:AF-Q8IZ08-F1.A,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,15.35,invalid_field,invalid_field,invalid_field,invalid_field,T,0.37324,invalid_field,0.727,T,invalid_field,invalid_field,,,,invalid_field,invalid_field,0.0,0.0,0.0,0.0,-5,0,2,-50,"409182_enh,409181_enh,583790_pro","enhancer,promoter","DECRES,FOCS,SegWey,ENCODE-HMM,BENGI,EnsemblReg...",GPR135,0.536923,3.0,0.0,0.0,0.0,0.013853
592100,chr17:4885900,C,T,PASS,GLE_2223281625,8546,hiConfDeNovo,"0/1:20,20:40:99:96:36:535,0,553:450,0,641","0/0:35,0:35:65:96:36:0,100,1177:0,65,1169","0/0:36,0:36:64:96:36:0,99,1131:0,64,1123",99,40,35,36,60.0,T,intron_variant&splice_polypyrimidine_tract_var...,LOW,MINK1,ENSG00000141503,Transcript,ENST00000347992,protein_coding,,7/31,ENST00000347992.11:c.640-11C>T,,,,,,,,,1,,SNV,HGNC,HGNC:17565,,,,1.0,A1,CCDS45589.1,ENSP00000269296,Q8N4C8.188,,UPI00000411AB,Q8N4C8-3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.12,0.0,0.0,13,30,21,30,,,,,,,,0.0,0.12,1.0


In [16]:
gene_matched[['locus','ref','alt','proband','family','Consequence','IMPACT','SYMBOL']]

Unnamed: 0,locus,ref,alt,proband,family,Consequence,IMPACT,SYMBOL
7386,chr1:23030585,T,A,GLE_2932131647,8345,synonymous_variant,LOW,KDM1A
7396,chr1:23077336,C,T,GLE_1567252581,8078,stop_gained,HIGH,KDM1A
255889,chr6:33670584,G,A,GLE_7507632649,8336,intron_variant&splice_region_variant,LOW,ITPR3
542191,chr14:24099236,G,GATCCTGGGCGTCACGTCGCCCAAGGGCCGCAAGTACCAC,GLE_8240152843,8252,inframe_insertion&splice_region_variant,MODERATE,PCK2
545505,chr14:59464460,G,A,GLE_7269173106,7662,missense_variant,MODERATE,GPR135
592100,chr17:4885900,C,T,GLE_2223281625,8546,intron_variant&splice_polypyrimidine_tract_var...,LOW,MINK1


# IMPACT HIGH

In [17]:
impact_high = df[df['IMPACT'] == 'HIGH']

In [18]:
impact_high.head()

Unnamed: 0,locus,ref,alt,filter,proband,family,conf,proband_info,father_info,mother_info,proband_GQ,proband_DP,father_DP,mother_DP,variant_MQ,Allele,Consequence,IMPACT,SYMBOL,Gene,Feature_type,Feature,BIOTYPE,EXON,INTRON,HGVSc,HGVSp,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,DISTANCE,STRAND,FLAGS,VARIANT_CLASS,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,TSL,APPRIS,CCDS,ENSP,SWISSPROT,TREMBL,UNIPARC,UNIPROT_ISOFORM,GENE_PHENO,SIFT,PolyPhen,DOMAINS,miRNA,HGVS_OFFSET,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,AA_AF,EA_AF,gnomAD_AF,gnomAD_AFR_AF,gnomAD_AMR_AF,gnomAD_ASJ_AF,gnomAD_EAS_AF,gnomAD_FIN_AF,gnomAD_NFE_AF,gnomAD_OTH_AF,gnomAD_SAS_AF,MAX_AF,MAX_AF_POPS,CLIN_SIG,SOMATIC,PHENO,PUBMED,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,TRANSCRIPTION_FACTORS,CADD_phred,DisGeNET,MPC,MTR,Mastermind,MetaSVM_pred,MetaSVM_rankscore,Phenotypes,Polyphen2_HDIV_score,SIFT_pred,SplicAI,SpliceRegion,gnomAD_exomes_AC,gnomAD_exomes_AF,gnomAD_exomes_AN,pLI,pLI_values,DS_AG,DS_AL,DS_DG,DS_DL,DP_AG,DP_AL,DP_DG,DP_DL,greendb_id,greendb_stdtype,greendb_dbsource,greendb_genes,greendb_constraint,greendb_level,greendb_more_support,gnomad_genomes_312_AF,SpliceAI_max,binom_p_val
7396,chr1:23077336,C,T,PASS,GLE_1567252581,8078,hiConfDeNovo,"0/1:23,11:34:99:81:21:280,0,697:195,0,785","0/0:37,0:37:65:81:21:0,100,1252:0,65,1244","0/0:32,0:32:47:81:21:0,82,1013:0,47,1005",99,34,37,32,60.0,T,stop_gained,HIGH,KDM1A,ENSG00000004487,Transcript,ENST00000356634,protein_coding,14/19,,ENST00000356634.7:c.1771C>T,ENSP00000349049.3:p.Arg591Ter,1920,1771,591,R/*,Cga/Tga,COSV63088329,,1,,SNV,HGNC,HGNC:29079,,,,1,,CCDS30627.1,ENSP00000349049,O60341.213,,UPI000020466D,O60341-1,1,,,PDB-ENSP_mappings:2dw4.A&PDB-ENSP_mappings:2ej...,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,,,,,,38.0,invalid_field,invalid_field,invalid_field,invalid_field,,,invalid_field,.&.&.&.,.&.&.&.,invalid_field,invalid_field,,,,invalid_field,invalid_field,0.0,0.0,0.08,0.01,25,-5,46,24,72086_enh,enhancer,JungEtAl2019,"KDM1A,AL031428.1",0.72106,4.0,0.0,0.0,0.08,0.05761267
91255,chr1:201899847,ATG,A,PASS,GLE_2069350888,8334,hiConfDeNovo,"0/1:33,6:39:99:102:42:0|1:201899839_G_T:192,0,...","0/0:42,0:42:72:102:42:.:.:0,107,1513:0,72,1505","0/0:38,0:38:70:102:42:.:.:0,105,1351:0,70,1343",99,39,42,38,60.0,-,frameshift_variant,HIGH,LMOD1,ENSG00000163431,Transcript,ENST00000367288,protein_coding,2/3,,ENST00000367288.5:c.1164_1165del,ENSP00000356257.4:p.Ile389HisfsTer66,1372-1373,1164-1165,388-389,TI/TX,acCAtc/actc,,,-1,,deletion,HGNC,HGNC:6647,YES,NM_012134.3,,1,P1,CCDS53457.1,ENSP00000356257,P29536.168,,UPI00003665F4,P29536-1,1,,,PDB-ENSP_mappings:4z79.A&PDB-ENSP_mappings:4z8...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,13,-36,-36,-41,49275_enh,enhancer,JungEtAl2019,"AL513217.1,LMOD1",0.0929413,3.0,0.0,0.0,0.0,1.429926e-05
91256,chr1:201899850,G,GCA,PASS,GLE_2069350888,8334,hiConfDeNovo,"0/1:32,6:38:99:102:42:0|1:201899839_G_T:192,0,...","0/0:42,0:42:72:102:42:.:.:0,107,1513:0,72,1505","0/0:38,0:38:70:102:42:.:.:0,105,1351:0,70,1343",99,38,42,38,60.0,CA,frameshift_variant,HIGH,LMOD1,ENSG00000163431,Transcript,ENST00000367288,protein_coding,2/3,,ENST00000367288.5:c.1162_1163insTG,ENSP00000356257.4:p.Thr388MetfsTer31,1370-1371,1162-1163,388,T/MX,acc/aTGcc,,,-1,,insertion,HGNC,HGNC:6647,YES,NM_012134.3,,1,P1,CCDS53457.1,ENSP00000356257,P29536.168,,UPI00003665F4,P29536-1,1,,,PDB-ENSP_mappings:4z79.A&PDB-ENSP_mappings:4z8...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,10,-39,-38,-44,49275_enh,enhancer,JungEtAl2019,"AL513217.1,LMOD1",0.0929413,3.0,0.0,0.0,0.0,2.434256e-05
150240,chr3:10342946,A,ACCGTGAGCGGCGAATCTGTGCCCATGAAGGCACCGGCCAACACAA...,PASS,GLE_4949935840,8299,hiConfDeNovo,"0/1:39,5:44:99:61:3:0|1:10342936_G_A:1423,0,63...","0/0:55,0:55:26:61:3:.:.:0,61,1643:0,26,1635","0/0:41,0:59:99:61:3:0|1:10342944_G_GCGT:598,75...",99,44,55,59,59.99,CCGTGAGCGGCGAATCTGTGCCCATGAAGGCACCGGCCAACACAAT...,frameshift_variant&stop_gained,HIGH,ATP2B2,ENSG00000157087,Transcript,ENST00000352432,protein_coding,17/23,,ENST00000352432.9:c.2689_2690insCTCGCTCTACCAAA...,ENSP00000324172.6:p.Val897AlafsTer15,2689-2690,2689-2690,897,V/ARSTKTSSVSSSSK*R*T*PLASLCWPVPSWAQIRRSRX,gtg/gCTCGCTCTACCAAAACATCCAGCGTTTCATCCTCTTCCAAA...,,,-1,,insertion,HGNC,HGNC:815,,,,1,,,ENSP00000324172,,A0A2U3TZI3.19,UPI000D1955CC,,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.29,0.15,0.0,0.0,0,19,0,4,854554_enh,enhancer,JungEtAl2019,"ATP2B2,MIR378B",0.726492,4.0,0.0,0.0,0.29,1.405162e-07
154089,chr3:53747333,GT,G,PASS,GLE_4931115797,8077,hiConfDeNovo,"0/1:20,26:46:99:98:38:0|1:53747333_GT_G:1016,0...","0/0:47,0:47:85:98:38:.:.:0,120,1800:0,85,1792","0/0:34,0:34:64:98:38:.:.:0,99,1201:0,64,1193",99,46,47,34,60.0,-,frameshift_variant,HIGH,CACNA1D,ENSG00000157388,Transcript,ENST00000288139,protein_coding,27/49,,ENST00000288139.11:c.3261del,ENSP00000288139.3:p.Asp1088ThrfsTer24,3816,3260,1087,V/X,gTt/gt,,,1,,deletion,HGNC,HGNC:1391,,,NM_000720.4,1,P2,CCDS2872.1,ENSP00000288139,Q01668.209,,UPI000005031A,Q01668-2,1,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-31,-9,14,39,,,,,,,,0.0,0.0,0.4613912


In [19]:
len(impact_high)
#34

81

In [20]:
# impact_high.to_csv('impact_high_all_genes.tsv',sep='\t',index=False)

# metaSVM Deleterious missense mutations

In [21]:
MetaSVM_pred_D = df[df['MetaSVM_pred'] == 'D']

In [22]:
MetaSVM_pred_D.head()

Unnamed: 0,locus,ref,alt,filter,proband,family,conf,proband_info,father_info,mother_info,proband_GQ,proband_DP,father_DP,mother_DP,variant_MQ,Allele,Consequence,IMPACT,SYMBOL,Gene,Feature_type,Feature,BIOTYPE,EXON,INTRON,HGVSc,HGVSp,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,DISTANCE,STRAND,FLAGS,VARIANT_CLASS,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,TSL,APPRIS,CCDS,ENSP,SWISSPROT,TREMBL,UNIPARC,UNIPROT_ISOFORM,GENE_PHENO,SIFT,PolyPhen,DOMAINS,miRNA,HGVS_OFFSET,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,AA_AF,EA_AF,gnomAD_AF,gnomAD_AFR_AF,gnomAD_AMR_AF,gnomAD_ASJ_AF,gnomAD_EAS_AF,gnomAD_FIN_AF,gnomAD_NFE_AF,gnomAD_OTH_AF,gnomAD_SAS_AF,MAX_AF,MAX_AF_POPS,CLIN_SIG,SOMATIC,PHENO,PUBMED,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,TRANSCRIPTION_FACTORS,CADD_phred,DisGeNET,MPC,MTR,Mastermind,MetaSVM_pred,MetaSVM_rankscore,Phenotypes,Polyphen2_HDIV_score,SIFT_pred,SplicAI,SpliceRegion,gnomAD_exomes_AC,gnomAD_exomes_AF,gnomAD_exomes_AN,pLI,pLI_values,DS_AG,DS_AL,DS_DG,DS_DL,DP_AG,DP_AL,DP_DG,DP_DL,greendb_id,greendb_stdtype,greendb_dbsource,greendb_genes,greendb_constraint,greendb_level,greendb_more_support,gnomad_genomes_312_AF,SpliceAI_max,binom_p_val
10205,chr1:47416556,G,T,PASS,GLE_2816585663,5651,hiConfDeNovo,"0/1:16,19:35:99:72:13:525,0,451:440,0,539","0/0:36,0:36:38:72:13:0,73,1088:0,38,1080","0/0:41,0:41:74:72:13:0,109,1316:0,74,1308",99,35,36,41,60.0,T,missense_variant,MODERATE,FOXE3,ENSG00000186790,Transcript,ENST00000335071,protein_coding,1/1,,ENST00000335071.4:c.241G>T,ENSP00000334472.2:p.Ala81Ser,272,241,81,A/S,Gcc/Tcc,,,1,,SNV,HGNC,HGNC:3808,YES,NM_012186.3,,,P1,CCDS550.1,ENSP00000334472,Q13461.175,A0A0A1EII5.48,UPI000012ADD3,,1.0,tolerated(0.16),possibly_damaging(0.703),AFDB-ENSP_mappings:AF-Q13461-F1.A,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,15.1,invalid_field,invalid_field,invalid_field,invalid_field,D,0.87335,invalid_field,0.885,T,invalid_field,invalid_field,,,,invalid_field,invalid_field,0.0,0.0,0.0,0.0,7.0,-36.0,-17.0,0.0,"562852_pro,38851_pro,660_biv","bivalent,promoter","DECRES,SegWey,ENCODE-HMM,BENGI,EnsemblRegBuild",FOXE3,0.777246,4,0,0.0,0.0,0.735879
86341,chr1:151752930,TTTTTCTTTTC,*,PASS,GLE_3601683474,8067,hiConfDeNovo,"0/1:13,13:26:99:-1:-1:.:.:482,0,491:462,0,576","0/0:42,0:42:99:-1:-1:.:.:0,105,1287:0,123,1332","0/0:46,0:46:20:.:.:.:.:0,0,0:0,20,45",99,26,42,46,60.0,A,missense_variant,MODERATE,SNX27,ENSG00000143376,Transcript,ENST00000368838,protein_coding,8/10,,ENST00000368838.2:c.842T>A,ENSP00000357831.2:p.Leu281His,841,842,281,L/H,cTc/cAc,,,1,cds_start_NF,SNV,HGNC,HGNC:20073,,,,1.0,,,ENSP00000357831,,A0A5H1ZRP6.8,UPI000D18ED09,,,deleterious(0),probably_damaging(0.97),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,28.1,invalid_field,invalid_field,invalid_field,invalid_field,D,0.96707,invalid_field,.&1.0&0.999,.&D&D,invalid_field,invalid_field,,,,invalid_field,invalid_field,0.0,0.0,0.0,0.0,,,,,18188_enh,enhancer,JungEtAl2019,"MRPL9,RNU6-662P",0.718017,1,0,0.0,0.0,1.0
91257,chr1:201899851,T,C,PASS,GLE_2069350888,8334,hiConfDeNovo,"0/1:33,7:40:99:102:42:0|1:201899839_G_T:195,0,...","0/0:42,0:42:72:102:42:.:.:0,107,1513:0,72,1505","0/0:38,0:38:70:102:42:.:.:0,105,1351:0,70,1343",99,40,42,38,60.0,C,missense_variant,MODERATE,LMOD1,ENSG00000163431,Transcript,ENST00000367288,protein_coding,2/3,,ENST00000367288.5:c.1162A>G,ENSP00000356257.4:p.Thr388Ala,1370,1162,388,T/A,Acc/Gcc,,,-1,,SNV,HGNC,HGNC:6647,YES,NM_012134.3,,1.0,P1,CCDS53457.1,ENSP00000356257,P29536.168,,UPI00003665F4,P29536-1,1.0,tolerated(0.06),benign(0.358),PDB-ENSP_mappings:4z79.A&PDB-ENSP_mappings:4z8...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,22.9,invalid_field,invalid_field,invalid_field,invalid_field,D,0.89357,invalid_field,0.836,T,invalid_field,invalid_field,,,,invalid_field,invalid_field,0.0,0.0,0.0,0.0,-40.0,9.0,1.0,-39.0,49275_enh,enhancer,JungEtAl2019,"AL513217.1,LMOD1",0.0929413,3,0,0.0,0.0,4.2e-05
94023,chr1:230779122,G,T,PASS,GLE_4487097024,8346,hiConfDeNovo,"0/1:12,16:28:99:89:29:452,0,268:367,0,356","0/0:36,0:36:65:89:29:0,100,1170:0,65,1162","0/0:33,0:33:55:89:29:0,90,1107:0,55,1099",99,28,36,33,60.0,T,missense_variant,MODERATE,CAPN9,ENSG00000135773,Transcript,ENST00000271971,protein_coding,9/20,,ENST00000271971.7:c.1103G>T,ENSP00000271971.2:p.Arg368Leu,1212,1103,368,R/L,cGc/cTc,rs144137595&COSV99680503,,1,,SNV,HGNC,HGNC:1486,YES,NM_006615.3,,1.0,P1,CCDS1586.1,ENSP00000271971,O14815.179,,UPI000006E882,O14815-1,,deleterious(0.02),probably_damaging(0.999),AFDB-ENSP_mappings:AF-O14815-F1.A,,,,,,,,,0.0,0.0004651,0.0001956,0.0001853,0.0002315,0.0,0.0,4.631e-05,0.0003274,0.0,0.0,0.0004651,EA,,0&1,0&1,,,,,,,32.0,invalid_field,invalid_field,invalid_field,invalid_field,D,0.93601,invalid_field,0.999&0.999&1.0,D&D&D,invalid_field,invalid_field,49.0,0.0001956447,250454.0,invalid_field,invalid_field,0.0,0.0,0.0,0.04,11.0,-37.0,-29.0,15.0,69856_enh,enhancer,BENGI,"CAPN9,AL512328.1",0.691448,1,0,0.0002,0.04,0.571588
146751,chr2:232791057,C,T,PASS,GLE_8240152843,8252,hiConfDeNovo,"0/1:26,19:45:99:86:26:486,0,702:401,0,790","0/0:35,0:35:64:86:26:0,99,1147:0,64,1139","0/0:33,0:33:52:86:26:0,87,1305:0,52,1297",99,45,35,33,60.0,T,missense_variant,MODERATE,GIGYF2,ENSG00000204120,Transcript,ENST00000373563,protein_coding,11/29,,ENST00000373563.9:c.980C>T,ENSP00000362664.5:p.Pro327Leu,1151,980,327,P/L,cCt/cTt,,,1,,SNV,HGNC,HGNC:11960,YES,NM_001103146.3,,1.0,P4,CCDS33401.1,ENSP00000362664,Q6Y7W6.155,,UPI00001BD8AE,Q6Y7W6-1,1.0,deleterious(0.04),possibly_damaging(0.725),AFDB-ENSP_mappings:AF-Q6Y7W6-F1.A,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,25.7,invalid_field,invalid_field,invalid_field,invalid_field,D,0.8602,invalid_field,.&0.983&.&0.983&.&.&.&.&.&.,.&D&D&D&D&D&D&D&D&D,invalid_field,invalid_field,,,,invalid_field,invalid_field,0.0,0.0,0.0,0.0,-45.0,-40.0,35.0,31.0,720718_enh,enhancer,JungEtAl2019,"RNU6-107P,GIGYF2",0.533037,1,0,0.0,0.0,0.371298


In [23]:
len(MetaSVM_pred_D)
#29

29

# PolyPhen probably damaging missense mutations

In [24]:
PolyPhen_damaging = df[df['PolyPhen'].str.contains("probably_damaging")]

In [25]:
PolyPhen_damaging.head()

Unnamed: 0,locus,ref,alt,filter,proband,family,conf,proband_info,father_info,mother_info,proband_GQ,proband_DP,father_DP,mother_DP,variant_MQ,Allele,Consequence,IMPACT,SYMBOL,Gene,Feature_type,Feature,BIOTYPE,EXON,INTRON,HGVSc,HGVSp,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,DISTANCE,STRAND,FLAGS,VARIANT_CLASS,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,TSL,APPRIS,CCDS,ENSP,SWISSPROT,TREMBL,UNIPARC,UNIPROT_ISOFORM,GENE_PHENO,SIFT,PolyPhen,DOMAINS,miRNA,HGVS_OFFSET,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,AA_AF,EA_AF,gnomAD_AF,gnomAD_AFR_AF,gnomAD_AMR_AF,gnomAD_ASJ_AF,gnomAD_EAS_AF,gnomAD_FIN_AF,gnomAD_NFE_AF,gnomAD_OTH_AF,gnomAD_SAS_AF,MAX_AF,MAX_AF_POPS,CLIN_SIG,SOMATIC,PHENO,PUBMED,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,TRANSCRIPTION_FACTORS,CADD_phred,DisGeNET,MPC,MTR,Mastermind,MetaSVM_pred,MetaSVM_rankscore,Phenotypes,Polyphen2_HDIV_score,SIFT_pred,SplicAI,SpliceRegion,gnomAD_exomes_AC,gnomAD_exomes_AF,gnomAD_exomes_AN,pLI,pLI_values,DS_AG,DS_AL,DS_DG,DS_DL,DP_AG,DP_AL,DP_DG,DP_DL,greendb_id,greendb_stdtype,greendb_dbsource,greendb_genes,greendb_constraint,greendb_level,greendb_more_support,gnomad_genomes_312_AF,SpliceAI_max,binom_p_val
6641,chr1:16948842,C,T,PASS,GLE_6729788170,8366,hiConfDeNovo,"0/1:52,13:65:78:59:3:157,0,1397:78,0,1483","0/0:71,0:71:22:59:3:0,60,2020:0,22,2008","0/0:36,0:36:65:59:3:0,100,1157:0,65,1148",78,65,71,36,56.86,T,missense_variant,MODERATE,CROCC,ENSG00000058453,Transcript,ENST00000375541,protein_coding,19/37,,ENST00000375541.10:c.2752C>T,ENSP00000364691.4:p.Arg918Trp,2840,2752,918,R/W,Cgg/Tgg,rs143866013&COSV65010102,,1,,SNV,HGNC,HGNC:21299,YES,NM_014675.5,,5,P1,CCDS30616.1,ENSP00000364691,Q5TZA2.144,,UPI000042B0BB,,,deleterious(0),probably_damaging(0.915),AFDB-ENSP_mappings:AF-Q5TZA2-F1.A,,,,,,,,,,,0.03498,0.00477,0.05406,0.014,0.0008806,0.05556,0.04305,0.02226,0.02572,0.05556,gnomAD_FIN,,0&1,0&1,,,,,,,26.6,invalid_field,invalid_field,invalid_field,invalid_field,T,0.52106,invalid_field,,D,invalid_field,invalid_field,,,,invalid_field,invalid_field,0.0,0.04,0.0,0.0,15.0,-43.0,-7.0,20.0,"15187_pro,1941947_enh","enhancer,promoter","DECRES,SegWey,ENCODE-HMM,BENGI,FulcoEtAl2019",CROCC,3.0,0,0.0007,,0.04,1e-06
86341,chr1:151752930,TTTTTCTTTTC,*,PASS,GLE_3601683474,8067,hiConfDeNovo,"0/1:13,13:26:99:-1:-1:.:.:482,0,491:462,0,576","0/0:42,0:42:99:-1:-1:.:.:0,105,1287:0,123,1332","0/0:46,0:46:20:.:.:.:.:0,0,0:0,20,45",99,26,42,46,60.0,A,missense_variant,MODERATE,SNX27,ENSG00000143376,Transcript,ENST00000368838,protein_coding,8/10,,ENST00000368838.2:c.842T>A,ENSP00000357831.2:p.Leu281His,841,842,281,L/H,cTc/cAc,,,1,cds_start_NF,SNV,HGNC,HGNC:20073,,,,1,,,ENSP00000357831,,A0A5H1ZRP6.8,UPI000D18ED09,,,deleterious(0),probably_damaging(0.97),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,28.1,invalid_field,invalid_field,invalid_field,invalid_field,D,0.96707,invalid_field,.&1.0&0.999,.&D&D,invalid_field,invalid_field,,,,invalid_field,invalid_field,0.0,0.0,0.0,0.0,,,,,18188_enh,enhancer,JungEtAl2019,"MRPL9,RNU6-662P",0.718017,1,0.0,0.0,0.0,1.0
91254,chr1:201899846,G,C,PASS,GLE_2069350888,8334,hiConfDeNovo,"0/1:32,6:38:68:102:42:0|1:201899839_G_T:153,0,...","0/0:42,0:42:72:102:42:.:.:0,107,1513:0,72,1505","0/0:38,0:38:70:102:42:.:.:0,105,1351:0,70,1343",68,38,42,38,60.0,C,missense_variant,MODERATE,LMOD1,ENSG00000163431,Transcript,ENST00000367288,protein_coding,2/3,,ENST00000367288.5:c.1167C>G,ENSP00000356257.4:p.Ile389Met,1375,1167,389,I/M,atC/atG,,,-1,,SNV,HGNC,HGNC:6647,YES,NM_012134.3,,1,P1,CCDS53457.1,ENSP00000356257,P29536.168,,UPI00003665F4,P29536-1,1.0,deleterious(0.01),probably_damaging(0.94),PDB-ENSP_mappings:4z79.A&PDB-ENSP_mappings:4z8...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,24.8,invalid_field,invalid_field,invalid_field,invalid_field,T,0.05582,invalid_field,0.998,D,invalid_field,invalid_field,,,,invalid_field,invalid_field,0.0,0.0,0.0,0.0,-41.0,-35.0,-40.0,-34.0,49275_enh,enhancer,JungEtAl2019,"AL513217.1,LMOD1",0.0929413,3,0.0,0.0,0.0,2.4e-05
94023,chr1:230779122,G,T,PASS,GLE_4487097024,8346,hiConfDeNovo,"0/1:12,16:28:99:89:29:452,0,268:367,0,356","0/0:36,0:36:65:89:29:0,100,1170:0,65,1162","0/0:33,0:33:55:89:29:0,90,1107:0,55,1099",99,28,36,33,60.0,T,missense_variant,MODERATE,CAPN9,ENSG00000135773,Transcript,ENST00000271971,protein_coding,9/20,,ENST00000271971.7:c.1103G>T,ENSP00000271971.2:p.Arg368Leu,1212,1103,368,R/L,cGc/cTc,rs144137595&COSV99680503,,1,,SNV,HGNC,HGNC:1486,YES,NM_006615.3,,1,P1,CCDS1586.1,ENSP00000271971,O14815.179,,UPI000006E882,O14815-1,,deleterious(0.02),probably_damaging(0.999),AFDB-ENSP_mappings:AF-O14815-F1.A,,,,,,,,,0.0,0.0004651,0.0001956,0.0001853,0.0002315,0.0,0.0,4.631e-05,0.0003274,0.0,0.0,0.0004651,EA,,0&1,0&1,,,,,,,32.0,invalid_field,invalid_field,invalid_field,invalid_field,D,0.93601,invalid_field,0.999&0.999&1.0,D&D&D,invalid_field,invalid_field,49.0,0.0001956447,250454.0,invalid_field,invalid_field,0.0,0.0,0.0,0.04,11.0,-37.0,-29.0,15.0,69856_enh,enhancer,BENGI,"CAPN9,AL512328.1",0.691448,1,0.0,0.0002,0.04,0.571588
95128,chr1:240329355,T,C,PASS,GLE_2671540368,8121,hiConfDeNovo,"0/1:20,13:33:99:92:32:329,0,522:244,0,610","0/0:34,0:34:58:92:32:0,93,1395:0,58,1387","0/0:38,0:38:64:92:32:0,99,1266:0,64,1258",99,33,34,38,60.0,C,missense_variant,MODERATE,FMN2,ENSG00000155816,Transcript,ENST00000319653,protein_coding,10/18,,ENST00000319653.14:c.4324T>C,ENSP00000318884.9:p.Ser1442Pro,4551,4324,1442,S/P,Tca/Cca,COSV60430245,,1,,SNV,HGNC,HGNC:14074,YES,NM_020066.5,,5,P1,CCDS31069.2,ENSP00000318884,Q9NZ56.169,,UPI00015FA087,Q9NZ56-1,1.0,deleterious(0),probably_damaging(0.996),AFDB-ENSP_mappings:AF-Q9NZ56-F1.A,,,,,,,,,,,,,,,,,,,,,,,1,1,,,,,,,29.4,invalid_field,invalid_field,invalid_field,invalid_field,T,0.56375,invalid_field,1.0&.,D&D,invalid_field,invalid_field,,,,invalid_field,invalid_field,0.0,0.16,0.0,0.0,-16.0,22.0,-4.0,22.0,"29415_pro,76124_enh","enhancer,promoter","ENCODE-HMM,BENGI",FMN2,0.525843,1,0.0,0.0,0.16,0.296206


In [26]:
len(PolyPhen_damaging)
#62

67

# SIFT

In [27]:
SIFT_pred_D = df[df['SIFT'].str.contains("deleterious\(")]

In [28]:
SIFT_pred_D

Unnamed: 0,locus,ref,alt,filter,proband,family,conf,proband_info,father_info,mother_info,proband_GQ,proband_DP,father_DP,mother_DP,variant_MQ,Allele,Consequence,IMPACT,SYMBOL,Gene,Feature_type,Feature,BIOTYPE,EXON,INTRON,HGVSc,HGVSp,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,DISTANCE,STRAND,FLAGS,VARIANT_CLASS,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,TSL,APPRIS,CCDS,ENSP,SWISSPROT,TREMBL,UNIPARC,UNIPROT_ISOFORM,GENE_PHENO,SIFT,PolyPhen,DOMAINS,miRNA,HGVS_OFFSET,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,AA_AF,EA_AF,gnomAD_AF,gnomAD_AFR_AF,gnomAD_AMR_AF,gnomAD_ASJ_AF,gnomAD_EAS_AF,gnomAD_FIN_AF,gnomAD_NFE_AF,gnomAD_OTH_AF,gnomAD_SAS_AF,MAX_AF,MAX_AF_POPS,CLIN_SIG,SOMATIC,PHENO,PUBMED,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,TRANSCRIPTION_FACTORS,CADD_phred,DisGeNET,MPC,MTR,Mastermind,MetaSVM_pred,MetaSVM_rankscore,Phenotypes,Polyphen2_HDIV_score,SIFT_pred,SplicAI,SpliceRegion,gnomAD_exomes_AC,gnomAD_exomes_AF,gnomAD_exomes_AN,pLI,pLI_values,DS_AG,DS_AL,DS_DG,DS_DL,DP_AG,DP_AL,DP_DG,DP_DL,greendb_id,greendb_stdtype,greendb_dbsource,greendb_genes,greendb_constraint,greendb_level,greendb_more_support,gnomad_genomes_312_AF,SpliceAI_max,binom_p_val
6641,chr1:16948842,C,T,PASS,GLE_6729788170,8366,hiConfDeNovo,"0/1:52,13:65:78:59:3:157,0,1397:78,0,1483","0/0:71,0:71:22:59:3:0,60,2020:0,22,2008","0/0:36,0:36:65:59:3:0,100,1157:0,65,1148",78,65,71,36,56.86,T,missense_variant,MODERATE,CROCC,ENSG00000058453,Transcript,ENST00000375541,protein_coding,19/37,,ENST00000375541.10:c.2752C>T,ENSP00000364691.4:p.Arg918Trp,2840,2752,918,R/W,Cgg/Tgg,rs143866013&COSV65010102,,1,,SNV,HGNC,HGNC:21299,YES,NM_014675.5,,5,P1,CCDS30616.1,ENSP00000364691,Q5TZA2.144,,UPI000042B0BB,,,deleterious(0),probably_damaging(0.915),AFDB-ENSP_mappings:AF-Q5TZA2-F1.A,,,,,,,,,,,0.03498,0.00477,0.05406,0.014,0.0008806,0.05556,0.04305,0.02226,0.02572,0.05556,gnomAD_FIN,,0&1,0&1,,,,,,,26.6,invalid_field,invalid_field,invalid_field,invalid_field,T,0.52106,invalid_field,,D,invalid_field,invalid_field,,,,invalid_field,invalid_field,0.00,0.04,0.00,0.0,15,-43,-7,20,"15187_pro,1941947_enh","enhancer,promoter","DECRES,SegWey,ENCODE-HMM,BENGI,FulcoEtAl2019",CROCC,3,0,0.0007,,0.04,0.000001
9398,chr1:40515533,G,A,PASS,GLE_5214625480,8461,hiConfDeNovo,"0/1:15,8:23:99:77:18:209,0,442:124,0,530","0/0:37,0:37:65:77:18:0,100,1192:0,65,1184","0/0:28,0:28:43:77:18:0,78,1170:0,43,1162",99,23,37,28,60.00,A,missense_variant,MODERATE,EXO5,ENSG00000164002,Transcript,ENST00000296380,protein_coding,3/3,,ENST00000296380.9:c.989G>A,ENSP00000296380.4:p.Arg330Gln,1194,989,330,R/Q,cGa/cAa,rs768138919,,1,,SNV,HGNC,HGNC:26115,,,,2,P1,CCDS453.1,ENSP00000296380,Q9H790.126,,UPI00000722CF,,,deleterious(0.03),benign(0.241),PDB-ENSP_mappings:7lw7.A&PDB-ENSP_mappings:7lw...,,,,,,,,,,,7.965e-06,6.158e-05,0,0,0,0,8.799e-06,0,0,6.158e-05,gnomAD_AFR,,,,,,,,,,23.1,invalid_field,invalid_field,invalid_field,invalid_field,T,0.73873,invalid_field,0.881&0.881&0.881,D&D&D,invalid_field,invalid_field,2,7.964891e-06,251102,invalid_field,invalid_field,0.00,0.00,0.00,0.0,3,11,-47,-49,,,,,,,,0.000013,0.00,0.210040
9735,chr1:43598027,C,G,PASS,GLE_6560019403,8063,hiConfDeNovo,"0/1:76,58:134:99:100:40:1588,0,2204:1503,0,2292","0/0:44,0:44:70:100:40:0,105,1454:0,70,1446","0/0:38,0:38:67:100:40:0,102,1261:0,67,1253",99,134,44,38,60.00,G,missense_variant,MODERATE,PTPRF,ENSG00000142949,Transcript,ENST00000359947,protein_coding,12/34,,ENST00000359947.9:c.2093C>G,ENSP00000353030.4:p.Pro698Arg,2426,2093,698,P/R,cCg/cGg,,,1,,SNV,HGNC,HGNC:9670,YES,NM_002840.5,,1,A1,CCDS489.2,ENSP00000353030,P10586.237,,UPI0000470154,P10586-1,1,deleterious(0.01),benign(0.401),PDB-ENSP_mappings:2edx.A&PDB-ENSP_mappings:4n5...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,24.0,invalid_field,invalid_field,invalid_field,invalid_field,T,0.80774,invalid_field,0.756&0.852,D&D,invalid_field,invalid_field,,,,invalid_field,invalid_field,0.00,0.00,0.00,0.0,-27,-2,-50,7,,,,,,,,0.000000,0.00,0.141662
86341,chr1:151752930,TTTTTCTTTTC,*,PASS,GLE_3601683474,8067,hiConfDeNovo,"0/1:13,13:26:99:-1:-1:.:.:482,0,491:462,0,576","0/0:42,0:42:99:-1:-1:.:.:0,105,1287:0,123,1332","0/0:46,0:46:20:.:.:.:.:0,0,0:0,20,45",99,26,42,46,60.00,A,missense_variant,MODERATE,SNX27,ENSG00000143376,Transcript,ENST00000368838,protein_coding,8/10,,ENST00000368838.2:c.842T>A,ENSP00000357831.2:p.Leu281His,841,842,281,L/H,cTc/cAc,,,1,cds_start_NF,SNV,HGNC,HGNC:20073,,,,1,,,ENSP00000357831,,A0A5H1ZRP6.8,UPI000D18ED09,,,deleterious(0),probably_damaging(0.97),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,28.1,invalid_field,invalid_field,invalid_field,invalid_field,D,0.96707,invalid_field,.&1.0&0.999,.&D&D,invalid_field,invalid_field,,,,invalid_field,invalid_field,0.00,0.00,0.00,0.0,,,,,18188_enh,enhancer,JungEtAl2019,"MRPL9,RNU6-662P",0.718017,1,0,0.000000,0.00,1.000000
90886,chr1:198529169,C,T,PASS,GLE_2790909549,8402,hiConfDeNovo,"0/1:16,11:27:99:-1:-1:302,0,405:282,0,490","0/0:0,0:0:20:.:.:0,0,0:0,20,45","0/0:27,0:27:99:-1:-1:0,81,848:0,99,893",99,27,0,27,60.00,T,missense_variant,MODERATE,ATP6V1G3,ENSG00000151418,Transcript,ENST00000281087,protein_coding,3/4,,ENST00000281087.6:c.95G>A,ENSP00000281087.2:p.Arg32Gln,201,95,32,R/Q,cGa/cAa,rs74134730&COSV55278319,,-1,,SNV,HGNC,HGNC:18265,,,,5,P4,CCDS1395.1,ENSP00000281087,Q96LB4.153,,UPI00001380C6,Q96LB4-1,,deleterious(0.02),benign(0.117),AFDB-ENSP_mappings:AF-Q96LB4-F1.A,,,0.0052,0.0182,0.0014,0.001,0,0,0.02066,0,0.001272,0.01865,0.0005679,0,0.0009583,0,1.152e-05,0.0002764,0,0.02066,AA,,0&1,0&1,,,,,,,19.36,invalid_field,invalid_field,invalid_field,invalid_field,T,0.15130,invalid_field,0.59&0.59&0.681,D&D&D,invalid_field,invalid_field,221,1.272455e-03,173680,invalid_field,invalid_field,0.00,0.00,0.00,0.0,1,6,12,-28,"46677_enh,46674_enh",enhancer,"DECRES,BENGI","AL450352.1,ATP6V1G3",0.194725,1,0,0.006000,0.00,0.442068
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
705415,chr22:22721088,A,G,PASS,GLE_3395739852,7657,hiConfDeNovo,"0/1:22,13:35:99:86:26:347,0,619:262,0,707","0/0:31,0:31:52:86:26:0,87,1305:0,52,1297","0/0:44,0:44:62:86:26:0,97,1400:0,62,1392",99,35,31,44,60.00,G,missense_variant,MODERATE,IGLV3-19,ENSG00000211663,Transcript,ENST00000390309,IG_V_gene,2/2,,ENST00000390309.2:c.280A>G,ENSP00000374844.2:p.Thr94Ala,320,280,94,T/A,Act/Gct,,,1,cds_end_NF,SNV,HGNC,HGNC:5903,YES,,,,P1,,ENSP00000374844,P01714.122,,UPI000173A2CD,,,deleterious(0.05),benign(0.159),PDB-ENSP_mappings:6z1i.A&PDB-ENSP_mappings:6z1...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.00,0.00,0.00,0.0,-23,12,36,23,"309265_pro,1980050_enh,1977830_enh,832043_enh,...","enhancer,promoter","DECRES,JungEtAl2019,SegWey,ENCODE-HMM,BENGI,HACER","GGTLC2,IGLV3-19",0.274938,3,0,0.000000,0.00,0.175465
718360,chrX:49030303,T,A,PASS,GLE_6317135658,8088,hiConfDeNovo,"0/1:20,3:23:44:.:.:0|1:49030297_T_A:66,0,1402:...","0/0:22,0:22:22:.:.:.:.:0,0,449:0,22,497","0/0:29,2:31:42:.:.:0|1:49030284_T_A:0,20,1902:...",44,23,22,31,60.00,A,missense_variant,MODERATE,TFE3,ENSG00000068323,Transcript,ENST00000315869,protein_coding,10/10,,ENST00000315869.8:c.1583A>T,ENSP00000314129.7:p.Glu528Val,1714,1583,528,E/V,gAg/gTg,,,-1,,SNV,HGNC,HGNC:11752,YES,NM_006521.6,,1,P1,CCDS14315.3,ENSP00000314129,P19532.214,A0A024QZ23.59,UPI0000117AE8,P19532-1,1,deleterious(0),probably_damaging(0.95),AFDB-ENSP_mappings:AF-P19532-F1.A,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,26.7,invalid_field,invalid_field,invalid_field,invalid_field,T,0.74483,invalid_field,0.999,D,invalid_field,invalid_field,,,,invalid_field,invalid_field,0.00,0.00,0.01,0.0,2,-2,2,-10,1450729_enh,enhancer,JungEtAl2019,TFE3,0.989763,4,0,0.000000,0.01,0.000488
718361,chrX:49030306,T,A,PASS,GLE_6317135658,8088,hiConfDeNovo,"0/1:20,3:23:44:.:.:0|1:49030297_T_A:66,0,1402:...","0/0:20,0:20:24:.:.:.:.:0,2,459:0,24,507","0/0:40,0:40:22:.:.:.:.:0,0,830:0,22,878",44,23,20,40,60.00,A,missense_variant,MODERATE,TFE3,ENSG00000068323,Transcript,ENST00000315869,protein_coding,10/10,,ENST00000315869.8:c.1580A>T,ENSP00000314129.7:p.Glu527Val,1711,1580,527,E/V,gAg/gTg,,,-1,,SNV,HGNC,HGNC:11752,YES,NM_006521.6,,1,P1,CCDS14315.3,ENSP00000314129,P19532.214,A0A024QZ23.59,UPI0000117AE8,P19532-1,1,deleterious(0.01),possibly_damaging(0.82),AFDB-ENSP_mappings:AF-P19532-F1.A,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,26.3,invalid_field,invalid_field,invalid_field,invalid_field,T,0.74492,invalid_field,0.951,D,invalid_field,invalid_field,,,,invalid_field,invalid_field,0.00,0.00,0.00,0.0,2,-2,2,-13,1450729_enh,enhancer,JungEtAl2019,TFE3,0.989763,4,0,0.000000,0.00,0.000488
718382,chrX:49228345,T,C,PASS,GLE_5702196366,8332,hiConfDeNovo,"0/1:17,21:38:99:74:15:561,0,452:476,0,540","0/0:26,0:26:40:74:15:0,75,933:0,40,925","0/0:40,0:40:65:74:15:0,100,1218:0,65,1210",99,38,26,40,60.00,C,missense_variant,MODERATE,CACNA1F,ENSG00000102001,Transcript,ENST00000323022,protein_coding,7/48,,ENST00000323022.10:c.920A>G,ENSP00000321618.6:p.Asn307Ser,951,920,307,N/S,aAt/aGt,rs782180501,,-1,,SNV,HGNC,HGNC:1393,YES,NM_001256789.3,,1,A2,CCDS59167.1,ENSP00000321618,O60840.207,,UPI0000127279,O60840-2,1,deleterious(0.02),probably_damaging(0.994),,,,,,,,,,,,5.525e-05,0,0,0,0.0001456,0,7.454e-05,0,0.0001065,0.0001456,gnomAD_EAS,,,,,,,,,,24.8,invalid_field,invalid_field,invalid_field,invalid_field,D,0.99623,invalid_field,.&.&0.999,D&D&D,invalid_field,invalid_field,10,5.525472e-05,180980,invalid_field,invalid_field,0.03,0.01,0.00,0.0,-48,6,-1,-7,,,,,,,,0.000018,0.03,0.627103


In [29]:
len(SIFT_pred_D)
#81

108

In [30]:
len(SIFT_pred_D[(~SIFT_pred_D['PolyPhen'].str.contains("probably_damaging")) & ~(SIFT_pred_D['MetaSVM_pred'] == 'D') & ~(SIFT_pred_D['IMPACT'] == 'HIGH')])
#21

58

# SpliceAI

In [31]:
splice_AI = df[df['DS_AG'] != ''].astype({'DS_AG': float, 'DS_AL': float, 'DS_DG': float, 'DS_DL': float})
splice_AI.head()

Unnamed: 0,locus,ref,alt,filter,proband,family,conf,proband_info,father_info,mother_info,proband_GQ,proband_DP,father_DP,mother_DP,variant_MQ,Allele,Consequence,IMPACT,SYMBOL,Gene,Feature_type,Feature,BIOTYPE,EXON,INTRON,HGVSc,HGVSp,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,DISTANCE,STRAND,FLAGS,VARIANT_CLASS,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,TSL,APPRIS,CCDS,ENSP,SWISSPROT,TREMBL,UNIPARC,UNIPROT_ISOFORM,GENE_PHENO,SIFT,PolyPhen,DOMAINS,miRNA,HGVS_OFFSET,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,AA_AF,EA_AF,gnomAD_AF,gnomAD_AFR_AF,gnomAD_AMR_AF,gnomAD_ASJ_AF,gnomAD_EAS_AF,gnomAD_FIN_AF,gnomAD_NFE_AF,gnomAD_OTH_AF,gnomAD_SAS_AF,MAX_AF,MAX_AF_POPS,CLIN_SIG,SOMATIC,PHENO,PUBMED,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,TRANSCRIPTION_FACTORS,CADD_phred,DisGeNET,MPC,MTR,Mastermind,MetaSVM_pred,MetaSVM_rankscore,Phenotypes,Polyphen2_HDIV_score,SIFT_pred,SplicAI,SpliceRegion,gnomAD_exomes_AC,gnomAD_exomes_AF,gnomAD_exomes_AN,pLI,pLI_values,DS_AG,DS_AL,DS_DG,DS_DL,DP_AG,DP_AL,DP_DG,DP_DL,greendb_id,greendb_stdtype,greendb_dbsource,greendb_genes,greendb_constraint,greendb_level,greendb_more_support,gnomad_genomes_312_AF,SpliceAI_max,binom_p_val
48,chr1:28694,A,AGAAT,PASS,GLE_6560019403,8063,hiConfDeNovo,"0/1:16,4:20:99:-1:-1:120,0,855:100,0,894","0/0:1,0:1:20:.:.:0,0,0:0,20,44","0/0:5,0:5:33:-1:-1:0,15,174:0,33,218",99,20,1,5,27.0,GAAT,upstream_gene_variant,MODIFIER,MIR1302-2HG,ENSG00000243485,Transcript,ENST00000469289,lncRNA,,,,,,,,,,rs1157902025,1572.0,1.0,,insertion,HGNC,HGNC:52482,,,,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,,,,,32562_pro,promoter,EnsemblRegBuild,"MIR1302-2HG,OR4F5,WASH7P",1.0,0.0,0.0,,0.0,0.011818
67,chr1:50299,CA,C,PASS,GLE_2932131647,8345,hiConfDeNovo,"0/1:1,2:3:51:-1:-1:72,0,31:51,0,68","0/0:6,0:6:31:-1:-1:0,12,180:0,31,226","0/0:0,0:0:21:.:.:0,0,0:0,21,46",51,3,6,0,29.84,-,upstream_gene_variant,MODIFIER,OR4G4P,ENSG00000268020,Transcript,ENST00000606857,unprocessed_pseudogene,,,,,,,,,,rs769584624,2173.0,1.0,,deletion,HGNC,HGNC:14822,YES,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0014,0.0,1.0
79,chr1:66244,ATTATATAATATATAATATAAATATAATATAAATTATAT,A,PASS,GLE_6577308379,8413,hiConfDeNovo,"0/1:2,4:6:99:-1:-1:.:.:162,0,72:143,0,102","0/0:1,0:1:20:.:.:.:.:0,0,0:0,20,45","0/0:1,0:1:21:-1:-1:.:.:0,3,14:0,21,59",99,6,1,1,56.85,-,downstream_gene_variant,MODIFIER,OR4G11P,ENSG00000240361,Transcript,ENST00000492842,transcribed_unprocessed_pseudogene,,,,,,,,,,rs1273585901,2358.0,1.0,,deletion,HGNC,HGNC:31276,YES,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0021,0.0,0.6875
103,chr1:83886,A,G,PASS,GLE_6101591489,8088,hiConfDeNovo,"0/1:31,5:36:35:.:.:0|1:83886_A_G:59,0,1276:35,...","0/0:44,0:44:99:.:.:.:.:0,101,1196:0,125,1247","0/0:41,0:41:99:.:.:.:.:0,111,1653:0,135,1704",35,36,44,41,57.39,G,intergenic_variant,MODIFIER,,,,,,,,,,,,,,,rs1315695765,,,,SNV,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0006,0.0,1.3e-05
104,chr1:83919,G,A,PASS,GLE_6101591489,8088,hiConfDeNovo,"0/1:36,10:46:99:.:.:.:.:306,0,1566:287,0,1590","0/0:44,0:44:99:.:.:.:.:0,101,1196:0,120,1239","0/0:50,0:50:99:.:.:.:.:0,117,1755:0,136,1798",99,46,44,50,57.17,A,intergenic_variant,MODIFIER,,,,,,,,,,,,,,,rs992179951,,,,SNV,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0056,0.0,0.000156


In [32]:
splice_AI['SpliceAI_max'] = splice_AI[['DS_AG','DS_AL','DS_DG','DS_DL']].max(axis=1).astype(float)
splice_AI_high = splice_AI[splice_AI['SpliceAI_max'] > 0.5].sort_values(by='SpliceAI_max',ascending=False)

In [33]:
len(splice_AI_high)
#32

45

In [34]:
splice_AI_high

Unnamed: 0,locus,ref,alt,filter,proband,family,conf,proband_info,father_info,mother_info,proband_GQ,proband_DP,father_DP,mother_DP,variant_MQ,Allele,Consequence,IMPACT,SYMBOL,Gene,Feature_type,Feature,BIOTYPE,EXON,INTRON,HGVSc,HGVSp,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,DISTANCE,STRAND,FLAGS,VARIANT_CLASS,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,TSL,APPRIS,CCDS,ENSP,SWISSPROT,TREMBL,UNIPARC,UNIPROT_ISOFORM,GENE_PHENO,SIFT,PolyPhen,DOMAINS,miRNA,HGVS_OFFSET,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,AA_AF,EA_AF,gnomAD_AF,gnomAD_AFR_AF,gnomAD_AMR_AF,gnomAD_ASJ_AF,gnomAD_EAS_AF,gnomAD_FIN_AF,gnomAD_NFE_AF,gnomAD_OTH_AF,gnomAD_SAS_AF,MAX_AF,MAX_AF_POPS,CLIN_SIG,SOMATIC,PHENO,PUBMED,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,TRANSCRIPTION_FACTORS,CADD_phred,DisGeNET,MPC,MTR,Mastermind,MetaSVM_pred,MetaSVM_rankscore,Phenotypes,Polyphen2_HDIV_score,SIFT_pred,SplicAI,SpliceRegion,gnomAD_exomes_AC,gnomAD_exomes_AF,gnomAD_exomes_AN,pLI,pLI_values,DS_AG,DS_AL,DS_DG,DS_DL,DP_AG,DP_AL,DP_DG,DP_DL,greendb_id,greendb_stdtype,greendb_dbsource,greendb_genes,greendb_constraint,greendb_level,greendb_more_support,gnomad_genomes_312_AF,SpliceAI_max,binom_p_val
539000,chr13:113117585,TCGCGGTGCTGGGTGGGTACCACTCTCCCCTGTCCGAC,T,PASS,GLE_1293525316,8313,hiConfDeNovo,"0/1:20,10:30:99:98:38:0|1:113117585_TCGCGGTGCT...","0/0:41,0:41:65:98:38:.:.:0,105,1401:0,65,1386","0/0:39,0:39:59:98:38:.:.:0,99,1336:0,59,1321",99,30,41,39,59.71,-,coding_sequence_variant&intron_variant&splice_...,HIGH,F7,ENSG00000057593,Transcript,ENST00000346342,protein_coding,7/8,7/7,ENST00000346342.8:c.739+7_739+43del,,780-?,729-?,243-?,,,rs1309684522,,1,,deletion,HGNC,HGNC:3544,YES,NM_019616.4,,1,P2,CCDS9529.1,ENSP00000329546,P08709.270,,UPI000002A952,P08709-2,1.0,,,,,,,,,,,,,,0.0004705,6.158e-05,0.0003763,0.0007959,5.438e-05,0.000232,0.0007415,0.00049,9.801e-05,0.0007959,gnomAD_ASJ,uncertain_significance,,1,31064749.0,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.96,1.0,-16,-23,48,11,,,,,,,,0.0026,1.0,0.09873715
370230,chr9:136671015,G,A,PASS,GLE_6317135658,8088,hiConfDeNovo,"0/1:13,2:15:23:.:.:0|1:136671015_G_A:45,0,606:...","0/0:31,0:31:86:.:.:.:.:0,64,908:0,86,956","0/0:39,0:39:75:.:.:.:.:0,53,1106:0,75,1154",23,15,31,39,59.84,A,splice_donor_variant,HIGH,EGFL7,ENSG00000172889,Transcript,ENST00000308874,protein_coding,,9/10,ENST00000308874.12:c.636+1G>A,,,,,,,rs746089480&COSV58247378,,1,,SNV,HGNC,HGNC:20594,YES,NM_016215.5,,1,P1,CCDS7002.1,ENSP00000307843,Q9UHF1.167,A0A024R8F5.60,UPI0000036A42,,,,,,,,,,,,,,,,2.614e-05,0.0,4.057e-05,0.0,9.006e-05,0.0,1.689e-05,0.0,4.405e-05,9.006e-05,gnomAD_EAS,,0&1,0&1,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.05,1.0,-19,-45,10,-1,1385799_enh,enhancer,DECRES,"MIR126,EGFL7",0.0501356,3.0,0.0,0.0004,1.0,0.007385254
490526,chr12:103984922,T,A,PASS,GLE_7507632649,8336,hiConfDeNovo,"0/1:31,3:34:44:0:1:0|1:103984920_G_GAGGATGCAAA...","0/0:30,0:30:20:0:1:.:.:0,55,923:0,20,913","0/0:19,0:22:23:0:1:0|1:103984920_G_GA:69,126,1...",44,34,30,22,59.13,A,splice_donor_variant,HIGH,TDG,ENSG00000139372,Transcript,ENST00000266775,protein_coding,,9/10,ENST00000266775.13:c.952+2T>A,,,,,,,rs760400700&COSV99904221,,1,,SNV,HGNC,HGNC:11700,,,,1,A2,,ENSP00000266775,,G8JL98.68,UPI000204AD36,,,,,,,,,,,,,,,,,,,,,,,,,,,,0&1,0&1,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.51,1.0,-36,-11,-36,-2,271847_enh,enhancer,JungEtAl2019,TDG,0.833483,1.0,0.0,0.0,1.0,7.660128e-07
333943,chr9:2073624,G,T,PASS,GLE_3222964762,7660,hiConfDeNovo,"0/1:19,21:40:99:98:38:610,0,515:525,0,603","0/0:36,0:36:70:98:38:0,105,1226:0,70,1218","0/0:36,0:36:64:98:38:0,99,1236:0,64,1228",99,40,36,36,60.0,T,splice_donor_variant,HIGH,SMARCA2,ENSG00000080503,Transcript,ENST00000349721,protein_coding,,12/33,ENST00000349721.8:c.1935+1G>T,,,,,,,,,1,,SNV,HGNC,HGNC:11098,YES,NM_003070.5,,5,P3,CCDS34977.1,ENSP00000265773,P51531.219,,UPI00001AE8EB,P51531-1,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.82,1.0,40,-50,3,-1,1391298_enh,enhancer,JungEtAl2019,"SMARCA2,AL359076.1",0.127022,1.0,0.0,0.0,1.0,0.8746293
591879,chr17:3755115,A,ATCAGGTGGCCCCGCCCTCATCAGGTGGCCCCGCCCTCATCAGGTGGC,PASS,GLE_5702196366,8332,hiConfDeNovo,"0/1:10,4:14:52:116:56:.:.:137,0,1141:52,0,1229","0/0:82,0:82:85:116:56:.:.:0,120,1800:0,85,1792","0/0:62,0:62:85:116:56:.:.:0,120,1800:0,85,1792",52,14,82,62,59.7,TCAGGTGGCCCCGCCCTCATCAGGTGGCCCCGCCCTCATCAGGTGGC,splice_donor_variant,HIGH,ITGAE,ENSG00000083457,Transcript,ENST00000263087,protein_coding,,12/30,ENST00000263087.9:c.1384+1_1384+2insGCCACCTGAT...,,,,,,,COSV53992822&COSV99561834,,-1,,insertion,HGNC,HGNC:6147,YES,NM_002208.5,,1,P1,CCDS32531.1,ENSP00000263087,P38570.190,,UPI000049DE2D,,,,,,,,,,,,,,,,,,,,,,,,,,,,1&1,1&1,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.42,0.99,16,50,16,2,"1811407_enh,1578573_enh,192945_pro,192951_pro,...","enhancer,promoter","DECRES,SegWey,BENGI,EnsemblRegBuild,FulcoEtAl2019","AC116914.1,ITGAE",0.502477,3.0,0.0,0.0002,0.99,0.1795654
411573,chr11:17527221,A,C,PASS,GLE_8399408151,8065,hiConfDeNovo,"0/1:34,8:42:99:1:2:0|1:17527208_T_TCCC:152,0,8...","0/0:50,0:50:22:1:2:.:.:0,0,1225:0,22,1274","0/0:32,0:32:22:1:2:.:.:0,0,675:0,22,724",99,42,50,32,59.74,C,splice_donor_variant,HIGH,USH1C,ENSG00000006611,Transcript,ENST00000005226,protein_coding,,5/26,ENST00000005226.12:c.496+2T>G,,,,,,,rs1449256750,,-1,,SNV,HGNC,HGNC:12597,YES,NM_153676.4,,5,,CCDS7825.1,ENSP00000005226,Q9Y6N9.191,,UPI00001D965A,Q9Y6N9-5,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.79,0.99,21,50,9,2,"225420_enh,225422_enh",enhancer,"JungEtAl2019,BENGI",USH1C,0.796057,4.0,0.0,0.0002,0.99,6.877111e-05
367392,chr9:121165101,G,A,PASS,GLE_3872672670,7557,hiConfDeNovo,"0/1:22,12:34:99:97:37:315,0,622:230,0,710","0/0:35,0:35:64:97:37:0,99,1232:0,64,1224","0/0:36,0:36:67:97:37:0,102,1191:0,67,1183",99,34,35,36,60.0,A,non_coding_transcript_variant&splice_donor_var...,HIGH,CNTRL,ENSG00000119397,Transcript,ENST00000373845,retained_intron,,10/18,ENST00000373845.6:n.2060+1G>A,,,,,,,,,1,,SNV,HGNC,HGNC:1858,,,,1,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.68,0.99,25,-21,23,-1,1372383_enh,enhancer,JungEtAl2019,CNTRL,0.868688,1.0,0.0,0.0,0.99,0.1214495
139038,chr2:148233419,A,G,PASS,GLE_3946338448,7478,hiConfDeNovo,"0/1:17,22:39:99:96:36:628,0,399:543,0,487","0/0:36,0:36:66:96:36:0,101,1227:0,66,1219","0/0:35,0:35:64:96:36:0,99,1221:0,64,1213",99,39,36,35,60.0,G,intron_variant,MODIFIER,MBD5,ENSG00000204406,Transcript,ENST00000407073,protein_coding,,4/14,ENST00000407073.5:c.-680+24A>G,,,,,,,,,1,,SNV,HGNC,HGNC:20444,,,,1,,CCDS33302.1,ENSP00000386049,Q9P267.159,,UPI0000208C40,Q9P267-1,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.98,0.43,-1,-20,-1,-24,,,,,,,,0.0,0.98,0.5223974
373089,chr10:6478991,C,G,PASS,GLE_4931115797,8077,hiConfDeNovo,"0/1:30,22:52:99:101:41:534,0,832:449,0,920","0/0:48,0:48:78:101:41:0,113,1637:0,78,1629","0/0:37,0:37:67:101:41:0,102,1286:0,67,1278",99,52,48,37,60.0,G,splice_donor_variant,HIGH,PRKCQ,ENSG00000065675,Transcript,ENST00000263125,protein_coding,,12/17,ENST00000263125.10:c.1353+1G>C,,,,,,,,,-1,,SNV,HGNC,HGNC:9410,YES,NM_006257.5,,1,P1,CCDS7079.1,ENSP00000263125,Q04759.226,,UPI000012DF74,Q04759-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.01,0.0,0.0,0.98,6,1,18,1,177910_enh,enhancer,BENGI,PRKCQ,0.703937,4.0,0.0,0.0,0.98,0.3317498
679658,chr20:32210193,G,A,PASS,GLE_7269173106,7662,hiConfDeNovo,"0/1:37,11:48:99:63:5:238,0,1042:154,0,1130","0/0:30,0:30:56:63:5:0,90,1023:0,56,1016","0/0:33,0:33:29:63:5:0,64,1048:0,29,1040",99,48,30,33,60.0,A,upstream_gene_variant,MODIFIER,PLAGL2,ENSG00000126003,Transcript,ENST00000246229,protein_coding,,,,,,,,,,rs553681367,2450.0,-1,,SNV,HGNC,HGNC:9047,YES,NM_002657.3,,1,P1,CCDS13197.1,ENSP00000246229,Q9UPG8.179,,UPI0000001C1D,,,,,,,,0.0002,0.0,0.0014,0.0,0.0,0.0,,,,,,,,,,,,0.0014,AMR,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.97,45,-30,-32,-1,"538765_pro,1627949_enh","enhancer,promoter","DECRES,SegWey,ENCODE-HMM,EnsemblRegBuild",PLAGL2,0.995633,4.0,0.0,7e-06,0.97,0.000222245


# GREEN-VARAN level 4 regulatory mutations

In [35]:
greenvaran_high = df[df['greendb_level'] == '4']
greenvaran_high = greenvaran_high[greenvaran_high['IMPACT'] == 'MODIFIER']

In [36]:
greenvaran_genes = []
for i in list(greenvaran_high.greendb_genes):
    for j in i.split(','):
        greenvaran_genes.append(j)
Counter(greenvaran_genes).most_common()

[('AC004549.1', 6),
 ('TAX1BP1', 6),
 ('AC087672.1', 5),
 ('AC087627.1', 5),
 ('JPH1', 5),
 ('FEZF2', 4),
 ('PTPRG-AS1', 4),
 ('GRM4', 4),
 ('DDX27', 4),
 ('LINC01967', 3),
 ('CMC1', 3),
 ('ADK', 3),
 ('RPSAP6', 3),
 ('IFFO1', 3),
 ('MAP2K6', 3),
 ('LMNA', 2),
 ('WNT10A', 2),
 ('MYRIP', 2),
 ('AC099331.1', 2),
 ('MIR6083', 2),
 ('KALRN', 2),
 ('EPHB3', 2),
 ('AL662884.1', 2),
 ('PBX2', 2),
 ('PRKAG2', 2),
 ('TBC1D2', 2),
 ('ZBTB34', 2),
 ('LGI1', 2),
 ('LMO1', 2),
 ('CHRM1', 2),
 ('PRICKLE1', 2),
 ('HSD11B2', 2),
 ('BRCA1', 2),
 ('RPL21P4', 2),
 ('RUNX1', 2),
 ('AC092198.1', 2),
 ('BCOR', 2),
 ('TIMM8A', 2),
 ('FMR1', 2),
 ('AL096861.1', 2),
 ('TP73', 1),
 ('AL136528.1', 1),
 ('WDTC1', 1),
 ('FO393419.3', 1),
 ('EPB41', 1),
 ('TMEM200B', 1),
 ('HPCA', 1),
 ('MACF1', 1),
 ('FAF1', 1),
 ('FAF1-AS1', 1),
 ('AL355483.2', 1),
 ('LRP8', 1),
 ('AK5', 1),
 ('AC118549.1', 1),
 ('DPYD', 1),
 ('AL354760.1', 1),
 ('ST7L', 1),
 ('BCL9', 1),
 ('ADAMTSL4', 1),
 ('ADAMTSL4-AS2', 1),
 ('FASLG', 1),
 ('

In [37]:
len(greenvaran_high)
#110

198

In [38]:
greenvaran_high.head()

Unnamed: 0,locus,ref,alt,filter,proband,family,conf,proband_info,father_info,mother_info,proband_GQ,proband_DP,father_DP,mother_DP,variant_MQ,Allele,Consequence,IMPACT,SYMBOL,Gene,Feature_type,Feature,BIOTYPE,EXON,INTRON,HGVSc,HGVSp,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,DISTANCE,STRAND,FLAGS,VARIANT_CLASS,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,TSL,APPRIS,CCDS,ENSP,SWISSPROT,TREMBL,UNIPARC,UNIPROT_ISOFORM,GENE_PHENO,SIFT,PolyPhen,DOMAINS,miRNA,HGVS_OFFSET,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,AA_AF,EA_AF,gnomAD_AF,gnomAD_AFR_AF,gnomAD_AMR_AF,gnomAD_ASJ_AF,gnomAD_EAS_AF,gnomAD_FIN_AF,gnomAD_NFE_AF,gnomAD_OTH_AF,gnomAD_SAS_AF,MAX_AF,MAX_AF_POPS,CLIN_SIG,SOMATIC,PHENO,PUBMED,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,TRANSCRIPTION_FACTORS,CADD_phred,DisGeNET,MPC,MTR,Mastermind,MetaSVM_pred,MetaSVM_rankscore,Phenotypes,Polyphen2_HDIV_score,SIFT_pred,SplicAI,SpliceRegion,gnomAD_exomes_AC,gnomAD_exomes_AF,gnomAD_exomes_AN,pLI,pLI_values,DS_AG,DS_AL,DS_DG,DS_DL,DP_AG,DP_AL,DP_DG,DP_DL,greendb_id,greendb_stdtype,greendb_dbsource,greendb_genes,greendb_constraint,greendb_level,greendb_more_support,gnomad_genomes_312_AF,SpliceAI_max,binom_p_val
3768,chr1:3677070,T,C,PASS,GLE_1567252581,8078,hiConfDeNovo,"0/1:21,19:40:99:99:39:500,0,511:415,0,599","0/0:40,0:40:72:99:39:0,107,1371:0,72,1363","0/0:35,0:35:65:99:39:0,100,1211:0,65,1203",99,40,40,35,60.0,C,intron_variant,MODIFIER,TP73,ENSG00000078900,Transcript,ENST00000346387,protein_coding,,1/11,ENST00000346387.8:c.-33-5263T>C,,,,,,,,,1,,SNV,HGNC,HGNC:12003,,,,5,,CCDS55568.1,ENSP00000340740,O15350.232,,UPI000002B05C,O15350-6,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,29,7,25,-2,"35137_pro,1487719_enh,35114_pro,89917_enh","enhancer,promoter","DECRES,FOCS,ENCODE-HMM,BENGI,FulcoEtAl2019","TP73,AL136528.1",0.86505,4,0,0.0,0.0,0.874629
8030,chr1:27298763,CAT,C,PASS,GLE_5702196366,8332,hiConfDeNovo,"0/1:11,5:16:91:79:19:0|1:27298751_A_AGGCCCCCCC...","0/0:42,0:42:46:79:19:.:.:0,84,1260:0,46,1248","0/0:42,0:42:43:79:19:.:.:0,81,1215:0,43,1203",91,16,42,42,60.0,-,intron_variant,MODIFIER,WDTC1,ENSG00000142784,Transcript,ENST00000319394,protein_coding,,12/15,ENST00000319394.8:c.1232+653_1232+654del,,,,,,,rs1570986586,,1,,deletion,HGNC,HGNC:29175,YES,NM_001276252.2,,1,P4,CCDS60044.1,ENSP00000317971,Q8N5D0.173,,UPI000004814E,Q8N5D0-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.01,-12,10,14,-29,"83777_enh,83780_enh,1486337_enh",enhancer,"DECRES,JungEtAl2019,ENCODE-HMM,BENGI","WDTC1,FO393419.3",0.923135,4,0,0.0006,0.01,0.210114
8252,chr1:29117534,A,G,PASS,GLE_8739880167,8074,hiConfDeNovo,"0/1:15,21:36:99:84:25:622,0,413:537,0,501","0/0:33,0:33:50:84:25:0,85,1058:0,50,1050","0/0:51,0:51:78:84:25:0,113,1708:0,78,1700",99,36,33,51,60.0,G,3_prime_UTR_variant,MODIFIER,EPB41,ENSG00000159023,Transcript,ENST00000343067,protein_coding,21/21,,ENST00000343067.9:c.*722A>G,,3524.0,,,,,,,1,,SNV,HGNC,HGNC:3377,YES,NM_001376013.1,,5,P2,CCDS53288.1,ENSP00000345259,P11171.232,,UPI000014177D,P11171-1,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,23,-46,-1,-3,85346_enh,enhancer,BENGI,"EPB41,TMEM200B",0.935115,4,0,0.0,0.0,0.405032
8706,chr1:32893663,C,G,PASS,GLE_5368458628,8158,hiConfDeNovo,"0/1:8,7:15:99:-1:-1:188,0,203:163,0,291","0/0:32,0:32:99:-1:-1:0,90,1032:0,113,1084","0/0:54,0:54:25:.:.:0,0,0:0,25,52",99,15,32,54,60.0,G,downstream_gene_variant,MODIFIER,TMEM54,ENSG00000121900,Transcript,ENST00000329151,protein_coding,,,,,,,,,,rs748820769,932.0,-1,,SNV,HGNC,HGNC:24143,,,,1,,CCDS85954.1,ENSP00000328630,Q969K7.135,,UPI0000074654,Q969K7-3,,,,,,,,,,,,,,,3.461e-05,7.254e-05,0.0001493,0.0,0.0,0.0,1.968e-05,0.0,0.0,0.0001493,gnomAD_AMR,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-44,-22,-30,-34,"579548_pro,88370_enh","enhancer,promoter","FOCS,SegWey,ENCODE-HMM,BENGI,EnsemblRegBuild",HPCA,0.727079,4,0,7e-06,0.0,1.0
9284,chr1:39406760,ATTCTTT,A,PASS,GLE_6101591489,8088,hiConfDeNovo,"0/1:7,2:9:23:.:.:1|0:39406756_A_AAAAAAAAAAAAC:...","0/0:26,0:26:94:.:.:.:.:0,72,823:0,94,871","0/0:51,0:51:99:.:.:.:.:0,108,1620:0,130,1668",23,9,26,51,60.0,-,intron_variant&non_coding_transcript_variant,MODIFIER,MACF1,ENSG00000127603,Transcript,ENST00000289893,retained_intron,,22/63,ENST00000289893.8:n.11268-15613_11268-15608del,,,,,,,,,1,,deletion,HGNC,HGNC:13664,,,,5,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-31,38,-41,14,"1771750_enh,510156_pro,1488315_enh","enhancer,promoter","DECRES,FOCS,SegWey,ENCODE-HMM,BENGI,EnsemblReg...",MACF1,0.975802,4,0,0.0022,0.0,0.179688


# Combined

In [39]:
combined = pd.concat([impact_high,MetaSVM_pred_D,PolyPhen_damaging,SIFT_pred_D,splice_AI_high,greenvaran_high]).drop_duplicates()
combined.head()

Unnamed: 0,locus,ref,alt,filter,proband,family,conf,proband_info,father_info,mother_info,proband_GQ,proband_DP,father_DP,mother_DP,variant_MQ,Allele,Consequence,IMPACT,SYMBOL,Gene,Feature_type,Feature,BIOTYPE,EXON,INTRON,HGVSc,HGVSp,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,DISTANCE,STRAND,FLAGS,VARIANT_CLASS,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,TSL,APPRIS,CCDS,ENSP,SWISSPROT,TREMBL,UNIPARC,UNIPROT_ISOFORM,GENE_PHENO,SIFT,PolyPhen,DOMAINS,miRNA,HGVS_OFFSET,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,AA_AF,EA_AF,gnomAD_AF,gnomAD_AFR_AF,gnomAD_AMR_AF,gnomAD_ASJ_AF,gnomAD_EAS_AF,gnomAD_FIN_AF,gnomAD_NFE_AF,gnomAD_OTH_AF,gnomAD_SAS_AF,MAX_AF,MAX_AF_POPS,CLIN_SIG,SOMATIC,PHENO,PUBMED,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,TRANSCRIPTION_FACTORS,CADD_phred,DisGeNET,MPC,MTR,Mastermind,MetaSVM_pred,MetaSVM_rankscore,Phenotypes,Polyphen2_HDIV_score,SIFT_pred,SplicAI,SpliceRegion,gnomAD_exomes_AC,gnomAD_exomes_AF,gnomAD_exomes_AN,pLI,pLI_values,DS_AG,DS_AL,DS_DG,DS_DL,DP_AG,DP_AL,DP_DG,DP_DL,greendb_id,greendb_stdtype,greendb_dbsource,greendb_genes,greendb_constraint,greendb_level,greendb_more_support,gnomad_genomes_312_AF,SpliceAI_max,binom_p_val
7396,chr1:23077336,C,T,PASS,GLE_1567252581,8078,hiConfDeNovo,"0/1:23,11:34:99:81:21:280,0,697:195,0,785","0/0:37,0:37:65:81:21:0,100,1252:0,65,1244","0/0:32,0:32:47:81:21:0,82,1013:0,47,1005",99,34,37,32,60.0,T,stop_gained,HIGH,KDM1A,ENSG00000004487,Transcript,ENST00000356634,protein_coding,14/19,,ENST00000356634.7:c.1771C>T,ENSP00000349049.3:p.Arg591Ter,1920,1771,591,R/*,Cga/Tga,COSV63088329,,1,,SNV,HGNC,HGNC:29079,,,,1,,CCDS30627.1,ENSP00000349049,O60341.213,,UPI000020466D,O60341-1,1,,,PDB-ENSP_mappings:2dw4.A&PDB-ENSP_mappings:2ej...,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,,,,,,38.0,invalid_field,invalid_field,invalid_field,invalid_field,,,invalid_field,.&.&.&.,.&.&.&.,invalid_field,invalid_field,,,,invalid_field,invalid_field,0.0,0.0,0.08,0.01,25,-5,46,24,72086_enh,enhancer,JungEtAl2019,"KDM1A,AL031428.1",0.72106,4.0,0.0,0.0,0.08,0.05761267
91255,chr1:201899847,ATG,A,PASS,GLE_2069350888,8334,hiConfDeNovo,"0/1:33,6:39:99:102:42:0|1:201899839_G_T:192,0,...","0/0:42,0:42:72:102:42:.:.:0,107,1513:0,72,1505","0/0:38,0:38:70:102:42:.:.:0,105,1351:0,70,1343",99,39,42,38,60.0,-,frameshift_variant,HIGH,LMOD1,ENSG00000163431,Transcript,ENST00000367288,protein_coding,2/3,,ENST00000367288.5:c.1164_1165del,ENSP00000356257.4:p.Ile389HisfsTer66,1372-1373,1164-1165,388-389,TI/TX,acCAtc/actc,,,-1,,deletion,HGNC,HGNC:6647,YES,NM_012134.3,,1,P1,CCDS53457.1,ENSP00000356257,P29536.168,,UPI00003665F4,P29536-1,1,,,PDB-ENSP_mappings:4z79.A&PDB-ENSP_mappings:4z8...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,13,-36,-36,-41,49275_enh,enhancer,JungEtAl2019,"AL513217.1,LMOD1",0.0929413,3.0,0.0,0.0,0.0,1.429926e-05
91256,chr1:201899850,G,GCA,PASS,GLE_2069350888,8334,hiConfDeNovo,"0/1:32,6:38:99:102:42:0|1:201899839_G_T:192,0,...","0/0:42,0:42:72:102:42:.:.:0,107,1513:0,72,1505","0/0:38,0:38:70:102:42:.:.:0,105,1351:0,70,1343",99,38,42,38,60.0,CA,frameshift_variant,HIGH,LMOD1,ENSG00000163431,Transcript,ENST00000367288,protein_coding,2/3,,ENST00000367288.5:c.1162_1163insTG,ENSP00000356257.4:p.Thr388MetfsTer31,1370-1371,1162-1163,388,T/MX,acc/aTGcc,,,-1,,insertion,HGNC,HGNC:6647,YES,NM_012134.3,,1,P1,CCDS53457.1,ENSP00000356257,P29536.168,,UPI00003665F4,P29536-1,1,,,PDB-ENSP_mappings:4z79.A&PDB-ENSP_mappings:4z8...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,10,-39,-38,-44,49275_enh,enhancer,JungEtAl2019,"AL513217.1,LMOD1",0.0929413,3.0,0.0,0.0,0.0,2.434256e-05
150240,chr3:10342946,A,ACCGTGAGCGGCGAATCTGTGCCCATGAAGGCACCGGCCAACACAA...,PASS,GLE_4949935840,8299,hiConfDeNovo,"0/1:39,5:44:99:61:3:0|1:10342936_G_A:1423,0,63...","0/0:55,0:55:26:61:3:.:.:0,61,1643:0,26,1635","0/0:41,0:59:99:61:3:0|1:10342944_G_GCGT:598,75...",99,44,55,59,59.99,CCGTGAGCGGCGAATCTGTGCCCATGAAGGCACCGGCCAACACAAT...,frameshift_variant&stop_gained,HIGH,ATP2B2,ENSG00000157087,Transcript,ENST00000352432,protein_coding,17/23,,ENST00000352432.9:c.2689_2690insCTCGCTCTACCAAA...,ENSP00000324172.6:p.Val897AlafsTer15,2689-2690,2689-2690,897,V/ARSTKTSSVSSSSK*R*T*PLASLCWPVPSWAQIRRSRX,gtg/gCTCGCTCTACCAAAACATCCAGCGTTTCATCCTCTTCCAAA...,,,-1,,insertion,HGNC,HGNC:815,,,,1,,,ENSP00000324172,,A0A2U3TZI3.19,UPI000D1955CC,,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.29,0.15,0.0,0.0,0,19,0,4,854554_enh,enhancer,JungEtAl2019,"ATP2B2,MIR378B",0.726492,4.0,0.0,0.0,0.29,1.405162e-07
154089,chr3:53747333,GT,G,PASS,GLE_4931115797,8077,hiConfDeNovo,"0/1:20,26:46:99:98:38:0|1:53747333_GT_G:1016,0...","0/0:47,0:47:85:98:38:.:.:0,120,1800:0,85,1792","0/0:34,0:34:64:98:38:.:.:0,99,1201:0,64,1193",99,46,47,34,60.0,-,frameshift_variant,HIGH,CACNA1D,ENSG00000157388,Transcript,ENST00000288139,protein_coding,27/49,,ENST00000288139.11:c.3261del,ENSP00000288139.3:p.Asp1088ThrfsTer24,3816,3260,1087,V/X,gTt/gt,,,1,,deletion,HGNC,HGNC:1391,,,NM_000720.4,1,P2,CCDS2872.1,ENSP00000288139,Q01668.209,,UPI000005031A,Q01668-2,1,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-31,-9,14,39,,,,,,,,0.0,0.0,0.4613912


In [40]:
combined[combined['SYMBOL'].isin(WES_gene_list)]

Unnamed: 0,locus,ref,alt,filter,proband,family,conf,proband_info,father_info,mother_info,proband_GQ,proband_DP,father_DP,mother_DP,variant_MQ,Allele,Consequence,IMPACT,SYMBOL,Gene,Feature_type,Feature,BIOTYPE,EXON,INTRON,HGVSc,HGVSp,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,DISTANCE,STRAND,FLAGS,VARIANT_CLASS,SYMBOL_SOURCE,HGNC_ID,CANONICAL,MANE_SELECT,MANE_PLUS_CLINICAL,TSL,APPRIS,CCDS,ENSP,SWISSPROT,TREMBL,UNIPARC,UNIPROT_ISOFORM,GENE_PHENO,SIFT,PolyPhen,DOMAINS,miRNA,HGVS_OFFSET,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,AA_AF,EA_AF,gnomAD_AF,gnomAD_AFR_AF,gnomAD_AMR_AF,gnomAD_ASJ_AF,gnomAD_EAS_AF,gnomAD_FIN_AF,gnomAD_NFE_AF,gnomAD_OTH_AF,gnomAD_SAS_AF,MAX_AF,MAX_AF_POPS,CLIN_SIG,SOMATIC,PHENO,PUBMED,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,TRANSCRIPTION_FACTORS,CADD_phred,DisGeNET,MPC,MTR,Mastermind,MetaSVM_pred,MetaSVM_rankscore,Phenotypes,Polyphen2_HDIV_score,SIFT_pred,SplicAI,SpliceRegion,gnomAD_exomes_AC,gnomAD_exomes_AF,gnomAD_exomes_AN,pLI,pLI_values,DS_AG,DS_AL,DS_DG,DS_DL,DP_AG,DP_AL,DP_DG,DP_DL,greendb_id,greendb_stdtype,greendb_dbsource,greendb_genes,greendb_constraint,greendb_level,greendb_more_support,gnomad_genomes_312_AF,SpliceAI_max,binom_p_val
7396,chr1:23077336,C,T,PASS,GLE_1567252581,8078,hiConfDeNovo,"0/1:23,11:34:99:81:21:280,0,697:195,0,785","0/0:37,0:37:65:81:21:0,100,1252:0,65,1244","0/0:32,0:32:47:81:21:0,82,1013:0,47,1005",99,34,37,32,60.0,T,stop_gained,HIGH,KDM1A,ENSG00000004487,Transcript,ENST00000356634,protein_coding,14/19,,ENST00000356634.7:c.1771C>T,ENSP00000349049.3:p.Arg591Ter,1920,1771,591,R/*,Cga/Tga,COSV63088329,,1,,SNV,HGNC,HGNC:29079,,,,1,,CCDS30627.1,ENSP00000349049,O60341.213,,UPI000020466D,O60341-1,1,,,PDB-ENSP_mappings:2dw4.A&PDB-ENSP_mappings:2ej...,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,,,,,,38.0,invalid_field,invalid_field,invalid_field,invalid_field,,,invalid_field,.&.&.&.,.&.&.&.,invalid_field,invalid_field,,,,invalid_field,invalid_field,0.0,0.0,0.08,0.01,25,-5,46,24,72086_enh,enhancer,JungEtAl2019,"KDM1A,AL031428.1",0.72106,4,0,0.0,0.08,0.057613
255889,chr6:33670584,G,A,PASS,GLE_7507632649,8336,hiConfDeNovo,"0/1:16,8:24:99:77:18:196,0,454:111,0,542","0/0:29,0:29:43:77:18:0,78,1170:0,43,1162","0/0:40,0:40:79:77:18:0,114,1710:0,79,1702",99,24,29,40,60.0,A,intron_variant&splice_region_variant,LOW,ITPR3,ENSG00000096433,Transcript,ENST00000374316,protein_coding,,20/58,ENST00000374316.9:c.2441+8G>A,,,,,,,,,1,,SNV,HGNC,HGNC:6182,,,,5,P1,CCDS4783.1,ENSP00000363435,Q14573.204,,UPI000013CB74,,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.9,0.0,1,26,1,-11,1174714_enh,enhancer,BENGI,"MIR3934,ITPR3",0.913404,2,0,0.0,0.9,0.15159
542191,chr14:24099236,G,GATCCTGGGCGTCACGTCGCCCAAGGGCCGCAAGTACCAC,PASS,GLE_8240152843,8252,hiConfDeNovo,"0/1:47,15:62:99:110:50:0|1:24099224_A_C:527,0,...","0/0:82,0:82:82:110:50:.:.:0,120,1800:0,82,1787","0/0:55,0:55:73:110:50:.:.:0,111,1665:0,73,1652",99,62,82,55,59.45,ATCCTGGGCGTCACGTCGCCCAAGGGCCGCAAGTACCAC,inframe_insertion&splice_region_variant,MODERATE,PCK2,ENSG00000100889,Transcript,ENST00000216780,protein_coding,,,ENST00000216780.9:c.852_852+1insATCCTGGGCGTCAC...,ENSP00000216780.4:p.Gly287_Ile288insValThrSerP...,947-948,852-853,284-285,-/ILGVTSPKGRKYH,-/ATCCTGGGCGTCACGTCGCCCAAGGGCCGCAAGTACCAC,,,1,,insertion,HGNC,HGNC:8725,YES,NM_004563.4,,1,P1,CCDS9609.1,ENSP00000216780,Q16822.207,A0A384MTT2.16,UPI0000169DF3,Q16822-1,1,,,AFDB-ENSP_mappings:AF-Q16822-F1.A,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.06,0.53,-23,11,11,0,"394051_enh,394052_enh",enhancer,"JungEtAl2019,BENGI",NRL,0.843415,4,0,0.0,0.53,5.8e-05
328308,chr8:101573569,T,C,PASS,GLE_8797717620,6673,hiConfDeNovo,"0/1:19,26:45:99:96:36:694,0,441:609,0,529","0/0:38,0:38:65:96:36:0,100,1335:0,65,1327","0/0:33,0:33:64:96:36:0,99,1107:0,64,1099",99,45,38,33,60.0,C,intron_variant,MODIFIER,GRHL2,ENSG00000083307,Transcript,ENST00000395927,protein_coding,,5/15,ENST00000395927.1:c.687-99T>C,,,,,,,rs1364292836,,1,,SNV,HGNC,HGNC:2799,,,,2,,CCDS83312.1,ENSP00000379260,Q6ISB3.146,,UPI000035CC51,Q6ISB3-2,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-6,47,2,-8,"1728007_enh,1289801_enh",enhancer,"DECRES,JungEtAl2019,BENGI","AP001207.1,GRHL2",0.856029,4,0,7e-06,0.0,0.371298


In [41]:
combined_short = combined[['locus','ref','alt','proband','family','Consequence','IMPACT','SYMBOL','PolyPhen','MetaSVM_pred','SpliceAI_max']].drop_duplicates()

In [42]:
combined['locus_ref_alt_proband'] = combined['locus'] + '_' + combined['ref'] + '_' + combined['alt'] + '_' + combined['proband']
combined.to_csv('combined_metaSVM_PolyPhen_SIFT_SpliceAI_greenvaran_multisplit_20230829.csv',sep=',',index=False)

# Check new variants

In [59]:
combined_new = pd.read_csv("combined_metaSVM_PolyPhen_SIFT_SpliceAI_greenvaran_multisplit_20230829_new.tsv",sep='\t')

In [64]:
combined_old = pd.read_csv("combined_metaSVM_PolyPhen_SIFT_SpliceAI_greenvaran_multisplit_20230829_old.tsv",sep='\t')

In [63]:
Counter(combined_new['VARIANT_CLASS']) # new variants found 20230829

Counter({'insertion': 55,
         'SNV': 127,
         'deletion': 44,
         'indel': 4,
         'substitution': 1})

In [65]:
Counter(combined_old['VARIANT_CLASS']) # old variants from previous analysis

Counter({'SNV': 178, 'deletion': 15, 'insertion': 17})

In [62]:
Counter(combined['VARIANT_CLASS']) # all

Counter({'SNV': 305,
         'deletion': 59,
         'insertion': 72,
         'indel': 4,
         'substitution': 1})