# Merging Annotations For Different Regions

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.options.display.max_rows = 8

Below we import our vcf reading function:

In [3]:
def read_vcf(path):
    with open(path, 'r') as f:
        lines = [l for l in f if not l.startswith('##')]
    return pd.read_csv(io.StringIO(''.join(lines)), dtype={'#CHROM': str, 'POS':int, 'ID':str, 'REF': str, 'ALT': str, 'QUAL': str, 'FILTER': str, 'INFO': str}, sep='\t').rename(columns={'#CHROM': 'CHROM'})

### Benign Coding Region:

Need to rename a few columns in ANNOVAR dataframes in order to merge them with CADD annotations and also remove 'chr' in all the values under the #Chrom column. Notice how we merge the dataframes using how = 'left' notation. The reason for this is that there are more annotations using CADD and this allows us to keep all of them. Where there is a corresponding annotation for ANNNOVAR, the columns following the CADD annotations get filled in with the ANNOVAR annotations. If not, the remaining columns are left blank.

##### Human Derived

Below is the annovar annotation file for the human derived coding region. We have cut the string 'chr' out of the first column and changed the first and second column names to #Chrom and Pos respectively:

In [29]:
human_derived_coding_annotated_with_annovar = pd.read_csv('human_derived_coding_annotated.hg38_multianno.csv')
human_derived_coding_annotated_with_annovar['Chr'] = human_derived_coding_annotated_with_annovar['Chr'].map(lambda x: x.lstrip('chr').rstrip('aAbBcC'))
human_derived_coding_annotated_with_annovar = human_derived_coding_annotated_with_annovar.rename(columns = {'Chr': '#Chrom', 'Start': 'Pos'})
human_derived_coding_annotated_with_annovar

Unnamed: 0,#Chrom,Pos,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,BP4,BP5,BP6,BP7,cosmic70,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,10,100020652,100020652,G,A,intergenic,DNMBP;CPN1,dist=10699;dist=21656,.,.,...,.,.,.,.,.,.,.,.,.,.
1,10,100190879,100190879,T,C,exonic,CHUK,.,nonsynonymous SNV,CHUK:NM_001278:exon20:c.A2198G:p.N733S,...,1,0,0,0,.,0.00447999043875,0.039,.,.,.
2,10,100233196,100233196,G,A,UTR3,CWF19L1,NM_018294:c.*31C>T;NM_001303407:c.*31C>T;NM_00...,.,.,...,.,.,.,.,.,.,.,.,.,.
3,10,100267615,100267615,C,T,UTR5,CWF19L1,NM_018294:c.-22G>A;NM_001303407:c.-20707G>A;NM...,.,.,...,.,.,.,.,.,.,.,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53794,X,99719539,99719539,A,G,ncRNA_exonic,XRCC6P5,.,.,.,...,.,.,.,.,.,.,.,.,.,.
53795,X,99719939,99719939,C,A,ncRNA_exonic,XRCC6P5,.,.,.,...,.,.,.,.,.,.,.,.,.,.
53796,X,99720084,99720084,G,A,ncRNA_exonic,XRCC6P5,.,.,.,...,.,.,.,.,.,.,.,.,.,.
53797,X,99721008,99721008,G,A,ncRNA_exonic,XRCC6P5,.,.,.,...,.,.,.,.,.,.,.,.,.,.


Below is the file for the CADD annotations for the human derived coding region:

In [30]:
human_derived_coding_annotated_with_cadd = pd.read_table('human_derived_coding_for_cadd_noheader.tsv')
human_derived_coding_annotated_with_cadd

  """Entry point for launching an IPython kernel.
  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,Length,AnnoType,Consequence,ConsScore,ConsDetail,...,Freq10000bp,Rare10000bp,Sngl10000bp,EnsembleRegulatoryFeature,dbscSNV-ada_score,dbscSNV-rf_score,RemapOverlapTF,RemapOverlapCL,RawScore,PHRED
0,1,943329,C,T,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,...,18,117,2586,CTCF Binding Site,,,88.0,121.0,0.590702,8.117
1,1,943329,C,T,SNV,0,CodingTranscript,SYNONYMOUS,5,synonymous,...,18,117,2586,CTCF Binding Site,,,88.0,121.0,0.590702,8.117
2,1,943329,C,T,SNV,0,Intergenic,DOWNSTREAM,1,downstream,...,18,117,2586,CTCF Binding Site,,,88.0,121.0,0.590702,8.117
3,1,944699,C,T,SNV,0,CodingTranscript,NON_SYNONYMOUS,7,missense,...,16,117,2573,,,,12.0,13.0,2.980499,23.200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122229,X,155260197,C,A,SNV,0,Transcript,3PRIME_UTR,2,3_prime_UTR,...,9,36,873,,,,3.0,6.0,-0.219204,0.473
122230,X,155612732,T,C,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,...,7,27,866,Promoter,,,112.0,202.0,0.661682,8.709
122231,X,155612732,T,C,SNV,0,Transcript,INTRONIC,2,intron,...,7,27,866,Promoter,,,112.0,202.0,0.661682,8.709
122232,X,155612732,T,C,SNV,0,Intergenic,DOWNSTREAM,1,downstream,...,7,27,866,Promoter,,,112.0,202.0,0.661682,8.709


Below we have merged the two dataframes above. We have merged using the CADD annotation dataframe (how = 'left') since there were more variants and more annotations given by CADD than ANNOVAR. This fact is shown below:

Merged annotations based on CADD:

In [31]:
human_derived_coding_merged = pd.merge(human_derived_coding_annotated_with_cadd, human_derived_coding_annotated_with_annovar,
                                         how = 'left', on = ['#Chrom', 'Pos', 'Ref', 'Alt'])
human_derived_coding_merged

Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,Length,AnnoType,Consequence,ConsScore,ConsDetail,...,BP4,BP5,BP6,BP7,cosmic70,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,1,943329,C,T,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,...,,,,,,,,,,
1,1,943329,C,T,SNV,0,CodingTranscript,SYNONYMOUS,5,synonymous,...,,,,,,,,,,
2,1,943329,C,T,SNV,0,Intergenic,DOWNSTREAM,1,downstream,...,,,,,,,,,,
3,1,944699,C,T,SNV,0,CodingTranscript,NON_SYNONYMOUS,7,missense,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122229,X,155260197,C,A,SNV,0,Transcript,3PRIME_UTR,2,3_prime_UTR,...,.,.,.,.,.,.,.,.,.,.
122230,X,155612732,T,C,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,...,.,.,.,.,.,.,.,.,.,.
122231,X,155612732,T,C,SNV,0,Transcript,INTRONIC,2,intron,...,.,.,.,.,.,.,.,.,.,.
122232,X,155612732,T,C,SNV,0,Intergenic,DOWNSTREAM,1,downstream,...,.,.,.,.,.,.,.,.,.,.


Merged annotations based on ANNOVAR:

In [32]:
human_derived_coding_merged_annovar = pd.merge(human_derived_coding_annotated_with_cadd, human_derived_coding_annotated_with_annovar,
                                         how = 'right', on = ['#Chrom', 'Pos', 'Ref', 'Alt'])
pd.set_option('display.max_columns', 270)
pd.options.display.max_rows = 8
human_derived_coding_merged_annovar

Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,Length,AnnoType,Consequence,ConsScore,ConsDetail,GC,CpG,motifECount,motifEName,motifEHIPos,motifEScoreChng,oAA,nAA,GeneID,FeatureID,GeneName,CCDS,Intron,Exon,cDNApos,relcDNApos,CDSpos,relCDSpos,protPos,relProtPos,Domain,Dst2Splice,Dst2SplType,minDistTSS,minDistTSE,SIFTcat,SIFTval,PolyPhenCat,PolyPhenVal,priPhCons,mamPhCons,verPhCons,priPhyloP,mamPhyloP,verPhyloP,bStatistic,targetScan,mirSVR-Score,mirSVR-E,mirSVR-Aln,cHmm_E1,cHmm_E2,cHmm_E3,cHmm_E4,cHmm_E5,cHmm_E6,cHmm_E7,cHmm_E8,cHmm_E9,cHmm_E10,cHmm_E11,cHmm_E12,cHmm_E13,cHmm_E14,cHmm_E15,cHmm_E16,cHmm_E17,cHmm_E18,cHmm_E19,cHmm_E20,cHmm_E21,cHmm_E22,cHmm_E23,cHmm_E24,cHmm_E25,GerpRS,GerpRSpval,GerpN,GerpS,tOverlapMotifs,motifDist,EncodeH3K4me1-sum,EncodeH3K4me1-max,EncodeH3K4me2-sum,EncodeH3K4me2-max,EncodeH3K4me3-sum,EncodeH3K4me3-max,EncodeH3K9ac-sum,EncodeH3K9ac-max,EncodeH3K9me3-sum,EncodeH3K9me3-max,EncodeH3K27ac-sum,EncodeH3K27ac-max,EncodeH3K27me3-sum,EncodeH3K27me3-max,EncodeH3K36me3-sum,EncodeH3K36me3-max,EncodeH3K79me2-sum,EncodeH3K79me2-max,EncodeH4K20me1-sum,EncodeH4K20me1-max,EncodeH2AFZ-sum,EncodeH2AFZ-max,EncodeDNase-sum,EncodeDNase-max,EncodetotalRNA-sum,EncodetotalRNA-max,Grantham,Dist2Mutation,Freq100bp,Rare100bp,Sngl100bp,Freq1000bp,Rare1000bp,Sngl1000bp,Freq10000bp,Rare10000bp,Sngl10000bp,EnsembleRegulatoryFeature,dbscSNV-ada_score,dbscSNV-rf_score,RemapOverlapTF,RemapOverlapCL,RawScore,PHRED,End,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,Func.knownGene,Gene.knownGene,GeneDetail.knownGene,ExonicFunc.knownGene,AAChange.knownGene,Func.ensGene,Gene.ensGene,GeneDetail.ensGene,ExonicFunc.ensGene,AAChange.ensGene,SIFT_score,SIFT_pred,Polyphen2_HDIV_score,Polyphen2_HDIV_pred,Polyphen2_HVAR_score,Polyphen2_HVAR_pred,LRT_score,LRT_pred,MutationTaster_score,MutationTaster_pred,MutationAssessor_score,MutationAssessor_pred,FATHMM_score,FATHMM_pred,RadialSVM_score,RadialSVM_pred,LR_score,LR_pred,VEST3_score,CADD_raw,CADD_phred,GERP++_RS,phyloP46way_placental,phyloP100way_vertebrate,SiPhy_29way_logOdds,Interpro_domain,SIFT_score.1,SIFT_converted_rankscore,SIFT_pred.1,Polyphen2_HDIV_score.1,Polyphen2_HDIV_rankscore,Polyphen2_HDIV_pred.1,Polyphen2_HVAR_score.1,Polyphen2_HVAR_rankscore,Polyphen2_HVAR_pred.1,LRT_score.1,LRT_converted_rankscore,LRT_pred.1,MutationTaster_score.1,MutationTaster_converted_rankscore,MutationTaster_pred.1,MutationAssessor_score.1,MutationAssessor_score_rankscore,MutationAssessor_pred.1,FATHMM_score.1,FATHMM_converted_rankscore,FATHMM_pred.1,PROVEAN_score,PROVEAN_converted_rankscore,PROVEAN_pred,VEST3_score.1,VEST3_rankscore,MetaSVM_score,MetaSVM_rankscore,MetaSVM_pred,MetaLR_score,MetaLR_rankscore,MetaLR_pred,M-CAP_score,M-CAP_rankscore,M-CAP_pred,CADD_raw.1,CADD_raw_rankscore,CADD_phred.1,DANN_score,DANN_rankscore,fathmm-MKL_coding_score,fathmm-MKL_coding_rankscore,fathmm-MKL_coding_pred,Eigen_coding_or_noncoding,Eigen-raw,Eigen-PC-raw,GenoCanyon_score,GenoCanyon_score_rankscore,integrated_fitCons_score,integrated_fitCons_score_rankscore,integrated_confidence_value,GERP++_RS.1,GERP++_RS_rankscore,phyloP100way_vertebrate.1,phyloP100way_vertebrate_rankscore,phyloP20way_mammalian,phyloP20way_mammalian_rankscore,phastCons100way_vertebrate,phastCons100way_vertebrate_rankscore,phastCons20way_mammalian,phastCons20way_mammalian_rankscore,SiPhy_29way_logOdds.1,SiPhy_29way_logOdds_rankscore,Interpro_domain.1,GTEx_V6_gene,GTEx_V6_tissue,dbscSNV_ADA_SCORE,dbscSNV_RF_SCORE,InterVar_automated,PVS1,PS1,PS2,PS3,PS4,PM1,PM2,PM3,PM4,PM5,PM6,PP1,PP2,PP3,PP4,PP5,BA1,BS1,BS2,BS3,BS4,BP1,BP2,BP3,BP4,BP5,BP6,BP7,cosmic70,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,9,1046375,G,T,SNV,0.0,Intergenic,UPSTREAM,1.0,upstream,0.649,0.147,,,,,,,ENSG00000173253,ENST00000382251,DMRT2,CCDS6444.1,,,,,,,,,,,,750.0,2267.0,,,,,0.002,0.001,0.0,-0.382,-0.373,-0.324,952.0,,,,,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,9.0,8.0,17.0,4.0,0.0,0.0,,,1.61,-3.21,,,9.28,2.72,8.93,2.22,4.07,1.13,2.37,1.3,14.40,3.64,3.24,1.20,73.12,39.14,4.04,1.18,3.34,0.93,4.55,1.07,9.45,3.27,0.35,0.17,,,,3.0,0.0,2.0,21.0,2.0,13.0,213.0,40.0,99.0,1972.0,,,,7.0,16.0,-0.106065,0.969,1046375,intergenic,DMRT3;LINC01230,dist=54643;dist=1768,.,.,ncRNA_exonic,LINC01230,.,.,.,ncRNA_exonic,LINC01230,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.
1,9,1046375,G,T,SNV,0.0,NonCodingTranscript,NONCODING_CHANGE,5.0,non_coding_exon,0.649,0.147,,,,,,,ENSG00000281769,ENST00000625222,LINC01230,,,1/1,751.0,0.249,,,,,,,,750.0,2267.0,,,,,0.002,0.001,0.0,-0.382,-0.373,-0.324,952.0,,,,,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,9.0,8.0,17.0,4.0,0.0,0.0,,,1.61,-3.21,,,9.28,2.72,8.93,2.22,4.07,1.13,2.37,1.3,14.40,3.64,3.24,1.20,73.12,39.14,4.04,1.18,3.34,0.93,4.55,1.07,9.45,3.27,0.35,0.17,,,,3.0,0.0,2.0,21.0,2.0,13.0,213.0,40.0,99.0,1972.0,,,,7.0,16.0,-0.106065,0.969,1046375,intergenic,DMRT3;LINC01230,dist=54643;dist=1768,.,.,ncRNA_exonic,LINC01230,.,.,.,ncRNA_exonic,LINC01230,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.
2,9,1046421,A,G,SNV,0.0,NonCodingTranscript,NONCODING_CHANGE,5.0,non_coding_exon,0.517,0.080,,,,,,,ENSG00000281769,ENST00000625222,LINC01230,,,1/1,797.0,0.264,,,,,,,,796.0,2221.0,,,,,0.006,0.000,0.0,0.595,-0.197,-0.221,952.0,,,,,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,7.0,8.0,20.0,3.0,0.0,0.0,,,4.73,-3.59,,,6.95,2.37,8.60,2.21,5.45,1.94,3.29,1.3,14.74,4.85,2.39,0.94,63.16,37.78,1.95,0.81,2.95,0.93,3.75,1.07,8.08,1.95,0.35,0.19,,,,5.0,0.0,0.0,26.0,2.0,13.0,214.0,40.0,99.0,1971.0,,,,3.0,12.0,0.197295,4.027,1046421,intergenic,DMRT3;LINC01230,dist=54689;dist=1722,.,.,ncRNA_exonic,LINC01230,.,.,.,ncRNA_exonic,LINC01230,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.
3,9,1046421,A,G,SNV,0.0,Intergenic,UPSTREAM,1.0,upstream,0.517,0.080,,,,,,,ENSG00000173253,ENST00000382251,DMRT2,CCDS6444.1,,,,,,,,,,,,796.0,2221.0,,,,,0.006,0.000,0.0,0.595,-0.197,-0.221,952.0,,,,,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,7.0,8.0,20.0,3.0,0.0,0.0,,,4.73,-3.59,,,6.95,2.37,8.60,2.21,5.45,1.94,3.29,1.3,14.74,4.85,2.39,0.94,63.16,37.78,1.95,0.81,2.95,0.93,3.75,1.07,8.08,1.95,0.35,0.19,,,,5.0,0.0,0.0,26.0,2.0,13.0,214.0,40.0,99.0,1971.0,,,,3.0,12.0,0.197295,4.027,1046421,intergenic,DMRT3;LINC01230,dist=54689;dist=1722,.,.,ncRNA_exonic,LINC01230,.,.,.,ncRNA_exonic,LINC01230,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57690,X,153182814,T,A,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,153182814,exonic,MAGEA1,.,synonymous SNV,MAGEA1:NM_004988:exon3:c.A425A:p.E142E,exonic,MAGEA1,.,synonymous SNV,MAGEA1:uc004fhf.3:exon3:c.A425A:p.E142E,exonic,MAGEA1,.,synonymous SNV,MAGEA1:ENST00000356661.6:exon3:c.A425A:p.E142E,0.35,T,0.0,B,0.003,B,0.147,N,1.000,N,0,N,3.44,T,-0.962,T,0.007,T,0.112,-0.967,0.313,-2.09,-1.453,-0.012,0.067,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,0.00480613091631,0.048,.,.,.
57691,X,153182926,A,G,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,153182926,exonic,MAGEA1,.,stopgain,MAGEA1:NM_004988:exon3:c.T537G:p.Y179X,exonic,MAGEA1,.,stopgain,MAGEA1:uc004fhf.3:exon3:c.T537G:p.Y179X,exonic,MAGEA1,.,stopgain,MAGEA1:ENST00000356661.6:exon3:c.T537G:p.Y179X,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,1.000,0.500,.,.,.,-2.55,0.059,-4.507,0.003,-4.751,0.000,0.000,0.063,0.000,0.016,9.852,0.400,MAGE homology domain,.,.,.,.,Uncertain significance,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,.,.,.,.,.,.
57692,X,153183077,T,A,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,153183077,exonic,MAGEA1,.,synonymous SNV,MAGEA1:NM_004988:exon3:c.A688A:p.S230S,exonic,MAGEA1,.,synonymous SNV,MAGEA1:uc004fhf.3:exon3:c.A688A:p.S230S,exonic,MAGEA1,.,synonymous SNV,MAGEA1:ENST00000356661.6:exon3:c.A688A:p.S230S,0.07,T,0.996,D,0.905,P,0.006,N,1.000,N,1.63,L,3.43,T,-1.021,T,0.037,T,0.061,1.324,10.34,-2.55,-0.677,-0.478,4.633,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,0.00127721042215,0.038,.,.,.
57693,X,153183202,G,T,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,153183202,exonic,MAGEA1,.,synonymous SNV,MAGEA1:NM_004988:exon3:c.C813T:p.L271L,exonic,MAGEA1,.,synonymous SNV,MAGEA1:uc004fhf.3:exon3:c.C813T:p.L271L,exonic,MAGEA1,.,synonymous SNV,MAGEA1:ENST00000356661.6:exon3:c.C813T:p.L271L,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,Likely benign,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,.,.,.,.,.,.


When looking at all the columns, we see that for the first several variants, almost all the ANNOVAR annotations are NaN values. Therefore, we know that CADD annotations give us more values and therefore we should use the CADD dataframe to merge by since we have a greater output of variants.

Below we have dropped any duplicate rows as to not have our model train on multiple variants that are exactly the same:

In [33]:
human_derived_coding_merged = human_derived_coding_merged.drop_duplicates(keep = 'first')
human_derived_coding_merged

Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,Length,AnnoType,Consequence,ConsScore,ConsDetail,GC,CpG,motifECount,motifEName,motifEHIPos,motifEScoreChng,oAA,nAA,GeneID,FeatureID,GeneName,CCDS,Intron,Exon,cDNApos,relcDNApos,CDSpos,relCDSpos,protPos,relProtPos,Domain,Dst2Splice,Dst2SplType,minDistTSS,minDistTSE,SIFTcat,SIFTval,PolyPhenCat,PolyPhenVal,priPhCons,mamPhCons,verPhCons,priPhyloP,mamPhyloP,verPhyloP,bStatistic,targetScan,mirSVR-Score,mirSVR-E,mirSVR-Aln,cHmm_E1,cHmm_E2,cHmm_E3,cHmm_E4,cHmm_E5,cHmm_E6,cHmm_E7,cHmm_E8,cHmm_E9,cHmm_E10,cHmm_E11,cHmm_E12,cHmm_E13,cHmm_E14,cHmm_E15,cHmm_E16,cHmm_E17,cHmm_E18,cHmm_E19,cHmm_E20,cHmm_E21,cHmm_E22,cHmm_E23,cHmm_E24,cHmm_E25,GerpRS,GerpRSpval,GerpN,GerpS,tOverlapMotifs,motifDist,EncodeH3K4me1-sum,EncodeH3K4me1-max,EncodeH3K4me2-sum,EncodeH3K4me2-max,EncodeH3K4me3-sum,EncodeH3K4me3-max,EncodeH3K9ac-sum,EncodeH3K9ac-max,EncodeH3K9me3-sum,EncodeH3K9me3-max,EncodeH3K27ac-sum,EncodeH3K27ac-max,EncodeH3K27me3-sum,EncodeH3K27me3-max,EncodeH3K36me3-sum,EncodeH3K36me3-max,EncodeH3K79me2-sum,EncodeH3K79me2-max,EncodeH4K20me1-sum,EncodeH4K20me1-max,EncodeH2AFZ-sum,EncodeH2AFZ-max,EncodeDNase-sum,EncodeDNase-max,EncodetotalRNA-sum,EncodetotalRNA-max,Grantham,Dist2Mutation,Freq100bp,Rare100bp,Sngl100bp,Freq1000bp,Rare1000bp,Sngl1000bp,Freq10000bp,Rare10000bp,Sngl10000bp,EnsembleRegulatoryFeature,dbscSNV-ada_score,dbscSNV-rf_score,RemapOverlapTF,RemapOverlapCL,RawScore,PHRED,End,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,Func.knownGene,Gene.knownGene,GeneDetail.knownGene,ExonicFunc.knownGene,AAChange.knownGene,Func.ensGene,Gene.ensGene,GeneDetail.ensGene,ExonicFunc.ensGene,AAChange.ensGene,SIFT_score,SIFT_pred,Polyphen2_HDIV_score,Polyphen2_HDIV_pred,Polyphen2_HVAR_score,Polyphen2_HVAR_pred,LRT_score,LRT_pred,MutationTaster_score,MutationTaster_pred,MutationAssessor_score,MutationAssessor_pred,FATHMM_score,FATHMM_pred,RadialSVM_score,RadialSVM_pred,LR_score,LR_pred,VEST3_score,CADD_raw,CADD_phred,GERP++_RS,phyloP46way_placental,phyloP100way_vertebrate,SiPhy_29way_logOdds,Interpro_domain,SIFT_score.1,SIFT_converted_rankscore,SIFT_pred.1,Polyphen2_HDIV_score.1,Polyphen2_HDIV_rankscore,Polyphen2_HDIV_pred.1,Polyphen2_HVAR_score.1,Polyphen2_HVAR_rankscore,Polyphen2_HVAR_pred.1,LRT_score.1,LRT_converted_rankscore,LRT_pred.1,MutationTaster_score.1,MutationTaster_converted_rankscore,MutationTaster_pred.1,MutationAssessor_score.1,MutationAssessor_score_rankscore,MutationAssessor_pred.1,FATHMM_score.1,FATHMM_converted_rankscore,FATHMM_pred.1,PROVEAN_score,PROVEAN_converted_rankscore,PROVEAN_pred,VEST3_score.1,VEST3_rankscore,MetaSVM_score,MetaSVM_rankscore,MetaSVM_pred,MetaLR_score,MetaLR_rankscore,MetaLR_pred,M-CAP_score,M-CAP_rankscore,M-CAP_pred,CADD_raw.1,CADD_raw_rankscore,CADD_phred.1,DANN_score,DANN_rankscore,fathmm-MKL_coding_score,fathmm-MKL_coding_rankscore,fathmm-MKL_coding_pred,Eigen_coding_or_noncoding,Eigen-raw,Eigen-PC-raw,GenoCanyon_score,GenoCanyon_score_rankscore,integrated_fitCons_score,integrated_fitCons_score_rankscore,integrated_confidence_value,GERP++_RS.1,GERP++_RS_rankscore,phyloP100way_vertebrate.1,phyloP100way_vertebrate_rankscore,phyloP20way_mammalian,phyloP20way_mammalian_rankscore,phastCons100way_vertebrate,phastCons100way_vertebrate_rankscore,phastCons20way_mammalian,phastCons20way_mammalian_rankscore,SiPhy_29way_logOdds.1,SiPhy_29way_logOdds_rankscore,Interpro_domain.1,GTEx_V6_gene,GTEx_V6_tissue,dbscSNV_ADA_SCORE,dbscSNV_RF_SCORE,InterVar_automated,PVS1,PS1,PS2,PS3,PS4,PM1,PM2,PM3,PM4,PM5,PM6,PP1,PP2,PP3,PP4,PP5,BA1,BS1,BS2,BS3,BS4,BP1,BP2,BP3,BP4,BP5,BP6,BP7,cosmic70,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,1,943329,C,T,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,0.675,0.093,,,,,,,,ENSR00000344437,,,,,,,,,,,,,,1163,335,,,,,0.817,0.197,0.000,-1.217,-1.307,-1.541,940.0,,,,,0,1,0,0,0,0,1,0,0,0,1,5,1,2,3,0,1,1,0,15,0,5,8,2,2,,,18.30,-15.60,,,35.67,9.85,26.74,9.35,26.18,10.77,21.90,8.94,5.92,2.84,22.88,10.90,38.80,24.49,10.97,2.87,8.26,2.52,24.86,4.94,7.70,2.47,8.49,5.15,0.60,0.32,,6,0.0,2.0,38.0,1.0,17.0,314.0,18,117,2586,CTCF Binding Site,,,88.0,121.0,0.590702,8.117,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1,943329,C,T,SNV,0,CodingTranscript,SYNONYMOUS,5,synonymous,0.675,0.093,,,,,D,D,ENSG00000187634,ENST00000342066,SAMD11,CCDS2.2,,12/14,1724.0,0.676,1641.0,0.802,547.0,0.803,ndomain,,,1163,335,,,,,0.817,0.197,0.000,-1.217,-1.307,-1.541,940.0,,,,,0,1,0,0,0,0,1,0,0,0,1,5,1,2,3,0,1,1,0,15,0,5,8,2,2,,,18.30,-15.60,,,35.67,9.85,26.74,9.35,26.18,10.77,21.90,8.94,5.92,2.84,22.88,10.90,38.80,24.49,10.97,2.87,8.26,2.52,24.86,4.94,7.70,2.47,8.49,5.15,0.60,0.32,,6,0.0,2.0,38.0,1.0,17.0,314.0,18,117,2586,CTCF Binding Site,,,88.0,121.0,0.590702,8.117,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1,943329,C,T,SNV,0,Intergenic,DOWNSTREAM,1,downstream,0.675,0.093,,,,,,,ENSG00000188976,ENST00000327044,NOC2L,CCDS3.1,,,,,,,,,,,,1163,335,,,,,0.817,0.197,0.000,-1.217,-1.307,-1.541,940.0,,,,,0,1,0,0,0,0,1,0,0,0,1,5,1,2,3,0,1,1,0,15,0,5,8,2,2,,,18.30,-15.60,,,35.67,9.85,26.74,9.35,26.18,10.77,21.90,8.94,5.92,2.84,22.88,10.90,38.80,24.49,10.97,2.87,8.26,2.52,24.86,4.94,7.70,2.47,8.49,5.15,0.60,0.32,,6,0.0,2.0,38.0,1.0,17.0,314.0,18,117,2586,CTCF Binding Site,,,88.0,121.0,0.590702,8.117,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1,944699,C,T,SNV,0,CodingTranscript,NON_SYNONYMOUS,7,missense,0.695,0.053,,,,,D,N,ENSG00000188976,ENST00000327044,NOC2L,CCDS3.1,,19/19,2295.0,0.823,2245.0,0.998,749.0,1.000,lcompl,,,864,118,deleterious,0.0,possibly_damaging,0.791,0.007,1.000,0.988,-0.494,1.459,2.526,934.0,,,,,1,1,0,0,0,5,11,3,1,5,5,0,1,3,3,0,1,0,0,0,0,1,2,3,2,,,18.30,-4.82,,,18.34,4.96,15.56,5.30,12.11,3.01,14.44,9.34,1.38,0.52,37.51,20.55,3.00,1.73,8.22,2.08,0.61,0.61,22.94,5.98,6.13,2.90,2.33,0.94,35.12,23.19,23.0,7,0.0,0.0,34.0,4.0,14.0,290.0,16,117,2573,,,,12.0,13.0,2.980499,23.200,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122229,X,155260197,C,A,SNV,0,Transcript,3PRIME_UTR,2,3_prime_UTR,0.377,0.013,,,,,,,ENSG00000155961,ENST00000369454,RAB39B,CCDS14766.1,,2/2,1549.0,0.442,,,,,,,,4393,1956,,,,,0.160,0.000,0.000,0.386,-1.109,-1.160,541.0,,-0.2312,-16.8,150.0,0,0,1,16,0,0,3,1,0,0,0,0,0,0,23,0,0,0,0,0,4,0,0,0,0,6692.35,0.0,5.81,-9.09,,,7.13,1.65,3.88,1.26,10.09,2.15,4.57,1.38,9.09,3.38,3.72,1.38,6.08,1.24,7.35,2.16,5.53,1.78,9.23,2.24,10.70,3.78,0.17,0.07,0.70,0.52,,9,0.0,0.0,6.0,0.0,5.0,85.0,9,36,873,,,,3.0,6.0,-0.219204,0.473,155260197.0,UTR3,RAB39B,NM_171998:c.*606G>T,.,.,UTR3,RAB39B,uc004fne.5:c.*606G>T,.,.,UTR3,RAB39B,ENST00000369454.3:c.*606G>T,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.
122230,X,155612732,T,C,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,0.695,0.147,,,,,,,,ENSR00000249743,,,,,,,,,,,,,,146,604,,,,,0.026,0.000,0.000,0.237,0.421,0.441,,,,,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,44,1,0,2,0,0,0,0,0,0,,,4.17,1.47,,,5.11,0.77,158.05,35.65,527.53,60.44,549.81,109.62,11.13,2.37,406.50,99.96,11.39,4.35,6.98,1.62,54.94,11.00,7.66,1.99,233.84,36.28,3.93,1.60,0.38,0.08,,26,0.0,2.0,12.0,1.0,3.0,122.0,7,27,866,Promoter,,,112.0,202.0,0.661682,8.709,155612732.0,intronic,SPRY3;TMLHE,.,.,.,intronic,TMLHE,.,.,.,intronic,TMLHE,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.
122231,X,155612732,T,C,SNV,0,Transcript,INTRONIC,2,intron,0.695,0.147,,,,,,,ENSG00000185973,ENST00000334398,TMLHE,CCDS14768.1,1/7,,,,,,,,,,,146,604,,,,,0.026,0.000,0.000,0.237,0.421,0.441,,,,,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,44,1,0,2,0,0,0,0,0,0,,,4.17,1.47,,,5.11,0.77,158.05,35.65,527.53,60.44,549.81,109.62,11.13,2.37,406.50,99.96,11.39,4.35,6.98,1.62,54.94,11.00,7.66,1.99,233.84,36.28,3.93,1.60,0.38,0.08,,26,0.0,2.0,12.0,1.0,3.0,122.0,7,27,866,Promoter,,,112.0,202.0,0.661682,8.709,155612732.0,intronic,SPRY3;TMLHE,.,.,.,intronic,TMLHE,.,.,.,intronic,TMLHE,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.
122232,X,155612732,T,C,SNV,0,Intergenic,DOWNSTREAM,1,downstream,0.695,0.147,,,,,,,ENSG00000224533,ENST00000452506,TMLHE-AS1,,,,,,,,,,,,,146,604,,,,,0.026,0.000,0.000,0.237,0.421,0.441,,,,,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,44,1,0,2,0,0,0,0,0,0,,,4.17,1.47,,,5.11,0.77,158.05,35.65,527.53,60.44,549.81,109.62,11.13,2.37,406.50,99.96,11.39,4.35,6.98,1.62,54.94,11.00,7.66,1.99,233.84,36.28,3.93,1.60,0.38,0.08,,26,0.0,2.0,12.0,1.0,3.0,122.0,7,27,866,Promoter,,,112.0,202.0,0.661682,8.709,155612732.0,intronic,SPRY3;TMLHE,.,.,.,intronic,TMLHE,.,.,.,intronic,TMLHE,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.


##### ClinVar

Below is the Benign Coding Region for ClinVar:

In [34]:
clinvar_benign_coding_annotated_with_annovar = pd.read_csv('clinvar_benign_coding_for_annovar.hg38_multianno.csv')
clinvar_benign_coding_annotated_with_annovar

Unnamed: 0,Chr,Start,End,Ref,Alt,...,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,chr10,100987606,100987606,G,T,...,.,.,.,.,.
1,chr10,102065918,102065918,C,G,...,.,.,.,.,.
2,chr10,102396271,102396271,G,A,...,.,0.018,.,.,.
3,chr10,102399466,102399466,C,T,...,.,.,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...
4655,chrX,85964016,85964016,T,C,...,.,.,.,.,.
4656,chrX,85978770,85978770,G,C,...,0.0245874451416,0.248,.,.,.
4657,chrX,85978816,85978816,T,A,...,.,0.100,.,.,.
4658,chrX,93671995,93671995,G,C,...,0.0059993776503,0.060,.,.,.


In [35]:
clinvar_benign_coding_annotated_with_cadd = pd.read_table('clinvar_coding_benign_cadd_annotations_noheader.tsv')
clinvar_benign_coding_annotated_with_cadd

  """Entry point for launching an IPython kernel.
  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,...,dbscSNV-rf_score,RemapOverlapTF,RemapOverlapCL,RawScore,PHRED
0,1,1014042,G,A,SNV,...,,69.0,97.0,0.076262,2.519
1,1,1014042,G,A,SNV,...,,69.0,97.0,0.076262,2.519
2,1,1014042,G,A,SNV,...,,69.0,97.0,0.076262,2.519
3,1,1014042,G,A,SNV,...,,69.0,97.0,0.076262,2.519
...,...,...,...,...,...,...,...,...,...,...,...
9134,X,154776813,C,CAAG,INS,...,,7.0,7.0,0.893695,10.880
9135,X,154776813,C,CAAG,INS,...,,7.0,7.0,0.893695,10.880
9136,X,154776813,C,CAAG,INS,...,,7.0,7.0,0.893695,10.880
9137,X,154929926,T,G,SNV,...,,,,0.089486,2.669


Below are the steps used to combine the annotations for clinvar benign coding variants:

In [36]:
clinvar_benign_coding_annotated_with_annovar['Chr'] = clinvar_benign_coding_annotated_with_annovar['Chr'].map(lambda x: x.lstrip('chr').rstrip('aAbBcC'))
clinvar_benign_coding_annotated_with_annovar = clinvar_benign_coding_annotated_with_annovar.rename(columns = {'Chr': '#Chrom', 'Start': 'Pos'})
clinvar_benign_coding_merged = pd.merge(clinvar_benign_coding_annotated_with_cadd, clinvar_benign_coding_annotated_with_annovar,
                                         how = 'left', on = ['#Chrom', 'Pos', 'Ref', 'Alt'])
clinvar_benign_coding_merged

Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,...,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,1,1014042,G,A,SNV,...,,,,,
1,1,1014042,G,A,SNV,...,,,,,
2,1,1014042,G,A,SNV,...,,,,,
3,1,1014042,G,A,SNV,...,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
9134,X,154776813,C,CAAG,INS,...,,,,,
9135,X,154776813,C,CAAG,INS,...,,,,,
9136,X,154776813,C,CAAG,INS,...,,,,,
9137,X,154929926,T,G,SNV,...,.,.,.,.,.


Below I have dropped the duplicates of the merged dataframe:

In [60]:
clinvar_benign_coding_merged = clinvar_benign_coding_merged.drop_duplicates(keep = 'first')
clinvar_benign_coding_merged

Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,...,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,1,1014042,G,A,SNV,...,,,,,
1,1,1014042,G,A,SNV,...,,,,,
2,1,1014042,G,A,SNV,...,,,,,
3,1,1014042,G,A,SNV,...,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
9134,X,154776813,C,CAAG,INS,...,,,,,
9135,X,154776813,C,CAAG,INS,...,,,,,
9136,X,154776813,C,CAAG,INS,...,,,,,
9137,X,154929926,T,G,SNV,...,.,.,.,.,.


Below is the final dataframe for all benign coding region variants:

In [61]:
benign_coding_region_variants = pd.concat([clinvar_benign_coding_merged, human_derived_coding_merged])
benign_coding_region_variants

Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,...,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,1,1014042,G,A,SNV,...,,,,,
1,1,1014042,G,A,SNV,...,,,,,
2,1,1014042,G,A,SNV,...,,,,,
3,1,1014042,G,A,SNV,...,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
122229,X,155260197,C,A,SNV,...,.,.,.,.,.
122230,X,155612732,T,C,SNV,...,.,.,.,.,.
122231,X,155612732,T,C,SNV,...,.,.,.,.,.
122232,X,155612732,T,C,SNV,...,.,.,.,.,.


We need to drop some columns that are unneccessary for our final model, shown below:

In [62]:
benign_coding_region_variants = benign_coding_region_variants.drop(columns = ['PolyPhenVal', 'priPhCons', 'cHmm_E1', 'cHmm_E2', 'cHmm_E3', 'cHmm_E4', 'cHmm_E5', 'cHmm_E6', 'cHmm_E7', 'cHmm_E8', 'cHmm_E9', 'cHmm_E10', 'cHmm_E11', 'cHmm_E12', 'cHmm_E13', 'cHmm_E14', 'cHmm_E15', 'cHmm_E16', 'cHmm_E17', 'cHmm_E18', 'cHmm_E19', 'cHmm_E20', 'cHmm_E21', 'cHmm_E22', 'cHmm_E23', 'cHmm_E24', 'cHmm_E25', 'GerpN', 'GerpS'])
benign_coding_region_variants

Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,...,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,1,1014042,G,A,SNV,...,,,,,
1,1,1014042,G,A,SNV,...,,,,,
2,1,1014042,G,A,SNV,...,,,,,
3,1,1014042,G,A,SNV,...,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
122229,X,155260197,C,A,SNV,...,.,.,.,.,.
122230,X,155612732,T,C,SNV,...,.,.,.,.,.
122231,X,155612732,T,C,SNV,...,.,.,.,.,.
122232,X,155612732,T,C,SNV,...,.,.,.,.,.


The following code changes all scores and other numerical outputs to floats from objects for both CADD and ANNOVAR:

In [63]:
benign_coding_region_variants_with_nan = benign_coding_region_variants.replace('.', np.nan)
benign_coding_region_variants_with_nan = benign_coding_region_variants_with_nan.astype({'SIFT_score': float, 'Polyphen2_HDIV_score' : float, 
                'Polyphen2_HVAR_score' : float, 'LRT_score' : float, 
                'MutationTaster_score' : float, 'MutationAssessor_score' : float, 
                'FATHMM_score' : float, 'RadialSVM_score' : float,
               'LR_score' : float, 'VEST3_score' : float, 'CADD_raw' : float,
               'CADD_phred' : float, 'GERP++_RS' : float, 'phyloP46way_placental' : float,
               'phyloP100way_vertebrate' : float, 'SiPhy_29way_logOdds' : float, 'SIFT_score.1' : float,
               'SIFT_converted_rankscore' : float, 'Polyphen2_HDIV_score.1' : float, 'Polyphen2_HDIV_rankscore' : float,
               'Polyphen2_HVAR_score.1' : float, 'Polyphen2_HVAR_rankscore' : float, 'LRT_score.1' : float,
               'LRT_converted_rankscore' : float, 'MutationTaster_score.1' : float, 'MutationTaster_converted_rankscore' : float,
               'MutationAssessor_score.1' : float, 'MutationAssessor_score_rankscore' : float, 'FATHMM_score.1' : float,
               'FATHMM_converted_rankscore' : float, 'PROVEAN_score' : float, 'PROVEAN_converted_rankscore' : float,
               'VEST3_score.1' : float, 'VEST3_rankscore' : float, 'MetaSVM_score' : float, 'MetaSVM_rankscore' : float,
               'MetaLR_score' : float, 'MetaLR_rankscore' : float, 'M-CAP_score' : float, 'M-CAP_rankscore' : float,
               'CADD_raw.1' : float, 'CADD_raw_rankscore' : float, 'CADD_phred.1' : float, 'DANN_score' : float,
               'DANN_rankscore' : float, 'fathmm-MKL_coding_score' : float, 'fathmm-MKL_coding_rankscore' : float,
               'Eigen-raw' : float, 'Eigen-PC-raw' : float, 'GenoCanyon_score' : float, 'GenoCanyon_score_rankscore' : float,
               'integrated_fitCons_score' : float, 'integrated_fitCons_score_rankscore' : float, 'integrated_confidence_value' : float,
               'GERP++_RS.1' : float, 'GERP++_RS_rankscore' : float, 'phyloP100way_vertebrate.1' : float, 
                'phyloP100way_vertebrate_rankscore' : float, 'phyloP20way_mammalian' : float, 'phyloP20way_mammalian_rankscore' : float,
               'phastCons100way_vertebrate' : float, 'phastCons100way_vertebrate_rankscore' : float, 'phastCons20way_mammalian' : float,
               'phastCons20way_mammalian_rankscore' : float, 'SiPhy_29way_logOdds.1' : float, 'SiPhy_29way_logOdds_rankscore' : float,
               'dbscSNV_ADA_SCORE' : float, 'dbscSNV_RF_SCORE' : float, 'PVS1' : float, 'PS1' : float, 'PS2' : float, 'PS3' : float,
               'PS4' : float, 'PM1' : float, 'PM2' : float, 'PM3' : float, 'PM4' : float, 'PM5' : float, 'PM6' : float, 'PP1' : float, 'PP2' : float,
               'PP3' : float, 'PP4' : float, 'PP5' : float, 'BA1' : float, 'BS1' : float, 'BS2' : float, 'BS3' : float, 'BS4' : float, 'BP1' : float, 'BP2' : float,
               'BP3' : float, 'BP4' : float, 'BP5' : float, 'BP6' : float, 'BP7' : float, 'MCAP' : float, 'REVEL' : float, 'regsnp_fpr' : float,
                'dbscSNV-rf_score' : float})

Below I have saved the dataframe for use in other Notebooks:

In [64]:
benign_coding_region_variants_with_nan.to_csv('benign_coding_region_variants.csv')

### Benign Noncoding Region:

Before running the next cell, you must execute the following commands in terminal:
1. pip install dask==1.0.0
2. pip install toolz
3. pip install dask[dataframe]

This gives us the ability to use dask, which we will need to read in our large annotated CADD files for the human derived data

In [None]:
import dask.dataframe as dd

In [1]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import matplotlib.pyplot as plt
pd.options.display.max_rows = 8

Below I have read in the ANNOVAR annotations using a chunking method, which takes a small sample of files at a time and then groups them together, using less memory than if read all at once using pandas:

In [3]:
my_list = []
for chunk in pd.read_csv('human_derived_noncoding_annotated.hg38_multianno.csv', chunksize = 5000, low_memory = False):
    my_list.append(chunk)
human_derived_noncoding_annotated_with_annovar = pd.concat(my_list, axis = 0)
del my_list
human_derived_noncoding_annotated_with_annovar

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,BP4,BP5,BP6,BP7,cosmic70,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,chr10,1000013,1000013,G,A,intronic,GTPBP4,.,.,.,...,.,.,.,.,.,.,.,.,.,.
1,chr10,100020652,100020652,G,A,intergenic,DNMBP;CPN1,dist=10699;dist=21656,.,.,...,.,.,.,.,.,.,.,.,.,.
2,chr10,1000297,1000297,T,G,intronic,GTPBP4,.,.,.,...,.,.,.,.,.,.,.,.,.,.
3,chr10,1000555,1000555,A,T,intronic,GTPBP4,.,.,.,...,.,.,.,.,.,.,.,0.482876712329,B,off
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
966378,chrX,9931817,9931817,T,C,intronic,SHROOM2,.,.,.,...,.,.,.,.,.,.,.,.,.,.
966379,chrX,9931818,9931818,G,A,intronic,SHROOM2,.,.,.,...,.,.,.,.,.,.,.,.,.,.
966380,chrX,9931993,9931993,T,C,intronic,SHROOM2,.,.,.,...,.,.,.,.,.,.,.,0.438356164384,B,off
966381,chrX,9932000,9932000,C,T,intronic,SHROOM2,.,.,.,...,.,.,.,.,.,.,.,0.794520547945,B,off


Below I have cut out 'chr' from the chromosome values and changed the column names of the ANNOVAR annotation dataframe so the two annotation files can be merged:

In [None]:
human_derived_noncoding_annotated_with_annovar['Chr'] = human_derived_noncoding_annotated_with_annovar['Chr'].map(lambda x: x.lstrip('chr').rstrip('aAbBcC'))
human_derived_noncoding_annotated_with_annovar = human_derived_noncoding_annotated_with_annovar.rename(columns = {'Chr': '#Chrom', 'Start': 'Pos'})

Below I have pickled the above dataframe so that I do not have to call the above functions again:

In [None]:
human_derived_noncoding_annotated_with_annovar.to_pickle('human_derived_noncoding_annotated_with_annovar.pkl')

I have already changed the columns of the ANNOVAR annotation dataframe (Start -> Pos, Chr -> #Chrom) and cut out the string 'chr' from the dataframe, like I had to do with all the other dataframes. I then saved the resulting dataframe and have imported it as 'human_derived_noncoding_annotated_with_annovar.pkl' seen below. In order to merge the CADD and ANNOVAR annotations together, I also needed to change the values of the dataframe to strings, seen below:

In [2]:
human_derived_noncoding_annotated_with_annovar = pd.read_pickle('human_derived_noncoding_annotated_with_annovar.pkl')
human_derived_noncoding_annotated_with_annovar = human_derived_noncoding_annotated_with_annovar.astype(str)

Below is the method used to create a single csv file with all the annotations from CADD. I have read the files into a dataframe and then sent them all to the same csv file using the extension "mode = 'a'" which appends the next annotation file to the same output. 

In [None]:
df1 = pd.read_csv('GRCh38-v1.5_anno_0265e2d84faa1b92f64be0ad04982d58_noheader.tsv', sep = '\t')
df1.to_csv('annnotationforhumanderivedcadd.csv', index=False)

df2 = pd.read_csv('GRCh38-v1.5_anno_3c0e5f59d9c7d35f1d1311158fac67f3_noheader.tsv', sep = '\t')
df2.to_csv('annnotationforhumanderivedcadd.csv', mode = 'a', index=False)

df3= pd.read_csv('GRCh38-v1.5_anno_4b0464a325358ba5cace1e505fbda034_noheader.tsv', sep = '\t')
df3.to_csv('annnotationforhumanderivedcadd.csv', mode = 'a', index=False)

df4 = pd.read_csv('GRCh38-v1.5_anno_76ecb0e4f43d81dd38abdf89fd16850f_noheader.tsv', sep = '\t')
df4.to_csv('annnotationforhumanderivedcadd.csv', mode = 'a', index=False)

df5 = pd.read_csv('GRCh38-v1.5_anno_85ec7a65fce6a2cf5c60035e2cdc5a14_noheader.tsv', sep = '\t')
df5.to_csv('annnotationforhumanderivedcadd.csv', mode = 'a', index=False)

df6 = pd.read_csv('GRCh38-v1.5_anno_8f9e07c972f3b44de92c82a49704f297_noheader.tsv', sep = '\t')
df6.to_csv('annnotationforhumanderivedcadd.csv', mode = 'a', index=False)

df7 = pd.read_csv('GRCh38-v1.5_anno_ab4b1db57087c3537ff7101b7cbd4eaf_noheader.tsv', sep = '\t')
df7.to_csv('annnotationforhumanderivedcadd.csv', mode = 'a', index=False)

df8 = pd.read_csv('GRCh38-v1.5_anno_bb2bc990f3ac17f338ddfaeac5a643dd_noheader.tsv', sep = '\t')
df8.to_csv('annnotationforhumanderivedcadd.csv', mode = 'a', index=False)

df9 = pd.read_csv('GRCh38-v1.5_anno_bc3fb6e7bed9d35196a24000cbba11d3_noheader.tsv', sep = '\t')
df9.to_csv('annnotationforhumanderivedcadd.csv', mode = 'a', index=False)

df10 = pd.read_csv('GRCh38-v1.5_anno_caed4a68cd64fff0da36fe81fa3b5e85_noheader.tsv', sep = '\t')
df10.to_csv('annnotationforhumanderivedcadd.csv', mode = 'a', index=False)

df11 = pd.read_csv('GRCh38-v1.5_anno_d334b44948c3cbc6067e6a0146556550_noheader.tsv', sep = '\t')
df11.to_csv('annnotationforhumanderivedcadd.csv', mode = 'a', index=False)

Since we are using very large dataframes, we must use dask dataframe, which we called in above. The below code gives us an output of a new csv file which is the merged dataframe of the two annotation files. The 'converters' part of the code below changes the values of the CADD annotation to strings just as we did for the ANNOVAR values above. The output file is given at the end and is 1.3 GB large.

In [8]:
dd.merge(dd.read_csv("annnotationforhumanderivedcadd.csv", converters={i: str for i in range(250)}), human_derived_noncoding_annotated_with_annovar, how='left', on=['#Chrom', 'Pos', 'Ref','Alt']).compute().to_csv('human_derived_noncoding_merged.csv')

MemoryError: 

Below I have called in the csv file and had to specify the dtype of all the values so that they could be read by dask and dropped the duplicate rows from the dask dataframe:

In [2]:
human_derived_noncoding_merged = dd.read_csv('human_derived_noncoding_merged.csv', dtype={'#Chrom': 'object',
       'CDSpos': 'object',
       'ConsScore': 'object',
       'CpG': 'object',
       'Dist2Mutation': 'object',
       'Domain': 'object',
       'Dst2Splice': 'object',
       'EncodeDNase-max': 'object',
       'EncodeDNase-sum': 'object',
       'EncodeH2AFZ-max': 'object',
       'EncodeH2AFZ-sum': 'object',
       'EncodeH3K27ac-max': 'object',
       'EncodeH3K27ac-sum': 'object',
       'EncodeH3K27me3-max': 'object',
       'EncodeH3K27me3-sum': 'object',
       'EncodeH3K36me3-max': 'object',
       'EncodeH3K36me3-sum': 'object',
       'EncodeH3K4me1-max': 'object',
       'EncodeH3K4me1-sum': 'object',
       'EncodeH3K4me2-max': 'object',
       'EncodeH3K4me2-sum': 'object',
       'EncodeH3K4me3-max': 'object',
       'EncodeH3K4me3-sum': 'object',
       'EncodeH3K79me2-max': 'object',
       'EncodeH3K79me2-sum': 'object',
       'EncodeH3K9ac-max': 'object',
       'EncodeH3K9ac-sum': 'object',
       'EncodeH3K9me3-max': 'object',
       'EncodeH3K9me3-sum': 'object',
       'EncodeH4K20me1-max': 'object',
       'EncodeH4K20me1-sum': 'object',
       'EncodetotalRNA-max': 'object',
       'EncodetotalRNA-sum': 'object',
       'End': 'float64',
       'Exon': 'object',
       'Freq10000bp': 'object',
       'Freq1000bp': 'object',
       'Freq100bp': 'object',
       'GC': 'object',
       'GerpN': 'object',
       'GerpRS': 'object',
       'GerpRSpval': 'object',
       'GerpS': 'object',
       'Grantham': 'object',
       'Length': 'object',
       'PHRED': 'object',
       'PolyPhenCat': 'object',
       'PolyPhenVal': 'object',
       'Pos': 'object',
       'Rare10000bp': 'object',
       'Rare1000bp': 'object',
       'Rare100bp': 'object',
       'RawScore': 'object',
       'RemapOverlapCL': 'object',
       'RemapOverlapTF': 'object',
       'SIFTcat': 'object',
       'SIFTval': 'object',
       'Sngl10000bp': 'object',
       'Sngl1000bp': 'object',
       'Sngl100bp': 'object',
       'bStatistic': 'object',
       'cDNApos': 'object',
       'cHmm_E1': 'object',
       'cHmm_E10': 'object',
       'cHmm_E11': 'object',
       'cHmm_E12': 'object',
       'cHmm_E13': 'object',
       'cHmm_E14': 'object',
       'cHmm_E15': 'object',
       'cHmm_E16': 'object',
       'cHmm_E17': 'object',
       'cHmm_E18': 'object',
       'cHmm_E19': 'object',
       'cHmm_E2': 'object',
       'cHmm_E20': 'object',
       'cHmm_E21': 'object',
       'cHmm_E22': 'object',
       'cHmm_E23': 'object',
       'cHmm_E24': 'object',
       'cHmm_E25': 'object',
       'cHmm_E3': 'object',
       'cHmm_E4': 'object',
       'cHmm_E5': 'object',
       'cHmm_E6': 'object',
       'cHmm_E7': 'object',
       'cHmm_E8': 'object',
       'cHmm_E9': 'object',
       'dbscSNV-ada_score': 'object',
       'dbscSNV-rf_score': 'object',
       'mamPhCons': 'object',
       'mamPhyloP': 'object',
       'minDistTSE': 'object',
       'minDistTSS': 'object',
       'mirSVR-Aln': 'object',
       'mirSVR-E': 'object',
       'mirSVR-Score': 'object',
       'motifDist': 'object',
       'motifECount': 'object',
       'motifEHIPos': 'object',
       'motifEScoreChng': 'object',
       'nAA': 'object',
       'oAA': 'object',
       'priPhCons': 'object',
       'priPhyloP': 'object',
       'protPos': 'object',
       'relCDSpos': 'object',
       'relProtPos': 'object',
       'relcDNApos': 'object',
       'tOverlapMotifs': 'object',
       'targetScan': 'object',
       'verPhCons': 'object',
       'verPhyloP': 'object'})
human_derived_noncoding_merged.drop_duplicates()

Unnamed: 0_level_0,Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,Length,AnnoType,Consequence,ConsScore,ConsDetail,GC,CpG,motifECount,motifEName,motifEHIPos,motifEScoreChng,oAA,nAA,GeneID,FeatureID,GeneName,CCDS,Intron,Exon,cDNApos,relcDNApos,CDSpos,relCDSpos,protPos,relProtPos,Domain,Dst2Splice,Dst2SplType,minDistTSS,minDistTSE,SIFTcat,SIFTval,PolyPhenCat,PolyPhenVal,priPhCons,mamPhCons,verPhCons,priPhyloP,mamPhyloP,verPhyloP,bStatistic,targetScan,mirSVR-Score,mirSVR-E,mirSVR-Aln,cHmm_E1,cHmm_E2,cHmm_E3,cHmm_E4,cHmm_E5,cHmm_E6,cHmm_E7,cHmm_E8,cHmm_E9,cHmm_E10,cHmm_E11,cHmm_E12,cHmm_E13,cHmm_E14,cHmm_E15,cHmm_E16,cHmm_E17,cHmm_E18,cHmm_E19,cHmm_E20,cHmm_E21,cHmm_E22,cHmm_E23,cHmm_E24,cHmm_E25,GerpRS,GerpRSpval,GerpN,GerpS,tOverlapMotifs,motifDist,EncodeH3K4me1-sum,EncodeH3K4me1-max,EncodeH3K4me2-sum,EncodeH3K4me2-max,EncodeH3K4me3-sum,EncodeH3K4me3-max,EncodeH3K9ac-sum,EncodeH3K9ac-max,EncodeH3K9me3-sum,EncodeH3K9me3-max,EncodeH3K27ac-sum,EncodeH3K27ac-max,EncodeH3K27me3-sum,EncodeH3K27me3-max,EncodeH3K36me3-sum,EncodeH3K36me3-max,EncodeH3K79me2-sum,EncodeH3K79me2-max,EncodeH4K20me1-sum,EncodeH4K20me1-max,EncodeH2AFZ-sum,EncodeH2AFZ-max,EncodeDNase-sum,EncodeDNase-max,EncodetotalRNA-sum,EncodetotalRNA-max,Grantham,Dist2Mutation,Freq100bp,Rare100bp,Sngl100bp,Freq1000bp,Rare1000bp,Sngl1000bp,Freq10000bp,Rare10000bp,Sngl10000bp,EnsembleRegulatoryFeature,dbscSNV-ada_score,dbscSNV-rf_score,RemapOverlapTF,RemapOverlapCL,RawScore,PHRED,End,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,Func.knownGene,Gene.knownGene,GeneDetail.knownGene,ExonicFunc.knownGene,AAChange.knownGene,Func.ensGene,Gene.ensGene,GeneDetail.ensGene,ExonicFunc.ensGene,AAChange.ensGene,SIFT_score,SIFT_pred,Polyphen2_HDIV_score,Polyphen2_HDIV_pred,Polyphen2_HVAR_score,Polyphen2_HVAR_pred,LRT_score,LRT_pred,MutationTaster_score,MutationTaster_pred,MutationAssessor_score,MutationAssessor_pred,FATHMM_score,FATHMM_pred,RadialSVM_score,RadialSVM_pred,LR_score,LR_pred,VEST3_score,CADD_raw,CADD_phred,GERP++_RS,phyloP46way_placental,phyloP100way_vertebrate,SiPhy_29way_logOdds,Interpro_domain,SIFT_score.1,SIFT_converted_rankscore,SIFT_pred.1,Polyphen2_HDIV_score.1,Polyphen2_HDIV_rankscore,Polyphen2_HDIV_pred.1,Polyphen2_HVAR_score.1,Polyphen2_HVAR_rankscore,Polyphen2_HVAR_pred.1,LRT_score.1,LRT_converted_rankscore,LRT_pred.1,MutationTaster_score.1,MutationTaster_converted_rankscore,MutationTaster_pred.1,MutationAssessor_score.1,MutationAssessor_score_rankscore,MutationAssessor_pred.1,FATHMM_score.1,FATHMM_converted_rankscore,FATHMM_pred.1,PROVEAN_score,PROVEAN_converted_rankscore,PROVEAN_pred,VEST3_score.1,VEST3_rankscore,MetaSVM_score,MetaSVM_rankscore,MetaSVM_pred,MetaLR_score,MetaLR_rankscore,MetaLR_pred,M-CAP_score,M-CAP_rankscore,M-CAP_pred,CADD_raw.1,CADD_raw_rankscore,CADD_phred.1,DANN_score,DANN_rankscore,fathmm-MKL_coding_score,fathmm-MKL_coding_rankscore,fathmm-MKL_coding_pred,Eigen_coding_or_noncoding,Eigen-raw,Eigen-PC-raw,GenoCanyon_score,GenoCanyon_score_rankscore,integrated_fitCons_score,integrated_fitCons_score_rankscore,integrated_confidence_value,GERP++_RS.1,GERP++_RS_rankscore,phyloP100way_vertebrate.1,phyloP100way_vertebrate_rankscore,phyloP20way_mammalian,phyloP20way_mammalian_rankscore,phastCons100way_vertebrate,phastCons100way_vertebrate_rankscore,phastCons20way_mammalian,phastCons20way_mammalian_rankscore,SiPhy_29way_logOdds.1,SiPhy_29way_logOdds_rankscore,Interpro_domain.1,GTEx_V6_gene,GTEx_V6_tissue,dbscSNV_ADA_SCORE,dbscSNV_RF_SCORE,InterVar_automated,PVS1,PS1,PS2,PS3,PS4,PM1,PM2,PM3,PM4,PM5,PM6,PP1,PP2,PP3,PP4,PP5,BA1,BS1,BS2,BS3,BS4,BP1,BP2,BP3,BP4,BP5,BP6,BP7,cosmic70,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1,Unnamed: 208_level_1,Unnamed: 209_level_1,Unnamed: 210_level_1,Unnamed: 211_level_1,Unnamed: 212_level_1,Unnamed: 213_level_1,Unnamed: 214_level_1,Unnamed: 215_level_1,Unnamed: 216_level_1,Unnamed: 217_level_1,Unnamed: 218_level_1,Unnamed: 219_level_1,Unnamed: 220_level_1,Unnamed: 221_level_1,Unnamed: 222_level_1,Unnamed: 223_level_1,Unnamed: 224_level_1,Unnamed: 225_level_1,Unnamed: 226_level_1,Unnamed: 227_level_1,Unnamed: 228_level_1,Unnamed: 229_level_1,Unnamed: 230_level_1,Unnamed: 231_level_1,Unnamed: 232_level_1,Unnamed: 233_level_1,Unnamed: 234_level_1,Unnamed: 235_level_1,Unnamed: 236_level_1,Unnamed: 237_level_1,Unnamed: 238_level_1,Unnamed: 239_level_1,Unnamed: 240_level_1,Unnamed: 241_level_1,Unnamed: 242_level_1,Unnamed: 243_level_1,Unnamed: 244_level_1,Unnamed: 245_level_1,Unnamed: 246_level_1,Unnamed: 247_level_1,Unnamed: 248_level_1,Unnamed: 249_level_1,Unnamed: 250_level_1,Unnamed: 251_level_1,Unnamed: 252_level_1,Unnamed: 253_level_1,Unnamed: 254_level_1,Unnamed: 255_level_1,Unnamed: 256_level_1,Unnamed: 257_level_1,Unnamed: 258_level_1,Unnamed: 259_level_1,Unnamed: 260_level_1,Unnamed: 261_level_1,Unnamed: 262_level_1,Unnamed: 263_level_1,Unnamed: 264_level_1,Unnamed: 265_level_1,Unnamed: 266_level_1,Unnamed: 267_level_1,Unnamed: 268_level_1,Unnamed: 269_level_1,Unnamed: 270_level_1,Unnamed: 271_level_1
,int64,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,float64,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


##### ClinVar

Below I have called in the clinvar annotation files for CADD and ANNOVAR:

In [3]:
clinvar_benign_noncoding_annotated_with_annovar = pd.read_csv('clinvar_benign_noncoding.hg38_multianno.csv')
clinvar_benign_noncoding_annotated_with_annovar

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,BP4,BP5,BP6,BP7,cosmic70,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,chr10,100989312,100989312,G,A,exonic,TWNK,.,nonsynonymous SNV,"TWNK:NM_001163812:exon1:c.G1102A:p.V368I,TWNK:...",...,0,0,1,0,.,.,0.353,.,.,.
1,chr10,100990864,100990864,C,T,intronic,TWNK,.,.,.,...,.,.,.,.,.,.,.,0.624215246637,B,on
2,chr10,100990866,100990866,T,C,intronic,TWNK,.,.,.,...,.,.,.,.,.,.,.,0.0717488789238,PD,on
3,chr10,100991026,100991026,C,A,UTR3,TWNK,NM_001163812:c.*1C>A;NM_001163814:c.*1C>A,.,.,...,.,.,.,.,.,.,.,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1556,chrX,71132768,71132792,CTCTTCTCTTCTCTTCTCTTCTCTT,-,intronic,MED12,.,.,.,...,.,.,.,.,.,.,.,.,.,.
1557,chrX,71132768,71132772,CTCTT,-,intronic,MED12,.,.,.,...,.,.,.,.,.,.,.,.,.,.
1558,chrX,71132768,71132797,CTCTTCTCTTCTCTTCTCTTCTCTTCTCTT,-,intronic,MED12,.,.,.,...,.,.,.,.,.,.,.,.,.,.
1559,chrX,78118027,78118027,C,T,intronic,PGK1,.,.,.,...,.,.,.,.,.,.,.,0.61301369863,B,off


In [4]:
clinvar_benign_noncoding_annotated_with_cadd = pd.read_table('clinvar_noncoding_benign_cadd_annotations_noheader.tsv')
clinvar_benign_noncoding_annotated_with_cadd

  """Entry point for launching an IPython kernel.


Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,Length,AnnoType,Consequence,ConsScore,ConsDetail,...,Freq10000bp,Rare10000bp,Sngl10000bp,EnsembleRegulatoryFeature,dbscSNV-ada_score,dbscSNV-rf_score,RemapOverlapTF,RemapOverlapCL,RawScore,PHRED
0,1,1041950,T,C,SNV,0,Transcript,SPLICE_SITE,5,"splice,intron",...,23,100,1931,,0.00001,0.002,10.0,13.0,0.092155,2.700
1,1,1042190,G,A,SNV,0,Transcript,INTRONIC,2,intron,...,25,109,1955,,,,12.0,14.0,-0.401276,0.118
2,1,1043223,CCT,C,DEL,2,Transcript,INTRONIC,2,intron,...,24,117,2004,,,,8.0,9.0,-0.012669,1.640
3,1,1045707,A,G,SNV,0,Transcript,INTRONIC,2,intron,...,22,131,2126,,,,9.0,10.0,-0.057159,1.287
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3200,X,154419685,C,T,SNV,0,Transcript,INTRONIC,2,intron,...,11,29,820,,,,4.0,4.0,-0.033909,1.464
3201,X,154420108,C,T,SNV,0,Intergenic,DOWNSTREAM,1,downstream,...,12,31,826,,,,36.0,47.0,-0.549155,0.036
3202,X,154420108,C,T,SNV,0,Transcript,INTRONIC,2,intron,...,12,31,826,,,,36.0,47.0,-0.549155,0.036
3203,X,154961190,A,G,SNV,0,Transcript,INTRONIC,2,intron,...,7,45,823,,,,1.0,1.0,0.351934,5.879


Just as with the other regions, I had to manipulate the ANNOVAR data to look like the CADD data in order to merge it:

In [5]:
clinvar_benign_noncoding_annotated_with_annovar['Chr'] = clinvar_benign_noncoding_annotated_with_annovar['Chr'].map(lambda x: x.lstrip('chr').rstrip('aAbBcC'))
clinvar_benign_noncoding_annotated_with_annovar = clinvar_benign_noncoding_annotated_with_annovar.rename(columns = {'Chr': '#Chrom', 'Start': 'Pos'})
clinvar_benign_noncoding_merged = pd.merge(clinvar_benign_noncoding_annotated_with_cadd, clinvar_benign_noncoding_annotated_with_annovar,
                                         how = 'left', on = ['#Chrom', 'Pos', 'Ref', 'Alt'])
clinvar_benign_noncoding_merged

Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,Length,AnnoType,Consequence,ConsScore,ConsDetail,...,BP4,BP5,BP6,BP7,cosmic70,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,1,1041950,T,C,SNV,0,Transcript,SPLICE_SITE,5,"splice,intron",...,.,.,.,.,.,.,.,0.794618834081,B,on
1,1,1042190,G,A,SNV,0,Transcript,INTRONIC,2,intron,...,.,.,.,.,.,.,.,0.0445205479452,D,off
2,1,1043223,CCT,C,DEL,2,Transcript,INTRONIC,2,intron,...,,,,,,,,,,
3,1,1045707,A,G,SNV,0,Transcript,INTRONIC,2,intron,...,.,.,.,.,.,.,.,0.441780821918,B,off
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3200,X,154419685,C,T,SNV,0,Transcript,INTRONIC,2,intron,...,.,.,.,.,.,.,.,.,.,.
3201,X,154420108,C,T,SNV,0,Intergenic,DOWNSTREAM,1,downstream,...,.,.,.,.,.,.,.,.,.,.
3202,X,154420108,C,T,SNV,0,Transcript,INTRONIC,2,intron,...,.,.,.,.,.,.,.,.,.,.
3203,X,154961190,A,G,SNV,0,Transcript,INTRONIC,2,intron,...,.,.,.,.,.,.,.,0.825342465753,B,off


Below I have dropped duplicates:

In [6]:
clinvar_benign_noncoding_merged = clinvar_benign_noncoding_merged.drop_duplicates(keep = 'first')
clinvar_benign_noncoding_merged

Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,Length,AnnoType,Consequence,ConsScore,ConsDetail,...,BP4,BP5,BP6,BP7,cosmic70,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,1,1041950,T,C,SNV,0,Transcript,SPLICE_SITE,5,"splice,intron",...,.,.,.,.,.,.,.,0.794618834081,B,on
1,1,1042190,G,A,SNV,0,Transcript,INTRONIC,2,intron,...,.,.,.,.,.,.,.,0.0445205479452,D,off
2,1,1043223,CCT,C,DEL,2,Transcript,INTRONIC,2,intron,...,,,,,,,,,,
3,1,1045707,A,G,SNV,0,Transcript,INTRONIC,2,intron,...,.,.,.,.,.,.,.,0.441780821918,B,off
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3200,X,154419685,C,T,SNV,0,Transcript,INTRONIC,2,intron,...,.,.,.,.,.,.,.,.,.,.
3201,X,154420108,C,T,SNV,0,Intergenic,DOWNSTREAM,1,downstream,...,.,.,.,.,.,.,.,.,.,.
3202,X,154420108,C,T,SNV,0,Transcript,INTRONIC,2,intron,...,.,.,.,.,.,.,.,.,.,.
3203,X,154961190,A,G,SNV,0,Transcript,INTRONIC,2,intron,...,.,.,.,.,.,.,.,0.825342465753,B,off


Below is the final merged dataframe for the benign noncoding region in which the output is given as a csv. It contains both the human derived and clinvar annotated variants and does not contain the columns which are unneccessary for our model:

In [7]:
human_derived_noncoding_merged.compute().append(clinvar_benign_noncoding_merged).drop(columns = ['PolyPhenVal', 'priPhCons', 'cHmm_E1', 'cHmm_E2', 'cHmm_E3', 'cHmm_E4', 'cHmm_E5', 'cHmm_E6', 'cHmm_E7', 'cHmm_E8', 'cHmm_E9', 'cHmm_E10', 'cHmm_E11', 'cHmm_E12', 'cHmm_E13', 'cHmm_E14', 'cHmm_E15', 'cHmm_E16', 'cHmm_E17', 'cHmm_E18', 'cHmm_E19', 'cHmm_E20', 'cHmm_E21', 'cHmm_E22', 'cHmm_E23', 'cHmm_E24', 'cHmm_E25', 'GerpN', 'GerpS']).to_csv('final_benign_noncoding_dataframe.csv')

  args2 = [_execute_task(a, cache) for a in args]
  args2 = [_execute_task(a, cache) for a in args]
  args2 = [_execute_task(a, cache) for a in args]
  args2 = [_execute_task(a, cache) for a in args]
  args2 = [_execute_task(a, cache) for a in args]
  args2 = [_execute_task(a, cache) for a in args]
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Below is the code for the final dataframe created in the cell above. We must change all the columns' dtypes to object in order for dask dataframe to read them correctly:

In [9]:
human_derived_final_dataframe = dd.read_csv('final_benign_noncoding_dataframe.csv', dtype={'#Chrom': 'object',
       'ConsScore': 'object',
       'CpG': 'object',
       'Dist2Mutation': 'object',
       'Domain': 'object',
       'Dst2Splice': 'object',
       'EncodeDNase-max': 'object',
       'EncodeDNase-sum': 'object',
       'EncodeH2AFZ-max': 'object',
       'EncodeH2AFZ-sum': 'object',
       'EncodeH3K27ac-max': 'object',
       'EncodeH3K27ac-sum': 'object',
       'EncodeH3K27me3-max': 'object',
       'EncodeH3K27me3-sum': 'object',
       'EncodeH3K36me3-max': 'object',
       'EncodeH3K36me3-sum': 'object',
       'EncodeH3K4me1-max': 'object',
       'EncodeH3K4me1-sum': 'object',
       'EncodeH3K4me2-max': 'object',
       'EncodeH3K4me2-sum': 'object',
       'EncodeH3K4me3-max': 'object',
       'EncodeH3K4me3-sum': 'object',
       'EncodeH3K79me2-max': 'object',
       'EncodeH3K79me2-sum': 'object',
       'EncodeH3K9ac-max': 'object',
       'EncodeH3K9ac-sum': 'object',
       'EncodeH3K9me3-max': 'object',
       'EncodeH3K9me3-sum': 'object',
       'EncodeH4K20me1-max': 'object',
       'EncodeH4K20me1-sum': 'object',
       'EncodetotalRNA-max': 'object',
       'EncodetotalRNA-sum': 'object',
       'Exon': 'object',
       'Freq10000bp': 'object',
       'Freq1000bp': 'object',
       'Freq100bp': 'object',
       'GC': 'object',
       'GerpRS': 'object',
       'GerpRSpval': 'object',
       'Grantham': 'object',
       'Length': 'object',
       'PHRED': 'object',
       'PolyPhenCat': 'object',
       'Pos': 'object',
       'Rare10000bp': 'object',
       'Rare1000bp': 'object',
       'Rare100bp': 'object',
       'RawScore': 'object',
       'RemapOverlapCL': 'object',
       'RemapOverlapTF': 'object',
       'SIFTcat': 'object',
       'SIFTval': 'object',
       'Sngl10000bp': 'object',
       'Sngl1000bp': 'object',
       'Sngl100bp': 'object',
       'bStatistic': 'object',
       'cDNApos': 'object',
       'dbscSNV-ada_score': 'object',
       'dbscSNV-rf_score': 'object',
       'mamPhCons': 'object',
       'mamPhyloP': 'object',
       'minDistTSE': 'object',
       'minDistTSS': 'object',
       'mirSVR-Aln': 'object',
       'mirSVR-E': 'object',
       'mirSVR-Score': 'object',
       'motifDist': 'object',
       'motifECount': 'object',
       'motifEHIPos': 'object',
       'motifEScoreChng': 'object',
       'nAA': 'object',
       'oAA': 'object',
       'priPhyloP': 'object',
       'protPos': 'object',
       'relCDSpos': 'object',
       'relProtPos': 'object',
       'relcDNApos': 'object',
       'tOverlapMotifs': 'object',
       'targetScan': 'object',
       'verPhCons': 'object',
       'verPhyloP': 'object',
        'CDSpos': 'object'}).compute()

  args2 = [_execute_task(a, cache) for a in args]
  args2 = [_execute_task(a, cache) for a in args]
  args2 = [_execute_task(a, cache) for a in args]
  args2 = [_execute_task(a, cache) for a in args]


We notice that there are two more columns which we do not want and therefore we drop these columns from the dataframe and once again save the dataframe as the same csv:

In [10]:
human_derived_final_dataframe = human_derived_final_dataframe.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis = 1)

For our model, we want to have NaN values in place of the string '.' in the dataframe. The reason for this is that it is very easy to change NaN values and manipulate them to our will in Python. In order to do so, we must break the dataframe into multiple sections and replace values this way. Otherwise we would receive a Memory Error. We do this with the following code:

In [None]:
benign_noncoding_final_dataframe_1 = benign_noncoding_final_dataframe[:600000]
benign_noncoding_final_dataframe_1_with_nan = benign_noncoding_final_dataframe_1.replace('.', np.nan)
benign_noncoding_final_dataframe_2 = benign_noncoding_final_dataframe[600000:1100000]
benign_noncoding_final_dataframe_2_with_nan = benign_noncoding_final_dataframe_2.replace('.', np.nan)
benign_noncoding_final_dataframe_3 = benign_noncoding_final_dataframe[1100000:1538959]
benign_noncoding_final_dataframe_3_with_nan = benign_noncoding_final_dataframe_3.replace('.', np.nan)
benign_noncoding_final_dataframe_1_with_nan.to_csv('benignnoncodingfinaldataframe_withNaN.csv', index=False)
benign_noncoding_final_dataframe_2_with_nan.to_csv('benignnoncodingfinaldataframe_withNaN.csv', mode = 'a', index=False)
benign_noncoding_final_dataframe_3_with_nan.to_csv('benignnoncodingfinaldataframe_withNaN.csv', mode = 'a', index=False)
del benign_noncoding_final_dataframe_1_with_nan, benign_noncoding_final_dataframe_2_with_nan, benign_noncoding_final_dataframe_3_with_nan

Another thing that our method to create the resulting dataframe has unfortunately given us is extra rows in the dataframe that contain the names of the columns. This is because when we append dataframes using dask, a row is created for the header of the dataframes being added on. We need to drop these rows. To do this, we specify that only the duplicates with all values in common will be dropped, shown below:

In [None]:
benign_noncoding_final_dataframe_with_nan = benign_noncoding_final_dataframe_with_nan.drop_duplicates(subset = ['#Chrom', 'AAChange.ensGene', 'AAChange.knownGene', 'AAChange.refGene',
 'Alt',
 'AnnoType',
 'BA1',
 'BP1',
 'BP2',
 'BP3',
 'BP4',
 'BP5',
 'BP6',
 'BP7',
 'BS1',
 'BS2',
 'BS3',
 'BS4',
 'CADD_phred',
 'CADD_phred.1',
 'CADD_raw',
 'CADD_raw.1',
 'CADD_raw_rankscore',
 'CCDS',
 'CDSpos',
 'ConsDetail',
 'ConsScore',
 'Consequence',
 'CpG',
 'DANN_rankscore',
 'DANN_score',
 'Dist2Mutation',
 'Domain',
 'Dst2SplType',
 'Dst2Splice',
 'Eigen-PC-raw',
 'Eigen-raw',
 'Eigen_coding_or_noncoding',
 'EncodeDNase-max',
 'EncodeDNase-sum',
 'EncodeH2AFZ-max',
 'EncodeH2AFZ-sum',
 'EncodeH3K27ac-max',
 'EncodeH3K27ac-sum',
 'EncodeH3K27me3-max',
 'EncodeH3K27me3-sum',
 'EncodeH3K36me3-max',
 'EncodeH3K36me3-sum',
 'EncodeH3K4me1-max',
 'EncodeH3K4me1-sum',
 'EncodeH3K4me2-max',
 'EncodeH3K4me2-sum',
 'EncodeH3K4me3-max',
 'EncodeH3K4me3-sum',
 'EncodeH3K79me2-max',
 'EncodeH3K79me2-sum',
 'EncodeH3K9ac-max',
 'EncodeH3K9ac-sum',
 'EncodeH3K9me3-max',
 'EncodeH3K9me3-sum',
 'EncodeH4K20me1-max',
 'EncodeH4K20me1-sum',
 'EncodetotalRNA-max',
 'EncodetotalRNA-sum',
 'End',
 'EnsembleRegulatoryFeature',
 'Exon',
 'ExonicFunc.ensGene',
 'ExonicFunc.knownGene',
 'ExonicFunc.refGene',
 'FATHMM_converted_rankscore',
 'FATHMM_pred',
 'FATHMM_pred.1',
 'FATHMM_score',
 'FATHMM_score.1',
 'FeatureID',
 'Freq10000bp',
 'Freq1000bp',
 'Freq100bp',
 'Func.ensGene',
 'Func.knownGene',
 'Func.refGene',
 'GC',
 'GERP++_RS',
 'GERP++_RS.1',
 'GERP++_RS_rankscore',
 'GTEx_V6_gene',
 'GTEx_V6_tissue',
 'Gene.ensGene',
 'Gene.knownGene',
 'Gene.refGene',
 'GeneDetail.ensGene',
 'GeneDetail.knownGene',
 'GeneDetail.refGene',
 'GeneID',
 'GeneName',
 'GenoCanyon_score',
 'GenoCanyon_score_rankscore',
 'GerpRS',
 'GerpRSpval',
 'Grantham',
 'InterVar_automated',
 'Interpro_domain',
 'Interpro_domain.1',
 'Intron',
 'LRT_converted_rankscore',
 'LRT_pred',
 'LRT_pred.1',
 'LRT_score',
 'LRT_score.1',
 'LR_pred',
 'LR_score',
 'Length',
 'M-CAP_pred',
 'M-CAP_rankscore',
 'M-CAP_score',
 'MCAP',
 'MetaLR_pred',
 'MetaLR_rankscore',
 'MetaLR_score',
 'MetaSVM_pred',
 'MetaSVM_rankscore',
 'MetaSVM_score',
 'MutationAssessor_pred',
 'MutationAssessor_pred.1',
 'MutationAssessor_score',
 'MutationAssessor_score.1',
 'MutationAssessor_score_rankscore',
 'MutationTaster_converted_rankscore',
 'MutationTaster_pred',
 'MutationTaster_pred.1',
 'MutationTaster_score',
 'MutationTaster_score.1',
 'PHRED',
 'PM1',
 'PM2',
 'PM3',
 'PM4',
 'PM5',
 'PM6',
 'PP1',
 'PP2',
 'PP3',
 'PP4',
 'PP5',
 'PROVEAN_converted_rankscore',
 'PROVEAN_pred',
 'PROVEAN_score',
 'PS1',
 'PS2',
 'PS3',
 'PS4',
 'PVS1',
 'PolyPhenCat',
 'Polyphen2_HDIV_pred',
 'Polyphen2_HDIV_pred.1',
 'Polyphen2_HDIV_rankscore',
 'Polyphen2_HDIV_score',
 'Polyphen2_HDIV_score.1',
 'Polyphen2_HVAR_pred',
 'Polyphen2_HVAR_pred.1',
 'Polyphen2_HVAR_rankscore',
 'Polyphen2_HVAR_score',
 'Polyphen2_HVAR_score.1',
 'Pos',
 'REVEL',
 'RadialSVM_pred',
 'RadialSVM_score',
 'Rare10000bp',
 'Rare1000bp',
 'Rare100bp',
 'RawScore',
 'Ref',
 'RemapOverlapCL',
 'RemapOverlapTF',
 'SIFT_converted_rankscore',
 'SIFT_pred',
 'SIFT_pred.1',
 'SIFT_score',
 'SIFT_score.1',
 'SIFTcat',
 'SIFTval',
 'SiPhy_29way_logOdds',
 'SiPhy_29way_logOdds.1',
 'SiPhy_29way_logOdds_rankscore',
 'Sngl10000bp',
 'Sngl1000bp',
 'Sngl100bp',
 'Type',
 'VEST3_rankscore',
 'VEST3_score',
 'VEST3_score.1',
 'bStatistic',
 'cDNApos',
 'cosmic70',
 'dbscSNV-ada_score',
 'dbscSNV-rf_score',
 'dbscSNV_ADA_SCORE',
 'dbscSNV_RF_SCORE',
 'fathmm-MKL_coding_pred',
 'fathmm-MKL_coding_rankscore',
 'fathmm-MKL_coding_score',
 'integrated_confidence_value',
 'integrated_fitCons_score',
 'integrated_fitCons_score_rankscore',
 'mamPhCons',
 'mamPhyloP',
 'minDistTSE',
 'minDistTSS',
 'mirSVR-Aln',
 'mirSVR-E',
 'mirSVR-Score',
 'motifDist',
 'motifECount',
 'motifEHIPos',
 'motifEName',
 'motifEScoreChng',
 'nAA',
 'oAA',
 'phastCons100way_vertebrate',
 'phastCons100way_vertebrate_rankscore',
 'phastCons20way_mammalian',
 'phastCons20way_mammalian_rankscore',
 'phyloP100way_vertebrate',
 'phyloP100way_vertebrate.1',
 'phyloP100way_vertebrate_rankscore',
 'phyloP20way_mammalian',
 'phyloP20way_mammalian_rankscore',
 'phyloP46way_placental',
 'priPhyloP',
 'protPos',
 'regsnp_disease',
 'regsnp_fpr',
 'regsnp_splicing_site',
 'relCDSpos',
 'relProtPos',
 'relcDNApos',
 'tOverlapMotifs',
 'targetScan',
 'verPhCons',
 'verPhyloP'] , keep = False)

Now we must change the dtypes of the numerical values for use in the future from string to float. I have created the following code using the Python tool to_numeric in order to change the data types to floats for columns with numerical values:

In [None]:
benign_noncoding_final_dataframe_with_nan["verPhyloP"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["verPhyloP"])
benign_noncoding_final_dataframe_with_nan["verPhCons"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["verPhCons"])
benign_noncoding_final_dataframe_with_nan["targetScan"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["targetScan"])
benign_noncoding_final_dataframe_with_nan["tOverlapMotifs"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["tOverlapMotifs"])
benign_noncoding_final_dataframe_with_nan["relcDNApos"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["relcDNApos"])
benign_noncoding_final_dataframe_with_nan["relProtPos"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["relProtPos"])
benign_noncoding_final_dataframe_with_nan["relCDSpos"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["relCDSpos"])
benign_noncoding_final_dataframe_with_nan["regsnp_fpr"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["regsnp_fpr"])
benign_noncoding_final_dataframe_with_nan["protPos"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["protPos"])
benign_noncoding_final_dataframe_with_nan["priPhyloP"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["priPhyloP"])
benign_noncoding_final_dataframe_with_nan["phyloP46way_placental"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["phyloP46way_placental"])
benign_noncoding_final_dataframe_with_nan["phyloP20way_mammalian_rankscore"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["phyloP20way_mammalian_rankscore"])
benign_noncoding_final_dataframe_with_nan["phyloP20way_mammalian"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["phyloP20way_mammalian"])
benign_noncoding_final_dataframe_with_nan["phyloP100way_vertebrate_rankscore"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["phyloP100way_vertebrate_rankscore"])
benign_noncoding_final_dataframe_with_nan["phyloP100way_vertebrate.1"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["phyloP100way_vertebrate.1"])
benign_noncoding_final_dataframe_with_nan["phyloP100way_vertebrate"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["phyloP100way_vertebrate"])
benign_noncoding_final_dataframe_with_nan["phastCons20way_mammalian_rankscore"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["phastCons20way_mammalian_rankscore"])
benign_noncoding_final_dataframe_with_nan["phastCons20way_mammalian"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["phastCons20way_mammalian"])
benign_noncoding_final_dataframe_with_nan["phastCons100way_vertebrate_rankscore"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["phastCons100way_vertebrate_rankscore"])
benign_noncoding_final_dataframe_with_nan["phastCons100way_vertebrate"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["phastCons100way_vertebrate"])
benign_noncoding_final_dataframe_with_nan["motifEScoreChng"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["motifEScoreChng"])
benign_noncoding_final_dataframe_with_nan["motifDist"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["motifDist"])
benign_noncoding_final_dataframe_with_nan["mirSVR-Score"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["mirSVR-Score"])
benign_noncoding_final_dataframe_with_nan["mirSVR-E"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["mirSVR-E"])
benign_noncoding_final_dataframe_with_nan["mirSVR-Aln"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["mirSVR-Aln"])
benign_noncoding_final_dataframe_with_nan["minDistTSS"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["minDistTSS"])
benign_noncoding_final_dataframe_with_nan["minDistTSE"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["minDistTSE"])
benign_noncoding_final_dataframe_with_nan["mamPhyloP"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["mamPhyloP"])
benign_noncoding_final_dataframe_with_nan["mamPhCons"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["mamPhCons"])
benign_noncoding_final_dataframe_with_nan["integrated_fitCons_score"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["integrated_fitCons_score"])
benign_noncoding_final_dataframe_with_nan["integrated_confidence_value"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["integrated_confidence_value"])
benign_noncoding_final_dataframe_with_nan["fathmm-MKL_coding_score"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["fathmm-MKL_coding_score"])
benign_noncoding_final_dataframe_with_nan["fathmm-MKL_coding_rankscore"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["fathmm-MKL_coding_rankscore"])
benign_noncoding_final_dataframe_with_nan["dbscSNV_RF_SCORE"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["dbscSNV_RF_SCORE"])
benign_noncoding_final_dataframe_with_nan["dbscSNV_ADA_SCORE"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["dbscSNV_ADA_SCORE"])
benign_noncoding_final_dataframe_with_nan["dbscSNV-rf_score"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["dbscSNV-rf_score"])
benign_noncoding_final_dataframe_with_nan["dbscSNV-ada_score"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["dbscSNV-ada_score"])
benign_noncoding_final_dataframe_with_nan["cDNApos"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["cDNApos"])
benign_noncoding_final_dataframe_with_nan["bStatistic"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["bStatistic"])
benign_noncoding_final_dataframe_with_nan["VEST3_score.1"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["VEST3_score.1"])
benign_noncoding_final_dataframe_with_nan["VEST3_score"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["VEST3_score"])
benign_noncoding_final_dataframe_with_nan["VEST3_rankscore"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["VEST3_rankscore"])
benign_noncoding_final_dataframe_with_nan["Sngl100bp"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["Sngl100bp"])
benign_noncoding_final_dataframe_with_nan["Sngl1000bp"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["Sngl1000bp"])
benign_noncoding_final_dataframe_with_nan["Sngl10000bp"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["Sngl10000bp"])
benign_noncoding_final_dataframe_with_nan["SiPhy_29way_logOdds_rankscore"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["SiPhy_29way_logOdds_rankscore"])
benign_noncoding_final_dataframe_with_nan["SiPhy_29way_logOdds.1"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["SiPhy_29way_logOdds.1"])
benign_noncoding_final_dataframe_with_nan["SiPhy_29way_logOdds"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["SiPhy_29way_logOdds"])
benign_noncoding_final_dataframe_with_nan["SIFTval"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["SIFTval"])
benign_noncoding_final_dataframe_with_nan["SIFT_score.1"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["SIFT_score.1"])
benign_noncoding_final_dataframe_with_nan["SIFT_score"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["SIFT_score"])
benign_noncoding_final_dataframe_with_nan["SIFT_converted_rankscore"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["SIFT_converted_rankscore"])
benign_noncoding_final_dataframe_with_nan["RemapOverlapTF"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["RemapOverlapTF"])
benign_noncoding_final_dataframe_with_nan["RemapOverlapCL"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["RemapOverlapCL"])
benign_noncoding_final_dataframe_with_nan["RawScore"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["RawScore"])
benign_noncoding_final_dataframe_with_nan["Rare100bp"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["Rare100bp"])
benign_noncoding_final_dataframe_with_nan["Rare1000bp"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["Rare1000bp"])
benign_noncoding_final_dataframe_with_nan["Rare10000bp"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["Rare10000bp"])
benign_noncoding_final_dataframe_with_nan["RadialSVM_score"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["RadialSVM_score"])
benign_noncoding_final_dataframe_with_nan["REVEL"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["REVEL"])
benign_noncoding_final_dataframe_with_nan["Polyphen2_HVAR_score.1"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["Polyphen2_HVAR_score.1"])
benign_noncoding_final_dataframe_with_nan["Polyphen2_HVAR_rankscore"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["Polyphen2_HVAR_rankscore"])
benign_noncoding_final_dataframe_with_nan["Polyphen2_HVAR_score"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["Polyphen2_HVAR_score"])
benign_noncoding_final_dataframe_with_nan["Polyphen2_HDIV_score.1"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["Polyphen2_HDIV_score.1"])
benign_noncoding_final_dataframe_with_nan["Polyphen2_HDIV_score"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["Polyphen2_HDIV_score"])
benign_noncoding_final_dataframe_with_nan["Polyphen2_HDIV_rankscore"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["Polyphen2_HDIV_rankscore"])
benign_noncoding_final_dataframe_with_nan["PVS1"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["PVS1"])
benign_noncoding_final_dataframe_with_nan["PS4"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["PS4"])
benign_noncoding_final_dataframe_with_nan["PS3"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["PS3"])
benign_noncoding_final_dataframe_with_nan["PS2"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["PS2"])
benign_noncoding_final_dataframe_with_nan["PS1"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["PS1"])
benign_noncoding_final_dataframe_with_nan["PROVEAN_score"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["PROVEAN_score"])
benign_noncoding_final_dataframe_with_nan["PROVEAN_converted_rankscore"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["PROVEAN_converted_rankscore"])
benign_noncoding_final_dataframe_with_nan["PP5"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["PP5"])
benign_noncoding_final_dataframe_with_nan["PP4"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["PP4"])
benign_noncoding_final_dataframe_with_nan["PP3"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["PP3"])
benign_noncoding_final_dataframe_with_nan["PP2"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["PP2"])
benign_noncoding_final_dataframe_with_nan["PP1"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["PP1"])
benign_noncoding_final_dataframe_with_nan["PM6"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["PM6"])
benign_noncoding_final_dataframe_with_nan["PM5"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["PM5"])
benign_noncoding_final_dataframe_with_nan["PM4"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["PM4"])
benign_noncoding_final_dataframe_with_nan["PM3"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["PM3"])
benign_noncoding_final_dataframe_with_nan["PM2"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["PM2"])
benign_noncoding_final_dataframe_with_nan["PM1"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["PM1"])
benign_noncoding_final_dataframe_with_nan["PHRED"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["PHRED"])
benign_noncoding_final_dataframe_with_nan["MutationTaster_score.1"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["MutationTaster_score.1"])
benign_noncoding_final_dataframe_with_nan["MutationTaster_score"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["MutationTaster_score"])
benign_noncoding_final_dataframe_with_nan["MutationTaster_converted_rankscore"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["MutationTaster_converted_rankscore"])
benign_noncoding_final_dataframe_with_nan["MutationAssessor_score_rankscore"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["MutationAssessor_score_rankscore"])
benign_noncoding_final_dataframe_with_nan["MutationAssessor_score.1"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["MutationAssessor_score.1"])
benign_noncoding_final_dataframe_with_nan["MutationAssessor_score"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["MutationAssessor_score"])
benign_noncoding_final_dataframe_with_nan["MetaSVM_score"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["MetaSVM_score"])
benign_noncoding_final_dataframe_with_nan["MetaSVM_rankscore"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["MetaSVM_rankscore"])
benign_noncoding_final_dataframe_with_nan["MetaLR_score"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["MetaLR_score"])
benign_noncoding_final_dataframe_with_nan["MetaLR_rankscore"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["MetaLR_rankscore"])
benign_noncoding_final_dataframe_with_nan["MCAP"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["MCAP"])
benign_noncoding_final_dataframe_with_nan["M-CAP_score"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["M-CAP_score"])
benign_noncoding_final_dataframe_with_nan["M-CAP_rankscore"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["M-CAP_rankscore"])
benign_noncoding_final_dataframe_with_nan["Length"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["Length"])
benign_noncoding_final_dataframe_with_nan["LR_score"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["LR_score"])
benign_noncoding_final_dataframe_with_nan["LRT_score.1"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["LRT_score.1"])
benign_noncoding_final_dataframe_with_nan["LRT_score"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["LRT_score"])
benign_noncoding_final_dataframe_with_nan["LRT_converted_rankscore"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["LRT_converted_rankscore"])
benign_noncoding_final_dataframe_with_nan["Grantham"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["Grantham"])
benign_noncoding_final_dataframe_with_nan["GerpRSpval"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["GerpRSpval"])
benign_noncoding_final_dataframe_with_nan["GerpRS"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["GerpRS"])
benign_noncoding_final_dataframe_with_nan["GenoCanyon_score_rankscore"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["GenoCanyon_score_rankscore"])
benign_noncoding_final_dataframe_with_nan["GenoCanyon_score"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["GenoCanyon_score"])
benign_noncoding_final_dataframe_with_nan["GERP++_RS_rankscore"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["GERP++_RS_rankscore"])
benign_noncoding_final_dataframe_with_nan["GERP++_RS.1"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["GERP++_RS.1"])
benign_noncoding_final_dataframe_with_nan["GERP++_RS"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["GERP++_RS"])
benign_noncoding_final_dataframe_with_nan["GC"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["GC"])
benign_noncoding_final_dataframe_with_nan["Freq100bp"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["Freq100bp"])
benign_noncoding_final_dataframe_with_nan["Freq1000bp"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["Freq1000bp"])
benign_noncoding_final_dataframe_with_nan["Freq10000bp"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["Freq10000bp"])
benign_noncoding_final_dataframe_with_nan["FATHMM_score.1"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["FATHMM_score.1"])
benign_noncoding_final_dataframe_with_nan["FATHMM_score"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["FATHMM_score"])
benign_noncoding_final_dataframe_with_nan["FATHMM_converted_rankscore"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["FATHMM_converted_rankscore"])
benign_noncoding_final_dataframe_with_nan["End"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["End"])
benign_noncoding_final_dataframe_with_nan["EncodetotalRNA-sum"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["EncodetotalRNA-sum"])
benign_noncoding_final_dataframe_with_nan["EncodetotalRNA-max"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["EncodetotalRNA-max"])
benign_noncoding_final_dataframe_with_nan["EncodeH4K20me1-sum"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["EncodeH4K20me1-sum"])
benign_noncoding_final_dataframe_with_nan["EncodeH4K20me1-max"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["EncodeH4K20me1-max"])
benign_noncoding_final_dataframe_with_nan["EncodeH3K9me3-sum"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["EncodeH3K9me3-sum"])
benign_noncoding_final_dataframe_with_nan["EncodeH3K9me3-max"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["EncodeH3K9me3-max"])
benign_noncoding_final_dataframe_with_nan["EncodeH3K9ac-sum"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["EncodeH3K9ac-sum"])
benign_noncoding_final_dataframe_with_nan["EncodeH3K9ac-max"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["EncodeH3K9ac-max"])
benign_noncoding_final_dataframe_with_nan["EncodeH3K79me2-sum"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["EncodeH3K79me2-sum"])
benign_noncoding_final_dataframe_with_nan["EncodeH3K79me2-max"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["EncodeH3K79me2-max"])
benign_noncoding_final_dataframe_with_nan["EncodeH3K4me3-sum"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["EncodeH3K4me3-sum"])
benign_noncoding_final_dataframe_with_nan["EncodeH3K4me3-max"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["EncodeH3K4me3-max"])
benign_noncoding_final_dataframe_with_nan["EncodeH3K4me2-sum"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["EncodeH3K4me2-sum"])
benign_noncoding_final_dataframe_with_nan["EncodeH3K4me2-max"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["EncodeH3K4me2-max"])
benign_noncoding_final_dataframe_with_nan["EncodeH3K4me1-sum"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["EncodeH3K4me1-sum"])
benign_noncoding_final_dataframe_with_nan["EncodeH3K4me1-max"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["EncodeH3K4me1-max"])
benign_noncoding_final_dataframe_with_nan["EncodeH3K36me3-sum"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["EncodeH3K36me3-sum"])
benign_noncoding_final_dataframe_with_nan["EncodeH3K36me3-max"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["EncodeH3K36me3-max"])
benign_noncoding_final_dataframe_with_nan["EncodeH3K27me3-sum"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["EncodeH3K27me3-sum"])
benign_noncoding_final_dataframe_with_nan["EncodeH3K27me3-max"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["EncodeH3K27me3-max"])
benign_noncoding_final_dataframe_with_nan["EncodeH3K27ac-sum"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["EncodeH3K27ac-sum"])
benign_noncoding_final_dataframe_with_nan["EncodeH3K27ac-max"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["EncodeH3K27ac-max"])
benign_noncoding_final_dataframe_with_nan["EncodeH2AFZ-sum"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["EncodeH2AFZ-sum"])
benign_noncoding_final_dataframe_with_nan["EncodeH2AFZ-max"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["EncodeH2AFZ-max"])
benign_noncoding_final_dataframe_with_nan["EncodeDNase-sum"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["EncodeDNase-sum"])
benign_noncoding_final_dataframe_with_nan["EncodeDNase-max"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["EncodeDNase-max"])
benign_noncoding_final_dataframe_with_nan["Eigen-raw"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["Eigen-raw"])
benign_noncoding_final_dataframe_with_nan["Eigen-PC-raw"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["Eigen-PC-raw"])
benign_noncoding_final_dataframe_with_nan["Dst2Splice"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["Dst2Splice"])
benign_noncoding_final_dataframe_with_nan["Dist2Mutation"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["Dist2Mutation"])
benign_noncoding_final_dataframe_with_nan["DANN_score"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["DANN_score"])
benign_noncoding_final_dataframe_with_nan["DANN_rankscore"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["DANN_rankscore"])
benign_noncoding_final_dataframe_with_nan["CpG"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["CpG"])
benign_noncoding_final_dataframe_with_nan["ConsScore"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["ConsScore"])
benign_noncoding_final_dataframe_with_nan["CDSpos"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["CDSpos"])
benign_noncoding_final_dataframe_with_nan["CADD_raw_rankscore"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["CADD_raw_rankscore"])
benign_noncoding_final_dataframe_with_nan["CADD_raw.1"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["CADD_raw.1"])
benign_noncoding_final_dataframe_with_nan["CADD_raw"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["CADD_raw"])
benign_noncoding_final_dataframe_with_nan["CADD_phred.1"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["CADD_phred.1"])
benign_noncoding_final_dataframe_with_nan["CADD_phred"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["CADD_phred"])
benign_noncoding_final_dataframe_with_nan["BS4"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["BS4"])
benign_noncoding_final_dataframe_with_nan["BS3"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["BS3"])
benign_noncoding_final_dataframe_with_nan["BS2"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["BS2"])
benign_noncoding_final_dataframe_with_nan["BS1"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["BS1"])
benign_noncoding_final_dataframe_with_nan["BP7"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["BP7"])
benign_noncoding_final_dataframe_with_nan["BP6"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["BP6"])
benign_noncoding_final_dataframe_with_nan["BP5"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["BP5"])
benign_noncoding_final_dataframe_with_nan["BP4"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["BP4"])
benign_noncoding_final_dataframe_with_nan["BP3"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["BP3"])
benign_noncoding_final_dataframe_with_nan["BP2"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["BP2"])
benign_noncoding_final_dataframe_with_nan["BP1"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["BP1"])
benign_noncoding_final_dataframe_with_nan["BA1"] = pd.to_numeric(benign_noncoding_final_dataframe_with_nan["BA1"])

Below we have saved the above dataframe for future use:

In [None]:
benign_noncoding_final_dataframe_with_nan.to_csv('benign_noncoding_final_dataframe_with_nan_and_floats.csv')

In order to call in the dataframe correctly using dask dataframe, we must run the following code which specifies the data type for all annotations not included in our code. This gives us the final benign noncoding variant dataframe:

In [None]:
benign_noncoding_final_dataframe_with_nan_and_floats = dd.read_csv('benign_noncoding_final_dataframe_with_nan_and_floats.csv', dtype={'AAChange.ensGene': 'object',
       'AAChange.knownGene': 'object',
       'AAChange.refGene': 'object',
       'Domain': 'object',
       'Eigen_coding_or_noncoding': 'object',
       'Exon': 'object',
       'ExonicFunc.ensGene': 'object',
       'ExonicFunc.knownGene': 'object',
       'ExonicFunc.refGene': 'object',
       'FATHMM_pred': 'object',
       'FATHMM_pred.1': 'object',
       'InterVar_automated': 'object',
       'Interpro_domain': 'object',
       'Interpro_domain.1': 'object',
       'LRT_pred': 'object',
       'LRT_pred.1': 'object',
       'LR_pred': 'object',
       'M-CAP_pred': 'object',
       'MetaLR_pred': 'object',
       'MetaSVM_pred': 'object',
       'MutationAssessor_pred': 'object',
       'MutationAssessor_pred.1': 'object',
       'MutationTaster_pred': 'object',
       'MutationTaster_pred.1': 'object',
       'PROVEAN_pred': 'object',
       'PolyPhenCat': 'object',
       'Polyphen2_HDIV_pred': 'object',
       'Polyphen2_HDIV_pred.1': 'object',
       'Polyphen2_HVAR_pred': 'object',
       'Polyphen2_HVAR_pred.1': 'object',
       'RadialSVM_pred': 'object',
       'SIFT_pred': 'object',
       'SIFT_pred.1': 'object',
       'SIFTcat': 'object',
       'cosmic70': 'object',
       'fathmm-MKL_coding_pred': 'object',
       'nAA': 'object',
       'oAA': 'object', 
        'GTEx_V6_gene': 'object',
       'GTEx_V6_tissue': 'object', '#Chrom': 'object'}).compute()

### Pathogenic Coding Region:

##### ClinVar

Below is the Pathogenic Coding Region for ClinVar:

In [5]:
clinvar_pathogenic_coding_annotated_with_annovar = pd.read_csv('clinvar_pathogenic_coding_from_cadd.csv')
clinvar_pathogenic_coding_annotated_with_annovar

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,BP4,BP5,BP6,BP7,cosmic70,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,chr10,100154922,100154922,G,A,exonic,ERLIN1,.,stopgain,"ERLIN1:NM_001347858:exon9:c.C283T:p.R95X,ERLIN...",...,0,0,0,0,.,.,.,.,.,.
1,chr10,100183802,100183802,C,A,exonic,ERLIN1,.,nonsynonymous SNV,"ERLIN1:NM_006459:exon2:c.G149T:p.G50V,ERLIN1:N...",...,.,.,.,.,.,0.5619750829,0.984,.,.,.
2,chr10,100246865,100246865,T,-,exonic,CWF19L1,.,frameshift deletion,CWF19L1:NM_001303406:exon5:c.368delA:p.D123Vfs...,...,.,.,.,.,.,.,.,.,.,.
3,chr10,100253422,100253422,G,A,exonic,CWF19L1,.,stopgain,"CWF19L1:NM_001303406:exon3:c.C211T:p.R71X,CWF1...",...,0,0,0,0,.,.,.,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16771,chrX,85981796,85981796,C,A,exonic,CHM,.,stopgain,"CHM:NM_000390:exon3:c.G130T:p.G44X,CHM:NM_0011...",...,0,0,0,0,.,.,.,.,.,.
16772,chrX,9759332,9759332,C,T,exonic,GPR143,.,nonsynonymous SNV,GPR143:NM_000273:exon3:c.G455A:p.S152N,...,0,0,0,0,.,0.676591847234,0.695,.,.,.
16773,chrX,9759390,9759390,A,G,exonic,GPR143,.,nonsynonymous SNV,GPR143:NM_000273:exon3:c.T397C:p.W133R,...,0,0,0,0,.,0.790856144973,0.400,.,.,.
16774,chrX,9759390,9759390,A,T,exonic,GPR143,.,nonsynonymous SNV,GPR143:NM_000273:exon3:c.T397A:p.W133R,...,0,0,0,0,.,0.790215722738,0.386,.,.,.


In [6]:
clinvar_pathogenic_coding_annotated_with_cadd = pd.read_table('clinvar_coding_pathogenic_cadd_annotations_noheader.tsv')
clinvar_pathogenic_coding_annotated_with_cadd

  """Entry point for launching an IPython kernel.
  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,Length,AnnoType,Consequence,ConsScore,ConsDetail,...,Freq10000bp,Rare10000bp,Sngl10000bp,EnsembleRegulatoryFeature,dbscSNV-ada_score,dbscSNV-rf_score,RemapOverlapTF,RemapOverlapCL,RawScore,PHRED
0,1,1014143,C,T,SNV,0,CodingTranscript,STOP_GAINED,8.0,stop_gained,...,22,87,1622,Promoter,,,75.0,107.0,5.410558,35.00
1,1,1014143,C,T,SNV,0,Intergenic,UPSTREAM,1.0,upstream,...,22,87,1622,Promoter,,,75.0,107.0,5.410558,35.00
2,1,1014143,C,T,SNV,0,RegulatoryFeature,REGULATORY,4.0,regulatory,...,22,87,1622,Promoter,,,75.0,107.0,5.410558,35.00
3,1,1014143,C,T,SNV,0,RegulatoryFeature,REGULATORY,4.0,regulatory,...,22,87,1622,Promoter,,,75.0,107.0,5.410558,35.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31074,X,155280059,G,C,SNV,0,CodingTranscript,NON_SYNONYMOUS,7.0,missense,...,11,33,827,Promoter Flanking Region,,,,,1.198603,13.85
31075,X,155280059,G,C,SNV,0,RegulatoryFeature,REGULATORY,4.0,regulatory,...,11,33,827,Promoter Flanking Region,,,,,1.198603,13.85
31076,X,155506930,GAT,G,DEL,2,CodingTranscript,FRAME_SHIFT,7.0,frameshift,...,6,39,652,,,,1.0,1.0,4.018377,25.80
31077,X,155506930,GAT,G,DEL,2,Transcript,INTRONIC,2.0,"intron,non_coding",...,6,39,652,,,,1.0,1.0,4.018377,25.80


Below are the steps used to combine the annotations for clinvar pathogenic coding variants:

In [7]:
clinvar_pathogenic_coding_annotated_with_annovar['Chr'] = clinvar_pathogenic_coding_annotated_with_annovar['Chr'].map(lambda x: x.lstrip('chr').rstrip('aAbBcC'))
clinvar_pathogenic_coding_annotated_with_annovar = clinvar_pathogenic_coding_annotated_with_annovar.rename(columns = {'Chr': '#Chrom', 'Start': 'Pos'})
clinvar_pathogenic_coding_merged = pd.merge(clinvar_pathogenic_coding_annotated_with_cadd, clinvar_pathogenic_coding_annotated_with_annovar,
                                         how = 'left', on = ['#Chrom', 'Pos', 'Ref', 'Alt'])
clinvar_pathogenic_coding_merged

Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,Length,AnnoType,Consequence,ConsScore,ConsDetail,...,BP4,BP5,BP6,BP7,cosmic70,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,1,1014143,C,T,SNV,0,CodingTranscript,STOP_GAINED,8.0,stop_gained,...,,,,,,,,,,
1,1,1014143,C,T,SNV,0,Intergenic,UPSTREAM,1.0,upstream,...,,,,,,,,,,
2,1,1014143,C,T,SNV,0,RegulatoryFeature,REGULATORY,4.0,regulatory,...,,,,,,,,,,
3,1,1014143,C,T,SNV,0,RegulatoryFeature,REGULATORY,4.0,regulatory,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31074,X,155280059,G,C,SNV,0,CodingTranscript,NON_SYNONYMOUS,7.0,missense,...,1,0,0,0,.,0.138062872375,0.222,.,.,.
31075,X,155280059,G,C,SNV,0,RegulatoryFeature,REGULATORY,4.0,regulatory,...,1,0,0,0,.,0.138062872375,0.222,.,.,.
31076,X,155506930,GAT,G,DEL,2,CodingTranscript,FRAME_SHIFT,7.0,frameshift,...,,,,,,,,,,
31077,X,155506930,GAT,G,DEL,2,Transcript,INTRONIC,2.0,"intron,non_coding",...,,,,,,,,,,


Below we drop the duplicate rows:

In [8]:
clinvar_pathogenic_coding_merged = clinvar_pathogenic_coding_merged.drop_duplicates(keep = 'first')
clinvar_pathogenic_coding_merged

Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,Length,AnnoType,Consequence,ConsScore,ConsDetail,...,BP4,BP5,BP6,BP7,cosmic70,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,1,1014143,C,T,SNV,0,CodingTranscript,STOP_GAINED,8.0,stop_gained,...,,,,,,,,,,
1,1,1014143,C,T,SNV,0,Intergenic,UPSTREAM,1.0,upstream,...,,,,,,,,,,
2,1,1014143,C,T,SNV,0,RegulatoryFeature,REGULATORY,4.0,regulatory,...,,,,,,,,,,
3,1,1014143,C,T,SNV,0,RegulatoryFeature,REGULATORY,4.0,regulatory,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31074,X,155280059,G,C,SNV,0,CodingTranscript,NON_SYNONYMOUS,7.0,missense,...,1,0,0,0,.,0.138062872375,0.222,.,.,.
31075,X,155280059,G,C,SNV,0,RegulatoryFeature,REGULATORY,4.0,regulatory,...,1,0,0,0,.,0.138062872375,0.222,.,.,.
31076,X,155506930,GAT,G,DEL,2,CodingTranscript,FRAME_SHIFT,7.0,frameshift,...,,,,,,,,,,
31077,X,155506930,GAT,G,DEL,2,Transcript,INTRONIC,2.0,"intron,non_coding",...,,,,,,,,,,


##### HGMD

Again we repeat the same processes as above:

In [9]:
hgmd_coding_annotated_with_annovar = pd.read_csv('hgmd_coding_annotated.hg38_multianno.csv')
hgmd_coding_annotated_with_annovar

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,BP4,BP5,BP6,BP7,cosmic70,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,chr10,100154922,100154922,G,A,exonic,ERLIN1,.,stopgain,"ERLIN1:NM_001347858:exon9:c.C283T:p.R95X,ERLIN...",...,0,0,0,0,.,.,.,.,.,.
1,chr10,100183802,100183802,C,A,exonic,ERLIN1,.,nonsynonymous SNV,"ERLIN1:NM_006459:exon2:c.G149T:p.G50V,ERLIN1:N...",...,.,.,.,.,.,0.5619750829,0.984,.,.,.
2,chr10,100253438,100253438,-,T,exonic,CWF19L1,.,stopgain,CWF19L1:NM_001303406:exon3:c.194dupA:p.Y65fs*0...,...,.,.,.,.,.,.,.,.,.,.
3,chr10,100256299,100256299,G,-,exonic,CWF19L1,.,frameshift deletion,CWF19L1:NM_001303406:exon2:c.56delC:p.P19Hfs*3...,...,.,.,.,.,.,.,.,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47641,chrX,9760731,9760731,A,G,exonic,GPR143,.,nonsynonymous SNV,GPR143:NM_000273:exon2:c.T346C:p.C116R,...,0,0,0,0,.,0.172141444647,0.295,.,.,.
47642,chrX,9760732,9760732,-,A,exonic,GPR143,.,frameshift insertion,GPR143:NM_000273:exon2:c.344dupT:p.C116Lfs*69,...,.,.,.,.,.,.,.,.,.,.
47643,chrX,9760737,9760737,C,-,exonic,GPR143,.,frameshift deletion,GPR143:NM_000273:exon2:c.340delG:p.A114Lfs*30,...,.,.,.,.,.,.,.,.,.,.
47644,chrX,9760741,9760741,-,G,exonic,GPR143,.,frameshift insertion,GPR143:NM_000273:exon2:c.335dupC:p.A113Cfs*72,...,.,.,.,.,.,.,.,.,.,.


In [10]:
hgmd_coding_annotated_with_cadd = pd.read_table('hgmd_coding_cadd_annotations_noheader.tsv')
hgmd_coding_annotated_with_cadd

  """Entry point for launching an IPython kernel.
  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,Length,AnnoType,Consequence,ConsScore,ConsDetail,...,Freq10000bp,Rare10000bp,Sngl10000bp,EnsembleRegulatoryFeature,dbscSNV-ada_score,dbscSNV-rf_score,RemapOverlapTF,RemapOverlapCL,RawScore,PHRED
0,1,1014143,C,T,SNV,0,CodingTranscript,STOP_GAINED,8,stop_gained,...,22,87,1622,Promoter,,,75.0,107.0,5.410558,35.0
1,1,1014143,C,T,SNV,0,Intergenic,UPSTREAM,1,upstream,...,22,87,1622,Promoter,,,75.0,107.0,5.410558,35.0
2,1,1014143,C,T,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,...,22,87,1622,Promoter,,,75.0,107.0,5.410558,35.0
3,1,1014143,C,T,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,...,22,87,1622,Promoter,,,75.0,107.0,5.410558,35.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89254,X,154966616,CTTCA,C,DEL,4,CodingTranscript,FRAME_SHIFT,7,frameshift,...,7,35,840,,,,,,2.413846,21.5
89255,X,154966617,TTC,T,DEL,2,CodingTranscript,FRAME_SHIFT,7,frameshift,...,7,35,840,,,,,,2.392576,21.3
89256,X,155524585,G,A,SNV,0,CodingTranscript,STOP_GAINED,8,stop_gained,...,9,26,682,,,,4.0,4.0,6.231926,35.0
89257,X,155524585,G,A,SNV,0,Transcript,INTRONIC,2,"intron,non_coding",...,9,26,682,,,,4.0,4.0,6.231926,35.0


Below are the steps used to combine the annotations for hgmd pathogenic coding variants:

In [11]:
hgmd_coding_annotated_with_annovar['Chr'] = hgmd_coding_annotated_with_annovar['Chr'].map(lambda x: x.lstrip('chr').rstrip('aAbBcC'))
hgmd_coding_annotated_with_annovar = hgmd_coding_annotated_with_annovar.rename(columns = {'Chr': '#Chrom', 'Start': 'Pos'})
hgmd_coding_merged = pd.merge(hgmd_coding_annotated_with_cadd, hgmd_coding_annotated_with_annovar,
                                         how = 'left', on = ['#Chrom', 'Pos', 'Ref', 'Alt'])
hgmd_coding_merged

Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,Length,AnnoType,Consequence,ConsScore,ConsDetail,...,BP4,BP5,BP6,BP7,cosmic70,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,1,1014143,C,T,SNV,0,CodingTranscript,STOP_GAINED,8,stop_gained,...,,,,,,,,,,
1,1,1014143,C,T,SNV,0,Intergenic,UPSTREAM,1,upstream,...,,,,,,,,,,
2,1,1014143,C,T,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,...,,,,,,,,,,
3,1,1014143,C,T,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89288,X,154966616,CTTCA,C,DEL,4,CodingTranscript,FRAME_SHIFT,7,frameshift,...,,,,,,,,,,
89289,X,154966617,TTC,T,DEL,2,CodingTranscript,FRAME_SHIFT,7,frameshift,...,,,,,,,,,,
89290,X,155524585,G,A,SNV,0,CodingTranscript,STOP_GAINED,8,stop_gained,...,0,0,0,0,ID=COSM3424643;OCCURENCE=1(large_intestine),.,.,.,.,.
89291,X,155524585,G,A,SNV,0,Transcript,INTRONIC,2,"intron,non_coding",...,0,0,0,0,ID=COSM3424643;OCCURENCE=1(large_intestine),.,.,.,.,.


Below I have dropped the duplicates which contain the same values for all annotations as another row, which means they are the exact same and therefore should be removed. I have kept one of them, as to not disregard the variant completely.

In [12]:
hgmd_coding_merged = hgmd_coding_merged.drop_duplicates(keep = 'first')
hgmd_coding_merged

Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,Length,AnnoType,Consequence,ConsScore,ConsDetail,...,BP4,BP5,BP6,BP7,cosmic70,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,1,1014143,C,T,SNV,0,CodingTranscript,STOP_GAINED,8,stop_gained,...,,,,,,,,,,
1,1,1014143,C,T,SNV,0,Intergenic,UPSTREAM,1,upstream,...,,,,,,,,,,
2,1,1014143,C,T,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,...,,,,,,,,,,
3,1,1014143,C,T,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89288,X,154966616,CTTCA,C,DEL,4,CodingTranscript,FRAME_SHIFT,7,frameshift,...,,,,,,,,,,
89289,X,154966617,TTC,T,DEL,2,CodingTranscript,FRAME_SHIFT,7,frameshift,...,,,,,,,,,,
89290,X,155524585,G,A,SNV,0,CodingTranscript,STOP_GAINED,8,stop_gained,...,0,0,0,0,ID=COSM3424643;OCCURENCE=1(large_intestine),.,.,.,.,.
89291,X,155524585,G,A,SNV,0,Transcript,INTRONIC,2,"intron,non_coding",...,0,0,0,0,ID=COSM3424643;OCCURENCE=1(large_intestine),.,.,.,.,.


Below is the final pathogenic coding region dataframe:

In [56]:
pathogenic_coding_region_variants = pd.concat([hgmd_coding_merged, clinvar_pathogenic_coding_merged])
pathogenic_coding_region_variants

Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,...,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,1,1014143,C,T,SNV,...,,,,,
1,1,1014143,C,T,SNV,...,,,,,
2,1,1014143,C,T,SNV,...,,,,,
3,1,1014143,C,T,SNV,...,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
31074,X,155280059,G,C,SNV,...,0.138062872375,0.222,.,.,.
31075,X,155280059,G,C,SNV,...,0.138062872375,0.222,.,.,.
31076,X,155506930,GAT,G,DEL,...,,,,,
31077,X,155506930,GAT,G,DEL,...,,,,,


Below we have dropped unneccessary columns we do not need for the model:

In [57]:
pathogenic_coding_region_variants = pathogenic_coding_region_variants.drop(columns = ['PolyPhenVal', 'priPhCons', 'cHmm_E1', 'cHmm_E2', 'cHmm_E3', 'cHmm_E4', 'cHmm_E5', 'cHmm_E6', 'cHmm_E7', 'cHmm_E8', 'cHmm_E9', 'cHmm_E10', 'cHmm_E11', 'cHmm_E12', 'cHmm_E13', 'cHmm_E14', 'cHmm_E15', 'cHmm_E16', 'cHmm_E17', 'cHmm_E18', 'cHmm_E19', 'cHmm_E20', 'cHmm_E21', 'cHmm_E22', 'cHmm_E23', 'cHmm_E24', 'cHmm_E25', 'GerpN', 'GerpS'])
pathogenic_coding_region_variants

Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,...,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,1,1014143,C,T,SNV,...,,,,,
1,1,1014143,C,T,SNV,...,,,,,
2,1,1014143,C,T,SNV,...,,,,,
3,1,1014143,C,T,SNV,...,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
31074,X,155280059,G,C,SNV,...,0.138062872375,0.222,.,.,.
31075,X,155280059,G,C,SNV,...,0.138062872375,0.222,.,.,.
31076,X,155506930,GAT,G,DEL,...,,,,,
31077,X,155506930,GAT,G,DEL,...,,,,,


I have replaced the values specified by '.' by NaN values as done before and changed all dtypes of numerical values to float, shown below:

In [58]:
pathogenic_coding_region_variants_with_nan = pathogenic_coding_region_variants.replace('.', np.nan)
pathogenic_coding_region_variants_with_nan = pathogenic_coding_region_variants_with_nan.astype({'SIFT_score': float, 'Polyphen2_HDIV_score' : float, 
                'Polyphen2_HVAR_score' : float, 'LRT_score' : float, 
                'MutationTaster_score' : float, 'MutationAssessor_score' : float, 
                'FATHMM_score' : float, 'RadialSVM_score' : float,
               'LR_score' : float, 'VEST3_score' : float, 'CADD_raw' : float,
               'CADD_phred' : float, 'GERP++_RS' : float, 'phyloP46way_placental' : float,
               'phyloP100way_vertebrate' : float, 'SiPhy_29way_logOdds' : float, 'SIFT_score.1' : float,
               'SIFT_converted_rankscore' : float, 'Polyphen2_HDIV_score.1' : float, 'Polyphen2_HDIV_rankscore' : float,
               'Polyphen2_HVAR_score.1' : float, 'Polyphen2_HVAR_rankscore' : float, 'LRT_score.1' : float,
               'LRT_converted_rankscore' : float, 'MutationTaster_score.1' : float, 'MutationTaster_converted_rankscore' : float,
               'MutationAssessor_score.1' : float, 'MutationAssessor_score_rankscore' : float, 'FATHMM_score.1' : float,
               'FATHMM_converted_rankscore' : float, 'PROVEAN_score' : float, 'PROVEAN_converted_rankscore' : float,
               'VEST3_score.1' : float, 'VEST3_rankscore' : float, 'MetaSVM_score' : float, 'MetaSVM_rankscore' : float,
               'MetaLR_score' : float, 'MetaLR_rankscore' : float, 'M-CAP_score' : float, 'M-CAP_rankscore' : float,
               'CADD_raw.1' : float, 'CADD_raw_rankscore' : float, 'CADD_phred.1' : float, 'DANN_score' : float,
               'DANN_rankscore' : float, 'fathmm-MKL_coding_score' : float, 'fathmm-MKL_coding_rankscore' : float,
               'Eigen-raw' : float, 'Eigen-PC-raw' : float, 'GenoCanyon_score' : float, 'GenoCanyon_score_rankscore' : float,
               'integrated_fitCons_score' : float, 'integrated_fitCons_score_rankscore' : float, 'integrated_confidence_value' : float,
               'GERP++_RS.1' : float, 'GERP++_RS_rankscore' : float, 'phyloP100way_vertebrate.1' : float, 
                'phyloP100way_vertebrate_rankscore' : float, 'phyloP20way_mammalian' : float, 'phyloP20way_mammalian_rankscore' : float,
               'phastCons100way_vertebrate' : float, 'phastCons100way_vertebrate_rankscore' : float, 'phastCons20way_mammalian' : float,
               'phastCons20way_mammalian_rankscore' : float, 'SiPhy_29way_logOdds.1' : float, 'SiPhy_29way_logOdds_rankscore' : float,
               'dbscSNV_ADA_SCORE' : float, 'dbscSNV_RF_SCORE' : float, 'PVS1' : float, 'PS1' : float, 'PS2' : float, 'PS3' : float,
               'PS4' : float, 'PM1' : float, 'PM2' : float, 'PM3' : float, 'PM4' : float, 'PM5' : float, 'PM6' : float, 'PP1' : float, 'PP2' : float,
               'PP3' : float, 'PP4' : float, 'PP5' : float, 'BA1' : float, 'BS1' : float, 'BS2' : float, 'BS3' : float, 'BS4' : float, 'BP1' : float, 'BP2' : float,
               'BP3' : float, 'BP4' : float, 'BP5' : float, 'BP6' : float, 'BP7' : float, 'MCAP' : float, 'REVEL' : float, 'regsnp_fpr' : float,
                'dbscSNV-rf_score' : float})

Below I have saved the resulting file for use in other notebooks:

In [59]:
pathogenic_coding_region_variants_with_nan.to_csv('pathogenic_coding_region_variants.csv')

## Pathogenic NonCoding

##### HGMD

Following the pipeline again:

In [17]:
hgmd_noncoding_annotated_with_annovar = pd.read_csv('hgmd_noncoding_annotated.hg38_multianno.csv')
hgmd_noncoding_annotated_with_annovar

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,BP4,BP5,BP6,BP7,cosmic70,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,chr10,100988295,100988295,C,T,exonic,TWNK,.,stopgain,"TWNK:NM_001163812:exon1:c.C85T:p.R29X,TWNK:NM_...",...,0,0,0,0,.,.,.,.,.,.
1,chr10,100988415,100988415,A,T,exonic,TWNK,.,nonsynonymous SNV,"TWNK:NM_001163812:exon1:c.A205T:p.I69F,TWNK:NM...",...,0,0,0,0,.,0.0927775052736,0.592,.,.,.
2,chr10,100988457,100988457,C,T,exonic,TWNK,.,nonsynonymous SNV,"TWNK:NM_001163812:exon1:c.C247T:p.P83S,TWNK:NM...",...,0,0,0,0,.,0.381929214361,0.584,.,.,.
3,chr10,100988526,100988526,A,G,exonic,TWNK,.,nonsynonymous SNV,"TWNK:NM_001163812:exon1:c.A316G:p.K106E,TWNK:N...",...,0,0,0,0,.,0.207740715004,0.787,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3803,chrX,85964053,85964053,C,G,splicing,CHM,NM_000390:exon5:c.315-1G>C;NM_001320959:exon5:...,.,.,...,.,.,.,.,.,.,.,0.019730941704,D,on
3804,chrX,85964054,85964054,T,C,splicing,CHM,NM_000390:exon5:c.315-2A>G;NM_001320959:exon5:...,.,.,...,.,.,.,.,.,.,.,0.0206278026906,D,on
3805,chrX,85965588,85965588,T,C,intronic,CHM,.,.,.,...,.,.,.,.,.,.,.,.,.,.
3806,chrX,85968639,85968639,A,T,intronic,CHM,.,.,.,...,.,.,.,.,.,.,.,.,.,.


In [18]:
hgmd_noncoding_annotated_with_cadd = pd.read_table('hgmd_noncoding_cadd_annotations_noheader.tsv')
hgmd_noncoding_annotated_with_cadd

  """Entry point for launching an IPython kernel.
  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,Length,AnnoType,Consequence,ConsScore,ConsDetail,...,Freq10000bp,Rare10000bp,Sngl10000bp,EnsembleRegulatoryFeature,dbscSNV-ada_score,dbscSNV-rf_score,RemapOverlapTF,RemapOverlapCL,RawScore,PHRED
0,1,7961859,C,G,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,...,26,48,1451,Promoter,,,208.0,475.0,1.043084,12.600
1,1,7961859,C,G,SNV,0,Transcript,INTRONIC,2,intron,...,26,48,1451,Promoter,,,208.0,475.0,1.043084,12.600
2,1,9720021,G,A,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,...,9,74,1406,CTCF Binding Site,0.00003,0.014,12.0,15.0,-0.385227,0.135
3,1,9720021,G,A,SNV,0,Transcript,SPLICE_SITE,5,"splice,intron",...,9,74,1406,CTCF Binding Site,0.00003,0.014,12.0,15.0,-0.385227,0.135
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8609,X,154965965,C,G,SNV,0,Transcript,SPLICE_SITE,5,"splice,intron",...,7,36,843,,0.99521,0.924,,,2.271896,20.600
8610,X,154965965,C,T,SNV,0,Transcript,SPLICE_SITE,5,"splice,intron",...,7,36,843,,0.98832,0.926,,,2.302688,20.800
8611,X,155492384,C,A,SNV,0,CodingTranscript,NON_SYNONYMOUS,7,missense,...,3,3,19,,,,,,2.463914,21.700
8612,X,155492384,C,A,SNV,0,Transcript,INTRONIC,2,"intron,non_coding",...,3,3,19,,,,,,2.463914,21.700


In [19]:
hgmd_noncoding_annotated_with_annovar['Chr'] = hgmd_noncoding_annotated_with_annovar['Chr'].map(lambda x: x.lstrip('chr').rstrip('aAbBcC'))
hgmd_noncoding_annotated_with_annovar = hgmd_noncoding_annotated_with_annovar.rename(columns = {'Chr': '#Chrom', 'Start': 'Pos'})
hgmd_noncoding_merged = pd.merge(hgmd_noncoding_annotated_with_cadd, hgmd_noncoding_annotated_with_annovar,
                                         how = 'left', on = ['#Chrom', 'Pos', 'Ref', 'Alt'])
hgmd_noncoding_merged

Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,Length,AnnoType,Consequence,ConsScore,ConsDetail,...,BP4,BP5,BP6,BP7,cosmic70,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,1,7961859,C,G,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,...,.,.,.,.,.,.,.,.,.,.
1,1,7961859,C,G,SNV,0,Transcript,INTRONIC,2,intron,...,.,.,.,.,.,.,.,.,.,.
2,1,9720021,G,A,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,...,.,.,.,.,.,.,.,.,.,.
3,1,9720021,G,A,SNV,0,Transcript,SPLICE_SITE,5,"splice,intron",...,.,.,.,.,.,.,.,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8674,X,154965965,C,G,SNV,0,Transcript,SPLICE_SITE,5,"splice,intron",...,.,.,.,.,.,.,.,0.0,D,on
8675,X,154965965,C,T,SNV,0,Transcript,SPLICE_SITE,5,"splice,intron",...,.,.,.,.,.,.,.,0.0,D,on
8676,X,155492384,C,A,SNV,0,CodingTranscript,NON_SYNONYMOUS,7,missense,...,0,0,0,0,.,.,0.651,.,.,.
8677,X,155492384,C,A,SNV,0,Transcript,INTRONIC,2,"intron,non_coding",...,0,0,0,0,.,.,0.651,.,.,.


In [20]:
hgmd_noncoding_merged = hgmd_noncoding_merged.drop_duplicates(keep = 'first')
hgmd_noncoding_merged

Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,Length,AnnoType,Consequence,ConsScore,ConsDetail,...,BP4,BP5,BP6,BP7,cosmic70,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,1,7961859,C,G,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,...,.,.,.,.,.,.,.,.,.,.
1,1,7961859,C,G,SNV,0,Transcript,INTRONIC,2,intron,...,.,.,.,.,.,.,.,.,.,.
2,1,9720021,G,A,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,...,.,.,.,.,.,.,.,.,.,.
3,1,9720021,G,A,SNV,0,Transcript,SPLICE_SITE,5,"splice,intron",...,.,.,.,.,.,.,.,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8674,X,154965965,C,G,SNV,0,Transcript,SPLICE_SITE,5,"splice,intron",...,.,.,.,.,.,.,.,0.0,D,on
8675,X,154965965,C,T,SNV,0,Transcript,SPLICE_SITE,5,"splice,intron",...,.,.,.,.,.,.,.,0.0,D,on
8676,X,155492384,C,A,SNV,0,CodingTranscript,NON_SYNONYMOUS,7,missense,...,0,0,0,0,.,.,0.651,.,.,.
8677,X,155492384,C,A,SNV,0,Transcript,INTRONIC,2,"intron,non_coding",...,0,0,0,0,.,.,0.651,.,.,.


##### ClinVar

In [21]:
clinvar_pathogenic_noncoding_annotated_with_annovar = pd.read_csv('clinvar.hg38_multianno.csv')
clinvar_pathogenic_noncoding_annotated_with_annovar

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,BP4,BP5,BP6,BP7,cosmic70,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,chr10,100988541,100988541,T,-,exonic,TWNK,.,frameshift deletion,"TWNK:NM_001163812:exon1:c.331delT:p.L112Sfs*2,...",...,.,.,.,.,.,.,.,.,.,.
1,chr10,100989084,100989084,C,A,exonic,TWNK,.,nonsynonymous SNV,"TWNK:NM_001163812:exon1:c.C874A:p.P292T,TWNK:N...",...,0,0,0,0,.,0.153055471878,0.729,.,.,.
2,chr10,100989118,100989118,G,A,exonic,TWNK,.,nonsynonymous SNV,"TWNK:NM_001163812:exon1:c.G908A:p.R303Q,TWNK:N...",...,0,0,0,0,.,0.0960831061338,0.654,.,.,.
3,chr10,100989154,100989154,G,T,exonic,TWNK,.,nonsynonymous SNV,"TWNK:NM_001163812:exon1:c.G944T:p.W315L,TWNK:N...",...,0,0,0,0,.,0.328561992494,0.803,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1110,chrX,74529428,74529428,-,C,exonic,SLC16A2,.,frameshift insertion,SLC16A2:NM_006517:exon5:c.1387dupC:p.I465Hfs*51,...,.,.,.,.,.,.,.,.,.,.
1111,chrX,77902641,77902641,A,G,splicing,COX7B,NM_001866:exon2:c.41-2A>G,.,.,...,.,.,.,.,.,.,.,0.00358744394619,D,on
1112,chrX,78003237,78003237,G,A,splicing,ATP7A,NM_001282224:exon6:c.1707+1G>A;NM_000052:exon6...,.,.,...,.,.,.,.,.,.,.,0.00179372197309,D,on
1113,chrX,78122954,78122954,G,A,intronic,PGK1,.,.,.,...,.,.,.,.,.,.,.,0.0,D,on


In [22]:
clinvar_pathogenic_noncoding_annotated_with_cadd = pd.read_table('clinvar_noncoding_pathogenic_cadd_annotations_noheader.tsv')
clinvar_pathogenic_noncoding_annotated_with_cadd

  """Entry point for launching an IPython kernel.


Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,Length,AnnoType,Consequence,ConsScore,ConsDetail,...,Freq10000bp,Rare10000bp,Sngl10000bp,EnsembleRegulatoryFeature,dbscSNV-ada_score,dbscSNV-rf_score,RemapOverlapTF,RemapOverlapCL,RawScore,PHRED
0,1,11960768,G,A,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,...,21,90,1434,Promoter Flanking Region,0.99998,0.942,27.0,36.0,5.268674,35.0
1,1,11960768,G,A,SNV,0,Transcript,CANONICAL_SPLICE,6,splice_donor,...,21,90,1434,Promoter Flanking Region,0.99998,0.942,27.0,36.0,5.268674,35.0
2,1,11964787,T,C,SNV,0,Transcript,CANONICAL_SPLICE,6,splice_donor,...,19,99,1332,,0.99987,0.886,10.0,10.0,4.686023,31.0
3,1,11972977,C,T,SNV,0,CodingTranscript,STOP_GAINED,8,stop_gained,...,20,95,1304,Promoter Flanking Region,,,19.0,23.0,7.497634,38.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2641,X,154419746,G,T,SNV,0,CodingTranscript,STOP_GAINED,8,"splice,stop_gained",...,11,29,822,,0.99999,1.0,9.0,10.0,7.476349,38.0
2642,X,154419746,G,T,SNV,0,Intergenic,DOWNSTREAM,1,downstream,...,11,29,822,,0.99999,1.0,9.0,10.0,7.476349,38.0
2643,X,154420211,G,C,SNV,0,Intergenic,DOWNSTREAM,1,downstream,...,12,32,828,,0.99999,0.944,38.0,46.0,4.640280,29.8
2644,X,154420211,G,C,SNV,0,Transcript,CANONICAL_SPLICE,6,splice_acceptor,...,12,32,828,,0.99999,0.944,38.0,46.0,4.640280,29.8


In [23]:
clinvar_pathogenic_noncoding_annotated_with_annovar['Chr'] = clinvar_pathogenic_noncoding_annotated_with_annovar['Chr'].map(lambda x: x.lstrip('chr').rstrip('aAbBcC'))
clinvar_pathogenic_noncoding_annotated_with_annovar = clinvar_pathogenic_noncoding_annotated_with_annovar.rename(columns = {'Chr': '#Chrom', 'Start': 'Pos'})
clinvar_pathogenic_noncoding_merged = pd.merge(clinvar_pathogenic_noncoding_annotated_with_cadd, clinvar_pathogenic_noncoding_annotated_with_annovar,
                                         how = 'left', on = ['#Chrom', 'Pos', 'Ref', 'Alt'])
clinvar_pathogenic_noncoding_merged

Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,Length,AnnoType,Consequence,ConsScore,ConsDetail,...,BP4,BP5,BP6,BP7,cosmic70,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,1,11960768,G,A,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,...,.,.,.,.,.,.,.,.,.,.
1,1,11960768,G,A,SNV,0,Transcript,CANONICAL_SPLICE,6,splice_donor,...,.,.,.,.,.,.,.,.,.,.
2,1,11964787,T,C,SNV,0,Transcript,CANONICAL_SPLICE,6,splice_donor,...,.,.,.,.,.,.,.,.,.,.
3,1,11972977,C,T,SNV,0,CodingTranscript,STOP_GAINED,8,stop_gained,...,0,0,0,0,.,.,.,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2641,X,154419746,G,T,SNV,0,CodingTranscript,STOP_GAINED,8,"splice,stop_gained",...,0,0,0,0,.,.,.,.,.,.
2642,X,154419746,G,T,SNV,0,Intergenic,DOWNSTREAM,1,downstream,...,0,0,0,0,.,.,.,.,.,.
2643,X,154420211,G,C,SNV,0,Intergenic,DOWNSTREAM,1,downstream,...,.,.,.,.,.,0.853367625825,0.869,.,.,.
2644,X,154420211,G,C,SNV,0,Transcript,CANONICAL_SPLICE,6,splice_acceptor,...,.,.,.,.,.,0.853367625825,0.869,.,.,.


In [24]:
clinvar_pathogenic_noncoding_merged = clinvar_pathogenic_noncoding_merged.drop_duplicates(keep = 'first')
clinvar_pathogenic_noncoding_merged

Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,Length,AnnoType,Consequence,ConsScore,ConsDetail,...,BP4,BP5,BP6,BP7,cosmic70,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,1,11960768,G,A,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,...,.,.,.,.,.,.,.,.,.,.
1,1,11960768,G,A,SNV,0,Transcript,CANONICAL_SPLICE,6,splice_donor,...,.,.,.,.,.,.,.,.,.,.
2,1,11964787,T,C,SNV,0,Transcript,CANONICAL_SPLICE,6,splice_donor,...,.,.,.,.,.,.,.,.,.,.
3,1,11972977,C,T,SNV,0,CodingTranscript,STOP_GAINED,8,stop_gained,...,0,0,0,0,.,.,.,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2641,X,154419746,G,T,SNV,0,CodingTranscript,STOP_GAINED,8,"splice,stop_gained",...,0,0,0,0,.,.,.,.,.,.
2642,X,154419746,G,T,SNV,0,Intergenic,DOWNSTREAM,1,downstream,...,0,0,0,0,.,.,.,.,.,.
2643,X,154420211,G,C,SNV,0,Intergenic,DOWNSTREAM,1,downstream,...,.,.,.,.,.,0.853367625825,0.869,.,.,.
2644,X,154420211,G,C,SNV,0,Transcript,CANONICAL_SPLICE,6,splice_acceptor,...,.,.,.,.,.,0.853367625825,0.869,.,.,.


The final dataframe for the pathogenic noncoding region variants is shown below:

In [48]:
pathogenic_noncoding_region_variants = pd.concat([hgmd_noncoding_merged, clinvar_pathogenic_noncoding_merged])
pathogenic_noncoding_region_variants

Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,...,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,1,7961859,C,G,SNV,...,.,.,.,.,.
1,1,7961859,C,G,SNV,...,.,.,.,.,.
2,1,9720021,G,A,SNV,...,.,.,.,.,.
3,1,9720021,G,A,SNV,...,.,.,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...
2641,X,154419746,G,T,SNV,...,.,.,.,.,.
2642,X,154419746,G,T,SNV,...,.,.,.,.,.
2643,X,154420211,G,C,SNV,...,0.853367625825,0.869,.,.,.
2644,X,154420211,G,C,SNV,...,0.853367625825,0.869,.,.,.


Below we have dropped unneccessary columns:

In [49]:
pathogenic_noncoding_region_variants = pathogenic_noncoding_region_variants.drop(columns = ['PolyPhenVal', 'priPhCons', 'cHmm_E1', 'cHmm_E2', 'cHmm_E3', 'cHmm_E4', 'cHmm_E5', 'cHmm_E6', 'cHmm_E7', 'cHmm_E8', 'cHmm_E9', 'cHmm_E10', 'cHmm_E11', 'cHmm_E12', 'cHmm_E13', 'cHmm_E14', 'cHmm_E15', 'cHmm_E16', 'cHmm_E17', 'cHmm_E18', 'cHmm_E19', 'cHmm_E20', 'cHmm_E21', 'cHmm_E22', 'cHmm_E23', 'cHmm_E24', 'cHmm_E25', 'GerpN', 'GerpS'])
pathogenic_noncoding_region_variants

Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,...,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,1,7961859,C,G,SNV,...,.,.,.,.,.
1,1,7961859,C,G,SNV,...,.,.,.,.,.
2,1,9720021,G,A,SNV,...,.,.,.,.,.
3,1,9720021,G,A,SNV,...,.,.,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...
2641,X,154419746,G,T,SNV,...,.,.,.,.,.
2642,X,154419746,G,T,SNV,...,.,.,.,.,.
2643,X,154420211,G,C,SNV,...,0.853367625825,0.869,.,.,.
2644,X,154420211,G,C,SNV,...,0.853367625825,0.869,.,.,.


Below I have changed the dtype like before and replaced the empty values with NaN values like before: 

In [50]:
pathogenic_noncoding_region_variants_with_nan = pathogenic_noncoding_region_variants.replace('.', np.nan)
pathogenic_noncoding_region_variants_with_nan = pathogenic_noncoding_region_variants_with_nan.astype({'SIFT_score': float, 'Polyphen2_HDIV_score' : float, 
                'Polyphen2_HVAR_score' : float, 'LRT_score' : float, 
                'MutationTaster_score' : float, 'MutationAssessor_score' : float, 
                'FATHMM_score' : float, 'RadialSVM_score' : float,
               'LR_score' : float, 'VEST3_score' : float, 'CADD_raw' : float,
               'CADD_phred' : float, 'GERP++_RS' : float, 'phyloP46way_placental' : float,
               'phyloP100way_vertebrate' : float, 'SiPhy_29way_logOdds' : float, 'SIFT_score.1' : float,
               'SIFT_converted_rankscore' : float, 'Polyphen2_HDIV_score.1' : float, 'Polyphen2_HDIV_rankscore' : float,
               'Polyphen2_HVAR_score.1' : float, 'Polyphen2_HVAR_rankscore' : float, 'LRT_score.1' : float,
               'LRT_converted_rankscore' : float, 'MutationTaster_score.1' : float, 'MutationTaster_converted_rankscore' : float,
               'MutationAssessor_score.1' : float, 'MutationAssessor_score_rankscore' : float, 'FATHMM_score.1' : float,
               'FATHMM_converted_rankscore' : float, 'PROVEAN_score' : float, 'PROVEAN_converted_rankscore' : float,
               'VEST3_score.1' : float, 'VEST3_rankscore' : float, 'MetaSVM_score' : float, 'MetaSVM_rankscore' : float,
               'MetaLR_score' : float, 'MetaLR_rankscore' : float, 'M-CAP_score' : float, 'M-CAP_rankscore' : float,
               'CADD_raw.1' : float, 'CADD_raw_rankscore' : float, 'CADD_phred.1' : float, 'DANN_score' : float,
               'DANN_rankscore' : float, 'fathmm-MKL_coding_score' : float, 'fathmm-MKL_coding_rankscore' : float,
               'Eigen-raw' : float, 'Eigen-PC-raw' : float, 'GenoCanyon_score' : float, 'GenoCanyon_score_rankscore' : float,
               'integrated_fitCons_score' : float, 'integrated_fitCons_score_rankscore' : float, 'integrated_confidence_value' : float,
               'GERP++_RS.1' : float, 'GERP++_RS_rankscore' : float, 'phyloP100way_vertebrate.1' : float, 
                'phyloP100way_vertebrate_rankscore' : float, 'phyloP20way_mammalian' : float, 'phyloP20way_mammalian_rankscore' : float,
               'phastCons100way_vertebrate' : float, 'phastCons100way_vertebrate_rankscore' : float, 'phastCons20way_mammalian' : float,
               'phastCons20way_mammalian_rankscore' : float, 'SiPhy_29way_logOdds.1' : float, 'SiPhy_29way_logOdds_rankscore' : float,
               'dbscSNV_ADA_SCORE' : float, 'dbscSNV_RF_SCORE' : float, 'PVS1' : float, 'PS1' : float, 'PS2' : float, 'PS3' : float,
               'PS4' : float, 'PM1' : float, 'PM2' : float, 'PM3' : float, 'PM4' : float, 'PM5' : float, 'PM6' : float, 'PP1' : float, 'PP2' : float,
               'PP3' : float, 'PP4' : float, 'PP5' : float, 'BA1' : float, 'BS1' : float, 'BS2' : float, 'BS3' : float, 'BS4' : float, 'BP1' : float, 'BP2' : float,
               'BP3' : float, 'BP4' : float, 'BP5' : float, 'BP6' : float, 'BP7' : float, 'MCAP' : float, 'REVEL' : float, 'regsnp_fpr' : float,
                'dbscSNV-rf_score' : float})

Below I have saved the above dataframe for use in other Notebooks:

In [51]:
pathogenic_noncoding_region_variants_with_nan.to_csv('pathogenic_noncoding_region_variants.csv')