In [3]:
import pandas as pd 
import numpy as np
import io
import matplotlib.pyplot as plt
pd.options.display.max_rows = 8

In [2]:
def read_vcf(path):
    with open(path, 'r') as f:
        lines = [l for l in f if not l.startswith('##')]
    return pd.read_csv(io.StringIO(''.join(lines)), dtype={'#CHROM': str, 'POS':int, 'ID':str, 'REF': str, 'ALT': str, 'QUAL': str, 'FILTER': str, 'INFO': str}, sep='\t').rename(columns={'#CHROM': 'CHROM'})

### All CADD Annotations were manipulated for annotation in the other Jupyter Notebook machine entitled CADD Annotations.ipynb

Steps for Annovar to get variants annotated: 

1. Upload the necessary files into Jupyter Notebook for data parsing
2. Create 3 more columns since it needs 8 column format
3. Make sure to keep title first row, like below
4. Create header and export to vcf
5. Put on WinSCP 
6. Cut header until only data
7. Run the two commands used to annotate the files

# Clinvar

Below is the file manipulation for clinvar pathogenic coding data:

In [22]:
pathogenic_coding_for_annovar = read_vcf('clinvar_pathogenic_coding.vcf')

In [23]:
pathogenic_coding_for_annovar['..'] = '.'
pathogenic_coding_for_annovar['...'] = '.'
pathogenic_coding_for_annovar['....'] = '.'
pathogenic_coding_for_annovar

Unnamed: 0,chr10,100154922,226426,G,A,..,...,....
0,chr10,100183802,226427,C,A,.,.,.
1,chr10,100246864,504028,AT,A,.,.,.
2,chr10,100253422,419610,G,A,.,.,.
3,chr10,100256298,253212,TG,T,.,.,.
...,...,...,...,...,...,...,...,...
16770,chrX,85981796,279771,C,A,.,.,.
16771,chrX,9759332,10517,C,T,.,.,.
16772,chrX,9759390,10516,A,G,.,.,.
16773,chrX,9759390,10519,A,T,.,.,.


In [24]:
pathogenic_coding_for_annovar.loc[-1] = ['chr10', '100154922', '22646', 'G', 'A', '.', '.', '.']
pathogenic_coding_for_annovar.index = pathogenic_coding_for_annovar.index + 1  # shifting index
pathogenic_coding_for_annovar = pathogenic_coding_for_annovar.sort_index()
pathogenic_coding_for_annovar.columns = ['', '', '', '', '', '', '', '']
pathogenic_coding_for_annovar

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,chr10,100154922,22646,G,A,.,.,.
1,chr10,100183802,226427,C,A,.,.,.
2,chr10,100246864,504028,AT,A,.,.,.
3,chr10,100253422,419610,G,A,.,.,.
...,...,...,...,...,...,...,...,...
16771,chrX,85981796,279771,C,A,.,.,.
16772,chrX,9759332,10517,C,T,.,.,.
16773,chrX,9759390,10516,A,G,.,.,.
16774,chrX,9759390,10519,A,T,.,.,.


In [17]:
header = """##fileformat=VCFv4.1
##fileDate=20090805
##source=myImputationProgramV3.1
##reference=file:///seq/references/
#CHROM POS ID REF ALT QUAL FILTER INFO
"""
output_VCF = "clinvar_pathogenic_coding_for_annovar.vcf"
with open(output_VCF, 'w') as vcf:
    vcf.write(header)
pathogenic_coding_for_annovar.to_csv(output_VCF, sep="\t", mode='a', index=False)

Below is the annotated results for clinvar pathogenic coding data from ANNOVAR:

In [12]:
clinvar_pathogenic_coding_annotated_with_annovar = pd.read_csv('clinvar_pathogenic_coding_from_cadd.csv')
clinvar_pathogenic_coding_annotated_with_annovar

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,BP4,BP5,BP6,BP7,cosmic70,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,chr10,100154922,100154922,G,A,exonic,ERLIN1,.,stopgain,"ERLIN1:NM_001347858:exon9:c.C283T:p.R95X,ERLIN...",...,0,0,0,0,.,.,.,.,.,.
1,chr10,100183802,100183802,C,A,exonic,ERLIN1,.,nonsynonymous SNV,"ERLIN1:NM_006459:exon2:c.G149T:p.G50V,ERLIN1:N...",...,.,.,.,.,.,0.5619750829,0.984,.,.,.
2,chr10,100246865,100246865,T,-,exonic,CWF19L1,.,frameshift deletion,CWF19L1:NM_001303406:exon5:c.368delA:p.D123Vfs...,...,.,.,.,.,.,.,.,.,.,.
3,chr10,100253422,100253422,G,A,exonic,CWF19L1,.,stopgain,"CWF19L1:NM_001303406:exon3:c.C211T:p.R71X,CWF1...",...,0,0,0,0,.,.,.,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16771,chrX,85981796,85981796,C,A,exonic,CHM,.,stopgain,"CHM:NM_000390:exon3:c.G130T:p.G44X,CHM:NM_0011...",...,0,0,0,0,.,.,.,.,.,.
16772,chrX,9759332,9759332,C,T,exonic,GPR143,.,nonsynonymous SNV,GPR143:NM_000273:exon3:c.G455A:p.S152N,...,0,0,0,0,.,0.676591847234,0.695,.,.,.
16773,chrX,9759390,9759390,A,G,exonic,GPR143,.,nonsynonymous SNV,GPR143:NM_000273:exon3:c.T397C:p.W133R,...,0,0,0,0,.,0.790856144973,0.400,.,.,.
16774,chrX,9759390,9759390,A,T,exonic,GPR143,.,nonsynonymous SNV,GPR143:NM_000273:exon3:c.T397A:p.W133R,...,0,0,0,0,.,0.790215722738,0.386,.,.,.


Below is the annotated results for clinvar pathogenic coding data from CADD:

In [13]:
clinvar_pathogenic_coding_annotated_with_cadd = pd.read_table('clinvar_coding_pathogenic_cadd_annotations_noheader.tsv')
clinvar_pathogenic_coding_annotated_with_cadd

  """Entry point for launching an IPython kernel.
  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,Length,AnnoType,Consequence,ConsScore,ConsDetail,...,Freq10000bp,Rare10000bp,Sngl10000bp,EnsembleRegulatoryFeature,dbscSNV-ada_score,dbscSNV-rf_score,RemapOverlapTF,RemapOverlapCL,RawScore,PHRED
0,1,1014143,C,T,SNV,0,CodingTranscript,STOP_GAINED,8.0,stop_gained,...,22,87,1622,Promoter,,,75.0,107.0,5.410558,35.00
1,1,1014143,C,T,SNV,0,Intergenic,UPSTREAM,1.0,upstream,...,22,87,1622,Promoter,,,75.0,107.0,5.410558,35.00
2,1,1014143,C,T,SNV,0,RegulatoryFeature,REGULATORY,4.0,regulatory,...,22,87,1622,Promoter,,,75.0,107.0,5.410558,35.00
3,1,1014143,C,T,SNV,0,RegulatoryFeature,REGULATORY,4.0,regulatory,...,22,87,1622,Promoter,,,75.0,107.0,5.410558,35.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31074,X,155280059,G,C,SNV,0,CodingTranscript,NON_SYNONYMOUS,7.0,missense,...,11,33,827,Promoter Flanking Region,,,,,1.198603,13.85
31075,X,155280059,G,C,SNV,0,RegulatoryFeature,REGULATORY,4.0,regulatory,...,11,33,827,Promoter Flanking Region,,,,,1.198603,13.85
31076,X,155506930,GAT,G,DEL,2,CodingTranscript,FRAME_SHIFT,7.0,frameshift,...,6,39,652,,,,1.0,1.0,4.018377,25.80
31077,X,155506930,GAT,G,DEL,2,Transcript,INTRONIC,2.0,"intron,non_coding",...,6,39,652,,,,1.0,1.0,4.018377,25.80


Below is the file manipulation for clinvar benign noncoding data:

In [7]:
clinvar_benign_noncoding_for_annovar = read_vcf('clinvar_noncoding_benign_for_cadd_noheader_with_chr_final_for_annovar.vcf')
clinvar_benign_noncoding_for_annovar

Unnamed: 0,chr10,100989312,136588,G,A
0,chr10,100990864,136589,C,T
1,chr10,100990866,136590,T,C
2,chr10,100991026,136591,C,A
3,chr10,100991027,136592,G,A
...,...,...,...,...,...
1555,chrX,71132767,213614,CCTCTTCTCTTCTCTTCTCTTCTCTT,C
1556,chrX,71132767,95249,CCTCTT,C
1557,chrX,71132767,95251,CCTCTTCTCTTCTCTTCTCTTCTCTTCTCTT,C
1558,chrX,78118027,558817,C,T


In [8]:
clinvar_benign_noncoding_for_annovar['..'] = '.'
clinvar_benign_noncoding_for_annovar['...'] = '.'
clinvar_benign_noncoding_for_annovar['....'] = '.'
clinvar_benign_noncoding_for_annovar.loc[-1] = ['chr10', '100989312', '136588', 'G', 'A', '.', '.', '.']
clinvar_benign_noncoding_for_annovar.index = clinvar_benign_noncoding_for_annovar.index + 1  # shifting index
clinvar_benign_noncoding_for_annovar = clinvar_benign_noncoding_for_annovar.sort_index()
clinvar_benign_noncoding_for_annovar.columns = ['', '', '', '', '', '', '', '']
clinvar_benign_noncoding_for_annovar

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,chr10,100989312,136588,G,A,.,.,.
1,chr10,100990864,136589,C,T,.,.,.
2,chr10,100990866,136590,T,C,.,.,.
3,chr10,100991026,136591,C,A,.,.,.
...,...,...,...,...,...,...,...,...
1556,chrX,71132767,213614,CCTCTTCTCTTCTCTTCTCTTCTCTT,C,.,.,.
1557,chrX,71132767,95249,CCTCTT,C,.,.,.
1558,chrX,71132767,95251,CCTCTTCTCTTCTCTTCTCTTCTCTTCTCTT,C,.,.,.
1559,chrX,78118027,558817,C,T,.,.,.


In [None]:
header = """##fileformat=VCFv4.1
##fileDate=20090805
##source=myImputationProgramV3.1
##reference=file:///seq/references/
#CHROM POS ID REF ALT QUAL FILTER INFO
"""
output_VCF = "clinvar_benign_noncoding_for_annovar.vcf"
with open(output_VCF, 'w') as vcf:
    vcf.write(header)
clinvar_benign_noncoding_for_annovar.to_csv(output_VCF, sep="\t", mode='a', index=False)

Below is the annotated results for clinvar benign noncoding data from ANNOVAR:

In [91]:
clinvar_benign_noncoding_annotated_with_annovar = pd.read_csv('clinvar_benign_noncoding.hg38_multianno.csv')
clinvar_benign_noncoding_annotated_with_annovar

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,Func.knownGene,Gene.knownGene,GeneDetail.knownGene,ExonicFunc.knownGene,AAChange.knownGene,Func.ensGene,Gene.ensGene,GeneDetail.ensGene,ExonicFunc.ensGene,AAChange.ensGene,SIFT_score,SIFT_pred,Polyphen2_HDIV_score,Polyphen2_HDIV_pred,Polyphen2_HVAR_score,Polyphen2_HVAR_pred,LRT_score,LRT_pred,MutationTaster_score,MutationTaster_pred,MutationAssessor_score,MutationAssessor_pred,FATHMM_score,FATHMM_pred,RadialSVM_score,RadialSVM_pred,LR_score,LR_pred,VEST3_score,CADD_raw,CADD_phred,GERP++_RS,phyloP46way_placental,phyloP100way_vertebrate,SiPhy_29way_logOdds,Interpro_domain,SIFT_score.1,SIFT_converted_rankscore,SIFT_pred.1,Polyphen2_HDIV_score.1,Polyphen2_HDIV_rankscore,Polyphen2_HDIV_pred.1,Polyphen2_HVAR_score.1,Polyphen2_HVAR_rankscore,Polyphen2_HVAR_pred.1,LRT_score.1,LRT_converted_rankscore,LRT_pred.1,MutationTaster_score.1,MutationTaster_converted_rankscore,MutationTaster_pred.1,MutationAssessor_score.1,MutationAssessor_score_rankscore,MutationAssessor_pred.1,FATHMM_score.1,FATHMM_converted_rankscore,FATHMM_pred.1,PROVEAN_score,PROVEAN_converted_rankscore,PROVEAN_pred,VEST3_score.1,VEST3_rankscore,MetaSVM_score,MetaSVM_rankscore,MetaSVM_pred,MetaLR_score,MetaLR_rankscore,MetaLR_pred,M-CAP_score,M-CAP_rankscore,M-CAP_pred,CADD_raw.1,CADD_raw_rankscore,CADD_phred.1,DANN_score,DANN_rankscore,fathmm-MKL_coding_score,fathmm-MKL_coding_rankscore,fathmm-MKL_coding_pred,Eigen_coding_or_noncoding,Eigen-raw,Eigen-PC-raw,GenoCanyon_score,GenoCanyon_score_rankscore,integrated_fitCons_score,integrated_fitCons_score_rankscore,integrated_confidence_value,GERP++_RS.1,GERP++_RS_rankscore,phyloP100way_vertebrate.1,phyloP100way_vertebrate_rankscore,phyloP20way_mammalian,phyloP20way_mammalian_rankscore,phastCons100way_vertebrate,phastCons100way_vertebrate_rankscore,phastCons20way_mammalian,phastCons20way_mammalian_rankscore,SiPhy_29way_logOdds.1,SiPhy_29way_logOdds_rankscore,Interpro_domain.1,GTEx_V6_gene,GTEx_V6_tissue,dbscSNV_ADA_SCORE,dbscSNV_RF_SCORE,InterVar_automated,PVS1,PS1,PS2,PS3,PS4,PM1,PM2,PM3,PM4,PM5,PM6,PP1,PP2,PP3,PP4,PP5,BA1,BS1,BS2,BS3,BS4,BP1,BP2,BP3,BP4,BP5,BP6,BP7,cosmic70,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,chr10,100989312,100989312,G,A,exonic,TWNK,.,nonsynonymous SNV,"TWNK:NM_001163812:exon1:c.G1102A:p.V368I,TWNK:...",exonic,C10orf2,.,nonsynonymous SNV,"C10orf2:uc001ksf.3:exon1:c.G1102A:p.V368I,C10o...",exonic,TWNK,.,nonsynonymous SNV,"TWNK:ENST00000311916.6:exon1:c.G1102A:p.V368I,...",0.45,T,0.528,P,0.046,B,0.000,D,0.895,D,0.945,L,-3.32,D,-0.752,T,0.016,T,0.057,1.265,10.13,5.13,2.869,2.482,7.064,.,0.583,0.067,T,0.341,0.358,B,0.029,0.233,B,0.000,0.559,D,0.895,0.360,D,0.915,0.234,L,-3.32,0.951,D,-0.26,0.112,N,0.057,0.040,-0.752,0.579,T,0.016,0.065,T,.,.,.,1.707,0.266,14.45,0.925,0.213,0.856,0.444,D,c,0.028,0.207,1.000,0.747,0.672,0.522,0,5.13,0.696,2.572,0.454,1.047,0.674,0.998,0.411,0.991,0.552,7.064,0.242,.,.,.,.,.,Benign,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,.,.,0.353,.,.,.
1,chr10,100990864,100990864,C,T,intronic,TWNK,.,.,.,intronic,C10orf2,.,.,.,intronic,TWNK,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,0,0,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,0.624215246637,B,on
2,chr10,100990866,100990866,T,C,intronic,TWNK,.,.,.,intronic,C10orf2,.,.,.,intronic,TWNK,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,0,0.052,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,0.0717488789238,PD,on
3,chr10,100991026,100991026,C,A,UTR3,TWNK,NM_001163812:c.*1C>A;NM_001163814:c.*1C>A,.,.,UTR3,C10orf2,uc001ksg.3:c.*1C>A,.,.,UTR3,TWNK,ENST00000370228.1:c.*1C>A,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1556,chrX,71132768,71132792,CTCTTCTCTTCTCTTCTCTTCTCTT,-,intronic,MED12,.,.,.,intronic,MED12,.,.,.,intronic,MED12,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.
1557,chrX,71132768,71132772,CTCTT,-,intronic,MED12,.,.,.,intronic,MED12,.,.,.,intronic,MED12,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.
1558,chrX,71132768,71132797,CTCTTCTCTTCTCTTCTCTTCTCTTCTCTT,-,intronic,MED12,.,.,.,intronic,MED12,.,.,.,intronic,MED12,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.
1559,chrX,78118027,78118027,C,T,intronic,PGK1,.,.,.,intronic,PGK1,.,.,.,intronic,PGK1,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,0.61301369863,B,off


Below is the annotated results for clinvar benign noncoding data from CADD:

In [95]:
clinvar_benign_noncoding_annotated_with_cadd = pd.read_table('clinvar_noncoding_benign_cadd_annotations_noheader.tsv')
clinvar_benign_noncoding_annotated_with_cadd

  """Entry point for launching an IPython kernel.


Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,Length,AnnoType,Consequence,ConsScore,ConsDetail,GC,CpG,motifECount,motifEName,motifEHIPos,motifEScoreChng,oAA,nAA,GeneID,FeatureID,GeneName,CCDS,Intron,Exon,cDNApos,relcDNApos,CDSpos,relCDSpos,protPos,relProtPos,Domain,Dst2Splice,Dst2SplType,minDistTSS,minDistTSE,SIFTcat,SIFTval,PolyPhenCat,PolyPhenVal,priPhCons,mamPhCons,verPhCons,priPhyloP,mamPhyloP,verPhyloP,bStatistic,targetScan,mirSVR-Score,mirSVR-E,mirSVR-Aln,cHmm_E1,cHmm_E2,cHmm_E3,cHmm_E4,cHmm_E5,cHmm_E6,cHmm_E7,cHmm_E8,cHmm_E9,cHmm_E10,cHmm_E11,cHmm_E12,cHmm_E13,cHmm_E14,cHmm_E15,cHmm_E16,cHmm_E17,cHmm_E18,cHmm_E19,cHmm_E20,cHmm_E21,cHmm_E22,cHmm_E23,cHmm_E24,cHmm_E25,GerpRS,GerpRSpval,GerpN,GerpS,tOverlapMotifs,motifDist,EncodeH3K4me1-sum,EncodeH3K4me1-max,EncodeH3K4me2-sum,EncodeH3K4me2-max,EncodeH3K4me3-sum,EncodeH3K4me3-max,EncodeH3K9ac-sum,EncodeH3K9ac-max,EncodeH3K9me3-sum,EncodeH3K9me3-max,EncodeH3K27ac-sum,EncodeH3K27ac-max,EncodeH3K27me3-sum,EncodeH3K27me3-max,EncodeH3K36me3-sum,EncodeH3K36me3-max,EncodeH3K79me2-sum,EncodeH3K79me2-max,EncodeH4K20me1-sum,EncodeH4K20me1-max,EncodeH2AFZ-sum,EncodeH2AFZ-max,EncodeDNase-sum,EncodeDNase-max,EncodetotalRNA-sum,EncodetotalRNA-max,Grantham,Dist2Mutation,Freq100bp,Rare100bp,Sngl100bp,Freq1000bp,Rare1000bp,Sngl1000bp,Freq10000bp,Rare10000bp,Sngl10000bp,EnsembleRegulatoryFeature,dbscSNV-ada_score,dbscSNV-rf_score,RemapOverlapTF,RemapOverlapCL,RawScore,PHRED
0,1,1041950,T,C,SNV,0,Transcript,SPLICE_SITE,5,"splice,intron",0.702,0.120,,,,,,,ENSG00000188157,ENST00000379370,AGRN,CCDS30551.1,6/35,,,,,,,,,-6.0,ACCEPTOR,7083,553,,,,,0.006,0.0,0.0,0.398,-0.498,-0.484,940.0,,,,,0,2,0,2,0,7,6,5,1,2,2,0,0,0,12,0,0,0,0,0,2,4,3,0,0,,,4.03,4.03,,,7.16,1.40,6.10,2.06,6.40,0.99,2.91,0.78,5.47,1.33,2.72,1.32,3.20,1.44,13.29,3.23,15.60,3.09,21.17,3.99,5.78,4.03,0.90,0.57,0.31,0.17,,2,1,2,21,3,8,200,23,100,1931,,0.00001,0.002,10.0,13.0,0.092155,2.700
1,1,1042190,G,A,SNV,0,Transcript,INTRONIC,2,intron,0.669,0.093,,,,,,,ENSG00000188157,ENST00000379370,AGRN,CCDS30551.1,7/35,,,,,,,,,,,7323,793,,,,,0.011,0.0,0.0,-0.346,-1.186,-1.417,940.0,,,,,1,1,0,2,0,10,4,6,1,2,2,0,0,0,11,0,0,0,0,0,2,4,2,0,0,,,10.30,-20.60,1.0,0.13,8.05,1.59,9.22,2.78,5.16,1.25,4.23,2.12,8.84,2.66,6.76,1.20,9.21,2.85,26.09,9.04,20.55,4.96,28.70,5.25,4.82,1.26,0.80,0.38,0.54,0.19,,12,1,0,20,3,7,213,25,109,1955,,,,12.0,14.0,-0.401276,0.118
2,1,1043223,CCT,C,DEL,2,Transcript,INTRONIC,2,intron,0.678,0.079,,,,,,,ENSG00000188157,ENST00000379370,AGRN,CCDS30551.1,7/35,,,,,,,,,-14.0,ACCEPTOR,23101,12892,,,,,0.001,0.0,0.0,0.418,-0.047,-0.165,940.0,,,,,1,1,0,3,1,6,4,14,1,0,0,0,0,0,12,0,0,0,0,0,1,2,2,0,0,,,10.30,1.42,1.0,-0.96,2.89,0.74,3.29,1.03,6.55,2.05,6.62,1.17,6.56,2.21,5.19,1.22,5.39,2.43,19.45,4.05,12.22,3.37,18.53,4.93,5.36,2.41,0.63,0.30,0.33,0.26,,6,0,1,18,1,10,216,24,117,2004,,,,8.0,9.0,-0.012669,1.640
3,1,1045707,A,G,SNV,0,Transcript,INTRONIC,2,intron,0.656,0.093,,,,,,,ENSG00000188157,ENST00000379370,AGRN,CCDS30551.1,14/35,,,,,,,,,,,308,643,,,,,0.076,0.0,0.0,0.475,-0.843,-0.993,940.0,,,,,0,1,0,4,0,5,8,14,1,1,0,0,0,0,9,0,0,0,0,0,2,0,3,0,0,,,6.94,-13.90,,,4.50,0.82,1.41,0.67,3.61,0.94,3.39,0.95,1.25,0.55,2.68,0.85,1.59,0.94,35.93,7.26,8.03,3.06,17.59,3.50,3.05,1.14,0.84,0.37,0.43,0.17,,2,1,1,17,1,13,208,22,131,2126,,,,9.0,10.0,-0.057159,1.287
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3200,X,154419685,C,T,SNV,0,Transcript,INTRONIC,2,intron,0.589,0.067,,,,,,,ENSG00000102125,ENST00000601016,TAZ,CCDS14748.1,6/10,,,,,,,,,-20.0,ACCEPTOR,307,46,,,,,0.282,0.0,0.0,0.367,-1.410,-2.587,84.0,,,,,0,1,0,0,0,1,15,1,0,0,2,3,2,4,16,0,0,0,0,0,0,0,1,1,1,,,7.59,-12.20,,,17.10,5.44,9.06,3.54,10.70,3.56,12.16,2.72,16.61,3.38,19.98,9.16,8.35,2.58,23.41,3.23,10.09,1.87,12.96,2.68,7.48,1.84,0.55,0.10,0.82,0.16,,22,0,0,16,0,5,105,11,29,820,,,,4.0,4.0,-0.033909,1.464
3201,X,154420108,C,T,SNV,0,Intergenic,DOWNSTREAM,1,downstream,0.656,0.053,,,,,,,ENSG00000197180,ENST00000360656,CH17-340M24.3,,,,,,,,,,,,,117,14,,,,,0.007,0.0,0.0,-1.998,-2.121,-2.419,84.0,,,,,0,1,0,0,0,1,16,2,0,1,2,3,1,3,15,0,0,0,0,0,0,0,1,1,1,,,5.27,-10.50,,,15.24,5.14,6.47,1.41,5.91,1.49,13.65,3.71,11.55,4.22,5.02,1.10,2.45,1.14,19.29,4.51,4.87,2.06,14.04,2.63,7.68,1.94,0.43,0.17,0.19,0.09,,16,0,1,7,0,4,101,12,31,826,,,,36.0,47.0,-0.549155,0.036
3202,X,154420108,C,T,SNV,0,Transcript,INTRONIC,2,intron,0.656,0.053,,,,,,,ENSG00000102125,ENST00000601016,TAZ,CCDS14748.1,8/10,,,,,,,,,-14.0,DONOR,117,14,,,,,0.007,0.0,0.0,-1.998,-2.121,-2.419,84.0,,,,,0,1,0,0,0,1,16,2,0,1,2,3,1,3,15,0,0,0,0,0,0,0,1,1,1,,,5.27,-10.50,,,15.24,5.14,6.47,1.41,5.91,1.49,13.65,3.71,11.55,4.22,5.02,1.10,2.45,1.14,19.29,4.51,4.87,2.06,14.04,2.63,7.68,1.94,0.43,0.17,0.19,0.09,,16,0,1,7,0,4,101,12,31,826,,,,36.0,47.0,-0.549155,0.036
3203,X,154961190,A,G,SNV,0,Transcript,INTRONIC,2,intron,0.358,0.040,,,,,,,ENSG00000185010,ENST00000360256,F8,CCDS35457.1,9/25,,,,,,,,,,,61526,31037,,,,,0.202,0.0,0.0,0.404,0.046,0.041,212.0,,,,,0,0,1,33,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,1,1,0,0,0,,,10.90,-1.23,,,1.99,0.77,2.39,0.83,6.68,1.78,5.65,2.53,9.39,1.31,2.07,0.82,16.78,3.18,2.89,1.19,1.80,1.09,7.22,3.13,6.76,1.16,0.15,0.05,,,,11,0,1,10,0,3,75,7,45,823,,,,1.0,1.0,0.351934,5.879


Below is the file manipulation for clinvar noncoding benign data:

In [8]:
clinvar_benign_coding_for_annovar = read_vcf('clinvar_coding_benign_for_cadd_noheader_with_chr_final_for_annovar.vcf')
clinvar_benign_coding_for_annovar

Unnamed: 0,chr10,100987606,136593,G,T
0,chr10,102065918,284200,C,G
1,chr10,102396271,474786,G,A
2,chr10,102399466,541631,C,T
3,chr10,102401202,474781,C,T
...,...,...,...,...,...
4654,chrX,85964016,255991,T,C
4655,chrX,85978770,497462,G,C
4656,chrX,85978816,377662,T,A
4657,chrX,93671995,208906,G,C


In [9]:
clinvar_benign_coding_for_annovar['..'] = '.'
clinvar_benign_coding_for_annovar['...'] = '.'
clinvar_benign_coding_for_annovar['....'] = '.'
clinvar_benign_coding_for_annovar.loc[-1] = ['chr10', '100987606', '136593', 'G', 'T', '.', '.', '.']
clinvar_benign_coding_for_annovar.index = clinvar_benign_coding_for_annovar.index + 1  # shifting index
clinvar_benign_coding_for_annovar = clinvar_benign_coding_for_annovar.sort_index()
clinvar_benign_coding_for_annovar.columns = ['', '', '', '', '', '', '', '']
clinvar_benign_coding_for_annovar

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,chr10,100987606,136593,G,T,.,.,.
1,chr10,102065918,284200,C,G,.,.,.
2,chr10,102396271,474786,G,A,.,.,.
3,chr10,102399466,541631,C,T,.,.,.
...,...,...,...,...,...,...,...,...
4655,chrX,85964016,255991,T,C,.,.,.
4656,chrX,85978770,497462,G,C,.,.,.
4657,chrX,85978816,377662,T,A,.,.,.
4658,chrX,93671995,208906,G,C,.,.,.


In [72]:
header = """##fileformat=VCFv4.1
##fileDate=20090805
##source=myImputationProgramV3.1
##reference=file:///seq/references/
#CHROM POS ID REF ALT QUAL FILTER INFO
"""
output_VCF = "clinvar_benign_coding_for_annovar.vcf"
with open(output_VCF, 'w') as vcf:
    vcf.write(header)
clinvar_benign_coding_for_annovar.to_csv(output_VCF, sep="\t", mode='a', index=False)

NameError: name 'clinvar_benign_coding_for_annovar' is not defined

Below is the annotated results for clinvar benign coding data from ANNOVAR:

In [58]:
clinvar_benign_coding_annotated_with_annovar = pd.read_csv('clinvar_benign_coding_for_annovar.hg38_multianno.csv')
clinvar_benign_coding_annotated_with_annovar

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,BP4,BP5,BP6,BP7,cosmic70,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,chr10,100987606,100987606,G,T,UTR5,TWNK,NM_021830:c.-605G>T;NM_001163813:c.-2157G>T;NM...,.,.,...,.,.,.,.,.,.,.,.,.,.
1,chr10,102065918,102065918,C,G,exonic,HPS6,.,synonymous SNV,HPS6:NM_024747:exon1:c.C444G:p.A148A,...,1,0,1,1,.,.,.,.,.,.
2,chr10,102396271,102396271,G,A,exonic,NFKB2,.,nonsynonymous SNV,"NFKB2:NM_001261403:exon2:c.G40A:p.E14K,NFKB2:N...",...,0,0,0,0,.,.,0.018,.,.,.
3,chr10,102399466,102399466,C,T,exonic,NFKB2,.,synonymous SNV,"NFKB2:NM_001261403:exon12:c.C1296T:p.C432C,NFK...",...,1,0,0,1,.,.,.,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4655,chrX,85964016,85964016,T,C,exonic,CHM,.,synonymous SNV,CHM:NM_000390:exon5:c.A351G:p.A117A,...,1,0,1,1,.,.,.,.,.,.
4656,chrX,85978770,85978770,G,C,exonic,CHM,.,nonsynonymous SNV,"CHM:NM_000390:exon4:c.C311G:p.A104G,CHM:NM_001...",...,0,0,0,0,.,0.0245874451416,0.248,.,.,.
4657,chrX,85978816,85978816,T,A,exonic,CHM,.,nonsynonymous SNV,"CHM:NM_000390:exon4:c.A265T:p.S89C,CHM:NM_0011...",...,0,0,1,0,.,.,0.100,.,.,.
4658,chrX,93671995,93671995,G,C,exonic,NAP1L3,.,nonsynonymous SNV,NAP1L3:NM_004538:exon1:c.C1310G:p.A437G,...,1,0,1,0,.,0.0059993776503,0.060,.,.,.


Below is the annotated results for clinvar benign coding data from CADD:

In [45]:
clinvar_benign_coding_annotated_with_cadd = pd.read_table('clinvar_coding_benign_cadd_annotations_noheader.tsv')
clinvar_benign_coding_annotated_with_cadd

  """Entry point for launching an IPython kernel.
  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,Length,AnnoType,Consequence,ConsScore,ConsDetail,...,Freq10000bp,Rare10000bp,Sngl10000bp,EnsembleRegulatoryFeature,dbscSNV-ada_score,dbscSNV-rf_score,RemapOverlapTF,RemapOverlapCL,RawScore,PHRED
0,1,1014042,G,A,SNV,0,CodingTranscript,NON_SYNONYMOUS,7,missense,...,22,87,1616,Promoter,,,69.0,97.0,0.076262,2.519
1,1,1014042,G,A,SNV,0,Intergenic,UPSTREAM,1,upstream,...,22,87,1616,Promoter,,,69.0,97.0,0.076262,2.519
2,1,1014042,G,A,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,...,22,87,1616,Promoter,,,69.0,97.0,0.076262,2.519
3,1,1014042,G,A,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,...,22,87,1616,Promoter,,,69.0,97.0,0.076262,2.519
4,1,1014217,C,T,SNV,0,CodingTranscript,SYNONYMOUS,5,synonymous,...,21,87,1629,Promoter,,,69.0,94.0,-0.293246,0.276
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9133,X,154776309,C,T,SNV,0,Intergenic,DOWNSTREAM,1,downstream,...,4,21,970,,,,17.0,17.0,-0.180506,0.611
9134,X,154776813,C,CAAG,INS,3,CodingTranscript,INFRAME,6,inframe_insertion,...,4,22,980,,,,7.0,7.0,0.893695,10.880
9135,X,154776813,C,CAAG,INS,3,Intergenic,DOWNSTREAM,1,downstream,...,4,22,980,,,,7.0,7.0,0.893695,10.880
9136,X,154776813,C,CAAG,INS,3,Intergenic,DOWNSTREAM,1,downstream,...,4,22,980,,,,7.0,7.0,0.893695,10.880


Below is the annotated results for clinvar pathogenic noncoding data from ANNOVAR:

In [9]:
clinvar_pathogenic_noncoding_annotated_with_annovar = pd.read_csv('clinvar.hg38_multianno.csv')
clinvar_pathogenic_noncoding_annotated_with_annovar

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,BP4,BP5,BP6,BP7,cosmic70,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,chr10,100988541,100988541,T,-,exonic,TWNK,.,frameshift deletion,"TWNK:NM_001163812:exon1:c.331delT:p.L112Sfs*2,...",...,.,.,.,.,.,.,.,.,.,.
1,chr10,100989084,100989084,C,A,exonic,TWNK,.,nonsynonymous SNV,"TWNK:NM_001163812:exon1:c.C874A:p.P292T,TWNK:N...",...,0,0,0,0,.,0.153055471878,0.729,.,.,.
2,chr10,100989118,100989118,G,A,exonic,TWNK,.,nonsynonymous SNV,"TWNK:NM_001163812:exon1:c.G908A:p.R303Q,TWNK:N...",...,0,0,0,0,.,0.0960831061338,0.654,.,.,.
3,chr10,100989154,100989154,G,T,exonic,TWNK,.,nonsynonymous SNV,"TWNK:NM_001163812:exon1:c.G944T:p.W315L,TWNK:N...",...,0,0,0,0,.,0.328561992494,0.803,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1110,chrX,74529428,74529428,-,C,exonic,SLC16A2,.,frameshift insertion,SLC16A2:NM_006517:exon5:c.1387dupC:p.I465Hfs*51,...,.,.,.,.,.,.,.,.,.,.
1111,chrX,77902641,77902641,A,G,splicing,COX7B,NM_001866:exon2:c.41-2A>G,.,.,...,.,.,.,.,.,.,.,0.00358744394619,D,on
1112,chrX,78003237,78003237,G,A,splicing,ATP7A,NM_001282224:exon6:c.1707+1G>A;NM_000052:exon6...,.,.,...,.,.,.,.,.,.,.,0.00179372197309,D,on
1113,chrX,78122954,78122954,G,A,intronic,PGK1,.,.,.,...,.,.,.,.,.,.,.,0.0,D,on


Below is the annotated result for pathogenic noncoding variants from CADD:

In [10]:
clinvar_pathogenic_noncoding_annotated_with_cadd = pd.read_table('clinvar_noncoding_pathogenic_cadd_annotations_noheader.tsv')
clinvar_pathogenic_noncoding_annotated_with_cadd

  """Entry point for launching an IPython kernel.


Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,Length,AnnoType,Consequence,ConsScore,ConsDetail,...,Freq10000bp,Rare10000bp,Sngl10000bp,EnsembleRegulatoryFeature,dbscSNV-ada_score,dbscSNV-rf_score,RemapOverlapTF,RemapOverlapCL,RawScore,PHRED
0,1,11960768,G,A,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,...,21,90,1434,Promoter Flanking Region,0.99998,0.942,27.0,36.0,5.268674,35.0
1,1,11960768,G,A,SNV,0,Transcript,CANONICAL_SPLICE,6,splice_donor,...,21,90,1434,Promoter Flanking Region,0.99998,0.942,27.0,36.0,5.268674,35.0
2,1,11964787,T,C,SNV,0,Transcript,CANONICAL_SPLICE,6,splice_donor,...,19,99,1332,,0.99987,0.886,10.0,10.0,4.686023,31.0
3,1,11972977,C,T,SNV,0,CodingTranscript,STOP_GAINED,8,stop_gained,...,20,95,1304,Promoter Flanking Region,,,19.0,23.0,7.497634,38.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2641,X,154419746,G,T,SNV,0,CodingTranscript,STOP_GAINED,8,"splice,stop_gained",...,11,29,822,,0.99999,1.0,9.0,10.0,7.476349,38.0
2642,X,154419746,G,T,SNV,0,Intergenic,DOWNSTREAM,1,downstream,...,11,29,822,,0.99999,1.0,9.0,10.0,7.476349,38.0
2643,X,154420211,G,C,SNV,0,Intergenic,DOWNSTREAM,1,downstream,...,12,32,828,,0.99999,0.944,38.0,46.0,4.640280,29.8
2644,X,154420211,G,C,SNV,0,Transcript,CANONICAL_SPLICE,6,splice_acceptor,...,12,32,828,,0.99999,0.944,38.0,46.0,4.640280,29.8


# HGMD

Below is the file manipulation for hgmd coding data:

In [17]:
hgmd_coding_for_annovar = read_vcf('hgmd_coding_for_cadd_noheader_for_annovar.vcf')
hgmd_coding_for_annovar

Unnamed: 0,chr10,100154922,CM140970,G,A
0,chr10,100183802,CM140971,C,A
1,chr10,100253438,CI1824020,A,AT
2,chr10,100256298,CD162836,TG,T
3,chr10,100262050,CM162834,C,G
...,...,...,...,...,...
47640,chrX,9760731,CM981395,A,G
47641,chrX,9760732,CI183806,G,GA
47642,chrX,9760736,CD171619,GC,G
47643,chrX,9760741,CI115195,A,AG


In [18]:
hgmd_coding_for_annovar['..'] = '.'
hgmd_coding_for_annovar['...'] = '.'
hgmd_coding_for_annovar['....'] = '.'
hgmd_coding_for_annovar.loc[-1] = ['chr10', '100154922', 'CM140970', 'G', 'A', '.', '.', '.']
hgmd_coding_for_annovar.index = hgmd_coding_for_annovar.index + 1  # shifting index
hgmd_coding_for_annovar = hgmd_coding_for_annovar.sort_index()
hgmd_coding_for_annovar.columns = ['', '', '', '', '', '', '', '']
hgmd_coding_for_annovar

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,chr10,100154922,CM140970,G,A,.,.,.
1,chr10,100183802,CM140971,C,A,.,.,.
2,chr10,100253438,CI1824020,A,AT,.,.,.
3,chr10,100256298,CD162836,TG,T,.,.,.
...,...,...,...,...,...,...,...,...
47641,chrX,9760731,CM981395,A,G,.,.,.
47642,chrX,9760732,CI183806,G,GA,.,.,.
47643,chrX,9760736,CD171619,GC,G,.,.,.
47644,chrX,9760741,CI115195,A,AG,.,.,.


In [19]:
header = """##fileformat=VCFv4.1
##fileDate=20090805
##source=myImputationProgramV3.1
##reference=file:///seq/references/
#CHROM POS ID REF ALT QUAL FILTER INFO
"""
output_VCF = "hgmd_coding_for_annovar.vcf"
with open(output_VCF, 'w') as vcf:
    vcf.write(header)
hgmd_coding_for_annovar.to_csv(output_VCF, sep="\t", mode='a', index=False)

Below is the annotated results for hgmd coding data from ANNOVAR:

In [63]:
hgmd_coding_annotated_with_annovar = pd.read_csv('hgmd_coding_annotated.hg38_multianno.csv')
hgmd_coding_annotated_with_annovar

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,BP4,BP5,BP6,BP7,cosmic70,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,chr10,100154922,100154922,G,A,exonic,ERLIN1,.,stopgain,"ERLIN1:NM_001347858:exon9:c.C283T:p.R95X,ERLIN...",...,0,0,0,0,.,.,.,.,.,.
1,chr10,100183802,100183802,C,A,exonic,ERLIN1,.,nonsynonymous SNV,"ERLIN1:NM_006459:exon2:c.G149T:p.G50V,ERLIN1:N...",...,.,.,.,.,.,0.5619750829,0.984,.,.,.
2,chr10,100253438,100253438,-,T,exonic,CWF19L1,.,stopgain,CWF19L1:NM_001303406:exon3:c.194dupA:p.Y65fs*0...,...,.,.,.,.,.,.,.,.,.,.
3,chr10,100256299,100256299,G,-,exonic,CWF19L1,.,frameshift deletion,CWF19L1:NM_001303406:exon2:c.56delC:p.P19Hfs*3...,...,.,.,.,.,.,.,.,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47641,chrX,9760731,9760731,A,G,exonic,GPR143,.,nonsynonymous SNV,GPR143:NM_000273:exon2:c.T346C:p.C116R,...,0,0,0,0,.,0.172141444647,0.295,.,.,.
47642,chrX,9760732,9760732,-,A,exonic,GPR143,.,frameshift insertion,GPR143:NM_000273:exon2:c.344dupT:p.C116Lfs*69,...,.,.,.,.,.,.,.,.,.,.
47643,chrX,9760737,9760737,C,-,exonic,GPR143,.,frameshift deletion,GPR143:NM_000273:exon2:c.340delG:p.A114Lfs*30,...,.,.,.,.,.,.,.,.,.,.
47644,chrX,9760741,9760741,-,G,exonic,GPR143,.,frameshift insertion,GPR143:NM_000273:exon2:c.335dupC:p.A113Cfs*72,...,.,.,.,.,.,.,.,.,.,.


Below is the annotated results for hgmd coding data from CADD:

In [54]:
hgmd_coding_annotated_with_cadd = pd.read_table('hgmd_coding_cadd_annotations_noheader.tsv')
hgmd_coding_annotated_with_cadd

  """Entry point for launching an IPython kernel.
  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,Length,AnnoType,Consequence,ConsScore,ConsDetail,...,Freq10000bp,Rare10000bp,Sngl10000bp,EnsembleRegulatoryFeature,dbscSNV-ada_score,dbscSNV-rf_score,RemapOverlapTF,RemapOverlapCL,RawScore,PHRED
0,1,1014143,C,T,SNV,0,CodingTranscript,STOP_GAINED,8,stop_gained,...,22,87,1622,Promoter,,,75.0,107.0,5.410558,35.0
1,1,1014143,C,T,SNV,0,Intergenic,UPSTREAM,1,upstream,...,22,87,1622,Promoter,,,75.0,107.0,5.410558,35.0
2,1,1014143,C,T,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,...,22,87,1622,Promoter,,,75.0,107.0,5.410558,35.0
3,1,1014143,C,T,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,...,22,87,1622,Promoter,,,75.0,107.0,5.410558,35.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89254,X,154966616,CTTCA,C,DEL,4,CodingTranscript,FRAME_SHIFT,7,frameshift,...,7,35,840,,,,,,2.413846,21.5
89255,X,154966617,TTC,T,DEL,2,CodingTranscript,FRAME_SHIFT,7,frameshift,...,7,35,840,,,,,,2.392576,21.3
89256,X,155524585,G,A,SNV,0,CodingTranscript,STOP_GAINED,8,stop_gained,...,9,26,682,,,,4.0,4.0,6.231926,35.0
89257,X,155524585,G,A,SNV,0,Transcript,INTRONIC,2,"intron,non_coding",...,9,26,682,,,,4.0,4.0,6.231926,35.0


Below is the file manipulation for hgmd noncoding data:

In [16]:
hgmd_noncoding_for_annovar = read_vcf('hgmd_noncoding_for_cadd_noheader_for_annovar.vcf')
hgmd_noncoding_for_annovar

Unnamed: 0,chr10,100988295,CM114899,C,T
0,chr10,100988415,CM164756,A,T
1,chr10,100988457,CM127719,C,T
2,chr10,100988526,CM1610318,A,G
3,chr10,100988540,CD169898,CT,C
...,...,...,...,...,...
3802,chrX,85964053,CS1810957,C,G
3803,chrX,85964054,CS1723659,T,C
3804,chrX,85965588,CS173873,T,C
3805,chrX,85968639,CS032064,A,T


In [20]:
hgmd_noncoding_for_annovar['..'] = '.'
hgmd_noncoding_for_annovar['...'] = '.'
hgmd_noncoding_for_annovar['....'] = '.'
hgmd_noncoding_for_annovar.loc[-1] = ['chr10', '100988295', 'CM114899', 'C', 'T', '.', '.', '.']
hgmd_noncoding_for_annovar.index = hgmd_noncoding_for_annovar.index + 1  # shifting index
hgmd_noncoding_for_annovar = hgmd_noncoding_for_annovar.sort_index()
hgmd_noncoding_for_annovar.columns = ['', '', '', '', '', '', '', '']
hgmd_noncoding_for_annovar

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,chr10,100988295,CM114899,C,T,.,.,.
1,chr10,100988415,CM164756,A,T,.,.,.
2,chr10,100988457,CM127719,C,T,.,.,.
3,chr10,100988526,CM1610318,A,G,.,.,.
...,...,...,...,...,...,...,...,...
3803,chrX,85964053,CS1810957,C,G,.,.,.
3804,chrX,85964054,CS1723659,T,C,.,.,.
3805,chrX,85965588,CS173873,T,C,.,.,.
3806,chrX,85968639,CS032064,A,T,.,.,.


In [21]:
header = """##fileformat=VCFv4.1
##fileDate=20090805
##source=myImputationProgramV3.1
##reference=file:///seq/references/
#CHROM POS ID REF ALT QUAL FILTER INFO
"""
output_VCF = "hgmd_noncoding_for_annovar.vcf"
with open(output_VCF, 'w') as vcf:
    vcf.write(header)
hgmd_noncoding_for_annovar.to_csv(output_VCF, sep="\t", mode='a', index=False)

Below is the annotated results for hgmd noncoding data from ANNOVAR:

In [79]:
hgmd_noncoding_annotated_with_annovar = pd.read_csv('hgmd_noncoding_annotated.hg38_multianno.csv')
hgmd_noncoding_annotated_with_annovar

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,Func.knownGene,Gene.knownGene,GeneDetail.knownGene,ExonicFunc.knownGene,AAChange.knownGene,Func.ensGene,Gene.ensGene,GeneDetail.ensGene,ExonicFunc.ensGene,AAChange.ensGene,SIFT_score,SIFT_pred,Polyphen2_HDIV_score,Polyphen2_HDIV_pred,Polyphen2_HVAR_score,Polyphen2_HVAR_pred,LRT_score,LRT_pred,MutationTaster_score,MutationTaster_pred,MutationAssessor_score,MutationAssessor_pred,FATHMM_score,FATHMM_pred,RadialSVM_score,RadialSVM_pred,LR_score,LR_pred,VEST3_score,CADD_raw,CADD_phred,GERP++_RS,phyloP46way_placental,phyloP100way_vertebrate,SiPhy_29way_logOdds,Interpro_domain,SIFT_score.1,SIFT_converted_rankscore,SIFT_pred.1,Polyphen2_HDIV_score.1,Polyphen2_HDIV_rankscore,Polyphen2_HDIV_pred.1,Polyphen2_HVAR_score.1,Polyphen2_HVAR_rankscore,Polyphen2_HVAR_pred.1,LRT_score.1,LRT_converted_rankscore,LRT_pred.1,MutationTaster_score.1,MutationTaster_converted_rankscore,MutationTaster_pred.1,MutationAssessor_score.1,MutationAssessor_score_rankscore,MutationAssessor_pred.1,FATHMM_score.1,FATHMM_converted_rankscore,FATHMM_pred.1,PROVEAN_score,PROVEAN_converted_rankscore,PROVEAN_pred,VEST3_score.1,VEST3_rankscore,MetaSVM_score,MetaSVM_rankscore,MetaSVM_pred,MetaLR_score,MetaLR_rankscore,MetaLR_pred,M-CAP_score,M-CAP_rankscore,M-CAP_pred,CADD_raw.1,CADD_raw_rankscore,CADD_phred.1,DANN_score,DANN_rankscore,fathmm-MKL_coding_score,fathmm-MKL_coding_rankscore,fathmm-MKL_coding_pred,Eigen_coding_or_noncoding,Eigen-raw,Eigen-PC-raw,GenoCanyon_score,GenoCanyon_score_rankscore,integrated_fitCons_score,integrated_fitCons_score_rankscore,integrated_confidence_value,GERP++_RS.1,GERP++_RS_rankscore,phyloP100way_vertebrate.1,phyloP100way_vertebrate_rankscore,phyloP20way_mammalian,phyloP20way_mammalian_rankscore,phastCons100way_vertebrate,phastCons100way_vertebrate_rankscore,phastCons20way_mammalian,phastCons20way_mammalian_rankscore,SiPhy_29way_logOdds.1,SiPhy_29way_logOdds_rankscore,Interpro_domain.1,GTEx_V6_gene,GTEx_V6_tissue,dbscSNV_ADA_SCORE,dbscSNV_RF_SCORE,InterVar_automated,PVS1,PS1,PS2,PS3,PS4,PM1,PM2,PM3,PM4,PM5,PM6,PP1,PP2,PP3,PP4,PP5,BA1,BS1,BS2,BS3,BS4,BP1,BP2,BP3,BP4,BP5,BP6,BP7,cosmic70,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,chr10,100988295,100988295,C,T,exonic,TWNK,.,stopgain,"TWNK:NM_001163812:exon1:c.C85T:p.R29X,TWNK:NM_...",exonic,C10orf2,.,stopgain,"C10orf2:uc001ksf.3:exon1:c.C85T:p.R29X,C10orf2...",exonic,TWNK,.,stopgain,"TWNK:ENST00000311916.6:exon1:c.C85T:p.R29X,TWN...",0.76,T,.,.,.,.,0.018,N,1.000,A,.,.,.,.,.,.,.,.,.,2.804,15.34,-0.054,-0.047,-0.078,6.492,.,.,.,.,.,.,.,.,.,.,0.018,0.276,N,1,0.810,A,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,6.409,0.885,29.6,0.993,0.591,0.191,0.203,N,c,0.086,-0.217,1.000,0.454,0.543,0.218,0,-0.054,0.131,0.224,0.174,-0.142,0.109,0.001,0.137,0.932,0.405,6.492,0.212,.,.,.,.,.,Uncertain significance,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,.,.,.,.,.,.
1,chr10,100988415,100988415,A,T,exonic,TWNK,.,nonsynonymous SNV,"TWNK:NM_001163812:exon1:c.A205T:p.I69F,TWNK:NM...",exonic,C10orf2,.,nonsynonymous SNV,"C10orf2:uc001ksf.3:exon1:c.A205T:p.I69F,C10orf...",exonic,TWNK,.,nonsynonymous SNV,"TWNK:ENST00000311916.6:exon1:c.A205T:p.I69F,TW...",0.25,T,0.996,D,0.9,P,0.000,D,0.991,D,1.845,L,-3.37,D,0.702,D,0.841,D,0.551,2.870,15.56,5.5,2.089,1.505,9.028,.,0.19,0.241,T,0.996,0.670,D,0.9,0.621,P,0.000,0.504,D,0.971,0.414,D,2.28,0.651,M,-3.37,0.953,D,-1.84,0.431,N,0.551,0.587,0.702,0.932,D,0.841,0.947,D,0.093,0.760,D,1.435,0.235,12.97,0.978,0.358,0.907,0.518,D,c,0.407,0.402,1.000,0.747,0.628,0.401,0,5.5,0.813,1.493,0.350,1.076,0.850,0.953,0.330,0.990,0.544,9.028,0.352,.,.,.,.,.,Uncertain significance,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,.,0.0927775052736,0.592,.,.,.
2,chr10,100988457,100988457,C,T,exonic,TWNK,.,nonsynonymous SNV,"TWNK:NM_001163812:exon1:c.C247T:p.P83S,TWNK:NM...",exonic,C10orf2,.,nonsynonymous SNV,"C10orf2:uc001ksf.3:exon1:c.C247T:p.P83S,C10orf...",exonic,TWNK,.,nonsynonymous SNV,"TWNK:ENST00000311916.6:exon1:c.C247T:p.P83S,TW...",0.01,D,0.985,D,0.827,P,0.000,D,0.883,D,1.845,L,-3.33,D,1.027,D,0.868,D,0.695,1.917,12.37,5.59,2.631,2.411,18.175,.,0.208,0.416,T,0.985,0.592,D,0.827,0.576,P,0.000,0.504,D,0.685,0.358,D,2.28,0.651,M,-3.33,0.951,D,-1.84,0.431,N,0.68,0.700,1.027,0.977,D,0.868,0.956,D,0.382,0.930,D,3.823,0.516,23.4,0.997,0.827,0.914,0.533,D,c,0.417,0.387,1.000,0.747,0.628,0.401,0,5.59,0.846,2.453,0.444,0.843,0.344,1.000,0.715,0.386,0.252,18.175,0.896,.,.,.,.,.,Uncertain significance,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,.,0.381929214361,0.584,.,.,.
3,chr10,100988526,100988526,A,G,exonic,TWNK,.,nonsynonymous SNV,"TWNK:NM_001163812:exon1:c.A316G:p.K106E,TWNK:N...",exonic,C10orf2,.,nonsynonymous SNV,"C10orf2:uc001ksf.3:exon1:c.A316G:p.K106E,C10or...",exonic,TWNK,.,nonsynonymous SNV,"TWNK:ENST00000311916.6:exon1:c.A316G:p.K106E,T...",0.28,T,1.0,D,0.997,D,0.000,D,1.000,D,2.005,M,-3.86,D,0.962,D,0.916,D,0.923,4.280,22.4,5.51,2.096,9.119,14.462,.,0.008,0.586,D,1.0,0.899,D,0.997,0.850,D,0.000,0.843,D,1.000,0.516,D,2.395,0.694,M,-3.86,0.967,D,-2.23,0.506,N,0.92,0.915,0.962,0.966,D,0.916,0.972,D,0.208,0.872,D,3.906,0.526,23.5,0.999,0.945,0.989,0.884,D,c,0.718,0.714,1.000,0.747,0.628,0.401,0,5.51,0.817,9.107,0.936,1.088,0.866,1.000,0.715,1.000,0.888,14.462,0.669,.,.,.,.,.,Uncertain significance,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,.,0.207740715004,0.787,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3803,chrX,85964053,85964053,C,G,splicing,CHM,NM_000390:exon5:c.315-1G>C;NM_001320959:exon5:...,.,.,splicing,CHM,uc004eet.3:exon5:c.315-1G>C,.,.,splicing,CHM,ENST00000357749.6:exon5:c.315-1G>C,.,.,.,.,.,.,.,.,.,.,1.000,D,.,.,.,.,.,.,.,.,.,2.050,12.81,4.64,2.028,3.381,12.833,.,.,.,.,.,.,.,.,.,.,.,.,.,1,0.810,D,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,3.876,0.523,23.5,0.988,0.459,0.805,0.399,D,.,.,.,1.000,0.747,.,.,.,4.64,0.572,3.419,0.522,0.816,0.334,1.000,0.715,1.000,0.888,12.833,0.570,.,.,.,1.0000,0.952,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,0.019730941704,D,on
3804,chrX,85964054,85964054,T,C,splicing,CHM,NM_000390:exon5:c.315-2A>G;NM_001320959:exon5:...,.,.,splicing,CHM,uc004eet.3:exon5:c.315-2A>G,.,.,splicing,CHM,ENST00000357749.6:exon5:c.315-2A>G,.,.,.,.,.,.,.,.,.,.,1.000,D,.,.,.,.,.,.,.,.,.,2.494,14.30,4.64,1.627,3.454,13.494,.,.,.,.,.,.,.,.,.,.,.,.,.,1,0.810,D,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,3.595,0.489,23.2,0.989,0.470,0.828,0.417,D,.,.,.,1.000,0.747,.,.,.,4.64,0.572,3.435,0.524,0.924,0.433,1.000,0.715,1.000,0.888,13.494,0.607,.,.,.,1.0000,0.958,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,0.0206278026906,D,on
3805,chrX,85965588,85965588,T,C,intronic,CHM,.,.,.,intronic,CHM,.,.,.,intronic,CHM,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.
3806,chrX,85968639,85968639,A,T,intronic,CHM,.,.,.,intronic,CHM,.,.,.,intronic,CHM,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.


Below is the annotated results for hgmd noncoding data from CADD:

In [80]:
hgmd_noncoding_annotated_with_cadd = pd.read_table('hgmd_noncoding_cadd_annotations_noheader.tsv')
hgmd_noncoding_annotated_with_cadd

  """Entry point for launching an IPython kernel.
  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,Length,AnnoType,Consequence,ConsScore,ConsDetail,GC,CpG,motifECount,motifEName,motifEHIPos,motifEScoreChng,oAA,nAA,GeneID,FeatureID,GeneName,CCDS,Intron,Exon,cDNApos,relcDNApos,CDSpos,relCDSpos,protPos,relProtPos,Domain,Dst2Splice,Dst2SplType,minDistTSS,minDistTSE,SIFTcat,SIFTval,PolyPhenCat,PolyPhenVal,priPhCons,mamPhCons,verPhCons,priPhyloP,mamPhyloP,verPhyloP,bStatistic,targetScan,mirSVR-Score,mirSVR-E,mirSVR-Aln,cHmm_E1,cHmm_E2,cHmm_E3,cHmm_E4,cHmm_E5,cHmm_E6,cHmm_E7,cHmm_E8,cHmm_E9,cHmm_E10,cHmm_E11,cHmm_E12,cHmm_E13,cHmm_E14,cHmm_E15,cHmm_E16,cHmm_E17,cHmm_E18,cHmm_E19,cHmm_E20,cHmm_E21,cHmm_E22,cHmm_E23,cHmm_E24,cHmm_E25,GerpRS,GerpRSpval,GerpN,GerpS,tOverlapMotifs,motifDist,EncodeH3K4me1-sum,EncodeH3K4me1-max,EncodeH3K4me2-sum,EncodeH3K4me2-max,EncodeH3K4me3-sum,EncodeH3K4me3-max,EncodeH3K9ac-sum,EncodeH3K9ac-max,EncodeH3K9me3-sum,EncodeH3K9me3-max,EncodeH3K27ac-sum,EncodeH3K27ac-max,EncodeH3K27me3-sum,EncodeH3K27me3-max,EncodeH3K36me3-sum,EncodeH3K36me3-max,EncodeH3K79me2-sum,EncodeH3K79me2-max,EncodeH4K20me1-sum,EncodeH4K20me1-max,EncodeH2AFZ-sum,EncodeH2AFZ-max,EncodeDNase-sum,EncodeDNase-max,EncodetotalRNA-sum,EncodetotalRNA-max,Grantham,Dist2Mutation,Freq100bp,Rare100bp,Sngl100bp,Freq1000bp,Rare1000bp,Sngl1000bp,Freq10000bp,Rare10000bp,Sngl10000bp,EnsembleRegulatoryFeature,dbscSNV-ada_score,dbscSNV-rf_score,RemapOverlapTF,RemapOverlapCL,RawScore,PHRED
0,1,7961859,C,G,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,0.781,0.253,,,,,,,,ENSR00000000851,,,,,,,,,,,,,,36,9190,,,,,0.690,0.000,0.001,0.399,-0.152,-0.334,824.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,,,4.58,-9.17,,,3.18,1.11,104.49,25.01,525.02,88.14,411.25,64.60,5.76,1.06,359.53,65.41,9.83,2.28,6.51,2.64,82.68,28.05,9.52,2.49,177.88,29.14,9.10,2.57,0.34,0.24,,6,0.0,0.0,20.0,7,8,229,26,48,1451,Promoter,,,208.0,475.0,1.043084,12.600
1,1,7961859,C,G,SNV,0,Transcript,INTRONIC,2,intron,0.781,0.253,,,,,,,ENSG00000116288,ENST00000493678,PARK7,CCDS93.1,1/6,,,,,,,,,,,36,9190,,,,,0.690,0.000,0.001,0.399,-0.152,-0.334,824.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,,,4.58,-9.17,,,3.18,1.11,104.49,25.01,525.02,88.14,411.25,64.60,5.76,1.06,359.53,65.41,9.83,2.28,6.51,2.64,82.68,28.05,9.52,2.49,177.88,29.14,9.10,2.57,0.34,0.24,,6,0.0,0.0,20.0,7,8,229,26,48,1451,Promoter,,,208.0,475.0,1.043084,12.600
2,1,9720021,G,A,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,0.603,0.067,,,,,,,,ENSR00000346835,,,,,,,,,,,,,,9823,7207,,,,,0.227,0.002,0.000,-1.578,-0.179,-1.643,835.0,,,,,0.0,0.0,0.0,1.0,0.0,3.0,10.0,5.0,3.0,7.0,2.0,1.0,3.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,,15.30,-30.60,,,36.93,11.08,14.49,4.51,9.86,2.26,18.01,7.50,7.67,1.73,50.36,17.98,7.69,2.64,45.86,13.42,24.37,6.03,36.66,24.95,16.91,4.03,0.89,0.36,0.20,0.05,,9,0.0,1.0,16.0,1,6,142,9,74,1406,CTCF Binding Site,0.00003,0.014,12.0,15.0,-0.385227,0.135
3,1,9720021,G,A,SNV,0,Transcript,SPLICE_SITE,5,"splice,intron",0.603,0.067,,,,,,,ENSG00000171608,ENST00000377346,PIK3CD,CCDS104.1,10/23,,,,,,,,,-4.0,DONOR,9823,7207,,,,,0.227,0.002,0.000,-1.578,-0.179,-1.643,835.0,,,,,0.0,0.0,0.0,1.0,0.0,3.0,10.0,5.0,3.0,7.0,2.0,1.0,3.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,,15.30,-30.60,,,36.93,11.08,14.49,4.51,9.86,2.26,18.01,7.50,7.67,1.73,50.36,17.98,7.69,2.64,45.86,13.42,24.37,6.03,36.66,24.95,16.91,4.03,0.89,0.36,0.20,0.05,,9,0.0,1.0,16.0,1,6,142,9,74,1406,CTCF Binding Site,0.00003,0.014,12.0,15.0,-0.385227,0.135
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8609,X,154965965,C,G,SNV,0,Transcript,SPLICE_SITE,5,"splice,intron",0.371,0.000,,,,,,,ENSG00000185010,ENST00000360256,F8,CCDS35457.1,9/25,,,,,,,,,-5.0,DONOR,552,193,,,,,0.878,0.999,1.000,0.382,2.468,4.940,225.0,,,,,0.0,0.0,1.0,33.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1982.59,3.442070e-106,11.70,11.70,,,5.50,1.08,2.25,0.75,8.21,2.26,2.94,0.77,8.37,3.04,5.54,1.52,8.49,2.11,3.04,1.29,2.19,1.00,3.39,2.69,7.86,2.57,0.33,0.16,0.01,0.01,,13,0.0,0.0,11.0,0,3,95,7,36,843,,0.99521,0.924,,,2.271896,20.600
8610,X,154965965,C,T,SNV,0,Transcript,SPLICE_SITE,5,"splice,intron",0.371,0.000,,,,,,,ENSG00000185010,ENST00000360256,F8,CCDS35457.1,9/25,,,,,,,,,-5.0,DONOR,552,193,,,,,0.878,0.999,1.000,0.382,2.468,4.940,225.0,,,,,0.0,0.0,1.0,33.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1982.59,3.442070e-106,11.70,11.70,,,5.50,1.08,2.25,0.75,8.21,2.26,2.94,0.77,8.37,3.04,5.54,1.52,8.49,2.11,3.04,1.29,2.19,1.00,3.39,2.69,7.86,2.57,0.33,0.16,0.01,0.01,,13,0.0,0.0,11.0,0,3,95,7,36,843,,0.98832,0.926,,,2.302688,20.800
8611,X,155492384,C,A,SNV,0,CodingTranscript,NON_SYNONYMOUS,7,missense,0.464,0.053,,,,,E,D,ENSG00000185973,ENST00000334398,TMLHE,CCDS14768.1,,7/8,1253.0,0.442,1107.0,0.874,369.0,0.876,ndomain,,,106,152,deleterious,0.01,probably_damaging,0.968,0.163,0.979,0.990,-0.483,-0.176,0.142,,,,,,0.0,0.0,3.0,22.0,0.0,0.0,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2929.08,1.851780e-164,16.20,5.09,,,,,,,0.23,0.23,,,,,,,,,0.69,0.35,0.31,0.31,,,0.68,0.68,0.00,0.00,1.18,0.27,0.0,753,,,,0,1,3,3,3,19,,,,,,2.463914,21.700
8612,X,155492384,C,A,SNV,0,Transcript,INTRONIC,2,"intron,non_coding",0.464,0.053,,,,,,,ENSG00000224533,ENST00000433624,TMLHE-AS1,,3/3,,,,,,,,,,,106,152,,,,,0.163,0.979,0.990,-0.483,-0.176,0.142,,,,,,0.0,0.0,3.0,22.0,0.0,0.0,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2929.08,1.851780e-164,16.20,5.09,,,,,,,0.23,0.23,,,,,,,,,0.69,0.35,0.31,0.31,,,0.68,0.68,0.00,0.00,1.18,0.27,,753,,,,0,1,3,3,3,19,,,,,,2.463914,21.700


# Human Derived

Below is the annotated CADD file for human derived coding region:

In [4]:
human_derived_coding_annotated_with_CADD = pd.read_table('human_derived_coding_for_cadd_noheader.tsv')
human_derived_coding_annotated_with_CADD

  """Entry point for launching an IPython kernel.
  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,Length,AnnoType,Consequence,ConsScore,ConsDetail,...,Freq10000bp,Rare10000bp,Sngl10000bp,EnsembleRegulatoryFeature,dbscSNV-ada_score,dbscSNV-rf_score,RemapOverlapTF,RemapOverlapCL,RawScore,PHRED
0,1,943329,C,T,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,...,18,117,2586,CTCF Binding Site,,,88.0,121.0,0.590702,8.117
1,1,943329,C,T,SNV,0,CodingTranscript,SYNONYMOUS,5,synonymous,...,18,117,2586,CTCF Binding Site,,,88.0,121.0,0.590702,8.117
2,1,943329,C,T,SNV,0,Intergenic,DOWNSTREAM,1,downstream,...,18,117,2586,CTCF Binding Site,,,88.0,121.0,0.590702,8.117
3,1,944699,C,T,SNV,0,CodingTranscript,NON_SYNONYMOUS,7,missense,...,16,117,2573,,,,12.0,13.0,2.980499,23.200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122229,X,155260197,C,A,SNV,0,Transcript,3PRIME_UTR,2,3_prime_UTR,...,9,36,873,,,,3.0,6.0,-0.219204,0.473
122230,X,155612732,T,C,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,...,7,27,866,Promoter,,,112.0,202.0,0.661682,8.709
122231,X,155612732,T,C,SNV,0,Transcript,INTRONIC,2,intron,...,7,27,866,Promoter,,,112.0,202.0,0.661682,8.709
122232,X,155612732,T,C,SNV,0,Intergenic,DOWNSTREAM,1,downstream,...,7,27,866,Promoter,,,112.0,202.0,0.661682,8.709


#### ANNOVAR

Now we will do the same with the human derived coding data for ANNOVAR:

In [40]:
human_derived_coding_region

Unnamed: 0,chr10,100020652,.,G,A
0,chr10,100190879,.,T,C
1,chr10,100233196,.,G,A
2,chr10,100267615,.,C,T
3,chr10,100347207,.,T,C
...,...,...,...,...,...
53793,chrX,99719539,.,A,G
53794,chrX,99719939,.,C,A
53795,chrX,99720084,.,G,A
53796,chrX,99721008,.,G,A


In [82]:
human_derived_coding_region['..'] = '.'
human_derived_coding_region['...'] = '.'
human_derived_coding_region['....'] = '.'
human_derived_coding_region.loc[-1] = ['chr10', '100020652', '.', 'G', 'A', '.', '.', '.']
human_derived_coding_region.index = human_derived_coding_region.index + 1  # shifting index
human_derived_coding_region = human_derived_coding_region.sort_index()
human_derived_coding_region.columns = ['', '', '', '', '', '', '', '']
human_derived_coding_region

ValueError: cannot set a row with mismatched columns

In [79]:
header = """##fileformat=VCFv4.1
##fileDate=20090805
##source=myImputationProgramV3.1
##reference=file:///seq/references/
#CHROM POS ID REF ALT QUAL FILTER INFO
"""
output_VCF = "human_derived_coding_region_for_annovar.vcf"
with open(output_VCF, 'w') as vcf:
    vcf.write(header)
human_derived_coding_region.to_csv(output_VCF, sep="\t", mode='a', index=False)

Below is the final annotated ANNOVAR data for the human derived coding region:

In [5]:
human_derived_coding_annotated_with_annovar = pd.read_csv('human_derived_coding_annotated.hg38_multianno.csv')
human_derived_coding_annotated_with_annovar

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,BP4,BP5,BP6,BP7,cosmic70,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,chr10,100020652,100020652,G,A,intergenic,DNMBP;CPN1,dist=10699;dist=21656,.,.,...,.,.,.,.,.,.,.,.,.,.
1,chr10,100190879,100190879,T,C,exonic,CHUK,.,nonsynonymous SNV,CHUK:NM_001278:exon20:c.A2198G:p.N733S,...,1,0,0,0,.,0.00447999043875,0.039,.,.,.
2,chr10,100233196,100233196,G,A,UTR3,CWF19L1,NM_018294:c.*31C>T;NM_001303407:c.*31C>T;NM_00...,.,.,...,.,.,.,.,.,.,.,.,.,.
3,chr10,100267615,100267615,C,T,UTR5,CWF19L1,NM_018294:c.-22G>A;NM_001303407:c.-20707G>A;NM...,.,.,...,.,.,.,.,.,.,.,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53794,chrX,99719539,99719539,A,G,ncRNA_exonic,XRCC6P5,.,.,.,...,.,.,.,.,.,.,.,.,.,.
53795,chrX,99719939,99719939,C,A,ncRNA_exonic,XRCC6P5,.,.,.,...,.,.,.,.,.,.,.,.,.,.
53796,chrX,99720084,99720084,G,A,ncRNA_exonic,XRCC6P5,.,.,.,...,.,.,.,.,.,.,.,.,.,.
53797,chrX,99721008,99721008,G,A,ncRNA_exonic,XRCC6P5,.,.,.,...,.,.,.,.,.,.,.,.,.,.


Since we have a lot of files for the noncoding region for the human derived data in a lot of separate dataframes, we must combine them into one so that we can export the resulting final and only have to run ANNOVAR once for the noncoding region. The reason we have them in separate dataframes is that we initially split them for CADD annotations. Now we will use these files again for ANNOVAR. The process for this is shown below:

In [73]:
human_derived_noncoding_region_1 = read_vcf('human_derived_noncoding_for_cadd_1_for_annovar.vcf')
human_derived_noncoding_region_2 = read_vcf('human_derived_noncoding_for_cadd_2_for_annovar.vcf')
human_derived_noncoding_region_3 = read_vcf('human_derived_noncoding_for_cadd_3_for_annovar.vcf')
human_derived_noncoding_region_4 = read_vcf('human_derived_noncoding_for_cadd_4_for_annovar.vcf')
human_derived_noncoding_region_5 = read_vcf('human_derived_noncoding_for_cadd_5_for_annovar.vcf')
human_derived_noncoding_region_6 = read_vcf('human_derived_noncoding_for_cadd_6_for_annovar.vcf')
human_derived_noncoding_region_7 = read_vcf('human_derived_noncoding_for_cadd_7_for_annovar.vcf')
human_derived_noncoding_region_8 = read_vcf('human_derived_noncoding_for_cadd_8_for_annovar.vcf')
human_derived_noncoding_region_9 = read_vcf('human_derived_noncoding_for_cadd_9_for_annovar.vcf')
human_derived_noncoding_region_10 = read_vcf('human_derived_noncoding_for_cadd_10_for_annovar.vcf')
human_derived_noncoding_region_11 = read_vcf('human_derived_noncoding_for_cadd_11_for_annovar.vcf')

In [52]:
human_derived_noncoding_region_1['..'] = '.'
human_derived_noncoding_region_1['...'] = '.'
human_derived_noncoding_region_1['....'] = '.'
human_derived_noncoding_region_1.loc[-1] = ['chr10', '1000013', '.', 'G', 'A', '.', '.', '.']
human_derived_noncoding_region_1.index = human_derived_noncoding_region_1.index + 1  # shifting index
human_derived_noncoding_region_1 = human_derived_noncoding_region_1.sort_index()
human_derived_noncoding_region_1.columns = ['', '', '', '', '', '', '', '']
human_derived_noncoding_region_1

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,chr10,1000013,.,G,A,.,.,.
1,chr10,100020652,.,G,A,.,.,.
2,chr10,1000297,.,T,G,.,.,.
3,chr10,1000555,.,A,T,.,.,.
...,...,...,...,...,...,...,...,...
89996,chr11,47312241,.,G,A,.,.,.
89997,chr11,47312402,.,G,C,.,.,.
89998,chr11,47312428,.,C,T,.,.,.
89999,chr11,47312687,.,T,C,.,.,.


In [54]:
human_derived_noncoding_region_2['..'] = '.'
human_derived_noncoding_region_2['...'] = '.'
human_derived_noncoding_region_2['....'] = '.'
human_derived_noncoding_region_2.loc[-1] = ['chr11', '47312814', '.', 'G', 'A', '.', '.', '.']
human_derived_noncoding_region_2.index = human_derived_noncoding_region_2.index + 1  # shifting index
human_derived_noncoding_region_2 = human_derived_noncoding_region_2.sort_index()
human_derived_noncoding_region_2.columns = ['', '', '', '', '', '', '', '']
human_derived_noncoding_region_2

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,chr11,47312814,.,G,A,.,.,.
1,chr11,47312979,.,T,G,.,.,.
2,chr11,47313760,.,G,T,.,.,.
3,chr11,47313853,.,A,G,.,.,.
...,...,...,...,...,...,...,...,...
89996,chr12,65348795,.,C,A,.,.,.
89997,chr12,65348797,.,C,T,.,.,.
89998,chr12,65348836,.,C,T,.,.,.
89999,chr12,65348914,.,C,T,.,.,.


In [58]:
human_derived_noncoding_region_3['..'] = '.'
human_derived_noncoding_region_3['...'] = '.'
human_derived_noncoding_region_3['....'] = '.'
human_derived_noncoding_region_3.loc[-1] = ['chr12', '65348986', '.', 'C', 'A', '.', '.', '.']
human_derived_noncoding_region_3.index = human_derived_noncoding_region_3.index + 1  # shifting index
human_derived_noncoding_region_3 = human_derived_noncoding_region_3.sort_index()
human_derived_noncoding_region_3.columns = ['', '', '', '', '', '', '', '']

In [60]:
human_derived_noncoding_region_4['..'] = '.'
human_derived_noncoding_region_4['...'] = '.'
human_derived_noncoding_region_4['....'] = '.'
human_derived_noncoding_region_4.loc[-1] = ['chr15', '64116948', '.', 'G', 'A', '.', '.', '.']
human_derived_noncoding_region_4.index = human_derived_noncoding_region_4.index + 1  # shifting index
human_derived_noncoding_region_4 = human_derived_noncoding_region_4.sort_index()
human_derived_noncoding_region_4.columns = ['', '', '', '', '', '', '', '']

In [62]:
human_derived_noncoding_region_5['..'] = '.'
human_derived_noncoding_region_5['...'] = '.'
human_derived_noncoding_region_5['....'] = '.'
human_derived_noncoding_region_5.loc[-1] = ['chr17', '48835049', '.', 'G', 'C', '.', '.', '.']
human_derived_noncoding_region_5.index = human_derived_noncoding_region_5.index + 1  # shifting index
human_derived_noncoding_region_5 = human_derived_noncoding_region_5.sort_index()
human_derived_noncoding_region_5.columns = ['', '', '', '', '', '', '', '']

In [64]:
human_derived_noncoding_region_6['..'] = '.'
human_derived_noncoding_region_6['...'] = '.'
human_derived_noncoding_region_6['....'] = '.'
human_derived_noncoding_region_6.loc[-1] = ['chr20', '35887279', '.', 'C', 'T', '.', '.', '.']
human_derived_noncoding_region_6.index = human_derived_noncoding_region_6.index + 1  # shifting index
human_derived_noncoding_region_6 = human_derived_noncoding_region_6.sort_index()
human_derived_noncoding_region_6.columns = ['', '', '', '', '', '', '', '']

In [66]:
human_derived_noncoding_region_7['..'] = '.'
human_derived_noncoding_region_7['...'] = '.'
human_derived_noncoding_region_7['....'] = '.'
human_derived_noncoding_region_7.loc[-1] = ['chr22', '35418721', '.', 'A', 'T', '.', '.', '.']
human_derived_noncoding_region_7.index = human_derived_noncoding_region_7.index + 1  # shifting index
human_derived_noncoding_region_7 = human_derived_noncoding_region_7.sort_index()
human_derived_noncoding_region_7.columns = ['', '', '', '', '', '', '', '']

In [68]:
human_derived_noncoding_region_8['..'] = '.'
human_derived_noncoding_region_8['...'] = '.'
human_derived_noncoding_region_8['....'] = '.'
human_derived_noncoding_region_8.loc[-1] = ['chr3', '28600649', '.', 'C', 'T', '.', '.', '.']
human_derived_noncoding_region_8.index = human_derived_noncoding_region_8.index + 1  # shifting index
human_derived_noncoding_region_8 = human_derived_noncoding_region_8.sort_index()
human_derived_noncoding_region_8.columns = ['', '', '', '', '', '', '', '']

In [70]:
human_derived_noncoding_region_9['..'] = '.'
human_derived_noncoding_region_9['...'] = '.'
human_derived_noncoding_region_9['....'] = '.'
human_derived_noncoding_region_9.loc[-1] = ['chr5', '1377732', '.', 'G', 'A', '.', '.', '.']
human_derived_noncoding_region_9.index = human_derived_noncoding_region_9.index + 1  # shifting index
human_derived_noncoding_region_9 = human_derived_noncoding_region_9.sort_index()
human_derived_noncoding_region_9.columns = ['', '', '', '', '', '', '', '']

In [72]:
human_derived_noncoding_region_10['..'] = '.'
human_derived_noncoding_region_10['...'] = '.'
human_derived_noncoding_region_10['....'] = '.'
human_derived_noncoding_region_10.loc[-1] = ['chr7', '107252617', '.', 'A', 'T', '.', '.', '.']
human_derived_noncoding_region_10.index = human_derived_noncoding_region_10.index + 1  # shifting index
human_derived_noncoding_region_10 = human_derived_noncoding_region_10.sort_index()
human_derived_noncoding_region_10.columns = ['', '', '', '', '', '', '', '']

In [74]:
human_derived_noncoding_region_11['..'] = '.'
human_derived_noncoding_region_11['...'] = '.'
human_derived_noncoding_region_11['....'] = '.'
human_derived_noncoding_region_11.loc[-1] = ['chr8', '43782972', '.', 'A', 'G', '.', '.', '.']
human_derived_noncoding_region_11.index = human_derived_noncoding_region_11.index + 1  # shifting index
human_derived_noncoding_region_11 = human_derived_noncoding_region_11.sort_index()
human_derived_noncoding_region_11.columns = ['', '', '', '', '', '', '', '']

Now we want to combine these dataframes into one for ANNOVAR annotation and export them for ANNOVAR annotation:

In [75]:
human_derived_noncoding = pd.concat([human_derived_noncoding_region_1, human_derived_noncoding_region_2, human_derived_noncoding_region_3,
                                    human_derived_noncoding_region_4, human_derived_noncoding_region_5, human_derived_noncoding_region_6,
                                    human_derived_noncoding_region_7, human_derived_noncoding_region_8, human_derived_noncoding_region_9,
                                    human_derived_noncoding_region_10, human_derived_noncoding_region_11])
human_derived_noncoding

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,chr10,1000013,.,G,A,.,.,.
1,chr10,100020652,.,G,A,.,.,.
2,chr10,1000297,.,T,G,.,.,.
3,chr10,1000555,.,A,T,.,.,.
...,...,...,...,...,...,...,...,...
66378,chrX,9931817,.,T,C,.,.,.
66379,chrX,9931818,.,G,A,.,.,.
66380,chrX,9931993,.,T,C,.,.,.
66381,chrX,9932000,.,C,T,.,.,.


In [76]:
header = """##fileformat=VCFv4.1
##fileDate=20090805
##source=myImputationProgramV3.1
##reference=file:///seq/references/
#CHROM POS ID REF ALT QUAL FILTER INFO
"""
output_VCF = "human_derived_noncoding_for_annovar.vcf"
with open(output_VCF, 'w') as vcf:
    vcf.write(header)
human_derived_noncoding.to_csv(output_VCF, sep="\t", mode='a', index=False)

Below I have read in the ANNOVAR annotations using a chunking method, which takes a small sample of files at a time and then groups them together, using less memory than if read all at once using pandas:

In [3]:
my_list = []
for chunk in pd.read_csv('human_derived_noncoding_annotated.hg38_multianno.csv', chunksize = 5000, low_memory = False):
    my_list.append(chunk)
human_derived_noncoding_annotated_with_annovar = pd.concat(my_list, axis = 0)
del my_list
human_derived_noncoding_annotated_with_annovar

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,BP4,BP5,BP6,BP7,cosmic70,MCAP,REVEL,regsnp_fpr,regsnp_disease,regsnp_splicing_site
0,chr10,1000013,1000013,G,A,intronic,GTPBP4,.,.,.,...,.,.,.,.,.,.,.,.,.,.
1,chr10,100020652,100020652,G,A,intergenic,DNMBP;CPN1,dist=10699;dist=21656,.,.,...,.,.,.,.,.,.,.,.,.,.
2,chr10,1000297,1000297,T,G,intronic,GTPBP4,.,.,.,...,.,.,.,.,.,.,.,.,.,.
3,chr10,1000555,1000555,A,T,intronic,GTPBP4,.,.,.,...,.,.,.,.,.,.,.,0.482876712329,B,off
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
966378,chrX,9931817,9931817,T,C,intronic,SHROOM2,.,.,.,...,.,.,.,.,.,.,.,.,.,.
966379,chrX,9931818,9931818,G,A,intronic,SHROOM2,.,.,.,...,.,.,.,.,.,.,.,.,.,.
966380,chrX,9931993,9931993,T,C,intronic,SHROOM2,.,.,.,...,.,.,.,.,.,.,.,0.438356164384,B,off
966381,chrX,9932000,9932000,C,T,intronic,SHROOM2,.,.,.,...,.,.,.,.,.,.,.,0.794520547945,B,off


Below is the final dataframe for CADD annotations for noncoding human derived data:

In [9]:
human_derived_noncoding_annotated_with_cadd = pd.concat([human_derived_noncoding_annotated_with_cadd_concat_1, human_derived_noncoding_annotated_with_cadd_concat_2, human_derived_noncoding_annotated_with_cadd_concat_3])
human_derived_noncoding_annotated_with_cadd

Unnamed: 0,#Chrom,Pos,Ref,Alt,Type,Length,AnnoType,Consequence,ConsScore,ConsDetail,...,Freq10000bp,Rare10000bp,Sngl10000bp,EnsembleRegulatoryFeature,dbscSNV-ada_score,dbscSNV-rf_score,RemapOverlapTF,RemapOverlapCL,RawScore,PHRED
0,1,926666,G,A,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,...,42.0,108.0,2217.0,Promoter,,,13.0,24.0,-0.115096,0.918
1,1,926666,G,A,SNV,0,Transcript,INTRONIC,2,intron,...,42.0,108.0,2217.0,Promoter,,,13.0,24.0,-0.115096,0.918
2,1,926854,C,G,SNV,0,RegulatoryFeature,REGULATORY,4,regulatory,...,41.0,109.0,2203.0,Promoter,,,10.0,20.0,0.158437,3.522
3,1,926854,C,G,SNV,0,Transcript,INTRONIC,2,intron,...,41.0,109.0,2203.0,Promoter,,,10.0,20.0,0.158437,3.522
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101318,X,155939495,G,C,SNV,0,Transcript,INTRONIC,2,intron,...,9.0,73.0,1553.0,,,,1.0,1.0,-0.315127,0.234
101319,X,155939592,G,A,SNV,0,Transcript,INTRONIC,2,intron,...,9.0,73.0,1545.0,,,,,,0.107893,2.886
101320,X,155939607,T,C,SNV,0,Transcript,INTRONIC,2,intron,...,9.0,73.0,1545.0,,,,,,-0.515402,0.047
101321,X,155939608,G,A,SNV,0,Transcript,INTRONIC,2,intron,...,9.0,73.0,1545.0,,,,,,-0.208316,0.509
