In [1]:
import pandas as pd
import numpy as np
import os
import time

In [20]:
def locateVar(vcf_file_org, vcf_file_new):
    variants = {}
    headers = []
    f = open(vcf_file_org)
    for line in f:
        if line[0] == '#':
            headers.append(line)
            continue
        v = line.strip().split('\t')
        chrom, seq_start, seq_end = v[0].split('_')
        #print chrom, seq_start, seq_end, v[1]
        pos = int(v[1]) + int(seq_start) -1
        name = chrom + '_' + str(pos) + '_' + v[3] + '_' + v[4]
        variants[name] = chrom + '\t' + str(pos) + '\t' + '\t'.join(v[2:])
    f.close()
    
    print 'Totally report %d SNVs.' % len(variants)
    
    with open(vcf_file_new, 'w') as fout:
        for headline in headers:
            fout.write(headline)
        for key in sorted(variants):
            fout.write(''.join(variants[key])+'\n')
    fout.close()
    var_dict_tmp = {'Chrom':[], 'Pos':[], 'Ref':[], 'Alt':[]}
    for name in variants:
        v = name.split('_')
        var_dict_tmp['Chrom'].append(v[0])
        var_dict_tmp['Pos'].append(int(v[1]))
        var_dict_tmp['Ref'].append(v[2])
        var_dict_tmp['Alt'].append(v[3])
    variantsDF = pd.DataFrame(var_dict_tmp, columns=['Chrom', 'Pos', 'Ref', 'Alt'])
    variantsDF.index = variantsDF['Chrom'] + '_' + variantsDF['Pos'].astype(str)
    return variantsDF

In [27]:
def annoteSNVID(vcf_file, variantsDF, output_vcf_file):
    N_var = variantsDF.shape[0]
    print 'Number of SNVs reported == ', N_var
       
    #load the dbSNP information related to the variants of the sample.
    variantsDF['dbSNP_ID'] = np.asarray(['nan'] * N_var)
    variantsDF['dbSNP_Ref'] = np.asarray(['nan'] * N_var)
    variantsDF['dbSNP_Alt'] = np.asarray(['nan'] * N_var)
    start_time_dbSNP = time.time()
    
    fd = open(vcf_file)

    
#    with futures.ThreadPoolExecutor(max_workers=5) as executor:
#        future_vcf = {executor.submit(searchID, variantsDF, vcf_line): vcf_line for vcf_line in lines}
#        for future in futures.as_completed(future_vcf):
#            vcf = future_vcf[future]
#            try:
#                resultsDF = future.result()
#            except Exception as exc:
#                print 'Exception generated:',exc
           
    
    
#    bar = progressbar.ProgressBar()
#    for line in bar(lines):
    for line in fd:
        if line[0] == '#':
            continue
        v = line.split('\t')
        chrom = 'chr' + v[0]
        name = chrom + '_' + v[1]
        if name in variantsDF.index:
            variantsDF.dbSNP_ID[name] = v[2]
            variantsDF.dbSNP_Ref[name] = v[3]
            variantsDF.dbSNP_Alt[name] = v[4]     
    fd.close()
    end_time_dbSNP = time.time()
    print (end_time_dbSNP - start_time_dbSNP) / 60.0
    variantsDF = variantsDF.sort_index()
    
    variantsDF.to_csv(output_vcf_file, sep = '\t', columns = ['Chrom', 'Pos', 'Ref', 'Alt', 'dbSNP_ID', 'dbSNP_Ref',
                                                             'dbSNP_Alt'], 
                      header = True, index = False)
    
    return variantsDF

In [37]:
def checkSNVAnnotation(variantsDF, output_vcf_masked, output_vcf_unmasked):
    index_mask = []
    index_unmask = []
    for idx in variantsDF.index:
        if not variantsDF.loc[idx,'dbSNP_ID'] == 'nan':
            if variantsDF.loc[idx, 'Ref'] == variantsDF.loc[idx, 'dbSNP_Ref'] and variantsDF.loc[idx, 'Alt'] in variantsDF.loc[idx,'dbSNP_Alt']:
                index_mask.append(idx)
            else:
                index_unmask.append(idx)
        else:
            index_unmask.append(idx)
    
    variantsDF_masked = variantsDF.loc[index_mask]
    variantsDF_unmasked = variantsDF.loc[index_unmask]
    print 'Number of SNVs called with dbSNP/clinVar IDs == ',variantsDF_masked.shape[0]
    print 'Number of SNVs called with no reference IDs == ', variantsDF_unmasked.shape[0]
    
    variantsDF_masked.to_csv(output_vcf_masked, sep='\t',  columns = ['Chrom', 'Pos', 'Ref', 'Alt', 'dbSNP_ID', 'dbSNP_Ref',
                                                             'dbSNP_Alt'], header = True, index = False)
    variantsDF_unmasked.to_csv(output_vcf_unmasked, sep='\t',  columns = ['Chrom', 'Pos', 'Ref', 'Alt', 'dbSNP_ID', 'dbSNP_Ref',
                                                             'dbSNP_Alt'], header = True, index = False)

In [31]:
def main():
    vcf_source = '/home/yaneng/RSun/Data/qiagen-colon/variance_called/'
    vcf_file_org = vcf_source + 'QIANGEN-2959YJ_S2_L001_var.vcf'
    vcf_file_new = vcf_source + 'QIANGEN-2959YJ_S2_L001_var_new.vcf'
    #Modify the locations of SNVs from relative positions to genomic positions.
    variants = locateVar(vcf_file_org, vcf_file_new)
    
    ref_source = '/home/yaneng/RSun/Data/Hg19/dbSNP/'
    vcf_file = ref_source + 'QIAGEN-COLON-SNV.vcf'
    output_vcf_file = vcf_source + 'QIANGEN-2959YJ_S2_L001_var_annoted.vcf'
    #Correspond the identified SNVs to the dbSNP and clinVar IDs.
    variants = annoteSNVID(vcf_file, variants, output_vcf_file)
    
    output_vcf_masked = vcf_source + 'QIANGEN-2959YJ_S2_L001_var_annoted_masked.vcf'
    output_vcf_unmasked = vcf_source + 'QIANGEN-2959YJ_S2_L001_var_annoted_unmasked.vcf'
    checkSNVAnnotation(variants, output_vcf_masked, output_vcf_unmasked)

In [38]:
if __name__ == '__main__':
    main()

Totally report 205 SNVs.
Number of SNVs reported ==  205


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0.216473714511
Number of SNVs called with dbSNP/clinVar IDs ==  151
Number of SNVs called with no reference IDs ==  54
