In [1]:
import re
import pandas as pd
import ast
import sys
import json
#import jinja2
import argparse
import os
import numpy as np

#%reset_selective karyotype

def print_progress_bar(i, max):
    '''
    Draw a one-line textual progress bar on stdout, overwriting the current line.

    i   = current iteration (0-based)
    max = index of the last iteration, i.e. number of iterations - 1
          (otherwise the bar stops at 99%)

    50 and 100 are arbitrary: 50 is the width of the bar in the terminal, 100 turns
    the counter into a percentage.
    The block using this function should be followed by a print("\\n") to skip to the next line.
    '''
    if max > 0:
        filled = int(i * 50 / max)
        percent = int(100 * i / max)
    else:
        # A loop with a single iteration calls this with max == 0; show the bar
        # as complete instead of dividing by zero.
        filled, percent = 50, 100
    sys.stdout.write("\r")
    sys.stdout.write("[%-50s]%d%%" % ('=' * filled, percent))
    sys.stdout.flush()

#https://www.geeksforgeeks.org/python-convert-key-value-string-to-dictionary/

def htmlTodataframe(file):
    '''
    Read the data records of a karyotype .js file into a dataframe.

    file = path of the karyotype file (e.g. karyotype.js)

    Every line containing 'chr' is treated as one record written as
    {key: value, key: value, ...}. Braces and double quotes are dropped and the
    line is split on ', ' and then on the first ':' of each piece.
    Returns a dataframe with one row per record and one column per key (all values
    are strings); it is empty when no line matches.
    Prints an error and exits with status 1 when the file does not exist.
    '''
    if not os.path.isfile(file):
        print("ERROR "+file+" does not exist !!!")
        sys.exit(1)

    print("[#INFO] Config file: " + os.path.realpath(file))

    records = []
    with open(file) as f:
        for line in f:
            if 'chr' not in line:
                continue
            # Drop surrounding whitespace and the trailing record separator, whatever
            # the line ending is, instead of blindly cutting the last two characters.
            cleaned = line.strip().rstrip(',').replace('{', '').replace('}', '')
            record = dict(
                map(str.strip, sub.split(':', 1))
                for sub in cleaned.split(', ')
                if ':' in sub
            )
            records.append({k: v.replace('"', '') for k, v in record.items()})

    colname = list(records[0].keys()) if records else []
    print("[#INFO] Config values: ", colname)

    # Build the frame in one call: DataFrame.append was removed in pandas 2.0 and
    # re-copied the whole frame for every row.
    return pd.DataFrame(records)
   


def dataframeTohtml(outputjs, name, dictargs, boolD, df):
    '''
    Write a JavaScript file assigning [name, dictargs(, records)] to a variable.

    outputjs = name of the output file, extension js
    name = name of the instance in javascript; it is declared with 'var' unless it is 'ARC_hg19'
    dictargs = parameters of the instance, for example size of Radius, thickness etc
    boolD = when truthy, the rows of df are appended as a list of records
    df = dataframe to export (only read when boolD is truthy)

    The file content is: <name> = <JSON array>;
    '''
    tohtml = [name, dictargs.copy()]
    if boolD:
        tohtml.append(df.to_dict('records'))

    js_name = name if name == 'ARC_hg19' else 'var ' + name

    # The context manager closes the file even if serialisation or the write fails.
    with open(outputjs, 'w') as jsonFile:
        jsonFile.write(js_name + " = " + json.dumps(tohtml) + ';')
            

def createMain(dictargs, wanted, mainfile='output.html'):
    '''
    Write the HTML page that instantiates and draws the BioCircos figure.

    dictargs = settings of the BACKGROUND01 track, embedded in the page as a JavaScript object
    wanted = names of the chromosomes to keep ("1".."22", "X", "Y"); the hg19 length table
             below is filtered on membership in this collection
    mainfile = path of the HTML file to write (default: 'output.html' in the working directory)

    The page loads the libraries from ../lib/ and test.js / SNP04_test.js from ../config_test/.
    Python values are serialised with json.dumps so that the text written into the page is
    JavaScript: str() writes Python reprs instead (True/None, or np.str_('13') for NumPy 2 scalars).
    '''
    m1 = """
    <!DOCTYPE html>
    <html>
        <head>
            <meta charset="utf-8">
            <title>BioCircos.js</title>
        </head>
        <body>
            <!-- BioCircos.js, Jquery.js and D3.js import -->
    """
    jsPath_lib = '../lib/'
    jsFile = ['jquery.js', 'd3.js', 'biocircos-1.1.0.js']

    # The template starts with a single space before its first newline.
    m2 = ' ' + '''
            <!-- Prepare a <div> tag with "biocircos" id to set the picture position your will draw in html -->
            <div id="biocircos"></div>
    <!-- Data configuration -->
    '''
    jsPath_data = '../config_test/'
    jsFile_own = ['test.js', 'SNP04_test.js']

    m3 = '''
    </script>
    <!-- Genome configuration -->
    <script>
    '''

    # Chromosome name and length pairs.
    genome = [
        ["1", 249250621], ["2", 243199373], ["3", 198022430], ["4", 191154276],
        ["5", 180915260], ["6", 171115067], ["7", 159138663], ["8", 146364022],
        ["9", 141213431], ["10", 135534747], ["11", 135006516], ["12", 133851895],
        ["13", 115169878], ["14", 107349540], ["15", 102531392], ["16", 90354753],
        ["17", 81195210], ["18", 78077248], ["19", 59128983], ["20", 63025520],
        ["21", 48129895], ["22", 51304566], ["X", 155270560], ["Y", 59373566],
    ]

    # Plain-Python filtering keeps names as str and lengths as int, so no numpy
    # scalar ever reaches the serialised output.
    tokeep = [[chrom, length] for chrom, length in genome if chrom in wanted]
    varBiocircos = 'var BioCircosGenome = ' + json.dumps(tokeep)

    BioCircos = '''
      BioCircos01 = new BioCircos(ARC_hg19,BACKGROUND01,SNP04_test,BioCircosGenome,{
      //Main configuration
         target : "biocircos",
         svgWidth : 900,
         svgHeight : 600,
         chrPad : 0.04,
         innerRadius: 246,
         outerRadius: 270,
         SNPMouseOverDisplay : true,
         SNPMouseOverColor : "red",
         SNPMouseOverCircleSize : 5,
         SNPMouseOverCircleOpacity : 1.0,
         SNPMouseOverCircleStrokeColor : "#F26223",
         SNPMouseOverCircleStrokeWidth : 3,
         SNPMouseOverTooltipsHtml01 : "chr : ",
         SNPMouseOverTooltipsHtml02 : "<br>position : ",
         SNPMouseOverTooltipsHtml03 : "<br>-log10(Pvalue) : ",
         SNPMouseOverTooltipsHtml04 : "<br>rsid : ",
         SNPMouseOverTooltipsHtml05 : "",
         SNPMouseOverTooltipsBorderWidth : 1,
         SNPMouseOutDisplay : true,
         SNPMouseOutAnimationTime : 700,
         SNPMouseOutColor : "none",
         SNPMouseOutCircleSize : "none",
         SNPMouseOutCircleOpacity : 1.0,
         SNPMouseOutCircleStrokeWidth : 0,
      });
      BioCircos01.draw_genome(BioCircos01.genomeLength);
    '''

    end = '''
    </script>
        <body style="background-color:rgb(176, 196, 222);">
        </body>
    </html>
    '''

    parts = [m1]
    # Library scripts
    parts.extend('\t\t<script src="' + jsPath_lib + elem + '"></script>\n' for elem in jsFile)
    parts.append(m2)
    # Data/config scripts (test.js and SNP04_test.js)
    parts.extend('<script src="' + jsPath_data + elem + '"></script>\n' for elem in jsFile_own)
    # Inline script: background track settings, then (after m3 closes and reopens
    # the script tag) the genome table and the BioCircos instantiation.
    parts.append("<script>")
    parts.append("\nvar BACKGROUND01 = ['BACKGROUND01', " + json.dumps(dictargs) + '];')
    parts.append(m3)
    parts.append(varBiocircos)
    parts.append(BioCircos)
    parts.append(end)

    # The context manager closes the page even if the write fails.
    with open(mainfile, 'w') as f:
        f.write(''.join(parts))

def selectData(df, chr):
    """Return the rows of df whose 'chr' value is one of the entries of chr.

    The requested chromosomes are echoed on stdout before filtering.
    """
    print("chr = ", chr)
    is_wanted = df['chr'].isin(chr)
    return df.loc[is_wanted]
    
        
def main():
    """Build the karyotype track file and the HTML page for chromosomes 13 and 17.

    Reads karyotype.human.hg19.js, keeps the selected chromosomes, writes them to
    test.js under the name ARC_hg19 and then writes the main HTML page.
    """
    # Karyotype track: input file, output file and JavaScript instance name.
    karyotype = 'karyotype.human.hg19.js'
    outputjs, name = 'test.js', 'ARC_hg19'

    # SNP track settings; defined here but not passed to any of the calls below.
    SNP = 'SNP04_gwascatalog.js'
    configjs, name_config = 'SNP04_test.js', 'SNP04_test'
    dictargs_config = dict(
        maxRadius=205,
        minRadius=153,
        SNPFillColor="#9400D3",
        PointType="circle",
        circleSize=2,
        rectWidth=2,
        rectHeight=2,
        displaySNPAxis="false",
        SNPAxisColor="#B8B8B8",
        SNPAxisWidth=0.5,
    )

    # Background track settings, shared by the track file and the HTML page.
    dictargs = dict(
        BginnerRadius=205,
        BgouterRadius=153,
        BgFillColor="#F2F2F2",
        BgborderColor="#000",
        BgborderSize=0.3,
    )

    chr = ['13', '17']
    karyotype_df = htmlTodataframe(karyotype)
    config = selectData(karyotype_df, chr)
    print(config)
    dataframeTohtml(outputjs, name, dictargs, boolD=True, df=config)
    createMain(dictargs, chr)
    
def myoptions(argv=None):
    '''
    Parse the command-line options.

    argv = list of arguments to parse; None (default) means sys.argv[1:]

    Returns the argparse namespace; its 'chr' attribute is the list of chromosome
    names given after -c/--chr (empty list when the option is absent).
    '''
    parser = argparse.ArgumentParser()
    # nargs='*' collects every value after -c into a list of strings. type=list
    # would instead split a single value into its characters.
    parser.add_argument("-c", "--chr", nargs='*', default=[], help = "list of wanted chr to disp", dest = 'chr')
    # parse_known_args ignores arguments that are not ours, so unrelated entries
    # already present in sys.argv (e.g. from a notebook kernel launcher) do not
    # abort the program.
    args, _unknown = parser.parse_known_args(argv)
    return args
    
# Script entry point: only the command-line option helper is invoked here;
# the call to main() is left commented out.
if __name__ == '__main__':
    args = myoptions()
    #main()

    #The SNP file must have the same length as the genome file
#dataframeTohtml(output=output, name=name, dictargs=dictargs, dataframe=False)

In [3]:
import mimetypes
import time
import os 
import re
import pandas as pd 

# Path of the VCF file that is passed to vcfTodataframe further down in the notebook.
file = 'TEM195660.final.vcf'

def assertions(path):
    '''
    Check that path is an existing file and return the matching info message.

    Raises AssertionError when the file is missing. The exception is raised
    explicitly rather than with an assert statement, so the check is still
    performed when Python runs with -O (which strips assert statements).
    '''
    if not os.path.isfile(path):
        raise AssertionError(path+" does'nt exist.")
    return "[#INFO] File: "+path
    
def vcfTodataframe(file, rheader=False):
    '''
    Parse a .vcf file into a dataframe (home-made parser, no external VCF library).

    file = path of the VCF file; only the '.vcf' extension is handled
    rheader = when True, also return the header lines

    Lines starting with '#' form the header; the last of them gives the column
    names. Every other non-blank line becomes one row of tab-separated string fields.

    Returns the dataframe, or (dataframe, header) when rheader is True, header being
    the list of stripped '#' lines.
    Raises ValueError for an unsupported extension, for a file without any '#' line,
    or for a row whose number of fields differs from the number of columns.
    '''
    print(assertions(file))
    _, extension = os.path.splitext(file)
    if extension != '.vcf':
        # TSV input is not implemented; fail with a clear message instead of the
        # IndexError that an empty header list would cause further down.
        raise ValueError("Unsupported extension '" + extension + "' for " + file + ": only .vcf is handled")

    header = []
    variants = []
    with open(file) as f:
        for lineno, line in enumerate(f, start=1):
            if line.startswith('#'):
                header.append(line.strip())
            elif line.strip():
                # Blank lines are skipped: they carry no variant.
                variants.append((lineno, line.strip().split('\t')))

    if not header:
        raise ValueError(file + " has no '#' header line to take the column names from")

    print("[#INFO]", header[-1])
    col = header[-1].split('\t')

    for lineno, fields in variants:
        if len(fields) != len(col):
            raise ValueError(
                file + " line " + str(lineno) + ": expected " + str(len(col))
                + " fields, found " + str(len(fields))
            )

    # Build the frame in one call: DataFrame.append was removed in pandas 2.0 and
    # re-copied the whole frame for every variant.
    print("[#INFO] Whole VCF to Dataframe")
    dfVar = pd.DataFrame([fields for _, fields in variants], columns=col)

    if rheader:
        return dfVar, header
    return dfVar

#vcfTodataframe(file)

In [2]:
import mimetypes
import time
import os 
import re
import pandas as pd
    
def parseInfoField(dfVar):
    '''
    Split the INFO column of a VCF dataframe into one column per INFO key.

    dfVar = dataframe with an 'INFO' column of ';'-separated entries

    Each entry is split on its first '=' only, so values that themselves contain
    '=' are kept whole. Entries without '=' are flags and are stored as True.
    Empty entries and the '.' placeholder are ignored.

    Returns a dataframe with one row per row of dfVar (in the same order, with a
    fresh 0..n-1 index) and the INFO keys as columns in sorted order; a key that is
    absent from a row is left missing (NaN).
    '''
    print("[#INFO] Parsing INFO field")
    records = []
    for info in dfVar['INFO']:
        record = {}
        for entry in info.split(';'):
            if entry in ('', '.'):
                continue
            key, sep, value = entry.partition('=')
            record[key] = value if sep else True
        records.append(record)

    # One constructor call replaces the per-row DataFrame.append, which was removed
    # in pandas 2.0 and re-copied the whole frame for every row.
    print("[#INFO] From INFO field to Dataframe")
    headers = sorted({key for record in records for key in record})
    dfInfo = pd.DataFrame(records, columns=headers)

    return dfInfo

# Parse the INFO column of the VCF named by `file`. vcfTodataframe must already be
# defined in the kernel when this line runs, otherwise it fails with a NameError.
DF = parseInfoField(vcfTodataframe(file))


NameError: name 'vcfTodataframe' is not defined

In [None]:
import mimetypes
import time
import os 
import re
import pandas as pd

def parseSampleField(dfVar, sample='TEM195660'):
    '''
    Split the per-sample column of a VCF dataframe according to its FORMAT column.

    dfVar = dataframe with a 'FORMAT' column and a sample column, both ':'-separated
    sample = name of the sample column to parse (default 'TEM195660')

    For every row the FORMAT keys are paired, in order, with the sample values.
    A sample with fewer values than FORMAT keys is accepted and the missing trailing
    keys are left empty (NaN) for that row.

    Returns a dataframe with one row per row of dfVar (fresh 0..n-1 index) and one
    column per FORMAT key, in order of first appearance.
    Raises ValueError when a sample has more values than its FORMAT has keys.
    '''
    print("[#INFO] Parsing FORMAT field")
    dico = []
    for fmt, values in zip(dfVar['FORMAT'], dfVar[sample]):
        keys = fmt.split(':')
        fields = values.split(':')
        if len(fields) > len(keys):
            raise ValueError(
                "Sample '" + values + "' has more values than FORMAT '" + fmt + "' has keys"
            )
        # zip stops at the shorter sequence, which is what handles dropped trailing fields.
        dico.append(dict(zip(keys, fields)))

    dfSample = pd.DataFrame(dico)
    return dfSample



In [4]:
# Plotting libraries and the seaborn 'penguins' example dataset.
import seaborn as sns
# NOTE(review): `df` is not referenced by any active code in the visible part of
# this cell; every line after the imports is commented out.
df = sns.load_dataset('penguins')
import matplotlib.pyplot as plt
#print(df.head())
#print(type(df))
#
#inf = parseSampleField(vcfTodataframe(file))
#whole = vcfTodataframe(file)
#
#result = pd.concat([whole, inf], axis=1)
#result = result.drop(['FORMAT', 'TEM195660'], axis=1)
##dfparsed = dfw.loc['#CHROM', 'AD', 'DP', 'PL']
#dfparsed = result.loc[:, ['DP', 'VAF', '#CHROM']]
##print(dfparsed.tail())
#dfparsed['DP'] = dfparsed['DP'].astype(float)
#dfparsed['VAF'] = dfparsed['VAF'].astype(float)
#
#print("longueur:", len(dfparsed.index))
##print(type(dfparsed))
##sns.pairplot(dfparsed, hue='#CHROM')

#my_list = DF.columns.values.tolist()
#print(my_list)