In [1]:
import json

with open('../data/results2.json') as f:
    data = json.load(f)

# `data` is a list with one dict per contig. According to the original notes,
# the attributes of a contig are:
#   'name', 'length', 'cs', 'ucs', 'depth', 'confidence'
#
#   name       -- string containing the id of the contig
#   length     -- integer length of the contig
#   cs         -- the core sequences; each entry has the attributes
#                 'name', 'position', 'length'
#   ucs        -- the unique core sequences; each entry has the attributes
#                 'name', 'position', 'length'
#   depth      -- integer: number of k-mers covering a given position in the contig
#   confidence -- computed float estimating a confidence probability for a
#                 given position in the contig
#
# Not every contig carries every attribute, which is why the cells below
# guard their lookups against KeyError.

# print(...) with a single argument behaves the same on Python 2 and Python 3
# (the bare `print x` statement is a SyntaxError on Python 3); list(...) keeps
# the displayed value a plain list on both.
print(list(data[0].keys()))

[u'cs', u'length', u'depth', u'confidence', u'name']


## Create formatted data structure

In [2]:
# Size used for the whole layout -- presumably the genome length in bases
# (TODO confirm the source of this number).
genomesize = 6264404

# Sum of the lengths of all contigs that report a 'length'; contigs without
# that key contribute nothing.
tot_len = sum(contig_rec['length'] for contig_rec in data if 'length' in contig_rec)

# Whatever part of `genomesize` is not covered by contigs is divided evenly
# over the number of contigs (all of them, including any without 'length').
# `//` keeps the result an integer on Python 3 as well as Python 2; a float
# here would propagate into every start/end coordinate computed later.
gap_bet_contigs = (genomesize - tot_len) // len(data)
print(gap_bet_contigs)

3009


In [3]:
# For every contig (same order as `data`): half of the contig length that is
# not accounted for by its core sequences, i.e. the offset that centres the
# block of core sequences inside the contig. Contigs missing 'length' or 'cs'
# (or whose core sequences lack 'length') get an offset of 0.
cs_start_pos = []

for contig_rec in data:
    try:
        uncovered = contig_rec['length'] - sum(
            seq_rec['length'] for seq_rec in contig_rec['cs']
        )
    except KeyError:
        cs_start_pos.append(0)
    else:
        cs_start_pos.append(float(uncovered) / 2)

In [4]:
# Build the three feature lists used for the tracks further down:
#   reference       -- one record per contig
#   core_seq        -- one record per core sequence ('cs') of each contig
#   unique_core_seq -- one record per unique core sequence ('ucs')
#
# `ref`, `cs` and `ucs` are running scratch dicts: they are overwritten for
# each item and a *copy* is appended to the matching list. Their 'end' value
# deliberately survives from one item to the next, because the next item's
# 'start' is derived from it (items are laid out one after another).
ref = {}
cs = {}
ucs = {}

reference = []
core_seq = []
unique_core_seq = []

for num_contig in range(len(data)):

    contig_rec = data[num_contig]

    # The 'ucs' records of a contig are emitted only once, while handling the
    # contig's first core sequence; this flag marks that it has been done.
    visit_flag = False

    try:

        ref['id'] = num_contig
        ref['name'] = num_contig
        ref['strand'] = -1

        if num_contig == 0:
            ref['start'] = 0
            ref['end'] = contig_rec['length']
        else:
            # ref['end'] still holds the previous contig's end (plus the gap
            # added at the bottom of the loop).
            ref['start'] = ref['end'] + 1
            ref['end'] = ref['start'] + contig_rec['length']

        cs_tot_len = 0

        for num_cs in range(len(contig_rec['cs'])):

            cs_rec = contig_rec['cs'][num_cs]

            cs['contig'] = num_contig
            cs['id'] = cs_rec['name']
            cs['name'] = cs_rec['name']
            cs['strand'] = -2

            if num_cs == 0 and num_contig == 0:
                cs['start'] = 0
                cs['end'] = cs_rec['length']
            else:
                cs['start'] = cs['end'] + 1
                cs['end'] = cs['start'] + cs_rec['length']

            # accumulate total cs length
            cs_tot_len += cs_rec['length']

            core_seq.append(cs.copy())

            if 'ucs' in contig_rec and not visit_flag:

                ucs_tot_len = 0

                for num_ucs in range(len(contig_rec['ucs'])):

                    ucs_rec = contig_rec['ucs'][num_ucs]

                    ucs['contig'] = num_contig
                    ucs['id'] = ucs_rec['name']
                    ucs['name'] = ucs_rec['name']
                    ucs['strand'] = -3

                    if num_ucs == 0:
                        # The first ucs of a contig starts where the contig's
                        # first cs starts.
                        ucs['start'] = cs['start']
                        ucs['end'] = ucs['start'] + ucs_rec['length']
                    else:
                        ucs['start'] = ucs['end'] + 1
                        ucs['end'] = ucs['start'] + ucs_rec['length']

                    # accumulate total ucs length
                    # (fixed: this used to add the length of the current *cs*
                    # entry instead of the current ucs entry)
                    ucs_tot_len += ucs_rec['length']

                    unique_core_seq.append(ucs.copy())

                # Skip the rest of the contig plus the inter-contig gap.
                ucs['end'] += (contig_rec['length'] - ucs_tot_len) + gap_bet_contigs

                visit_flag = True

        # Every cs record was already appended inside the loop; the former
        # extra append at this point duplicated the last one (or re-emitted a
        # stale record of an earlier contig when this contig's 'cs' list is
        # empty), so it has been removed.

        # Skip the part of the contig not covered by core sequences plus the
        # inter-contig gap, so the next contig's first cs starts after it.
        cs['end'] += (contig_rec['length'] - cs_tot_len) + gap_bet_contigs

        reference.append(ref.copy())
        ref['end'] += gap_bet_contigs

    except KeyError:

        # A contig lacking 'length' or 'cs' (or an entry lacking 'name' /
        # 'length') is not added to `reference`. Whatever had already been
        # written to ref / cs / ucs and to the lists before the error is kept,
        # so e.g. a contig with 'length' but no 'cs' still advances ref['end']
        # by its length.
        pass


In [5]:
# Centre the core sequences inside their contig: shift the start and end of
# every cs record by the offset computed for its contig in `cs_start_pos`.
# NOTE: this updates `core_seq` in place, so running the cell twice shifts
# the records twice.
for cs_entry in core_seq:
    shift = cs_start_pos[cs_entry['contig']]
    cs_entry['start'] += shift
    cs_entry['end'] += shift
    

In [14]:
# Number of core sequences of every contig that has a 'cs' entry (contigs
# without one are skipped, so positions do not line up with `data`).
# Note that only 'cs' is counted here, and that `cs` is rebound from the
# scratch dict used earlier to this list of counts.
cs = [len(contig_info['cs']) for contig_info in data if 'cs' in contig_info]

In [7]:
track_type = ['reference', 'core_sequence', 'unique_core_sequence']
total_tracks = [reference, core_seq, unique_core_seq]


def _make_track(track_name, track_items, inner_radius=120, outer_radius=160):
    """Return a new track-description dict.

    Every call builds an independent dict, so changing one track can never
    alter another one.

    track_name   -- value stored under 'trackName'
    track_items  -- list of feature records stored under 'items'
    inner_radius -- value stored under 'inner_radius'
    outer_radius -- value stored under 'outer_radius'
    """
    return {
        'trackName': track_name,
        'trackType': 'stranded',
        'visible': True,
        'inner_radius': inner_radius,
        'outer_radius': outer_radius,
        'trackFeatures': 'complex',
        'featureThreshold': 7000000,
        'mouseclick': 'islandPopup',
        'mouseover_callback': 'islandPopup',
        'mouseout_callback': 'islandPopupClear',
        # The original assigned 'linearPopup' first and then overwrote it with
        # 'linearClick'; only the effective value is kept.
        'linear_mouseclick': 'linearClick',
        'showLabels': True,
        'showTooltip': True,
        'items': track_items,
    }


contigs = []

for track_name, track_items in zip(track_type, total_tracks):
    contigs.append(_make_track(track_name, track_items))

## add another reference
# Built as its own dict. Previously the dict of the last loop iteration was
# modified and appended a second time, which replaced the
# 'unique_core_sequence' track by a second copy of this 'contig' track.
contig = _make_track('contig', reference, inner_radius=30, outer_radius=480)

contigs.append(contig)

In [8]:
# Write the list of track descriptions to disk as JSON (3-space indent).
out_path = '../data/contig.data.json'
with open(out_path, 'w') as out_file:
    json.dump(contigs, out_file, indent=3)

In [17]:
# Show the per-contig reference records built above (one dict per contig kept).
reference

[{'end': 109335, 'id': 0, 'name': 0, 'start': 0, 'strand': -1},
 {'end': 270658, 'id': 1, 'name': 1, 'start': 112345, 'strand': -1},
 {'end': 279078, 'id': 2, 'name': 2, 'start': 273668, 'strand': -1},
 {'end': 285257, 'id': 3, 'name': 3, 'start': 282088, 'strand': -1},
 {'end': 384486, 'id': 4, 'name': 4, 'start': 288267, 'strand': -1},
 {'end': 589481, 'id': 5, 'name': 5, 'start': 387496, 'strand': -1},
 {'end': 1441503, 'id': 6, 'name': 6, 'start': 592491, 'strand': -1},
 {'end': 1446825, 'id': 7, 'name': 7, 'start': 1444513, 'strand': -1},
 {'end': 1700814, 'id': 8, 'name': 8, 'start': 1449835, 'strand': -1},
 {'end': 2079173, 'id': 9, 'name': 9, 'start': 1703824, 'strand': -1},
 {'end': 2311735, 'id': 10, 'name': 10, 'start': 2082183, 'strand': -1},
 {'end': 2522961, 'id': 12, 'name': 12, 'start': 2342489, 'strand': -1},
 {'end': 2543239, 'id': 13, 'name': 13, 'start': 2525971, 'strand': -1},
 {'end': 2613305, 'id': 15, 'name': 15, 'start': 2547283, 'strand': -1},
 {'end': 2710714

In [15]:
# Number of contigs that have a 'cs' entry (`cs` is the list of per-contig counts).
len(cs)

111

In [16]:
# Show the number of core sequences per contig (contigs without 'cs' are not listed).
cs

[1168,
 1906,
 7,
 17,
 17,
 1666,
 7467,
 4,
 3047,
 4556,
 2812,
 2166,
 18,
 647,
 446,
 4434,
 2193,
 6,
 2373,
 1,
 1399,
 17,
 297,
 2422,
 1408,
 3329,
 42,
 1,
 1018,
 10,
 1019,
 682,
 3068,
 1,
 13,
 609,
 2,
 12,
 1,
 357,
 13,
 170,
 1,
 3,
 504,
 5,
 7,
 5,
 1,
 202,
 1,
 6,
 1,
 2,
 9,
 66,
 46,
 1,
 2,
 1,
 14,
 3,
 220,
 11,
 2,
 3,
 15,
 7,
 5,
 1,
 1,
 9,
 4,
 1,
 1,
 4,
 10,
 7,
 55,
 2,
 30,
 9,
 7,
 1,
 2,
 1,
 1,
 16,
 2,
 13,
 19,
 1,
 1,
 1,
 13,
 2,
 2,
 1,
 11,
 2,
 5,
 5,
 2,
 2,
 3,
 1,
 1,
 6,
 2,
 1,
 28]