In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the pre-aggregated tool counts: one row per (Index, Seeding, Pairwise)
# combination, with the number of tools in column `n`.
# NOTE(review): the preview shows an 'Unnamed: 0' column — presumably a saved
# row index; it is left in place here.
agg = pd.read_csv("../data/aggregated_tools_new.csv")
agg.head(5)

Unnamed: 0,Index,Seeding,Pairwise,n
0,BWT-FM,NNN,Banded Myers Bit Vector,1
1,BWT-FM,NNN,HD,1
2,BWT-FM,NNN,NW,1
3,BWT-FM,NNN,SW,2
4,BWT-FM,NNN,SW & NW,1


In [2]:
# Rename the columns to the flow-table names used by the Sankey cells below
# (source / type / target / value), then drop rows with any missing field.
agg = (
    agg
    .rename(columns={
        "Index": "source",
        "Seeding": "type",
        "Pairwise": "target",
        "n": "value",
    })
    .dropna()
)
agg

Unnamed: 0,source,type,target,value
0,BWT-FM,NNN,Banded Myers Bit Vector,1
1,BWT-FM,NNN,HD,1
2,BWT-FM,NNN,NW,1
3,BWT-FM,NNN,SW,2
4,BWT-FM,NNN,SW & NW,1
5,BWT-FM,YNN,HD,7
6,BWT-FM,YNN,Non-DP Heuristic,2
7,BWT-FM,YNN,SW,4
8,BWT-FM,YNN,SW & NW,4
9,BWT-FM,YNN,SW & Semi-Global,1


In [3]:
from ipysankeywidget import SankeyWidget
from floweaver import *

# Canvas size in pixels; reused by every to_widget() call in the cells below.
size = {'width': 1200, 'height': 600}

# The two node layers of the diagram: indexing technique on one side,
# pairwise-alignment method on the other. Each ProcessGroup lists the raw
# values that may appear in the `source` / `target` columns of `agg`.
nodes = dict(
    index=ProcessGroup([
        'Hashing',
        'BWT-FM',
        'BWT-ST',
        'Suffix Array',
        'Suffix Tree',
    ]),
    pairwise=ProcessGroup([
        'Banded Myers Bit Vector',
        'HD',
        'NW',
        'SW',
        'SW & NW',
        'SW & Semi-Global',
        'Non-DP Heuristic',
        'Landau-Vishkin',
        'Rabin-Karp Algorithm',
        'Sparse DP',
        'Semi-Global',
    ]),
)

# Disabled alternative: the same layers with long-form pairwise labels
# ('Local', 'Global', 'Hamming Distance', ...) instead of the abbreviations.
#nodes = {
#    'index': ProcessGroup(['Hashing', 'BWT-FM', 'BWT-ST', 'Suffix Array', 'Suffix Tree']),
#    'pairwise': ProcessGroup(['Banded Myers Bit Vector', 'Based on the Wrapper, Local and Global', 'Global', 'Local',
#                             'Hamming Distance', 'Local & Global', 'Local & Semi-Global', 'Non-DP Heuristic',
#                             'Hamming, Local, and Global Alignment', 'Landau-Vishkin', 'Rabin-Karp Algorithm',
#                             'Sparse DP', 'Semi-Global']),
#}

# Layer order: index nodes first, pairwise nodes second.
ordering = [['index'], ['pairwise']]

# A single bundle carrying every flow from the index layer to the pairwise layer.
bundles = [Bundle('index', 'pairwise')]

sdd = SankeyDefinition(nodes, bundles, ordering)

# Last expression of the cell: the rendered widget.
weave(sdd, agg).to_widget(
    margins={'top': 0, 'bottom': 0, 'left': 50, 'right': 100},
    **size,
)

SankeyWidget(layout=Layout(height='600', width='1200'), links=[{'source': 'index^*', 'target': 'pairwise^*', '…

In [4]:
# Split the index layer into three visible nodes; the three rarer
# suffix-based structures are merged under one 'Other Suffix' label.
index_part = Partition.Simple('process', [
    'Hashing',
    'BWT-FM',
    ('Other Suffix', ['BWT-ST', 'Suffix Array', 'Suffix Tree']),
])
nodes['index'].partition = index_part

# Split the pairwise layer: abbreviations get readable labels, and the
# remaining methods are merged into 'Other DP' and 'Multiple Methods'.
pairwise_part = Partition.Simple('process', [
    ('Smith-Waterman', ['SW']),
    ('Hamming Distance', ['HD']),
    ('Needleman-Wunsch', ['NW']),
    ('Other DP', [
        'Rabin-Karp Algorithm',
        'Landau-Vishkin',
        'Sparse DP',
        'Banded Myers Bit Vector',
        'Semi-Global',
    ]),
    'Non-DP Heuristic',
    ('Multiple Methods', ['SW & NW', 'SW & Semi-Global']),
])
nodes['pairwise'].partition = pairwise_part

# Disabled alternative: the same partition written against long-form labels.
#pairwise_part = Partition.Simple('process', [
#    'Local',
#    'Hamming Distance',
#    'Global',
#    'Non-DP Heuristic',
#    ('Multiple Methods', ['Based on the Wrapper, Local and Global', 'Local & Global', 'Local & Semi-Global', 
#                          'Hamming, Local, and Global Alignment']),
#    ('Other DP', ['Rabin-Karp Algorithm', 'Landau-Vishkin', 'Sparse DP', 
#                          'Banded Myers Bit Vector', 'Semi-Global'])
#])

# `sdd` holds the same `nodes` dict, so re-weaving picks up the partitions
# that were just attached.
weave(sdd, agg).to_widget(
    margins={'top': 0, 'bottom': 0, 'left': 80, 'right': 160},
    **size,
)

SankeyWidget(groups=[{'id': 'index', 'type': 'process', 'title': '', 'nodes': ['index^Hashing', 'index^BWT-FM'…

In [5]:
# Partition of the `type` (seeding) column into readable categories.
# Judging by the labels, the three-letter codes read as Y/N flags for
# (fixed-length seed?, spaced seed?, chain of seeds?) — TODO confirm against
# the script that produced aggregated_tools_new.csv.
# Not used in this cell; the seeding Waypoint in a later cell consumes it.
part_by_seeding = Partition.Simple('type', [
    ('Fixed Length Seeds', ['YNN']),
    ('Fixed Chain of Seeds', ['YNY']),
    ('Fixed Spaced Seeds', ['YYN']),
    ('Fixed Spaced Chain of Seeds', ['YYY']),
    ('Variable Length Seeds', ['NNN']),
    ('Variable Chain of Seeds', ['NNY']),
    ('Variable Spaced Seeds', ['NYN']),
    # Disabled coarser grouping that ignores the fixed/variable distinction:
    #('Chain of Seeds', ['NNY', 'YNY']),
    #('Spaced Seeds', ['NYN', 'YYN'])
])

# Partition used to colour the flows by their originating index technique.
# A flow partition matches against the raw `source` column of `agg`, so the
# merged group must list its member values explicitly: 'Other Suffix' is only
# a display label (it mirrors the group in `index_part`) and is not itself a
# value of `source`. A bare 'Other Suffix' string would match no rows, and the
# suffix-based flows would end up in floweaver's unlabeled catch-all group.
part_by_index = Partition.Simple('source', [
    'Hashing',
    'BWT-FM',
    ('Other Suffix', ['BWT-ST', 'Suffix Array', 'Suffix Tree']),
])

# One colour per flow group defined in `part_by_index`.
pal = ['tomato', 'lightgreen', 'lightblue']

# Rebuild the definition with the flow partition so links are split and
# coloured by index technique, then render.
sdd = SankeyDefinition(nodes, bundles, ordering, flow_partition=part_by_index)
weave(sdd, agg, palette=pal).to_widget(**size, margins=dict(top=0, bottom=0, left=80, right=160))

SankeyWidget(groups=[{'id': 'index', 'type': 'process', 'title': '', 'nodes': ['index^Hashing', 'index^BWT-FM'…

In [7]:
# Add a middle layer: a Waypoint partitioned by seeding category, so each
# index -> pairwise flow is routed through the node for its seeding type.
nodes['seeding'] = Waypoint(part_by_seeding)

# Three layers now: index, seeding, pairwise.
ordering = [['index'], ['seeding'], ['pairwise']]

# Same single bundle as before, now passing through the seeding waypoint.
bundles = [Bundle('index', 'pairwise', waypoints=['seeding'])]

sdd = SankeyDefinition(nodes, bundles, ordering, flow_partition=part_by_index)

# Render the widget and ask it to save itself as a PNG; the value of the
# chained call is the cell's displayed output.
(
    weave(sdd, agg, palette=pal)
    .to_widget(
        margins={'top': 0, 'bottom': 0, 'left': 100, 'right': 150},
        **size,
    )
    .auto_save_png('../figures/Sankey_Table1_Tools.png')
)



SankeyWidget(groups=[{'id': 'index', 'type': 'process', 'title': '', 'nodes': ['index^Hashing', 'index^BWT-FM'…

In [70]:
# Load the per-tool table (one row per aligner) and preview the first rows.
# NOTE(review): the preview includes the author e-mail columns; consider
# clearing this output before sharing the notebook.
data = pd.read_csv("../data/Table1_new.csv")
data.head(5)

Unnamed: 0.1,Unnamed: 0,Year_of_publication,Aligner,URL,Citation,read_length,max_read_length_tested,Title,Number_of_Citations,Indexing,...,Pairwise_alignment_2,Pairwise_alignment (DP-based?),fix length seed,Variable length seed (hybrid seeding),spaced seed,chain_of_seeds,Application,email - first author,email - last author,Verified
0,43.0,2011,Bismark,https://www.bioinformatics.babraham.ac.uk/proj...,Krueger et al. ‎2011,Ultra-short,50,Bismark: a flexible aligner and methylation ca...,1550,BWT-FM,...,Hamming Distance,N,Y,N,N,N,BS-Seq,felix.krueger@bbsrc.ac.uk,,
1,16.0,2009,BRAT,http://compbio.cs.ucr.edu/brat/,Harris et al. 2009,Ultra-short,26,BRAT: bisulfite-treated reads analysis tool,64,Hashing,...,Hamming Distance,N,Y,N,N,N,BS-Seq,elenah@cs.ucr.edu,stefano.lonardi@ucr.edu,
2,54.0,2012,BRAT-BW,http://compbio.cs.ucr.edu/brat/,Harris et al. 2012,Ultra-short,62,BRAT-BW: efficient and accurate mapping of bis...,70,BWT-FM,...,Hamming Distance,N,N,Y,N,N,BS-Seq,elenayharris@gmail.com,stefano.lonardi@ucr.edu,
3,32.0,2010,BS-Seeker,https://github.com/BSSeeker/BSseeker2,Chen et al. ‎2010,Ultra-short,36,BS Seeker: precise mapping for bisulfite seque...,215,BWT-FM,...,Hamming Distance,N,Y,N,N,N,BS-Seq,paoyang@gate.sinica.edu.tw,matteop@mcdb.ucla.edu,
4,63.0,2013,BS-Seeker2,https://github.com/BSSeeker/BSseeker2,Guo et al. ‎2013,Short,250,BS-Seeker2: a versatile aligning pipeline for ...,173,BWT-FM,...,Local & Global,Y,Y,N,N,N,BS-Seq,guoweilong@cau.edu.cn,matteop@mcdb.ucla.edu,
