Notebook to create Sankey diagrams for the urban modelling citations paper.

By Maarten Vanhoof

In [9]:
#Import dependencies 
import pandas as pd
from floweaver import *

In [10]:
# Load the input table (one row per publication, with community ranks and
# top-20 membership flags per year).
# NOTE(review): this is an absolute path on the author's machine; point it at a
# local copy of the CSV before running the notebook elsewhere.
nodes_csv_path = '/Users/Metti_Hoof/Desktop/df_nodes_com_top20_filter.csv'
df_nodes_com_top20_filter = pd.read_csv(nodes_csv_path)

# Preview the first ten rows (swap in `.shape` to check the dimensions instead).
df_nodes_com_top20_filter.head(10)

Unnamed: 0,id,title,lang,year,depth,horizontalDepth,microsim,transportmicrosimmodel,microsimmodel,urbanmicrosimmodel,...,spatialmicrosim,com_2004,com_2005,com_2006,top20_membership_2004,rank_com_2004,top20_membership_2005,rank_com_2005,top20_membership_2006,rank_com_2006
0,4080972026817744577,Do sons or daughters give more money to paren...,English,2006.0,0,23,308.0,,23.0,86.0,...,,,,16.0,False,,False,,True,8.0
1,5777717231776523709,Assessment of ASTER land cover and MODIS NDVI ...,English,2005.0,1,4,223.0,176.0,126.0,4.0,...,84.0,,4.0,4.0,False,,True,1.0,True,1.0
2,232821207661518428,A model of regional housing markets in Englan...,English,2005.0,1,24,24.0,408.0,36.0,795.0,...,140.0,,10.0,10.0,False,,True,12.0,True,10.0
3,7042996520119020314,An event-driven queue-based microsimulation o...,English,2006.0,2,29,430.0,353.0,602.0,29.0,...,84.0,,,12.0,False,,False,,True,6.0
4,2093602551282431789,"Multi acteurs, multi activités: simulations mu...",French,2006.0,1,6,6.0,340.0,8.0,33.0,...,115.0,,,14.0,False,,False,,True,2.0
5,8546256022862429143,Measuring accessibility for people with a disa...,English,2003.0,1,9,9.0,10.0,62.0,247.0,...,31.0,3.0,3.0,3.0,True,5.0,True,7.0,True,7.0
6,17385398044346996449,Comparative empirical evaluations of internal ...,English,2004.0,1,2,223.0,2.0,67.0,12.0,...,84.0,16.0,18.0,16.0,True,14.0,True,16.0,True,8.0
7,852510155665490767,Performance Evaluation framework of an integr...,English,2005.0,1,10,61.0,10.0,10.0,100.0,...,84.0,,11.0,1.0,False,,True,6.0,True,4.0
8,12036865038326554739,Modeling parking,,2000.0,1,44,,445.0,,,...,,1.0,9.0,23.0,True,6.0,True,10.0,True,18.0
9,7450019102237589693,Transborder city regions and the quest for in...,English,2001.0,1,5,223.0,150.0,67.0,5.0,...,31.0,5.0,4.0,4.0,True,1.0,True,1.0,True,1.0


In [63]:
# Build the flow table in the layout floweaver expects:
# source, target, type, value

source_col = 'rank_com_2004'
target_col = 'rank_com_2005'

flows = (
    df_nodes_com_top20_filter[[source_col, target_col]]
    .rename(columns={source_col: 'source', target_col: 'target'})
    # Drop only the rows where BOTH ranks are missing, i.e. rows that were in
    # the top 20 in neither of the two years. Rows with a single missing rank
    # are kept.
    .dropna(subset=['source', 'target'], how='all')
    # `type` and `value` are required by the flow-table definition but carry no
    # information here, so both are the constant 1.
    # IDEA: `type` could mark the most cited paper of a community, so it can be
    # drawn as a black line in the viz to show its movement.
    .assign(type=1, value=1)
)

flows.head()


Unnamed: 0,source,target,type,value
1,,1.0,1,1
2,,12.0,1,1
5,5.0,7.0,1,1
6,14.0,16.0,1,1
7,,6.0,1,1


In [64]:
# First experiment: a plain two-column Sankey from the 2004 ranks to the 2005 ranks.

# Widget dimensions passed to `to_widget`.
size = dict(width=570, height=300)

# One process group per year, selecting every distinct rank value found in that
# year's column (this includes the missing-value entry, if any).
# The columns are read from `df_nodes_com_top20_filter`, the frame loaded from
# the CSV. The previously used name `df_nodes_com_top20_filter_to_sankey` is not
# defined anywhere in this notebook and raised a NameError on a fresh kernel.
# NOTE(review): if a separate pre-processed "_to_sankey" frame was intended,
# recreate it explicitly in an earlier cell and confirm the ids match `flows`.
nodes = {
    '2004': ProcessGroup(list(df_nodes_com_top20_filter.rank_com_2004.unique())),
    '2005': ProcessGroup(list(df_nodes_com_top20_filter.rank_com_2005.unique())),
}

# Left-to-right layout: one layer per year.
ordering = [
    ['2004'],   # put 2004 on the left...
    ['2005'],   # ... and 2005 on the right.
]

# A single bundle carrying all flows from the 2004 group to the 2005 group.
bundles = [
    Bundle('2004', '2005'),
]


In [65]:
# Combine nodes, bundles and ordering into a diagram definition, weave it with
# the flow table and render the result as a widget.
sdd = SankeyDefinition(nodes, bundles, ordering)
basic_sankey = weave(sdd, flows)
basic_sankey.to_widget(**size)

SankeyWidget(layout=Layout(height='300', width='570'), links=[{'source': '2004^*', 'target': '2005^*', 'type':…

In [84]:
# Partition both sides of the Sankey diagram along the 'process' dimension.
# A partition entry is either a single value (shown on its own) or a
# (label, [values]) tuple that groups several values under one name.
# Ranks 1-5 are shown individually, ranks 6-20 are grouped under the label
# 'top 5-20', and the string 'NaN' is labelled 'out of top 20'.
# NOTE(review): 'NaN' here is a string, not a float NaN; confirm that missing
# ranks are encoded as that string in the data being partitioned.
single_ranks = [float(rank) for rank in range(1, 6)]
grouped_ranks = [float(rank) for rank in range(6, 21)]
rank_groups = single_ranks + [
    ('top 5-20', grouped_ranks),
    ('out of top 20', ['NaN']),
]

# Left-hand (2004) and right-hand (2005) side use the same grouping.
source_partition = Partition.Simple('process', list(rank_groups))
target_partition = Partition.Simple('process', list(rank_groups))

# Attach the partitions to the existing ProcessGroup nodes.
nodes['2004'].partition = source_partition
nodes['2005'].partition = target_partition

# Rebuild the definition so the partitioned nodes are used, then render it.
sdd = SankeyDefinition(nodes, bundles, ordering)
partitioned_sankey = weave(sdd, flows)
partitioned_sankey.to_widget(**size)


SankeyWidget(groups=[{'id': '2004', 'type': 'process', 'title': '', 'nodes': ['2004^1.0', '2004^2.0', '2004^3.…

In [85]:
# Colour the flows. This needs an extra partition, the flow partition, plus a
# colour palette (an existing one - see
# https://sankeyview.readthedocs.io/en/latest/tutorials/colour-scales.html -
# or a custom one).

# Any column of the flow table can be used; here the flows are split on
# 'source' with the same grouping as the process partitions.
flow_rank_groups = [float(rank) for rank in range(1, 6)]
flow_rank_groups.append(('top 5-20', [float(rank) for rank in range(6, 21)]))
flow_rank_groups.append(('out of top 20', ['NaN']))
flow_partition = Partition.Simple('source', flow_rank_groups)

# The string below is a disabled example of a hand-made palette; it is never
# passed to weave. Note that its 'top 5-10' / 'top 10-20' keys do not match the
# partition labels defined above.
'''
# Set the colours for the labels in the partition yourself.
palette = {1.0: 'yellowgreen', 
           2.0: 'gold',
           3.0: 'brown',
           4.0: 'red',
           5.0: 'green',
           'top 5-10':'blue',
           'top 10-20':'orange',
           'out of top 20':'grey'
          }
'''

# New definition with the flow partition set.
sdd = SankeyDefinition(
    nodes,
    bundles,
    ordering,
    flow_partition=flow_partition,
)

# Render once...
weave(sdd, flows, palette='Set2_7').to_widget(**size)

# ...and render again with auto-saving to a PNG file enabled.
weave(sdd, flows, palette='Set2_7').to_widget(**size).auto_save_png('test1.png')


SankeyWidget(groups=[{'id': '2004', 'type': 'process', 'title': '', 'nodes': ['2004^1.0', '2004^2.0', '2004^3.…

In [99]:
#####################################
#       Sankeys with waypoints      #
#####################################

# Build the flow table: source, target, waypoint, type, value

source_col = 'rank_com_2004'
waypoint_col = 'rank_com_2005'
target_col = 'rank_com_2006'

column_names = {
    source_col: 'source',
    target_col: 'target',
    waypoint_col: 'waypoint',
}

flows2 = (
    df_nodes_com_top20_filter[[source_col, target_col, waypoint_col]]
    .rename(columns=column_names)
    # Drop only the rows where source, target AND waypoint are all missing,
    # i.e. rows that were in the top 20 in none of the three years.
    .dropna(subset=['source', 'target', 'waypoint'], how='all')
    # `type` and `value` are required by the flow-table definition but carry no
    # information here, so both are the constant 1.
    # IDEA: `type` could mark the most cited paper of a community, so it can be
    # drawn as a black line in the viz to show its movement.
    .assign(type=1, value=1)
)

print (flows2.head())


   source  target  waypoint  type  value
0     NaN     8.0       NaN     1      1
1     NaN     1.0       1.0     1      1
2     NaN    10.0      12.0     1      1
3     NaN     6.0       NaN     1      1
4     NaN     2.0       NaN     1      1


In [100]:

# Sankey with a waypoint: 2004 ranks on the left, 2005 ranks in the middle
# (waypoint) and 2006 ranks on the right.

# Widget dimensions passed to `to_widget`.
size = dict(width=570, height=300)

# Grouping shared by every partition in this cell: ranks 1-5 individually,
# ranks 6-20 under the label 'top 5-20', the string 'NaN' as 'out of top 20'.
# NOTE(review): 'NaN' here is a string, not a float NaN; confirm that missing
# ranks are encoded as that string in the data being partitioned.
rank_groups = [float(rank) for rank in range(1, 6)] + [
    ('top 5-20', [float(rank) for rank in range(6, 21)]),
    ('out of top 20', ['NaN']),
]

# One process group per outer year, selecting every distinct rank value of that
# year's column. The columns are read from `df_nodes_com_top20_filter`, the
# frame loaded from the CSV. The previously used name
# `df_nodes_com_top20_filter_to_sankey` is not defined anywhere in this notebook
# and raised a NameError on a fresh kernel.
# NOTE(review): if a separate pre-processed "_to_sankey" frame was intended,
# recreate it explicitly in an earlier cell and confirm the ids match `flows2`.
nodes = {
    '2004': ProcessGroup(list(df_nodes_com_top20_filter.rank_com_2004.unique())),
    '2006': ProcessGroup(list(df_nodes_com_top20_filter.rank_com_2006.unique())),
}

# Partition the left and right side of the diagram along the 'process' dimension.
source_partition = Partition.Simple('process', list(rank_groups))
target_partition = Partition.Simple('process', list(rank_groups))
nodes['2004'].partition = source_partition
nodes['2006'].partition = target_partition

# The flow partition (on the 'source' column) is the one that gets coloured.
flow_partition = Partition.Simple('source', list(rank_groups))

# Add the waypoint:
# 1. Define a waypoint node partitioned on the 'waypoint' column of flows2.
waypoint_partition = Partition.Simple('waypoint', list(rank_groups))
nodes['waypoint'] = Waypoint(waypoint_partition)

# 2. Place the waypoint in the middle layer.
ordering = [
    ['2004'],
    ['waypoint'],
    ['2006'],
]

# 3. Send the 2004 -> 2006 flows via the waypoint.
bundles = [
    Bundle('2004', '2006', waypoints=['waypoint']),
]

# Create the definition from the nodes, bundles and ordering, then render it.
sdd2 = SankeyDefinition(nodes, bundles, ordering,
                        flow_partition=flow_partition)

weave(sdd2, flows2, palette='Set2_7').to_widget(**size)



SankeyWidget(groups=[{'id': '2004', 'type': 'process', 'title': '', 'nodes': ['2004^1.0', '2004^2.0', '2004^3.…

In [None]:
#####################################
#       Top 7 Sankeys only          #
#####################################

# Build the flow table: source, target, waypoint, type, value

source_col = 'rank_com_2004'
waypoint_col = 'rank_com_2005'
target_col = 'rank_com_2006'

flows2 = df_nodes_com_top20_filter[[source_col, target_col, waypoint_col]].copy()
flows2 = flows2.rename(columns={source_col: 'source',
                                target_col: 'target',
                                waypoint_col: 'waypoint'})

# Drop only the rows where source, target AND waypoint are all missing,
# i.e. rows that were in the top 20 in none of the three years.
flows2 = flows2.dropna(subset=['source', 'target', 'waypoint'], how='all').copy()

# `type` and `value` are required by the flow-table definition but carry no
# information here, so both are the constant 1.
# IDEA: `type` could mark the most cited paper of a community, so it can be
# drawn as a black line in the viz to show its movement.
flows2['type'] = 1
flows2['value'] = 1

print (flows2.head())

# Set up the Sankey.

# Widget dimensions passed to `to_widget`.
size = dict(width=570, height=300)

# Grouping shared by every partition in this cell: ranks 1-7 individually,
# everything else (ranks 8-20 and the string 'NaN') under 'out of top 7'.
# The separating comma after 7.0 is required: without it Python evaluates
# `7.0(...)` and raises "TypeError: 'float' object is not callable".
# NOTE(review): 'NaN' here is a string, not a float NaN; confirm that missing
# ranks are encoded as that string in the data being partitioned.
top7_groups = [
    1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
    ('out of top 7', [8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
                      17.0, 18.0, 19.0, 20.0, 'NaN']),
]

# One process group per outer year, selecting every distinct rank value of that
# year's column. The columns are read from `df_nodes_com_top20_filter`, the
# frame loaded from the CSV. The previously used name
# `df_nodes_com_top20_filter_to_sankey` is not defined anywhere in this notebook
# and raised a NameError on a fresh kernel.
# NOTE(review): if a separate pre-processed "_to_sankey" frame was intended,
# recreate it explicitly in an earlier cell and confirm the ids match `flows2`.
nodes = {
    '2004': ProcessGroup(list(df_nodes_com_top20_filter.rank_com_2004.unique())),
    '2006': ProcessGroup(list(df_nodes_com_top20_filter.rank_com_2006.unique())),
}

# Partition the left and right side of the diagram along the 'process' dimension.
source_partition = Partition.Simple('process', list(top7_groups))
target_partition = Partition.Simple('process', list(top7_groups))
nodes['2004'].partition = source_partition
nodes['2006'].partition = target_partition

# The flow partition (on the 'source' column) is the one that gets coloured.
flow_partition = Partition.Simple('source', list(top7_groups))

# Add the waypoint:
# 1. Define a waypoint node partitioned on the 'waypoint' column of flows2.
waypoint_partition = Partition.Simple('waypoint', list(top7_groups))
nodes['waypoint'] = Waypoint(waypoint_partition)

# 2. Place the waypoint in the middle layer.
ordering = [
    ['2004'],
    ['waypoint'],
    ['2006'],
]

# 3. Send the 2004 -> 2006 flows via the waypoint.
bundles = [
    Bundle('2004', '2006', waypoints=['waypoint']),
]

# Create the definition from the nodes, bundles and ordering, then render it.
sdd2 = SankeyDefinition(nodes, bundles, ordering,
                        flow_partition=flow_partition)

weave(sdd2, flows2, palette='Set2_7').to_widget(**size)
