In [17]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import gc
import re

from itertools import cycle
from IPython.display import display
import pickle, os

import seaborn as sns

encoding = "ISO-8859-1"

import Jupyter_module_loader

In [18]:
# you should clone this git to this subdirectory (in some directory - I call it BES_analysis - doesn't matter though)

if os.getcwd().split(os.sep)[-1] != 'BES_analysis_code':
    raise Exception("Stop! You're in the wrong directory - should be in 'BES_analysis_code'")

BES_code_folder   = "../BES_analysis_code/" # we should be here!
BES_small_data_files = BES_code_folder + "small data files" + os.sep
if not os.path.exists( BES_small_data_files ):
    os.makedirs( BES_small_data_files )

# we should create these if they don't already exist
BES_data_folder   = "../BES_analysis_data/"
if not os.path.exists( BES_data_folder ):
    os.makedirs( BES_data_folder )

BES_output_folder = "../BES_analysis_output/"
if not os.path.exists( BES_output_folder ):
    os.makedirs( BES_output_folder )
    
BES_file_manifest = pd.read_csv( BES_small_data_files + "BES_file_manifest.csv" )

BES_R_data_files = BES_data_folder + "R_data" + os.sep
if not os.path.exists( BES_R_data_files ):
    os.makedirs( BES_R_data_files )


In [19]:
dataset_name = "W13_comb"

In [20]:
%%time

manifest = BES_file_manifest[ BES_file_manifest["Name"] == dataset_name ]

data_subfolder = BES_data_folder + dataset_name + os.sep

dataset_filename = manifest["Stata_Filename"].values[0]
# dataset_description = manifest["Friendlier_Description"].values[0]
# dataset_citation = manifest["Citation"].values[0]
# dataset_start = manifest["Date_Start"].values[0]
# dataset_stop = manifest["Date_Stop"].values[0]
# dataset_wave = manifest["Wave No"].values[0]

BES_Panel = pd.read_stata( data_subfolder + dataset_filename )
print("BES_Panel", BES_Panel.shape )

####

BES_numeric = pd.read_hdf( data_subfolder + "BESnumeric.hdf", "BESnumeric" )
print("BES_numeric",  BES_numeric.shape )

var_type    = pd.read_csv( data_subfolder + "var_type.csv", encoding=encoding)
var_type.set_index("Unnamed: 0", inplace=True)
print("var_type",  var_type.shape )

fname = data_subfolder + "cat_dictionary.pkl"
with open(fname, "rb") as f:
    cat_dictionary = pickle.load( f )
    
####

BES_non_numeric = pd.read_hdf( data_subfolder + "BESnon_numeric.hdf", "BESnon_numeric" )
print("BES_non_numeric",  BES_non_numeric.shape )

BES_reduced = pd.read_hdf( data_subfolder + "BES_reduced.hdf", "BES_reduced" )
print("BES_reduced",  BES_reduced.shape )

BES_reduced_with_na = pd.read_hdf( data_subfolder + "BES_reduced_with_na.hdf", "BES_reduced_with_na")
print("BES_reduced_with_na", BES_reduced_with_na.shape )

fname = data_subfolder + "new_old_col_names.pkl"
with open(fname, "rb") as f:
    new_old_col_names = pickle.load(f) 

BES_Panel (69486, 5173)
BES_numeric (69486, 4993)
var_type (5173, 13)
BES_non_numeric (69486, 125)
BES_reduced (69486, 2526)
BES_reduced_with_na (69486, 2526)
Wall time: 18min 25s


In [5]:
from ipysankeywidget import SankeyWidget
from ipywidgets import Layout

In [6]:
layout = Layout(width="300", height="200")
def sankey(margin_top=10, **value):
    """Show SankeyWidget with default values for size and margins"""
    return SankeyWidget(layout=layout,
                        margins=dict(top=margin_top, bottom=0, left=30, right=60),
                        **value)

In [7]:
from ipywidgets import Button, VBox, HBox, Output
from IPython.display import display, clear_output

links = [
    {'source': 'A', 'target': 'B', 'value': 1},
    {'source': 'B', 'target': 'C', 'value': 1},
    {'source': 'A', 'target': 'D', 'value': 1},
]
order = [
    ['A'],
    ['D', 'B'],
    ['C'],
]
# out = Output()
# display(out)
s = sankey(links=links, order=order)



def swap(x):
    global order
    order = [list(reversed(o)) for o in order]
    s.order = order


def remove(x):
    global links, out, e, s
    links = links[0:len(links)-1]
    s.links = links
   #s.close()
#     IPython.display.clear_output()
    #s.clear_output()
#     out = Output()
#     s.clear_output()
#    s.close()
    e.close()
    clear_output()
#     with out:  
    s = sankey(links=links, order=order)
    b = Button(description='Swap')
    b.on_click(swap)    
    c = Button(description='Remove')
    c.on_click(remove)
    e = VBox([HBox([b,c]), s])
    display(e)

b = Button(description='Swap')
b.on_click(swap)    
    
c = Button(description='Remove')
c.on_click(remove)

e = VBox([HBox([b,c]), s])
display(e)

VBox(children=(HBox(children=(Button(description='Swap', style=ButtonStyle()), Button(description='Remove', st…

In [121]:
s.links

[{'source': 'A', 'target': 'B', 'value': 1},
 {'source': 'B', 'target': 'C', 'value': 1}]

In [89]:
s.order

[['A'], ['B', 'D'], ['C']]

In [49]:
s.close()



In [54]:
s.open()

In [77]:
s.setup_instance()
s.open()

In [75]:
s.close_all()

In [85]:
s.nodes

[]

In [21]:
[x for x in BES_Panel.columns if "wt" in x]

['wt_core_W3',
 'wt_core_W4',
 'wt_core_W5',
 'wt_core_W6',
 'wt_core_W7',
 'wt_core_W8',
 'wt_core_W9',
 'wt_core_W1',
 'wt_core_W2',
 'wt_full_W3',
 'wt_full_W1W2W3',
 'wt_full_W4',
 'wt_full_W1W2W3W4',
 'wt_full_W5',
 'wt_full_W1W2W3W4W5',
 'wt_full_W4W5',
 'wt_full_W6',
 'wt_full_W1W2W3W4W5W6',
 'wt_full_W4W5W6',
 'wt_full_W4W6',
 'wt_full_W7',
 'wt_full_W8',
 'wt_full_W1W2W3W4W5W6W7W8',
 'wt_full_W7W8',
 'wt_full_W9',
 'wt_full_W1W2W3W4W5W6W7W8W9',
 'wt_full_W7W8W9',
 'wt_full_W10',
 'wt_full_W11',
 'wt_full_W1_W11',
 'wt_full_W1',
 'wt_full_W2',
 'wt_full_W1W2',
 'wt_full_W1W2W3W4W5W6W7',
 'wt_daily_W5',
 'wt_new_W11',
 'wt_new_W12',
 'wt_daily_W12',
 'wt_new_W13',
 'wt_new_W1_W11',
 'wt_new_W1_W12',
 'wt_new_W1_W13',
 'wt_new_W6_W11',
 'wt_new_W6_W12',
 'wt_new_W6_W13',
 'wt_new_W11_W13',
 'enviroGrowthW4',
 'enviroGrowthW7',
 'wt_daily_W8',
 'wt_new_W10']

In [22]:
variables_of_interest = [x for x in BES_Panel.columns if re.match("al(_scale|\d)|lr(_scale|\d)|personality|"+
                                          "euRefVote|generalElectionVote|wave|wt_full|wt_new"+
                                          "profile|marital|gender|edlevel|ageGroup|country|housing|countryOfBirth",x)]
len(variables_of_interest)

BES_Panel[variables_of_interest].to_pickle(dataset_name+"_small.pkl")

In [11]:
# BES_Panel= pd.read_pickle(dataset_name+"_small.pkl")

In [12]:
party_list = list( BES_Panel[ ['profile_past_vote_2005','profile_past_vote_2010','profile_past_vote_2015' ] ].stack().unique() )
party_list.append('I would not vote')
replace_dict = {'Labour Party':"LAB", 
 'United Kingdom Independence Party (UKIP)':"UKIP",
 'Labour':"LAB",
 'Conservative Party':"CON",
 'Conservative':"CON",
 'Liberal Democrats':"LD",
 'Did not vote':"DNV",
 'Scottish National Party':"SNP",
 'Scottish National Party (SNP)':"SNP",
 'British National Party (BNP)':"BNP",
 'Green Party':"GP",
 'Liberal Democrat':"LD",
 "Don't know":"DK",
 'Some other party':"OTH",
 'Veritas':"VER",
 'Plaid Cymru':"PC",
 'Other':"OTH",
 'Respect':"RES",
 'I would not vote':"DNV"}
colourmap = {'LAB':'red', 'UKIP':'purple', 'CON':'blue', 'SNP':'yellow', 'BNP':'black', 'LD':'orange', 'DNV':'grey', 'DK':'cyan', 'OTH':'brown', 'GP':'green',
       'VER':'pink', 'PC':'olive', 'RES':'peru'}
base_order = ['LAB','CON','DK','UKIP','LD','SNP','DNV','GP','PC','BNP','OTH','RES','VER']
lr_order   = ['GP','PC','LAB','SNP','DK','OTH','DNV','LD','CON','UKIP','BNP','RES','VER']
cmap = ['red','purple','blue','yellow','black','orange','grey','cyan','brown','green','pink','olive','peru']

In [13]:
def flatten(A):
    rt = []
    for i in A:
        if isinstance(i,list): rt.extend(flatten(i))
        else: rt.append(i)
    return rt

def make_sankey(BES, links, order, first_year, second_year, first_col, second_col,
                replace_dict, threshold, colour, fixed_order, scale, nodes, wt_col):
    
    crosstab = pd.crosstab(index   = BES[ first_col  ],
                           columns = BES[ second_col ],
                           values  = BES[ wt_col ],
                           aggfunc = sum,
                           normalize=True)*scale
#     if order == []: # initialise
#         order = [ [[replace_dict[x]+first_year] for x in crosstab.index] ]
#     order.append( [[replace_dict[x]+second_year] for x in crosstab.columns] ) # add new layer
#     if nodes == []: # initialise
#         nodes =   [ {'id':x+first_year } for x in lr_order ]  #, 'direction':'r'
#     nodes.extend( [ {'id':x+second_year} for x in lr_order ] ) # , 'direction':'r'
    
    if order == []: # initialise
        order = [ [[x+first_year]  for x in fixed_order] ]
    order.append( [[x+second_year] for x in fixed_order] ) # add new layer
    
    for col in crosstab.columns:
        second_party = replace_dict[ col ]
        for ind in crosstab[col].index:
            first_party = replace_dict[ ind ]
            
            if crosstab[col][ind]>threshold:
                if colour == []:
                    col_to_use = colourmap[first_party]
                else:
                    col_to_use = cmap[colour]
                    
                node_id_first = first_party+first_year
                if node_id_first not in [x['id'] for x in nodes]:
                    nodes.append({'id':node_id_first, 'direction':'l'})
                node_id_second = second_party+second_year
                if node_id_second not in [x['id'] for x in nodes]:
                    nodes.append({'id':node_id_second, 'direction':'l'})                    

                d = {'source': node_id_first,
                     'target': node_id_second,
                     'value': crosstab[col][ind],
                     'color': col_to_use,'type' : colour}
                links.append(d)

    return links, order, nodes

In [14]:
threshold = 0.5
margins=dict(top=30, bottom=10, left=70, right=70)

In [23]:
# 2005 -> 2010, weighted
colour = []
order  = []
links  = []
pairs = (["2005",'profile_past_vote_2005'],
         ["2010",'profile_past_vote_2010'])
scale = 100
nodes = []
wt_col = 'wt_new_W1_W13'
for pair_no in range(0,len(pairs)-1):
#     (links,order) = make_sankey(BES_Panel, links, order,
#                         pairs[pair_no][0], pairs[pair_no+1][0], pairs[pair_no][1], pairs[pair_no+1][1],
#                         replace_dict, threshold, colour, lr_order, wt_col)
    (links, order, nodes) = make_sankey(BES_Panel, links, order,
                        pairs[pair_no][0], pairs[pair_no+1][0],
                        pairs[pair_no][1], pairs[pair_no+1][1],
                        replace_dict, threshold, colour, lr_order, scale, nodes, wt_col)    

SankeyWidget(links=links, order=order, margins = margins, scale=2)

SankeyWidget(links=[{'source': 'DNV2005', 'target': 'DNV2010', 'value': 13.595799711610557, 'color': 'grey', '…

In [47]:
import json, urllib
import plotly.plotly as py
import pandas as pd
import numpy as np
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [51]:
iplot([{"x": [1, 2, 3], "y": [3, 1, 6]}])

In [83]:
[colourmap[x.split("2")[0]] for x in nodes_df['id']]

['grey',
 'grey',
 'blue',
 'red',
 'orange',
 'cyan',
 'blue',
 'purple',
 'red',
 'orange',
 'yellow',
 'yellow',
 'purple',
 'cyan']

In [64]:
links_df = pd.DataFrame.from_dict(links)
nodes_df = pd.DataFrame.from_dict(nodes)


In [72]:
source_num = [list(nodes_df['id'].values).index(x) for x in links_df["source"].values]
target_num = [list(nodes_df['id'].values).index(x) for x in links_df["target"].values]
# nodes_num = [list(nodes_df['id'].values).index(x)  for x in nodes_df['id'].values]

In [61]:
nodes_df.id = nodes_df.id.astype('category').cat.set_categories(flatten(order), ordered=True)
nodes_df = nodes_df.sort_values(by='id')

In [84]:
data_trace = dict(
    type='sankey',
    domain = dict(
      x =  [0,1],
      y =  [0,1]
    ),
    orientation = "h",
    valueformat = ".0f",
    node = dict(
      pad = 10,
      thickness = 30,
      line = dict(
        color = "black",
        width = 0
      ),
      label =  list(nodes_df['id'].values),
      color =  [colourmap[x.split("2")[0]] for x in nodes_df['id']]
        # scottish_df['Node, Label'].dropna(axis=0, how='any'),
#      color = scottish_df['Color']
    ),
#     link = links
    link = dict(
        source = source_num,
        target = target_num,
        value = list(links_df["value"].values),
        color = links_df["color"].values,
#       source = scottish_df['Source'].dropna(axis=0, how='any'),
#       target = scottish_df['Target'].dropna(axis=0, how='any'),
#       value = scottish_df['Value'].dropna(axis=0, how='any'),
#       color = scottish_df['Link Color'].dropna(axis=0, how='any'),        
        
#       source = scottish_df['Source'].dropna(axis=0, how='any'),
#       target = scottish_df['Target'].dropna(axis=0, how='any'),
#       value = scottish_df['Value'].dropna(axis=0, how='any'),
#       color = scottish_df['Link Color'].dropna(axis=0, how='any'),
  )
)

layout =  dict(
    title = "Scottish Referendum Voters who now want Independence",
    height = 772,
    font = dict(
      size = 10
    ),    
)

fig = dict(data=[data_trace], layout=layout)
iplot(fig, validate=False)