# Exploring and Visualizing Beyond the East Busway Data
## Matthew Samach

In [1]:
import pandas as pd
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import numpy as np
import warnings
import itertools
import helpers as hp

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation warnings from the libraries
# used below are hidden too — confirm that is intended.
warnings.filterwarnings('ignore')

# IPython magic: show matplotlib figures inline in the notebook output.
%matplotlib inline

## Reading in Data Using Pandas

In [2]:
# Load the anonymised responses CSV; the file is decoded as ISO-8859-1.
raw_data = pd.read_csv("../Data/responses_anonymous.csv", encoding = "ISO-8859-1")

The data are not in First Normal Form: some cells hold multiple values. We need to write a function that converts the data into that form.

In [None]:
# Writing a function to get this type of data into Sankey format. Will then write a wrapper to graph the Sankey
import helpers as hp

def sankeyFormat(dFrame, col_indeces):
    '''
    Turn two multi-valued columns of ``dFrame`` into an aggregated
    source -> target table carrying the node ids a Sankey plot needs.

    Parameters
    ----------
    dFrame : pandas.DataFrame
        Frame handed to ``hp.firstNormal``.
    col_indeces : list of int
        Exactly two positional column indices, in [source, target] order.

    Returns
    -------
    pandas.DataFrame or None
        The result of ``hp.aggregateFromTo`` sorted by ``from_id``/``to_id``,
        with three added columns: ``from_id``, ``to_id`` (offset so the two id
        ranges never overlap) and ``Label``. ``None`` is returned, after
        printing a message, when ``col_indeces`` does not hold two items.

    Raises
    ------
    ValueError
        If there are more distinct labels than rows, so the labels cannot be
        stored in a single ``Label`` column.
    '''

    if len(col_indeces) != 2:
        print("Column indeces should only have 2 items")
        return

    # Work on the frame that was passed in, not on a notebook-level global.
    pairs = hp.firstNormal(dFrame = dFrame, col_indeces = col_indeces)
    agg_dFrame = hp.aggregateFromTo(pairs, from_col=pairs.columns[0],
                                    to_col=pairs.columns[1]).sort_values(by = 'Count', ascending = False)

    # Give every source and every target its own integer id. Target ids are
    # shifted past the largest source id; the offset is computed once.
    from_codes = pd.factorize(agg_dFrame.iloc[:,0])[0]
    to_codes = pd.factorize(agg_dFrame.iloc[:,1])[0]
    id_offset = from_codes.max() + 1 if len(from_codes) else 0
    agg_dFrame['from_id'] = from_codes
    agg_dFrame['to_id'] = to_codes + id_offset

    # Labels are listed sources first, then targets, in id order.
    agg_dFrame = agg_dFrame.sort_values(by = ["from_id", "to_id"])
    labels = np.append(agg_dFrame.iloc[:,0].unique(), agg_dFrame.iloc[:,1].unique())

    # Pad with blanks so the label list is as long as the table.
    n_blank = agg_dFrame.shape[0] - len(labels)
    if n_blank < 0:
        raise ValueError("More distinct labels ({}) than rows ({}); cannot attach a Label column"
                         .format(len(labels), agg_dFrame.shape[0]))
    labels = np.append(labels, [""] * n_blank)
    agg_dFrame['Label'] = labels

    return agg_dFrame

In [None]:
# Build the Sankey-ready table from columns 13 and 12 of raw_data.
# NOTE(review): this calls sankeyFormat from the helpers module, not the sankeyFormat
# defined in the cell above — confirm which one is intended.
dFrame = hp.sankeyFormat(raw_data,[13,12])

In [29]:
# Writing wrapper function to draw Sankey

def drawSankey(dFrame, title = ""):
    '''
    Build a plotly figure dictionary for a Sankey diagram.

    Parameters
    ----------
    dFrame : pandas.DataFrame
        Must contain the columns ``from_id``, ``From``, ``to_id``, ``To`` and
        ``Count``; each row is one link.
    title : str, optional
        Figure title placed in the layout.

    Returns
    -------
    dict
        ``{'data': [trace], 'layout': layout}``, where ``trace`` is a
        ``type='sankey'`` dict whose node labels are ordered by node id and
        whose links are ordered by (source, target).
    '''
    # Node table: one (id, label) row per distinct origin and destination.
    from_nodes = (dFrame[['from_id', 'From']]
                  .drop_duplicates()
                  .rename(columns = {'from_id': 'Node', 'From': 'Label'}))
    to_nodes = (dFrame[['to_id', 'To']]
                .drop_duplicates()
                .rename(columns = {'to_id': 'Node', 'To': 'Label'}))

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    nodes_df = pd.concat([from_nodes, to_nodes]).reset_index(drop = True).sort_values(by = "Node")

    # Link table. Incomplete rows are dropped as a whole so that source,
    # target and value always stay aligned with each other.
    links_df = (dFrame[['from_id', 'to_id', 'Count']]
                .rename(columns = {'from_id': 'Source', 'to_id': 'Target', 'Count': 'Value'})
                .dropna(axis=0, how='any')
                .sort_values(by = ['Source', 'Target']))

    # Sankey trace
    data_trace = dict(
        type='sankey',
        orientation = "h",
        valueformat = ".0f",

        # Node structure
        node = dict(
          pad = 10,
          thickness = 30,
          line = dict(
            color = "black",
            width = 0
          ),
          label =  nodes_df['Label'].dropna(axis=0, how='any'),

        ),

        # Link structure
        link = dict(
          source = links_df['Source'],
          target = links_df['Target'],
          value = links_df['Value']
      )
    )

    layout =  dict(
        title = title,
        height = 850,
        width = 1000,
        font = dict(
          size = 15
        ),
    )

    return dict(data=[data_trace], layout=layout)

In [30]:
# Build the Sankey figure dict from dFrame and render it inline.
# validate=False is passed to iplot because fig is a plain dict rather than a plotly Figure object.
fig = drawSankey(dFrame)
iplot(fig, validate=False)

In [19]:
# Inspect the 11-colour palettes that colorlover provides.
import colorlover as cl
cl.scales['11']

{'div': {'BrBG': ['rgb(84,48,5)',
   'rgb(140,81,10)',
   'rgb(191,129,45)',
   'rgb(223,194,125)',
   'rgb(246,232,195)',
   'rgb(245,245,245)',
   'rgb(199,234,229)',
   'rgb(128,205,193)',
   'rgb(53,151,143)',
   'rgb(1,102,94)',
   'rgb(0,60,48)'],
  'PRGn': ['rgb(64,0,75)',
   'rgb(118,42,131)',
   'rgb(153,112,171)',
   'rgb(194,165,207)',
   'rgb(231,212,232)',
   'rgb(247,247,247)',
   'rgb(217,240,211)',
   'rgb(166,219,160)',
   'rgb(90,174,97)',
   'rgb(27,120,55)',
   'rgb(0,68,27)'],
  'PiYG': ['rgb(142,1,82)',
   'rgb(197,27,125)',
   'rgb(222,119,174)',
   'rgb(241,182,218)',
   'rgb(253,224,239)',
   'rgb(247,247,247)',
   'rgb(230,245,208)',
   'rgb(184,225,134)',
   'rgb(127,188,65)',
   'rgb(77,146,33)',
   'rgb(39,100,25)'],
  'PuOr': ['rgb(127,59,8)',
   'rgb(179,88,6)',
   'rgb(224,130,20)',
   'rgb(253,184,99)',
   'rgb(254,224,182)',
   'rgb(247,247,247)',
   'rgb(216,218,235)',
   'rgb(178,171,210)',
   'rgb(128,115,172)',
   'rgb(84,39,136)',
   'rgb(45,0,75)

In [None]:
# Positional indices of the two columns that will be used

col_indeces = [4, 16]
# Display the names of those two columns
raw_data.columns[col_indeces]

In [None]:
def firstNormal(dFrame, col_indeces = []):
    '''
    Return the selected columns of ``dFrame`` in first normal form, i.e. with
    exactly one value per cell.

    Each cell is expected to be a string holding a list-like literal such as
    "['a', 'b']". Every input row is expanded into the Cartesian product of
    the values found in its selected cells.

    Parameters
    ----------
    dFrame : pandas.DataFrame
        Source data. Rows are read by position, so any index is accepted
        (including the non-contiguous index of a filtered frame).
    col_indeces : list of int, optional
        Positional indices of the columns to include, in output order. When
        empty, every column is used, which can produce a very large result.

    Returns
    -------
    pandas.DataFrame
        One row per value combination, with the selected column names and a
        fresh 0-based index.

    Raises
    ------
    AttributeError
        If a selected cell is not a string (for example a missing value).
    '''

    if len(col_indeces) == 0:
        cols = dFrame.columns
    else:
        cols = dFrame.columns[col_indeces]

    def split_cell(cell):
        # Normalise double-quoted separators to single-quoted ones, split on
        # them, then strip brackets, quotes and spaces from every piece.
        return [x.strip("[]\"' ") for x in cell.replace('", ', "', ").split("', ")]

    # Collect all expanded rows first and build the frame once; appending to a
    # DataFrame inside the loop is quadratic and relies on DataFrame.append,
    # which was removed in pandas 2.0.
    records = []
    for row in dFrame[cols].itertuples(index = False, name = None):
        records.extend(itertools.product(*[split_cell(cell) for cell in row]))

    return pd.DataFrame.from_records(records, columns=cols)

def oneHot(dFrame):
    '''
    Placeholder for a one-hot encoder: intended to take a dataframe and return
    the same information with every category one-hot encoded.

    Not implemented yet; the function currently returns None.
    '''
    return None

In [None]:
# Using firstNormal function to get From -> To data pairs and renaming columns.
# Column 16 becomes 'From' and column 4 becomes 'To' (the list order sets the column order).

from_to = firstNormal(raw_data, col_indeces=[16,4])
from_to.columns = ['From', 'To']
from_to.head()

In [None]:
def aggregateFromTo(dFrame, from_col = 'From', to_col = 'To'):
    '''
    Count how often each (from, to) pair occurs in ``dFrame``.

    Parameters: ``dFrame`` holds one row per from -> to preference;
    ``from_col`` and ``to_col`` name the two columns to pair up.

    Returns a dataframe with the columns ``from_col``, ``to_col`` and 'Count',
    leaving out every pair whose origin and destination are identical.
    '''

    # Size of each (from, to) group, flattened back into ordinary columns
    pair_counts = dFrame.groupby([from_col, to_col]).size().reset_index(name = 'Count')

    # Keep only the pairs that actually go somewhere else
    goes_elsewhere = pair_counts[from_col] != pair_counts[to_col]

    return pair_counts[goes_elsewhere]

In [None]:
# Getting the counts of from -> to pairs, most frequent pair first
agg_from_to = aggregateFromTo(from_to).sort_values(by = 'Count', ascending = False)
agg_from_to

In [None]:
# Two odd 'From' entries show up in the data; just remove them for now.
# .copy() gives from_to_id its own data, so the columns added to it in the following
# cells are not written to a slice of agg_from_to.
from_to_id = agg_from_to[(agg_from_to['From'] != 'mck') & (agg_from_to['From'] != '2434 south braddock ave')].copy()

In [None]:
# Giving unique indices to each origin and destination
from_to_id['from_id'] = pd.factorize(from_to_id.From)[0]
# Destination ids start right after the largest origin id so the two ranges never overlap.
# The maximum is computed once here instead of once per row.
from_to_id['to_id'] = pd.factorize(from_to_id.To)[0] + 1 + from_to_id['from_id'].max()
from_to_id.head()

In [None]:
# The Sankey trace wants one flat list of node labels, so a few more transforms are needed.
# Order the rows by id, then list the origin labels followed by the destination labels.
from_to_id = from_to_id.sort_values(by = ["from_id", "to_id"])
labels = np.append(from_to_id.From.unique(), from_to_id.To.unique())

# Pad the label list with empty strings until it is as long as the table, so it can be stored as a column.
# NOTE(review): if there are more distinct labels than rows, n_blank is negative, nothing is padded
# and the assignment below fails on a length mismatch.
n_blank = from_to_id.shape[0] - len(labels)
print(n_blank)
labels = np.append(labels, [""] * n_blank)
from_to_id['Label'] = labels
from_to_id

In [None]:
# Every distinct origin id, in order of first appearance

from_to_id['from_id'].unique()

In [None]:
# Every distinct destination id, sorted ascending

np.sort(from_to_id['to_id'].unique())

In [None]:
# Creating a data frame just with nodes for our Sankey plot
# Need ID & Label
from_nodes = from_to_id[['from_id', 'From']].drop_duplicates()
from_nodes.columns = ['Node', "Label"]

to_nodes = from_to_id[['to_id', 'To']].drop_duplicates()
to_nodes.columns = ['Node', 'Label']

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
nodes_df = pd.concat([from_nodes, to_nodes]).reset_index(drop = True).sort_values(by = "Node")
nodes_df

In [None]:
# Link table for the Sankey plot: one row per From -> To pair, ordered by source then target
links_df = (
    from_to_id[['from_id', 'to_id', 'Count']]
    .rename(columns = {'from_id': 'Source', 'to_id': 'Target', 'Count': 'Value'})
    .sort_values(by = ['Source', 'Target'])
)
links_df

In [None]:
# Sankey trace built from the node and link tables above
data_trace = dict(
    type='sankey',
#     domain = dict(
#       x =  [0,1],
#       y =  [0,1]
#     ),
    orientation = "h",
    valueformat = ".0f",
    # Node appearance and labels (labels are listed in node-id order)
    node = dict(
      pad = 10,
      thickness = 30,
      line = dict(
        color = "black",
        width = 0
      ),
      label =  nodes_df['Label'].dropna(axis=0, how='any'),

    ),
    # One link per row of links_df: source id, target id and the pair count
    link = dict(
      source = links_df['Source'].dropna(axis=0, how='any'),
      target = links_df['Target'].dropna(axis=0, how='any'),
      value = links_df['Value'].dropna(axis=0, how='any')
  )
)


# Wrap the trace in a figure dict (no layout is set here) and render it inline
fig = dict(data=[data_trace])
iplot(fig, validate=False)

In [None]:
# Look up the single Penn Hills Township -> Edgewood Borough pair
from_to_id.loc[(from_to_id['From'] == "Penn Hills Township") & (from_to_id['To'] == "Edgewood Borough")]

## Sankey: Barriers to Transit -> Current Modes of Transit
### 61C Corridor

Corridor value used for filtering: `'Mon Valley via Homestead (61C Corridor)'`
Columns of interest: `12 - Other Transport Used` and `12 - Why Other Transport Used`

In [None]:
col_indeces = [13, 12]

# Keep only the rows whose corridor preference is exactly the 61C corridor.
# The index is reset because firstNormal looks rows up by the labels 0..n-1,
# which a filtered frame no longer has.
cor_61c = raw_data[raw_data['6 - Corridor preference']=="['Mon Valley via Homestead (61C Corridor)']"].reset_index(drop = True)
firstNormal(cor_61c, col_indeces=col_indeces)

In [None]:
col_indeces = [13, 12]
print(raw_data.columns[col_indeces])

# Keep only the rows whose corridor preference is exactly the 61C corridor.
# The index is reset because firstNormal looks rows up by the labels 0..n-1,
# which a filtered frame no longer has.
cor_61c = raw_data[raw_data['6 - Corridor preference']=="['Mon Valley via Homestead (61C Corridor)']"].reset_index(drop = True)

# barriers_transit is needed by the aggregation cell that follows, so it has to be built here.
barriers_transit = firstNormal(cor_61c, col_indeces=col_indeces)
barriers_transit.columns = ["Why", "Other_transit"]
barriers_transit.head()

In [None]:
# Getting the counts of Why -> Other_transit pairs, most frequent pair first
agg_barr_trans = aggregateFromTo(barriers_transit, from_col="Why", 
                                 to_col="Other_transit").sort_values(by = 'Count', ascending = False)
agg_barr_trans.head()

In [None]:
# Giving unique indices to each origin and destination
# NOTE(review): this cell works on from_to_id (From/To), not on agg_barr_trans built just above;
# it looks like a leftover copy of the earlier id-assignment cell — confirm which table is intended.
from_to_id['from_id'] = pd.factorize(from_to_id.From)[0]
from_to_id['to_id'] = pd.factorize(from_to_id.To)[0]
# Shift destination ids past the largest origin id so the two id ranges do not overlap
from_to_id['to_id'] = from_to_id['to_id'].apply(lambda x: x + 1 + max(from_to_id['from_id']))
from_to_id.head()