# Exploring and Visualizing Beyond the East Busway Data
## Matthew Samach

In [44]:
import pandas as pd
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import numpy as np
import warnings
import itertools
import re

warnings.filterwarnings('ignore')

%matplotlib inline

## Reading in Data Using Pandas

In [7]:
raw_data = pd.read_csv("../Data/responses_anonymous.csv", encoding = "ISO-8859-1")
raw_data.head()

Unnamed: 0,id,addr,timestamp,3 - Area of interest,4 - Communities of interest,5 - Destinations,6 - Corridor preference,7 - Pittsburgh areas,8 - Allegheny County areas,9 - PAAC Routes Used,10 - Routing Type Preference,11 - Current Transit Usage,12 - Other Transport Used,12 - Why Other Transport Used,13 - Age,13 - Employment status,13 - Where From,14 - Gender,14 - Race,15 - Comments
0,426926b2-be9a-4d8e-8aa6-423714dee82f,172.58.43.140,2019-08-03T18:35:41,"['McKeesport and the Mon Valley', 'Monroeville...","['McKeesport', 'Monroeville']","['Aldi Store #97', 'Centers for Rehab Services...",['Mon Valley via Westinghouse Bridge'],['Oakland (Pittsburgh)'],[],['P1 - East Busway - All Stops'],['2'],"['Rarely, maybe a couple times a year']",['Private car'],"[""Transit isn't reliable to get me where I'm g...",['65-74 years old'],['Full-time employment'],['Regent Square (Pittsburgh)'],['Man'],['Black or African American'],['']
1,250af43d-f6db-41fe-a55b-2c99398d308b,174.242.0.246,2019-09-21T15:57:02,"['McKeesport and the Mon Valley', 'Monroeville...","['Braddock Borough', 'Braddock Hills Borough',...","['Barrett El Sch', 'Beulah Church', 'Homestead...",['Monroeville via 376 (P67 Corridor)'],"['Downtown (Pittsburgh)', 'East End neighborho...","['Airport Corridor / Parkway West', 'North Hil...","['14 - Ohio Valley', '21 - Coraopolis', '28X -...",[],['Several times a week'],"['Private car', 'Uber/Lyft']","[""Transit doesn't come often enough"", 'Too slo...",['35-44 years old'],['Part-time employment'],['Squirrel Hill South (Pittsburgh)'],['Woman'],['White'],['Pittsburgh bus sytem is the worst. unreliabl...
2,aea94a48-3639-4c7c-8f93-9c05c46282ea,173.136.2.46,2019-07-04T15:29:59,['Monroeville and Eastern Suburbs'],"['Chalfant Borough', 'Edgewood Borough', 'Monr...","['Braddock Carnegie Library', 'C C Mellor Memo...",['Monroeville via 376 (P67 Corridor)'],['Oakland (Pittsburgh)'],['East Hills'],['69 - Trafford'],"['7-Slower buses with windy routes, but with l...",['Several times a week'],['Private car'],"[""Transit doesn't come often enough"", 'No tran...",['35-44 years old'],['Part-time employment (more than one employer)'],['Forest Hills Borough'],['Woman'],['White'],['']
3,da2837dc-2047-4d7d-9bb0-82547feee7b4,174.242.35.95,2019-07-30T23:06:44,['McKeesport and the Mon Valley'],"['Braddock Borough', 'Braddock Hills Borough',...",[],['Mon Valley via Forest Hills'],['Downtown (Pittsburgh)'],['Airport Corridor / Parkway West'],[],[],['Multiple trips per day'],['Jitney'],['Public transit is not available where I am t...,['35-44 years old'],['Retired'],['Rankin Borough'],['Man'],['White'],['']
4,009a7c74-36df-485b-b9ab-4d89d73ae94c,107.0.42.235,2019-08-23T12:16:01,['McKeesport and the Mon Valley'],"['Duquesne', 'Homestead Borough', 'Munhall Bor...","['RIDC Riverplace / City Center of Duquesne', ...",['Mon Valley via Homestead (61C Corridor)'],[],[],[],['2'],"['Rarely, maybe a couple times a year']","['Carpool', 'Private car']",['Too slow'],['25-34 years old'],['Full-time employment'],['West Mifflin Borough'],['Woman'],['White'],['']


Data are not in First Normal Form (there are multiple values per cell in some cases). Will have to write a function to move data into that form.

In [8]:
# Two columns that'll be used

col_indeces = [4, 16]
raw_data.columns[col_indeces]

Index(['4 - Communities of interest', '13 - Where From'], dtype='object')

In [57]:
data = raw_data.apply(lambda x: x.str.strip("[]\"' ") if x.dtype == "object" else x)
data.head(2)

Unnamed: 0,id,addr,timestamp,3 - Area of interest,4 - Communities of interest,5 - Destinations,6 - Corridor preference,7 - Pittsburgh areas,8 - Allegheny County areas,9 - PAAC Routes Used,10 - Routing Type Preference,11 - Current Transit Usage,12 - Other Transport Used,12 - Why Other Transport Used,13 - Age,13 - Employment status,13 - Where From,14 - Gender,14 - Race,15 - Comments
0,426926b2-be9a-4d8e-8aa6-423714dee82f,172.58.43.140,2019-08-03T18:35:41,"McKeesport and the Mon Valley', 'Monroeville a...","McKeesport', 'Monroeville","Aldi Store #97', 'Centers for Rehab Services F...",Mon Valley via Westinghouse Bridge,Oakland (Pittsburgh),,P1 - East Busway - All Stops,2.0,"Rarely, maybe a couple times a year",Private car,Transit isn't reliable to get me where I'm goi...,65-74 years old,Full-time employment,Regent Square (Pittsburgh),Man,Black or African American,
1,250af43d-f6db-41fe-a55b-2c99398d308b,174.242.0.246,2019-09-21T15:57:02,"McKeesport and the Mon Valley', 'Monroeville a...","Braddock Borough', 'Braddock Hills Borough', '...","Barrett El Sch', 'Beulah Church', 'Homestead A...",Monroeville via 376 (P67 Corridor),"Downtown (Pittsburgh)', 'East End neighborhood...","Airport Corridor / Parkway West', 'North Hills...","14 - Ohio Valley', '21 - Coraopolis', '28X - A...",,Several times a week,"Private car', 'Uber/Lyft","Transit doesn't come often enough"", 'Too slow'...",35-44 years old,Part-time employment,Squirrel Hill South (Pittsburgh),Woman,White,Pittsburgh bus sytem is the worst. unreliable ...


In [9]:
def firstNormal(dFrame, col_indeces = []):
    '''
    Function that takes in a dataframe and returns a the same information as a dataframe in 
    first normal form (each cell contains only one piece of data). Parameter col_indeces is 
    a list of column indeces to include when making pairs. 
    If not included, all possible combinations are too large.
    '''
    
    nrows = dFrame.shape[0]
    
    if col_indeces == []:
        cols = dFrame.columns
    else:
        cols = dFrame.columns[col_indeces]
        
    returnDF = pd.DataFrame(columns = cols)
        
    
    for i in range(nrows):
        row_lst = []
        
        for c in cols:

            cell_list = [x.strip("[]\"' ") for x in dFrame[c].loc[i].split("', ")]
            row_lst.append(cell_list)
            
        NF1_rows = pd.DataFrame.from_records(list(itertools.product(*row_lst)), columns=cols)
        
        returnDF = returnDF.append(NF1_rows)
    
    return returnDF.reset_index(drop = True)


In [31]:
data['4 - Communities of interest'].str.get_dummies("',")

Unnamed: 0,'Braddock Hills Borough,'Chalfant Borough,'Churchill Borough,'Duquesne,'East McKeesport Borough,'East Pittsburgh Borough,'Edgewood Borough,'Forest Hills Borough,'Homestead Borough,'McKeesport,...,Monroeville,Munhall Borough,North Versailles Township,Rankin Borough,Swissvale Borough,Turtle Creek Borough,West Homestead Borough,Whitaker Borough,Wilkins Township,Wilkinsburg Borough
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513,0,0,1,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
514,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
515,0,0,1,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
516,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
def oneHot(dFrame, col):
    '''
    Fumnction that takes in a dataframe and returns the information where all categories 
    are one-hot encoded.
    '''
    data[col] = data[col].str.get_dummies("',")

In [None]:
# Using firstNormal function to get From -> To data pairs and renaming columns

from_to = firstNormal(raw_data, col_indeces=[16,4])
from_to.columns = ['From', 'To']
from_to.head()

In [None]:
def aggregateFromTo(dFrame, from_col = 'From', to_col = 'To'):
    '''
    Function that takes in a dataframe of from -> to preferences and returns aggregated counts 
    of all from -> to pairs.
    '''
    
    # First group by and use size for aggregations
    from_to_agg = dFrame.groupby([from_col, to_col]).size().reset_index()
    from_to_agg.columns = ['From', 'To', 'Count']
    
    # Remove any rows where From and To are the same
    from_to_agg = from_to_agg[from_to_agg['From'] != from_to_agg['To']]
    
    return from_to_agg

In [None]:
# Getting the counts of from -> to pairs
agg_from_to = aggregateFromTo(from_to).sort_values(by = 'Count', ascending = False)
agg_from_to

In [None]:
# Weird entry going on in the data, just will remove it for now
from_to_id = agg_from_to[(agg_from_to['From'] != 'mck') & (agg_from_to['From'] != '2434 south braddock ave')]

In [None]:
# Giving unique indeces to each origin and destination
from_to_id['from_id'] = pd.factorize(from_to_id.From)[0]
from_to_id['to_id'] = pd.factorize(from_to_id.To)[0]
from_to_id['to_id'] = from_to_id['to_id'].apply(lambda x: x + 1 + max(from_to_id['from_id']))
from_to_id.head()

In [None]:
# Because the Sankey package takes data in a weird format for labeling, have to do a few more transforms
from_to_id = from_to_id.sort_values(by = ["from_id", "to_id"])
labels = np.append(from_to_id.From.unique(), from_to_id.To.unique())

n_blank = from_to_id.shape[0] - len(labels)
print(n_blank)
labels = np.append(labels, [""] * n_blank)
from_to_id['Label'] = labels
from_to_id

In [None]:
# List of all from_ids

from_to_id.from_id.unique()

In [None]:
# List of all to_ids

np.sort(from_to_id.to_id.unique())

In [None]:
# Creating a data frame just with nodes for our Sankey plot
# Need ID & Label
from_nodes = from_to_id[['from_id', 'From']].drop_duplicates()
from_nodes.columns = ['Node', "Label"]

to_nodes = from_to_id[['to_id', 'To']].drop_duplicates()
to_nodes.columns = ['Node', 'Label']

nodes_df = from_nodes.append(to_nodes).reset_index(drop = True).sort_values(by = "Node")
nodes_df

In [None]:
# Creating a data frame with just links for Sankey plot
links_df = from_to_id[['from_id', 'to_id', 'Count']]
links_df.columns = ['Source', 'Target', 'Value']
links_df = links_df.sort_values(by = ['Source', 'Target'])
links_df

In [None]:
data_trace = dict(
    type='sankey',
#     domain = dict(
#       x =  [0,1],
#       y =  [0,1]
#     ),
    orientation = "h",
    valueformat = ".0f",
    node = dict(
      pad = 10,
      thickness = 30,
      line = dict(
        color = "black",
        width = 0
      ),
      label =  nodes_df['Label'].dropna(axis=0, how='any'),

    ),
    link = dict(
      source = links_df['Source'].dropna(axis=0, how='any'),
      target = links_df['Target'].dropna(axis=0, how='any'),
      value = links_df['Value'].dropna(axis=0, how='any')
  )
)


fig = dict(data=[data_trace])
iplot(fig, validate=False)

In [None]:
from_to_id[(from_to_id.From == "Penn Hills Township") & (from_to_id.To == "Edgewood Borough")]