# Exploring and Visualizing Beyond the East Busway Data
## Matthew Samach

In [44]:
import pandas as pd
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import numpy as np
import warnings
import itertools
import re

warnings.filterwarnings('ignore')

%matplotlib inline

## Reading in Data Using Pandas

In [86]:
raw_data = pd.read_csv("../Data/responses_anonymous.csv", encoding = "ISO-8859-1")
raw_data.head()

Unnamed: 0,id,addr,timestamp,3 - Area of interest,4 - Communities of interest,5 - Destinations,6 - Corridor preference,7 - Pittsburgh areas,8 - Allegheny County areas,9 - PAAC Routes Used,10 - Routing Type Preference,11 - Current Transit Usage,12 - Other Transport Used,12 - Why Other Transport Used,13 - Age,13 - Employment status,13 - Where From,14 - Gender,14 - Race,15 - Comments
0,426926b2-be9a-4d8e-8aa6-423714dee82f,172.58.43.140,2019-08-03T18:35:41,"['McKeesport and the Mon Valley', 'Monroeville...","['McKeesport', 'Monroeville']","['Aldi Store #97', 'Centers for Rehab Services...",['Mon Valley via Westinghouse Bridge'],['Oakland (Pittsburgh)'],[],['P1 - East Busway - All Stops'],['2'],"['Rarely, maybe a couple times a year']",['Private car'],"[""Transit isn't reliable to get me where I'm g...",['65-74 years old'],['Full-time employment'],['Regent Square (Pittsburgh)'],['Man'],['Black or African American'],['']
1,250af43d-f6db-41fe-a55b-2c99398d308b,174.242.0.246,2019-09-21T15:57:02,"['McKeesport and the Mon Valley', 'Monroeville...","['Braddock Borough', 'Braddock Hills Borough',...","['Barrett El Sch', 'Beulah Church', 'Homestead...",['Monroeville via 376 (P67 Corridor)'],"['Downtown (Pittsburgh)', 'East End neighborho...","['Airport Corridor / Parkway West', 'North Hil...","['14 - Ohio Valley', '21 - Coraopolis', '28X -...",[],['Several times a week'],"['Private car', 'Uber/Lyft']","[""Transit doesn't come often enough"", 'Too slo...",['35-44 years old'],['Part-time employment'],['Squirrel Hill South (Pittsburgh)'],['Woman'],['White'],['Pittsburgh bus sytem is the worst. unreliabl...
2,aea94a48-3639-4c7c-8f93-9c05c46282ea,173.136.2.46,2019-07-04T15:29:59,['Monroeville and Eastern Suburbs'],"['Chalfant Borough', 'Edgewood Borough', 'Monr...","['Braddock Carnegie Library', 'C C Mellor Memo...",['Monroeville via 376 (P67 Corridor)'],['Oakland (Pittsburgh)'],['East Hills'],['69 - Trafford'],"['7-Slower buses with windy routes, but with l...",['Several times a week'],['Private car'],"[""Transit doesn't come often enough"", 'No tran...",['35-44 years old'],['Part-time employment (more than one employer)'],['Forest Hills Borough'],['Woman'],['White'],['']
3,da2837dc-2047-4d7d-9bb0-82547feee7b4,174.242.35.95,2019-07-30T23:06:44,['McKeesport and the Mon Valley'],"['Braddock Borough', 'Braddock Hills Borough',...",[],['Mon Valley via Forest Hills'],['Downtown (Pittsburgh)'],['Airport Corridor / Parkway West'],[],[],['Multiple trips per day'],['Jitney'],['Public transit is not available where I am t...,['35-44 years old'],['Retired'],['Rankin Borough'],['Man'],['White'],['']
4,009a7c74-36df-485b-b9ab-4d89d73ae94c,107.0.42.235,2019-08-23T12:16:01,['McKeesport and the Mon Valley'],"['Duquesne', 'Homestead Borough', 'Munhall Bor...","['RIDC Riverplace / City Center of Duquesne', ...",['Mon Valley via Homestead (61C Corridor)'],[],[],[],['2'],"['Rarely, maybe a couple times a year']","['Carpool', 'Private car']",['Too slow'],['25-34 years old'],['Full-time employment'],['West Mifflin Borough'],['Woman'],['White'],['']


In [90]:
county_gis = pd.read_csv('../Data/Allegheny_County_Magisterial_Districts_Outlines_2015.csv')
county_gis.head(3)

Unnamed: 0,OBJECTID,NAME,TYPE,LABEL,COG,SCHOOLD,CONGDIST,FIPS,REGION,ACRES,...,YEARCONVERTED,GlobalID,Magisterial_District,Judge,Address,Unit,City,State,Zip,Coverage
0,1,EAST DEER,TOWNSHIP,East Deer Township,Allegheny Valley North,Deer Lakes,4,21024,NH,1613.701782,...,1968,814ea69d-3400-4438-9200-318abece9b2e,Magisterial District 05-2-05,Honorable Carolyn S. Bengel,1010 Broadview Boulevard,,Brackenridge,PA,15014,East Deer
1,2,SPRINGDALE,TOWNSHIP,Springdale Township,Allegheny Valley North,Allegheny Valley,18,72968,NH,1553.413696,...,1968,132d931c-f4d5-4322-bbb8-acb7b60a8e0d,Magisterial District 05-3-03,Honorable David J. Sosovicka,721 Gulf Lab Road,,Cheswick,PA,15024,Springdale
2,3,SPRINGDALE,BOROUGH,Springdale Borough,Allegheny Valley North,Allegheny Valley,18,72960,NH,698.273926,...,1967,ab45a710-6b64-41d9-a760-83d8ce4d2a31,Magisterial District 05-3-03,Honorable David J. Sosovicka,721 Gulf Lab Road,,Cheswick,PA,15024,Springdale


In [80]:
raw_data.shape

(518, 20)

Data are not in First Normal Form (there are multiple values per cell in some cases). Will have to write a function to move data into that form.

In [8]:
# Two columns that'll be used

col_indeces = [4, 16]
raw_data.columns[col_indeces]

Index(['4 - Communities of interest', '13 - Where From'], dtype='object')

In [79]:
data = raw_data.replace('(\',)|(\",)', '|',regex=True)
data = data.replace('[\[\]\"\']', '',regex=True)
data.head(3)

Unnamed: 0,id,addr,timestamp,3 - Area of interest,4 - Communities of interest,5 - Destinations,6 - Corridor preference,7 - Pittsburgh areas,8 - Allegheny County areas,9 - PAAC Routes Used,10 - Routing Type Preference,11 - Current Transit Usage,12 - Other Transport Used,12 - Why Other Transport Used,13 - Age,13 - Employment status,13 - Where From,14 - Gender,14 - Race,15 - Comments
0,426926b2-be9a-4d8e-8aa6-423714dee82f,172.58.43.140,2019-08-03T18:35:41,McKeesport and the Mon Valley| Monroeville and...,McKeesport| Monroeville,Aldi Store #97| Centers for Rehab Services For...,Mon Valley via Westinghouse Bridge,Oakland (Pittsburgh),,P1 - East Busway - All Stops,2,"Rarely, maybe a couple times a year",Private car,Transit isnt reliable to get me where Im going...,65-74 years old,Full-time employment,Regent Square (Pittsburgh),Man,Black or African American,
1,250af43d-f6db-41fe-a55b-2c99398d308b,174.242.0.246,2019-09-21T15:57:02,McKeesport and the Mon Valley| Monroeville and...,Braddock Borough| Braddock Hills Borough| Duqu...,Barrett El Sch| Beulah Church| Homestead Apart...,Monroeville via 376 (P67 Corridor),Downtown (Pittsburgh)| East End neighborhoods ...,Airport Corridor / Parkway West| North Hills /...,14 - Ohio Valley| 21 - Coraopolis| 28X - Airpo...,,Several times a week,Private car| Uber/Lyft,Transit doesnt come often enough| Too slow| I ...,35-44 years old,Part-time employment,Squirrel Hill South (Pittsburgh),Woman,White,Pittsburgh bus sytem is the worst. unreliable ...
2,aea94a48-3639-4c7c-8f93-9c05c46282ea,173.136.2.46,2019-07-04T15:29:59,Monroeville and Eastern Suburbs,Chalfant Borough| Edgewood Borough| Monroevill...,Braddock Carnegie Library| C C Mellor Memorial...,Monroeville via 376 (P67 Corridor),Oakland (Pittsburgh),East Hills,69 - Trafford,"7-Slower buses with windy routes, but with les...",Several times a week,Private car,Transit doesnt come often enough| No transit s...,35-44 years old,Part-time employment (more than one employer),Forest Hills Borough,Woman,White,


In [9]:
def firstNormal(dFrame, col_indeces = []):
    '''
    Function that takes in a dataframe and returns a the same information as a dataframe in 
    first normal form (each cell contains only one piece of data). Parameter col_indeces is 
    a list of column indeces to include when making pairs. 
    If not included, all possible combinations are too large.
    '''
    
    nrows = dFrame.shape[0]
    
    if col_indeces == []:
        cols = dFrame.columns
    else:
        cols = dFrame.columns[col_indeces]
        
    returnDF = pd.DataFrame(columns = cols)
        
    
    for i in range(nrows):
        row_lst = []
        
        for c in cols:

            cell_list = [x.strip("[]\"' ") for x in dFrame[c].loc[i].split("', ")]
            row_lst.append(cell_list)
            
        NF1_rows = pd.DataFrame.from_records(list(itertools.product(*row_lst)), columns=cols)
        
        returnDF = returnDF.append(NF1_rows)
    
    return returnDF.reset_index(drop = True)


In [65]:
data['12 - Why Other Transport Used'].str.get_dummies("',")

Unnamed: 0,"""Transit doesn't come often enough","""Transit doesn't come often enough"", ""Transit isn't reliable to get me where I'm going on time","""Transit doesn't come often enough"", 'I usually need to carry too many things","""Transit doesn't come often enough"", 'No transit service at the times I need it","""Transit doesn't come often enough"", 'Other","""Transit doesn't come often enough"", 'The stops/stations are hard to get to because of bad or no sidewalks","""Transit doesn't come often enough"", 'Too expensive","""Transit doesn't come often enough"", 'Too slow","""Transit isn't reliable to get me where I'm going on time","""Transit isn't reliable to get me where I'm going on time"", 'I only use public transit",...,Transit doesn't come often enough,"Transit doesn't come often enough"", ""Transit isn't reliable to get me where I'm going on time","Transit doesn't come often enough"", 'I usually need to carry too many things","Transit doesn't come often enough"", 'No transit service at the times I need it","Transit doesn't come often enough"", 'Other","Transit doesn't come often enough"", 'The stops/stations are hard to get to because of bad or no sidewalks","Transit doesn't come often enough"", 'Too expensive","Transit doesn't come often enough"", 'Too slow",Transit isn't reliable to get me where I'm going on time,"Transit isn't reliable to get me where I'm going on time"", 'Other"
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
514,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
515,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
516,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [83]:
def oneHot(dFrame, col):
    '''
    Fumnction that takes in a dataframe and returns the information where all categories 
    are one-hot encoded.
    '''
    print('{0} col has {1} unique vals'.format(col, dFrame[col].nunique()))
    onehot = data[col].str.get_dummies("|")
    newframe = pd.concat([dFrame, onehot],axis=1)
    return(newframe)

In [84]:
oneHot(data, '13 - Where From')

13 - Where From col has 103 unique vals


Unnamed: 0,id,addr,timestamp,3 - Area of interest,4 - Communities of interest,5 - Destinations,6 - Corridor preference,7 - Pittsburgh areas,8 - Allegheny County areas,9 - PAAC Routes Used,...,White Oak Borough,Whitehall Borough,Wilkins Township,Wilkinsburg Borough,Wilmerding Borough,mck,mon valley,portVue,southside,westend
0,426926b2-be9a-4d8e-8aa6-423714dee82f,172.58.43.140,2019-08-03T18:35:41,McKeesport and the Mon Valley| Monroeville and...,McKeesport| Monroeville,Aldi Store #97| Centers for Rehab Services For...,Mon Valley via Westinghouse Bridge,Oakland (Pittsburgh),,P1 - East Busway - All Stops,...,0,0,0,0,0,0,0,0,0,0
1,250af43d-f6db-41fe-a55b-2c99398d308b,174.242.0.246,2019-09-21T15:57:02,McKeesport and the Mon Valley| Monroeville and...,Braddock Borough| Braddock Hills Borough| Duqu...,Barrett El Sch| Beulah Church| Homestead Apart...,Monroeville via 376 (P67 Corridor),Downtown (Pittsburgh)| East End neighborhoods ...,Airport Corridor / Parkway West| North Hills /...,14 - Ohio Valley| 21 - Coraopolis| 28X - Airpo...,...,0,0,0,0,0,0,0,0,0,0
2,aea94a48-3639-4c7c-8f93-9c05c46282ea,173.136.2.46,2019-07-04T15:29:59,Monroeville and Eastern Suburbs,Chalfant Borough| Edgewood Borough| Monroevill...,Braddock Carnegie Library| C C Mellor Memorial...,Monroeville via 376 (P67 Corridor),Oakland (Pittsburgh),East Hills,69 - Trafford,...,0,0,0,0,0,0,0,0,0,0
3,da2837dc-2047-4d7d-9bb0-82547feee7b4,174.242.35.95,2019-07-30T23:06:44,McKeesport and the Mon Valley,Braddock Borough| Braddock Hills Borough| Chal...,,Mon Valley via Forest Hills,Downtown (Pittsburgh),Airport Corridor / Parkway West,,...,0,0,0,0,0,0,0,0,0,0
4,009a7c74-36df-485b-b9ab-4d89d73ae94c,107.0.42.235,2019-08-23T12:16:01,McKeesport and the Mon Valley,Duquesne| Homestead Borough| Munhall Borough| ...,RIDC Riverplace / City Center of Duquesne| The...,Mon Valley via Homestead (61C Corridor),,,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513,5e3d92f8-f40e-41e7-a7cc-1bd4934a51d3,130.49.204.177,2019-06-27T16:00:43,McKeesport and the Mon Valley| Monroeville and...,Braddock Hills Borough| Churchill Borough| Eas...,Aldi #56| Braddock Hills Shopping Center| Kuhn...,Monroeville via Braddock & Turtle Creek,Mon Valley neighborhoods (Pittsburgh)| Oakland...,,61A - Swissvale| 61D - Murray| 67 - Monroevill...,...,0,0,0,0,0,0,0,0,0,0
514,0f5dfb9f-1f27-436d-9354-043a1d630f01,172.58.205.239,2019-07-07T18:56:13,Monroeville and Eastern Suburbs,Churchill Borough| East Pittsburgh Borough| Mo...,Aldi| GetGo #3060| Giant Eagle,Monroeville via 376 (P67 Corridor),Downtown (Pittsburgh)| East End neighborhoods ...,East Hills,67 - Monroeville| 77 - Penn Hills| P1 - East B...,...,0,0,0,1,0,0,0,0,0,0
515,8cd72fa2-1632-49dc-aa7c-5cc3764a7e5e,96.66.171.89,2019-09-26T19:34:16,McKeesport and the Mon Valley| Monroeville and...,Braddock Hills Borough| Churchill Borough| Eas...,Braddock Farms (Grow Pittsburgh)| HOMESTEAD UP...,Mon Valley via Westinghouse Bridge,Downtown (Pittsburgh)| East End neighborhoods ...,Airport Corridor / Parkway West| East Hills| U...,53 - Homestead Park| 59 - Mon Valley| 61A - Sw...,...,0,0,0,0,0,0,0,0,0,0
516,e48c446e-8c3a-4fb6-b14a-6841c87b0a72,128.237.212.143,2019-06-26T15:57:18,McKeesport and the Mon Valley,Braddock Borough| East Pittsburgh Borough| Hom...,Aldi #56| Braddock Carnegie Library| General B...,Mon Valley via Keystone Commons,Downtown (Pittsburgh)| East End neighborhoods ...,Airport Corridor / Parkway West| North Hills /...,1 - Freeport Road| 28X - Airport Flyer| 61A - ...,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Using firstNormal function to get From -> To data pairs and renaming columns

from_to = firstNormal(raw_data, col_indeces=[16,4])
from_to.columns = ['From', 'To']
from_to.head()

In [None]:
def aggregateFromTo(dFrame, from_col = 'From', to_col = 'To'):
    '''
    Function that takes in a dataframe of from -> to preferences and returns aggregated counts 
    of all from -> to pairs.
    '''
    
    # First group by and use size for aggregations
    from_to_agg = dFrame.groupby([from_col, to_col]).size().reset_index()
    from_to_agg.columns = ['From', 'To', 'Count']
    
    # Remove any rows where From and To are the same
    from_to_agg = from_to_agg[from_to_agg['From'] != from_to_agg['To']]
    
    return from_to_agg

In [None]:
# Getting the counts of from -> to pairs
agg_from_to = aggregateFromTo(from_to).sort_values(by = 'Count', ascending = False)
agg_from_to

In [None]:
# Weird entry going on in the data, just will remove it for now
from_to_id = agg_from_to[(agg_from_to['From'] != 'mck') & (agg_from_to['From'] != '2434 south braddock ave')]

In [None]:
# Giving unique indeces to each origin and destination
from_to_id['from_id'] = pd.factorize(from_to_id.From)[0]
from_to_id['to_id'] = pd.factorize(from_to_id.To)[0]
from_to_id['to_id'] = from_to_id['to_id'].apply(lambda x: x + 1 + max(from_to_id['from_id']))
from_to_id.head()

In [None]:
# Because the Sankey package takes data in a weird format for labeling, have to do a few more transforms
from_to_id = from_to_id.sort_values(by = ["from_id", "to_id"])
labels = np.append(from_to_id.From.unique(), from_to_id.To.unique())

n_blank = from_to_id.shape[0] - len(labels)
print(n_blank)
labels = np.append(labels, [""] * n_blank)
from_to_id['Label'] = labels
from_to_id

In [None]:
# List of all from_ids

from_to_id.from_id.unique()

In [None]:
# List of all to_ids

np.sort(from_to_id.to_id.unique())

In [None]:
# Creating a data frame just with nodes for our Sankey plot
# Need ID & Label
from_nodes = from_to_id[['from_id', 'From']].drop_duplicates()
from_nodes.columns = ['Node', "Label"]

to_nodes = from_to_id[['to_id', 'To']].drop_duplicates()
to_nodes.columns = ['Node', 'Label']

nodes_df = from_nodes.append(to_nodes).reset_index(drop = True).sort_values(by = "Node")
nodes_df

In [None]:
# Creating a data frame with just links for Sankey plot
links_df = from_to_id[['from_id', 'to_id', 'Count']]
links_df.columns = ['Source', 'Target', 'Value']
links_df = links_df.sort_values(by = ['Source', 'Target'])
links_df

In [None]:
data_trace = dict(
    type='sankey',
#     domain = dict(
#       x =  [0,1],
#       y =  [0,1]
#     ),
    orientation = "h",
    valueformat = ".0f",
    node = dict(
      pad = 10,
      thickness = 30,
      line = dict(
        color = "black",
        width = 0
      ),
      label =  nodes_df['Label'].dropna(axis=0, how='any'),

    ),
    link = dict(
      source = links_df['Source'].dropna(axis=0, how='any'),
      target = links_df['Target'].dropna(axis=0, how='any'),
      value = links_df['Value'].dropna(axis=0, how='any')
  )
)


fig = dict(data=[data_trace])
iplot(fig, validate=False)

In [None]:
from_to_id[(from_to_id.From == "Penn Hills Township") & (from_to_id.To == "Edgewood Borough")]