# Exploring and Visualizing Beyond the East Busway Data
## Matthew Samach

## Reading in Data Using Pandas

In [4]:
import pandas as pd
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import numpy as np
import warnings
import itertools
import helpers
from fuzzywuzzy import process
from fuzzywuzzy import fuzz
import re

warnings.filterwarnings('ignore')

%matplotlib inline





In [5]:
# Load the raw survey responses; the file is decoded as ISO-8859-1 instead of pandas' UTF-8 default.
# NOTE(review): the saved one-row preview includes the `addr` column, which holds an
# IP-address-like value — confirm that is acceptable to publish in an "anonymous" dataset.
raw_data = pd.read_csv("../Data/responses_anonymous.csv", encoding = "ISO-8859-1")
# Preview one row to check the column layout.
raw_data.head(1)

Unnamed: 0,id,addr,timestamp,3 - Area of interest,4 - Communities of interest,5 - Destinations,6 - Corridor preference,7 - Pittsburgh areas,8 - Allegheny County areas,9 - PAAC Routes Used,10 - Routing Type Preference,11 - Current Transit Usage,12 - Other Transport Used,12 - Why Other Transport Used,13 - Age,13 - Employment status,13 - Where From,14 - Gender,14 - Race,15 - Comments
0,426926b2-be9a-4d8e-8aa6-423714dee82f,172.58.43.140,2019-08-03T18:35:41,"['McKeesport and the Mon Valley', 'Monroeville...","['McKeesport', 'Monroeville']","['Aldi Store #97', 'Centers for Rehab Services...",['Mon Valley via Westinghouse Bridge'],['Oakland (Pittsburgh)'],[],['P1 - East Busway - All Stops'],['2'],"['Rarely, maybe a couple times a year']",['Private car'],"[""Transit isn't reliable to get me where I'm g...",['65-74 years old'],['Full-time employment'],['Regent Square (Pittsburgh)'],['Man'],['Black or African American'],['']


In [6]:
# Load the Allegheny County municipal boundary attribute table
# (one row per municipality; NAME, TYPE and LABEL are used for matching further down).
county_gis = pd.read_csv('../Data/Allegheny_County_Municipal_Boundaries.csv')
# Preview one row to check the column layout.
county_gis.head(1)

Unnamed: 0,OBJECTID,NAME,TYPE,LABEL,COG,SCHOOLD,CONGDIST,FIPS,REGION,ACRES,SQMI,MUNICODE,CNTL_ID,CNTYCOUNCIL,EOC,ASSESSORTERRITORY,VALUATIONAREA,YEARCONVERTED,GlobalID
0,1,CHESWICK,BOROUGH,Cheswick Borough,Allegheny Valley North,Allegheny Valley,4,13392,NH,350.191284,0.547174,815,3100,7,NEWCOM,East,Alle-Kiski Valley,1966,f29648dc-0d4f-4e35-8f2d-7b465dcff308


In [7]:
# Load the Pittsburgh neighborhoods table.
pgh_gis = pd.read_csv('../Data/pgh_neighborhoods.csv')
# Show the `hood` column, which holds the neighborhood names.
pgh_gis['hood']

0          Central Oakland
1            North Oakland
2             West Oakland
3         Crawford-Roberts
4           Strip District
              ...         
85               Ridgemont
86                West End
87    California-Kirkbride
88              Glen Hazel
89             Perry North
Name: hood, Length: 90, dtype: object

In [8]:
# Load the long-format ("pivot") version of the survey: one row per
# (id, timestamp, question, answer), decoded as ISO-8859-1.
pivot = pd.read_csv("../Data/pivot_anon_source.csv", encoding = "ISO-8859-1")
# Preview two rows to check the column layout.
pivot.head(2)

Unnamed: 0,id,timestamp,question,answer
0,009a7c74-36df-485b-b9ab-4d89d73ae94c,2019-08-23T12:16:01,3 - Area of interest,McKeesport and the Mon Valley
1,009a7c74-36df-485b-b9ab-4d89d73ae94c,2019-08-23T12:16:01,4 - Communities of interest,Duquesne


The data are not in First Normal Form: some cells hold multiple values. A function is needed to move the data into that form.

In [9]:
# Clean the raw responses with the project-local helper.
# NOTE(review): the saved output of this cell is `IndexError: list index out of range`
# and every later cell has no execution count, so presumably nothing downstream ran in
# the saved session — confirm helpers.clean works on this input before relying on `data`.
data = helpers.clean(raw_data)
data.head(1)

IndexError: list index out of range

In [None]:
# Presumably one-hot encodes the '13 - Where From' column of the cleaned data —
# TODO confirm against helpers.oneHot, which is not visible here.
whereHot = helpers.oneHot(dFrame=data, col='13 - Where From')
whereHot.head(1)

In [None]:
# Maps an integer to a survey column label. The integers are used as positions into
# raw_data.columns in the following cell, which is why 13 and 16 do not equal the
# question numbers embedded in their labels.
# NOTE(review): presumably these are the columns that can hold several values per cell — confirm.
mult_col_dict = {3 : '3 - Area of interest', 
                4 : '4 - Communities of interest', 
                5 : '5 - Destinations', 
                7 : '7 - Pittsburgh areas', 
                8 : '8 - Allegheny County areas', 
                9 : '9 - PAAC Routes Used', 
                10 : '10 - Routing Type Preference', 
                12 : '12 - Other Transport Used', 
                13 : '12 - Why Other Transport Used', 
                16 : '13 - Where From'}

In [None]:
# Look up the raw_data column labels at the positions used as keys of mult_col_dict
# (presumably a sanity check that they agree with the dict's values).
raw_data.columns[list(mult_col_dict.keys())]

In [None]:
def find_unique_loc(dFrame, col):
    '''
    Collect the distinct values found in a "|"-delimited text column.

    Parameters
    ----------
    dFrame : pandas.DataFrame
        Frame containing the column to scan.
    col : str
        Name of the column whose cells hold one or more values separated by "|".

    Returns
    -------
    set of str
        Every distinct value, with surrounding whitespace stripped. An empty
        cell contributes the empty string, as before.
    '''
    locs = set()
    # Iterate over the values themselves instead of .loc[0..n-1]: label-based
    # lookup breaks as soon as the frame has a non-default index (for example
    # after filtering rows).
    for cell in dFrame[col]:
        # Missing answers (NaN/None) are not strings and have nothing to split.
        if not isinstance(cell, str):
            continue
        locs.update(part.strip() for part in cell.split("|"))
    return locs

In [None]:
# Distinct respondent origins, split out of the "|"-delimited '13 - Where From' cells.
from_loc = find_unique_loc(data, '13 - Where From')
# Any origin mentioning "Pittsburgh" (case-insensitive) is counted as a city location.
from_pgh_loc = {x for x in from_loc if re.search('Pittsburgh', x, re.IGNORECASE)}
# 'southside' and 'westend' are presumably Pittsburgh answers that never say "Pittsburgh".
# Intersect with from_loc so the count printed below only reflects values that
# actually occur in the data.
from_pgh_loc.update({'southside', 'westend'} & from_loc)
from_out_loc = from_loc - from_pgh_loc
print('{0} PGH neighborhoods, {1} non-PGH municipalities'.format(len(from_pgh_loc), len(from_out_loc)))

In [None]:
def countyMatcher(placelist, gisTown, gisType, gisAll):
    '''
    Match free-text place names against county GIS municipality records.

    Each place is resolved by the first rule that applies:
      1. A place containing "Pittsburgh" (case-insensitive) maps to the city.
      2. A place that is an exact key of the hand-curated ``unusual_match``
         table maps to that entry.
      3. Otherwise the place is fuzzy-matched (``fuzz.ratio``) against
         ``gisTown`` and ``gisAll``; if it names a municipality type
         (Township/Borough/Municipality/City, any case) that type is
         fuzzy-matched against ``gisType``.

    Parameters
    ----------
    placelist : iterable of str
        Place names to resolve.
    gisTown : collection of str
        Candidate municipality names (e.g. the GIS NAME column).
    gisType : collection of str
        Candidate municipality types (e.g. the GIS TYPE column).
    gisAll : collection of str
        Candidate full labels (e.g. the GIS LABEL column).

    Returns
    -------
    pandas.DataFrame
        One row per place with columns from_ppt, from_LabelGIS, labelScore,
        from_NameGIS, nameScore, from_TypeGIS, typeScore. Curated matches get
        a score of 100. The columns are present even when placelist is empty.
    '''
    columns = ["from_ppt", "from_LabelGIS", "labelScore",
               "from_NameGIS", "nameScore", "from_TypeGIS", "typeScore"]

    # Curated (NAME, LABEL, TYPE) answers for inputs that fuzzy matching gets wrong.
    unusual_match = {'2434 south braddock ave': ('SWISSVALE', 'Swissvale Borough', 'BOROUGH'),
                     'mck': ('MCKEESPORT','McKeesport', 'CITY'),
                     'mon valley': ('','',''),
                     'Wexford': ('PINE', 'Pine Township', 'TOWNSHIP'),
                     'Bethel Park Borough': ('BETHEL PARK', 'Bethel Park Municipality', 'MUNICIPALI'),
                     'Pittsburgh': ('PITTSBURGH', 'Pittsburgh', 'CITY')}

    # The flag must be given at compile time: the second positional argument of
    # a compiled pattern's search() is `pos`, so passing re.IGNORECASE there
    # silently skipped the first two characters and stayed case-sensitive.
    p = re.compile("(Township|Borough|Municipality|City)", re.IGNORECASE)

    # List of dicts for easy dataframe creation
    dict_list = []
    for place in placelist:
        if re.search('Pittsburgh', place, re.IGNORECASE):
            curated = unusual_match['Pittsburgh']
        elif place in unusual_match:
            curated = unusual_match[place]
        else:
            curated = None

        if curated is not None:
            # Curated entries are treated as perfect matches.
            mName, aMatch, tMatch = ((value, 100) for value in curated)
        else:
            type_result = p.search(place)
            if type_result:
                muni_type = type_result.group(1)
                tMatch = process.extractOne(muni_type, gisType, scorer=fuzz.ratio)
            else:
                tMatch = ('', 100)
            # No score cutoff: the best candidate is always returned, however weak.
            mName = process.extractOne(place, gisTown, scorer=fuzz.ratio)
            aMatch = process.extractOne(place, gisAll, scorer=fuzz.ratio)

        dict_list.append({"from_ppt": place,
                          "from_LabelGIS": aMatch[0],
                          "labelScore": aMatch[1],
                          "from_NameGIS": mName[0],
                          "nameScore": mName[1],
                          "from_TypeGIS": tMatch[0],
                          "typeScore": tMatch[1]})

    # Explicit columns keep the schema stable when placelist is empty.
    matches_all = pd.DataFrame(dict_list, columns=columns)
    return matches_all

In [None]:
def add_county_cat(dFrame, col, countyDF, prefix):
    '''
    Enrich a cleaned survey frame with county-level GIS attributes.

    The distinct values of ``col`` are matched to GIS labels via countyMatcher,
    joined back onto ``dFrame`` and then joined to ``countyDF`` on LABEL. The
    GIS columns that are kept (LABEL, COG, FIPS, MUNICODE, OBJECTID) are renamed
    with ``prefix`` prepended. The number of distinct prefixed COG values is
    printed as a side effect.

    Returns the enriched dataframe.
    '''
    gis_fields = ['NAME', 'LABEL', 'TYPE', 'COG', 'FIPS', 'MUNICODE', 'OBJECTID']
    kept_fields = ('COG', 'FIPS', 'LABEL', 'MUNICODE', 'OBJECTID')
    helper_fields = ['from_ppt', 'NAME', 'from_LabelGIS', 'TYPE']

    # Survey value -> best GIS label lookup table
    places = find_unique_loc(dFrame, col)
    lookup = countyMatcher(places, countyDF.NAME, countyDF.TYPE, countyDF.LABEL)

    # Step 1: attach the matched label to every survey row
    labelled = pd.merge(left=dFrame,
                        right=lookup[['from_ppt', 'from_LabelGIS']],
                        how='left',
                        left_on=[col],
                        right_on=['from_ppt'])

    # Step 2: attach the GIS attributes for that label
    enriched = pd.merge(left=labelled,
                        right=countyDF[gis_fields],
                        left_on=['from_LabelGIS'],
                        right_on=['LABEL'])

    # Step 3: discard join helpers and prefix the surviving GIS columns
    renames = {field: '{0}{1}'.format(prefix, field) for field in kept_fields}
    dataLoc = enriched.drop(columns=helper_fields).rename(columns=renames)

    region_count = dataLoc['{0}COG'.format(prefix)].nunique()
    print('{0} unique regions'.format(region_count))
    return dataLoc

In [None]:
# Match every distinct '13 - Where From' value against the county GIS NAME, TYPE and LABEL columns.
# NOTE(review): this calls helpers.countyMatcher, not the countyMatcher defined in this
# notebook — confirm the two implementations agree.
fromCounty = helpers.countyMatcher(from_loc, county_gis.NAME, county_gis.TYPE, county_gis.LABEL)
fromCounty

In [None]:
# Inspect a subset of the county GIS columns, including the ones merged onto the survey data below.
county_gis[['NAME', 'LABEL', 'TYPE', 'COG', 'FIPS', 'MUNICODE', 'OBJECTID', 'REGION', 'SQMI', 'ACRES']]

In [None]:
# Step 1: attach the matched GIS label to every survey row (left join keeps all respondents).
dataLoc = pd.merge(data, fromCounty[['from_ppt','from_LabelGIS']],  how='left', left_on=['13 - Where From'], right_on = ['from_ppt'])
print(dataLoc.columns)
# Step 2: attach the county attributes for that label. No `how` is given, so pandas'
# default inner join applies and rows whose label has no LABEL match are dropped.
dataLoc = pd.merge(dataLoc,
                    county_gis[['NAME', 'LABEL', 'TYPE', 'COG', 'FIPS', 'MUNICODE', 'OBJECTID']],
                    left_on=['from_LabelGIS'], right_on = ['LABEL'], suffixes = ('', '_from'))
print(dataLoc.columns)
# Only the last expression of a cell is displayed, so a bare expression here would be
# evaluated and silently discarded; print the spot-check explicitly.
print(dataLoc[['13 - Where From', 'from_LabelGIS', 'COG']].head())
# Step 3: drop the join helpers and give the kept GIS columns a 'from_' prefix.
dataLoc = dataLoc.drop(['from_ppt', 'NAME', 'from_LabelGIS', 'TYPE'], axis=1)
dataLoc = dataLoc.rename(columns={'COG': 'from_COG', 'FIPS': 'from_FIPS',
                                  'LABEL': 'from_LABEL',
                                  'MUNICODE': 'from_MUNICODE',
                                  'OBJECTID': 'from_OBJECTID'})
# Report the region count (it was previously computed and thrown away).
print('{0} unique regions'.format(dataLoc['from_COG'].nunique()))
print(dataLoc.columns)
# Persist the enriched survey data for later use.
dataLoc.to_csv('../Data/survey_fromclean.csv')

In [None]:
# Build and render a Sankey diagram titled "Origins to Communities of Interest".
# NOTE(review): 21 and 4 are presumably positional column indices into dataLoc
# (origin and community-of-interest columns); they depend on the exact column order
# produced upstream — confirm against helpers.sankeyFormat.
loc_format = helpers.sankeyFormat(dataLoc, col_indeces = [21,4])
loc_sankey = helpers.drawSankey(loc_format, 
                          title= "Origins to Communities of Interest")
iplot(loc_sankey, validate = False)