## Initial Data Cleaning

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from ast import literal_eval as make_tuple

In [2]:
# Load the CTA average weekday bus stop boardings CSV for October 2012.
stops_oct2012 = pd.read_csv('./datasets/CTA_-_Ridership_-_Avg._Weekday_Bus_Stop_Boardings_in_October_2012.csv')

In [3]:
# Preview the raw table.
stops_oct2012

Unnamed: 0,stop_id,on_street,cross_street,routes,boardings,alightings,month_beginning,daytype,location
0,1,JACKSON,AUSTIN,126,183.4,150.0,10/01/2012,Weekday,"(41.87632184, -87.77410482)"
1,2,JACKSON,MAYFIELD (EXTENDED),126,5.3,0.2,10/01/2012,Weekday,"(41.87706679, -87.77131794)"
2,3,JACKSON,MENARD,126,8.3,0.7,10/01/2012,Weekday,"(41.87695725, -87.76975039)"
3,4,JACKSON,5700 WEST,126,17.9,3.0,10/01/2012,Weekday,"(41.87702418, -87.76745055)"
4,6,JACKSON,LOTUS,126,74.0,11.2,10/01/2012,Weekday,"(41.876513, -87.761446)"
...,...,...,...,...,...,...,...,...,...
11588,17706,100TH STREET,PAXTON,"J14,15",3.6,49.5,10/01/2012,Weekday,"(41.71339537, -87.56963306)"
11589,17707,43RD STREET,OAKENWALD,43,21.0,69.1,10/01/2012,Weekday,"(41.81685612, -87.59757281)"
11590,17708,43RD STREET,LAKE PARK,43,24.8,0.9,10/01/2012,Weekday,"(41.81697313, -87.59910809)"
11591,17709,43RD STREET,BERKELEY,43,14.7,0.9,10/01/2012,Weekday,"(41.81695808, -87.60049861)"


In [4]:
# Count the missing values in each column.
stops_oct2012.isna().sum()

stop_id            0
on_street          0
cross_street       0
routes             2
boardings          0
alightings         0
month_beginning    0
daytype            0
location           0
dtype: int64

In [5]:
# Show the rows whose `routes` value is missing.
stops_oct2012[stops_oct2012['routes'].isna()]

Unnamed: 0,stop_id,on_street,cross_street,routes,boardings,alightings,month_beginning,daytype,location
6642,9267,BELMONT,KEELER,,62.4,35.2,10/01/2012,Weekday,"(41.93897, -87.73212)"
9211,12548,ADDISON,LAKE SHORE,,2.1,185.5,10/01/2012,Weekday,"(41.94853216, -87.64355208)"


In [6]:
# Fill in the two missing `routes` values, addressed by row label:
# 6642 is BELMONT & KEELER and 9211 is ADDISON & LAKE SHORE in the rows shown above.
# NOTE(review): the route numbers were presumably looked up by hand — confirm.
missing_routes = {6642: '77', 9211: '152'}
for row_label, route in missing_routes.items():
    stops_oct2012.loc[row_label, 'routes'] = route

In [7]:
# Parse each location string such as "(41.87632184, -87.77410482)" into a tuple of
# floats. Values that are no longer strings are passed through unchanged, so the
# cell can be re-run without literal_eval raising on already-parsed tuples.
stops_oct2012['location'] = stops_oct2012['location'].apply(
    lambda loc: make_tuple(loc) if isinstance(loc, str) else loc
)

In [8]:
# Drop the two columns and keep the result: `drop` returns a new frame, so without
# the assignment `stops_oct2012` would still contain them.
# errors='ignore' lets the cell be re-run after the columns are already gone.
stops_oct2012 = stops_oct2012.drop(columns=['month_beginning', 'daytype'], errors='ignore')
stops_oct2012

Unnamed: 0,stop_id,on_street,cross_street,routes,boardings,alightings,location
0,1,JACKSON,AUSTIN,126,183.4,150.0,"(41.87632184, -87.77410482)"
1,2,JACKSON,MAYFIELD (EXTENDED),126,5.3,0.2,"(41.87706679, -87.77131794)"
2,3,JACKSON,MENARD,126,8.3,0.7,"(41.87695725, -87.76975039)"
3,4,JACKSON,5700 WEST,126,17.9,3.0,"(41.87702418, -87.76745055)"
4,6,JACKSON,LOTUS,126,74.0,11.2,"(41.876513, -87.761446)"
...,...,...,...,...,...,...,...
11588,17706,100TH STREET,PAXTON,"J14,15",3.6,49.5,"(41.71339537, -87.56963306)"
11589,17707,43RD STREET,OAKENWALD,43,21.0,69.1,"(41.81685612, -87.59757281)"
11590,17708,43RD STREET,LAKE PARK,43,24.8,0.9,"(41.81697313, -87.59910809)"
11591,17709,43RD STREET,BERKELEY,43,14.7,0.9,"(41.81695808, -87.60049861)"


## Converting a kmz file into a csv

All the code from here through the `print(outstr)` statement was taken directly from this blog post: http://programmingadvent.blogspot.com/2013/06/kmzkml-file-parsing-with-python.html

In [9]:
from zipfile import ZipFile

filename = './datasets/CTA_BusStops.kmz'

# The KMZ file is opened as a zip archive and the KML document stored in it as
# 'doc.kml' is opened as a stream. Both handles stay open after this cell: `kml`
# is fed to the SAX parser and `kmz` is closed in a later cell.
kmz = ZipFile(filename)
kml = kmz.open('doc.kml')

In [10]:
import xml.sax, xml.sax.handler
class PlacemarkHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that collects the text of elements inside each <Placemark>.

    After parsing, ``mapping`` maps every placemark's ``<name>`` text to a dict of
    ``{element name: text}`` for the other elements closed inside that placemark.
    Text of an element name that occurs more than once is concatenated.

    Placemarks that share a name overwrite one another, because the name is the
    dictionary key. Elements closed before the placemark's ``<name>`` are held
    back and attached once the name is known; if the placemark never gets a name
    they are discarded.
    """

    def __init__(self):
        super().__init__()
        self.inName = False  # True while inside a <name> tag of a placemark
        self.inPlacemark = False  # True between <Placemark> and </Placemark>
        self.mapping = {}  # placemark name -> {element name: text}
        self.buffer = ""  # text accumulated since the last closing tag
        self.name_tag = ""  # name of the placemark currently being read
        self._named = False  # whether the current placemark's <name> has closed
        self._pending = {}  # fields closed before the current placemark's <name>

    def startElement(self, name, attributes):
        """Track entry into a <Placemark> and into a <name> nested in one."""
        if name == "Placemark":
            self.inPlacemark = True
            self.buffer = ""
            self._named = False
            self._pending = {}
        if self.inPlacemark and name == "name":
            self.inName = True

    def characters(self, data):
        """Accumulate character data, but only while inside a placemark."""
        if self.inPlacemark:
            self.buffer += data

    def endElement(self, name):
        """Store the buffered text under the element that just closed."""
        # Only newlines and tabs are trimmed here; spaces are kept for field values.
        text = self.buffer.strip('\n\t')
        self.buffer = ""

        if name == "Placemark":
            self.inPlacemark = False
            self.name_tag = ""  # clear current name
            self._named = False
            self._pending = {}
            return
        if not self.inPlacemark:
            return

        if name == "name":
            self.inName = False
            self.name_tag = text.strip()
            # Start this placemark's record with whatever fields were closed
            # before its name (normally none, which gives an empty dict).
            self.mapping[self.name_tag] = self._pending
            self._pending = {}
            self._named = True
            return

        # Until the name is known there is no key to file the field under, so it
        # goes to the pending dict instead of raising KeyError on the mapping.
        fields = self.mapping[self.name_tag] if self._named else self._pending
        if name in fields:
            fields[name] += text
        else:
            fields[name] = text

In [11]:
# Stream the KML document through PlacemarkHandler; the parsed placemarks end up in
# handler.mapping. The finally block releases both the KML stream and the KMZ
# archive even if parsing fails part-way.
parser = xml.sax.make_parser()
handler = PlacemarkHandler()
parser.setContentHandler(handler)
try:
    parser.parse(kml)
finally:
    kml.close()
    kmz.close()

In [12]:
def build_table(mapping):
    """Render parsed placemarks as comma-separated text.

    Args:
        mapping: dict of placemark name -> dict of element texts; every entry
            must contain a 'coordinates' key.

    Returns:
        A string starting with the header line 'Name,Coordinates', followed by one
        line per placemark made of the name, the coordinates text and a trailing
        separator. Placemarks with a 'LookAt' entry come first, then those with a
        'LineString' entry, then all others.

    Raises:
        KeyError: if a placemark has no 'coordinates' entry.
    """
    sep = ','
    header = 'Name' + sep + 'Coordinates\n'

    point_rows, line_rows, shape_rows = [], [], []
    for placemark_name, fields in mapping.items():
        row = placemark_name + sep + fields['coordinates'] + sep + "\n"
        if 'LookAt' in fields:  # points
            point_rows.append(row)
        elif 'LineString' in fields:  # lines
            line_rows.append(row)
        else:  # shapes
            shape_rows.append(row)

    return header + ''.join(point_rows + line_rows + shape_rows)

In [13]:
outstr = build_table(handler.mapping)
out_filename = filename[:-3] + "csv"  # swap the three-letter "kmz" extension for "csv"
# Write as UTF-8 (the encoding pd.read_csv assumes by default) and let the context
# manager close the file even if the write fails.
with open(out_filename, "w", encoding="utf-8") as f:
    f.write(outstr)
#print(outstr)

## Making an Interactive Map
Click on any stop to see its name

In [14]:
# Read back the CSV that was written from the KMZ placemarks.
df = pd.read_csv('./datasets/CTA_BusStops.csv')

In [15]:
# Move the index levels back into ordinary columns (level_0..level_2), give them
# meaningful names, and drop the 'Name' column.
# NOTE(review): presumably the CSV rows carry more fields than the two-column header,
# which is why name/longitude/latitude ended up in the index — confirm against the file.
df = (
    df.reset_index()
      .rename(columns={'level_0': 'name',
                       'level_1': 'longitude',
                       'level_2': 'latitude',
                       'Coordinates': 'coordinates'})
      .drop(columns=['Name'])
)

In [16]:
# Reorder the columns. The explicit copy makes `df` an independent frame, so the
# column assignments in later cells do not trigger pandas' SettingWithCopyWarning.
df = df[['name', 'latitude', 'longitude', 'coordinates']].copy()

In [17]:
# Build a "latitude, longitude" string for every row.
lat_text = df['latitude'].astype(str)
lon_text = df['longitude'].astype(str)
df['coordinates'] = lat_text + ', ' + lon_text

In [18]:
# Turn each "latitude, longitude" string into a (latitude, longitude) tuple of floats.
# Non-string values are passed through unchanged, so re-running this cell on its own
# does not make literal_eval raise on already-parsed tuples.
df['coordinates'] = df['coordinates'].apply(
    lambda value: make_tuple(value) if isinstance(value, str) else value
)

In [19]:
# Inspect the resulting table.
df

Unnamed: 0,name,latitude,longitude,coordinates
0,East River Rd & Carmen,41.971303,-87.846583,"(41.971302724, -87.84658291699998)"
1,Pavilion 5441 Building,41.978525,-87.842992,"(41.97852496899998, -87.84299182199999)"
2,Irving Park & Cumberland,41.951955,-87.836418,"(41.95195466000001, -87.83641751300001)"
3,Lawrence & Chester,41.966693,-87.838993,"(41.96669260300001, -87.838993233)"
4,Lawrence & Cumberland,41.966700,-87.836433,"(41.96669985300002, -87.83643297399999)"
...,...,...,...,...
6151,Buffalo & 89th Street,41.733970,-87.544433,"(41.73397, -87.54443300000003)"
6152,87th Street & Buffalo,41.737398,-87.544620,"(41.73739799999998, -87.54462000000001)"
6153,Buffalo & 88th Street,41.735802,-87.544468,"(41.73580200100002, -87.544468)"
6154,Avenue C & 107th Street,41.700960,-87.527025,"(41.70096037600001, -87.52702479099997)"


All the code below was taken directly from https://towardsdatascience.com/making-3-easy-maps-with-python-fb7dfb1036

In [20]:
import folium
import json
from folium import plugins

In [21]:
# GeoJSON is UTF-8 encoded (RFC 7946), so decode it explicitly instead of relying on
# the platform's default text encoding.
with open('./datasets/Boundaries - Neighborhoods.geojson', encoding='utf-8') as f:
    neighborhoods = json.load(f)

In [22]:
# Base map centred on [41.881832, -87.623177], with the neighborhood boundaries drawn on top.
# NOTE(review): 'Stamen Toner' may not be available in recent folium releases — confirm it still renders.
chi_map = folium.Map(location=[41.881832, -87.623177], tiles='Stamen Toner', zoom_start=11)
folium.GeoJson(neighborhoods).add_to(chi_map)

# One small red circle per row of `df`; the popup shows the value of its `name` column.
marker_style = dict(radius=3, weight=1, color='red', fill_color='red', fill_opacity=0.5)
for stop_name, lat, lon in zip(df['name'], df['latitude'], df['longitude']):
    folium.CircleMarker((lat, lon), popup=stop_name, **marker_style).add_to(chi_map)

chi_map.save('chi_point_map.html')