# Segmenting and Clustering Neighborhoods in Toronto

In [65]:
import pandas as pd
import numpy as np

###### Using read_html from pandas to read in tables from given html

In [129]:
toronto_raw = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

###### create a data frame from first row

In [130]:
toronto = pd.DataFrame(toronto_raw[0])
toronto.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [131]:
toronto.shape

(180, 3)

In [132]:
toronto["Borough"].unique()

array(['Not assigned', 'North York', 'Downtown Toronto', 'Etobicoke',
       'Scarborough', 'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

In [134]:
toronto = toronto[toronto["Borough"] != "Not assigned"]
toronto.head(20)

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,Malvern / Rouge
11,M3B,North York,Don Mills
12,M4B,East York,Parkview Hill / Woodbine Gardens
13,M5B,Downtown Toronto,"Garden District, Ryerson"


###### there is no cell that has a borough but a Not assigned neighborhood

In [136]:
len(toronto["Neighborhood"][toronto["Neighborhood"] == "Not assigned"])

0

In [138]:
toronto.shape

(103, 3)

###### In order to combine the value of "Neighborhood" that have the same Postal code, I will use .groupby() and join value, separating them by a comma, and overwrite the "toronto" dataframe with the new dataframe where "Neighborhood"s are listed in one row. Then I'll reset the index

In [139]:
toronto = pd.DataFrame(
    toronto.groupby(["Postal code","Borough"])["Neighborhood"].apply(lambda hoods: ", ".join(hoods)))
toronto.reset_index(inplace=True)
toronto.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [150]:
toronto.shape

(103, 3)

###### Get Geospatial Data

In [144]:
url="http://cocl.us/Geospatial_data"
lls = pd.read_csv(url)
lls.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


###### Merge Geo Data with Wiki List

In [152]:
toronto1 = toronto.merge(lls, how='left', left_on='Postal code', right_on='Postal Code').drop('Postal Code', axis = 1)
toronto1.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [154]:
toronto1.shape

(103, 5)

###### Cluster Boroughs

In [157]:
from sklearn.cluster import KMeans
k_means = KMeans(init="k-means++", n_clusters=4, n_init=12)
k_means.fit(toronto1[['Latitude', 'Longitude']])

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=4, n_init=12, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

###### The clusters and corresponding colors are added to the data frame

In [160]:
toronto1['Cluster'] = k_means.labels_
toronto1['Color'] = 'color'
for i in range(0,toronto1.Cluster.count()):
    if toronto1.Cluster[i] == 0:
        toronto1.loc[i:, 'Color'] = 'red'
    elif toronto1.Cluster[i] == 1:
        toronto1.loc[i:, 'Color'] = 'blue'
    elif toronto1.Cluster[i] == 2:
        toronto1.loc[i:, 'Color'] = 'green'
    else:
        toronto1.loc[i:, 'Color'] = 'yellow'
toronto1.head(30)

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude,Cluster,Color
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353,1,blue
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497,1,blue
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711,1,blue
3,M1G,Scarborough,Woburn,43.770992,-79.216917,1,blue
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,1,blue
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476,1,blue
6,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park,43.727929,-79.262029,1,blue
7,M1L,Scarborough,Golden Mile / Clairlea / Oakridge,43.711112,-79.284577,1,blue
8,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West,43.716316,-79.239476,1,blue
9,M1N,Scarborough,Birch Cliff / Cliffside West,43.692657,-79.264848,1,blue


###### Map the Data
Install and import folium

In [None]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium

print('Folium installed and imported!')

Solving environment: | 

In [None]:
# San Francisco latitude and longitude values
latitude = 43.6532
longitude = -79.3832

# create map and display it
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=12)

In [164]:
# instantiate a feature group for the incidents in the dataframe
boroughs = folium.map.FeatureGroup()

# loop through the 100 crimes and add each to the incidents feature group
for lat, long, color in zip(toronto1.Latitude, toronto1.Longitude, toronto1.Color):
    boroughs.add_child(
        folium.features.CircleMarker(
            [lat, long],
            radius=5, # define how big you want the circle markers to be
            color = color,
        )
    )

# add incidents to map
toronto_map.add_child(boroughs)

In [None]:
from IPython.display import HTML, display

width, height = (400,500) # width and height of the displayed iFrame, in pixels

srcdoc = toronto_map._repr_html_().replace('"', '&quot;')
embed = HTML('<iframe srcdoc="{}" '
             'style="width: {}px; height: {}px; display:block; width: 50%; margin: 0 auto; '
             'border: none"></iframe>'.format(srcdoc, width, height))
embed