# Segmenting and Clustering Neighborhoods in Toronto

## 1. Obtain the Neighborhood information in Toronto.

### 1) Scrape the Wikipedia page to retrieve the information from a table.

In [73]:
# import modules
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata: done
Solving environment: / 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - defaults/osx-64::dask==0.19.1=py37_0
  - defaults/osx-64::anaconda==5.3.0=py37_0
  - defaults/osx-64::mkl_fft==1.0.4=py37h5d10147_1
  - defaults/osx-64::scikit-learn==0.19.2=py37h4f467ca_0
  - defaults/osx-64::bottleneck==1.2.1=py37h1d22016_1
  - defaults/osx-64::h5py==2.8.0=py37h878fce3_3
  - defaults/osx-64::pytest-arraydiff==0.2=py37h39e3cac_0
  - defaults/osx-64::bkcharts==0.2=py37_0
  - defaults/osx-64::pywavelets==1.0.0=py37h1d22016_0
  - defaults/osx-64::imageio==2.4.1=py37_0
  - defaults/osx-64::mkl_random==1.0.1=py37h5d10147_1
  - defaults/osx-64::numpy-base==1.15.1=py37h8a80b8c_0
  - defaults/osx-64::numba==0.39.0=py37h6440ff4_0
  - defaults/osx-64::odo==0.5.1=py37_0
  - defaults/osx-64::pytables==3.4.4=py37h13cba08_0
  - defaults/osx-64::datashape==0.5.4=py37_1
  - defaults/osx-64::scikit

In [53]:
# put the url into a variable called url
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [54]:
# grab content from the url
page = requests.get(url, verify = False)



In [55]:
page

<Response [200]>

In [57]:
soup = BeautifulSoup(page.text, 'html.parser')

In [58]:
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className=document.documentElement.className.replace(/(^|\s)client-nojs(\s|$)/,"$1client-js$2");RLCONF={"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":900271985,"wgRevisionId":900271985,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications in Ontario","Postal codes in Canada","Toronto","Ontario-related lists"],"wgBreakFrames":!1,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June",

In [59]:
#get the table
table = soup.find('table', attrs={'class':'wikitable sortable'})
table_rows = table.find_all('tr')

### 2) Transfer the information into a dataframe.

In [60]:
#transform the data from the table to a dataframe
l = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    l.append(row)
df = pd.DataFrame(l, columns=['PostalCode', 'Borough', 'Neighborhood'])
df.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned\n
2,M2A,Not assigned,Not assigned\n
3,M3A,North York,Parkwoods\n
4,M4A,North York,Victoria Village\n


### 3) Data cleaning.

In [61]:
df = df.replace('\n','', regex=True)
df.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [62]:
# ignore cells with a borough that is Not assigned
df = df[(df['Borough'].isnull() == False) & (df['Borough'] != 'Not assigned')]
df.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


In [63]:
# Merge all neighborhoods of the same postal code area
df = df.groupby('PostalCode').agg({'Borough':'first', 
                             'Neighborhood': ', '.join}).reset_index()
df.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [64]:
# Assign the borough as its neighborhood if there is no neighborhood assigned
df['Neighborhood'].loc[df['Neighborhood'] == 'Not assigned'] = df['Borough']
df[df['Neighborhood'] == 'Not assigned']


Unnamed: 0,PostalCode,Borough,Neighborhood


In [65]:
# print the number of rows of the dataframe
df.shape

(103, 3)

### 4) Get the latitude and longitude information.

In [66]:
geo = pd.read_csv('Geospatial_Coordinates.csv')
geo.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [67]:
geo = geo.rename(columns = {'Postal Code': 'PostalCode'})
geo.head(5)

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [68]:
data = pd.merge(df, geo, how = 'left', on = ['PostalCode', 'PostalCode'])
data.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### 5) Explore and cluster the neighborhoods in Toronto.

#### Use geopy library to get the latitude and longitude values of Toronto City.

In [74]:
address = 'Toronto, TO'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.6523873, -79.3835641.


#### Create map of Toronto City using latitude and longitude values.

In [76]:
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(data['Latitude'], data['Longitude'], data['Borough'], data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  
    
map_Toronto#### Use geopy library to get the latitude and longitude values of Toronto City.

#### Create a new dataframe of the Central Toronto data

In [77]:
Toronto_data = data[data['Borough'].str.contains('Central Toronto')]
Toronto_data

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
47,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
49,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049
63,M5N,Central Toronto,Roselawn,43.711695,-79.416936
64,M5P,Central Toronto,"Forest Hill North, Forest Hill West",43.696948,-79.411307
65,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",43.67271,-79.405678


#### Get the geographical coordinates of Central Toronto.

In [78]:
address = 'Central Toronto, TO'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Central Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Central Toronto are 43.653963, -79.387207.


#### Visualizat Central Toronto and the neighborhoods in it.

In [81]:
# create map of Manhattan using latitude and longitude values
map_CentralToronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(Toronto_data['Latitude'], Toronto_data['Longitude'], Toronto_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_CentralToronto)  
    
map_CentralToronto 