This project scrapes the postal codes of Canada from the web and then segments and clusters the neighborhoods of Toronto. All steps (web scraping, cleaning, clustering and plotting the results) are contained in this one notebook.

First, install and import all the libraries required for the project.

In [1]:
import pandas as pd
import requests
import folium
!conda install -c conda-forge beautifulsoup4 lxml --yes 
!conda install -c conda-forge geopy --yes
!conda install -c conda-forge geocoder --yes
from bs4 import BeautifulSoup
import json
import numpy as np
import urllib
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import geocoder 

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.



Next, get the list of Canadian postal codes beginning with "M" (Toronto) from the Wikipedia page. The table is stored in a pandas dataframe, and the result is shown below the code.

In [2]:

# Download the Wikipedia page listing the "M" (Toronto) postal codes exactly once.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# turns an HTTP error into an exception instead of parsing an error page.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(wiki_url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'lxml')
data = soup.find('table', {'class': 'wikitable sortable'})
if data is None:
    # soup.find() returns None when no element matches the given class.
    raise ValueError('No "wikitable sortable" table found at {}'.format(wiki_url))

# Parse the HTML table and give its three columns fixed names.
df = pd.read_html(str(data), header=0)[0]
df = pd.DataFrame(np.array(df), columns=["PostalCode", "Borough", "Neighborhood"])

# Discard postal codes that have no borough assigned.
df = df[df.Borough != 'Not assigned']

# A row with a borough but no neighborhood takes the borough name as its
# neighborhood. This is done before grouping so that a 'Not assigned' entry
# can never end up inside a joined neighborhood list.
df = df.assign(Neighborhood=np.where(df['Neighborhood'] == 'Not assigned',
                                     df['Borough'],
                                     df['Neighborhood']))

# One row per postal code: keep the first borough and join all neighborhoods.
df = df.groupby('PostalCode').agg({'Borough': 'first',
                                   'Neighborhood': ', '.join}).reset_index()

df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


We store the address of Toronto in a variable and look up the geographical coordinates of the city.

In [4]:
# Look up the coordinates of Toronto with the Nominatim geocoder.
address = 'Toronto, TO'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Nominatim returned no result for address {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto  are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto  are 43.653963, -79.387207.


To obtain the geographical coordinates of every postal code in the city of Toronto, we read the "Geospatial data" .csv file and merge the latitude and longitude of each postal code into our dataframe.
 


In [5]:
# Read the postal code -> (Latitude, Longitude) lookup table and rename its key
# column so that it matches the 'PostalCode' column of df.
url = 'http://cocl.us/Geospatial_data'
postal_code_data = pd.read_csv(url).rename(columns={'Postal Code': 'PostalCode'})

# Attach the coordinates to each postal code (merge defaults to an inner join,
# so only postal codes present in both tables are kept).
tornto_data = df.merge(postal_code_data, on='PostalCode')
tornto_data

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437


I chose to cluster only the neighborhoods in the borough of Scarborough. After selecting these neighborhoods, we plot them on a map using the folium library.
 


In [11]:
# Keep only the Scarborough rows. The explicit copy makes scarborough_data an
# independent frame, so adding columns to it later cannot touch tornto_data.
scarborough_data = tornto_data.loc[tornto_data['Borough'] == 'Scarborough'].copy()

# Base map centred on the Toronto coordinates obtained from the geocoder.
map_scarborough = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per postal code, with a "<neighborhood>, <borough>" popup.
for lat, lng, borough, neighborhood in zip(scarborough_data['Latitude'],
                                           scarborough_data['Longitude'],
                                           scarborough_data['Borough'],
                                           scarborough_data['Neighborhood']):
    # parse_html belongs to the Popup (it escapes the label text); it is not a
    # CircleMarker option, so it is passed only here.
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_scarborough)

map_scarborough


Now we group the selected neighborhoods into 4 clusters with k-means and add a new column holding the cluster number of each postal code in our dataset.
The results are shown below.
 


In [13]:
# Number of clusters.
k = 4

# Cluster on the coordinates only. Selecting the two feature columns by name
# (rather than dropping the others with a positional axis argument, which
# pandas 2.0 no longer accepts) also keeps a previously inserted
# 'Cluster Labels' column out of the features when the cell is re-run.
toronto_clustering = scarborough_data[['Latitude', 'Longitude']]

# n_init is set explicitly because its default differs between scikit-learn
# versions; random_state fixes the initialisation so results are repeatable.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(toronto_clustering)

# Store the label of each postal code as the first column. Dropping any
# existing label column first makes the cell safe to re-run (insert() raises
# if the column already exists); drop() also returns a new frame, so the
# insert never writes into a slice of another dataframe.
scarborough_data = scarborough_data.drop(columns=['Cluster Labels'], errors='ignore')
scarborough_data.insert(0, 'Cluster Labels', kmeans.labels_)
scarborough_data


Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,1,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,1,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,1,M1G,Scarborough,Woburn,43.770992,-79.216917
4,3,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,2,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,2,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,2,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,2,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


Finally, we plot the clustered neighborhoods on a map, with each cluster shown in a different color.

 


In [14]:
# Base map centred on the Toronto coordinates obtained from the geocoder.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# Colour scheme: k evenly spaced colours from the rainbow colormap, as hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Add one marker per postal code, coloured by its cluster.
for lat, lon, poi, cluster in zip(scarborough_data['Latitude'],
                                  scarborough_data['Longitude'],
                                  scarborough_data['Neighborhood'],
                                  scarborough_data['Cluster Labels']):
    # KMeans labels run from 0 to k-1, so they index the colour list directly
    # (indexing with cluster-1 only worked because -1 wraps to the last entry).
    cluster_color = rainbow[int(cluster)]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters