### Segmenting and Clustering Neighborhoods in Toronto (Table Manipulation)

In [1]:
#The purpose of this project is to cluster, visualize, and analyze neighborhoods in Toronto,CA

import pandas as pd #load library for data analysis
import numpy as np #load library for vector data management
import requests #load http library to send requests with python
import matplotlib.pyplot as plt #load a 2D plotting library
import matplotlib.cm as cm #load for built in color maps
import matplotlib.colors as colors #load for color plotting
import time #load time library to delay geocoder requests

%matplotlib inline #set backend of matplotlib to inline backend

!conda install -c conda beautifulsoup4 --yes #install the beautifulsoup package for webscraping
from bs4 import BeautifulSoup as bs

!conda install -c conda lxml --yes #install to handle html files

!conda install -c conda-forge geopy --yes #install geocoder for location data
from geopy.geocoders import Nominatim as nm

!conda install -c conda-forge geopandas --yes #install to allow spatial operations on geometric types

!conda install -c conda-forge folium=0.5.0 --yes #install for map rendering
import folium 

from IPython.display import Image 
from IPython.core.display import HTML

from pandas.io.json import json_normalize #load to convert json file to pandas dataframe

from sklearn.cluster import KMeans #load for clustering analysis
from sklearn.datasets.samples_generator import make_blobs #generate gaussian blobs for clustering

print("Packages installed.")

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - beautifulsoup4


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.0.2t             |       h7b6447c_1         3.1 MB
    certifi-2019.11.28         |           py36_0         156 KB
    ------------------------------------------------------------
                                           Total:         3.3 MB

The following packages will be UPDATED:

    certifi: 2019.11.28-py36_0 conda-forge --> 2019.11.28-py36_0
    openssl: 1.0.2t-h14c3975_0 conda-forge --> 1.0.2t-h7b6447c_1


Downloading and Extracting Packages
openssl-1.0.2t       | 3.1 MB    | ##################################### | 100% 
certifi-2019.11.

In [2]:
#Assign the website to a variable and request it
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
#Use a timeout so the cell cannot hang indefinitely on a stalled connection
rq = requests.get(url, timeout=30)
#Stop here on an HTTP error (4xx/5xx) instead of handing an error page to the parser later
rq.raise_for_status()

In [3]:
#Use beautifulsoup as bs to webscrape the table
#Name the parser explicitly: the generic 'html' feature makes Beautiful Soup pick whichever
#HTML parser happens to be installed, so the parse tree could differ between environments.
#lxml is the parser this notebook installs for handling html.
soup = bs(rq.content, 'lxml')
table = soup.find_all('table')[0] #the first table on the page holds the postal codes
df = pd.read_html(str(table)) #read_html returns a list of dataframes, one per table found
dfa = df[0]
dfa.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Queen's Park,Not assigned
8,M8A,Not assigned,Not assigned
9,M9A,Downtown Toronto,Queen's Park


In [4]:
#Keep only the rows whose borough has been assigned, then renumber the index from 0
has_borough = dfa['Borough'].ne('Not assigned')
dfb = dfa.loc[has_borough].reset_index(drop=True)
dfb.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Queen's Park,Not assigned
6,M9A,Downtown Toronto,Queen's Park
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


In [5]:
#Assign the borough to the cells without a neighborhood
#Build dfc as a new frame rather than wrapping dfb with pd.DataFrame(dfb): that constructor
#does not copy DataFrame input by default, so assigning a column on the wrapper can also
#change dfb and make the output of the previous cell stale.
neighbourhood_known = dfb['Neighbourhood'] != 'Not assigned'
dfc = dfb.assign(
    #keep the neighbourhood where it is known, otherwise fall back to the borough name
    Neighbourhood=dfb['Neighbourhood'].where(neighbourhood_known, dfb['Borough'])
)
dfc.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Queen's Park,Queen's Park
6,M9A,Downtown Toronto,Queen's Park
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


In [6]:
#Collapse the rows sharing a postcode/borough pair into a single row whose
#Neighbourhood value is the comma-separated list of that pair's neighbourhoods
grouped_neighbourhoods = dfc.groupby(['Postcode', 'Borough'])['Neighbourhood']
dfd = grouped_neighbourhoods.apply(', '.join)
#Turn the grouped series back into a flat table with Postcode and Borough as columns
dfe = dfd.to_frame().reset_index()
dfe.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [11]:
#Use geocoding to acquire location data for all of the boroughs
locator = nm(user_agent='Toronto_Geocoder')

#Seconds to pause after every request; the public Nominatim service asks clients to stay
#at or below one request per second
BACKOFF_TIME=1

#Geocode each distinct borough once instead of once per row: rows that share a borough
#get the same answer, so repeating the request only adds load and waiting time
borough_coords = {}
for borough in dfe['Borough'].unique():
    location = locator.geocode('{}, Toronto, Ontario'.format(borough))
    if location is None:
        #no match: store a real NaN (not the string 'Nan') so the columns stay numeric
        borough_coords[borough] = (np.nan, np.nan)
    else:
        borough_coords[borough] = (location.latitude, location.longitude)
    #pause after every request, including failed lookups, which count as requests too
    time.sleep(BACKOFF_TIME)

#Expand the per-borough results back to one row per row of dfe, in the same order.
#dflat and dflong remain single-column dataframes, as the following cells expect.
dflat = pd.DataFrame({'Latitude': [borough_coords[b][0] for b in dfe['Borough']]})
dflong = pd.DataFrame({'Longitude': [borough_coords[b][1] for b in dfe['Borough']]})

In [12]:
#Renumber both coordinate frames from 0 so their rows line up with dfe by position
dflat2, dflong2 = (frame.reset_index(drop=True) for frame in (dflat, dflong))

In [13]:
#Place the latitude and longitude columns to the right of the postcode table
frames_side_by_side = [dfe, dflat2, dflong2]
dff = pd.concat(frames_side_by_side, axis=1)
dff

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.773077,-79.257774
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.773077,-79.257774
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.773077,-79.257774
3,M1G,Scarborough,Woburn,43.773077,-79.257774
4,M1H,Scarborough,Cedarbrae,43.773077,-79.257774
...,...,...,...,...,...
98,M9N,York,Weston,43.679105,-79.491184
99,M9P,Etobicoke,Westmount,43.671459,-79.552492
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.671459,-79.552492
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.671459,-79.552492


In [14]:
#Examine the shape of the dataframe as (rows, columns): the columns of dfe plus
#the Latitude and Longitude columns concatenated onto it
dff.shape

(103, 5)