# Segmenting and Clustering Neighborhoods in Toronto

### Import the libraries.

In [1]:
from bs4 import BeautifulSoup

import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


### Send the GET request for the website.

In [4]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
raw_html_doc = requests.get(url)

html_doc=raw_html_doc.text

### Use BeautifulSoup to parse the data and get the table of neighborhood postal codes.

In [5]:
tree = BeautifulSoup(html_doc,"lxml")
table_tag = tree.select("table")[0]
tab_data = [[item.text for item in row_data.select("th,td")]
                for row_data in table_tag.select("tr")]

### Create the dataframe.

In [7]:
# Create the dataframe

# Define the dataframe columns
column_names = ['PostalCode','Borough', 'Neighborhood'] 

# Convert tab_data  to dataframe.
neighborhoods_data = pd.DataFrame(tab_data,columns=column_names)

# Remove rows that have NA values.
neighborhoods_data = neighborhoods_data.dropna(axis=0)

# Remove \n from data.
neighborhoods_data['Neighborhood'] = neighborhoods_data['Neighborhood'].replace('\n',' ', regex=True) 

# Remove rows that have NA values.
neighborhoods_data = neighborhoods_data.dropna(axis=0)

neighborhoods_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


### Get the Geospatial data.

In [8]:
df_geo = pd.read_csv('https://cocl.us/Geospatial_data')

df_geo.rename(columns={'Postal Code':'PostalCode'}, inplace=True)

df_geo.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge the Geospatial data into the neighborhoods data.

In [9]:
neighborhoods = pd.merge(neighborhoods_data, df_geo, on='PostalCode', how='left')

# Remove rows that have NA values.
neighborhoods = neighborhoods.dropna(axis=0)

# Remove \n from data.
neighborhoods['Neighborhood'] = neighborhoods['Neighborhood'].replace('\n',' ', regex=True) 
#neighborhoods['Neighborhood'] = neighborhoods['Neighborhood'].str.replace(r'\n', '')

print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighborhoods['Borough'].unique()),
        neighborhoods.shape[0]
    )
)
neighborhoods.head()

The dataframe has 11 boroughs and 211 neighborhoods.


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
3,M3A,North York,Parkwoods,43.753259,-79.329656
4,M4A,North York,Victoria Village,43.725882,-79.315572
5,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
6,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
7,M6A,North York,Lawrence Heights,43.718518,-79.464763
