# Peer-Graded Assignment : Segmenting and Clustering Neighborhoods in Toronto (Part 1 & 2)
- Build a dataframe of the postal code of each neighborhood along with the borough name and neighborhood name in Toronto.
- Get the geographical coordinates of the neighborhoods in Toronto.
*********
###  Import libraries

In [1]:

import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML and XML documents

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print("Libraries imported.")

Libraries imported.


###  Scrape data from the Wikipedia page into a DataFrame

In [2]:
# Send the GET request for the Wikipedia list of Canadian postal codes starting with M.
# The timeout keeps the cell from hanging indefinitely if the server never answers,
# and raise_for_status() stops on an HTTP error instead of parsing an error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
data = response.text

In [3]:
# Build a BeautifulSoup tree from the downloaded HTML text with the 'html.parser' backend
soup = BeautifulSoup(markup=data, features='html.parser')

In [4]:
# One independent, initially empty list per table column to be scraped
postalCodeList, boroughList, neighborhoodList = [], [], []

In [5]:
# Walk the rows of the first <table> on the page and collect the text of the
# first three <td> cells of each row into the postal code, borough and
# neighborhood lists.
for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    # Rows with no <td> cells, and rows with fewer than the three cells indexed
    # below, are skipped instead of raising IndexError.
    if len(cells) < 3:
        continue
    postalCodeList.append(cells[0].text)
    boroughList.append(cells[1].text)
    # NOTE: rstrip(' ') removes trailing spaces only; a trailing newline in the
    # cell text is kept here and is handled by the later clean-up cells.
    neighborhoodList.append(cells[2].text.rstrip(' '))

In [6]:
# Assemble the scraped lists into a DataFrame, one column per list
scraped_columns = {
    "PostalCode": postalCodeList,
    "Borough": boroughList,
    "Neighborhood": neighborhoodList,
}
toronto_df = pd.DataFrame(data=scraped_columns)

toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A\n,Not assigned\n,\n
1,M2A\n,Not assigned\n,\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,Regent Park / Harbourfront\n


###  Drop rows with a borough that is "Not assigned"

In [7]:
# Drop rows whose borough is "Not assigned". The scraped cell text still carries
# a trailing newline, so compare the whitespace-stripped value rather than
# hard-coding the exact "Not assigned\n" string.
borough_is_assigned = toronto_df.Borough.str.strip() != "Not assigned"
toronto_df_dropna = toronto_df[borough_is_assigned].reset_index(drop=True)
toronto_df_dropna.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A\n,North York\n,Parkwoods\n
1,M4A\n,North York\n,Victoria Village\n
2,M5A\n,Downtown Toronto\n,Regent Park / Harbourfront\n
3,M6A\n,North York\n,Lawrence Manor / Lawrence Heights\n
4,M7A\n,Downtown Toronto\n,Queen's Park / Ontario Provincial Government\n


In [8]:
# Collapse rows that share a postal code and borough into a single row whose
# remaining column holds the group's values joined with ", "
def _join_with_commas(values):
    return ", ".join(values)


group_keys = ["PostalCode", "Borough"]
rows_by_code = toronto_df_dropna.groupby(group_keys, as_index=False)
toronto_df_grouped = rows_by_code.agg(_join_with_commas)
toronto_df_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B\n,Scarborough\n,Malvern / Rouge\n
1,M1C\n,Scarborough\n,Rouge Hill / Port Union / Highland Creek\n
2,M1E\n,Scarborough\n,Guildwood / Morningside / West Hill\n
3,M1G\n,Scarborough\n,Woburn\n
4,M1H\n,Scarborough\n,Cedarbrae\n


###  Clean up the syntax

In [9]:
# Normalize the scraped text first: turn newlines into spaces, then trim the
# leading/trailing whitespace that replacement leaves behind (e.g. "M1B\n").
toronto_df_grouped = toronto_df_grouped.replace('\n', ' ', regex=True)
for column in ("PostalCode", "Borough", "Neighborhood"):
    toronto_df_grouped[column] = toronto_df_grouped[column].str.strip()

# For Neighborhood == "Not assigned", make the value the same as Borough.
# This is done with a boolean mask on the frame itself: assigning to the row
# objects yielded by iterrows() is not written back to the DataFrame, and the
# comparison has to run after the whitespace clean-up above to be able to match.
not_assigned = toronto_df_grouped["Neighborhood"] == "Not assigned"
toronto_df_grouped.loc[not_assigned, "Neighborhood"] = toronto_df_grouped.loc[not_assigned, "Borough"]

toronto_df_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


###  Print the number of rows of the cleaned dataframe

In [10]:
# (number of rows, number of columns) of the cleaned dataframe
toronto_df_grouped.shape

(103, 3)

### Load the coordinates from the csv file on Coursera

In [12]:
# Read the postal-code coordinates CSV published for the Coursera course
GEOSPATIAL_CSV_URL = "http://cocl.us/Geospatial_data"
coordinates = pd.read_csv(GEOSPATIAL_CSV_URL)
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge two tables to get the coordinates

In [68]:
# Attach the coordinates to each neighborhood row by matching postal codes,
# instead of relying on both tables happening to be in the same row order.
# NOTE(review): the postal-code column is assumed to be the first column of
# `coordinates`, as in the CSV preview — confirm if the CSV layout changes.
postal_codes = toronto_df_grouped["PostalCode"].str.strip()
coordinate_codes = coordinates.iloc[:, 0].str.strip()
aligned_coordinates = coordinates.set_index(coordinate_codes).reindex(postal_codes)
# Reuse the left table's index so the column-wise concat pairs rows one-to-one;
# a postal code without coordinates yields NaN values rather than shifted rows.
aligned_coordinates.index = toronto_df_grouped.index
result = pd.concat([toronto_df_grouped, aligned_coordinates], axis=1)
result.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,PostalCode.1,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,M1B,43.8066863,-79.19435340000001
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,M1C,43.7845351,-79.16049709999999
2,M1E,Scarborough,Guildwood / Morningside / West Hill,M1E,43.7635726,-79.1887115
3,M1G,Scarborough,Woburn,M1G,43.7709921,-79.21691740000001
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.23947609999999
5,M1J,Scarborough,Scarborough Village,M1J,43.7447342,-79.23947609999999
6,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park,M1K,43.7279292,-79.26202940000002
7,M1L,Scarborough,Golden Mile / Clairlea / Oakridge,M1L,43.7111117,-79.2845772
8,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village W...,M1M,43.716316,-79.23947609999999
9,M1N,Scarborough,Birch Cliff / Cliffside West,M1N,43.692657,-79.2648481


## Removing duplicate column

In [69]:
# Keep only the first occurrence of each column label
first_occurrence = ~result.columns.duplicated()
result = result.loc[:, first_occurrence]
result.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.8066863,-79.19435340000001
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.7845351,-79.16049709999999
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.7635726,-79.1887115
3,M1G,Scarborough,Woburn,43.7709921,-79.21691740000001
4,M1H,Scarborough,Cedarbrae,43.773136,-79.23947609999999


In [70]:
# (number of rows, number of columns) of the final dataframe with coordinates
result.shape

(103, 5)