## Toronto Neighborhood Segmentation and Clustering 

## Part 1: Retrieving and Cleaning Data

In [1]:
# install necessary packages 
!pip install bs4;
!pip install requests;
!pip install lxml;



In [2]:
# Import the libraries used to retrieve and clean the data
import numpy as np; # library to handle data in a vectorized manner
import pandas as pd; # library for data analysis
import urllib;
from bs4 import BeautifulSoup;
import requests; # library to handle requests
# HTML parser; pandas.read_html can use it as a fallback backend when extracting tables
import html5lib;

### Transforming the Data into a DataFrame

In [3]:
# Read every HTML table carrying the CSS class "wikitable" from the Wikipedia page
page = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

wikitable = pd.read_html(io=page, attrs={'class': 'wikitable'})

# Report how many matching tables were found
print('Extrated %d wikitables' % len(wikitable))

Extrated 1 wikitables


In [5]:
# Keep the first table returned by read_html and preview its first 10 rows
df_to = wikitable[0]
df_to.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Not assigned


In [6]:
# (rows, columns) of the table as scraped, before any cleaning
df_to.shape

(287, 3)

### Data Cleaning and Wrangling

#### Clearing all rows without a "Postcode"

In [7]:
# True for every row whose Postcode is present (not NaN/None)
boolean_mask = df_to['Postcode'].notna()

In [8]:
# Keep only the rows selected by the Postcode mask
df_to = df_to[boolean_mask]

In [9]:
# Preview the first 5 rows after dropping rows without a Postcode
df_to.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### Removing all rows whose Borough is "Not assigned"

In [10]:
# Flag the rows whose Borough is 'Not assigned', then keep every row
# that is NOT flagged and display the remaining dataframe.
not_assigned = df_to['Borough'].eq('Not assigned')
df_to = df_to[~not_assigned]
df_to

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
...,...,...,...
281,M8Z,Etobicoke,Kingsway Park South West
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West


In [11]:
# Preview the first 10 rows that remain after the Borough filter
df_to.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Not assigned
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


In [12]:
# (rows, columns) after removing rows with an unassigned Borough
df_to.shape

(210, 3)

#### Aggregating Neighbourhoods that share the same Postcode

In [13]:
# Collapse the table to one row per Postcode: keep the first Borough seen and
# join all Neighbourhood names of that Postcode into one comma-separated string.
#
# A Neighbourhood of 'Not assigned' is replaced by its Borough *before* the
# join. Once names are concatenated (e.g. 'X, Not assigned'), an exact-match
# replacement on the aggregated column can no longer find the placeholder.
df_to = df_to.copy()  # work on an independent copy, not a view of the filtered frame
unassigned = df_to['Neighbourhood'] == 'Not assigned'
df_to.loc[unassigned, 'Neighbourhood'] = df_to.loc[unassigned, 'Borough']

df_to = (
    df_to.groupby(['Postcode'])
    .agg({'Borough': 'first', 'Neighbourhood': ', '.join})
    .reset_index()
    .reindex(columns=df_to.columns)  # restore the original column order
)

In [14]:
# Preview the first 10 rows of the dataframe after aggregating
df_to.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [15]:
# Check the number of rows and columns in the aggregated dataframe
df_to.shape

(103, 3)

In [16]:
# Rename the "Postcode" column to "Postal Code"
df_to = df_to.rename({'Postcode': 'Postal Code'}, axis='columns')

In [17]:
# (rows, columns) after the column rename
df_to.shape

(103, 3)

In [18]:
# For rows whose Neighbourhood is exactly 'Not assigned', use the Borough name instead
unassigned_rows = df_to['Neighbourhood'].eq('Not assigned')
df_to.loc[unassigned_rows, 'Neighbourhood'] = df_to.loc[unassigned_rows, 'Borough']

# Dump the whole dataframe as plain text for inspection
print(df_to.to_string())

    Postal Code           Borough                                      Neighbourhood
0           M1B       Scarborough                                     Rouge, Malvern
1           M1C       Scarborough             Highland Creek, Rouge Hill, Port Union
2           M1E       Scarborough                  Guildwood, Morningside, West Hill
3           M1G       Scarborough                                             Woburn
4           M1H       Scarborough                                          Cedarbrae
5           M1J       Scarborough                                Scarborough Village
6           M1K       Scarborough        East Birchmount Park, Ionview, Kennedy Park
7           M1L       Scarborough                    Clairlea, Golden Mile, Oakridge
8           M1M       Scarborough    Cliffcrest, Cliffside, Scarborough Village West
9           M1N       Scarborough                        Birch Cliff, Cliffside West
10          M1P       Scarborough  Dorset Park, Scarborough Town 

#### Final DataFrame as requested in the assignment, grouped by Postal Code

In [19]:
# Display the first 12 rows of the dataframe in the format specified in the assignment
df_to.head(12)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [20]:
# (rows, columns) of the final cleaned dataframe
df_to.shape

(103, 3)

## Part 2: Adding Geospatial Coordinates to Toronto Neighbourhoods

In [21]:
# Load the geographic coordinates CSV into a dataframe and preview it

geo_loc = pd.read_csv(filepath_or_buffer='http://cocl.us/Geospatial_data')
geo_loc.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [22]:
# Attach coordinates to each postal code by joining the two dataframes on
# 'Postal Code'. validate='one_to_one' makes pandas raise MergeError if either
# frame repeats a postal code, which would otherwise silently duplicate rows.
df_to_loc = pd.merge(df_to, geo_loc, on='Postal Code', validate='one_to_one')

# The default inner join drops postal codes that are missing from geo_loc;
# stop here instead of continuing with an incomplete dataframe.
assert len(df_to_loc) == len(df_to), 'some postal codes have no coordinates in geo_loc'

# Show the consolidated dataframe
df_to_loc.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [23]:
# Check the shape to confirm the merged dataframe still has the same number
# of rows and that the merge went as planned
df_to_loc.shape

(103, 5)

## Part 3: Exploring and Clustering Toronto Neighbourhoods

In [24]:
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Libraries imported.


###  Getting the geographical coordinates of Toronto

In [25]:
# Look up the latitude/longitude of Toronto through the Nominatim geocoder
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
# geocode() returns None when no match is found; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


### Counting the number of Boroughs and Neighbourhoods

In [27]:
# Report the number of distinct Borough values and the number of rows.
# NOTE(review): the second figure is the row count of df_to_loc, which the
# message labels as "neighbourhoods" — confirm that is the intended meaning.
print(
    "The dataframe has {} boroughs and {} neighbourhoods".format(
        df_to_loc['Borough'].nunique(dropna=False),
        len(df_to_loc.index),
    )
)

The dataframe has 11 boroughs and 103 neighbourhoods


### Create a map of Toronto using latitude and longitude values with neighbourhoods superimposed on top

In [28]:
# Base map centred on the Toronto coordinates
to_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# Add one circle marker per dataframe row, with a "<neighbourhood>, <borough>" popup
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighbourhood']
for lat, lng, borough, neighbourhood in df_to_loc[marker_columns].itertuples(index=False, name=None):
    label = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(to_map)

# Render the map as the cell output
to_map