# Peer-graded assignment : Segmentation and Clustering Neighborhoods of Toronto

In [1]:
import numpy as np
import pandas as pd

## Step 1 :
Get the data from Wikipedia: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [18]:
# Source page: Wikipedia's list of Canadian postal codes beginning with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html hands back a list holding one DataFrame per table it parsed.
df_raw = pd.read_html(url)

print(f'There are {len(df_raw)} table(s) on this pages')

There are 3 table(s) on this pages


## Step 2:
We only need the first table. 

In [19]:
# Keep only the first element of the list of parsed tables.
df_toronto = df_raw[0]
df_toronto.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Step 3 : Clean Data

### Remove the rows whose Borough is 'Not assigned'
### Replace NaN Neighborhoods with the Borough value.

In [34]:
#### Function to replace NaN or 'Not assigned' Neighborhoods with the Borough value

def normalize_Neighborhood(row):
    """Return the neighborhood of ``row``, falling back to its borough.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series from ``DataFrame.apply(axis=1)``)
        Must provide the keys ``'Neighborhood'`` and ``'Borough'``.

    Returns
    -------
    The value of ``row['Borough']`` when the neighborhood is missing
    (NaN/None) or is the text "Not assigned" in any letter case; otherwise
    the original ``row['Neighborhood']`` value.
    """
    neighborhood = row['Neighborhood']

    if isinstance(neighborhood, str):
        # Compare case-insensitively: the sentinel may be written as
        # 'Not assigned' or 'Not Assigned'.
        is_missing = neighborhood.strip().lower() == 'not assigned'
    else:
        # NaN never compares equal to itself, so `== np.nan` cannot detect it;
        # pd.isna handles NaN and None.
        is_missing = pd.isna(neighborhood)

    if is_missing:
        print('Replacing {} by {}'.format(neighborhood, row['Borough']))
        return row['Borough']
    return neighborhood


Remove the 'Not assigned' rows and reset the index.

In [35]:
# Report the row count before and after dropping the rows whose Borough is
# 'Not assigned'; the index is rebuilt so it stays contiguous after filtering.
# The counts are passed through str.format so the '{}' placeholder is actually
# filled in instead of being printed literally.
print('Number of rows before Not Assigned cleaning {}'.format(len(df_toronto)))
df_toronto = df_toronto[df_toronto['Borough'] != 'Not assigned'].reset_index(drop=True)
print('Number of rows after Not Assigned cleaning {}'.format(len(df_toronto)))

Number of rows before Not Assigned cleaning {} 103
Number of rows after N ot Assigned cleaning {} 103


Replace 'Not assigned' neighborhoods with the Borough value.

In [36]:
# Run normalize_Neighborhood once per row and store the result back in the
# 'Neighborhood' column.
df_toronto['Neighborhood'] = df_toronto.apply(
    normalize_Neighborhood,
    axis='columns',
)
df_toronto.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [39]:
# Show the (rows, columns) shape of the cleaned dataframe.
print(f'thez dimensions of the dataframe is {df_toronto.shape}.')

thez dimensions of the dataframe is (103, 3).


## Step 4 : Get coordinates from http://cocl.us/Geospatial_data (CSV file) and merge them with the df_toronto dataframe

Save the data locally.

In [41]:
# Fetch the coordinates CSV with wget and store it as 'Geospatial_data.csv'
# (-q suppresses wget's progress output, -O sets the output file name).
!wget -q -O 'Geospatial_data.csv' http://cocl.us/Geospatial_data
# NOTE(review): this message is printed unconditionally, even if wget failed.
print('Data downloaded!')

Data downloaded!


Load the data from the file into a dataframe.

In [46]:
# Read the downloaded coordinates file into its own dataframe.
df_geo = pd.read_csv(filepath_or_buffer='Geospatial_data.csv')

df_geo.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merge df_toronto with df_geo

In [49]:
# Attach the coordinate columns to each row by matching on 'Postal Code'.
# how='inner' is pd.merge's default, spelled out here for clarity.
df_toronto = pd.merge(
    left=df_toronto,
    right=df_geo,
    how='inner',
    on='Postal Code',
)
df_toronto.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [55]:
# Summarise the merged data: number of distinct boroughs and number of rows,
# then display the distinct borough names.
print(
    f"The Totronto dataframe has {len(df_toronto['Borough'].unique())} boroughs "
    f"and {len(df_toronto)} neighborhoods."
)
df_toronto['Borough'].unique()

The Totronto dataframe has 10 boroughs and 103 neighborhoods.


array(['North York', 'Downtown Toronto', 'Etobicoke', 'Scarborough',
       'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

# Now the data is ready, let's play.

## Show Boroughs on a map.

### Import libraries

In [50]:
try:
    from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
except errorValue:
    print('geopy not installed. Installing now. May take a while')
    !conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
    from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

try:
    import folium # map rendering library
except:
    print('Folium not installed. Installing now. This can take a while')
    !conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
    import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [52]:
# Look up the latitude/longitude of Toronto; these coordinates are used as the
# centre of the map.
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
if location is None:
    # Fail with a clear message rather than an AttributeError on None below.
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
# The message names the place that was actually geocoded (Toronto).
print('The geographical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 43.6534817, -79.3839347.


In [54]:
# Build a map centred on the geocoded Toronto coordinates and draw one circle
# marker per dataframe row, labelled "<neighborhood>, <borough>".
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Marker colours are given as literals: no `color` variable is defined for
# this cell, so referencing one raised a NameError on the first marker.
marker_color = 'blue'
marker_fill_color = '#3186cc'

# add markers to map
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)

    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_fill_color,
        fill_opacity=0.3,
        parse_html=False).add_to(map_toronto)

map_toronto