In [5]:
import pandas as pd 
import numpy as np

## I. Creating a DataFrame with Canadian postal code information
### /!\ The third part, further below, explores and clusters the neighborhoods in Toronto

In [6]:
# Wikipedia page that lists the Canadian postal codes starting with "M".
link = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# header=0 makes the first row of each parsed table the column header.
CodePostalCanada_Html = pd.read_html(io=link, header=0)

# read_html returns a list of DataFrames; the postal code table is the first one.
df_CodePostalCanada = CodePostalCanada_Html[0]

df_CodePostalCanada.columns

Index(['Postcode', 'Borough', 'Neighbourhood'], dtype='object')

In [7]:
# Preview the first rows of the raw postal code table.
df_CodePostalCanada.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


##### The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood. Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned. More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table.

In [8]:
# Drop every row whose Borough is the placeholder 'Not assigned'.
df_CodePostalCanadaB = df_CodePostalCanada.loc[
    df_CodePostalCanada['Borough'].ne('Not assigned')
]

In [9]:
# Collapse to one row per postcode: within each group, the values of every
# remaining column are concatenated with ", " as separator.
df_CodePostalCanadaBGrouped = (
    df_CodePostalCanadaB
    .groupby('Postcode', as_index=False)
    .agg(', '.join)
)
df_CodePostalCanadaBGrouped.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,"Scarborough, Scarborough","Rouge, Malvern"
1,M1C,"Scarborough, Scarborough, Scarborough","Highland Creek, Rouge Hill, Port Union"
2,M1E,"Scarborough, Scarborough, Scarborough","Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In the code below, I remove the duplicated borough names that the join in the previous cell introduced into the Borough column.

In [10]:
# Joining the Borough values per postcode produced repeats such as
# "Scarborough, Scarborough"; keep only the text before the first comma.
# This is done as a vectorised column assignment instead of writing into the
# rows yielded by iterrows(): pandas documents that those rows may be copies,
# in which case the writes would be silently lost.
df_CodePostalCanadaBGrouped['Borough'] = (
    df_CodePostalCanadaBGrouped['Borough'].str.split(',').str[0]
)

In [11]:
# Preview the grouped frame to check the Borough column after the clean-up.
df_CodePostalCanadaBGrouped.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


##### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. So for the 9th cell in the table on the Wikipedia page, the value of the Borough and the Neighborhood columns will be Queen's Park.

In [12]:
# Where the Neighbourhood is exactly 'Not assigned', fall back to the Borough
# name; every other Neighbourhood value is kept as is.
# A column-level where() replaces the iterrows() loop, because writing into
# the rows yielded by iterrows() is not guaranteed to reach the DataFrame.
df_CodePostalCanadaBGrouped['Neighbourhood'] = (
    df_CodePostalCanadaBGrouped['Neighbourhood'].where(
        df_CodePostalCanadaBGrouped['Neighbourhood'] != 'Not assigned',
        df_CodePostalCanadaBGrouped['Borough'],
    )
)

In [13]:
# Preview the grouped frame after the Neighbourhood fallback.
df_CodePostalCanadaBGrouped.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [14]:
# (rows, columns) of the cleaned frame: one row per postcode.
df_CodePostalCanadaBGrouped.shape

(103, 3)

## II. Creating a dataFrame with geospatial data

In [15]:
# Load the latitude/longitude of each postal code from the hosted CSV file.
df_Location = pd.read_csv(filepath_or_buffer="https://cocl.us/Geospatial_data")

In [16]:
# Preview the first rows of the geospatial data.
df_Location.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Rename the first column so that both DataFrames (df_CodePostalCanadaBGrouped and df_Location) share the same key column name.

In [17]:
# Give the key column the same name as in df_CodePostalCanadaBGrouped
# ('Postcode') so the two frames can be joined on it.
df_Location = df_Location.rename(columns={'Postal Code': 'Postcode'})
df_Location.head(5)

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


##### Join the two DataFrames (df_CodePostalCanadaBGrouped and df_Location) on the key column Postcode

In [124]:
# Inner join on 'Postcode': attach Latitude/Longitude to each postcode row
# that is present in both frames.
df_CodePostalCanada_Location = df_CodePostalCanadaBGrouped.merge(
    df_Location,
    how='inner',
    on='Postcode',
)
df_CodePostalCanada_Location.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## III. Exploring and clustering the neighborhoods in Toronto 

Import necessary libraries

In [3]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/DSX-Python35

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    folium-0.5.0               |             py_0          45 KB  conda-forge
    certifi-2018.8.24          |        py35_1001         139 KB  conda-forge
    altair-2.2.2               |           py35_1         462 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    ca-certificates-2019.3.9   |       hecc5488_0         146 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    openssl-1.0.2r             |       h14c3975_0         3.1 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         4.0 MB

The following NEW packages will

Define the column names for the new DataFrame and initialize it with those columns

In [127]:
# Column labels of the Toronto-only frame, in display order.
column_names = [
    'Postcode',
    'Borough',
    'Neighbourhood',
    'Latitude',
    'Longitude',
]

# Start from an empty frame that carries only these column labels.
df_CodePostalCanada_Toronto = pd.DataFrame(columns=column_names)
df_CodePostalCanada_Toronto

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude


Select the rows that contain the word Toronto in the "Borough" column, and store them in the new DataFrame

In [128]:
 # Keep only the rows whose Borough contains the literal text "Toronto"
 # (case-sensitive substring match, regex disabled; missing boroughs never match).
 # A single boolean-mask selection replaces the row-by-row DataFrame.append
 # loop: append copied the whole frame on every call and was removed in
 # pandas 2.0, and filling an empty frame that way turned the Latitude and
 # Longitude columns into object dtype. Here they keep their numeric dtype.
 df_CodePostalCanada_Toronto = (
     df_CodePostalCanada_Location
     .loc[
         df_CodePostalCanada_Location['Borough'].str.contains(
             'Toronto', regex=False, na=False
         ),
         ['Postcode', 'Borough', 'Neighbourhood', 'Latitude', 'Longitude'],
     ]
     # Renumber from 0, as the appended rows were with ignore_index=True.
     .reset_index(drop=True)
 )

In [129]:
# Show up to the first 20 rows of the Toronto-only frame.
df_CodePostalCanada_Toronto.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


In [130]:
# Look up the coordinates of Toronto; they are used to centre the map.
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


###### Creating the map with the neighbourhoods of Toronto.
###### /!\ Only the neighbourhoods whose borough name contains "Toronto" are shown

In [131]:
# create map of Toronto using latitude and longitude values
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per postcode row; the popup text reads
# "postcode, neighbourhood, borough"
marker_columns = ['Postcode', 'Borough', 'Neighbourhood', 'Latitude', 'Longitude']
for postcode, borough, neighbourhood, lat, lng in (
        df_CodePostalCanada_Toronto[marker_columns].itertuples(index=False, name=None)):
    label = folium.Popup(
        '{}, {}, {}'.format(postcode, neighbourhood, borough),
        parse_html=True,
    )
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_Toronto)

map_Toronto