In [9]:
# Install all needed packages from the conda-forge channel.
# `--yes` answers conda's confirmation prompt so the cell runs unattended.
# Only folium is pinned to a version (0.5.0); the other packages install
# whatever conda resolves at run time.
# Commented-out pip commands for two of the same packages (presumably an
# alternative for environments without conda):
#pip install Beautifulsoup4
#pip install geocoder
!conda install -c conda-forge geopy --yes 
!conda install -c conda-forge geocoder --yes
!conda install -c conda-forge Beautifulsoup4 --yes
!conda install -c conda-forge folium=0.5.0 --yes 

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2019.6.16  |       hecc5488_0         145 KB  conda-forge
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    certifi-2019.6.16          |           py36_1         149 KB  conda-forge
    openssl-1.1.1c             |       h516909a_0         2.1 MB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.49-py_0         conda-forge
    geopy:           1.20.0-py_0       conda-forge

The following packages will be UPDATED:

    ca-

In [10]:
#import libraries
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd  
import geocoder # import geocoder
from geopy.geocoders import Nominatim 
import folium # map rendering library

## PART 1 - Web Scraping and DataFrame creation


In [12]:
# Download the Wikipedia page listing the "M" (Toronto) postal codes and parse it.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
website_url = response.text  # despite the name, this holds the page HTML, not a URL
soup = BeautifulSoup(website_url,'html.parser')

In [13]:
# Scrape the HTML to find the values in the postal-code table
My_table = soup.find('table',{'class':'wikitable sortable'})
if My_table is None:
    # soup.find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the next line.
    raise ValueError("No table with class 'wikitable sortable' was found on the page")
columns = My_table.find_all('td')

# Text of every <td> cell, in document order
col_new = [cell.text for cell in columns]

# Three cells per table row (postal code, borough, neighborhood). NumPy infers
# the row count (-1) instead of relying on a hard-coded number, so the cell
# keeps working when rows are added to or removed from the page.
col_new = np.array(col_new).reshape(-1, 3)

In [14]:
# Build the DataFrame from the scraped cells, then remove the newline
# characters from the Neighborhood text.
df = pd.DataFrame(data=col_new, columns=['PostalCode', 'Borough', 'Neighborhood'])
df = df.assign(
    Neighborhood=df['Neighborhood'].replace(to_replace='\n', value='', regex=True)
)

In [15]:
# Ignore cells with a borough that is not assigned.
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the previous frame (SettingWithCopyWarning).
df = df[df['Borough'] != 'Not assigned'].copy()

# If a cell has a borough but a not assigned neighborhood, the neighborhood
# becomes the borough. mask() replaces values where the condition is True.
df['Neighborhood'] = df['Neighborhood'].mask(
    df['Neighborhood'] == 'Not assigned', df['Borough']
)

In [16]:
# Put all neighborhoods that share a postal code into one comma-separated cell.
# Series.unique() drops duplicates while keeping the order of first appearance;
# joining a set() instead made the order of the names vary from run to run.
df = (
    df.groupby(['PostalCode', 'Borough'])['Neighborhood']
      .agg(lambda names: ', '.join(names.unique()))
      .reset_index()
)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek"
2,M1E,Scarborough,"West Hill, Morningside, Guildwood"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## PART 2 - Getting the latitude and longitude coordinates using the Geocoder package

In [18]:
def get_geocoder(postal_code_from_df, max_attempts=10):
    """Return the (latitude, longitude) of a Toronto postal code via ArcGIS.

    The lookup is retried while the geocoder answers with no coordinates
    (``latlng`` is ``None``).

    Parameters
    ----------
    postal_code_from_df : str
        Postal code to look up; surrounding whitespace is stripped.
    max_attempts : int, optional
        Maximum number of lookups before giving up (default 10). Bounding the
        retries prevents an endless loop when a code cannot be resolved.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` taken from the geocoder's ``latlng`` result.

    Raises
    ------
    RuntimeError
        If no coordinates were obtained after ``max_attempts`` lookups.
    """
    query = '{}, Toronto, Ontario'.format(postal_code_from_df.strip())
    for _ in range(max_attempts):
        lat_lng_coords = geocoder.arcgis(query).latlng
        # Index the result only after checking it: indexing a None result
        # would raise TypeError before the loop gets a chance to retry.
        if lat_lng_coords is not None:
            latitude = lat_lng_coords[0]
            longitude = lat_lng_coords[1]
            return latitude, longitude
    raise RuntimeError(
        'No coordinates found for {!r} after {} attempts'.format(query, max_attempts)
    )

# Geocode every postal code, then split the resulting (latitude, longitude)
# pairs into the two new columns.
coordinate_pairs = df['PostalCode'].apply(get_geocoder)
df['Latitude'], df['Longitude'] = zip(*coordinate_pairs)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.811525,-79.195517
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek",43.78573,-79.15875
2,M1E,Scarborough,"West Hill, Morningside, Guildwood",43.76569,-79.175256
3,M1G,Scarborough,Woburn,43.768359,-79.21759
4,M1H,Scarborough,Cedarbrae,43.769688,-79.23944


## PART 3 - Exploring and clustering the neighborhoods in Toronto 

In [19]:
# Look up the coordinates of Toronto; they are used to centre the map
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_ontario")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on `.latitude`.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto, Ontario are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto, Ontario are 43.653963, -79.387207.


In [20]:
# Show the map of Toronto with one circle marker per postal code

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

for lat, lng, post, borough, neigh in zip(df['Latitude'], df['Longitude'], df['PostalCode'], df['Borough'], df['Neighborhood']):
    # Three placeholders for three values: a two-placeholder template would
    # silently drop the neighborhood from the popup text.
    label = '{}, {}, {}'.format(borough, post, neigh)
    # parse_html=True makes folium treat the label as plain text to be escaped
    popup = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto