In [1]:
import pandas as pd 
import requests 

In [2]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text 

In [3]:
from bs4 import BeautifulSoup 
soup = BeautifulSoup(source, 'lxml')
#print(soup.prettify())

In [4]:
# Obtain only the table from the html data 
table = soup.find('table', {'class' : 'wikitable sortable'})
#table
                            

In [5]:
# The dataframe will consist of 3 columns, PostCode, Borough and Neighborhood 
df = pd.read_html(str(table), header=0)
df = pd.DataFrame(df[0])

In [6]:
# Only process the cells that have an assigned borough and ignore cells without assigned borough 
df.drop(df[df.Borough == 'Not assigned'].index, inplace = True)

In [7]:
# More than one neighborhoods can exist in one PostCode 
df = df.groupby("Postcode").agg(lambda x:','.join(set(x)))
df.reset_index(inplace = True)

In [8]:
# For a cell with a borough but no assigned neighborhood, the neighborhood will be same as the borough 
df.loc[df.Neighbourhood == 'Not assigned', 'Neighbourhood'] = df.loc[df.Neighbourhood == 'Not assigned', 'Borough']

In [9]:
# In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe 
df.shape

(103, 3)

In [10]:
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern,Rouge"
1,M1C,Scarborough,"Highland Creek,Port Union,Rouge Hill"
2,M1E,Scarborough,"Morningside,West Hill,Guildwood"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Ionview,East Birchmount Park,Kennedy Park"
7,M1L,Scarborough,"Oakridge,Clairlea,Golden Mile"
8,M1M,Scarborough,"Cliffside,Scarborough Village West,Cliffcrest"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


### Get geospatial Data 

In [11]:
import io 

In [12]:
url = 'http://cocl.us/Geospatial_data'
S = requests.get(url).content 
geospatial_data = pd.read_csv(io.StringIO(S.decode('utf-8')))
geospatial_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
#Rename columns so that they match 
df.columns = ['Postalcode', 'Borough', 'Neighbourhood']
geospatial_data.columns = ['Postalcode', 'Latitude', 'Longitude']

In [14]:
# Merge both dataframes 
neighborhood = pd.merge(df, geospatial_data, on = df['Postalcode'], right_index = True, left_index = True)
neighborhood.drop('Postalcode_y', axis = 1, inplace = True)
neighborhood.columns = ['Postalcode', 'Borough', 'Neighbourhood', 'Latitude', 'Longitude']
neighborhood.head(10)

Unnamed: 0,Postalcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Port Union,Rouge Hill",43.784535,-79.160497
2,M1E,Scarborough,"Morningside,West Hill,Guildwood",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Ionview,East Birchmount Park,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Oakridge,Clairlea,Golden Mile",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside,Scarborough Village West,Cliffcrest",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


In [15]:
# Number of uniqueBoroughs in Toronto 
print(len(neighborhood['Borough'].unique()))
print(neighborhood['Borough'].unique())

11
['Scarborough' 'North York' 'East York' 'East Toronto' 'Central Toronto'
 'Downtown Toronto' 'York' 'West Toronto' "Queen's Park" 'Mississauga'
 'Etobicoke']


In [16]:
# Create a map of Toronto 
! conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim 
import matplotlib.cm as cm 
import matplotlib.colors as colors 
from sklearn.cluster import KMeans 
! conda install -c conda-forge folium=0.5.0 --yes 
import folium 
print('Libraries imported')

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0   conda-forge
    geopy:         1.18.1-py_0 conda-forge

geographiclib- 100% |################################| Time: 0:00:00  24.93 MB/s
geopy-1.18.1-p 100% |################################| Time: 0:00:00  35.43 MB/s
Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  45.92 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  34.71 MB/s
vincent-0.4.4- 100% |###################

In [17]:
neighborhood.columns

Index(['Postalcode', 'Borough', 'Neighbourhood', 'Latitude', 'Longitude'], dtype='object')

In [18]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="ny_explore")
location = geolocator.geocode(address)
latitude = location.latitude 
longitude = location.longitude 
print('The geographical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geographical coordinates of Toronto are 43.653963, -79.387207.


In [19]:
# Create the map of Toronto using the coordinate values 
map_toronto = folium.Map(location = [latitude, longitude], zoom_start=10)

# add markers to map 
for lat, lng, borough, neighborhoods in zip(neighborhood['Latitude'], neighborhood['Longitude'], 
                                           neighborhood['Borough'], neighborhood['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker(
    [lat,lng],
    raduis=5,
    popup=label,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False).add_to(map_toronto)
    
    map_toronto 

In [20]:
neighborhood.groupby("Borough").count()

Unnamed: 0_level_0,Postalcode,Neighbourhood,Latitude,Longitude
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Central Toronto,9,9,9,9
Downtown Toronto,18,18,18,18
East Toronto,5,5,5,5
East York,5,5,5,5
Etobicoke,12,12,12,12
Mississauga,1,1,1,1
North York,24,24,24,24
Queen's Park,1,1,1,1
Scarborough,17,17,17,17
West Toronto,6,6,6,6


In [21]:
# Since North York has the maximum number of neighborhoods, let's segment and cluster it's neighborhoods 
northyork_data = neighborhood[neighborhood['Borough']=='North York'].reset_index(drop=True)
northyork_data.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood,Latitude,Longitude
0,M2H,North York,Hillcrest Village,43.803762,-79.363452
1,M2J,North York,"Oriole,Henry Farm,Fairview",43.778517,-79.346556
2,M2K,North York,Bayview Village,43.786947,-79.385975
3,M2L,North York,"York Mills,Silver Hills",43.75749,-79.374714
4,M2M,North York,"Newtonbrook,Willowdale",43.789053,-79.408493


In [22]:
# let's plot North York map 
address = 'North York, ON'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude 
longitude = location.longitude 
print('The geographical coordinates of York are {}, {}.'.format(latitude, longitude))

The geographical coordinates of York are 43.7709163, -79.4124102.


In [23]:
# create the map of North York using the coordinate values 
map_northyork = folium.Map(location=[latitude, longitude], zoom_start = 10)

# add markers to map 
for lat, lng, borough, neighborhoods in zip(northyork_data['Latitude'], northyork_data['Longitude'],
                                           northyork_data['Borough'], northyork_data['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker(
    [lat,lng],
    radius=5,
    popup=label,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False).add_to(map_northyork)
    
    map_northyork 

In [24]:
# Foursquare credentials and version 
CLIENT_ID = '31DMD01K2VSDHMZ2GWQ5OPWTSFXIRE105JFDN2IW2NEWNISZ'
CLIENT_SECRET = 'XR4MJA0GY25GFN0O1MG22O1MFYLUVQBRGS4NAAUDKXPNPHRU'
VERSION = '20190131'
LIMIT = 100 
radius = 500 
print('Your credentials:')
print('CLIENT_ID:'+ CLIENT_ID)
print('CLIENT_SECRET:'+ CLIENT_SECRET)

Your credentials:
CLIENT_ID:31DMD01K2VSDHMZ2GWQ5OPWTSFXIRE105JFDN2IW2NEWNISZ
CLIENT_SECRET:XR4MJA0GY25GFN0O1MG22O1MFYLUVQBRGS4NAAUDKXPNPHRU


In [32]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        
        # Create the API request URL 
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        lng,
        radius,
        LIMIT)
        
        # Make the GET request 
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
        name,
        lat,
        lng,
        v['venue']['name'],
        v['venue']['location']['lat'],
        v['venue']['location']['lng'],
        v['venue']['categories'][0]['name']) for v in results])
        
        nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
        nearby_venues.columns = ['Neighborhood',
                                'Neighborhood Latitude',
                                'Neighborhood Longitude',
                                'Venue',
                                'Venue Latitude',
                                'Venue Longitude',
                                'Venue Category']
        return(nearby_venues)

In [33]:
northyork_data.columns

Index(['Postalcode', 'Borough', 'Neighbourhood', 'Latitude', 'Longitude'], dtype='object')

In [34]:
northyork_venues = getNearbyVenues(names=northyork_data['Neighbourhood'],
                                  latitudes=northyork_data['Latitude'],
                                  longitudes=northyork_data['Longitude']
                                  )

Hillcrest Village


In [57]:
northyork_venues.shape 

(4, 7)

In [58]:
northyork_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Hillcrest Village,43.803762,-79.363452,Eagle's Nest Golf Club,43.805455,-79.364186,Golf Course
1,Hillcrest Village,43.803762,-79.363452,AY Jackson Pool,43.804515,-79.366138,Pool
2,Hillcrest Village,43.803762,-79.363452,Villa Madina,43.801685,-79.363938,Mediterranean Restaurant
3,Hillcrest Village,43.803762,-79.363452,Duncan Creek Park,43.805539,-79.360695,Dog Run


In [28]:
# Let's know how many unique venue category 
print('There are  {} unique categories.'.format(len(northyork_venues['Venue Category'].unique())))

There are  4 unique categories.


In [30]:
# Let's cluster based on Neighborhoods 

# one hot encoding 
northyork_onehot = pd.get_dummies(northyork_venues[['Venue Category']], prefix = "", prefix_sep ="")

# add Neighborhood column back to dataframe 
northyork_onehot['Neighborhood'] = northyork_venues['Neighborhood']

# Move neighborhood column to the first column 
fixed_columns = [northyork_onehot.columns[-1]]+list(northyork_onehot.columns[:-1])
northyork_onehot = northyork_onehot[fixed_columns]

northyork_grouped = northyork_onehot.groupby('Neighborhood').mean().reset_index()

In [31]:
northyork_grouped.head()

Unnamed: 0,Neighborhood,Dog Run,Golf Course,Mediterranean Restaurant,Pool
0,Hillcrest Village,0.25,0.25,0.25,0.25
