In this notebook we will be scraping data from a webpage and perform segmentation of neighborhoods on folium maps

### Ask 1:

In [1]:
# Libraries

import requests
from bs4 import BeautifulSoup

In [2]:
import urllib.request

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
req = urllib.request.urlopen(url)
article = req.read().decode()

#with open('List_of_postal_codes_of_Canada:_M.html', 'w') as fo:
 #   fo.write(article)
    
from bs4 import BeautifulSoup

# Load article, turn into soup and get the <table>s.
#article = open('ISO_3166-1_alpha-2.html').read()
soup = BeautifulSoup(article, 'html.parser')
tables = soup.find_all('table', class_='sortable')

# Search through the tables for the one with the headings we want.
for table in tables:
    ths = table.find_all('th')
    headings = [th.text.strip() for th in ths]
    if headings[:3] == ['Postcode', 'Borough', 'Neighbourhood']:
        break    

In [3]:
import pandas as pd
# define the dataframe columns
column_names = ['Postcode', 'Borough', 'Neighborhood'] 

rowsList = []
for tr in table.find_all('tr'):
        tds = tr.find_all('td')
        if not tds:
            continue
        Postcode, Borough, Neighbourhood = [td.text.strip() for td in tds[:4]]
        rowsList.append([Postcode, Borough, Neighbourhood])

In [4]:
# instantiate the dataframe
neighborhoods = pd.DataFrame(rowsList, columns=column_names)

Here, we perform data cleaning and data wrangling as per instructions given

In [5]:
neighborhoods = neighborhoods[neighborhoods.Borough != "Not assigned"]

In [6]:
neighborhoods.loc[neighborhoods.Neighborhood == 'Not assigned', 'Neighborhood'] = neighborhoods.Borough

In [7]:
neighborhoods.head(15)

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [8]:
neighborhoods['Neighborhood'] = neighborhoods[['Postcode','Neighborhood','Borough']].groupby(['Postcode','Borough'])['Neighborhood'].transform(lambda x: ','.join(x))
neighborhoods = neighborhoods[['Postcode','Neighborhood','Borough']].drop_duplicates()

Coupling rows neighborhoods with common zipcode, as per instructions given

In [9]:
neighborhoods.head()

Unnamed: 0,Postcode,Neighborhood,Borough
2,M3A,Parkwoods,North York
3,M4A,Victoria Village,North York
4,M5A,"Harbourfront,Regent Park",Downtown Toronto
6,M6A,"Lawrence Heights,Lawrence Manor",North York
8,M7A,Queen's Park,Queen's Park


In [10]:
neighborhoods.shape

(103, 3)

In [None]:
# This is end of the ask 1

# This is beginning of ask 2

In [11]:
import pandas as pd
LatLongFile = pd.read_csv('C:\\Users\\saich\\Downloads\\Geospatial_Coordinates.csv')
LatLongFile.rename(columns = {"Postal Code":"Postcode"}, inplace=True)
LatLongFile.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Above file provides lat and long locations to our dataset

In [76]:
#GeoFull = neighborhoods.join(LatLongFile, on="PostCode")
GeoFull = pd.merge(neighborhoods, LatLongFile, on='Postcode', how='outer')
GeoFull.head()

Unnamed: 0,Postcode,Neighborhood,Borough,Latitude,Longitude
0,M3A,Parkwoods,North York,43.753259,-79.329656
1,M4A,Victoria Village,North York,43.725882,-79.315572
2,M5A,"Harbourfront,Regent Park",Downtown Toronto,43.65426,-79.360636
3,M6A,"Lawrence Heights,Lawrence Manor",North York,43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [78]:
GeoFull.shape

(103, 5)

# This is end of ask 2

# This is beginning of Ask 3

In [23]:
# Renaming dataset
TorontoData = GeoFull.drop(['Postcode'], axis = 1)
del GeoFull

In [None]:
# Extracting venue information from foursquare API

In [29]:
CLIENT_ID = 'MHPVEPYUQ2KBD3OJTU2SF1JWFQ2NQKYDE1JLJ1DXZ3VVKXDV' # your Foursquare ID
CLIENT_SECRET = '1FGACI0MBBDVVDQ1QBAEB1QYVSSXGYPOSKY2CU0ISSOIRGLW' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
# type your answer here
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: MHPVEPYUQ2KBD3OJTU2SF1JWFQ2NQKYDE1JLJ1DXZ3VVKXDV
CLIENT_SECRET:1FGACI0MBBDVVDQ1QBAEB1QYVSSXGYPOSKY2CU0ISSOIRGLW


In [30]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [31]:
# Function that extracts venue names, venue categories, lat & long locations for each neighborhood
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [32]:
Toronto_Venues = getNearbyVenues(names=TorontoData['Neighborhood'],
                                   latitudes=TorontoData['Latitude'],
                                   longitudes=TorontoData['Longitude']
                                  )

Parkwoods
Victoria Village
Harbourfront,Regent Park
Lawrence Heights,Lawrence Manor
Queen's Park
Islington Avenue
Rouge,Malvern
Don Mills North
Woodbine Gardens,Parkview Hill
Ryerson,Garden District
Glencairn
Cloverdale,Islington,Martin Grove,Princess Gardens,West Deane Park
Highland Creek,Rouge Hill,Port Union
Flemingdon Park,Don Mills South
Woodbine Heights
St. James Town
Humewood-Cedarvale
Bloordale Gardens,Eringate,Markland Wood,Old Burnhamthorpe
Guildwood,Morningside,West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor,Downsview North,Wilson Heights
Thorncliffe Park
Adelaide,King,Richmond
Dovercourt Village,Dufferin
Scarborough Village
Fairview,Henry Farm,Oriole
Northwood Park,York University
East Toronto
Harbourfront East,Toronto Islands,Union Station
Little Portugal,Trinity
East Birchmount Park,Ionview,Kennedy Park
Bayview Village
CFB Toronto,Downsview East
The Danforth West,Riverdale
Design E

In [33]:
Toronto_Venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
4,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


In [34]:
Toronto_onehot = pd.get_dummies(Toronto_Venues[['Venue Category']], prefix="", prefix_sep="")
Toronto_onehot['Neighborhood'] = Toronto_Venues['Neighborhood'] 

fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

In [35]:
toronto_grouped = Toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,"Adelaide,King,Richmond",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.020000,0.000000,0.000000,0.000000,0.000000,0.010000,0.000000,0.0
1,Agincourt,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
2,"Agincourt North,L'Amoreaux East,Milliken,Steel...",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
3,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
4,"Alderwood,Long Branch",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
5,"Bathurst Manor,Downsview North,Wilson Heights",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.052632,0.000000,0.000000,0.000000,0.000000,0.0
6,Bayview Village,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
7,"Bedford Park,Lawrence Manor East",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
8,Berczy Park,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.017857,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
9,"Birch Cliff,Cliffside West",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0


In [None]:
# Steps to extract most venues for the neighborhood

In [39]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [99]:
import numpy as np
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Coffee Shop,Café,Steakhouse,Bar,Hotel,Restaurant,Cosmetics Shop,Burger Joint,American Restaurant,Thai Restaurant
1,Agincourt,Lounge,Skating Rink,Sandwich Place,Breakfast Spot,Chinese Restaurant,Electronics Store,Eastern European Restaurant,Empanada Restaurant,Dumpling Restaurant,Dessert Shop
2,"Agincourt North,L'Amoreaux East,Milliken,Steel...",Park,Coffee Shop,Playground,Donut Shop,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant
3,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",Grocery Store,Fried Chicken Joint,Beer Store,Fast Food Restaurant,Sandwich Place,Liquor Store,Coffee Shop,Pizza Place,Pharmacy,Comfort Food Restaurant
4,"Alderwood,Long Branch",Pizza Place,Gym,Athletics & Sports,Pub,Sandwich Place,Pool,Dance Studio,Pharmacy,Skating Rink,Coffee Shop


In [100]:
# Performing clustering on toronto dataset

In [101]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 3, 0, 3, 3, 3, 3, 3, 3, 3])

In [102]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = TorontoData

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Neighborhood,Borough,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Parkwoods,North York,43.753259,-79.329656,2.0,Park,Food & Drink Shop,Women's Store,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Drugstore
1,Victoria Village,North York,43.725882,-79.315572,3.0,French Restaurant,Coffee Shop,Pizza Place,Hockey Arena,Portuguese Restaurant,Doner Restaurant,Department Store,Dessert Shop,Dim Sum Restaurant,Diner
2,"Harbourfront,Regent Park",Downtown Toronto,43.65426,-79.360636,3.0,Coffee Shop,Park,Bakery,Pub,Café,Mexican Restaurant,Breakfast Spot,Restaurant,Theater,Gym / Fitness Center
3,"Lawrence Heights,Lawrence Manor",North York,43.718518,-79.464763,3.0,Clothing Store,Furniture / Home Store,Boutique,Accessories Store,Miscellaneous Shop,Vietnamese Restaurant,Shoe Store,Coffee Shop,Athletics & Sports,Fraternity House
4,Queen's Park,Queen's Park,43.662301,-79.389494,3.0,Coffee Shop,Park,Gym,Diner,Seafood Restaurant,Sandwich Place,Burger Joint,Burrito Place,Café,Portuguese Restaurant


In [103]:
from geopy.geocoders import Nominatim 
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [104]:
import folium

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

In [105]:
toronto_merged.head()

Unnamed: 0,Neighborhood,Borough,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Parkwoods,North York,43.753259,-79.329656,2.0,Park,Food & Drink Shop,Women's Store,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Drugstore
1,Victoria Village,North York,43.725882,-79.315572,3.0,French Restaurant,Coffee Shop,Pizza Place,Hockey Arena,Portuguese Restaurant,Doner Restaurant,Department Store,Dessert Shop,Dim Sum Restaurant,Diner
2,"Harbourfront,Regent Park",Downtown Toronto,43.65426,-79.360636,3.0,Coffee Shop,Park,Bakery,Pub,Café,Mexican Restaurant,Breakfast Spot,Restaurant,Theater,Gym / Fitness Center
3,"Lawrence Heights,Lawrence Manor",North York,43.718518,-79.464763,3.0,Clothing Store,Furniture / Home Store,Boutique,Accessories Store,Miscellaneous Shop,Vietnamese Restaurant,Shoe Store,Coffee Shop,Athletics & Sports,Fraternity House
4,Queen's Park,Queen's Park,43.662301,-79.389494,3.0,Coffee Shop,Park,Gym,Diner,Seafood Restaurant,Sandwich Place,Burger Joint,Burrito Place,Café,Portuguese Restaurant


In [106]:
toronto_merged.dropna(subset=['Cluster Labels'], how='all', inplace = True)
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].apply(np.int64)

In [107]:
# add markers to the map
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].astype(int)

markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [108]:
# Key observations:

# 1. Majority of neighborhoods belong to cluster 3
# 2. Cluster 0,2,4 have more than 1 neighborhoods mapped to it
# 3. Cluster 1 has only 1 neighborhood associated with it

# End of ask 3