# Importing packages

In [1]:
import os

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Data Creation

In [2]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
source = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,  # do not let the notebook hang forever on a stalled connection
)
source.raise_for_status()  # stop on an HTTP error instead of parsing an error page
bsoup = BeautifulSoup(source.text, 'lxml')

# The data is read from the first element carrying the "wikitable" class.
table = bsoup.find(class_='wikitable')
if table is None:
    raise ValueError("No element with class 'wikitable' was found on the page")

data = []
columns = []
for index, tr in enumerate(table.find_all('tr')):
    # Text of every header/data cell of the row, without trailing whitespace.
    section = [cell.text.rstrip() for cell in tr.find_all(['th', 'td'])]

    # First row of the table is the header
    if index == 0:
        columns = section
    else:
        data.append(section)

canada_pc_df = pd.DataFrame(data=data, columns=columns)
canada_pc_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


# Data cleaning

### Ignoring rows whose borough is "Not assigned"

In [3]:
# Keep only the rows whose borough is assigned. The explicit .copy() turns the
# filtered result into an independent frame, so assigning to its columns later on
# does not act on a slice of the original frame (SettingWithCopyWarning).
canada_pc_df = canada_pc_df[canada_pc_df['Borough'] != 'Not assigned'].copy()
canada_pc_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Combining the neighbourhoods of rows that share the same postal code

In [4]:
# For every row, the comma-separated list of all neighbourhoods sharing its postal code.
combined_neighbourhoods = (
    canada_pc_df
    .groupby("Postal Code")["Neighbourhood"]
    .transform(lambda names: ', '.join(names))
)

# Build a new frame instead of mutating the filtered one in place: rows that became
# identical after the combination are collapsed and the index is renumbered from 0.
canada_pc_df = (
    canada_pc_df
    .assign(Neighbourhood=combined_neighbourhoods)
    .drop_duplicates()
    .reset_index(drop=True)
)
canada_pc_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Replacing "Not assigned" neighbourhoods with the borough name

In [5]:
# Rows whose neighbourhood is the "Not assigned" placeholder take the name of their
# borough. A boolean mask with .loc is used because Series.replace with a scalar
# target and a Series replacement is not a supported combination, and an in-place
# call on a column accessor does not reliably write back to the frame.
unassigned_neighbourhood = canada_pc_df["Neighbourhood"] == "Not assigned"
canada_pc_df.loc[unassigned_neighbourhood, "Neighbourhood"] = canada_pc_df.loc[
    unassigned_neighbourhood, "Borough"
]
canada_pc_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Printing dataframe shape

In [6]:
# Display the (rows, columns) dimensions of the cleaned frame.
canada_pc_df.shape

(103, 3)

# Adding geolocation attributes

### Solution 1: Using Geocoder (this solution is unreliable) 

### Solution 2: Using CSV file containing coordinates 

In [7]:
# reading the csv file
geo_canada_df = pd.read_csv("Geospatial_Coordinates.csv")
geo_canada_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [8]:
# mapping geolocalisation parameters with canada_pc_df
canada_pc_df = canada_pc_df.merge(geo_canada_df, on="Postal Code")
canada_pc_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


# Exploring neighborhoods in Toronto

### limiting data set to boroughs located in Toronto

In [9]:
# Boroughs whose name contains "Toronto" (e.g. "Downtown Toronto", "East Toronto")
is_toronto_borough = canada_pc_df["Borough"].str.contains("Toronto")

# Keep those rows only and renumber the index from 0
toronto_pc_df = canada_pc_df.loc[is_toronto_borough].reset_index(drop=True)
toronto_pc_df.head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


### Exploring Toronto data set

#### Let's work on a specific neighbourhood "St. James Town"

In [13]:
neighbourhood_name = "St. James Town"

# Look the neighbourhood up by name rather than by a hard-coded row label, so the
# coordinates stay correct even if the row order of toronto_pc_df changes.
matching_rows = toronto_pc_df.loc[toronto_pc_df["Neighbourhood"] == neighbourhood_name]
if matching_rows.empty:
    raise ValueError("Neighbourhood not found in toronto_pc_df: " + neighbourhood_name)

# Coordinates of the first (expected: only) matching row.
neighbourhood_latitude = matching_rows["Latitude"].iloc[0]
neighbourhood_longitude = matching_rows["Longitude"].iloc[0]
print(neighbourhood_name, ", Latitude: ", neighbourhood_latitude, "; Longitude: ",neighbourhood_longitude)

St. James Town , Latitude:  43.6514939 ; Longitude:  -79.3754179


#### Defining Foursquare Credentials and version

In [14]:
# Foursquare credentials are read from environment variables so that the secrets
# never appear in the notebook source or in its saved outputs.
# A missing variable raises KeyError right here rather than producing a rejected request later.
CLIENT_ID = os.environ["FOURSQUARE_CLIENT_ID"]
CLIENT_SECRET = os.environ["FOURSQUARE_CLIENT_SECRET"]
# Not referenced by the request URLs built in the visible cells, hence optional (None when unset).
ACCESS_TOKEN = os.environ.get("FOURSQUARE_ACCESS_TOKEN")
# Value sent as the `v` query parameter of the API calls.
# NOTE(review): Foursquare documents this as a YYYYMMDD date string - confirm the value used.
VERSION = os.environ["FOURSQUARE_VERSION"]

#### Getting the top 10 venues that are in the St. James Town neighbourhood within a radius of 500m

In [15]:
venues_limit = 10
radius = 500

# "explore" endpoint query centred on the neighbourhood coordinates.
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    neighbourhood_latitude,
    neighbourhood_longitude,
    radius,
    venues_limit)

# Fetch the answer and decode its JSON body. The timeout keeps the notebook from
# hanging on a stalled connection, and raise_for_status() surfaces HTTP errors
# here instead of as a confusing KeyError when the payload is read later.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()

In [16]:
# json_normalize is a top-level pandas function since pandas 1.0; the historical
# pandas.io.json location is deprecated and only kept as a fallback for old installs.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize


def get_category_type(row):
    """Return the name of the first category of a venue record.

    Parameters
    ----------
    row : mapping-like (dict, pandas Series, ...)
        Record holding the category list under the key 'categories' or,
        when that key is absent, under 'venue.categories'.

    Returns
    -------
    The 'name' entry of the first category, or None when the category
    list is empty.

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a real
        # problem and must not be hidden.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']
    
# Raw venue records of the first result group
venues = results['response']['groups'][0]['items']

# Flatten the nested JSON and keep only the fields of interest
venue_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
venues_df = json_normalize(venues).loc[:, venue_columns]

# Reduce each category list to the name of its first entry
venues_df['venue.categories'] = venues_df.apply(get_category_type, axis=1)

# Keep only the last dotted component of each column name ("venue.location.lat" -> "lat")
venues_df.columns = [dotted_name.rsplit(".", 1)[-1] for dotted_name in venues_df.columns]
venues_df

Unnamed: 0,name,categories,lat,lng
0,Gyu-Kaku Japanese BBQ,Japanese Restaurant,43.651422,-79.375047
1,Fahrenheit Coffee,Coffee Shop,43.652384,-79.372719
2,Crepe TO,Creperie,43.650063,-79.374587
3,Versus Coffee,Coffee Shop,43.651213,-79.375236
4,GEORGE Restaurant,Restaurant,43.653346,-79.374445
5,Terroni,Italian Restaurant,43.650927,-79.375602
6,Hogtown Smoke,Food Truck,43.649287,-79.374689
7,Mystic Muffin,Middle Eastern Restaurant,43.652484,-79.372655
8,Aveda Institute Toronto,Cosmetics Shop,43.650096,-79.37363
9,GoodLife Fitness Toronto 137 Yonge Street,Gym,43.651242,-79.378068


### Now let's do the same thing for all neighbourhoods in Toronto

In [17]:
def getVenues(names, latitudes, longitudes, radius, venues_limit):
    """Collect the Foursquare "explore" venues around each neighbourhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences giving the name and coordinates of each neighbourhood.
    radius : number
        Value sent as the `radius` query parameter.
    venues_limit : int
        Value sent as the `limit` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame has these columns even when no venue was found.
        'Venue Category' is None for a venue without any category.

    Raises
    ------
    requests.HTTPError
        When the API answers with an error status.
    """
    column_names = ['Neighbourhood',
                    'Neighbourhood Latitude',
                    'Neighbourhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    rows = []
    for neighbourhood_name, neighbourhood_latitude, neighbourhood_longitude in zip(names, latitudes, longitudes):

        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            neighbourhood_latitude,
            neighbourhood_longitude,
            radius,
            venues_limit)

        # Bounded wait + explicit status check: a stalled or rejected call fails
        # here with a clear error instead of hanging or raising a KeyError below.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        items = response.json()["response"]['groups'][0]['items']

        for item in items:
            venue = item['venue']
            # A venue may come without categories; do not index into an empty list.
            categories = venue['categories']
            rows.append((
                neighbourhood_name,
                neighbourhood_latitude,
                neighbourhood_longitude,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps the schema valid even
    # when `rows` is empty (assigning 7 names to a 0-column frame would raise).
    return pd.DataFrame(rows, columns=column_names)

In [18]:
# One Foursquare query per Toronto neighbourhood: at most 10 venues within 500 m
toronto_venues_df = getVenues(
    toronto_pc_df['Neighbourhood'],
    toronto_pc_df['Latitude'],
    toronto_pc_df['Longitude'],
    radius=500,
    venues_limit=10,
)
toronto_venues_df.head(5)

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
1,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
2,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Regent Park, Harbourfront",43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


#### Let's explore the Toronto venues dataset and see how many venue categories we are getting

In [19]:
# Distinct venue categories, in order of first appearance
toronto_venues_categories = list(pd.unique(toronto_venues_df['Venue Category']))

# Number of categories on the first line, the categories themselves on the second
print(len(toronto_venues_categories), toronto_venues_categories, sep="\n")

123
['Coffee Shop', 'Bakery', 'Distribution Center', 'Spa', 'Restaurant', 'Pub', 'Breakfast Spot', 'Park', 'Gym / Fitness Center', 'Historic Site', 'Portuguese Restaurant', 'Italian Restaurant', 'Beer Bar', 'Creperie', 'Yoga Studio', 'Sushi Restaurant', 'Clothing Store', 'Pizza Place', 'Burrito Place', 'Comic Shop', 'Café', 'Plaza', 'Burger Joint', 'Music Venue', 'Ramen Restaurant', 'Theater', 'Japanese Restaurant', 'Food Truck', 'Middle Eastern Restaurant', 'Cosmetics Shop', 'Gym', 'Trail', 'Health Food Store', 'Neighborhood', 'Liquor Store', 'Vegetarian / Vegan Restaurant', 'Museum', 'Farmers Market', 'Cocktail Bar', 'Seafood Restaurant', 'Gastropub', 'Modern European Restaurant', 'Grocery Store', 'Candy Store', 'Concert Hall', 'Hotel', 'Speakeasy', 'Steakhouse', 'Lounge', 'Bar', 'Brewery', 'Supermarket', 'Bank', 'Dessert Shop', 'Lake', 'Performing Arts Venue', 'Salad Place', 'Sporting Goods Shop', 'Asian Restaurant', 'Korean Restaurant', 'Cuban Restaurant', 'Wine Bar', 'Ice Cream Sh

### One hot encoding to transform categories to binary columns

In [20]:
# One indicator column per venue category; the empty prefix and separator make
# the column names the bare category names
toronto_venues_transformed = pd.get_dummies(
    toronto_venues_df[['Venue Category']],
    prefix="",
    prefix_sep="",
)

# Carry the neighbourhood coordinates and name alongside the indicators
for carried_column in ('Neighbourhood Latitude', 'Neighbourhood Longitude', 'Neighbourhood'):
    toronto_venues_transformed[carried_column] = toronto_venues_df[carried_column]

# Move the last column to the front and keep every other column in its current order
*leading_columns, trailing_column = toronto_venues_transformed.columns
edited_columns = [trailing_column, *leading_columns]
toronto_venues_transformed = toronto_venues_transformed[edited_columns]

toronto_venues_transformed.head()

Unnamed: 0,Neighbourhood,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Arts & Crafts Store,Asian Restaurant,...,Thai Restaurant,Theater,Theme Restaurant,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Yoga Studio,Neighbourhood Latitude,Neighbourhood Longitude
0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,43.65426,-79.360636
1,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,43.65426,-79.360636
2,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,43.65426,-79.360636
3,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,43.65426,-79.360636
4,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,43.65426,-79.360636


### Now let's group toronto_venues_transformed by neighbourhood, taking the mean frequency of occurrence of each category

In [21]:
# Average every column within each neighbourhood; for the indicator columns this
# is the share of the neighbourhood's venues that fall in each category
venues_by_neighbourhood = toronto_venues_transformed.groupby('Neighbourhood')
toronto_venues_transformed_grouped = venues_by_neighbourhood.mean().reset_index()
toronto_venues_transformed_grouped.head(5)

Unnamed: 0,Neighbourhood,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Arts & Crafts Store,Asian Restaurant,...,Thai Restaurant,Theater,Theme Restaurant,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Yoga Studio,Neighbourhood Latitude,Neighbourhood Longitude
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,43.644771,-79.373306
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.636847,-79.428191
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.662744,-79.321558
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.1,0.1,0.1,0.1,0.2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.628947,-79.39442
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.657952,-79.387383


# Clustering neighbourhoods in Toronto

In [22]:
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

# Number of clusters requested from k-means.
kclusters = 5

# Feature matrix: every column except the neighbourhood name. The `columns=` keyword
# replaces the positional axis argument, which recent pandas versions no longer accept.
# 'Cluster Labels' is dropped too (when present) so that re-running this cell does not
# feed the labels of a previous run back into the clustering.
# NOTE(review): the 'Neighbourhood Latitude' / 'Neighbourhood Longitude' columns are
# still part of the features, as in the original cell - confirm this is intended.
toronto_venues_transformed_grouped_clustering = toronto_venues_transformed_grouped.drop(
    columns=['Neighbourhood', 'Cluster Labels'], errors='ignore')

# Fixed random_state keeps the cluster assignment reproducible between runs.
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_venues_transformed_grouped_clustering)

# Store the labels right after the neighbourhood name; on a re-run the existing
# column is overwritten (DataFrame.insert refuses a column name that already exists).
if "Cluster Labels" in toronto_venues_transformed_grouped.columns:
    toronto_venues_transformed_grouped["Cluster Labels"] = kmeans.labels_
else:
    toronto_venues_transformed_grouped.insert(1, "Cluster Labels", kmeans.labels_)

toronto_venues_transformed_grouped.head()

Unnamed: 0,Neighbourhood,Cluster Labels,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Arts & Crafts Store,...,Thai Restaurant,Theater,Theme Restaurant,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Yoga Studio,Neighbourhood Latitude,Neighbourhood Longitude
0,Berczy Park,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,43.644771,-79.373306
1,"Brockton, Parkdale Village, Exhibition Place",2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.636847,-79.428191
2,"Business reply mail Processing Centre, South C...",0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.662744,-79.321558
3,"CN Tower, King and Spadina, Railway Lands, Har...",2,0.1,0.1,0.1,0.1,0.2,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.628947,-79.39442
4,Central Bay Street,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.657952,-79.387383


### Visualizing clusters on map

In [23]:
from geopy.geocoders import Nominatim
import folium

# Resolve the city name to coordinates; they are used to centre the map.
address = "Toronto, ON"
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError("Nominatim returned no result for address: " + address)

latitude = location.latitude
longitude = location.longitude
print('Latitude & Longitude of Toronto city are {}, {}.'.format(latitude, longitude))

Latitude & Longitude of Toronto city are 43.6534817, -79.3839347.


In [24]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(
        toronto_venues_transformed_grouped['Neighbourhood Latitude'], 
        toronto_venues_transformed_grouped['Neighbourhood Longitude'], 
        toronto_venues_transformed_grouped['Neighbourhood'], 
        toronto_venues_transformed_grouped['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters