### This notebook scrap data of Toronto city from Wikipedia to dataframe

In [1]:
import requests
from bs4 import BeautifulSoup

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

# import k-means from clustering stage
from sklearn.cluster import KMeans

import matplotlib.cm as cm
import matplotlib.colors as colors

!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library

print('Folium installed')
print('Libraries imported.')

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0   conda-forge
    geopy:         1.17.0-py_0 conda-forge

geographiclib- 100% |################################| Time: 0:00:00   1.06 MB/s
geopy-1.17.0-p 100% |################################| Time: 0:00:00   1.62 MB/s
Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.0-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00   2.98 MB/s
branca-0.3.0-p 100% |################################| Time: 0:00:00  28.09 MB/s
vincent-0.4.4- 100% |###################

#### FROM PART 1 & PART 2

In [2]:
page = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
contents = page.content
soup = BeautifulSoup(page.content,'lxml')
table = soup.find_all('table')[0] 
df_raw = pd.read_html(str(table))[0]

df_raw.to_csv('Toronto.csv')
df_t1 = pd.read_csv('Toronto.csv', skiprows = 1)
df_t2 = df_t1.drop('0', axis=1)

df_t3 = df_t2.drop(df_t2[df_t2.Borough =='Not assigned'].index)
df_t3['Neighbourhood'] = np.where(df_t3['Neighbourhood'] == 'Not assigned',\
                                  df_t3['Borough'], df_t3['Neighbourhood'] )

df =df_t3.groupby(['Postcode', 'Borough'], sort=False)['Neighbourhood'].apply(', '.join).reset_index()

df_geo=pd.read_csv('https://cocl.us/Geospatial_data/Geospatial_Coordinates.csv',  \
                   skiprows = 1, names = ['Postcode', 'Latitude', 'Longitude'])

df_f = pd.merge(df, df_geo, on='Postcode')

#### Select all the borough named 'Downtown Toronto'

In [3]:
downtown = df_f[df_f['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
downtown

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
1,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
4,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
5,M6G,Downtown Toronto,Christie,43.669542,-79.422564
6,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568
7,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station",43.640816,-79.381752
8,M5K,Downtown Toronto,"Design Exchange, Toronto Dominion Centre",43.647177,-79.381576
9,M5L,Downtown Toronto,"Commerce Court, Victoria Hotel",43.648198,-79.379817


In [5]:
address = 'Downtown Toronto, ON'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of Downtown Toronto are 43.655115, -79.380219.


#### First look at the map of downtown

In [7]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(downtown['Latitude'], downtown['Longitude'], downtown['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
#map_toronto

#### Provide Foursquare credentials

In [8]:
CLIENT_ID = 'DCJ1IPZDGGG5CGIFJJJ5WAFSYU4NK0SDT5WQ0A1OJ3MIMIRB' # your Foursquare ID
CLIENT_SECRET = '0SW2I53JTBYJCM2IHWBNBSYYRBXFDW3ABLRSZI4XQBY20YR0' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentials:
CLIENT_ID: DCJ1IPZDGGG5CGIFJJJ5WAFSYU4NK0SDT5WQ0A1OJ3MIMIRB
CLIENT_SECRET:0SW2I53JTBYJCM2IHWBNBSYYRBXFDW3ABLRSZI4XQBY20YR0


#### Function to explore venues within radius 500m of each neighbourhood 

In [9]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    LIMIT = 100
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### List of all venues found in downtown

In [10]:
downtown_venues = getNearbyVenues(names=downtown['Neighbourhood'],
                                   latitudes=downtown['Latitude'],
                                   longitudes=downtown['Longitude']
                                  )

Harbourfront, Regent Park
Ryerson, Garden District
St. James Town
Berczy Park
Central Bay Street
Christie
Adelaide, King, Richmond
Harbourfront East, Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Harbord, University of Toronto
Chinatown, Grange Park, Kensington Market
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Rosedale
Stn A PO Boxes 25 The Esplanade
Cabbagetown, St. James Town
First Canadian Place, Underground city
Church and Wellesley


In [11]:
downtown_venues.head()

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Harbourfront, Regent Park",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Harbourfront, Regent Park",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Harbourfront, Regent Park",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
3,"Harbourfront, Regent Park",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
4,"Harbourfront, Regent Park",43.65426,-79.360636,Cooper Koo YMCA,43.653191,-79.357947,Gym / Fitness Center


#### Number of venues found at each neighbourhood

In [12]:
downtown_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Berczy Park,53,53,53,53,53,53
"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",14,14,14,14,14,14
"Cabbagetown, St. James Town",48,48,48,48,48,48
Central Bay Street,82,82,82,82,82,82
"Chinatown, Grange Park, Kensington Market",100,100,100,100,100,100
Christie,16,16,16,16,16,16
Church and Wellesley,88,88,88,88,88,88
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
"Design Exchange, Toronto Dominion Centre",100,100,100,100,100,100


#### Top five venues at each neighbourhood

In [13]:
# one hot encoding
downtown_onehot = pd.get_dummies(downtown_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
downtown_onehot['Neighbourhood'] = downtown_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [downtown_onehot.columns[-1]] + list(downtown_onehot.columns[:-1])
downtown_onehot = downtown_onehot[fixed_columns]

downtown_grouped = downtown_onehot.groupby('Neighbourhood').mean().reset_index()

num_top_venues = 5

for hood in downtown_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = downtown_grouped[downtown_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond----
                 venue  freq
0          Coffee Shop  0.07
1                 Café  0.06
2  American Restaurant  0.04
3      Thai Restaurant  0.04
4           Steakhouse  0.04


----Berczy Park----
                venue  freq
0         Coffee Shop  0.09
1        Cocktail Bar  0.06
2          Steakhouse  0.04
3         Cheese Shop  0.04
4  Seafood Restaurant  0.04


----CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara----
              venue  freq
0    Airport Lounge  0.14
1   Airport Service  0.14
2  Airport Terminal  0.14
3             Plane  0.07
4     Boat or Ferry  0.07


----Cabbagetown, St. James Town----
               venue  freq
0        Coffee Shop  0.10
1         Restaurant  0.08
2                Pub  0.04
3  Indian Restaurant  0.04
4             Bakery  0.04


----Central Bay Street----
                 venue  freq
0          Coffee Shop  0.15
1                 Café  0.06
2   Italian Re

#### Detail of 10 most common venues at each neighbourhood

In [14]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [15]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = downtown_grouped['Neighbourhood']

for ind in np.arange(downtown_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(downtown_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Thai Restaurant,Steakhouse,American Restaurant,Cosmetics Shop,Gym,Restaurant,Hotel,Bar
1,Berczy Park,Coffee Shop,Cocktail Bar,Farmers Market,Bakery,Seafood Restaurant,Cheese Shop,Café,Steakhouse,Beer Bar,Restaurant
2,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Lounge,Airport Terminal,Airport Service,Harbor / Marina,Sculpture Garden,Boutique,Plane,Boat or Ferry,Airport Gate,Airport
3,"Cabbagetown, St. James Town",Coffee Shop,Restaurant,Park,Pub,Bakery,Pizza Place,Café,Chinese Restaurant,Indian Restaurant,Italian Restaurant
4,Central Bay Street,Coffee Shop,Café,Italian Restaurant,Bubble Tea Shop,Sandwich Place,Bar,Burger Joint,Japanese Restaurant,Spa,Salad Place
5,"Chinatown, Grange Park, Kensington Market",Café,Vegetarian / Vegan Restaurant,Chinese Restaurant,Bar,Vietnamese Restaurant,Mexican Restaurant,Bakery,Coffee Shop,Dumpling Restaurant,Noodle House
6,Christie,Grocery Store,Café,Park,Athletics & Sports,Italian Restaurant,Diner,Nightclub,Convenience Store,Restaurant,Baby Store
7,Church and Wellesley,Japanese Restaurant,Coffee Shop,Gay Bar,Burger Joint,Sushi Restaurant,Restaurant,Mediterranean Restaurant,Pub,Men's Store,Café
8,"Commerce Court, Victoria Hotel",Coffee Shop,Café,Hotel,Restaurant,American Restaurant,Deli / Bodega,Gastropub,Steakhouse,Italian Restaurant,Gym
9,"Design Exchange, Toronto Dominion Centre",Coffee Shop,Hotel,Café,American Restaurant,Sports Bar,Italian Restaurant,Deli / Bodega,Gastropub,Gym,Restaurant


#### Cluster neighbourhood

In [16]:
# set number of clusters
kclusters = 6

downtown_grouped_clustering = downtown_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(downtown_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 2, 3, 2, 2, 5, 4, 2, 0, 0], dtype=int32)

In [17]:
downtown_merged = downtown

# add clustering labels
downtown_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
downtown_merged = downtown_merged.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

downtown_merged.head() 

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,2,Coffee Shop,Bakery,Café,Park,Mexican Restaurant,Restaurant,Breakfast Spot,Pub,Theater,Yoga Studio
1,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,2,Coffee Shop,Clothing Store,Café,Cosmetics Shop,Bar,Japanese Restaurant,Theater,Tea Room,Movie Theater,Pizza Place
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,3,Coffee Shop,Café,Restaurant,Clothing Store,Hotel,Cosmetics Shop,Gastropub,Bakery,Italian Restaurant,Cocktail Bar
3,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,2,Coffee Shop,Cocktail Bar,Farmers Market,Bakery,Seafood Restaurant,Cheese Shop,Café,Steakhouse,Beer Bar,Restaurant
4,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,2,Coffee Shop,Café,Italian Restaurant,Bubble Tea Shop,Sandwich Place,Bar,Burger Joint,Japanese Restaurant,Spa,Salad Place


#### Visualize the results

In [18]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(downtown_merged['Latitude'], downtown_merged['Longitude'], downtown_merged['Neighbourhood'], downtown_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### Check the cluster 1

In [19]:
downtown_merged.loc[downtown_merged['Cluster Labels'] == 0, \
                     downtown_merged.columns[[1] + list(range(5, downtown_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,Downtown Toronto,0,Coffee Shop,Hotel,Café,American Restaurant,Sports Bar,Italian Restaurant,Deli / Bodega,Gastropub,Gym,Restaurant
9,Downtown Toronto,0,Coffee Shop,Café,Hotel,Restaurant,American Restaurant,Deli / Bodega,Gastropub,Steakhouse,Italian Restaurant,Gym
10,Downtown Toronto,0,Café,Theater,Bakery,Bar,Bookstore,Restaurant,Japanese Restaurant,Coffee Shop,Chinese Restaurant,Poutine Place
12,Downtown Toronto,0,Airport Lounge,Airport Terminal,Airport Service,Harbor / Marina,Sculpture Garden,Boutique,Plane,Boat or Ferry,Airport Gate,Airport


#### Check the cluster 3

In [20]:
downtown_merged.loc[downtown_merged['Cluster Labels'] == 2, \
                     downtown_merged.columns[[1] + list(range(5, downtown_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,2,Coffee Shop,Bakery,Café,Park,Mexican Restaurant,Restaurant,Breakfast Spot,Pub,Theater,Yoga Studio
1,Downtown Toronto,2,Coffee Shop,Clothing Store,Café,Cosmetics Shop,Bar,Japanese Restaurant,Theater,Tea Room,Movie Theater,Pizza Place
3,Downtown Toronto,2,Coffee Shop,Cocktail Bar,Farmers Market,Bakery,Seafood Restaurant,Cheese Shop,Café,Steakhouse,Beer Bar,Restaurant
4,Downtown Toronto,2,Coffee Shop,Café,Italian Restaurant,Bubble Tea Shop,Sandwich Place,Bar,Burger Joint,Japanese Restaurant,Spa,Salad Place
7,Downtown Toronto,2,Coffee Shop,Hotel,Aquarium,Pizza Place,Café,Italian Restaurant,Scenic Lookout,Sports Bar,Brewery,Fried Chicken Joint
13,Downtown Toronto,2,Park,Playground,Trail,Yoga Studio,Department Store,Electronics Store,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run
15,Downtown Toronto,2,Coffee Shop,Restaurant,Park,Pub,Bakery,Pizza Place,Café,Chinese Restaurant,Indian Restaurant,Italian Restaurant
16,Downtown Toronto,2,Coffee Shop,Café,Hotel,Restaurant,Steakhouse,American Restaurant,Deli / Bodega,Bar,Gym,Gastropub
17,Downtown Toronto,2,Japanese Restaurant,Coffee Shop,Gay Bar,Burger Joint,Sushi Restaurant,Restaurant,Mediterranean Restaurant,Pub,Men's Store,Café
