# Part 1: Scraping of Data on Toronto Neighborhoods from Wikipedia

#### Installing and importing libraries


In [1]:
!pip install bs4
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup



#### Request the web page and parse it with BeautifulSoup

In [2]:
# Wikipedia page that lists the Canadian postal codes starting with "M".
link="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Fail fast on a stalled connection or an HTTP error instead of silently
# parsing an error page and finding no table further down.
toronto_response = requests.get(link, timeout=30)
toronto_response.raise_for_status()
toronto_wiki = toronto_response.text

# Parse the raw HTML into a navigable tree.
toronto_bs=BeautifulSoup(toronto_wiki,"html5lib")


#### Extract the table 

In [3]:
# find() returns the first <table> element in the parsed page.
# NOTE(review): presumably this is the postal-code table; verify if the page layout changes.
toronto_table_bs=toronto_bs.find("table")

#### Extract data from the table and append to dataframe

In [4]:
# Walk every cell of the table and collect one record per cell. Records are
# gathered in a plain list and converted to a DataFrame once at the end:
# DataFrame.append copied the whole frame on each call and no longer exists
# in pandas 2.0+.
neigh_records = []

for row in toronto_table_bs.find_all("tr"):
    for cell in row.find_all("td"):

        # The postal code is read from the cell's first <b> element; a cell
        # without one carries no postal code, so skip it instead of crashing.
        if cell.b is None:
            continue
        Postal_Code = str(cell.b.string)

        # Text of every hyperlink in the cell: the first link is taken as the
        # borough, any remaining links as the neighbourhoods.
        link_texts = [str(anchor.string) for anchor in cell.find_all('a', href=True)]

        Borough = link_texts[0] if link_texts else ""
        Neighborhood = ", ".join(link_texts[1:])

        # A borough without separately listed neighbourhoods stands in as its
        # own neighbourhood.
        if Neighborhood == "":
            Neighborhood = Borough

        neigh_records.append({"Postal Code": Postal_Code, "Borough": Borough, "Neighborhood": Neighborhood})

# Passing columns= keeps the expected layout even if no record was collected.
toronto_neigh_data = pd.DataFrame(neigh_records, columns=["Postal Code", "Borough", "Neighborhood"])
    

#### Remove empty cells from the dataframe and reset index


In [5]:
# Treat empty strings as missing values, drop every row that contains one,
# and renumber the remaining rows from zero.
toronto_neigh_data = (
    toronto_neigh_data
    .replace("", np.nan)
    .dropna(axis=0)
    .reset_index(drop=True)
)

In [6]:
# Preview the first 20 rows of the cleaned table.
toronto_neigh_data.head(20)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [7]:
# (rows, columns) of the cleaned table.
toronto_neigh_data.shape

(101, 3)

# Part 2: Obtaining the Longitude and Latitudes of the Postal Codes

#### Install and import all required libraries

In [8]:
!pip install folium
import folium 

!pip install geocoder
import geocoder 



#### Get latitudes and longitudes for each postal code using geocoder (disabled; the CSV below is used instead)

In [9]:
#def get_lat_lng(toronto):

    # initialize your variable to None
    #lat_lng_coords = None

    # loop until you get the coordinates
    #while(lat_lng_coords is None):
        #g = geocoder.google('{}, Toronto, Ontario'.format(toronto))
        #lat_lng_coords = g.latlng

    #latitude = lat_lng_coords[0]
    #longitude = lat_lng_coords[1]
    
    #return latitude, longitude
    

#### Get latitudes and longitudes for each postal code from the CSV file (geocoder was taking too long)

In [17]:
# Load the coordinate table from a CSV file; the relative path means the file
# must sit in the notebook's working directory.
lat_lng_df=pd.read_csv("Geospatial_Coordinates.csv")
lat_lng_df

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [18]:
# Attach coordinates to each postal code. The merge uses the default inner
# join, so a postal code missing from either table is dropped.
toronto_merged = pd.merge(toronto_neigh_data, lat_lng_df, on="Postal Code")
toronto_merged

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
...,...,...,...,...,...
96,M8X,Etobicoke,"The Kingsway, Old Mill",43.653654,-79.506944
97,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
98,M7Y,Business reply mail,Business reply mail,43.662744,-79.321558
99,M8Y,Etobicoke,"Old Mill, Sunnylea, Humber Bay, Mimico, The Qu...",43.636258,-79.498509


# Part 3: Exploring and clustering the neighborhoods in Toronto

#### Extract only the neighbourhoods with boroughs that have the word Toronto in them

In [53]:
# Keep only the rows whose borough name contains the substring "Toronto".
is_toronto_borough = toronto_merged["Borough"].str.contains("Toronto")
toronto_explore = toronto_merged.loc[is_toronto_borough]
toronto_explore

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
20,M5E,Downtown Toronto,Downtown Toronto,43.644771,-79.373306
24,M5G,Downtown Toronto,Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Downtown Toronto,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, King",43.650571,-79.384568
36,M5J,Downtown Toronto,"Harbourfront, Union Station, Toronto Islands",43.640816,-79.381752
42,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576
48,M5L,Downtown Toronto,"Commerce Court, Victoria Hotel",43.648198,-79.379817


#### Define Foursquare credentials

In [115]:
import os

# Credentials are read from environment variables so that real keys never have
# to be written into (and saved with) the notebook. The placeholders are the
# fallback when a variable is not set.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '...') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '....') # your Foursquare Secret
ACCESS_TOKEN = os.environ.get('FOURSQUARE_ACCESS_TOKEN', '...')
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# Never echo the secret itself: cell output is stored in the notebook file and
# travels with it. A fixed mask is printed instead.
print('CLIENT_SECRET:' + '....')

Your credentails:
CLIENT_ID: ...
CLIENT_SECRET:....


#### Get nearby venues for the neighbourhoods

In [55]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        One label and one coordinate pair per location; they are consumed
        together with zip().
    radius : int, default 500
        Value sent as the request's ``radius`` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty (but keeps these columns) when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string rather than formatting
        # credentials into the URL by hand; the timeout keeps one stalled
        # request from hanging the whole run.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'oauth_token': ACCESS_TOKEN,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        # Surface HTTP failures here instead of as a KeyError on the payload.
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            # A venue may come back without any category; record None rather
            # than failing on the missing first element.
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing columns= here (instead of assigning them afterwards) also works
    # when no venue was collected at all.
    nearby_venues = pd.DataFrame(venue_rows, columns=columns)

    return(nearby_venues)

In [56]:
# Collect the venues around every selected neighbourhood, using the function's
# default search radius.
toronto_venues = getNearbyVenues(
    names=toronto_explore['Neighborhood'],
    latitudes=toronto_explore['Latitude'],
    longitudes=toronto_explore['Longitude'],
)

Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
Downtown Toronto
Bay Street
Downtown Toronto
Richmond, King
Harbourfront, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
North Toronto
University of Toronto
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront, South Niagara, Island airport
Rosedale
Downtown Toronto
St. James Town, Cabbagetown
First Canadian Place, Underground city
Church and Wellesley


In [57]:
# Preview the first venue rows.
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
1,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
2,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park, Harbourfront",43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant
4,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa


In [58]:
# Number of venue rows per neighbourhood label. Rows sharing the same label
# (e.g. several postal codes with the same name) are counted together.
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bay Street,100,100,100,100,100,100
"CN Tower, King and Spadina, Railway Lands, Harbourfront, South Niagara, Island airport",19,19,19,19,19,19
Church and Wellesley,100,100,100,100,100,100
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
Downtown Toronto,229,229,229,229,229,229
"First Canadian Place, Underground city",100,100,100,100,100,100
"Garden District, Ryerson",100,100,100,100,100,100
"Harbourfront, Union Station, Toronto Islands",100,100,100,100,100,100
"Kensington Market, Chinatown, Grange Park",100,100,100,100,100,100
North Toronto,39,39,39,39,39,39


#### Encode the venue categories as columns of the dataframe to allow clustering

In [96]:
# Placeholder only: the following cell rebinds toronto_onehot to the encoded DataFrame.
toronto_onehot=[]

In [97]:
# One-hot encode the venue categories: one indicator column per category.
# dtype=int yields 0/1 integers on every pandas version (pandas 2.x would
# otherwise produce booleans).
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="", dtype=int)

# A venue category that is literally called "Neighborhood" would be silently
# overwritten by the label column added below, so move it out of the way first.
if 'Neighborhood' in toronto_onehot.columns:
    toronto_onehot = toronto_onehot.rename(columns={'Neighborhood': 'Neighborhood (Venue Category)'})

# Add the neighbourhood label as the first column.
toronto_onehot.insert(0, 'Neighborhood', toronto_venues['Neighborhood'])

toronto_onehot.head()

Unnamed: 0,Neighborhood,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [98]:
# Average the indicator columns per neighbourhood: each value becomes the share
# of that neighbourhood's venue rows falling in the category.
category_means = toronto_onehot.groupby('Neighborhood').mean()
toronto_grouped = category_means.reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.01
1,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.052632,0.052632,0.052632,0.105263,0.157895,0.105263,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Church and Wellesley,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,...,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.01,0.03
3,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,...,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0,0.0
4,Downtown Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0131,0.004367,...,0.0,0.0,0.0,0.0131,0.0,0.0,0.0,0.0,0.0,0.004367
5,"First Canadian Place, Underground city",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,...,0.0,0.0,0.01,0.01,0.0,0.0,0.01,0.0,0.0,0.0
6,"Garden District, Ryerson",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Harbourfront, Union Station, Toronto Islands",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.01,0.0,0.0,0.01,0.0,0.0,0.0
8,"Kensington Market, Chinatown, Grange Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.0,0.0,0.05,0.0,0.05,0.01,0.0,0.0,0.0
9,North Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.025641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025641


In [99]:
# (neighbourhood rows, 1 label column + category columns).
toronto_grouped.shape

(17, 225)

#### top five venues per neighborhood

In [100]:
num_top_venues = 5

# Print, for every neighbourhood, its most frequent venue categories.
for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")

    # Turn the neighbourhood's single row into a two-column (venue, freq) table.
    hood_row = toronto_grouped.loc[toronto_grouped['Neighborhood'] == hood]
    freq_table = hood_row.T.reset_index()
    freq_table.columns = ['venue','freq']

    # The first entry holds the neighbourhood label itself, not a frequency.
    ranked = (
        freq_table.iloc[1:]
        .assign(freq=lambda table: table['freq'].astype(float))
        .round({'freq': 2})
        .sort_values('freq', ascending=False)
        .reset_index(drop=True)
    )
    print(ranked.head(num_top_venues))
    print('\n')

----Bay Street----
              venue  freq
0       Coffee Shop  0.14
1              Café  0.08
2    Sandwich Place  0.05
3  Sushi Restaurant  0.04
4    Ice Cream Shop  0.03


----CN Tower, King and Spadina, Railway Lands, Harbourfront, South Niagara, Island airport----
                 venue  freq
0      Airport Service  0.16
1  Rental Car Location  0.11
2       Airport Lounge  0.11
3     Airport Terminal  0.11
4      Harbor / Marina  0.05


----Church and Wellesley----
                 venue  freq
0          Coffee Shop  0.07
1              Gay Bar  0.05
2     Sushi Restaurant  0.05
3  Japanese Restaurant  0.04
4         Burger Joint  0.03


----Commerce Court, Victoria Hotel----
         venue  freq
0  Coffee Shop  0.14
1   Restaurant  0.07
2         Café  0.07
3        Hotel  0.06
4          Gym  0.04


----Downtown Toronto----
          venue  freq
0   Coffee Shop  0.11
1          Café  0.04
2           Pub  0.03
3  Cocktail Bar  0.03
4        Bakery  0.03


----First Canadian Pl

#### Put the top 5 venues per neighborhood in a dataframe

In [101]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first entry of ``row`` is skipped (it holds the neighbourhood label);
    the rest are sorted in descending order and the index labels of the first
    ``num_top_venues`` are returned as a NumPy array.
    """
    frequencies = row.iloc[1:]
    ranked_labels = frequencies.sort_values(ascending=False).index.values
    return ranked_labels[:num_top_venues]

In [102]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']


def _ordinal(rank):
    """Return ``rank`` with its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th, 21st, ...)."""
    # 11, 12 and 13 take "th" despite ending in 1, 2 and 3.
    if rank % 100 in (11, 12, 13) or not 1 <= rank % 10 <= 3:
        return '{}th'.format(rank)
    return '{}{}'.format(rank, indicators[rank % 10 - 1])


# One label column plus one column per rank. The suffix is chosen explicitly
# rather than by catching an IndexError with a bare except.
columns = ['Neighborhood'] + [
    '{} Most Common Venue'.format(_ordinal(rank))
    for rank in range(1, num_top_venues + 1)
]

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# Fill the rank columns row by row with each neighbourhood's top categories.
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Bay Street,Coffee Shop,Café,Sandwich Place,Sushi Restaurant,Chinese Restaurant
1,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Rental Car Location,Airport Lounge,Airport Terminal,Coffee Shop
2,Church and Wellesley,Coffee Shop,Gay Bar,Sushi Restaurant,Japanese Restaurant,Yoga Studio
3,"Commerce Court, Victoria Hotel",Coffee Shop,Restaurant,Café,Hotel,Gym
4,Downtown Toronto,Coffee Shop,Café,Cocktail Bar,Bakery,Italian Restaurant


#### Cluster the neighborhoods

In [103]:
#import required libraries
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

# Cluster on the category frequencies only. The axis is named explicitly
# because the positional form drop('Neighborhood', 1) is rejected by pandas 2.0+.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering (random_state fixes the initialisation so labels are reproducible)
kmeans = KMeans(init="k-means++",n_clusters=kclusters, random_state=0,n_init=5).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:20]

array([2, 3, 0, 2, 2, 2, 0, 2, 0, 0, 0, 0, 1, 0, 2, 2, 4])

In [104]:
# add clustering labels
# insert() refuses to add a column that already exists, so drop a label column
# left over from a previous run first; this keeps the cell re-runnable.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Attach the cluster label and the top venue categories to every selected
# postal-code row, matching on the neighbourhood name, then renumber the rows.
toronto_lmerged = (
    toronto_explore
    .join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
    .reset_index(drop=True)
)

In [105]:
# Preview the merged rows, including the cluster label and top venue columns.
toronto_lmerged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,Coffee Shop,Pub,Bakery,Café,Park
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Coffee Shop,Clothing Store,Café,Cosmetics Shop,Middle Eastern Restaurant
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0,Café,Coffee Shop,Restaurant,Italian Restaurant,Hotel
3,M5E,Downtown Toronto,Downtown Toronto,43.644771,-79.373306,2,Coffee Shop,Café,Cocktail Bar,Bakery,Italian Restaurant
4,M5G,Downtown Toronto,Bay Street,43.657952,-79.387383,2,Coffee Shop,Café,Sandwich Place,Sushi Restaurant,Chinese Restaurant


In [107]:
# Remove the postal code column. errors='ignore' plus rebinding (instead of an
# in-place drop) lets the cell be re-run without a KeyError once the column is gone.
toronto_lmerged = toronto_lmerged.drop(columns='Postal Code', errors='ignore')
toronto_lmerged

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,Coffee Shop,Pub,Bakery,Café,Park
1,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Coffee Shop,Clothing Store,Café,Cosmetics Shop,Middle Eastern Restaurant
2,Downtown Toronto,St. James Town,43.651494,-79.375418,0,Café,Coffee Shop,Restaurant,Italian Restaurant,Hotel
3,Downtown Toronto,Downtown Toronto,43.644771,-79.373306,2,Coffee Shop,Café,Cocktail Bar,Bakery,Italian Restaurant
4,Downtown Toronto,Bay Street,43.657952,-79.387383,2,Coffee Shop,Café,Sandwich Place,Sushi Restaurant,Chinese Restaurant
5,Downtown Toronto,Downtown Toronto,43.669542,-79.422564,2,Coffee Shop,Café,Cocktail Bar,Bakery,Italian Restaurant
6,Downtown Toronto,"Richmond, King",43.650571,-79.384568,0,Coffee Shop,Café,Steakhouse,Sushi Restaurant,Hotel
7,Downtown Toronto,"Harbourfront, Union Station, Toronto Islands",43.640816,-79.381752,2,Coffee Shop,Aquarium,Café,Hotel,Italian Restaurant
8,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576,2,Coffee Shop,Café,Hotel,Restaurant,Seafood Restaurant
9,Downtown Toronto,"Commerce Court, Victoria Hotel",43.648198,-79.379817,2,Coffee Shop,Restaurant,Café,Hotel,Gym


In [114]:
# create map

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# Fixed map centre (hardcoded coordinates instead of a geocoding lookup).
latitude = 43.651070
longitude =-79.347015

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_lmerged['Latitude'], toronto_lmerged['Longitude'], toronto_lmerged['Neighborhood'], toronto_lmerged['Cluster Labels']):
    # A row whose neighbourhood found no match in the join has a missing
    # cluster label; it cannot be coloured, so leave it off the map.
    if pd.isna(cluster):
        continue
    # A missing value anywhere turns the whole label column into floats, and a
    # float cannot index a list.
    cluster = int(cluster)

    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster - 1 maps cluster 0 to the last colour through negative indexing;
    # every cluster still gets its own colour.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters