In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
req = requests.get(url).text
soup = BeautifulSoup(req, 'html')

## Let's read the contents of the web page and look for the cells

In [3]:
# The contents of the table cell for reference
print(soup.find_all('td')[6].text)
soup.find_all('td')[6]


M7AQueen's Park(Ontario Provincial Government)




<td style="width:11%; vertical-align:top;">
<p><b>M7A</b><br/><span style="font-size:85%;"><a href="/wiki/Queen%27s_Park_(Toronto)" title="Queen's Park (Toronto)">Queen's Park</a><br/>(Ontario Provincial Government)</span>
</p>
</td>

In [4]:
# First let's make a list that will contain all the information
comm = []
# Run through all the instances of a table cell
for instance in soup.find_all('td'):
    # Initiate the dictionary that will hold the cell's data
    commDict = {}
    # The try will kick out if there is no information in the cell and make no entry
    try:
        # The postal code is wrapped up in the first Bold tag <b></b>
        commDict['PostalCode'] = instance.b.text
        # If the cell is unassaigned it will not allocate any information
        if instance.span.text == 'Not assigned':
            pass
        else:
            data = instance.span.text.split(')')
            # Now the Borough is in the first bit of the text before the '('
            commDict['Borough'] = instance.span.text[:instance.span.text.find('(')]
            commDict['Neighborhood'] = ''
            for area in data:
                if commDict['Neighborhood'] != '' and len(area.split('(')>1):
                    commDict['Neighborhood'] = commDict['Neighborhood'] + ',' 
                # The neighborhoods content is in between the cells and seperated by ' / '
                commDict['Neighborhood'] = commDict['Neighborhood'] + area[instance.span.text.find('(')+1:].replace(' / ',',')
    except:
        pass
    # Now we read the data into the list 'comm' if there is information
    try:
        if commDict['Borough'] is not None:
            comm.append(commDict)
    except:
        pass

In [5]:
# Now we read the dictionary into a dataframe
df_scrape = pd.DataFrame(comm)
df_scrape

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park,Harbourfront"
3,M6A,North York,"Lawrence Manor,Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East TorontoBusiness reply mail Processing Cen...,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South,King's Mill Park,Sunnylea,Humbe..."


In [6]:
# Let's remove areas that are just postal addresses
ban_list = ['M7Y','M5W','M7R','M7A']
df_scrape = df_scrape[~df_scrape['PostalCode'].isin(ban_list)]
df_scrape.shape

(99, 3)

In [7]:
import geocoder # import geocoder

df_PScodes = pd.read_csv("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv")
df_PScodes = df_PScodes.set_index('Postal Code')
df_PScodes

Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476
...,...,...
M9N,43.706876,-79.518188
M9P,43.696319,-79.532242
M9R,43.688905,-79.554724
M9V,43.739416,-79.588437


Now if we make the indexes the same for the two dataframes we will have the new columns automatically take on the correct values for the postal codes.

In [8]:
df_scrape = df_scrape.set_index('PostalCode')
df_scrape['Latitude'], df_scrape['Longitude'] = df_PScodes['Latitude'], df_PScodes['Longitude']
df_scrape

Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M3A,North York,Parkwoods,43.753259,-79.329656
M4A,North York,Victoria Village,43.725882,-79.315572
M5A,Downtown Toronto,"Regent Park,Harbourfront",43.654260,-79.360636
M6A,North York,"Lawrence Manor,Lawrence Heights",43.718518,-79.464763
M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
...,...,...,...,...
M5X,Downtown Toronto,"First Canadian Place,Underground city",43.648429,-79.382280
M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North",43.653654,-79.506944
M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
M8Y,Etobicoke,"Old Mill South,King's Mill Park,Sunnylea,Humbe...",43.636258,-79.498509


In [9]:
df_scrape.reset_index(inplace=True)

In [10]:
df_scrape

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park,Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor,Lawrence Heights",43.718518,-79.464763
4,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
...,...,...,...,...,...
94,M5X,Downtown Toronto,"First Canadian Place,Underground city",43.648429,-79.382280
95,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North",43.653654,-79.506944
96,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
97,M8Y,Etobicoke,"Old Mill South,King's Mill Park,Sunnylea,Humbe...",43.636258,-79.498509


## Now it's time to group the neighbourhoods according to their venues (shops and parks and so on)

First we will get a general feel for Toronto by plotting all the areas on a Toronto map

In [44]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

In [12]:
address = 'Toronto, Canada'

latitude = df_scrape['Latitude'].mean()

In [13]:
longitude = df_scrape['Longitude'].mean()
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.70672879191919, -79.39601043535355.


In [14]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11.4)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_scrape['Latitude'], df_scrape['Longitude'], df_scrape['Borough'], df_scrape['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

![image](Areas.png)

## Now we need to get the venues close to the areas

The following code will take the coordinates, and call Foursquare to see what venues in a 1300m radius.

In [16]:
def getNearbyVenues(names, latitudes, longitudes, radius=1300):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

## Now we will fill a new dataframe with the venue data

In [None]:
toronto_venues = getNearbyVenues(names=df_scrape['PostalCode'],
                                   latitudes=df_scrape['Latitude'],
                                   longitudes=df_scrape['Longitude']
                                  )

In [18]:
toronto_venues.shape

(2739, 7)

In [19]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M3A,43.753259,-79.329656,Allwyn's Bakery,43.75984,-79.324719,Caribbean Restaurant
1,M3A,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
2,M3A,43.753259,-79.329656,Tim Hortons,43.760668,-79.326368,Café
3,M3A,43.753259,-79.329656,Donalda Golf & Country Club,43.752816,-79.342741,Golf Course
4,M3A,43.753259,-79.329656,Bruno's valu-mart,43.746143,-79.32463,Grocery Store


In [20]:
len(toronto_venues['Venue'].unique())

1794

## Now let's map the venues on the Toronto map to make sure we have a good spread

In [21]:
map_toronto_venues = folium.Map(location=[latitude, longitude], zoom_start=11.4)

# add markers to map
for lat, lng, venue, neighborhood in zip(toronto_venues['Venue Latitude'], toronto_venues['Venue Longitude'], toronto_venues['Venue'], toronto_venues['Neighborhood']):
    label = '{}, {}'.format(neighborhood, venue)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto_venues)  
    
map_toronto_venues

![image](Venues.png)

## Now let's count all of the venues by neighbourhood

In [22]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M1B,29,29,29,29,29,29
M1C,8,8,8,8,8,8
M1E,28,28,28,28,28,28
M1G,20,20,20,20,20,20
M1H,30,30,30,30,30,30
...,...,...,...,...,...,...
M9N,30,30,30,30,30,30
M9P,30,30,30,30,30,30
M9R,30,30,30,30,30,30
M9V,20,20,20,20,20,20


In [23]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 275 uniques categories.


## We need to take all the types of venues and turn them into dummy variables

In [24]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Zoo Exhibit,Afghan Restaurant,African Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Antique Shop,Aquarium,Arcade,...,Transportation Service,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Zoo Exhibit,Afghan Restaurant,African Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Antique Shop,Aquarium,...,Transportation Service,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.068966,0.0,0.034483,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.034483,0.00,0.0,0.0,0.000000,0.034483,0.000000
1,M1C,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.00,0.0,0.0,0.000000,0.000000,0.000000
2,M1E,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.00,0.0,0.0,0.000000,0.000000,0.000000
3,M1G,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.05,0.0,0.0,0.000000,0.000000,0.000000
4,M1H,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.00,0.0,0.0,0.033333,0.000000,0.033333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,M9N,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.00,0.0,0.0,0.000000,0.000000,0.000000
95,M9P,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.00,0.0,0.0,0.000000,0.000000,0.000000
96,M9R,0.000000,0.0,0.000000,0.0,0.0,0.033333,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.00,0.0,0.0,0.000000,0.000000,0.000000
97,M9V,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.00,0.0,0.0,0.000000,0.000000,0.000000


In [26]:
toronto_grouped.shape

(99, 275)

In [27]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----M1B----
                  venue  freq
0           Zoo Exhibit  0.07
1  Fast Food Restaurant  0.07
2            Restaurant  0.07
3                 Trail  0.07
4                  Park  0.03


----M1C----
                  venue  freq
0        Breakfast Spot  0.12
1                 Hotel  0.12
2            Playground  0.12
3          Burger Joint  0.12
4  Gym / Fitness Center  0.12


----M1E----
                  venue  freq
0           Pizza Place  0.11
1  Fast Food Restaurant  0.07
2                  Park  0.07
3                  Bank  0.07
4       Automotive Shop  0.04


----M1G----
                  venue  freq
0           Pizza Place  0.15
1           Coffee Shop  0.10
2  Fast Food Restaurant  0.10
3                  Park  0.10
4           Music Store  0.05


----M1H----
               venue  freq
0  Indian Restaurant  0.10
1               Bank  0.07
2     Sandwich Place  0.07
3         Restaurant  0.07
4        Gas Station  0.07


----M1J----
               venue  freq
0     San

In [35]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [36]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Zoo Exhibit,Trail,Restaurant,Fast Food Restaurant,Supermarket,Spa,Bus Station,Caribbean Restaurant,Chinese Restaurant,Pizza Place
1,M1C,Park,Playground,Breakfast Spot,Gym / Fitness Center,Italian Restaurant,Hotel,Burger Joint,Dumpling Restaurant,Dog Run,Doner Restaurant
2,M1E,Pizza Place,Park,Bank,Fast Food Restaurant,Liquor Store,Food & Drink Shop,Supermarket,Sports Bar,Laundromat,Beer Store
3,M1G,Pizza Place,Coffee Shop,Park,Fast Food Restaurant,Music Store,Discount Store,Sandwich Place,Supermarket,Department Store,Indian Restaurant
4,M1H,Indian Restaurant,Sandwich Place,Restaurant,Bank,Coffee Shop,Gas Station,Yoga Studio,Thai Restaurant,Fish & Chips Shop,Music Store


## Now let's group the neighborhoods according to the similar venues in that group

After testing several numbers of clusters it was found that 10 clusters gave a good distribution

In [37]:
# set number of clusters
kclusters = 10

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 9, 0, 6, 0, 0, 6, 4, 0, 8], dtype=int32)

In [38]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_scrape

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='PostalCode')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,8,Park,Skating Rink,Pharmacy,Coffee Shop,Bus Stop,Chinese Restaurant,Laundry Service,Bank,Supermarket,Food & Drink Shop
1,M4A,North York,Victoria Village,43.725882,-79.315572,8,Coffee Shop,Park,Grocery Store,Optical Shop,Shoe Store,Sandwich Place,Portuguese Restaurant,Gym,Pharmacy,Cosmetics Shop
2,M5A,Downtown Toronto,"Regent Park,Harbourfront",43.65426,-79.360636,4,Coffee Shop,Park,Bakery,Breakfast Spot,Historic Site,Thai Restaurant,Mediterranean Restaurant,Performing Arts Venue,Farmers Market,Spa
3,M6A,North York,"Lawrence Manor,Lawrence Heights",43.718518,-79.464763,0,Restaurant,Furniture / Home Store,Clothing Store,Fried Chicken Joint,Department Store,Electronics Store,Men's Store,Mediterranean Restaurant,Bowling Alley,Boutique
4,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242,8,Pharmacy,Bank,Grocery Store,Park,Ice Cream Shop,Spa,Bakery,Camera Store,Café,Japanese Restaurant


In [45]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11.4)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

![image](Final_grouping.png)