# Segmenting and Clustering Neighborhoods in the city of Toronto, Canada

## 1. Scraping & parsing wikipedia with request + bs4

Postal codes wiki page to be scraped = https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [1]:
import urllib.request

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the page. The context manager closes the HTTP response even if
# reading or decoding fails, instead of leaving the connection open.
with urllib.request.urlopen(url) as req:
    # Decode with the charset the server advertises, falling back to UTF-8.
    article = req.read().decode(req.headers.get_content_charset() or 'utf-8')

# Cache the HTML on disk. The encoding is explicit so that non-ASCII
# characters in the page do not depend on the platform default encoding.
with open('List_of_postal_codes_of_Canada:_M.html', 'w', encoding='utf-8') as fo:
    fo.write(article)

In [2]:
from bs4 import BeautifulSoup

# Load the cached article; the `with` block closes the file handle
# (the bare open(...).read() left it to the garbage collector).
with open('List_of_postal_codes_of_Canada:_M.html', encoding='utf-8') as fi:
    article = fi.read()

# Turn the HTML into soup and collect every <table> with class "sortable".
soup = BeautifulSoup(article, 'html.parser')
tables = soup.find_all('table', class_='sortable')


In [3]:

# Search through the tables for the one with the headings we want.
# After the loop, `table` is the matching <table>.
for table in tables:
    headings = [th.text.strip() for th in table.find_all('th')]
    if headings[:3] == ['Postcode', 'Borough', 'Neighborhood']:
        break
else:
    # Without this branch a page-layout change would go unnoticed: `table`
    # would silently be the last table on the page (or undefined if there
    # were none) and the following cells would scrape the wrong data.
    raise ValueError(
        "No sortable table with headings 'Postcode', 'Borough', 'Neighborhood' was found")



In [4]:
# Extract the columns we want and write to a comma separated text file.
# NOTE: fields are joined with ', ' and are not quoted, so every value after
# the first is written with a leading space, and a field that itself contains
# a comma would add extra columns to the line.
with open('List_of_postal_codes_of_Canada:_M.txt', 'w') as fo:
    for tr in table.find_all('tr'):
        tds = tr.find_all('td')
        # Skip the header row (no <td>) and any row too short to unpack.
        if len(tds) < 3:
            continue
        # Take exactly the first three cells. The previous slice tds[:4]
        # raised "too many values to unpack" for rows with four or more cells.
        Postcode, Borough, Neighborhood = [td.text.strip() for td in tds[:3]]
        print(', '.join([Postcode, Borough, Neighborhood]), file=fo)


In [5]:
import pandas as pd

# Read the scraped rows back in; the text file has no header line.
data = pd.read_csv('List_of_postal_codes_of_Canada:_M.txt', sep=',', header=None)
data.columns = ['Postcode', 'Borough', 'Neighborhood']

# Drop postcodes whose borough is "Not assigned". The .copy() makes the
# filtered frame independent so the assignment below writes to it directly
# rather than to a temporary view.
data = data[~data.Borough.str.contains("assigned", na=False)].copy()

# A neighborhood that is "Not assigned" takes the name of its borough.
# An explicit boolean mask with .loc replaces the earlier in-place
# Series.replace on a column pulled out of the filtered frame.
unassigned = data['Neighborhood'].str.contains('Not assigned', na=False)
data.loc[unassigned, 'Neighborhood'] = data.loc[unassigned, 'Borough']

# One row per postcode: keep the first borough seen for the postcode and
# join all of its neighborhoods into a single comma separated string.
combine = data.groupby('Postcode', sort=False).agg(
    {'Borough': 'first', 'Neighborhood': ','.join})
Postcode_df = combine
Postcode_df.head()

Unnamed: 0_level_0,Borough,Neighborhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Harbourfront, Regent Park"
M6A,North York,"Lawrence Heights, Lawrence Manor"
M7A,Queen's Park,Queen's Park


In [6]:
# Sanity check: (rows, columns) of the frame grouped by postcode.
Postcode_df.shape

(103, 2)

## 2. Latitude and the longitude coordinates of each neighborhood.
The Geocoder API proved unreliable, and the only dependable alternatives were paid APIs.
It is more efficient to merge in the geospatial coordinates CSV: http://cocl.us/Geospatial_data

In [27]:
# The code was removed by Watson Studio for sharing.

In [8]:
# Wrap the loaded coordinate data in a DataFrame and rename its
# 'Postal Code' column to 'Postcode' in a single chained expression.
Geodata = pd.DataFrame(df_data_1).rename(columns={'Postal Code': 'Postcode'})
Geodata.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


##### Left merge scraped wiki Postcode dataframe with coordinate data on key Postcode. 
<i>NOTE: The Geodata "Postal Code" column was renamed to "Postcode" to match the key in the Postcode dataframe (see cell above).</i>

In [9]:
# Move 'Postcode' from the index into a regular column so it can be used as
# the merge key. The guard makes the cell safe to re-run: an unconditional
# reset_index() on an already-reset frame would add a stray 'index' column.
if 'Postcode' not in Postcode_df.columns:
    Postcode_df = Postcode_df.reset_index()

# Left merge: keep every scraped postcode and attach its coordinates.
Postal_Geo_df = Postcode_df.merge(Geodata, how='left', on='Postcode')
Postal_Geo_df.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


## 3. Explore and cluster the neighborhoods in Toronto based on most prevalent venues/attractions with K-Means algorithm.
#### Venue data is provided through foursquare api.

Import ALL relevant modules

In [11]:
import numpy as np # library to handle data in a vectorized manner
import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  18.93 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  27.55 MB/s
vincent-0.4.4- 100% |################################| Time: 0:00:00  30.82 MB/s
folium-0.5.0-p 100% |################################| Time: 0:00:00  35.67 MB/s
Libraries imported.


#### Map New Postal-Geo Dataframe (created by merging wiki data and coordinates) with folium library to view neighborhoods as per postal code. 

In [12]:
# Preview the merged postcode/coordinate frame before plotting it.
Postal_Geo_df.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [13]:

# Centre of the map; these two names are reused when the cluster map is drawn.
latitude, longitude = 43.679165, -79.374336
Toronto_Hoods = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per row of the postcode/coordinate frame.
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighborhood']
for lat, lng, borough, neighborhood in zip(*(Postal_Geo_df[col] for col in marker_columns)):
    # Popup text lists the neighborhood(s) and borough for the postcode.
    popup_text = 'NEIGHBORHOOD(S): {}, BOROUGH: {}'.format(neighborhood, borough)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=True)
    marker.add_to(Toronto_Hoods)

Toronto_Hoods


#### Create function that when called...  
<ul><li>accesses Foursquare API</li>
    <li>returns top 100 venues for each request</li>
    <li>creates new dataframe by appending venues to list according to coordinates in a lat/long list </li></ul>

In [26]:
# The code was removed by Watson Studio for sharing.

In [15]:
LIMIT = 100


def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint once per location and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label, latitude and longitude of each location to explore. They are
        consumed in parallel with zip().
    radius : int, default 500
        Value sent as the request's ``radius`` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but has these columns) when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an error status.

    Notes
    -----
    Reads CLIENT_ID, CLIENT_SECRET and VERSION from the notebook's global
    namespace, and prints each name as it is processed.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string rather than
        # formatting credentials and coordinates into the URL by hand.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,  # without a timeout a stalled connection hangs the cell forever
        )
        # Fail with a clear HTTP error instead of a KeyError on the payload.
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups', [])
        results = groups[0]['items'] if groups else []

        # Keep only the relevant information for each nearby venue.
        for item in results:
            venue = item['venue']
            # A venue can come back without categories; record None for it
            # rather than raising IndexError on categories[0].
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # Passing the column names to the constructor keeps the schema valid even
    # when `rows` is empty (assigning .columns to an empty frame raises).
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return nearby_venues



Call the function above with the neighborhood names and their coordinates.
The output lists the name of each neighborhood as it is processed.

In [16]:
# Collect venues around every postcode centroid, using the function's
# default radius. Arguments are passed positionally: names, latitudes, longitudes.
Toronto_venues = getNearbyVenues(
    Postal_Geo_df['Neighborhood'],
    Postal_Geo_df['Latitude'],
    Postal_Geo_df['Longitude']
)

 Parkwoods
 Victoria Village
 Harbourfront, Regent Park
 Lawrence Heights, Lawrence Manor
  Queen's Park
 Islington Avenue
 Rouge, Malvern
 Don Mills North
 Woodbine Gardens, Parkview Hill
 Ryerson, Garden District
 Glencairn
 Cloverdale, Islington, Martin Grove, Princess Gardens, West Deane Park
 Highland Creek, Rouge Hill, Port Union
 Flemingdon Park, Don Mills South
 Woodbine Heights
 St. James Town
 Humewood-Cedarvale
 Bloordale Gardens, Eringate, Markland Wood, Old Burnhamthorpe
 Guildwood, Morningside, West Hill
 The Beaches
 Berczy Park
 Caledonia-Fairbanks
 Woburn
 Leaside
 Central Bay Street
 Christie
 Cedarbrae
 Hillcrest Village
 Bathurst Manor, Downsview North, Wilson Heights
 Thorncliffe Park
 Adelaide, King, Richmond
 Dovercourt Village, Dufferin
 Scarborough Village
 Fairview, Henry Farm, Oriole
 Northwood Park, York University
 East Toronto
 Harbourfront East, Toronto Islands, Union Station
 Little Portugal, Trinity
 East Birchmount Park, Ionview, Kennedy Park
 Bayview 

In [17]:
# Preview the first five venue rows returned by getNearbyVenues.
Toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,KFC,43.754387,-79.333021,Fast Food Restaurant
2,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop


#### Create dataframe with one-hot encoding.
<i>One-hot encoding converts a categorical value into a one-dimensional numerical vector in which exactly one element equals 1 and all the others are 0. The 1 is called "hot" and the 0s are "cold".
Here, each row of the Toronto venues dataframe is converted so that its venue category is marked with a 1, alongside the name of the neighborhood(s) the venue is found within.</i>

In [18]:
# One-hot encode the venue categories: one indicator column per category.
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Put the neighborhood label in front as 'Neighborhoods'.
# It is inserted directly under that name rather than first being assigned to
# a 'Neighborhood' column: that assignment would overwrite the indicator
# column of any venue category that is itself called "Neighborhood", and it
# left a string column inside an otherwise numeric frame.
Toronto_onehot.insert(0, 'Neighborhoods', Toronto_venues['Neighborhood'])
Toronto_onehot.head()

Unnamed: 0,Neighborhoods,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<ul><li>Create another dataframe that groups the venues by neighborhood, taking the mean of each category, and print the 5 most prevalent venue categories for each neighborhood.</li>
    <li>From the grouped dataframe, define and apply a function that builds a new dataframe showing the top 10 venue categories per neighborhood according to their frequency.</li></ul>

In [19]:
# Mean of each one-hot column per neighborhood, i.e. the share of that
# neighborhood's venues falling in each category.
# numeric_only=True keeps the aggregation to the indicator columns; without
# it, any string column left in the one-hot frame makes mean() raise on
# recent pandas versions.
Toronto_grouped = Toronto_onehot.groupby('Neighborhoods').mean(numeric_only=True).reset_index()
num_top_venues = 5

for hood in Toronto_grouped['Neighborhoods']:
    print("----"+hood+"----")
    # Transpose the neighborhood's single row into (venue, freq) pairs.
    temp = Toronto_grouped[Toronto_grouped['Neighborhoods'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    # The first transposed row is the 'Neighborhoods' label itself, not a category.
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----  Queen's Park----
                 venue  freq
0          Coffee Shop  0.25
1                  Gym  0.05
2         Burger Joint  0.05
3  Japanese Restaurant  0.05
4                Diner  0.05


---- Adelaide, King, Richmond----
             venue  freq
0      Coffee Shop  0.06
1             Café  0.04
2  Thai Restaurant  0.04
3       Steakhouse  0.04
4           Bakery  0.03


---- Agincourt----
                 venue  freq
0       Clothing Store  0.25
1         Skating Rink  0.25
2       Breakfast Spot  0.25
3               Lounge  0.25
4  Monument / Landmark  0.00


---- Agincourt North, L'Amoreaux East, Milliken, Steeles East----
                             venue  freq
0                       Playground   0.5
1                             Park   0.5
2               Mexican Restaurant   0.0
3              Monument / Landmark   0.0
4  Molecular Gastronomy Restaurant   0.0


---- Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thi

In [20]:
# Show only the first rows; the full frame has one row per neighborhood and
# one column per venue category, which is too large to display usefully.
Toronto_grouped.head(10)

Unnamed: 0,Neighborhoods,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Queen's Park,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.025000,0.000000,0.025000
1,"Adelaide, King, Richmond",0.01,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.010000,0.000000,0.000000,0.000000,0.000000,0.010000,0.000000,0.010000,0.000000
2,Agincourt,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,"Agincourt North, L'Amoreaux East, Milliken, S...",0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,"Albion Gardens, Beaumond Heights, Humbergate,...",0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,"Alderwood, Long Branch",0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,"Bathurst Manor, Downsview North, Wilson Heights",0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.055556,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,Bayview Village,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,"Bedford Park, Lawrence Manor East",0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,Berczy Park,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.017241,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [21]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first element of ``row`` is skipped (callers pass rows whose first
    entry is the neighborhood name). The remaining entries are sorted by
    value in descending order and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[:num_top_venues]

In [22]:
num_top_venues = 10

# Ordinal suffixes for ranks 1-3; every later rank uses 'th'.
indicators = ['st', 'nd', 'rd']

# Create the column names according to the number of top venues.
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # Choose the suffix with an explicit bounds check. The earlier bare
    # `except:` used an IndexError as control flow and would also have
    # hidden any unrelated error raised while building the name.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# Create a new dataframe with one row per neighborhood.
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Toronto_grouped['Neighborhoods']

# Fill each row with that neighborhood's most frequent venue categories.
for ind in np.arange(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Queen's Park,Coffee Shop,Burger Joint,Japanese Restaurant,Diner,Gym,College Auditorium,Park,Smoothie Shop,Seafood Restaurant,Sandwich Place
1,"Adelaide, King, Richmond",Coffee Shop,Thai Restaurant,Steakhouse,Café,Sushi Restaurant,American Restaurant,Bakery,Bar,Hotel,Gym
2,Agincourt,Lounge,Breakfast Spot,Clothing Store,Skating Rink,Yoga Studio,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore
3,"Agincourt North, L'Amoreaux East, Milliken, S...",Playground,Park,Yoga Studio,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore
4,"Albion Gardens, Beaumond Heights, Humbergate,...",Grocery Store,Pharmacy,Fried Chicken Joint,Pizza Place,Coffee Shop,Sandwich Place,Beer Store,Fast Food Restaurant,Golf Course,Gluten-free Restaurant


<ul><li>Cluster the neighborhoods with the K-Means machine learning algorithm.</li>
<li>Create a new dataset with cluster labels.</li>
<li>Merge the new cluster dataframe with the Postal/neighborhood coordinate dataframe.</li></ul>
NOTE: Pay attention to the cluster labels and their dtype so that the clusters are color-coded correctly: every row should have a label (no NaN), and the labels should be integers, not floats.

In [23]:
# set number of clusters
kclusters = 5

# Feature matrix for clustering: the per-neighborhood category means without
# the label column. `columns=` is used because passing the axis positionally,
# as in drop('Neighborhoods', 1), is no longer accepted by recent pandas.
Toronto_grouped_clustering = Toronto_grouped.drop(columns='Neighborhoods')

# run k-means clustering (fixed random_state so the labels are reproducible)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

In [24]:
# Add the clustering labels as the first column of the top-venues table.
# The guard keeps the cell re-runnable: DataFrame.insert raises if the
# column already exists, so on a re-run the labels are overwritten instead.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Attach the cluster label and top venues to each postcode row, matching on
# the neighborhood name. Built as one chain so Postal_Geo_df is left untouched.
Toronto_merged = (
    Postal_Geo_df
    .join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
    # Rows with no matching neighborhood in the venues table have no label; drop them.
    .dropna(subset=['Cluster Labels'])
    # The join introduces NaN, which turns the labels into floats; once the
    # NaN rows are gone they can be cast back to integers.
    .astype({'Cluster Labels': 'int32'})
)

# check the last columns!
Toronto_merged.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,0,Fast Food Restaurant,Park,Food & Drink Shop,Eastern European Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
1,M4A,North York,Victoria Village,43.725882,-79.315572,2,Pizza Place,Coffee Shop,Hockey Arena,Portuguese Restaurant,Intersection,Yoga Studio,Donut Shop,Dim Sum Restaurant,Diner,Discount Store
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,2,Coffee Shop,Bakery,Café,Pub,Park,Mexican Restaurant,Breakfast Spot,Theater,Italian Restaurant,Hotel
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763,2,Clothing Store,Accessories Store,Event Space,Shoe Store,Miscellaneous Shop,Furniture / Home Store,Arts & Crafts Store,Boutique,Vietnamese Restaurant,Coffee Shop
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494,2,Coffee Shop,Burger Joint,Japanese Restaurant,Diner,Gym,College Auditorium,Park,Smoothie Shop,Seafood Restaurant,Sandwich Place


# Create map of clustered data with folium
Points are color-coded by cluster

In [25]:
# create map (re-uses the map centre defined for the first map)
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One colour per cluster, spread evenly over the rainbow colormap.
# (The earlier `x` / `ys` arrays were only used for their length, which is kclusters.)
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(Toronto_merged['Latitude'], Toronto_merged['Longitude'], Toronto_merged['Neighborhood'], Toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels run from 0 to kclusters-1, so they index the palette
    # directly. The previous `cluster - 1` sent cluster 0 to the last colour
    # through negative indexing.
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters