# Segmenting and Clustering Neighborhoods in Toronto

1. Importing required libraries

In [137]:
import pandas as pd

2. Fetching data from wiki page and converting it to Pandas dataframe 

In [138]:
# Wikipedia page listing the Canadian postal codes that start with "M".
link = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# read_html returns a list of the tables parsed from the page; the first one
# is taken as the postal code table, with its first row used as the header.
post_codes = pd.read_html(io=link, header=0)[0]
post_codes

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
283,M8Z,Etobicoke,Mimico NW
284,M8Z,Etobicoke,The Queensway West
285,M8Z,Etobicoke,Royal York South West
286,M8Z,Etobicoke,South of Bloor


3. If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [139]:
def fix_neighbourhood(record):
    """Fill a missing neighbourhood with the borough name of the same row.

    Parameters
    ----------
    record : pandas.Series
        One row exposing ``Borough`` and ``Neighbourhood`` entries.

    Returns
    -------
    pandas.Series
        The same ``record`` object. Its ``Neighbourhood`` is overwritten with
        ``Borough`` only when the borough is assigned and the neighbourhood
        is the placeholder ``'Not assigned'``.
    """
    # Nothing to fix when the borough itself is unknown or the
    # neighbourhood already has a real value.
    if record.Borough == 'Not assigned' or record.Neighbourhood != 'Not assigned':
        return record

    record.Neighbourhood = record.Borough
    return record

# DataFrame.apply returns a new frame built from the rows returned by the
# function, so the result has to be stored; otherwise the fix only survives
# if the row mutation happens to write through to the original frame.
post_codes = post_codes.apply(fix_neighbourhood, axis=1)
post_codes

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
283,M8Z,Etobicoke,Mimico NW
284,M8Z,Etobicoke,The Queensway West
285,M8Z,Etobicoke,Royal York South West
286,M8Z,Etobicoke,South of Bloor


4. Removing Boroughs where data is not available

In [140]:
# Keep only the rows whose borough is known.
post_codes = post_codes.loc[post_codes['Borough'].ne('Not assigned')]
post_codes

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
...,...,...,...
282,M8Z,Etobicoke,Kingsway Park South West
283,M8Z,Etobicoke,Mimico NW
284,M8Z,Etobicoke,The Queensway West
285,M8Z,Etobicoke,Royal York South West


5. Combining Neighbourhoods associated with same Post code

In [141]:
def combine_neighbourhood(df):
    """Collapse the neighbourhoods of one group into a comma-separated string.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single group; must contain a ``Neighbourhood`` column.

    Returns
    -------
    pandas.Series
        A one-element Series labelled ``'Neighbourhood'`` holding the joined
        names in their original row order.
    """
    names = list(df['Neighbourhood'])
    joined = ",".join(names)
    return pd.Series([joined], index=['Neighbourhood'])

# One row per (Postcode, Borough) pair with all of its neighbourhoods joined.
# Only the Neighbourhood column is handed to apply: combine_neighbourhood
# reads nothing else, and applying over the grouping columns themselves is
# deprecated in recent pandas releases.
post_codes = (
    post_codes
    .groupby(['Postcode', 'Borough'])[['Neighbourhood']]
    .apply(combine_neighbourhood)
)
post_codes

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighbourhood
Postcode,Borough,Unnamed: 2_level_1
M1B,Scarborough,"Rouge,Malvern"
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
M1E,Scarborough,"Guildwood,Morningside,West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
...,...,...
M9N,York,Weston
M9P,Etobicoke,Westmount
M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie..."
M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."


6. Reset the index

In [142]:
# Turn the (Postcode, Borough) index levels back into ordinary columns.
post_codes = post_codes.reset_index()
post_codes

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie..."
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."


7. Check the shape (rows, columns) of the resulting dataframe

In [143]:
# (number of rows, number of columns) of the grouped postal code table
post_codes.shape

(103, 3)

8. Loading location data (latitude and longitude per postal code) from a CSV file, because the geocoder is not working

In [144]:
# CSV with one row per postal code: 'Postal Code', 'Latitude', 'Longitude'.
location_data = pd.read_csv(filepath_or_buffer='https://cocl.us/Geospatial_data')
location_data

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


9. Merge data frames on postal codes

In [145]:
# Attach coordinates to every postal code, then drop the join key coming
# from location_data since it duplicates 'Postcode'.
merged_df = (
    post_codes
    .merge(location_data, how='inner', left_on='Postcode', right_on='Postal Code')
    .drop(columns='Postal Code')
)
merged_df

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",43.739416,-79.588437


10. Filter boroughs containing 'Toronto'

In [146]:
# Keep the rows whose borough name contains the text 'Toronto'.
toronto_postcodes = merged_df.loc[merged_df['Borough'].str.contains('Toronto')]

11. Get the location (latitude and longitude) of Toronto

In [147]:
from geopy.geocoders import Nominatim

# Look up the coordinates used to centre the maps below.
geolocator = Nominatim(user_agent="coursera_cap_proj")
location = geolocator.geocode('Toronto')

# geocode() returns None when the service finds no match; fail here with a
# clear message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError("Nominatim returned no result for 'Toronto'")

latitude, longitude = location.latitude, location.longitude

12. Get Toronto Map and add markers for all locations in toronto_postcodes

In [148]:
import folium

# Create a map of Toronto centred on the geocoded latitude and longitude.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Add one circle marker per postal code; the popup shows the borough name.
# parse_html is an option of Popup only, so it is no longer also passed to
# CircleMarker, and the loop variable is not rebound to the Popup object.
for lat, lng, borough in zip(toronto_postcodes['Latitude'],
                             toronto_postcodes['Longitude'],
                             toronto_postcodes['Borough']):
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(borough, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto

13. K Means clustering algorithm on filtered locations

In [149]:
from IPython.display import display

import numpy as np

# Take 3 random locations as the initial centroids.
# - Rows are drawn without replacement: two identical starting centroids
#   would leave one cluster empty and turn its mean into NaN.
# - A seeded generator makes the clustering reproducible on every run.
N_CLUSTERS = 3
centroid_rng = np.random.RandomState(42)
centroid_rows = centroid_rng.choice(toronto_postcodes.shape[0], size=N_CLUSTERS, replace=False)

# Explicit float copy: the centroids are overwritten in place on every
# iteration, so the array must be writable and independent of the frame.
centroids = (
    toronto_postcodes
    .iloc[centroid_rows][['Latitude', 'Longitude']]
    .to_numpy(dtype=float, copy=True)
)

# Cluster index of every row of toronto_postcodes, refilled on each iteration.
cluster_group = []

#Assigning each location to cluster based on min euclidean distance from centroid
def assign_to_centroids():
    """Label every location with the index of its nearest centroid.

    Reads the module-level ``toronto_postcodes`` (``Latitude`` and
    ``Longitude`` columns) and ``centroids`` (one ``[lat, lng]`` row per
    centroid), and refills the module-level list ``cluster_group`` in place
    with one integer per row of ``toronto_postcodes``. Works for any number
    of centroids; on a tie the lowest centroid index wins.
    """
    points = toronto_postcodes[['Latitude', 'Longitude']].to_numpy(dtype=float)
    centres = np.asarray(centroids, dtype=float)

    # Broadcasting gives an (n_points, n_centroids) matrix of Euclidean
    # distances in one step instead of one norm call per point and centroid.
    offsets = points[:, np.newaxis, :] - centres[np.newaxis, :, :]
    distances = np.linalg.norm(offsets, axis=2)

    # Mutate the existing list so other references to it stay valid.
    cluster_group.clear()
    cluster_group.extend(distances.argmin(axis=1).tolist())

#Move centroids as mean of locations in centroid's cluster
def move_centroids(colors=('blue', 'yellow', 'red')):
    """Move each centroid to the mean of its cluster and display a map.

    Reads the module-level ``toronto_postcodes``, ``cluster_group``,
    ``latitude`` and ``longitude``, overwrites the rows of the module-level
    ``centroids`` in place, and renders a folium map through ``display``.
    Cluster members are drawn as small circles and the centroid as a larger
    circle of the same colour.

    Parameters
    ----------
    colors : sequence of str, optional
        Marker colour per cluster index. Reused cyclically when there are
        more centroids than colours.
    """
    # create map of Toronto using latitude and longitude values
    map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)
    labels = np.asarray(cluster_group)

    for k in range(len(centroids)):
        color = colors[k % len(colors)]
        members = toronto_postcodes[labels == k]

        # The mean of an empty cluster is NaN, which cannot be used as a
        # marker location; leave such a centroid where it currently is.
        if len(members) > 0:
            centroids[k] = members[['Latitude', 'Longitude']].mean().values

        for lat, lng in zip(members['Latitude'], members['Longitude']):
            folium.CircleMarker(
                [lat, lng],
                radius=5,
                color=color).add_to(map_toronto)

        # The larger circle marks the (updated) centroid of this cluster.
        folium.CircleMarker(
            centroids[k],
            radius=10,
            color=color).add_to(map_toronto)

    display(map_toronto)

# Run a fixed number of k-means rounds. Each round relabels every location
# with its nearest centroid, then moves the centroids and displays a map.
N_ITERATIONS = 5
for i in range(N_ITERATIONS):
    assign_to_centroids()
    move_centroids()

