# Import data from Wikipedia

In [82]:
import pandas as pd
import numpy as np
# Parse every HTML table found on the Wikipedia page (row 0 of each table becomes its header)
# and stack all of them into one frame.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
dfs = pd.read_html(WIKI_URL, header=0)
df = pd.concat(dfs)

# Eliminate unnecessary columns and rows with a missing Borough

In [83]:
# Remove the columns at positions 3 through 49, then discard rows that have no Borough value.
extra_columns = df.columns[3:50]
df = df.drop(columns=extra_columns).dropna(subset=["Borough"])

# Eliminate records whose Borough is "Not assigned", sort by Postal Code & reset index

In [84]:
# Filter, sort and re-index in one chain. Each step returns a new frame, so nothing is
# modified in place on a slice of df (in-place sorting of such a slice is what makes
# pandas emit SettingWithCopyWarning).
df2 = (
    df[df.Borough != 'Not assigned']
    .sort_values(by=['Postal Code'])
    .reset_index(drop=True)
)
df2.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


# Check the shape of the cleaned dataframe

In [85]:
# df2 is already a pandas DataFrame, so .shape can be read directly; no conversion is needed.
df2.shape

(103, 3)

# Import Geospatial coordinates csv file

In [86]:
import os

# Folder that holds Geospatial_Coordinates.csv. Set the GEO_DATA_DIR environment variable to
# point somewhere else; the default is the folder this notebook originally read from.
GEO_DATA_DIR = os.environ.get("GEO_DATA_DIR", "/Users/Raul/Desktop")

# Read the file through its full path instead of calling os.chdir(), which would change the
# working directory of the whole kernel for every later cell.
geo_coordinates = pd.read_csv(os.path.join(GEO_DATA_DIR, 'Geospatial_Coordinates.csv'))
geo_coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


# Eliminate Postal Code column

In [87]:
# Re-order the coordinates so that row i belongs to the postal code in row i of df2, and only
# then discard the Postal Code column. The two frames are later combined by row index alone,
# so this lookup is what ties each neighborhood to its own coordinates.
# .loc raises KeyError if a postal code from df2 is missing from the coordinates file.
# The guard keeps the cell re-runnable once the Postal Code column has been removed.
if 'Postal Code' in geo_coordinates.columns:
    geo_coordinates = (
        geo_coordinates
        .set_index('Postal Code')
        .loc[df2['Postal Code'], ['Latitude', 'Longitude']]
        .reset_index(drop=True)
    )
geo_coordinates.head()

Unnamed: 0,Latitude,Longitude
0,43.806686,-79.194353
1,43.784535,-79.160497
2,43.763573,-79.188711
3,43.770992,-79.216917
4,43.773136,-79.239476


# Merge both DataFrames column-wise; rows are paired by index, so both must be in the same Postal Code order

In [88]:
# Put the coordinate columns to the right of the neighborhood columns; rows are paired by index label.
frames = (df2, geo_coordinates)
result = pd.concat(frames, axis='columns')
result.head(30)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


# Consider only those records where Borough contains the word "Toronto"

In [89]:
# Collect the Borough values that mention "Toronto" (in row order), then print one per line.
toronto_boroughs = [name for name in result.Borough if "Toronto" in name]
for name in toronto_boroughs:
    print(name)

East Toronto
East Toronto
East Toronto
East Toronto
Central Toronto
Central Toronto
Central Toronto
Central Toronto
Central Toronto
Central Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Central Toronto
Central Toronto
Central Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
West Toronto
West Toronto
West Toronto
West Toronto
West Toronto
West Toronto
Downtown Toronto
East Toronto


In [91]:
# Keep only the rows whose Borough contains the literal text "Toronto".
# A boolean mask covers every row of `result` whatever its length, instead of walking index
# labels 0..102 with a hard-coded row count and dropping rows one at a time.
# Indexing with the mask yields a new frame, so `result` itself is never modified.
is_toronto = result['Borough'].str.contains('Toronto', regex=False)
result2 = result[is_toronto].reset_index(drop=True)
result2.head(50)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049


In [92]:
# Report how many distinct boroughs and how many rows (one per postal code) remain.
n_boroughs = len(result2['Borough'].unique())
n_postal_codes = result2.shape[0]
summary = 'The dataframe has {} boroughs and {} postal codes.'.format(n_boroughs, n_postal_codes)
print(summary)

The dataframe has 4 boroughs and 39 postal codes.


# Use geopy library to get the latitude and longitude values of Toronto.

In [93]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

address = 'Toronto, ON'

# Look the address up through the Nominatim geocoding service (network call).
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


# Create a map of Toronto with neighborhoods superimposed on top.

In [94]:
import folium # map rendering library

# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per row of result2, with a "neighborhood, borough" popup
for lat, lng, borough, neighborhood in zip(result2['Latitude'], result2['Longitude'], result2['Borough'], result2['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    # parse_html is an option of folium.Popup only, so it is not passed to CircleMarker
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

# Next, let's keep quantitative variables for clustering

In [64]:
# Cluster on the geographic coordinates only. The columns are selected by name rather than by
# dropping the text columns with a positional axis argument (which pandas 2.x rejects), so
# re-running this cell after 'Cluster Labels' has been added still yields just these two columns.
toronto_clustering = result2[['Latitude', 'Longitude']]

# Assign clusters

In [65]:
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 5

# run k-means clustering
# n_init is pinned to 10 (the long-standing default before scikit-learn 1.4 switched to 'auto')
# so the number of restarts does not depend on the installed version; random_state fixes the seed.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_clustering)

# check the cluster labels generated for the first 38 rows of the dataframe
kmeans.labels_[0:38]

array([4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 2, 3, 3, 3, 1, 1, 1, 3, 0, 3, 3, 0, 0, 0, 1], dtype=int32)

In [66]:
# Attach each row's cluster label as the first column. A 'Cluster Labels' column left over from
# a previous run of this cell is removed first, because DataFrame.insert raises ValueError
# when the column already exists.
result2 = result2.drop(columns='Cluster Labels', errors='ignore')
result2.insert(0, 'Cluster Labels', kmeans.labels_)
result2

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,4,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,4,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,4,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,4,M4M,East Toronto,Studio District,43.659526,-79.340923
4,2,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,2,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,2,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678
7,2,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,2,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,2,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049


# Visualize the resulting clusters

In [68]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster, as hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(result2['Latitude'], result2['Longitude'], result2['Neighborhood'], result2['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster labels start at 0, so they index the palette directly
    # (cluster-1 would send label 0 to the last color through negative indexing)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

# Examine clusters

Cluster 1 (rows with cluster label 0)

In [73]:
# Rows assigned to cluster label 0, showing the columns from position 3 onwards.
in_cluster = result2['Cluster Labels'] == 0
shown_columns = result2.columns[[3, 4, *range(5, result2.shape[1])]]
result2.loc[in_cluster, shown_columns]

Unnamed: 0,Neighborhood,Latitude,Longitude
31,"Dufferin, Dovercourt Village",43.669005,-79.442259
34,"High Park, The Junction South",43.661608,-79.464763
35,"Parkdale, Roncesvalles",43.64896,-79.456325
36,"Runnymede, Swansea",43.651571,-79.48445


Cluster 2 (rows with cluster label 1)

In [78]:
# Rows assigned to cluster label 1, showing the columns from position 3 onwards.
in_cluster = result2['Cluster Labels'] == 1
shown_columns = result2.columns[[3, 4, *range(5, result2.shape[1])]]
result2.loc[in_cluster, shown_columns]

Unnamed: 0,Neighborhood,Latitude,Longitude
10,Rosedale,43.679563,-79.377529
11,"St. James Town, Cabbagetown",43.667967,-79.367675
12,Church and Wellesley,43.66586,-79.38316
13,"Regent Park, Harbourfront",43.65426,-79.360636
14,"Garden District, Ryerson",43.657162,-79.378937
15,St. James Town,43.651494,-79.375418
16,Berczy Park,43.644771,-79.373306
17,Central Bay Street,43.657952,-79.387383
18,"Richmond, Adelaide, King",43.650571,-79.384568
19,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752


Cluster 3 (rows with cluster label 2)

In [79]:
# Rows assigned to cluster label 2, showing the columns from position 3 onwards.
in_cluster = result2['Cluster Labels'] == 2
shown_columns = result2.columns[[3, 4, *range(5, result2.shape[1])]]
result2.loc[in_cluster, shown_columns]

Unnamed: 0,Neighborhood,Latitude,Longitude
4,Lawrence Park,43.72802,-79.38879
5,Davisville North,43.712751,-79.390197
6,"North Toronto West, Lawrence Park",43.715383,-79.405678
7,Davisville,43.704324,-79.38879
8,"Moore Park, Summerhill East",43.689574,-79.38316
9,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049
22,Roselawn,43.711695,-79.416936
23,"Forest Hill North & West, Forest Hill Road Park",43.696948,-79.411307


Cluster 4 (rows with cluster label 3)

In [80]:
# Rows assigned to cluster label 3, showing the columns from position 3 onwards.
in_cluster = result2['Cluster Labels'] == 3
shown_columns = result2.columns[[3, 4, *range(5, result2.shape[1])]]
result2.loc[in_cluster, shown_columns]

Unnamed: 0,Neighborhood,Latitude,Longitude
24,"The Annex, North Midtown, Yorkville",43.67271,-79.405678
25,"University of Toronto, Harbord",43.662696,-79.400049
26,"Kensington Market, Chinatown, Grange Park",43.653206,-79.400049
30,Christie,43.669542,-79.422564
32,"Little Portugal, Trinity",43.647927,-79.41975
33,"Brockton, Parkdale Village, Exhibition Place",43.636847,-79.428191


Cluster 5 (rows with cluster label 4)

In [81]:
# Rows assigned to cluster label 4, showing the columns from position 3 onwards.
in_cluster = result2['Cluster Labels'] == 4
shown_columns = result2.columns[[3, 4, *range(5, result2.shape[1])]]
result2.loc[in_cluster, shown_columns]

Unnamed: 0,Neighborhood,Latitude,Longitude
0,The Beaches,43.676357,-79.293031
1,"The Danforth West, Riverdale",43.679557,-79.352188
2,"India Bazaar, The Beaches West",43.668999,-79.315572
3,Studio District,43.659526,-79.340923
38,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
