# Segmenting and Clustering Neighborhoods in Toronto (Part 1, 2 and 3)

# Part 1

## Loading libraries

In [1]:
import requests
from bs4 import BeautifulSoup

!conda install -c conda-forge geopy --yes
!conda install -c conda-forge folium=0.5.0 --yes

from geopy.geocoders import Nominatim
import folium
import numpy as np
import pandas as pd

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.9.11          |           py36_0         147 KB  conda-forge
    ca-certificates-2019.9.11  |       hecc5488_0         144 KB  conda-forge
    openssl-1.1.1c             |       h516909a_0         2.1 MB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1.20.0-py_0       conda-forge

The following packages will be UPDATED:

    ca-

## Get data from Wikipedia

In [11]:
# Download the Wikipedia list of Canadian postal codes beginning with "M" and
# parse the HTML into `soup`, which the table-extraction cell reads.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Without a timeout, requests waits indefinitely on a stalled connection.
response = requests.get(WIKI_URL, timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

## Create dataframe

In [12]:
# Build the raw postcode DataFrame from the first <table> in the parsed page.
table_post = soup.find('table')
if table_post is None:
    # soup.find() returns None when nothing matches; fail with a clear message.
    raise ValueError('No <table> element found in the downloaded page')

# Walk the table row by row rather than slicing one flat list of <td> cells
# in steps of three: a row with a different number of cells would otherwise
# shift every following value into the wrong column, or raise IndexError at
# the end of the list. Rows without exactly three <td> cells are skipped.
records = []
for row in table_post.find_all('tr'):
    cells = [cell.text.strip() for cell in row.find_all('td')]
    if len(cells) == 3:
        records.append(cells)

df = pd.DataFrame(records, columns=['Postcode', 'Borough', 'Neighbourhood'])
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Remove rows whose Borough is 'Not assigned'

In [13]:
# Keep only rows that have a real borough: drop rows whose Borough is missing
# or equal to the placeholder 'Not assigned'.
# The filter yields a new frame instead of calling replace(..., inplace=True)
# on the column selection df['Borough']; that chained in-place form is
# deprecated and does not modify `df` under pandas Copy-on-Write.
# The original row index is preserved (not reset).
has_borough = df['Borough'].notna() & df['Borough'].ne('Not assigned')
df = df.loc[has_borough].copy()
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


## Combine neighbourhoods that share a postcode into one comma-separated value

In [14]:
# Collapse the frame to one row per (Postcode, Borough) pair; the names of all
# neighbourhoods in a pair are concatenated into a single ', '-separated string.
neighbourhoods_by_postcode = df.groupby(['Postcode', 'Borough'])['Neighbourhood']
df = neighbourhoods_by_postcode.apply(lambda names: ', '.join(names)).reset_index()
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Replace 'Not assigned' neighbourhoods with "Queen's Park"

In [15]:
# Give the placeholder neighbourhood a real name.
# The result is assigned back to the column explicitly: calling
# replace(..., inplace=True) on the selection df['Neighbourhood'] is chained
# in-place assignment, which is deprecated and leaves `df` unchanged under
# pandas Copy-on-Write.
# Only cells whose entire value is 'Not assigned' are replaced.
df['Neighbourhood'] = df['Neighbourhood'].replace('Not assigned', "Queen's Park")
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Print the number of rows of the dataframe

In [16]:
# Display the dimensions of the cleaned frame as a (rows, columns) tuple.
df.shape

(103, 3)

# Part 2

## Get the latitude and longitude coordinates of each postcode

In [17]:
# Read the postcode coordinate table and relabel its columns so the postcode
# column carries the same name ('Postcode') as in `df`.
geo_source = 'http://cocl.us/Geospatial_data'
geo_columns = ['Postcode', 'Latitude', 'Longitude']

df_geo = pd.read_csv(geo_source)
df_geo.columns = geo_columns
df_geo.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [18]:
# Attach Latitude/Longitude to each postcode row. The inner join keeps only
# postcodes that appear in both frames.
df_final = df.merge(df_geo, how='inner', on='Postcode')
df_final.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Part 3

## Map Toronto only 

In [20]:
# Geocode Toronto to get the map centre, then draw one green circle marker per
# row of df_final with a "<borough>, <neighbourhood>" popup.
address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent='julen')
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on `location.latitude`.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude

toronto_map = folium.Map(location=[latitude, longitude], zoom_start=11)

marker_rows = zip(df_final.Latitude, df_final.Longitude, df_final.Borough, df_final.Neighbourhood)
for lat, lng, borough, neighbor in marker_rows:
    label = folium.Popup('{}, {}'.format(borough, neighbor), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='#32a852',
        fill_opacity=0.6,
        parse_html=False
    ).add_to(toronto_map)

# Last expression of the cell: renders the map inline.
toronto_map

## Clustering

In [21]:
from sklearn.cluster import KMeans

# Cluster the postcodes by location only (Latitude/Longitude) into `kclusters`
# groups. random_state is fixed so repeated runs give the same labels.
kclusters = 3
df_cluster = df_final[['Latitude', 'Longitude']]

kmeans = KMeans(n_clusters=kclusters, init="k-means++", n_init=5, random_state=0)
kmeans.fit(df_cluster)

# Store each row's cluster id next to its postcode data.
df_final['Labels'] = kmeans.labels_
df_final.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Labels
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,2
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,2
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,2
3,M1G,Scarborough,Woburn,43.770992,-79.216917,2
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,2


## Clustered map

In [22]:
#create map
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=11)

#set color scheme for the clusters: one evenly spaced rainbow colour per cluster
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors
color_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in color_array]

#add one marker per postcode, coloured by its cluster label
for lat, lng, borough, cluster in zip(df_final.Latitude, df_final.Longitude, df_final.Borough, df_final.Labels):
    # KMeans labels run from 0 to kclusters-1, so they index the palette
    # directly. The previous `cluster-1` mapped label 0 to rainbow[-1] and only
    # worked because negative list indices wrap around.
    cluster_color = rainbow[cluster]
    label = folium.Popup(borough + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.6,
        parse_html=False
    ).add_to(toronto_map)

# Last expression of the cell: renders the map inline.
toronto_map