## This notebook will be mainly used for the capstone project
### Created by Nelson Ibarra V.

In [21]:
import pandas as pd
import numpy as np

In [22]:
print('Hello Capstone Project Course!')

Hello Capstone Project Course!


#  Explore, segment, and cluster the neighborhoods in the city of Toronto

## 1. Scrape Data

In [23]:
#import libraries
import requests # retrieve data
from bs4 import BeautifulSoup # retrieve data
import folium # plotting library
import matplotlib.cm as cm # plotting library
import matplotlib.colors as colors # plotting library

In [24]:
# settings
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
custom_header = {'user-agent': 'customUserAgent'}
# making request
r = requests.get(url, headers=custom_header)
#preparing data
soup = BeautifulSoup(r.content)
tab = str(soup.table)
dataframes = pd.read_html(tab)
df = dataframes[0]
print('shape', df.shape)
df.head()

shape (180, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## 2. Data preprocessing and cleaning

In [25]:
# new df 
# Filtering 'Not assigned' in Borough Column
df1 = df[df.Borough != 'Not assigned']
print('shape df1', df1.shape)

# New df 
# Merging neighbourhoods with same Postal Code
df2 = df1.groupby(['Postal Code','Borough'], sort=False).agg(', '.join)
df2 = df2.reset_index()
# Replacing the name of the neighbourhoods which are 'Not assigned' with names of Borough
df2['Neighborhood'] = np.where(df2['Neighborhood'] == 'Not assigned', df2['Borough'], df2['Neighborhood'])
print('shape df2', df2.shape)
df2.head()

shape df1 (103, 3)
shape df2 (103, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [26]:
# new df
# df of latitudes and longitudes
df3 = pd.read_csv('https://cocl.us/Geospatial_data')
df3.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [27]:
# new df
# merging data frame: add latitude and longitude data
df4 = pd.merge(df2,df3,on='Postal Code')
print('shape df4', df4.shape)
df4.head()

shape df4 (103, 5)


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [28]:
# checking lat and long data from merged df
df4.describe()

Unnamed: 0,Latitude,Longitude
count,103.0,103.0
mean,43.704608,-79.397153
std,0.052463,0.097146
min,43.602414,-79.615819
25%,43.660567,-79.464763
50%,43.696948,-79.38879
75%,43.74532,-79.340923
max,43.836125,-79.160497


In [29]:
# filtering just toronto borough
df5 = df4[df4['Borough'].str.contains('Toronto',regex=False)]
print('shape df5', df5.shape)
df5.head()

shape df5 (39, 5)


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031


## 3. Checking values

In [30]:
print('N° of Borough: {}, N° of Neighborhood: {}'.format(len(df4.Borough.unique()), df4.shape[0]))
print(df4.Borough.unique())

print('#####################################################')

print('N° of Borough: {}, N° of Neighborhood: {}'.format(len(df5.Borough.unique()), df5.shape[0]))
print(df5.Borough.unique())

N° of Borough: 10, N° of Neighborhood: 103
['North York' 'Downtown Toronto' 'Etobicoke' 'Scarborough' 'East York'
 'York' 'East Toronto' 'West Toronto' 'Central Toronto' 'Mississauga']
#####################################################
N° of Borough: 4, N° of Neighborhood: 39
['Downtown Toronto' 'East Toronto' 'West Toronto' 'Central Toronto']


## Plotting results

### All toronto points

In [31]:
toronto_location = [43.651070,-79.347015]
map_toronto = folium.Map(location=toronto_location,zoom_start=10)

for lat,lng,borough,neighborhood in zip(df4['Latitude'],df4['Longitude'],df4['Borough'],df4['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
    [lat,lng],
    radius=5,
    popup=label,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False).add_to(map_toronto)
map_toronto

### Toronto Center points

In [32]:
map_toronto = folium.Map(location=toronto_location,zoom_start=10)

for lat,lng,borough,neighborhood in zip(df5['Latitude'],df5['Longitude'],df5['Borough'],df5['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
    [lat,lng],
    radius=5,
    popup=label,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False).add_to(map_toronto)
map_toronto

## 4. Cluster Neighborhoods

Run *k*-means to cluster the neighborhood into 5 clusters.

In [33]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

In [34]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = df4.drop(['Postal Code','Borough','Neighborhood'],1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

df4.insert(0, 'Cluster Labels', kmeans.labels_)

In [35]:
df4

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,4,M3A,North York,Parkwoods,43.753259,-79.329656
1,4,M4A,North York,Victoria Village,43.725882,-79.315572
2,2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,0,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,2,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...,...
98,1,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,2,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,4,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,1,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [36]:
# create map
map_clusters = folium.Map(location=toronto_location,zoom_start=10)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, neighborhood, cluster in zip(df4['Latitude'], df4['Longitude'], df4['Neighborhood'], df4['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters