<h1> *** Part 1 ***</h1>

<h1>1. Scrape Toronto neighborhood data from Wikipedia link</h1>

In [1]:
# Install BeautifulSoup (imported further down as `bs4`) from the `anaconda` channel
# into the active conda environment; `--yes` answers the confirmation prompt.
!conda install -c anaconda beautifulsoup4 --yes

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.11

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - beautifulsoup4


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    soupsieve-1.9.2            |           py36_0          61 KB  anaconda
    openssl-1.0.2s             |       h7b6447c_0         3.1 MB  anaconda
    certifi-2019.6.16          |           py36_1         156 KB  anaconda
    beautifulsoup4-4.7.1       |           py36_1         143 KB  anaconda
    ------------------------------------------------------------
                                           Total:         3.5 MB

The following NEW packages will be INSTALLED:

    soupsieve:      1.9.2-py36_0      anaconda   

The following packages will be UPDATED

<h3>1.1 Scrape the data and make a dataframe</h3>

In [58]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

# Download the Wikipedia page that lists the postal codes starting with "M"
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response=requests.get(url,timeout=30)
response.raise_for_status()  # stop on HTTP errors instead of parsing an error page
soup=BeautifulSoup(response.text,'html.parser')

# Locate the postal code table; fail with a clear message if the page layout changed
wikitable=soup.find('table',{'class':'wikitable sortable'})
if wikitable is None:
    raise ValueError('No "wikitable sortable" table found at {}'.format(url))
# <tbody> is optional in HTML, so fall back to the <table> element itself
table=wikitable.tbody if wikitable.tbody is not None else wikitable
rows=table.find_all('tr')

# Fetch titles
columns=[v.text.replace('\n','').strip() for v in rows[0].find_all('th')]

# Fill the dataframe: collect every row first and build the frame once,
# rather than appending row by row (quadratic, and DataFrame.append no longer exists in pandas 2.x)
records=[]
for row in rows[1:]:
    cells=row.find_all('td')
    if len(cells)<len(columns):
        continue  # not a data row (e.g. a repeated header row)
    # Clean every cell the same way so keys such as Postcode carry no stray whitespace
    records.append([cell.text.replace('\n','').strip() for cell in cells[:len(columns)]])

df=pd.DataFrame(records,columns=columns)
df.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


<h3>1.2 Refine the dataframe</h3>

In [59]:
# Ignore 'not assigned' data
df=df[df['Borough']!='Not assigned'].reset_index(drop=True)

# Fill unassigned neighborhoods with the name of their borough.
# A single .loc assignment is used because chained indexing (df.iloc[i][2] = ...)
# may write to a temporary copy and leave the dataframe unchanged.
unassigned=df['Neighbourhood']=='Not assigned'
df.loc[unassigned,'Neighbourhood']=df.loc[unassigned,'Borough']

# Collapse to one row per (Postcode, Borough) with the neighbourhoods comma-joined
df=df.groupby(['Postcode', 'Borough'])['Neighbourhood'].agg(','.join).reset_index()
df.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


<h3>1.3 Shape of the dataframe</h3>

In [60]:
# (rows, columns) of the cleaned dataframe
df.shape

(103, 3)

<h1> *** Part 2 ***</h1>

<h1>2. Read geospatial data for the Toronto neighborhoods from a CSV file</h1>

<h3>2.1 Download the data from the URL into a CSV file</h3>

In [61]:
!wget -q -O 'GeospatialCoordinates.csv' http://cocl.us/Geospatial_data
print('Data loaded into csv file successfully!')

Data loaded into csv file successfully!


In [62]:
# Read the downloaded coordinates file into a dataframe and preview the first rows
geospatial_df=pd.read_csv('GeospatialCoordinates.csv')
geospatial_df.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


<h3>2.2 Rename the postal code column to match the Postcode column of the previous dataframe</h3>

In [63]:
# Rename by name, not by position: this cannot mislabel Latitude/Longitude if the CSV
# column order changes, and re-running the cell is a harmless no-op.
geospatial_df=geospatial_df.rename(columns={'Postal Code':'Postcode'})

<h3>2.3 Merge the dataframes to get the required dataframe</h3>

In [64]:
# Name the join key explicitly so the merge cannot silently switch to a different
# (or larger) set of shared columns if either frame gains a column.
merged_df = pd.merge(df, geospatial_df, on='Postcode', how='inner')
# Preview only; the full frame is too long to display usefully
merged_df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


<h1> *** Part 3 ***</h1>

<h1>3. Explore and cluster the neighborhoods in Toronto</h1>

In [10]:
# folium draws the interactive maps in the cells below.
import folium
# NOTE(review): `plt` is not referenced in the cells visible here — confirm it is still needed.
import matplotlib.pyplot as plt
# geopy must be installed before the import on the next line can succeed.
!conda install -c conda-forge geopy --yes
# NOTE(review): `Nominatim` is not referenced in the cells visible here — confirm it is still needed.
from geopy.geocoders import Nominatim

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.11

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.6.16          |           py36_1         149 KB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         239 KB

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0         conda-forge
    geopy:         1.20.0-py_0       conda-forge

The following packages will be UPDATED:

    certifi:       2019.

<h3>3.1 Create a map of Toronto</h3>

In [65]:
# Coordinates the map is centred on
Toronto_lat = 43.6532   # latitude
Toronto_lng = -79.3832  # longitude

map_toronto = folium.Map(location=[Toronto_lat, Toronto_lng], zoom_start=11)

# One blue circle per row of merged_df, with a "neighbourhood(s),borough" popup
marker_rows = zip(
    merged_df['Latitude'],
    merged_df['Longitude'],
    merged_df['Borough'],
    merged_df['Neighbourhood'],
)
for lat, lng, borough, neighbourhood in marker_rows:
    label = folium.Popup('{},{}'.format(neighbourhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

<h3>3.2 Prepare data to cluster neighborhoods</h3>

In [66]:
# Keep only the rows whose borough name contains "Toronto", renumbered from 0
is_toronto_borough = merged_df['Borough'].str.contains("Toronto")
toronto_grouped = merged_df[is_toronto_borough].reset_index(drop=True)
toronto_grouped.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [67]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 4

# Select the feature columns by name rather than dropping the non-features: if this cell
# is re-run after 'Cluster Labels' has been added to toronto_grouped, dropping would
# silently feed the previous labels back into the clustering.
toronto_grouped_clustering = toronto_grouped[['Latitude', 'Longitude']]

<h3>3.3 Run the clustering algorithm</h3>

In [68]:
# run k-means clustering
# n_init is set explicitly because its default differs between scikit-learn releases;
# pinning it (together with random_state) keeps the labels reproducible across versions.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 0, 2, 2, 2, 2, 2, 2], dtype=int32)

In [69]:
# add clustering labels
# DataFrame.insert raises ValueError when the column already exists, so drop any labels
# left over from a previous run first; this keeps the cell re-runnable.
if 'Cluster Labels' in toronto_grouped.columns:
    toronto_grouped = toronto_grouped.drop(['Cluster Labels'], axis=1)
toronto_grouped.insert(0, 'Cluster Labels', kmeans.labels_)
toronto_grouped.head(5)

Unnamed: 0,Cluster Labels,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,0,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,0,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,0,M4M,East Toronto,Studio District,43.659526,-79.340923
4,2,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


<h3>3.4 Put the clusters on the map of Toronto</h3>

In [70]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

map_clusters= folium.Map(location=[Toronto_lat,Toronto_lng],zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_grouped['Latitude'], toronto_grouped['Longitude'], toronto_grouped['Neighbourhood'], toronto_grouped['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels are 0-based, so index the palette directly; `cluster-1` only worked for
    # cluster 0 because a -1 index wraps around to the last colour.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters