# Toronto Clustering

In [1]:
# Install and import libraries.
# The `!` lines are IPython shell escapes: they run `conda install` in the
# notebook's environment (`--yes` skips the confirmation prompt).
# Only folium is pinned to a version (0.5.0); geopy and beautifulsoup4 are not.
!conda install -c conda-forge geopy --yes
!conda install -c conda-forge folium=0.5.0 --yes
!conda install -c anaconda beautifulsoup4 --yes


import requests
from bs4 import BeautifulSoup
import pandas as pd
# Lift pandas' display limits so DataFrames are shown without truncating
# columns or rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import numpy as np
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         240 KB

The following NEW packages will be INSTALLED:

    geographiclib: 1.50-py_0         conda-forge
    geopy:         1.20.0-py_0       conda-forge

The following packages will be UPDATED:

    certifi:       2019.

## Steps from Previous Notebooks

### Acquire and Clean Data

In [4]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops us from silently parsing an HTTP error page as if it were the table.
wiki_page = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
wiki_page.raise_for_status()
soup = BeautifulSoup(wiki_page.content, 'html.parser')

# Creating DF: one string of raw text per <tr> of the first <tbody> on the page.
table = soup.find('tbody')
if table is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError("No <tbody> element found on the page; its layout may have changed.")
rows = table.select('tr')
row = [r.get_text() for r in rows]
df = pd.DataFrame(row)

# Cleaning and organizing DF: split each row's text on newlines into columns,
# promote the first row (the table header) to column names, then drop it.
df1 = df[0].str.split('\n', expand=True)
df1 = df1.rename(columns=df1.iloc[0]).drop(df1.index[0])

# New DF without the rows whose Borough is 'Not assigned'.
df2 = df1[df1['Borough'] != 'Not assigned']

# New DF with the Neighbourhoods of each (Postcode, Borough) pair joined by
# commas; sort=False keeps the order in which the pairs first appear.
df3 = (
    df2.groupby(['Postcode', 'Borough'], sort=False)
    .agg(','.join)
    .reset_index()
)

# A row that has a Borough but a 'Not assigned' Neighbourhood takes the Borough
# name as its Neighbourhood. Copying the row's own Borough (rather than a
# hard-coded "Queen's Park" applied to the whole frame) keeps this correct for
# any borough and leaves the other columns untouched.
unassigned = df3['Neighbourhood'] == 'Not assigned'
df3.loc[unassigned, 'Neighbourhood'] = df3.loc[unassigned, 'Borough']
df3.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


### Create DF with Latitude and Longitude values

In [5]:
# CSV that provides a Latitude and Longitude for each postal code.
url = "http://cocl.us/Geospatial_data"

# Load the coordinates and, in the same chain, rename the key column so it
# matches the 'Postcode' column of df3.
df_LL = pd.read_csv(url).rename(columns={'Postal Code': 'Postcode'})

# Join the coordinates onto the neighbourhood table. The default join is
# "inner", so only postcodes present in both frames are kept.
df_final = df3.merge(df_LL, on='Postcode')
df_final.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


## Clustering Neighborhoods by Distance

### Checking unique Boroughs and Neighborhoods in DF

In [7]:
# Report the number of distinct boroughs and the number of rows in df_final.
# The second figure is the row count of the merged table, printed under the
# label "Neighbourhoods".
borough_count = len(df_final['Borough'].unique())
row_count = len(df_final.index)
summary_template = 'This DF has {} Boroughs and {} Neighbourhoods.'
print(summary_template.format(borough_count, row_count))

This DF has 11 Boroughs and 103 Neighbourhoods.


### Selecting only Boroughs with 'Toronto' in the name

In [9]:
# Flag the rows whose Borough name contains the text 'Toronto', then keep only
# those rows (the original index labels are preserved).
toronto_mask = df_final['Borough'].str.contains('Toronto')
df_TO = df_final.loc[toronto_mask]
df_TO.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
5,M9A,Downtown Toronto,Queen's Park,43.667856,-79.532242
9,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031


### Generating a Map to see the clustering of Neighborhoods in Toronto

In [15]:
# Look up Toronto's coordinates so the map can be centred on the city.
geolocator = Nominatim(user_agent="Toronto_explorer")
# Pass an explicit timeout so a slow response from the geocoding service does
# not abort the cell prematurely.
location = geolocator.geocode('Toronto', timeout=10)
if location is None:
    # geocode() returns None when the query has no match; fail with a clear
    # message instead of an AttributeError on `.latitude` below.
    raise ValueError("Geocoder returned no result for 'Toronto'; cannot centre the map.")
latitude = location.latitude
longitude = location.longitude

Toronto_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one red circle marker per row of df_TO, with a popup that reads
# "<neighbourhood>, <borough>".
for lat, lng, borough, neighborhood in zip(df_TO['Latitude'], df_TO['Longitude'],
                                           df_TO['Borough'], df_TO['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html=True makes folium escape the label text, so characters such
    # as apostrophes in names cannot break the generated page.
    label = folium.Popup(label, parse_html=True)
    # `parse_html` is a Popup option, not a CircleMarker option, so it is no
    # longer passed to CircleMarker.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_opacity=1.0).add_to(Toronto_map)

Toronto_map