## IBM Data Science Professional Certification

#### Capstone Project - Week 3 - Segmenting and Clustering Neighborhoods in Toronto

## PART 1

Data : https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

##### Environment

In [1]:
#Install librairies
!conda install -c conda-forge geopy --yes
!conda install -c conda-forge folium=0.5.0 --yes
!pip install beautifulsoup4

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.22.0               |     pyh9f0ad1d_0          63 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          97 KB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.50-py_0
  geopy              conda-forge/noarch::geopy-1.22.0-pyh9f0ad1d_0



Downloading and Extracting Packages
geopy-1.22.0         | 63 KB     | ##################################### | 100% 
geographiclib-1.50   | 34 KB     | ###############################

In [2]:
#Import Libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import numpy as np
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

##### Data scrapping & Dataframe

In [3]:
#Dataset
page = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(page.content, 'html.parser')

#Read
table = soup.find('tbody')
rows = table.select('tr')
row = [r.get_text() for r in rows]

In [11]:
#Create Dataframe
df = pd.DataFrame(row)
df = df[0].str.split('\n', expand=True)
df = df.rename(columns=df.iloc[0])
df = df.drop(df.index[0])

#Ignore Not assigned
df = df[df.Borough != 'Not assigned']

#Rename col
df.rename(columns={'Postal Code': 'Postcode'}, inplace=True)

#Group
df = df.groupby(['Postcode', 'Borough'], sort = False).agg(','.join)

#Index
df.reset_index(inplace = True)

In [12]:
df

Unnamed: 0,Postcode,Borough,Unnamed: 3,Unnamed: 4,Unnamed: 5,Neighborhood,Unnamed: 7
0,M3A,North York,,,,Parkwoods,
1,M4A,North York,,,,Victoria Village,
2,M5A,Downtown Toronto,,,,"Regent Park, Harbourfront",
3,M6A,North York,,,,"Lawrence Manor, Lawrence Heights",
4,M7A,Downtown Toronto,,,,"Queen's Park, Ontario Provincial Government",
5,M9A,Etobicoke,,,,Islington Avenue,
6,M1B,Scarborough,,,,"Malvern, Rouge",
7,M3B,North York,,,,Don Mills,
8,M4B,East York,,,,"Parkview Hill, Woodbine Gardens",
9,M5B,Downtown Toronto,,,,"Garden District, Ryerson",


## PART 2

##### Data scrapping & Dataframe

In [13]:
url = "http://cocl.us/Geospatial_data"
df_geo = pd.read_csv(url)

#Rename col
df_geo.rename(columns={'Postal Code': 'Postcode'}, inplace=True)

#Dataframe header
df_geo.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


##### Merging

In [14]:
df_merged = pd.merge(df, df_geo, on='Postcode')
df_merged.head()

Unnamed: 0,Postcode,Borough,Unnamed: 3,Unnamed: 4,Unnamed: 5,Neighborhood,Unnamed: 7,Latitude,Longitude
0,M3A,North York,,,,Parkwoods,,43.753259,-79.329656
1,M4A,North York,,,,Victoria Village,,43.725882,-79.315572
2,M5A,Downtown Toronto,,,,"Regent Park, Harbourfront",,43.65426,-79.360636
3,M6A,North York,,,,"Lawrence Manor, Lawrence Heights",,43.718518,-79.464763
4,M7A,Downtown Toronto,,,,"Queen's Park, Ontario Provincial Government",,43.662301,-79.389494


## PART 3

##### Filter Toronto Entries

In [15]:
df_Toronto=df_merged[df_merged['Borough'].str.contains('Toronto')]
df_Toronto

Unnamed: 0,Postcode,Borough,Unnamed: 3,Unnamed: 4,Unnamed: 5,Neighborhood,Unnamed: 7,Latitude,Longitude
2,M5A,Downtown Toronto,,,,"Regent Park, Harbourfront",,43.65426,-79.360636
4,M7A,Downtown Toronto,,,,"Queen's Park, Ontario Provincial Government",,43.662301,-79.389494
9,M5B,Downtown Toronto,,,,"Garden District, Ryerson",,43.657162,-79.378937
15,M5C,Downtown Toronto,,,,St. James Town,,43.651494,-79.375418
19,M4E,East Toronto,,,,The Beaches,,43.676357,-79.293031
20,M5E,Downtown Toronto,,,,Berczy Park,,43.644771,-79.373306
24,M5G,Downtown Toronto,,,,Central Bay Street,,43.657952,-79.387383
25,M6G,Downtown Toronto,,,,Christie,,43.669542,-79.422564
30,M5H,Downtown Toronto,,,,"Richmond, Adelaide, King",,43.650571,-79.384568
31,M6H,West Toronto,,,,"Dufferin, Dovercourt Village",,43.669005,-79.442259


#### Generate maps to visualize neighborhoods on Toronto

In [16]:
#Prepare
address = 'Toronto'
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('Geograpical coordinate : {}, {}.'.format(latitude, longitude))

# Map of Toronto
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Markers
for lat, lng, label in zip(df_Toronto['Latitude'], df_Toronto['Longitude'], df_Toronto['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        popup=label,
        radius=6,
        color='orange',
        fill=True,
        parse_html=False).add_to(map_toronto)  

#Display
map_toronto

Geograpical coordinate : 43.6534817, -79.3839347.
