# Clustering neighborhoods in Toronto using postal-code data scraped from Wikipedia

## Part 1

1- Import the appropriate libraries

In [4]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import requests
from bs4 import BeautifulSoup

2- Prepare the web scraping code by utilizing BeautifulSoup

In [5]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Download the page once and reuse the bytes for parsing; the timeout keeps
# the cell from hanging indefinitely and raise_for_status() surfaces HTTP
# errors here instead of as a confusing parse failure further down.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.content
content = BeautifulSoup(source, 'lxml')

In [6]:
# Take the first <table> on the page and collect all of its <td> cells.
table = content.find('table')
td = table.find_all('td')
# Empty containers for the three columns filled in by the scraping loop.
postcode, borough, neighbourhood = [], [], []

3- Create a list with the scraped data

In [7]:
# The table cells come in groups of three: postal code, borough, neighbourhood.
# The lists are rebuilt from scratch (rather than appended to) so that
# re-running this cell does not duplicate every row.
cell_texts = [cell.text.strip() for cell in td]
if len(cell_texts) % 3 != 0:
    # A cell count that is not a multiple of three means the page layout no
    # longer matches the three-column assumption; fail loudly instead of
    # producing misaligned columns.
    raise ValueError(
        'Expected a multiple of 3 table cells, got {}'.format(len(cell_texts))
    )
postcode = cell_texts[0::3]
borough = cell_texts[1::3]
neighbourhood = cell_texts[2::3]

4- Create the actual DataFrame with the lists previously scraped and give the columns appropriate names

In [8]:
# Build the frame column-by-column; this names the columns in one step and
# raises immediately if the three lists ever differ in length.
df_codes = pd.DataFrame({
    'PostalCode': postcode,
    'Borough': borough,
    'Neighborhood': neighbourhood,
})

5- Ignore boroughs with the 'Not assigned' value

In [14]:
# Keep only rows with a real borough. A boolean filter is used instead of
# `df_codes['Borough'].replace(..., inplace=True)`: an inplace call on a
# selected column acts on an intermediate object and does not update the
# frame when pandas Copy-on-Write is active.
has_borough = df_codes['Borough'].notna() & df_codes['Borough'].ne('Not assigned')
df_codes = df_codes[has_borough].copy()

6- Combine neighborhoods if they exist in one postal code

In [21]:
# Collapse rows sharing a postal code and borough into one row whose
# Neighborhood value is the comma-separated list of all their neighbourhoods.
neighborhoods_by_code = df_codes.groupby(['PostalCode', 'Borough'])['Neighborhood']
df_codes = neighborhoods_by_code.apply(', '.join).reset_index()
df_codes.columns = ['PostalCode', 'Borough', 'Neighborhood']

7- If a row has a borough but its neighborhood is 'Not assigned', use the borough name as the neighborhood name

In [None]:
# Where the neighbourhood is 'Not assigned', copy the row's own borough name
# into it rather than hard-coding a single borough. The write goes through
# .loc so it reliably updates df_codes itself.
missing_neighborhood = df_codes['Neighborhood'] == 'Not assigned'
df_codes.loc[missing_neighborhood, 'Neighborhood'] = df_codes.loc[missing_neighborhood, 'Borough']

In [19]:
# Preview the first 12 rows of the cleaned table.
df_codes.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


8- Use the `.shape` attribute to show the number of rows and columns in the DataFrame

In [20]:
# (number of rows, number of columns) of the DataFrame.
df_codes.shape

(103, 3)

## Part 2

1- Import the geocoder

In [26]:
# Install geocoder from the conda-forge channel (--yes skips the confirmation
# prompt), then import it for the postal-code lookups.
!conda install -c conda-forge geocoder --yes
import geocoder

Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
geocoder                  1.38.1                     py_0    conda-forge


2- Create a function to get the latitude and longitude for a postal code

In [31]:
def get_geocoder(postal_code_from_df, max_attempts=10):
    """Look up the latitude and longitude of a Toronto postal code via ArcGIS.

    Parameters
    ----------
    postal_code_from_df : str
        Postal code taken from the DataFrame; surrounding whitespace is
        stripped before it is placed in the query string.
    max_attempts : int, optional
        Maximum number of lookups to try before giving up (default 10).

    Returns
    -------
    tuple
        ``(latitude, longitude)``, the first two items of the result's
        ``latlng`` value.

    Raises
    ------
    RuntimeError
        If none of the ``max_attempts`` lookups returned coordinates.
    """
    query = '{}, Toronto, Ontario'.format(postal_code_from_df.strip())
    for _ in range(max_attempts):
        g = geocoder.arcgis(query)
        lat_lng_coords = g.latlng
        # Only unpack once coordinates are actually present. Indexing a
        # missing (None or empty) result would raise instead of retrying.
        if lat_lng_coords:
            return lat_lng_coords[0], lat_lng_coords[1]
    raise RuntimeError(
        'No coordinates returned for {!r} after {} attempts'.format(query, max_attempts)
    )

3- Add the Latitude and Longitude columns to the DataFrame

In [33]:
# Geocode every postal code, then split the resulting (latitude, longitude)
# pairs into two separate columns.
coordinate_pairs = df_codes['PostalCode'].apply(get_geocoder)
latitudes, longitudes = zip(*coordinate_pairs)
df_codes['Latitude'] = latitudes
df_codes['Longitude'] = longitudes

In [34]:
# Display the DataFrame, which now carries the Latitude and Longitude columns.
df_codes

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.811525,-79.195517
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.785730,-79.158750
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.765690,-79.175256
3,M1G,Scarborough,Woburn,43.768359,-79.217590
4,M1H,Scarborough,Cedarbrae,43.769688,-79.239440
5,M1J,Scarborough,Scarborough Village,43.743125,-79.231750
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.726245,-79.263670
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.713133,-79.285055
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.723575,-79.234976
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.696665,-79.260163


## Part 3

1- Use the geopy geolocator to get the coordinates of Toronto, which are used to center the map

In [36]:
from geopy.geocoders import Nominatim

# Resolve the city name to coordinates; they are used to center the map.
address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent="toronto_ontario")
location = geolocator.geocode(address)
if location is None:
    # Without this check a failed lookup surfaces as an opaque
    # AttributeError on `location.latitude`.
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto, Ontario are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto, Ontario are 43.653963, -79.387207.


2- Use Folium to map

In [43]:
# Install folium pinned to version 0.5.0 from the conda-forge channel
# (--yes skips the confirmation prompt), then import it for the map below.
!conda install -c conda-forge folium=0.5.0 --yes
import folium

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  11.53 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  36.04 MB/s
vincent-0.4.4- 100% |################################| Time: 0:00:00  40.22 MB/s
folium-0.5.0-p 100% |################################| Time: 0:00:00  30.20 MB/s


In [44]:
# Base map centered on the Toronto coordinates obtained from the geolocator.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# One circle marker per postal code. The loop names are deliberately distinct
# from the `borough` list built while scraping, so running this cell does not
# overwrite that list and break a later re-run of the DataFrame cell.
marker_rows = zip(
    df_codes['Latitude'],
    df_codes['Longitude'],
    df_codes['PostalCode'],
    df_codes['Borough'],
    df_codes['Neighborhood'],
)
for lat, lng, postal_code, borough_name, neighborhood_names in marker_rows:
    label = "{} ({}): {}".format(borough_name, postal_code, neighborhood_names)
    popup = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    ).add_to(map_toronto)

In [46]:
# Render the finished map as the cell output.
map_toronto