# Neighborhoods in Toronto

In [1]:
!pip install bs4
import pandas as pd
from bs4 import BeautifulSoup
import requests



### Task 1: Scrape the table from wikipedia.org and read it into a pandas DataFrame

In [2]:
# The URL carries an explicit `oldid`, i.e. it requests a specific past revision
# of the page rather than whatever the article currently looks like.
url='https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&direction=prev&oldid=926287641'

# Bound the wait and fail loudly on an HTTP error status; otherwise an error
# page would be handed to the parser and the failure would surface much later.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_data = response.text

In [3]:
# Parse the downloaded markup and take the first <table> element in the document.
soup = BeautifulSoup(html_data, 'html5lib')
table = soup.find('table')

# find() returns None when nothing matches; stop here with a clear message
# instead of letting the next cell fail on `None.find_all`.
if table is None:
    raise ValueError('No <table> element found in the downloaded page')

In [4]:
# Flatten the table into a list of records with the keys
# 'PostalCode', 'Borough' and 'Neighborhood'.
# The <td> cells are consumed three at a time: postal code, borough, neighborhood.
NOT_ASSIGNED = 'Not assigned'
CELLS_PER_ROW = 3

table_contents=[]
rows=table.find_all('td')

# Visit every complete triple of cells. The upper bound is len(rows)-2 so the
# final triple is included (a bound of len(rows)-3 would skip it).
for i in range(0, len(rows) - (CELLS_PER_ROW - 1), CELLS_PER_ROW):
    # Strip surrounding whitespace so comparisons and later joins on the
    # postal code do not depend on where the markup happens to break lines.
    postal_code = rows[i].text.strip()
    borough = rows[i+1].text.strip()
    neighborhood = rows[i+2].text.strip()

    # Skip records where either the borough or the neighborhood is unassigned.
    if NOT_ASSIGNED in (borough, neighborhood):
        continue

    table_contents.append({
        'PostalCode': postal_code,
        'Borough': borough,
        # Keep only the first line of the neighborhood cell.
        'Neighborhood': neighborhood.split('\n')[0],
    })


In [5]:
# Load the scraped records into a DataFrame.
df = pd.DataFrame(table_contents)

# Replace borough labels that arrive with extra text fused onto the borough name.
borough_fixes = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}
df['Borough'] = df['Borough'].replace(borough_fixes)

# Collapse to one row per (PostalCode, Borough), joining the neighborhoods of
# each group into a single comma-separated string.
grouped_df = df.groupby(['PostalCode', 'Borough'])
df1 = (
    grouped_df['Neighborhood']
    .agg(", ".join)
    .reset_index(name="Neighborhood")
)
df1

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
97,M9N,York,Weston
98,M9P,Etobicoke,Westmount
99,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
100,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


### Task 2: Read the csv file to get the latitude and the longitude coordinates and update the dataframe

#### Note: The geocoder package did not return coordinates for the neighborhoods, so the coordinates are read from a CSV file instead.

In [6]:
# Load the geospatial coordinates table from a CSV file; the relative path is
# resolved against the notebook's working directory.
coordinates_csv = 'Geospatial_Coordinates.csv'
geo_df = pd.read_csv(coordinates_csv)

##### Add the latitude and longitude columns to the data frame

In [7]:
# Attach Latitude/Longitude to df1 by matching on the postal code.
#
# Assigning geo_df's columns directly would align the two frames by row
# position, which is only correct if both have exactly the same postal codes in
# the same order. df1 is built from a filtered scrape, so that is not
# guaranteed, and a single missing code shifts every later coordinate onto the
# wrong row. A keyed merge avoids this.
coordinate_cols = ['Latitude', 'Longitude']

# NOTE(review): the CSV's key column name is not visible in this notebook; the
# first non-coordinate column is assumed to hold the postal code - confirm.
key_candidates = [col for col in geo_df.columns if col not in coordinate_cols]
if not key_candidates:
    raise KeyError('geo_df has no postal-code column to join on')
geo_key = 'PostalCode' if 'PostalCode' in key_candidates else key_candidates[0]

geo_lookup = geo_df.rename(columns={geo_key: 'PostalCode'})[['PostalCode'] + coordinate_cols]

df1 = (
    df1
    # Drop coordinates left over from an earlier run so the cell can be re-executed.
    .drop(columns=coordinate_cols, errors='ignore')
    # Left join keeps every df1 row in its original order; codes missing from
    # the CSV get NaN coordinates. validate= rejects duplicate codes in the
    # lookup, which would otherwise multiply df1 rows.
    .merge(geo_lookup, on='PostalCode', how='left', validate='many_to_one')
)
df1

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
97,M9N,York,Weston,43.724766,-79.532242
98,M9P,Etobicoke,Westmount,43.706876,-79.518188
99,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.696319,-79.532242
100,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.688905,-79.554724


### Task 3: Explore and cluster the neighborhoods in Toronto on a map

In [8]:
# Keep only the rows whose borough name contains the word 'Toronto', and
# renumber the resulting frame's index from 0.
is_toronto_borough = df1['Borough'].str.contains('Toronto')
df2 = df1.loc[is_toronto_borough].reset_index(drop=True)
df2

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


In [9]:
import numpy as np
!pip install geopy
# Nominatim geocoder: converts an address into latitude and longitude values
from geopy.geocoders import Nominatim

# Matplotlib colour-map and colour utility modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# k-means implementation from scikit-learn
from sklearn.cluster import KMeans

!pip install folium
import folium # map rendering library
# Printed once every install and import above has completed
print('Libraries imported.')

Libraries imported.


#### Use geopy library to get the latitude and longitude values of Toronto.

In [10]:
# Ask the Nominatim geocoding service for the coordinates of the address below.
address = 'Toronto, Canada'
geolocator = Nominatim(user_agent="Toronto_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; report that clearly
# rather than failing with an AttributeError on `location.latitude`.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


#### Create a map of Toronto with neighborhoods superimposed on top.

In [11]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per row of df2, each with a "<neighborhood>, <borough>" popup
for lat, lng, borough, neighborhood in zip(df2['Latitude'], df2['Longitude'], df2['Borough'], df2['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html is an argument of Popup; it is deliberately not repeated on
    # CircleMarker below, where it is not a marker style option.
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)
map_toronto