# Data Analysis of Neighbourhoods in Toronto

### Imports

In [9]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
!pip install pgeocode
import pgeocode

Collecting pgeocode
  Downloading https://files.pythonhosted.org/packages/86/44/519e3db3db84acdeb29e24f2e65991960f13464279b61bde5e9e96909c9d/pgeocode-0.2.1-py2.py3-none-any.whl
Installing collected packages: pgeocode
Successfully installed pgeocode-0.2.1


### Web Scraping

In [24]:
# Download the Wikipedia page listing Canadian postal codes that start with "M".
html_doc = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,  # do not hang the notebook forever on a stalled connection
)
html_doc.raise_for_status()  # fail loudly instead of parsing an HTTP error page

soup = BeautifulSoup(html_doc.content, 'html.parser')
head_columns = ['PostalCode','Borough','Neighborhood']

canada_table = soup.find('table', attrs={'class':'wikitable sortable'})
if canada_table is None:
    raise ValueError("Could not find the 'wikitable sortable' table on the page")

# html.parser only yields a <tbody> when the markup contains one, so fall back
# to the table element itself. The first row is the header and is dropped.
canada_table_data = (canada_table.tbody or canada_table).find_all('tr')[1:]

# Collect one record per table row; rows without a full set of <td> cells
# are skipped rather than raising IndexError.
lst = []
for tr in canada_table_data:
    cells = tr.find_all('td')
    if len(cells) < len(head_columns):
        continue
    lst.append({
        column: cell.text.replace('\n','')
        for column, cell in zip(head_columns, cells)
    })

# Build the frame once from all records: appending row by row is quadratic,
# and DataFrame.append no longer exists in pandas >= 2.0.
df = pd.DataFrame(lst, columns=head_columns)

# Drop postal codes with no borough and renumber the rows 0..n-1.
df = df[df['Borough']!='Not assigned'].reset_index(drop=True)
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [3]:
# Count how many distinct postal codes the table contains.
distinct_postal_codes = df['PostalCode'].drop_duplicates()
len(distinct_postal_codes)

103

The number of distinct postal codes (103) matches the number of rows in `df`, which means every postal code is unique.

In [4]:
# Show any rows whose neighborhood still holds the 'Not assigned' placeholder.
is_unassigned = df['Neighborhood'].eq('Not assigned')
df.loc[is_unassigned]

Unnamed: 0,PostalCode,Borough,Neighborhood


The result is empty, which means every neighborhood has been assigned a value.

In [5]:
# Display the (rows, columns) dimensions of the cleaned DataFrame.
df.shape

(103, 3)

### Adding coordinates (latitude & longitude)

In [28]:
# Look up the coordinates of every postal code with one batched query.
# Passing a list to query_postal_code returns a DataFrame with one row per
# input code, in input order.
nomi = pgeocode.Nominatim('ca')
coordinates = nomi.query_postal_code(df['PostalCode'].tolist())

# Assign by position (.to_numpy()) rather than by index label, so the result
# stays correct even when df's index is not exactly 0..n-1.
df['Latitude'] = coordinates['latitude'].to_numpy()
df['Longitude'] = coordinates['longitude'].to_numpy()
df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.6518,-79.5076
99,M4Y,Downtown Toronto,Church and Wellesley,43.6656,-79.383
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.7804,-79.2505
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.6325,-79.4939


43.7545