## This notebook scrapes data about Toronto neighbourhoods from the web


In [1]:
import pandas as pd
import numpy as np
#Web scraping packages
import lxml.html as lh
import requests

In [4]:
# Wikipedia page "List of postal codes of Canada: M", the source of the table scraped below
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [5]:
# Download the page. The timeout stops the cell from hanging on a stalled
# connection, and raise_for_status() makes an HTTP error fail loudly here
# instead of letting an error page be parsed as if it were the table.
page = requests.get(url, timeout=30)
page.raise_for_status()
# Store the contents of the website under doc
doc = lh.fromstring(page.content)
# Parse data that are stored between <tr>..</tr> of HTML
tr_elements = doc.xpath('//tr')

In [6]:
# Number of child cells in each of the first 10 <tr> rows
list(map(len, tr_elements[:10]))

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

In [7]:
# Build one (header, values) pair per column from the header row (the first <tr>).
# Each header starts with an empty list that the next cell fills with data.
# The header text is kept exactly as scraped (the last one ends with '\n')
# because a later cell refers to the column by that exact name.
col = [(cell.text_content(), []) for cell in tr_elements[0]]

col

[('Postcode', []), ('Borough', []), ('Neighbourhood\n', [])]

In [8]:
# Since our first row is the header, data is stored on the second row onwards.

# Start from empty columns so that re-running this cell does not append the
# same rows a second time.
for _, values in col:
    values.clear()

for row in tr_elements[1:]:
    # If a row is not of size 3, the //tr data is no longer from our table,
    # so stop reading.
    if len(row) != 3:
        break

    # i is the index of the column the current cell belongs to
    for i, cell in enumerate(row.iterchildren()):
        data = cell.text_content()
        # For every column except the first, convert purely numeric text to int
        if i > 0:
            try:
                data = int(data)
            except ValueError:
                # Not a number: keep the text as it is
                pass
        # Append the data to the list of the i'th column
        col[i][1].append(data)

In [9]:
# Number of values collected for each column (they should all match)
[len(values) for _, values in col]

[287, 287, 287]

In [10]:
# Turn the (title, values) pairs into a mapping and build the DataFrame from it
Dict = dict(col)
df = pd.DataFrame(Dict)

#### Now let's preview the DataFrame

In [11]:
# Show the first ten rows of the scraped table
df.head(n=10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n
5,M6A,North York,Lawrence Heights\n
6,M6A,North York,Lawrence Manor\n
7,M7A,Downtown Toronto,Queen's Park\n
8,M8A,Not assigned,Not assigned\n
9,M9A,Queen's Park,Not assigned\n


In [12]:
# Remove, in place, every row whose Borough is 'Not assigned'
unassigned_rows = df.loc[df['Borough'].eq('Not assigned')].index
df.drop(index=unassigned_rows, inplace=True)
df.head(n=10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n
5,M6A,North York,Lawrence Heights\n
6,M6A,North York,Lawrence Manor\n
7,M7A,Downtown Toronto,Queen's Park\n
9,M9A,Queen's Park,Not assigned\n
10,M1B,Scarborough,Rouge\n
11,M1B,Scarborough,Malvern\n
13,M3B,North York,Don Mills North\n


In [13]:
# The scraped header and every value of the last column end with a newline.
# Build a clean 'Neighbourhood' column and drop the original 'Neighbourhood\n'.
# regex=False makes '\n' a plain newline character on every pandas version;
# relying on the default would treat the pattern differently across versions.
# drop(columns=...) replaces the positional axis argument, which newer pandas
# no longer accepts.
df = (
    df.assign(Neighbourhood=df['Neighbourhood\n'].str.replace('\n', '', regex=False))
      .drop(columns='Neighbourhood\n')
)
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Not assigned
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


In [14]:
# Where the Neighbourhood is a "not assigned" placeholder, copy the Borough name into it
placeholder_labels = ['not assigned', 'Not assigned']
needs_fill = df['Neighbourhood'].isin(placeholder_labels)
df.loc[needs_fill, 'Neighbourhood'] = df.loc[needs_fill, 'Borough']
df.head(n=10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Queen's Park
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


In [67]:
# OK. M9A now shows Queen's Park as both Borough and Neighbourhood.
# Some postcodes still span several rows, so we consolidate each postcode into one row and join its neighbourhoods with a ","

In [15]:
# One row per (Postcode, Borough) pair, with that pair's neighbourhoods joined by ","
df1 = (
    df.groupby(['Postcode', 'Borough'])['Neighbourhood']
      .agg(','.join)
      .reset_index()
)

In [16]:
# (rows, columns) of the consolidated frame
df1.shape

(103, 3)

In [17]:
# Preview only the first ten rows rather than rendering the whole frame
df1.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


## Getting Coordinates of postal codes

In [2]:
!pip install geocoder
import geocoder # import geocoder

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 7.9MB/s ta 0:00:011
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [29]:
# Look up the latitude and longitude of every postal code in df1 with the
# ArcGIS geocoder. The two lists stay aligned with the rows of df1:
# latitude[k] and longitude[k] belong to the k-th postcode.
MAX_GEOCODE_ATTEMPTS = 5  # stop retrying a code after this many empty answers

postal_code = df1.Postcode
latitude = []
longitude = []

for code in postal_code:
    # Reset for each code; otherwise the previous code's result would end the
    # retry loop immediately and be reused.
    lat_lng_coords = None
    for _ in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(code))
        lat_lng_coords = g.latlng
        if lat_lng_coords is not None:
            break

    if lat_lng_coords is None:
        # No answer after all attempts: record NaN so the lists keep one entry
        # per postcode and the failure is visible in the data.
        lat_lng_coords = (float('nan'), float('nan'))

    latitude.append(lat_lng_coords[0])
    longitude.append(lat_lng_coords[1])

'M1C'

43.811525000000074

In [None]:
# Geocode a single postal code with the ArcGIS geocoder.
postal_code = 'M1B'
lat_lng_coords = None

# Retry a bounded number of times, so that a service that never returns
# coordinates cannot make this cell loop forever.
for _ in range(5):
    g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
    lat_lng_coords = g.latlng
    if lat_lng_coords is not None:
        break
else:
    # Every attempt came back empty
    raise RuntimeError('No coordinates returned for postal code {}'.format(postal_code))

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]