***Web Scraping of Neighborhoods in Toronto City***

In [1]:
#import pandas library
import pandas as pd


In [18]:
# Scrape the Toronto postal-code listing from Wikipedia.
# read_html parses the HTML tables on the page into a list of DataFrames;
# the first table in that list is taken as the postal-code listing.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
postal_codes = pd.DataFrame(pd.read_html(url)[0])
postal_codes.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [20]:
# Discard every row whose Borough is the placeholder 'Not assigned',
# then renumber the remaining rows from 0.
has_borough = postal_codes['Borough'].ne('Not assigned')
postal_codes = postal_codes.loc[has_borough].reset_index(drop=True)
postal_codes.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


In [21]:
# Where a row's Neighborhood is the placeholder 'Not assigned', fall back to
# that row's Borough.
# A single boolean-mask .loc assignment replaces the per-row chained indexing
# (postal_codes['Neighborhood'][i] = ...): chained assignment may write to a
# temporary copy and does nothing under pandas Copy-on-Write. The mask form
# also does not depend on the index being exactly 0..n-1.
missing_neighborhood = postal_codes['Neighborhood'] == 'Not assigned'
postal_codes.loc[missing_neighborhood, 'Neighborhood'] = postal_codes.loc[missing_neighborhood, 'Borough']

In [22]:

# Keep only the rows whose Neighborhood is not the 'Not assigned' placeholder,
# then show how many rows remain.
named_rows = postal_codes['Neighborhood'].ne('Not assigned')
postal_codes = postal_codes.loc[named_rows]
len(postal_codes)

210

In [23]:
# Sort the rows by postal code so that all rows sharing a code are adjacent,
# and renumber the index from 0.
# kind='mergesort' is a stable sort: rows with the same Postcode keep the
# relative order they had before sorting, so the order in which neighborhood
# names are later joined is deterministic. The default quicksort gives no such
# guarantee.
# The result is rebound instead of using inplace=True, so the cell does not
# mutate a frame that came out of a boolean filter.
postal_codes = (
    postal_codes
    .sort_values(by='Postcode', kind='mergesort')
    .reset_index(drop=True)
)
postal_codes.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,Rouge
1,M1B,Scarborough,Malvern
2,M1C,Scarborough,Port Union
3,M1C,Scarborough,Rouge Hill
4,M1C,Scarborough,Highland Creek


In [35]:
# Create an auxiliary dataframe for merging: one row per distinct
# (Postcode, Borough) pair, in order of first appearance, indexed from 0.
# The steps are chained without inplace=True: calling
# drop_duplicates(inplace=True) on a column slice of postal_codes is what
# raises SettingWithCopyWarning.
postal_codes_new = (
    postal_codes[['Postcode', 'Borough']]
    .drop_duplicates()
    .reset_index(drop=True)
)
postal_codes_new.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Postcode,Borough
0,M1B,Scarborough
1,M1C,Scarborough
2,M1E,Scarborough
3,M1G,Scarborough
4,M1H,Scarborough


In [25]:
# Distinct postal codes, in the order they first appear in postal_codes.
postcode = postal_codes['Postcode'].unique()
postcode

array(['M1B', 'M1C', 'M1E', 'M1G', 'M1H', 'M1J', 'M1K', 'M1L', 'M1M',
       'M1N', 'M1P', 'M1R', 'M1S', 'M1T', 'M1V', 'M1W', 'M1X', 'M2H',
       'M2J', 'M2K', 'M2L', 'M2M', 'M2N', 'M2P', 'M2R', 'M3A', 'M3B',
       'M3C', 'M3H', 'M3J', 'M3K', 'M3L', 'M3M', 'M3N', 'M4A', 'M4B',
       'M4C', 'M4E', 'M4G', 'M4H', 'M4J', 'M4K', 'M4L', 'M4M', 'M4N',
       'M4P', 'M4R', 'M4S', 'M4T', 'M4V', 'M4W', 'M4X', 'M4Y', 'M5A',
       'M5B', 'M5C', 'M5E', 'M5G', 'M5H', 'M5J', 'M5K', 'M5L', 'M5M',
       'M5N', 'M5P', 'M5R', 'M5S', 'M5T', 'M5V', 'M5W', 'M5X', 'M6A',
       'M6B', 'M6C', 'M6E', 'M6G', 'M6H', 'M6J', 'M6K', 'M6L', 'M6M',
       'M6N', 'M6P', 'M6R', 'M6S', 'M7A', 'M7R', 'M7Y', 'M8V', 'M8W',
       'M8X', 'M8Y', 'M8Z', 'M9A', 'M9B', 'M9C', 'M9L', 'M9M', 'M9N',
       'M9P', 'M9R', 'M9V', 'M9W'], dtype=object)

In [26]:

# Build one comma-separated string of neighborhood names per postal code.
# groupby(..., sort=False) keeps the groups in order of first appearance and
# keeps the rows inside each group in their current order, so the resulting
# list lines up with the de-duplicated postcode rows built from the same frame.
# This replaces a hand-written while loop that hard-coded the row count
# (210/209) and indexed one past the last row (KeyError) whenever the final
# postal code had more than one neighborhood.
neighborhoods = (
    postal_codes
    .groupby('Postcode', sort=False)['Neighborhood']
    .agg(', '.join)
    .tolist()
)

# Number of distinct postal codes that received a combined string.
len(neighborhoods)

103

In [27]:
# Attach the combined neighborhood strings as the 'Neighborhood' column.
# The list is matched to rows by position, so it must hold one entry per row
# of postal_codes_new, in the same order; a length mismatch raises ValueError.
# assign() overwrites the column if it already exists, so re-running this cell
# is safe. insert() would raise ValueError on the second run because the
# column is already present. On the two-column frame built earlier, the new
# column lands in the same position (index 2) as before.
postal_codes_new = postal_codes_new.assign(Neighborhood=neighborhoods)
postal_codes_new.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [33]:
# Rename 'Postcode' to 'Postalcode'; the other columns keep their names.
# Renaming by name does not depend on the number or order of the columns,
# unlike overwriting the whole .columns list, and it is a no-op when the cell
# is run again.
postal_codes_new = postal_codes_new.rename(columns={'Postcode': 'Postalcode'})
postal_codes_new

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Richview Gardens, Kingsview Village, St. Phill..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


In [34]:
# Display the (rows, columns) dimensions of the final table.
postal_codes_new.shape

(103, 3)