# Coursera IBM Data Science Capstone Project
## Toronto Neighborhood Analysis - Part A

In [2]:
import numpy as np
import pandas as pd

### Read Wikipedia data using pandas

In [3]:
# read wikipedia web page data
site_list = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

### read_html returns a list the first item of which is the postal code table

In [4]:
# extract postal code table from result list
toronto_postal_codes_raw = site_list.copy()[0]
print(toronto_postal_codes_raw.shape)
toronto_postal_codes_raw.head(2)

(288, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned


### Explore dataset

In [5]:
# overview
print(toronto_postal_codes_raw.describe())

       Postcode       Borough Neighbourhood
count       288           288           288
unique      180            12           209
top         M8Y  Not assigned  Not assigned
freq          8            77            78


In [6]:
# Neighbourhoods per borough
print(toronto_postal_codes_raw['Borough'].value_counts())

Not assigned        77
Etobicoke           45
North York          38
Downtown Toronto    37
Scarborough         37
Central Toronto     17
West Toronto        13
York                 9
East Toronto         7
East York            6
Queen's Park         1
Mississauga          1
Name: Borough, dtype: int64


In [7]:
# Neighbourhoods without names
m= (toronto_postal_codes_raw.Borough!='Not assigned') & (toronto_postal_codes_raw.Neighbourhood == 'Not assigned')
print(toronto_postal_codes_raw[m])

  Postcode       Borough Neighbourhood
8      M7A  Queen's Park  Not assigned


### Remove rows where borough is not assigned

In [12]:
toronto_postal_codes = toronto_postal_codes_raw[toronto_postal_codes_raw.Borough!='Not assigned']
print(toronto_postal_codes.shape)
toronto_postal_codes.head(2)

(211, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village


### Handle neighbourhoods with value 'Not assigned' 

In [13]:
# suppress chained assignment warning
pd.options.mode.chained_assignment = None  # default='warn'

# Mask
m = toronto_postal_codes.Neighbourhood=='Not assigned'

# Replace
toronto_postal_codes.loc[m,'Neighbourhood'] = toronto_postal_codes.loc[m,'Borough']
print(toronto_postal_codes.shape)
toronto_postal_codes.head(2)

(211, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village


In [14]:
df_boroughs = toronto_postal_codes.groupby(['Postcode','Borough']).count()
df_boroughs.columns=['NumNeighbourhoods']
print(df_boroughs.shape)
df_boroughs.head(2)

(103, 1)


Unnamed: 0_level_0,Unnamed: 1_level_0,NumNeighbourhoods
Postcode,Borough,Unnamed: 2_level_1
M1B,Scarborough,2
M1C,Scarborough,3


### Create a column which holds the list of neighborhoods for each borough

In [15]:
df_boroughs['Neighbourhoods'] = ''
df_boroughs.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,NumNeighbourhoods,Neighbourhoods
Postcode,Borough,Unnamed: 2_level_1,Unnamed: 3_level_1
M1B,Scarborough,2,
M1C,Scarborough,3,


In [17]:
for i, r in toronto_postal_codes.iterrows():
    ix_tup = (r[0],r[1],)
    if df_boroughs.loc[ix_tup, 'Neighbourhoods'] == '':
        df_boroughs.loc[ix_tup, 'Neighbourhoods'] = r[2]
    else:
        df_boroughs.loc[ix_tup, 'Neighbourhoods'] = df_boroughs.loc[ix_tup, 'Neighbourhoods'] + ', ' + r[2]

print(df_boroughs.shape)
df_boroughs.head(2)

(103, 2)


Unnamed: 0_level_0,Unnamed: 1_level_0,NumNeighbourhoods,Neighbourhoods
Postcode,Borough,Unnamed: 2_level_1,Unnamed: 3_level_1
M1B,Scarborough,2,"Rouge,Malvern, Rouge, Malvern"
M1C,Scarborough,3,"Highland Creek,Rouge Hill,Port Union, Highland..."


In [18]:
df_boroughs.reset_index(inplace=True)

print(df_boroughs.shape)
df_boroughs.head(2)

(103, 4)


Unnamed: 0,Postcode,Borough,NumNeighbourhoods,Neighbourhoods
0,M1B,Scarborough,2,"Rouge,Malvern, Rouge, Malvern"
1,M1C,Scarborough,3,"Highland Creek,Rouge Hill,Port Union, Highland..."


In [19]:
df_boroughs.drop('NumNeighbourhoods',axis=1, inplace=True)

In [20]:
df_boroughs.head(25)

Unnamed: 0,Postcode,Borough,Neighbourhoods
0,M1B,Scarborough,"Rouge,Malvern, Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union, Highland..."
2,M1E,Scarborough,"Guildwood,Morningside,West Hill, Guildwood, Mo..."
3,M1G,Scarborough,"Woburn, Woburn"
4,M1H,Scarborough,"Cedarbrae, Cedarbrae"
5,M1J,Scarborough,"Scarborough Village, Scarborough Village"
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park, Eas..."
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge, Clairlea, Golde..."
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West,..."
9,M1N,Scarborough,"Birch Cliff,Cliffside West, Birch Cliff, Cliff..."


In [21]:
result = df_boroughs

### Store data for next section of assignment

In [22]:
result.to_pickle('toronto_postal_codes.pckl')