<a href="https://colab.research.google.com/github/Go660088/Coursera_Capstone/blob/master/Toronto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Segmenting and Clustering Neighborhoods in Toronto

In [0]:
import requests
import pandas as pd

#### Use `pandas.read_html` to import the Wikipedia table into a data frame

In [2]:
# read_html returns one DataFrame per HTML table found on the page;
# the postal-code table is taken to be the first of them.
wiki_tables = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
)
df = wiki_tables[0]

# Preview the rows with the lowest postcodes, then report (rows, columns).
print(df.sort_values(by='Postcode').head())
print(df.shape)

   Postcode       Borough Neighbourhood
0       M1A  Not assigned  Not assigned
11      M1B   Scarborough       Malvern
10      M1B   Scarborough         Rouge
28      M1C   Scarborough    Port Union
27      M1C   Scarborough    Rouge Hill
(287, 3)


#### Drop rows whose borough is `Not assigned`

In [3]:
# Keep only the rows whose borough is something other than the
# 'Not assigned' placeholder.
has_borough = df['Borough'].ne('Not assigned')
df = df.loc[has_borough]

# Preview the rows with the lowest postcodes, then report (rows, columns).
print(df.sort_values(by='Postcode').head(n=5))
print(df.shape)

   Postcode      Borough   Neighbourhood
10      M1B  Scarborough           Rouge
11      M1B  Scarborough         Malvern
28      M1C  Scarborough      Port Union
27      M1C  Scarborough      Rouge Hill
26      M1C  Scarborough  Highland Creek
(210, 3)


#### Combine rows that share a postcode into one row, with the neighbourhoods separated by commas

In [0]:
# Bucket the rows by postcode.
group = df.groupby('Postcode')

# Collapse each postcode's neighbourhood names into a single
# comma-separated string (no space after the comma).
grp_neighbourhood = group['Neighbourhood'].apply(lambda names: ','.join(names))

#### Rebuild the dataframe, with the combined neighbourhood column

In [5]:
# First non-null borough recorded for each postcode.
# NOTE(review): presumably every row of a postcode carries the same borough - confirm.
grp_borough = group['Borough'].first()

# Place the two per-postcode Series side by side, then turn the Postcode
# index back into an ordinary column.
df = pd.concat(
    [grp_borough, grp_neighbourhood],
    axis='columns',
).reset_index()

print(df.head(n=5))
print(df.shape)

  Postcode      Borough                         Neighbourhood
0      M1B  Scarborough                         Rouge,Malvern
1      M1C  Scarborough  Highland Creek,Rouge Hill,Port Union
2      M1E  Scarborough       Guildwood,Morningside,West Hill
3      M1G  Scarborough                                Woburn
4      M1H  Scarborough                             Cedarbrae
(103, 3)


#### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. 

In [6]:
# Boolean selector for rows whose neighbourhood is still the
# 'Not assigned' placeholder.
mask = df['Neighbourhood'].eq('Not assigned')

# Show the affected rows.
df.loc[mask]

Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Not assigned


In [7]:
# Copy the borough name into the neighbourhood column for the masked rows.
# A single .loc assignment is used instead of chained indexing
# (df['Neighbourhood'][mask] = ...): the chained form writes to an
# intermediate object, triggers SettingWithCopyWarning, and does not modify
# df at all when pandas Copy-on-Write is enabled.
df.loc[mask, 'Neighbourhood'] = df.loc[mask, 'Borough']

# Re-display the same rows to confirm the replacement took effect.
df[mask]

Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Queen's Park


#### Number of rows

In [8]:
# Dimensions of the data frame as a (rows, columns) tuple.
df.shape

(103, 3)

#### Get the csv file that has the geographical coordinates of each postal code

In [9]:
# CSV holding one latitude/longitude pair per postal code.
geo_csv_url = 'http://cocl.us/Geospatial_data'
df_geo = pd.read_csv(geo_csv_url)

# Preview the first rows of the coordinate table.
df_geo.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Combine the geographical coordinates with the neighbourhood data frame

In [10]:
# Attach the coordinates by matching on the postal code itself.
# pd.concat(axis=1) would pair rows purely by index position, so any
# difference in row order or row count between the two frames would silently
# assign coordinates to the wrong postcode.
# how='left' keeps every row of df in its current order; both key columns
# ('Postcode' and 'Postal Code') are retained in the result.
# validate='one_to_one' makes pandas raise if either key column has duplicates.
# NOTE(review): the result is only displayed, not stored - assign it to a
# variable if later cells need the coordinates.
df.merge(
    df_geo,
    how='left',
    left_on='Postcode',
    right_on='Postal Code',
    validate='one_to_one',
)

Unnamed: 0,Postcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476
...,...,...,...,...,...,...
98,M9N,York,Weston,M9N,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,M9P,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie...",M9R,43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",M9V,43.739416,-79.588437
