# Segmenting and Clustering Neighborhoods in Toronto

## Leobardo Gómez

### Part 1

In [1]:
# Load the raw Toronto postal-code table from the local CSV file.
import pandas as pd  # tabular data handling (DataFrames)

toronto_df = pd.read_csv(filepath_or_buffer='Toronto.csv')
toronto_df.head(5)  # preview the first five rows

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [3]:
# Normalise the column labels to PostalCode, Borough and Neighborhood.
# Renaming by name (instead of assigning a positional list) keeps every label
# attached to the right data even if the CSV column order changes. Labels that
# are already correct are left untouched, so this cell is safe to re-run.
toronto_df = toronto_df.rename(
    columns={'Postcode': 'PostalCode', 'Neighbourhood': 'Neighborhood'}
)
# Fail early, with a readable message, if the CSV layout is not the expected one.
assert list(toronto_df.columns) == ['PostalCode', 'Borough', 'Neighborhood'], (
    f'unexpected columns: {list(toronto_df.columns)}'
)
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [14]:
# Keep only the rows whose borough is known; rows labelled 'Not assigned' are dropped
# and the index is renumbered from zero.
has_borough = toronto_df['Borough'].ne('Not assigned')
toronto_assigned = toronto_df.loc[has_borough].reset_index(drop=True)
toronto_assigned.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [31]:
# A postal code can cover several neighborhoods, one per row.
# Collapse those rows into a single row per (PostalCode, Borough) pair whose
# Neighborhood value is the comma-separated list of the original names.
neighborhoods_by_code = (
    toronto_assigned
    .groupby(['PostalCode', 'Borough'])['Neighborhood']
    .agg(', '.join)
)
toronto_combined = neighborhoods_by_code.reset_index()
toronto_combined.head()


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [33]:
# Rows with a known borough but a 'Not assigned' neighborhood take the borough
# name as their neighborhood.
unnamed = toronto_combined['Neighborhood'] == 'Not assigned'
toronto_combined.loc[unnamed, 'Neighborhood'] = toronto_combined.loc[unnamed, 'Borough']
# Spot-check the result on postal code M7A.
toronto_combined[toronto_combined['PostalCode'] == 'M7A']

Unnamed: 0,PostalCode,Borough,Neighborhood
85,M7A,Queen's Park,Queen's Park


In [34]:
# Final check: display the dimensions of the cleaned dataframe as a (rows, columns)
# tuple. Note that .shape is an attribute, not a method, so it takes no parentheses.
toronto_combined.shape

(103, 3)