# Segmenting and Clustering Neighborhoods in Toronto

### 1. Downloading and Cleaning Toronto Dataset.

#### Importing necessary packages

In [67]:
import lxml
import numpy as np
import pandas as pd 
# Remove pandas' display caps: passing None shows every column and every row
# whenever a DataFrame is rendered in this notebook.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json
import requests
import geopy

#### Reading the data directly from the HTML page into a DataFrame.

In [58]:
# The `oldid` query parameter pins a specific revision of the Wikipedia page.
url = 'https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=1011037969'

# Labels applied to the scraped table.
column_names = ['Postal Code', 'Borough', 'Neighborhood']

# read_html returns a list of DataFrames, one per parsed table; the first one
# is used as the postal-code table.
data = pd.read_html(url)
neighborhoods = data[0]
neighborhoods.columns = column_names

neighborhoods.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Removing rows with an unassigned borough by changing the value to NaN and dropping rows that contain NaN.

In [59]:
# Mark 'Not assigned' boroughs as missing, then drop every row that contains a
# missing value and renumber the index from 0.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
# `.assign` builds a new frame, so the scraped table is not modified in place
# and re-running this cell gives the same result.
neighborhoods = (
    neighborhoods
    .assign(Borough=neighborhoods['Borough'].replace('Not assigned', np.nan))
    .dropna()
    .reset_index(drop=True)
)

In [60]:
# Preview the first rows after the unassigned boroughs have been removed.
neighborhoods.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### No occurrences of "Not assigned" in the Neighborhood column, so there is no need to copy any Borough values into this column.

In [61]:
# Count the rows whose Neighborhood is still the 'Not assigned' placeholder.
neighborhoods['Neighborhood'].eq('Not assigned').sum()

0

In [62]:
# (rows, columns) of the cleaned neighborhoods table.
neighborhoods.shape

(103, 3)

### 2. Adding Latitude and Longitude of Each Postcode.

#### Downloading the CSV file from Coursera, since geopy did not work.

In [63]:
# Coordinates table, read from a comma-separated file at this URL.
url = 'https://cocl.us/Geospatial_data'
latlang = pd.read_csv(url)

In [57]:
# (rows, columns) of the downloaded coordinates table.
latlang.shape

(103, 3)

#### Merging the two dataframes based on Postal Code

In [65]:
# Attach the coordinates to each cleaned neighborhood row.
# A left join keeps exactly the rows of `neighborhoods`, in their original
# order. An outer join could add all-NaN rows for postal codes that exist only
# in the coordinates file, and it sorts the keys on recent pandas versions.
# `validate` raises MergeError if 'Postal Code' is duplicated on either side,
# so a bad join fails loudly instead of silently multiplying rows.
geospatial = pd.merge(
    neighborhoods,
    latlang,
    on='Postal Code',
    how='left',
    validate='one_to_one',
)
geospatial.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [66]:
# (rows, columns) of the merged table; the row count should equal that of `neighborhoods`.
geospatial.shape

(103, 5)