# Segmenting and Clustering Neighborhoods in Toronto

## Part 1

In [131]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

# !conda install -c conda-forge geopy 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# !conda install -c conda-forge folium=0.5.0 
import folium # map rendering library

# !conda install -c anaconda beautifulsoup4
from bs4 import BeautifulSoup as bs

print('Libraries imported.')

Libraries imported.


In [132]:
wiki_page = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
html = wiki_page.content

In [133]:
page = bs(html, 'html.parser')
data = page.find_all("td")
# print(data)

for item in data:
    if item.a:
        item = item.a.unwrap()
#     print(item)
    
# page


In [134]:
rows = []
t_rows = page.table.find_all("tr")

for item in t_rows:
    t_row = []
    contents = item.contents
    for content in contents:
        try:
            t_row.append(content.contents[0].strip())
#             print(content.contents[0].strip())
        except:
            t_row.append(content.string.strip())
#             print(content.string)
#         print(t_row)
    rows.append(t_row)
rows
    
    

[['', 'Postcode', '', 'Borough', '', 'Neighbourhood'],
 ['', 'M1A', '', 'Not assigned', '', 'Not assigned'],
 ['', 'M2A', '', 'Not assigned', '', 'Not assigned'],
 ['', 'M3A', '', 'North York', '', 'Parkwoods'],
 ['', 'M4A', '', 'North York', '', 'Victoria Village'],
 ['', 'M5A', '', 'Downtown Toronto', '', 'Harbourfront'],
 ['', 'M6A', '', 'North York', '', 'Lawrence Heights'],
 ['', 'M6A', '', 'North York', '', 'Lawrence Manor'],
 ['', 'M7A', '', 'Downtown Toronto', '', "Queen's Park"],
 ['', 'M8A', '', 'Not assigned', '', 'Not assigned'],
 ['', 'M9A', '', "Queen's Park", '', 'Not assigned'],
 ['', 'M1B', '', 'Scarborough', '', 'Rouge'],
 ['', 'M1B', '', 'Scarborough', '', 'Malvern'],
 ['', 'M2B', '', 'Not assigned', '', 'Not assigned'],
 ['', 'M3B', '', 'North York', '', 'Don Mills North'],
 ['', 'M4B', '', 'East York', '', 'Woodbine Gardens'],
 ['', 'M4B', '', 'East York', '', 'Parkview Hill'],
 ['', 'M5B', '', 'Downtown Toronto', '', 'Ryerson'],
 ['', 'M5B', '', 'Downtown Toronto'

In [135]:
t_data = pd.DataFrame(rows)
t_data.drop([0,2,4], axis = 1, inplace=True)
t_data.columns = t_data.iloc[0]
t_data.drop(0, axis=0, inplace=True)
t_data.reset_index(drop=True, inplace=True)
t_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [136]:
for ind in t_data.index:
#     print(ind)
    if t_data["Borough"][ind] == "Not assigned":
        t_data.drop(ind, axis=0, inplace=True)
#         print(ind)



t_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [137]:
for ind in t_data.index:
    if t_data["Neighbourhood"][ind] == "Not assigned":
        t_data["Neighbourhood"][ind] = t_data["Borough"][ind]
t_data.reset_index(drop=True, inplace=True)

t_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


In [138]:
for ind in t_data.index:
    try:
        ind + 1
        if t_data["Postcode"][ind] == t_data["Postcode"][ind+1] and t_data["Borough"][ind] == t_data["Borough"][ind+1]:
            t_data["Neighbourhood"][ind] = t_data["Neighbourhood"][ind] + ", " + t_data["Neighbourhood"][ind+1]
            t_data.drop(ind+1, axis=0, inplace=True)
    except:
        continue
        
t_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
5,M7A,Downtown Toronto,Queen's Park


In [139]:
t_data.shape

(139, 3)

## Part 2

In [140]:
coords = pd.read_csv('Geospatial_Coordinates.csv')
coords.sort_values(by=["Postal Code"], axis=0, ascending="true")
coords.head()

t_data.insert(3, column="Latitude", value=np.nan)
t_data.insert(4, column="Longitude", value=np.nan)

t_data

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,,
1,M4A,North York,Victoria Village,,
2,M5A,Downtown Toronto,Harbourfront,,
3,M6A,North York,"Lawrence Heights, Lawrence Manor",,
5,M7A,Downtown Toronto,Queen's Park,,
6,M9A,Queen's Park,Queen's Park,,
7,M1B,Scarborough,"Rouge, Malvern",,
9,M3B,North York,Don Mills North,,
10,M4B,East York,"Woodbine Gardens, Parkview Hill",,
12,M5B,Downtown Toronto,"Ryerson, Garden District",,


In [141]:
for ind in t_data.index:
#     print(t_data['Postcode'][ind])
    for index in coords.index:
        if t_data["Postcode"][ind] == coords["Postal Code"][index]:
            t_data["Longitude"][ind] = coords["Longitude"][index]
            t_data["Latitude"][ind] = coords["Latitude"][index]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [142]:
t_data

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
5,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
6,M9A,Queen's Park,Queen's Park,43.667856,-79.532242
7,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
9,M3B,North York,Don Mills North,43.745906,-79.352188
10,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
12,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
