  # Segmenting and Clustering Neighborhoods in Toronto : Part-1

Before we get the data and start exploring it, let's download all the dependencies that we will need.

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!pip install geopy 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
#!pip install -U scikit-learn scipy matplotlib
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


##### Let's install the `BeautifulSoup` package. Beautiful Soup is a Python library for pulling data out of HTML and XML files.

In [2]:
!pip install BeautifulSoup4
print('Package installed !')

Package installed !


##### Importing `bs4` and `requests`. Requests is a simple HTTP library for Python.

In [3]:
from bs4 import BeautifulSoup
import requests

##### Now let's initialize the URL with the Wikipedia page that has Toronto's neighborhood data

In [5]:
# Wikipedia page "List of postal codes of Canada: M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely if the server never answers.
data = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx response instead of parsing an error page later on.
data.raise_for_status()
print(data.headers)

{'Date': 'Fri, 29 May 2020 19:08:01 GMT', 'Vary': 'Accept-Encoding,Cookie,Authorization', 'Server': 'ATS/8.0.7', 'Content-Type': 'text/html; charset=UTF-8', 'X-Content-Type-Options': 'nosniff', 'P3P': 'CP="See https://en.wikipedia.org/wiki/Special:CentralAutoLogin/P3P for more info."', 'Content-language': 'en', 'Last-Modified': 'Sat, 23 May 2020 19:32:34 GMT', 'Content-Encoding': 'gzip', 'Age': '46881', 'X-Cache': 'cp1081 hit, cp1077 hit/55', 'X-Cache-Status': 'hit-front', 'Server-Timing': 'cache;desc="hit-front"', 'Strict-Transport-Security': 'max-age=106384710; includeSubDomains; preload', 'Set-Cookie': 'WMF-Last-Access=30-May-2020;Path=/;HttpOnly;secure;Expires=Wed, 01 Jul 2020 00:00:00 GMT, WMF-Last-Access-Global=30-May-2020;Path=/;Domain=.wikipedia.org;HttpOnly;secure;Expires=Wed, 01 Jul 2020 00:00:00 GMT, GeoIP=CA:ON:Markham:43.88:-79.26:v4; Path=/; secure; Domain=.wikipedia.org', 'X-Client-IP': '158.85.103.71', 'Cache-Control': 'private, s-maxage=0, max-age=0, must-revalidate', 

##### Since the data is embedded in an HTML page, we use an HTML parser to pull it out
Points to be noted for processing the Dataset: 
1. We have to process the cells that have an assigned borough
2. If a cell has a borough but a **Not assigned** neighborhood, then the neighborhood will be the same as the borough.
3. More than one neighborhood can exist in one postal code area. These rows will be combined into one row, with the neighborhoods separated by commas.

In [8]:
# Parse the downloaded HTML and take the first <table> element on the page
# (assumed to be the postal-code listing -- verify if the page layout changes).
soup = BeautifulSoup(data.content, 'html.parser')
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found in the downloaded page.')
trs = table.find_all('tr')

# Collect the <td> cells of each table row. Rows without any <td> cell
# (e.g. a header row made only of <th> cells) are skipped.
rows = []
for tr in trs:
    cells = tr.find_all('td')
    if cells:
        rows.append(cells)

# Clean the rows:
#   1. keep only rows that have an assigned borough;
#   2. a 'Not assigned' neighborhood takes the name of its borough;
#   3. rows sharing a postal code are merged into a single record whose
#      neighborhoods are joined with ', '.
# `merged` maps postal code -> [postal code, borough, [neighborhood, ...]];
# dict insertion order keeps the records in page order.
merged = {}
for row in rows:
    if len(row) < 3:
        # Not a (postal code, borough, neighborhood) row; indexing it would fail.
        continue
    # strip() removes the newline that ends each cell's text as well as any
    # leading whitespace that would defeat the 'Not assigned' comparisons.
    postalcode, borough, neighborhood = (cell.text.strip() for cell in row[:3])
    if borough == 'Not assigned':
        continue
    if neighborhood == 'Not assigned':
        neighborhood = borough
    record = merged.setdefault(postalcode, [postalcode, borough, []])
    if neighborhood not in record[2]:
        record[2].append(neighborhood)

# Final list of [postal code, borough, neighborhood(s)] records.
lst = [[code, boro, ', '.join(names)] for code, boro, names in merged.values()]

Now, let's convert the data into a pandas DataFrame.

In [9]:
# Column labels, in the same order as the fields of each record in `lst`.
cols = ['PostalCode', 'Borough', 'Neighborhood']
# One DataFrame row per [postal code, borough, neighborhood] record.
df = pd.DataFrame(data=lst, columns=cols)

Let's look at the first few entries of our dataset.

In [10]:
# Display the first five records of the cleaned table.
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Let's see the dimensions of the dataset

In [11]:
df.shape  # (number of rows, number of columns)

(103, 3)