# Segmenting and Clustering Neighborhoods in Toronto

In [7]:
conda install -c anaconda beautifulsoup4

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - beautifulsoup4


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    beautifulsoup4-4.8.2       |           py36_0         161 KB  anaconda
    ca-certificates-2020.1.1   |                0         132 KB  anaconda
    certifi-2019.11.28         |           py36_1         157 KB  anaconda
    openssl-1.1.1              |       h7b6447c_0         5.0 MB  anaconda
    soupsieve-2.0              |             py_0          33 KB  anaconda
    ------------------------------------------------------------
                                           Total:         5.5 MB

The following NEW packages will be INSTALLED:

  beautifulsoup4     anaconda/linux-64::beautifulsoup4-4.8.2-py36_0
  soupsieve          a

### First of all lets import the required libraries 

In [8]:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup 
import numpy as np
import pandas as pd 
from urllib.request import urlopen

### Know lets define some variables to be used later

In [9]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

html = urlopen(url)

soup = BeautifulSoup(html, 'html.parser')

In [10]:
my_table = soup.find_all('table', class_= 'wikitable')

### It is time to write a for loop to pull the data from the url mentioned above

In [57]:
postal_codes = []
boroughs = []
neighbourhoods = []

for table in my_table:
    rows = table.find_all('tr')
    
    for row in rows:
        cells = row.find_all('td')
        
        if len(cells)==3:
            postal_codes.append(cells[0].find(text=True).strip())
            boroughs.append(cells[1].find(text=True).strip())
            neighbourhoods.append(cells[2].find(text=True).strip())
            

### Lets put the data in a dataframe and check if anything is missing

In [58]:
df = pd.DataFrame(postal_codes,
                  columns = ['Postal Code'])

df['Borough'] = boroughs
df['Neighbourhood'] = neighbourhoods

print(df.shape)
df.head(10)

(180, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
7,M8A,Not assigned,
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,Malvern / Rouge


### Lets check if any of the values in the 'Neighbourhood' column is 'Not assigned'

In [59]:
df['Neighbourhood'] != 'Not assigned'

0      True
1      True
2      True
3      True
4      True
       ... 
175    True
176    True
177    True
178    True
179    True
Name: Neighbourhood, Length: 180, dtype: bool

### Know it is time to clean the data so that it looks like the end result table required to complete the assignment 

In [60]:
df_filter = df['Borough'] != 'Not assigned'
df_filter.head(10)

df2 = df[df_filter]
df2.head()

df2.reset_index(inplace = True, drop = True)
df2.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [61]:
df2['Neighbourhood'].replace('/', ',',regex = True, inplace = True)
df2.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


### A final check that the shape matches the intended shape

In [62]:
df2.shape

(103, 3)

### Lets read the csv containing our latitudes and longitude as a dataframe

In [63]:
df_lat = pd.read_csv('http://cocl.us/Geospatial_data')
df_lat.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Now we will merge both dataframes to get the latitudes and longitudes for 

In [70]:
df2_lat = df2.merge(df_lat, how = 'inner', on = 'Postal Code')
df2_lat.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494


In [69]:
df2_lat.shape

(103, 5)