### Segmenting and Clustering Neighborhoods in Toronto

#### Importing Libraries

In [166]:
from bs4 import BeautifulSoup
import requests   # library to handle requests
import lxml       # parse the website in lxml format
import numpy as np
import pandas as pd

#### Scraping website using Beautiful Soup

In [167]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source, 'lxml')
table = soup.find('table', class_='wikitable sortable')
# print(table.prettify())

#### Getting Table Values

In [168]:
table1 = ""
for tr in table.find_all('tr'):
    row = ""
    for tds in tr.find_all('td'):
        row = row + " " + tds.text
    table1 = table1 + row[1:]
print(table1)

M1A
 Not assigned
 Not assigned
M2A
 Not assigned
 Not assigned
M3A
 North York
 Parkwoods
M4A
 North York
 Victoria Village
M5A
 Downtown Toronto
 Regent Park, Harbourfront
M6A
 North York
 Lawrence Manor, Lawrence Heights
M7A
 Downtown Toronto
 Queen's Park, Ontario Provincial Government
M8A
 Not assigned
 Not assigned
M9A
 Etobicoke
 Islington Avenue, Humber Valley Village
M1B
 Scarborough
 Malvern, Rouge
M2B
 Not assigned
 Not assigned
M3B
 North York
 Don Mills
M4B
 East York
 Parkview Hill, Woodbine Gardens
M5B
 Downtown Toronto
 Garden District, Ryerson
M6B
 North York
 Glencairn
M7B
 Not assigned
 Not assigned
M8B
 Not assigned
 Not assigned
M9B
 Etobicoke
 West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
M1C
 Scarborough
 Rouge Hill, Port Union, Highland Creek
M2C
 Not assigned
 Not assigned
M3C
 North York
 Don Mills
M4C
 East York
 Woodbine Heights
M5C
 Downtown Toronto
 St. James Town
M6C
 York
 Humewood-Cedarvale
M7C
 Not assigned
 Not assigned
M8C
 N

#### Loading Table to CSV File

In [169]:
csv_file = open('toronto.csv', 'wb')
csv_file.write(bytes(table1,encoding="ascii",errors="ignore"))

7590

#### Converting Table Values to Data Frame

In [170]:
col_names = ["col1", "col2", "col3"]
df = pd.read_csv('toronto.csv', names=col_names)
df.columns = ['Postalcode', 'Borough', 'Neighbourhood']
df

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1A,,
1,Not assigned,,
2,Not assigned,,
3,M2A,,
4,Not assigned,,
5,Not assigned,,
6,M3A,,
7,North York,,
8,Parkwoods,,
9,M4A,,


In [171]:
for n in range(539):
    df['Borough'][n]=df['Postalcode'][n+1]

df.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1A,Not assigned,
1,Not assigned,Not assigned,
2,Not assigned,M2A,
3,M2A,Not assigned,
4,Not assigned,Not assigned,


In [172]:
for i in range(539):
    df['Neighbourhood'][i]=df['Borough'][i+1]

df.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,Not assigned,Not assigned,M2A
2,Not assigned,M2A,Not assigned
3,M2A,Not assigned,Not assigned
4,Not assigned,Not assigned,M3A


In [173]:
df_2=df.iloc[::3]
df_2.reset_index
df_2.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
3,M2A,Not assigned,Not assigned
6,M3A,North York,Parkwoods
9,M4A,North York,Victoria Village
12,M5A,Downtown Toronto,Regent Park


#### Deleting 'Not Assigned' Values

In [174]:

x=df_2[df_2['Borough']==' Not assigned'].index
x
df_2.drop(x,inplace=True)
df_2.head(10)

Unnamed: 0,Postalcode,Borough,Neighbourhood
6,M3A,North York,Parkwoods
9,M4A,North York,Victoria Village
12,M5A,Downtown Toronto,Regent Park
15,M6A,North York,Lawrence Manor
18,M7A,Downtown Toronto,Queen's Park
24,M9A,Etobicoke,Islington Avenue
27,M1B,Scarborough,Malvern
33,M3B,North York,Don Mills
36,M4B,East York,Parkview Hill
39,M5B,Downtown Toronto,Garden District


In [176]:
df_3=df_2.reset_index()
df_n=df_3.drop(['index'], axis=1)
df_n.head()

#### Using .shape method to find the number of rows of DF

In [177]:
print('Rows of DF,Columns of DF',df_n.shape)

Rows of DF,Columns of DF (103, 3)


### Dataframe with LAT, LNG Values

In [157]:
!wget -q -O 'Toronto_location.csv' https://cocl.us/Geospatial_data

df_loc = pd.read_csv('Toronto_location.csv')
df_loc.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Correcting the column names and checking the shape of the data frame

In [158]:
df_loc.columns=['Postalcode','Latitude','Longitude']
df_loc.shape


(103, 3)

#### Joining the data frames to get Lat & Long for the PCs

In [159]:
df_merg=pd.merge(df_n,df_loc,on='Postalcode')

In [165]:
df_merged=df_merg.drop(['index'], axis=1)
df_merged.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
3,M6A,North York,Lawrence Manor,43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
