# Segmenting and Clustering Neighborhoods in Toronto

In [39]:
import numpy as np # library to handle data in a vectorized manner
from bs4 import BeautifulSoup
import requests
import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


#### Storing the contents of website in soup object 

In [4]:
r = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
content = r.content
soup = BeautifulSoup(content)

#### Creating a DataFrame 

In [6]:
table_contents=[]
table=soup.find('table')
for row in table.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)

# print(table_contents)
df=pd.DataFrame(table_contents)
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

In [7]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [10]:
df.tail()

Unnamed: 0,PostalCode,Borough,Neighborhood
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto Business,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."
102,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [8]:
df.shape

(103, 3)

In [17]:
geodata = pd.read_csv('Geospatial_Coordinates.csv')
geodata.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [19]:
geodata.shape

(103, 3)

In [22]:
frames = [df,geodata]
result_df = pd.concat(frames,axis=1) #using concat to merge both the dataframes

In [23]:
result_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M3A,North York,Parkwoods,M1B,43.806686,-79.194353
1,M4A,North York,Victoria Village,M1C,43.784535,-79.160497
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",M1E,43.763573,-79.188711
3,M6A,North York,"Lawrence Manor, Lawrence Heights",M1G,43.770992,-79.216917
4,M7A,Queen's Park,Ontario Provincial Government,M1H,43.773136,-79.239476


In [63]:
final_df = result_df.drop('Postal Code', axis=1) #dropping th redundant column from final dataframe 
final_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.806686,-79.194353
1,M4A,North York,Victoria Village,43.784535,-79.160497
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.763573,-79.188711
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.770992,-79.216917
4,M7A,Queen's Park,Ontario Provincial Government,43.773136,-79.239476


In [97]:
final_df.shape #check number of rows and columns in final dataframe 

(103, 5)

In [98]:
final_df.isnull().sum() #check for NULL values 

PostalCode      0
Borough         0
Neighborhood    0
Latitude        0
Longitude       0
dtype: int64

In [99]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(final_df['Borough'].unique()),
        final_df.shape[0]
    )
)

The dataframe has 15 boroughs and 103 neighborhoods.


In [100]:
#setting default locations for latitude and longitude 
latitude =43.7111 
longitude =-79.2845

In [101]:
#create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(final_df['Latitude'], final_df['Longitude'], final_df['Borough'], final_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

## Other simple way to create a folium map

In [103]:
locations = final_df[['Latitude', 'Longitude']]
locationlist = locations.values.tolist()
len(locationlist)

103

In [104]:
locationlist[7]

[43.711111700000004, -79.2845772]

In [117]:
map = folium.Map(location=[43.711111700000004, -79.2845772], zoom_start=12)
for point in range(0, len(locationlist)):
    folium.Marker(locationlist[point], popup=final_df['Borough'][point]).add_to(map)
map