### Segmenting and Clustering Neighborhoods in Toronto

#### Importing Libraries

In [1]:
from bs4 import BeautifulSoup
import requests   # library to handle requests
import lxml       # parse the website in lxml format
import numpy as np
import pandas as pd

#### Scraping website using Beautiful Soup

In [2]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source, 'lxml')
table = soup.find('table', class_='wikitable sortable')
# print(table.prettify())

#### Getting Table Values

In [3]:
table1 = ""
for tr in table.find_all('tr'):
    row = ""
    for tds in tr.find_all('td'):
        row = row + " " + tds.text
    table1 = table1 + row[1:]
print(table1)

M1A
 Not assigned
 Not assigned
M2A
 Not assigned
 Not assigned
M3A
 North York
 Parkwoods
M4A
 North York
 Victoria Village
M5A
 Downtown Toronto
 Regent Park, Harbourfront
M6A
 North York
 Lawrence Manor, Lawrence Heights
M7A
 Downtown Toronto
 Queen's Park, Ontario Provincial Government
M8A
 Not assigned
 Not assigned
M9A
 Etobicoke
 Islington Avenue, Humber Valley Village
M1B
 Scarborough
 Malvern, Rouge
M2B
 Not assigned
 Not assigned
M3B
 North York
 Don Mills
M4B
 East York
 Parkview Hill, Woodbine Gardens
M5B
 Downtown Toronto
 Garden District, Ryerson
M6B
 North York
 Glencairn
M7B
 Not assigned
 Not assigned
M8B
 Not assigned
 Not assigned
M9B
 Etobicoke
 West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
M1C
 Scarborough
 Rouge Hill, Port Union, Highland Creek
M2C
 Not assigned
 Not assigned
M3C
 North York
 Don Mills
M4C
 East York
 Woodbine Heights
M5C
 Downtown Toronto
 St. James Town
M6C
 York
 Humewood-Cedarvale
M7C
 Not assigned
 Not assigned
M8C
 N

#### Loading Table to CSV File

In [4]:
csv_file = open('toronto.csv', 'wb')
csv_file.write(bytes(table1,encoding="ascii",errors="ignore"))

7590

#### Converting Table Values to Data Frame

In [5]:
col_names = ["col1", "col2", "col3"]
df = pd.read_csv('toronto.csv', names=col_names)
df.columns = ['Postalcode', 'Borough', 'Neighbourhood']
df

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1A,,
1,Not assigned,,
2,Not assigned,,
3,M2A,,
4,Not assigned,,
5,Not assigned,,
6,M3A,,
7,North York,,
8,Parkwoods,,
9,M4A,,


In [6]:
for n in range(539):
    df['Borough'][n]=df['Postalcode'][n+1]

df.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1A,Not assigned,
1,Not assigned,Not assigned,
2,Not assigned,M2A,
3,M2A,Not assigned,
4,Not assigned,Not assigned,


In [7]:
for i in range(539):
    df['Neighbourhood'][i]=df['Borough'][i+1]

df.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,Not assigned,Not assigned,M2A
2,Not assigned,M2A,Not assigned
3,M2A,Not assigned,Not assigned
4,Not assigned,Not assigned,M3A


In [8]:
df_2=df.iloc[::3]
df_2.reset_index
df_2.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
3,M2A,Not assigned,Not assigned
6,M3A,North York,Parkwoods
9,M4A,North York,Victoria Village
12,M5A,Downtown Toronto,Regent Park


#### Deleting 'Not Assigned' Values

In [9]:

x=df_2[df_2['Borough']==' Not assigned'].index
x
df_2.drop(x,inplace=True)
df_2.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,Postalcode,Borough,Neighbourhood
6,M3A,North York,Parkwoods
9,M4A,North York,Victoria Village
12,M5A,Downtown Toronto,Regent Park
15,M6A,North York,Lawrence Manor
18,M7A,Downtown Toronto,Queen's Park
24,M9A,Etobicoke,Islington Avenue
27,M1B,Scarborough,Malvern
33,M3B,North York,Don Mills
36,M4B,East York,Parkview Hill
39,M5B,Downtown Toronto,Garden District


In [10]:
df_3=df_2.reset_index()
df_n=df_3.drop(['index'], axis=1)
df_n.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park
3,M6A,North York,Lawrence Manor
4,M7A,Downtown Toronto,Queen's Park


#### Using .shape method to find the number of rows of DF

In [11]:
print('Rows of DF,Columns of DF',df_n.shape)

Rows of DF,Columns of DF (103, 3)


### Dataframe with LAT, LNG Values

In [12]:
!wget -q -O 'Toronto_location.csv' https://cocl.us/Geospatial_data

df_loc = pd.read_csv('Toronto_location.csv')
df_loc.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Correcting the column names and checking the shape of the data frame

In [13]:
df_loc.columns=['Postalcode','Latitude','Longitude']
df_loc.shape


(103, 3)

#### Joining the data frames to get Lat & Long for the PCs

In [14]:
df_merg=pd.merge(df_n,df_loc,on='Postalcode')

In [15]:
df_merg.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
3,M6A,North York,Lawrence Manor,43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494


### Q3: __*Explore and cluster the neighborhoods in Toronto. You can decide to work with only boroughs that contain the word Toronto and then replicate the same analysis we did to the New York City data. It is up to you.*__

Just make sure:

1. to add enough Markdown cells to explain what you decided to do and to report any observations you make.
2. to generate maps to visualize your neighborhoods and how they cluster together.

In [16]:
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library

print('Folium installed')
print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2020.4.5.1         |   py36h9f0ad1d_0         151 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    ca-certificates-2020.4.5.1 |       hecc5488_0         146 KB  conda-forge
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.22.0               |     pyh9f0ad1d_0          63 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0           conda-forge
    geopy:          

In [17]:
CLIENT_ID = 'TQ0EQVSWEW1PZSVXLNC0CPJ1QEXZIYAAH1GGUVKU4IOWS4GP' # your Foursquare ID
CLIENT_SECRET = 'WOAR3RL51SFCKQZ0QZKQ2YVQIUHXXWLAABMAAOC5NZL4USGN' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: TQ0EQVSWEW1PZSVXLNC0CPJ1QEXZIYAAH1GGUVKU4IOWS4GP
CLIENT_SECRET:WOAR3RL51SFCKQZ0QZKQ2YVQIUHXXWLAABMAAOC5NZL4USGN


In [18]:
address = 'Toronto,ON'

geolocator = Nominatim(user_agent="Toronto")
location = geolocator.geocode(address)
latitude_tor = location.latitude
longitude_tor = location.longitude
print(latitude_tor, longitude_tor)

43.6534817 -79.3839347


In [19]:
Toronto_map = folium.Map(location=[latitude_tor, longitude_tor], zoom_start=13) # generate map centred around the Conrad Hotel
# add a red circle marker to represent the general Lat Long of Toronto
folium.features.CircleMarker(
    [latitude_tor, longitude_tor],
    radius=10,
    color='red',
    fill = True,
    fill_color = 'red',
    fill_opacity = 0.6
).add_to(Toronto_map)

# add the Italian restaurants as blue circle markers
for lat, lng, borough,Neighborhood in zip(df_merg['Latitude'], df_merg['Longitude'], df_merg['Borough'], df_merg['Neighbourhood']):
    label = '{}, {}'.format(Neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.features.CircleMarker(
        [lat, lng],
        radius=5,
        color='blue',
        popup=label,
        fill = True,
        fill_color='blue',
        fill_opacity=0.6
    ).add_to(Toronto_map)



# display map
Toronto_map

In [20]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
        nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [22]:
toronto_venues = getNearbyVenues(names=df_merg['Neighbourhood'],
                                   latitudes=df_merg['Latitude'],
                                   longitudes=df_merg['Longitude']
                                  )
toronto_venues.head()

 Parkwoods
 Victoria Village
 Regent Park
 Lawrence Manor
 Queen's Park
 Islington Avenue
 Malvern
 Don Mills
 Parkview Hill
 Garden District
 Glencairn
 West Deane Park
 Rouge Hill
 Don Mills
 Woodbine Heights
 St. James Town
 Humewood-Cedarvale
 Eringate
 Guildwood
 The Beaches
 Berczy Park
 Caledonia-Fairbanks
 Woburn
 Leaside
 Central Bay Street
 Christie
 Cedarbrae
 Hillcrest Village
 Bathurst Manor
 Thorncliffe Park
 Richmond
 Dufferin
 Scarborough Village
 Fairview
 Northwood Park
 East Toronto
 Harbourfront East
 Little Portugal
 Kennedy Park
 Bayview Village
 Downsview
 The Danforth West
 Toronto Dominion Centre
 Brockton
 Golden Mile
 York Mills
 Downsview
 India Bazaar
 Commerce Court
 North Park
 Humber Summit
 Cliffside
 Willowdale
 Downsview
 Studio District
 Bedford Park
 Del Ray
 Humberlea
 Birch Cliff
 Willowdale
 Downsview
 Lawrence Park
 Roselawn
 Runnymede
 Weston
 Dorset Park
 York Mills West
 Davisville North
 Forest Hill North & West
 High Park
 Westmount
 Wexfor

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,KFC,43.754387,-79.333021,Fast Food Restaurant
2,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop


### And now to find how many Tim Hortons there are in Toronto

In [182]:
X=toronto_venues.groupby('Venue').count().loc['Tim Hortons']
X


Neighborhood              27
Neighborhood Latitude     27
Neighborhood Longitude    27
Venue Latitude            27
Venue Longitude           27
Venue Category            27
Name: Tim Hortons, dtype: int64

#### Lets find locations of all Tim Hortons and put them on the map

In [172]:
T_H=toronto_venues[toronto_venues['Venue']=='Tim Hortons'].index
TH_Map=toronto_venues.loc[T_H]
TH_Map.reset_index()
TH_Map.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
4,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
41,Lawrence Manor,43.718518,-79.464763,Tim Hortons,43.719427,-79.467995,Coffee Shop
71,Queen's Park,43.662301,-79.389494,Tim Hortons,43.661038,-79.393797,Coffee Shop
75,Queen's Park,43.662301,-79.389494,Tim Hortons,43.659415,-79.391221,Coffee Shop
77,Queen's Park,43.662301,-79.389494,Tim Hortons,43.658175,-79.390681,Coffee Shop


In [181]:
Toronto_map = folium.Map(location=[latitude_tor, longitude_tor], zoom_start=13) 
# add a red circle marker to represent the Tim Hortons


for vlat,vlng,venue,hood in zip(TH_Map['Venue Latitude'],TH_Map['Venue Longitude'],TH_Map['Venue'],TH_Map['Neighborhood']):
        label = '{}, {}'.format(venue, hood)
        label = folium.Popup(label, parse_html=True)

        folium.features.CircleMarker(
       [vlat,vlng],
       radius=10,
       color='red',
       popup=label,
       fill = True,
       fill_color = 'red',
       fill_opacity = 0.6
).add_to(Toronto_map)

# add the Neighborhoods as blue circle markers
for lat, lng, borough,Neighborhood in zip(df_merg['Latitude'], df_merg['Longitude'], df_merg['Borough'], df_merg['Neighbourhood']):
    label = '{}, {}'.format(Neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.features.CircleMarker(
        [lat, lng],
        radius=5,
        color='blue',
        popup=label,
        fill = True,
        fill_color='blue',
        fill_opacity=0.6
    ).add_to(Toronto_map)



# display map
Toronto_map