# Segmenting and clustering neighbourhoods in Toronto
### (Capstone project course on Coursera)

# Part 1

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup as bsoup

### URL of the Wikipedia page listing Toronto postal codes:

In [2]:
# Wikipedia page listing the Canadian postal codes that start with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [3]:
# Download the page. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() stops the notebook on an HTTP
# error instead of letting an error page be parsed as if it were the table.
page_req = requests.get(url, timeout=30)
page_req.raise_for_status()

### Reading the HTML page and extracting the table from it:

In [4]:
# Build a parse tree from the response body, then take the first <table>
# element found in the document.
page = bsoup(markup=page_req.text, features="html.parser")
pgtable = page.find("table")

In [5]:
# Collect every <tr> element of the table and count them.
results = pgtable.find_all("tr")
nrows = len(results)
nrows  # shown as the cell output

181

In [6]:
# Split the first row's text on newlines and keep the entries at positions
# 1, 3 and 5, which hold the three column titles.
header_cells = results[0].text.split("\n")
headers = [header_cells[position] for position in (1, 3, 5)]
headers

['Postal Code', 'Borough', 'Neighbourhood']

### Creating a dataframe from the table on the wiki page:

In [7]:
# Each row's text is split on newlines; the postal code, borough and
# neighbourhood are read from positions 1, 3 and 5 of the pieces.
# Iterating over results[1:] skips the first (header) row without a manual
# counter, and the comprehension keeps per-row scratch values out of the
# notebook's global namespace.
records = [
    (cells[1], cells[3], cells[5])
    for cells in (row.text.split("\n") for row in results[1:])
]

df = pd.DataFrame(records, columns = ["Postalcode", "Borough", "Neighbourhood"])
print(df.head(), "\n", df.shape)

  Postalcode           Borough              Neighbourhood
0        M1A      Not assigned               Not assigned
1        M2A      Not assigned               Not assigned
2        M3A        North York                  Parkwoods
3        M4A        North York           Victoria Village
4        M5A  Downtown Toronto  Regent Park, Harbourfront 
 (180, 3)


In [8]:
# Show the last rows of df.
df.tail()

Unnamed: 0,Postalcode,Borough,Neighbourhood
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."
179,M9Z,Not assigned,Not assigned


### Dropping rows for which borough is not assigned:

In [9]:
# Keep only the rows whose Borough text does not contain "Not assigned",
# then renumber the remaining rows from 0.
has_borough = ~df["Borough"].str.contains("Not assigned")
df1 = df[has_borough].reset_index(drop=True)
df1.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### For rows where the neighbourhood is not assigned, use the borough name as the neighbourhood:

In [10]:
# Wherever the neighbourhood is the literal 'Not assigned', overwrite it with
# the borough from the same row.
missing_nhood = df1['Neighbourhood'] == 'Not assigned'
df1.loc[missing_nhood, 'Neighbourhood'] = df1.loc[missing_nhood, 'Borough']
df1.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [11]:
# Number of distinct postal codes; compare with the row count to spot duplicates.
df1['Postalcode'].nunique()

103

The number of unique postal codes (103) equals the number of rows in the dataframe, so there are no repeated postal codes.

In [12]:
# Preview the first 15 rows of df1.
df1.head(15)

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [13]:
# (rows, columns) of df1.
df1.shape

(103, 3)

------End of part 1------

# Part 2: Adding latitude and longitude to the dataframe

In [14]:
# Load the geospatial CSV directly from its URL and preview the first rows.
df_pcodes = pd.read_csv("http://cocl.us/Geospatial_data")
df_pcodes.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [15]:
# Rename the key column by name so it matches df1's 'Postalcode'. Assigning a
# whole new list to .columns relabels by position and would silently mislabel
# the data if the CSV's column order ever changed; rename() only touches the
# column it names and is safe to re-run.
df_pcodes = df_pcodes.rename(columns={'Postal Code': 'Postalcode'})
df_pcodes.head()

Unnamed: 0,Postalcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [16]:
# Left join: keep every row of df1, in df1's order, and attach the coordinates
# by postal code. A right join is driven by the geospatial file instead: any
# code present only in that file would yield a row with no borough or
# neighbourhood, and pandas documents a right join as following the right
# frame's key order rather than df1's.
df_lat_lang = pd.merge(df1, df_pcodes, how='left', on='Postalcode')
df_lat_lang.head(15)

Unnamed: 0,Postalcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [17]:
# (rows, columns) of the merged dataframe.
df_lat_lang.shape

(103, 5)

------ End of part 2 ------

# Part 3:

In [18]:
import json
from geopy.geocoders import Nominatim


In [19]:
#!conda install -c conda-forge folium=0.5.0 --yes
import folium

In [20]:
# Look up the coordinates of Toronto, which are used to centre the map.
address = "Toronto city, CA"

geolocator = Nominatim(user_agent = "ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError("Could not geocode address: {!r}".format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.6534817, -79.3839347.


In [21]:
# Base map centred on the coordinates geocoded for Toronto.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per row, with a "neighbourhood,borough" popup.
marker_rows = zip(
    df_lat_lang["Latitude"],
    df_lat_lang["Longitude"],
    df_lat_lang["Borough"],
    df_lat_lang["Neighbourhood"],
)
for lat, lng, borough, neighbourhood in marker_rows:
    label = folium.Popup('{},{}'.format(neighbourhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color="blue",
        fill=True,
        fill_color="#3186cc",
        fill_opacity=0.7,
        # NOTE(review): passed through unchanged from the original call;
        # confirm that CircleMarker actually uses this keyword.
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

In [28]:
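# Foursquare credentials (not included in this notebook; supply them here, preferably read from environment variables rather than hardcoded)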

In [23]:
# Name of the neighbourhood in the row labelled 0.
df_lat_lang.loc[0, 'Neighbourhood']

'Parkwoods'

In [24]:
# Read the name and coordinates from the row labelled 0 in one pass.
neighbourhood_name, neighbourhood_latitude, neighbourhood_longitude = (
    df_lat_lang.loc[0, column]
    for column in ('Neighbourhood', 'Latitude', 'Longitude')
)

summary = "Latitude and longitude of {} are {}, {}".format(
    neighbourhood_name, neighbourhood_latitude, neighbourhood_longitude
)
print(summary)

Latitude and longitude of Parkwoods are 43.7532586, -79.3296565


In [27]:
# URL: build the request URL here and assign it to `json_url`, which the next request cell reads

In [26]:
# Fetch the URL held in json_url (expected to be defined by the URL cell) and
# decode the JSON body. The timeout prevents an indefinite hang, and
# raise_for_status() surfaces an HTTP error explicitly instead of passing an
# error payload on as if it were a normal result.
venues_response = requests.get(json_url, timeout=30)
venues_response.raise_for_status()
json_req = venues_response.json()
json_req

{'meta': {'code': 200, 'requestId': '5fa633bc841fee376f0c2ced'},
  'headerLocation': 'Parkwoods - Donalda',
  'headerFullLocation': 'Parkwoods - Donalda, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 2,
  'suggestedBounds': {'ne': {'lat': 43.757758604500005,
    'lng': -79.32343823984928},
   'sw': {'lat': 43.7487585955, 'lng': -79.33587476015072}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4e8d9dcdd5fbbbb6b3003c7b',
       'name': 'Brookbanks Park',
       'location': {'address': 'Toronto',
        'lat': 43.751976046055574,
        'lng': -79.33214044722958,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.751976046055574,
          'lng': -79.33214044722958}],
        'distance': 245,
        'cc': 'CA',
        'c