## Segmenting and Clustering Neighborhoods in Toronto
### Peer-graded Assignment: Notebook for Github

In [2]:
#import data analysis tools
import pandas as pd
import numpy as np

#json tools
import json
from pandas.io.json import json_normalize

!conda install -c anaconda beautifulsoup4 -y

#import scraping tools
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup

#import visual tools
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium

!conda install -c conda-forge geopy -y

#import geocoder tools
from geopy.geocoders import Nominatim

#import kmean tools
from sklearn.cluster import KMeans



print('All tools imported')

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - beautifulsoup4


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.9.11          |           py36_0         154 KB  anaconda

The following packages will be UPDATED:

    certifi: 2019.9.11-py36_0  conda-forge --> 2019.9.11-py36_0 anaconda
    openssl: 1.1.1d-h516909a_0 conda-forge --> 1.1.1-h7b6447c_0 anaconda


Downloading and Extracting Packages
certifi-2019.9.11    | 154 KB    | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda b

## Load data from the Wikipedia page into a DataFrame

In [25]:
# Download the Wikipedia page and parse it with BeautifulSoup.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Read inside a context manager so the HTTP response is closed afterwards.
with urlopen(wiki_url) as response:
    url_raw = response.read().decode('utf-8')
info = BeautifulSoup(url_raw, 'html.parser')
# <tbody> of the first <table> found in the page body; the row helpers read from it.
info_table = info.body.table.tbody

In [53]:
#create functions to add scraped data to the info_table
# code based on info from https://beautiful-soup-4.readthedocs.io/en/latest/

def get_cell(x):
    """Return the text of every <td> cell found in a table row.

    Parameters
    ----------
    x : element exposing ``find_all`` (e.g. a BeautifulSoup ``<tr>`` tag)

    Returns
    -------
    list of str
        One entry per ``<td>``. When the cell contains a link with non-empty
        text, that link text is used as-is; otherwise the cell's own text,
        stripped of surrounding whitespace.
    """
    row = []
    # 'td' is where the data lives in the table, see "inspect" on the wiki page
    for cell in x.find_all('td'):
        link = cell.a
        if link and link.text:
            row.append(link.text)
            continue
        text = cell.string
        if text is None:
            # .string is None when the cell has zero or several children
            # (e.g. text plus a <br/>); fall back to the concatenated text
            # instead of raising AttributeError on None.strip().
            text = cell.get_text()
        row.append(text.strip())
    return row

def get_row(table=None):
    """Collect every three-cell row of the scraped table.

    Parameters
    ----------
    table : element exposing ``find_all``, optional
        Table (or table body) to read ``<tr>`` rows from. Defaults to the
        notebook-level ``info_table``. Previously this function read an
        undefined global named ``table`` and failed on a fresh kernel.

    Returns
    -------
    list of list of str
        One ``get_cell`` result per ``<tr>``, keeping only rows that have
        exactly three ``<td>`` cells.
    """
    if table is None:
        table = info_table
    parsed_rows = (get_cell(tr) for tr in table.find_all('tr'))
    # Rows without exactly three <td> cells are skipped.
    return [row for row in parsed_rows if len(row) == 3]

In [65]:
# Scrape the table rows and load them into a DataFrame with named columns
headers = ['PostalCode', 'Borough', 'Neighborhood']
data = get_row()
df = pd.DataFrame(data=data, columns=headers)
df.head(n=9)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned


In [67]:
# Drop rows whose borough is 'Not assigned'. If a cell has a borough but a
# 'Not assigned' neighborhood, then the neighborhood will be the same as the borough.
# .copy() makes df1 an independent frame, so the assignment below neither
# touches df nor triggers SettingWithCopyWarning.
df1 = df[df['Borough'] != 'Not assigned'].copy()
unnamed = df1['Neighborhood'] == 'Not assigned'
df1.loc[unnamed, 'Neighborhood'] = df1.loc[unnamed, 'Borough']

df1 = df1.sort_values(by=['PostalCode', 'Borough']).reset_index(drop=True)
df1.head(9)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,Rouge
1,M1B,Scarborough,Malvern
2,M1C,Scarborough,Highland Creek
3,M1C,Scarborough,Rouge Hill
4,M1C,Scarborough,Port Union
5,M1E,Scarborough,Guildwood
6,M1E,Scarborough,Morningside
7,M1E,Scarborough,West Hill
8,M1G,Scarborough,Woburn


In [69]:
#Organize the neighborhoods by their Postal Codes

# Unique postal codes in order of first appearance. The grouped frame below
# no longer needs this; the name is kept in case other cells refer to it.
df_postalcodes = df1['PostalCode'].drop_duplicates()

#realign dataframe
df1 = df1.reset_index(drop=True)

# One row per postal code: the borough of the last matching row and all of its
# neighborhoods joined with ','. sort=False keeps the postal codes in the order
# they appear in df1. This replaces the nested positional loops and the manual
# stripping of the leading comma.
df2 = (
    df1.groupby('PostalCode', sort=False, as_index=False)
       .agg({'Borough': 'last', 'Neighborhood': ','.join})
)

df2.head(9)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"


In [71]:
# Show (rows, columns) of the per-postal-code dataframe
df2.shape

(103, 3)

## Latitude and Longitude with Foursquare

In [80]:
#Load the Geospatial data from link
# Start the coordinate columns as missing floats rather than the string '0',
# so they keep a numeric dtype and an unfilled row is not mistaken for a real
# coordinate.
df2['Latitude'] = np.nan
df2['Longitude'] = np.nan
coordinates = pd.read_csv('http://cocl.us/Geospatial_data')

In [82]:
#merging dataframe that contain coordinates with the one that contains borough names
# The coordinates file is read positionally: postal code, latitude, longitude.
code_col, lat_col, lng_col = coordinates.columns[:3]
# Index by postal code for a direct lookup instead of a nested row-by-row scan.
# keep='last' mirrors the old loop, where a later duplicate code overwrote an
# earlier one.
coords_by_code = (
    coordinates.drop_duplicates(subset=code_col, keep='last')
               .set_index(code_col)
)
# Postal codes missing from the coordinates file end up as NaN.
df2['Latitude'] = df2['PostalCode'].map(coords_by_code[lat_col])
df2['Longitude'] = df2['PostalCode'].map(coords_by_code[lng_col])

#checking the results
df2.head(9)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.8067,-79.1944
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.7845,-79.1605
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.7636,-79.1887
3,M1G,Scarborough,Woburn,43.771,-79.2169
4,M1H,Scarborough,Cedarbrae,43.7731,-79.2395
5,M1J,Scarborough,Scarborough Village,43.7447,-79.2395
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.7279,-79.262
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.7111,-79.2846
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.7163,-79.2395


## Analysis of the Toronto Boroughs

In [98]:
# Keep only the rows whose borough is 'Downtown Toronto', renumbered from 0
is_downtown = df2['Borough'] == 'Downtown Toronto'
df_toronto = df2[is_downtown].reset_index(drop=True)
df_toronto.head(9)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,43.6796,-79.3775
1,M4X,Downtown Toronto,"Cabbagetown,St. James Town",43.668,-79.3677
2,M4Y,Downtown Toronto,Church and Wellesley,43.6659,-79.3832
3,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.6543,-79.3606
4,M5B,Downtown Toronto,"Ryerson,Garden District",43.6572,-79.3789
5,M5C,Downtown Toronto,St. James Town,43.6515,-79.3754
6,M5E,Downtown Toronto,Berczy Park,43.6448,-79.3733
7,M5G,Downtown Toronto,Central Bay Street,43.658,-79.3874
8,M5H,Downtown Toronto,"Adelaide,King,Richmond",43.6506,-79.3846


In [99]:
# Show (rows, columns) of the 'Downtown Toronto' subset
df_toronto.shape

(18, 5)

In [100]:
#get the location of the six, based off code from NY exercise
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on None.latitude.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('Toronto\'s central coordinates are {}, {}.'.format(latitude, longitude))

Toronto's central coordinates are 43.653963, -79.387207.


In [102]:
#map of the six, based off code from NY exercise
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12.5)

# One circle marker per row of df_toronto, with the neighborhood text as its popup
marker_rows = zip(
    df_toronto['Latitude'],
    df_toronto['Longitude'],
    df_toronto['Neighborhood'],
)
for lat, lng, hood in marker_rows:
    marker = folium.CircleMarker(
        [lat, lng],
        radius=7,
        popup=folium.Popup(hood, parse_html=True),
        color='red',
        fill=True,
        fill_color='#3166cc',
        fill_opacity=0.55,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto