# <center> Capstone Neighborhood Clustering.</center>

In [1]:
# importing the necessary libraries
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests

In [2]:
# URL of the Wikipedia page that is requested and parsed in the following cells
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [3]:
# Download the page and parse it, keeping only the first <table> whose class is "wikitable".
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were the article.
response.raise_for_status()
web = response.text
soup = bs(web, 'lxml')
wiki = soup.find("table", class_='wikitable')
if wiki is None:
    # soup.find returns None when nothing matches; stop here with a clear message
    # rather than failing later on wiki.find_all.
    raise ValueError('No table with class "wikitable" found at {}'.format(url))

In [4]:
# Walk every row (<tr>) of the table and collect the first text node of each data cell (<td>).
# strip() removes the surrounding whitespace/newlines of that text.
# The result is one flat list of cell values, in document order.
data_clean = []
for tr in wiki.find_all('tr'):
    for td in tr.find_all('td'):
        text_nodes = td.find_all(text=True)
        # An empty <td></td> has no text nodes; record '' for it instead of raising
        # IndexError, so every cell still contributes exactly one entry to the list.
        data_clean.append(text_nodes[0].strip() if text_nodes else '')

In [5]:
# data_clean is a flat list of cell values. To turn it into a DataFrame, regroup it into
# rows of 3 consecutive values each.
if len(data_clean) % 3 != 0:
    # A leftover partial row means the scraped cells are not laid out three per row;
    # report that clearly instead of failing with an IndexError part-way through.
    raise ValueError(
        'Expected a multiple of 3 scraped cells, got {}'.format(len(data_clean))
    )
data_final = [data_clean[i:i + 3] for i in range(0, len(data_clean), 3)]

In [6]:
# Convert the list of rows into a DataFrame (columns are auto-numbered until named in the next cell)
data_fr=pd.DataFrame(data_final)

In [7]:
# Name the three columns; the order must match the order of the values in each scraped row
data_fr.columns = ['PostalCode', 'Borough', 'Neighborhood']

In [8]:
# Preview the first 10 rows of the raw table to check the scrape so far
data_fr.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Not assigned


In [9]:
# Keep only the rows whose Borough is assigned.
# .copy() makes the result an independent frame, so later cells can modify it without
# pandas' SettingWithCopyWarning or any ambiguity about writing to a view of the original.
data_fr = data_fr.loc[data_fr['Borough'] != 'Not assigned'].copy()

In [10]:
# Where Neighborhood is 'Not assigned', use that row's Borough instead.
# Series.mask swaps in the Borough value row by row wherever the condition is True, and the
# result is assigned back explicitly. (Series.replace with a Series as the replacement value,
# applied with inplace=True to a column, is not reliable across pandas versions.)
not_assigned = data_fr['Neighborhood'] == 'Not assigned'
data_fr = data_fr.assign(
    Neighborhood=data_fr['Neighborhood'].mask(not_assigned, data_fr['Borough'])
)
   

In [11]:
# Check the result: the row whose Borough is Queen's Park no longer shows 'Not assigned'
# in the Neighborhood column; it now carries the Borough value.
data_fr.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Queen's Park
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


In [12]:
# Collapse the table to one row per (PostalCode, Borough) pair: the neighborhoods that
# share a pair are concatenated into a single comma-separated string.
group_keys = ['PostalCode', 'Borough']
asd = data_fr.groupby(group_keys).agg({'Neighborhood': ', '.join})

In [13]:
# Reset the index so the group keys become regular columns again; df is the frame used from here on.
df = asd.reset_index()

In [14]:
# Preview the grouped table: one row per postal code, neighborhoods joined by commas
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [15]:
# Final shape of the dataframe as (rows, columns)
df.shape

(103, 3)

## Part 2

In [16]:
# Download the file behind the URL and save it locally as geodata.csv (-O sets the output name)
!wget -O geodata.csv https://cocl.us/Geospatial_data

--2020-01-18 05:48:51--  https://cocl.us/Geospatial_data
Resolving cocl.us (cocl.us)... 169.48.113.194, 158.85.108.86, 158.85.108.83
Connecting to cocl.us (cocl.us)|169.48.113.194|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2020-01-18 05:48:52--  https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.box.com (ibm.box.com)... 107.152.26.197, 107.152.27.197
Connecting to ibm.box.com (ibm.box.com)|107.152.26.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2020-01-18 05:48:52--  https://ibm.box.com/public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Reusing existing connection to ibm.box.com:443.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.ent.box.com/public/static/9afzr83pps4pwf2s

In [17]:
# Load the downloaded CSV into a DataFrame
geo = pd.read_csv('geodata.csv')

In [18]:
# Preview the neighborhood table before combining it with the coordinates
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [19]:
# Preview the coordinates table; note its key column is named 'Postal Code' (with a space)
geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [20]:
# Join the coordinates onto the neighborhood table by postal code.
# Matching on the key columns (rather than on row position) keeps each row paired with its
# own coordinates even if the two tables are ordered differently or do not contain exactly
# the same set of codes. validate='one_to_one' raises if a code is duplicated on either side.
df2 = pd.merge(
    df,
    geo,
    left_on='PostalCode',
    right_on='Postal Code',
    validate='one_to_one',
)

In [21]:
# Drop the 'Postal Code' column that came from the coordinates table (it duplicates PostalCode).
# Reassigning instead of using inplace=True, together with errors='ignore', makes this cell
# safe to re-run: a second run finds the column already gone instead of raising KeyError.
df2 = df2.drop(columns=['Postal Code'], errors='ignore')

In [22]:
# Preview the merged table: each row now carries Latitude and Longitude
df2.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [23]:
# Shape of the merged table as (rows, columns)
df2.shape

(103, 5)

## Part 3

In [24]:
#!conda install -c conda-forge folium=0.5.0 --yes
import numpy as np 
from geopy.geocoders import Nominatim 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium 


In [25]:
# Look up the coordinates of Toronto with Nominatim
# (same approach as in the NYC Neighborhood class).

address = 'Toronto, CA'

geolocator = Nominatim(user_agent="Not")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match; stop with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto, CA are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto, CA are 43.653963, -79.387207.


In [26]:
# Drop the PostalCode column.
# Reassigning instead of using inplace=True, together with errors='ignore', makes this cell
# safe to re-run: a second run finds the column already gone instead of raising KeyError.
df2 = df2.drop(columns=['PostalCode'], errors='ignore')

In [27]:
# Create a map centred on the latitude/longitude obtained from the geocoder.
map_canada = folium.Map(zoom_start=15, location=[latitude, longitude])

# Settings shared by every marker drawn on this map.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# Add one circle marker per row of df2; its popup text is "<neighborhood>, <borough>".
rows = zip(df2['Latitude'], df2['Longitude'], df2['Borough'], df2['Neighborhood'])
for point_lat, point_lng, borough_name, neighborhood_name in rows:
    popup_text = '{}, {}'.format(neighborhood_name, borough_name)
    marker = folium.CircleMarker(
        [point_lat, point_lng],
        popup=folium.Popup(popup_text, parse_html=True),
        **marker_style
    )
    marker.add_to(map_canada)

In [28]:
# Display the map with all the markers
map_canada

In [29]:
# The analysis is restricted to Downtown Toronto: select those rows only and renumber
# them from 0.
is_downtown = df2['Borough'].eq('Downtown Toronto')
df_tor = df2.loc[is_downtown].reset_index(drop=True)

In [30]:
# Preview the Downtown Toronto subset
df_tor.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Downtown Toronto,Rosedale,43.679563,-79.377529
1,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675
2,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
3,Downtown Toronto,Harbourfront,43.65426,-79.360636
4,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


In [31]:
# Look up the coordinates of Downtown Toronto; they replace the city-wide latitude/longitude.

address = 'Downtown Toronto, CA'

geolocator = Nominatim(user_agent="Not")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match; stop with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
# Report the address that was actually geocoded (the message used to say 'Toronto, CA').
print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))


The geograpical coordinate of Toronto, CA are 43.6563221, -79.3809161.


In [35]:
# Create a map centred on the latitude/longitude obtained from the geocoder.
map_tor = folium.Map(zoom_start=14, location=[latitude, longitude])

# Settings shared by every marker drawn on this map.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# Add one circle marker per row of df_tor; its popup text is the neighborhood name.
rows = zip(df_tor['Latitude'], df_tor['Longitude'], df_tor['Neighborhood'])
for point_lat, point_lng, neighborhood_name in rows:
    popup_text = '{}'.format(neighborhood_name)
    marker = folium.CircleMarker(
        [point_lat, point_lng],
        popup=folium.Popup(popup_text, parse_html=True),
        **marker_style
    )
    marker.add_to(map_tor)

In [36]:
# Display the Downtown Toronto map
map_tor