<a href="https://colab.research.google.com/github/MaguireMaName/Coursera_Capstone/blob/master/Machine_Learning_w_Python_Segment_%26_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#!pip install geocoder
#!pip install requests

In [0]:
# bring in dependencies 
import geocoder
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests as rq
import folium
from geopy.geocoders import Nominatim

## Machine Learning with Python: Dataframe of postal code, neighborhood, & borough
*For the Applied Data Science Capstone Project*

In [6]:
# define url for scraping and print

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
print(url)

https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M


In [0]:
response = rq.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

#print(soup)

In [0]:
table = soup.find('table', {'class':'wikitable sortable'}).tbody
#print(table)

rows = table.find_all('tr')

columns = [v.text.replace('\n','') for v in rows[0].find_all('th')]

df_a = pd.DataFrame(columns=columns)

for i in range(1, len(rows)):
    tds = rows[i].find_all('td')
    
    if len(tds) ==4:
      values = [tds[0].text, tds[1].text,'', tds[2].text, tds[3].text.replace('\n','').replace('\xa0','')]
    else:
      values = [td.text.replace('\n','').replace('\xa0','') for td in tds]
    
    df_a = df_a.append(pd.Series(values, index=columns), ignore_index=True)


In [9]:
# dimensions before aggregation

df_a.shape

(288, 3)

In [0]:
# aggregate data

df_b = df_a.groupby(['Postcode','Borough']).agg(lambda x: x.tolist()).reset_index()

In [12]:
df_b.shape

(180, 3)

In [13]:
# check results

df_b.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,[Not assigned]
1,M1B,Scarborough,"[Rouge, Malvern]"
2,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]"
3,M1E,Scarborough,"[Guildwood, Morningside, West Hill]"
4,M1G,Scarborough,[Woburn]


In [0]:
# where neighbourhood is not assigned, replace it with borough

df_b['Neighbourhood'] = np.where(df_b['Neighbourhood'] == "Not assigned", df_b['Borough'], df_b['Neighbourhood'])


In [15]:
# exception table

x_neighbourhood = df_b.loc[(df_b['Neighbourhood'] == "Not assigned")]
x_neighbourhood.shape

(0, 3)

In [16]:
# exception table

x_borough = df_b.loc[(df_b['Borough'] == "Not assigned")]
x_borough.shape

(77, 3)

In [0]:
# 77 boroughs not assigned. Don't process obs. where borough = 'Not assigned'

df_c = df_b.drop(df_b[df_b.Borough == "Not assigned"].index)


In [19]:
# exception table

x_borough = df_c.loc[(df_b['Borough'] == "Not assigned")]
x_borough.shape

(0, 3)

In [20]:
# dimensions after aggregation

df_c.shape

(103, 3)

In [0]:
# load in lat and lon info

df_geo = pd.read_csv('Geospatial_Coordinates.csv')

In [23]:
# check geo load

df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [0]:
df_geo.rename(columns = {"Postal Code": "Postcode"}, 
                                 inplace = True) 

In [0]:
# join geo with neighbourhood data

df_d = pd.merge(df_geo, df_c, on='Postcode')

In [26]:
# check dimensions

df_d.shape

(103, 5)

In [34]:
# check data

df_d.head()

Unnamed: 0,Postcode,Latitude,Longitude,Borough,Neighbourhood
98,M9N,43.706876,-79.518188,York,[Weston]
99,M9P,43.696319,-79.532242,Etobicoke,[Westmount]
100,M9R,43.688905,-79.554724,Etobicoke,"[Kingsview Village, Martin Grove Gardens, Rich..."
101,M9V,43.739416,-79.588437,Etobicoke,"[Albion Gardens, Beaumond Heights, Humbergate,..."
102,M9W,43.706748,-79.594054,Etobicoke,[Northwest]


In [59]:
# will explore features of neighbourhood in Toronto only

df_e = df_d[df_d['Borough'].str.contains("Toronto")].reset_index(drop=True)
df_e.head()

Unnamed: 0,Postcode,Latitude,Longitude,Borough,Neighbourhood
0,M4E,43.676357,-79.293031,East Toronto,[The Beaches]
1,M4K,43.679557,-79.352188,East Toronto,"[The Danforth West, Riverdale]"
2,M4L,43.668999,-79.315572,East Toronto,"[The Beaches West, India Bazaar]"
3,M4M,43.659526,-79.340923,East Toronto,[Studio District]
4,M4N,43.72802,-79.38879,Central Toronto,[Lawrence Park]


In [70]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

# create map of Manhattan using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df_e['Latitude'], df_e['Longitude'], df_e['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='green',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [0]:
# define Foursquare Credentials and Version

CLIENT_ID = 'KL5SVGOS40RKZBQK4G1VXYBKBICWCDQL2NMCASHFYER432SS'
' # your Foursquare ID'
CLIENT_SECRET = '1A5KPYJQIATH0SDZXPPZ5YK0SHLBYVEGPER5AAIIMDXLZ0AB' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

In [0]:
# let's create a function to repeat the same process to all the neighborhoods in toronto

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [73]:
#### run the above function on each neighborhood and create a new dataframe called toronto_venues

toronto_venues = getNearbyVenues(names=df_e['Neighbourhood'],
                                   latitudes=df_e['Latitude'],
                                   longitudes=df_e['Longitude']
                                  )

['The Beaches']
['The Danforth West', 'Riverdale']
['The Beaches West', 'India Bazaar']
['Studio District']
['Lawrence Park']
['Davisville North']
['North Toronto West']
['Davisville']
['Moore Park', 'Summerhill East']
['Deer Park', 'Forest Hill SE', 'Rathnelly', 'South Hill', 'Summerhill West']
['Rosedale']
['Cabbagetown', 'St. James Town']
['Church and Wellesley']
['Harbourfront', 'Regent Park']
['Ryerson', 'Garden District']
['St. James Town']
['Berczy Park']
['Central Bay Street']
['Adelaide', 'King', 'Richmond']
['Harbourfront East', 'Toronto Islands', 'Union Station']
['Design Exchange', 'Toronto Dominion Centre']
['Commerce Court', 'Victoria Hotel']
['Roselawn']
['Forest Hill North', 'Forest Hill West']
['The Annex', 'North Midtown', 'Yorkville']
['Harbord', 'University of Toronto']
['Chinatown', 'Grange Park', 'Kensington Market']
['CN Tower', 'Bathurst Quay', 'Island airport', 'Harbourfront West', 'King and Spadina', 'Railway Lands', 'South Niagara']
['Stn A PO Boxes 25 The Es

In [76]:
# check dimensions and data

print(toronto_venues.shape)
toronto_venues.head()

(826, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,[The Beaches],43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
1,[The Beaches],43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
2,[The Beaches],43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
3,"[The Danforth West, Riverdale]",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant
4,"[The Danforth West, Riverdale]",43.679557,-79.352188,Dolce Gelato,43.677773,-79.351187,Ice Cream Shop
