In [5]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

#Libraries for scrapping 
!conda install -c conda-forge butifulsoup4 
from bs4 import BeautifulSoup

!conda install -c conda-forge lxml

!conda install -c conda-forge html5lib

print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.

Solving environment: failed

PackagesNotFoundError: The following packages are not available from current channels:

  - butifulsoup4

Current channels:

  - https://conda.anaconda.org/conda-forge/linux-64
  - https://conda.anaconda.org/conda-forge/noarch
  - https://repo.anaconda.com/pkgs/main/linux-64
  - https://repo.anaconda.com/pkgs/main/noarch

To search for alternate channels that may provide the conda package you're
looking for, navigate to

    https://anaconda.org

and use the search bar at the top of the page.


Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.

Libraries imported.


# Scrapping Data

In [6]:
# getting HTML as a text
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source, 'lxml')
#print(soup.prettify())

In [7]:
# Our required data is @ table class="wikitable sortable jquery-tablesorter"

table = soup.find('table', class_='wikitable sortable')
#print(table.prettify())

# Tranform the data into a pandas dataframe                                          
The next task is essentially transforming this data of nested Python dictionaries into a pandas dataframe.

In [8]:
table_rows = table.find_all('tr')
res = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text.strip() for tr in td if tr.text.strip()]
    if row:
        res.append(row)

neighborhoods = pd.DataFrame(res, columns=['Postcode','Borough','Neighbourhood'])


# Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned

In [9]:
neighborhoods['Borough'].replace('Not assigned', np.nan, inplace=True)
neighborhoods.dropna(subset=['Borough'], inplace=True)

neighborhoods.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Etobicoke,Islington Avenue
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


# Aggregate data

In [10]:
neighborhoods = neighborhoods.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(', '.join).reset_index()
neighborhoods.columns = ['Postcode', 'Borough', 'Neighbourhood']

In [11]:
neighborhoods[neighborhoods.Neighbourhood == 'Not assigned']

Unnamed: 0,Postcode,Borough,Neighbourhood


In [12]:
neighborhoods['Neighbourhood'].replace('Not assigned', "Queen's Park", inplace=True)
neighborhoods.tail(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
93,M9A,Etobicoke,Islington Avenue
94,M9B,Etobicoke,"Cloverdale, Islington, Martin Grove, Princess ..."
95,M9C,Etobicoke,"Bloordale Gardens, Eringate, Markland Wood, Ol..."
96,M9L,North York,Humber Summit
97,M9M,North York,"Emery, Humberlea"
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."
102,M9W,Etobicoke,Northwest


In [13]:
neighborhoods.shape

(103, 3)

# Adding geograpical coordinate

In [14]:
df_geo = pd.read_csv('http://cocl.us/Geospatial_data')
df_geo.columns = ['Postcode', 'Latitude', 'Longitude']


In [16]:
df_pos = pd.merge(neighborhoods, df_geo, on=['Postcode'], how='inner')

df_tor = df_pos[['Borough', 'Neighbourhood', 'Postcode', 'Latitude', 'Longitude']].copy()

df_tor.head()

Unnamed: 0,Borough,Neighbourhood,Postcode,Latitude,Longitude
0,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
1,Scarborough,"Highland Creek, Rouge Hill, Port Union",M1C,43.784535,-79.160497
2,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,Scarborough,Woburn,M1G,43.770992,-79.216917
4,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


In [17]:
address = 'Toronto, Canada'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of the City of Toronto are {}, {}.'.format(latitude, longitude))

  app.launch_new_instance()


GeocoderTimedOut: Service timed out

# Toronto map

In [21]:
#create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_tor['Latitude'], df_tor['Longitude'], df_tor['Borough'], df_tor['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

# Exploration

Define Foursquare Credentials and Version

In [18]:
CLIENT_ID = 'COKXQ55XOFJU2RNGZIHQN3IZLFMXXGNKQHOIWOKMY0GGGJTU' # your Foursquare ID
CLIENT_SECRET = 'PXYXXMYW20YTWZYSHKF3VVWXTZAFN3BGA0PRJPBSKIB3URQN' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)


Your credentails:
CLIENT_ID: COKXQ55XOFJU2RNGZIHQN3IZLFMXXGNKQHOIWOKMY0GGGJTU
CLIENT_SECRET:PXYXXMYW20YTWZYSHKF3VVWXTZAFN3BGA0PRJPBSKIB3URQN


In [19]:
df_t4 = df_tor[df_tor['Borough'].str.contains('Toronto')]

to_data = df_t4.reset_index(drop=True)
to_data

Unnamed: 0,Borough,Neighbourhood,Postcode,Latitude,Longitude
0,East Toronto,The Beaches,M4E,43.676357,-79.293031
1,East Toronto,"The Danforth West, Riverdale",M4K,43.679557,-79.352188
2,East Toronto,"The Beaches West, India Bazaar",M4L,43.668999,-79.315572
3,East Toronto,Studio District,M4M,43.659526,-79.340923
4,Central Toronto,Lawrence Park,M4N,43.72802,-79.38879
5,Central Toronto,Davisville North,M4P,43.712751,-79.390197
6,Central Toronto,North Toronto West,M4R,43.715383,-79.405678
7,Central Toronto,Davisville,M4S,43.704324,-79.38879
8,Central Toronto,"Moore Park, Summerhill East",M4T,43.689574,-79.38316
9,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",M4V,43.686412,-79.400049


# Explore the first neighbourhood

In [20]:
to_data.loc[0, 'Neighbourhood']

'The Beaches'

let's grab the neighbourhood long and lat values

In [21]:
neighbourhood_latitude = to_data.loc[0, 'Latitude'] # neighbourhood latitude value
neighbourhood_longitude = to_data.loc[0, 'Longitude'] # neighbourhood longitude value

neighbourhood_name = to_data.loc[0, 'Neighbourhood'] # neighbourhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))

Latitude and longitude values of The Beaches are 43.67635739999999, -79.2930312.


In [22]:
LIMIT = 100
radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighbourhood_latitude, 
    neighbourhood_longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=COKXQ55XOFJU2RNGZIHQN3IZLFMXXGNKQHOIWOKMY0GGGJTU&client_secret=PXYXXMYW20YTWZYSHKF3VVWXTZAFN3BGA0PRJPBSKIB3URQN&v=20180605&ll=43.67635739999999,-79.2930312&radius=500&limit=100'

In [23]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5e638c6b60ba080028b5a38d'},
 'response': {'headerLocation': 'The Beaches',
  'headerFullLocation': 'The Beaches, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 5,
  'suggestedBounds': {'ne': {'lat': 43.680857404499996,
    'lng': -79.28682091449052},
   'sw': {'lat': 43.67185739549999, 'lng': -79.29924148550948}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bd461bc77b29c74a07d9282',
       'name': 'Glen Manor Ravine',
       'location': {'address': 'Glen Manor',
        'crossStreet': 'Queen St.',
        'lat': 43.67682094413784,
        'lng': -79.29394208780985,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.67682094413784,
          'lng': -79.29394208780985}],
        'distanc

In [24]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

# Clean json and structure into a pandas dataframe

In [25]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Glen Manor Ravine,Trail,43.676821,-79.293942
1,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
2,Grover Pub and Grub,Pub,43.679181,-79.297215
3,Upper Beaches,Neighborhood,43.680563,-79.292869
4,Seaspray Restaurant,Asian Restaurant,43.678888,-79.298167


# and how many venus were returned from Foursquare?

In [26]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

5 venues were returned by Foursquare.
