# First question on scraping from a Wikipedia page (1/3)

In [1]:
import numpy as np # library to handle data in a vectorized manner
import json # library to handle JSON files
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests
import urllib.request #library to handle url requests
from bs4 import BeautifulSoup as bs #Library to handle data from html source
import pandas as pd #Library to handle data and modify dataframes
from pandas import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print('Libraries imported correctly')

Libraries imported correctly


In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M' #Definition of the URL to be scraped
url

'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [3]:
scrap_page = urllib.request.urlopen(url) #Put the HTML into the variable scrap_page

In [4]:
beauty = bs(scrap_page, 'lxml') #Parse the data from the variable scrap_page (HTML data) into bs format tree 

In [5]:
#print(beauty.prettify())

In [6]:
tables = beauty.find_all('table') #This find all the table instances in the HTML and put them into the variable tables

Let's loop through all the rows to get the data we want. Let's create three empty lists as the number of rows.
Then we'll loop though every single row. The findAll function built into BS to look for the stings tr and td.
If the the length corresponds then it will copy the data into the relative variable.

In [7]:
P = []
B = []
N = []

for row in beauty.find_all('tr'):
    c = row.find_all('td')
    if len(c)==3:
        P.append(c[0].find(text=True))
        B.append(c[1].find(text=True))
        N.append(c[2].find(text=True))

In [8]:
df = pd.DataFrame(P, columns=['Postal Code']) #Create a dataset and set the colunm Postal Code as label axis
df['Borough'] = B                             #Parse the data from B to the column of the dataset named Borough
df['Neighborhood'] = N                        #Parse the data from N to the column of the dataset named Neighborhood
df.replace('\n', '', regex=True, inplace=True)#Replace the dataset from the HTML string '\n' with nothing
df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [9]:
#index_names = df[df['Borough'] == 'Not assigned']
df.drop(df[df['Borough'] == 'Not assigned'].index, axis = 0, inplace=True) #Drop all the rows containing 'Not assigned' in the colunm Borough
df.reset_index(drop=True) #reset indexes as default

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [10]:
df[df['Neighborhood'].str.contains('Not assigned', regex=False)] #Check for 'Not assigned' string in Neighborhood column

Unnamed: 0,Postal Code,Borough,Neighborhood


## Implicit suppositions
I supposed that every column containing a string 'Not assigned' had to be discarded and that automatically every 
cell having the same Borough but different Neighborhood was merged into the same one. I cleaned the database from 
HTML codes and not assigned values and the dimension if the dataset is given in the next cell.

In [11]:
df.shape

(103, 3)

# Second question: merge csv file into the preceding dataframe (2/3)

In [12]:
db_tomerge = pd.read_csv('Geospatial_Coordinates.csv') #opening and transforming the csv file into a pandas dataframe

In [13]:
df = df.merge(db_tomerge, on = 'Postal Code') #merge the csv file in the dataframe by Postal Code and set it to df
df.head(12)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


# Third question: Explore and Cluster Toronto (3/3)
My choice is to work with only boroughs that contain the word Toronto. 

In [15]:
# Let's get Toronto's coordinate using Geopy
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [19]:
# Let's then create map of Toronto using latitude and longitude values
map_to = folium.Map(location=[latitude, longitude], zoom_start=10)

# Markers
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_to)  
    
map_to

In [20]:
# filter borough names that contain the word Toronto
names = list(df.Borough.unique())

just_toronto = []

for i in names:
    if "toronto" in i.lower():
        just_toronto.append(i)
        
just_toronto

['Downtown Toronto', 'East Toronto', 'West Toronto', 'Central Toronto']

In [21]:
# create a new DataFrame with only boroughs that contain the word Toronto
df = df[df['Borough'].isin(just_toronto)].reset_index(drop=True)
print(df.shape)
df.head()

(39, 5)


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [22]:
# Let's create a map of Toronto using the restricted dataset just created latitude and longitude values
map_to = folium.Map(location=[latitude, longitude], zoom_start=10)

# Markers
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_to)  
    
map_to

## Exploration using Foursqaure APIs

In [34]:
# define Foursquare Credentials and Version
CLIENT_ID = '' # your Foursquare ID
CLIENT_SECRET = '' # your Foursquare Secret
ACCESS_TOKEN = '' # your FourSquare Access Token
VERSION = '20200118' 
LIMIT = 100

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 
CLIENT_SECRET:


We now begin exploring the top one hundred venues in a radius of five hundred meters.

In [31]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [32]:
# convert the venues list into a new DataFrame
venues_df = pd.DataFrame(venues_list)

# define the column names
venues_df.columns = ['PostalCode', 'Borough', 'Neighborhood', 'BoroughLatitude', 'BoroughLongitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

print(venues_df.shape)
venues_df.head()

NameError: name 'venues_list' is not defined

I'll sort this out one day when I have some more time. I'm done for now.