# Import Libraries

In [3]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

# Get Data from WikiPedia and use BeautifulSoup

In [4]:
Wikilink='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
WikiPage= requests.get(Wikilink).text

# using beautiful soup to parse the HTML/XML codes.
soup = BeautifulSoup(WikiPage,'xml')
#print(soup.prettify())

# Get Data from WikiPedia and use BeautifulSoup

In [5]:


url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

text_result = requests.get(url).text #get the entire html of the article as a str
html_parsed_result = BeautifulSoup(text_result, 'html.parser') #transform the text to html

neighborhood_table = html_parsed_result.find('table', class_ = 'wikitable')
neighborhood_table_rows = neighborhood_table.find_all('tr')

# extract the info ('Postcode', 'Borough', 'Neighbourhood') from the table
neighborhood_info = []
for row in neighborhood_table_rows:
    info = row.text.split('\n')[1:-1] 
    neighborhood_info.append(info)
    
neighborhood_info[0:10]
       

[['Postcode', 'Borough', 'Neighbourhood'],
 ['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M5A', 'Downtown Toronto', 'Regent Park'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor'],
 ['M7A', "Queen's Park", 'Not assigned']]

# Create Neighbourhood DataFrame 

In [10]:
neighborhood_info[0][-1] = 'Neighborhood'
neighborhood_df = pd.DataFrame(neighborhood_info[1:], columns=neighborhood_info[0])

neighborhood_df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


#    # Filter out Not Assigned  for Borough and Neighborhood columns

In [41]:
not_assigned_boroughs = neighborhood_df.index[neighborhood_df['Borough'] == 'Not assigned']
not_assigned_neighborhoods = neighborhood_df.index[neighborhood_df['Neighborhood'] == 'Not assigned']


In [43]:
not_assigned_neighborhoods_and_borough = not_assigned_boroughs & not_assigned_neighborhoods
neighborhood_df.drop(neighborhood_df.index[not_assigned_boroughs], inplace=True)
neighborhood_df.reset_index(drop=True, inplace=True)

neighborhood_df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [12]:


not_assigned_neighborhoods = neighborhood_df.index[neighborhood_df['Neighborhood'] == 'Not assigned'] # run this again because the indexes on the dataframe where reset

for idx in not_assigned_neighborhoods:
    neighborhood_df['Neighborhood'][idx] = neighborhood_df['Borough'][idx]
    
neighborhood_df.head(10)



Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


# Printing Data Frame Details;

In [13]:


print('After cleaning the DataFrame, its new shape is {}'.format(neighborhood_df.shape),'\n')
print('There are:')
print('  {} Postal codes'.format(neighborhood_df['Postcode'].unique().shape[0]))
print('  {} Boroughs'.format(neighborhood_df['Borough'].unique().shape[0]))
print('  {} Neighborhoods'.format(neighborhood_df['Neighborhood'].unique().shape[0]))



After cleaning the DataFrame, its new shape is (212, 3) 

There are:
  103 Postal codes
  11 Boroughs
  210 Neighborhoods


## Note there are more Neighbourhoods than postal code. So We will merge Neighbourhood which has common PostalCode.

# Grouping of Neighbourhood based on PIN Code

In [14]:
group = neighborhood_df.groupby('Postcode')
grouped_neighborhoods = group['Neighborhood'].apply(lambda x: "%s" % ', '.join(x))
grouped_boroughs = group['Borough'].apply(lambda x: set(x).pop())
grouped_df = pd.DataFrame(list(zip(grouped_boroughs.index, grouped_boroughs, grouped_neighborhoods)))
grouped_df.columns = ['Postcode', 'Borough', 'Neighborhood']

grouped_df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [15]:


print('The DataFrame shape is', grouped_df.shape)



The DataFrame shape is (103, 3)


# Get Co-Ordinates for each Postal Code

In [17]:

coordinates_df = pd.read_csv('https://cocl.us/Geospatial_data') # transform the csv file into a dataframe

print('The coordinates dataframe shape is', coordinates_df.shape)
coordinates_df.head()



The coordinates dataframe shape is (103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [18]:
postcodes_with_coordinates_df = grouped_df.join(coordinates_df.set_index('Postal Code'), on='Postcode')

postcodes_with_coordinates_df.head(16)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


# Use FourSquare to get Venues bases on Longitude and Latitudes

In [19]:

CLIENT_ID = 'C0KAOQEIT4JKPYXRBHL1RRXD5ARMZSDODSHTZZUAPJCUSNOO' # your Foursquare ID
CLIENT_SECRET = 'KL0PV2VX1J1SDSRNRWRT4AR2TKYYBEZNJ3D43LPBVJUZV0U3' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: C0KAOQEIT4JKPYXRBHL1RRXD5ARMZSDODSHTZZUAPJCUSNOO
CLIENT_SECRET:KL0PV2VX1J1SDSRNRWRT4AR2TKYYBEZNJ3D43LPBVJUZV0U3


In [20]:


#function to format url
def format_url(lat, lng, radius=500, limit=100):
    return 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, 
        lat, lng, VERSION, radius, limit)



In [22]:
first_postcode = postcodes_with_coordinates_df.iloc[2, :]
url = format_url(first_postcode['Latitude'], first_postcode['Longitude'])

url

'https://api.foursquare.com/v2/venues/explore?client_id=C0KAOQEIT4JKPYXRBHL1RRXD5ARMZSDODSHTZZUAPJCUSNOO&client_secret=KL0PV2VX1J1SDSRNRWRT4AR2TKYYBEZNJ3D43LPBVJUZV0U3&ll=43.7635726,-79.1887115&v=20180604&radius=500&limit=100'

In [29]:
#function to get batch of venues as a dataframe
import json
from pandas.io.json import json_normalize
def get_venues(url):
    results = requests.get(url).json()
    venues = results['response']['groups'][0]['items']
    print(venues)
    return json_normalize(venues)

In [30]:


venues_df = get_venues(url)

venues_df.head()



[{'referralId': 'e-0-4b6074e3f964a5200fe729e3-0', 'venue': {'photos': {'groups': [], 'count': 0}, 'id': '4b6074e3f964a5200fe729e3', 'name': 'Swiss Chalet Rotisserie & Grill', 'categories': [{'icon': {'suffix': '.png', 'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/pizza_'}, 'id': '4bf58dd8d48988d1ca941735', 'primary': True, 'shortName': 'Pizza', 'pluralName': 'Pizza Places', 'name': 'Pizza Place'}], 'location': {'lat': 43.76769708292701, 'postalCode': 'M1E 2N5', 'distance': 469, 'address': '4410 Kingston Rd', 'formattedAddress': ['4410 Kingston Rd', 'Scarborough ON M1E 2N5', 'Canada'], 'city': 'Scarborough', 'country': 'Canada', 'state': 'ON', 'labeledLatLngs': [{'lat': 43.76769708292701, 'label': 'display', 'lng': -79.1899135003439}], 'lng': -79.1899135003439, 'cc': 'CA'}}, 'reasons': {'count': 0, 'items': [{'type': 'general', 'reasonName': 'globalInteractionReason', 'summary': 'This spot is popular'}]}}, {'referralId': 'e-0-4c62f34bde1b2d7fec89e370-1', 'venue': {'photos': {'g

Unnamed: 0,reasons.count,reasons.items,referralId,venue.categories,venue.id,venue.location.address,venue.location.cc,venue.location.city,venue.location.country,venue.location.crossStreet,venue.location.distance,venue.location.formattedAddress,venue.location.labeledLatLngs,venue.location.lat,venue.location.lng,venue.location.postalCode,venue.location.state,venue.name,venue.photos.count,venue.photos.groups
0,0,"[{'type': 'general', 'reasonName': 'globalInte...",e-0-4b6074e3f964a5200fe729e3-0,"[{'icon': {'suffix': '.png', 'prefix': 'https:...",4b6074e3f964a5200fe729e3,4410 Kingston Rd,CA,Scarborough,Canada,,469,"[4410 Kingston Rd, Scarborough ON M1E 2N5, Can...","[{'lat': 43.76769708292701, 'label': 'display'...",43.767697,-79.189914,M1E 2N5,ON,Swiss Chalet Rotisserie & Grill,0,[]
1,0,"[{'type': 'general', 'reasonName': 'globalInte...",e-0-4c62f34bde1b2d7fec89e370-1,"[{'icon': {'suffix': '.png', 'prefix': 'https:...",4c62f34bde1b2d7fec89e370,4371 kingston road,CA,Toronto,Canada,,298,"[4371 kingston road, Toronto ON M1E 2M9, Canada]","[{'lat': 43.765309, 'label': 'display', 'lng':...",43.765309,-79.191537,M1E 2M9,ON,G & G Electronics,0,[]
2,0,"[{'type': 'general', 'reasonName': 'globalInte...",e-0-5411f741498e9ebd5e35d8bd-2,"[{'icon': {'suffix': '.png', 'prefix': 'https:...",5411f741498e9ebd5e35d8bd,4383 Kingston rd.,CA,Scarborough,Canada,,343,"[4383 Kingston rd., Scarborough ON, Canada]","[{'lat': 43.766299084470795, 'label': 'display...",43.766299,-79.19072,,ON,Big Bite Burrito,0,[]
3,0,"[{'type': 'general', 'reasonName': 'globalInte...",e-0-4c1c7f9bb306c9288f0464b7-3,"[{'icon': {'suffix': '.png', 'prefix': 'https:...",4c1c7f9bb306c9288f0464b7,4304 Kingston Road,CA,Scarborough,Canada,,378,"[4304 Kingston Road, Scarborough ON M1E 2M8, C...","[{'lat': 43.76404170000001, 'label': 'display'...",43.764042,-79.193371,M1E 2M8,ON,Enterprise Rent-A-Car,0,[]
4,0,"[{'type': 'general', 'reasonName': 'globalInte...",e-0-4b9008acf964a520737133e3-4,"[{'icon': {'suffix': '.png', 'prefix': 'https:...",4b9008acf964a520737133e3,4125 Lawrence Ave E,CA,West Hill,Canada,Kingston Rd,445,"[4125 Lawrence Ave E (Kingston Rd), West Hill ...","[{'lat': 43.766631153138455, 'label': 'display...",43.766631,-79.192286,,ON,Woburn Medical Centre,0,[]


In [31]:
#function to clean the venues dataframe
def clean_df(df):
    relevant_columns = ['venue.categories', 'venue.location.lat', 'venue.location.lng', 'venue.name']
    clean_df = df.loc[:,relevant_columns]

    #rename columns
    clean_df.columns = ['Category', 'Lat', 'Lng', 'Name']
    return clean_df 

venues_df = clean_df(venues_df)
venues_df.head()

Unnamed: 0,Category,Lat,Lng,Name
0,"[{'icon': {'suffix': '.png', 'prefix': 'https:...",43.767697,-79.189914,Swiss Chalet Rotisserie & Grill
1,"[{'icon': {'suffix': '.png', 'prefix': 'https:...",43.765309,-79.191537,G & G Electronics
2,"[{'icon': {'suffix': '.png', 'prefix': 'https:...",43.766299,-79.19072,Big Bite Burrito
3,"[{'icon': {'suffix': '.png', 'prefix': 'https:...",43.764042,-79.193371,Enterprise Rent-A-Car
4,"[{'icon': {'suffix': '.png', 'prefix': 'https:...",43.766631,-79.192286,Woburn Medical Centre


#     Get Categories of the Venues

In [32]:
#function that extract the category name
def get_category_name(row):
    category_json = row['Category']
    try:
        return category_json[0]['name']
    except:        
        return None

venues_df['Category'] = venues_df.apply(get_category_name, axis=1)

venues_df

Unnamed: 0,Category,Lat,Lng,Name
0,Pizza Place,43.767697,-79.189914,Swiss Chalet Rotisserie & Grill
1,Electronics Store,43.765309,-79.191537,G & G Electronics
2,Mexican Restaurant,43.766299,-79.19072,Big Bite Burrito
3,Rental Car Location,43.764042,-79.193371,Enterprise Rent-A-Car
4,Medical Center,43.766631,-79.192286,Woburn Medical Centre
5,Breakfast Spot,43.7678,-79.190466,Eggsmart


# function to get the closest postcode pair and the distance using latitude and longitude

In [51]:
 from geopy.distance import great_circle


def closest_postcode(postcode, postcodes):
    postcode = np.asarray(postcode).reshape(1,-1)
    postcodes = np.asarray(postcodes)
    distances = [great_circle(postcode, point).meters for point in postcodes]
    closest_postcode_index = np.argmin(distances)
    return [round(distances[closest_postcode_index]), closest_postcode_index]

In [54]:
coordinates = postcodes_with_coordinates_df[['Latitude', 'Longitude']]
import matplotlib.pyplot as plt

X = postcodes_with_coordinates_df[['Longitude', 'Latitude']]

distance, closest_point_index = closest_postcode(coordinates[:1], coordinates[~coordinates.index.isin([0])])
close_points = X[:1].append(X[closest_point_index+1:closest_point_index+2])
close_points

Unnamed: 0,Longitude,Latitude
0,-79.194353,43.806686
16,-79.205636,43.836125


In [55]:
from math import isnan

for lat, lng, idx in zip(postcodes_with_coordinates_df['Latitude'], postcodes_with_coordinates_df['Longitude'], postcodes_with_coordinates_df.index):
    distance, closest_point_index = closest_postcode([lat,lng], coordinates[~coordinates.index.isin([idx])])
    postcodes_with_coordinates_df.at[idx,'Distance'] = np.int64(distance//2) #use the half of the disstance to avoid overlapping
    
postcodes_with_coordinates_df.head(15)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Distance
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,1698.0
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,1625.0
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,1205.0
3,M1G,Scarborough,Woburn,43.770992,-79.216917,913.0
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,913.0
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476,1302.0
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029,1113.0
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577,1052.0
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476,1113.0
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848,1297.0


In [67]:
#function to construct the dataframe with all the venues (max 100 venues per postal code)
def get_all_venues(postcodes, lat, lng, radius):
    
    venues_list=[]
    for postcode, lat, lng, radius in zip(postcodes, lat, lng, radius):
        url= format_url(lat, lng, radius)
        results = requests.get(url).json()['response']['groups'][0]['items']
        
        venues_list.append([(
            postcode, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'], 
            v['venue']['categories'][0]['name'])
            for v in results])
    all_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    all_venues.columns = ['Postcode', 
                  'Postcode Latitude', 
                  'Postcode Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude',
                  'Venue Category'
                  ]
    
    return all_venues

In [68]:
all_venues = get_all_venues(postcodes_with_coordinates_df['Postcode'], postcodes_with_coordinates_df['Latitude'], postcodes_with_coordinates_df['Longitude'], postcodes_with_coordinates_df['Distance'])

print('The total number of venues returned is ', all_venues.shape[0])

all_venues.head(10)

The total number of venues returned is  3306


Unnamed: 0,Postcode,Postcode Latitude,Postcode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M1B,43.806686,-79.194353,African Rainforest Pavilion,43.817725,-79.183433,Zoo Exhibit
1,M1B,43.806686,-79.194353,Images Salon & Spa,43.802283,-79.198565,Spa
2,M1B,43.806686,-79.194353,Canadiana exhibit,43.817962,-79.193374,Zoo Exhibit
3,M1B,43.806686,-79.194353,penguin exhibit,43.819435,-79.185959,Zoo Exhibit
4,M1B,43.806686,-79.194353,Lion Exhibit,43.819228,-79.186977,Zoo Exhibit
5,M1B,43.806686,-79.194353,Caribbean Wave,43.798558,-79.195777,Caribbean Restaurant
6,M1B,43.806686,-79.194353,Gorilla Exhibit,43.81908,-79.184235,Zoo Exhibit
7,M1B,43.806686,-79.194353,Orangutan Exhibit,43.818413,-79.182548,Zoo Exhibit
8,M1B,43.806686,-79.194353,LCBO,43.796671,-79.204586,Liquor Store
9,M1B,43.806686,-79.194353,Wendy's,43.802008,-79.19808,Fast Food Restaurant


In [70]:


print('Total Toronto postcodes:', postcodes_with_coordinates_df['Postcode'].shape[0])
print('Total Toronto postcodes with venues:', all_venues['Postcode'].unique().shape[0])



Total Toronto postcodes: 103
Total Toronto postcodes with venues: 102


# There is one postal code with no venues returned from the Foursquare API


In [71]:
postcodes_diff = np.setdiff1d(postcodes_with_coordinates_df['Postcode'].values,all_venues['Postcode'].unique())
postcodes_with_coordinates_df[postcodes_with_coordinates_df['Postcode']==postcodes_diff[0]]

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Distance
56,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,111.0
