# Importing and installing required modules

In [3]:
import numpy as np
import pandas as pd
import matplotlib as plt
import folium
import json
import sklearn.cluster as kmeans
!pip install inline beautifulsoup4
!pip install inline lxml
!pip install inline requests
!pip install inline geocoder



## Importing Modules for Webscraping and for CSV

In [4]:
from bs4 import BeautifulSoup
import requests
import csv

# Scraping data using BeautifulSoup and writing the data to a csv file

In [5]:
# Download the Wikipedia page listing the "M" (Toronto) postal codes.
# A timeout keeps the cell from hanging, and raise_for_status() stops us from
# parsing an HTTP error page as if it were the article.
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
response.raise_for_status()
html = response.text

soup = BeautifulSoup(html, "lxml")
table = soup.find("table", class_="wikitable sortable")
if table is None:
    # Without this guard a layout change surfaces as an opaque AttributeError.
    raise RuntimeError("Could not find the 'wikitable sortable' table on the page")
info = table.find_all("tr")

# Write one CSV line per table row. The encoding is fixed to UTF-8 so the file
# does not depend on the platform's default encoding when it is read back.
with open("nb.csv", "wt+", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    for i in info:
        csv_row = [cell.get_text() for cell in i.find_all("td")]
        # Rows without any <td> cell (e.g. a header row) carry no data; skip
        # them instead of writing a blank line.
        if csv_row:
            writer.writerow(csv_row)

# Preprocessing data using Pandas
#### First, I imported the data from the CSV file. Second, I removed the rows whose Borough is 'Not assigned'.
#### Third, I stripped the unnecessary newline characters from the values.
#### Fourth, I replaced the 'Not assigned' values in the Neighbourhood column with the corresponding Borough.
#### Lastly, I grouped the duplicated rows so that each Postal Code has all of its assigned Neighbourhoods in one row, sorted
#### by postal code.

In [6]:
# Load the scraped rows; the CSV has no header line, so the column names are given here.
df = pd.read_csv('nb.csv', names = ["Postal Code", "Borough", "Neighbourhood"])

# Drop postal codes that have no borough. The comparison is done on the stripped
# text so stray whitespace/newlines from the HTML cell cannot hide a match.
df_drop = df[df['Borough'].str.strip() != 'Not assigned'].reset_index(drop=True)

# Remove the newline characters left on the scraped cell text. Both object and
# dedicated string dtypes are treated as text; other columns pass through.
new_df = df_drop.apply(
    lambda x: x.str.strip('\n')
    if pd.api.types.is_object_dtype(x) or pd.api.types.is_string_dtype(x)
    else x
)

# Where the neighbourhood is unassigned, fall back to the borough name.
# An explicit .loc assignment replaces the chained `replace(..., inplace=True)`,
# which operates on a column selection and is not guaranteed to update the frame.
unassigned = new_df['Neighbourhood'] == 'Not assigned'
new_df.loc[unassigned, 'Neighbourhood'] = new_df.loc[unassigned, 'Borough']

# Collapse duplicate postal codes into one row with a comma-separated
# neighbourhood list; groupby also sorts the result by its keys.
new_df = new_df.groupby(['Postal Code','Borough'])['Neighbourhood'].apply(', '.join).reset_index()
new_df.head(11)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


# Sample of the data to prove correctness of processing

In [7]:
# Spot-check a random sample of rows; the fixed seed makes the displayed sample
# reproducible when the notebook is re-run.
new_df.sample(n=20, random_state=0)

Unnamed: 0,Postal Code,Borough,Neighbourhood
74,M6E,York,Caledonia-Fairbanks
59,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station"
38,M4G,East York,Leaside
9,M1N,Scarborough,"Birch Cliff, Cliffside West"
12,M1S,Scarborough,Agincourt
14,M1V,Scarborough,"Agincourt North, L'Amoreaux East, Milliken, St..."
37,M4E,East Toronto,The Beaches
20,M2L,North York,"Silver Hills, York Mills"
24,M2R,North York,Willowdale West
62,M5M,North York,"Bedford Park, Lawrence Manor East"


In [8]:
# Sanity check: (number of rows, number of columns) of the cleaned table.
new_df.shape

(103, 3)

In [9]:
# Read the coordinate table from the working directory.
dh = pd.read_csv('Geospatial_Coordinates.csv')

# Inner-join on the column names the two tables share (pandas' default when no
# key is given), which attaches Latitude/Longitude to each postal-code row.
final_df = new_df.merge(dh)
final_df.head(12)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


# Longitude and Latitude Data

# Toronto Visualized

In [14]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library
# DT_data = neighborhoods[neighborhoods['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
# DT_data.head()

In [15]:
address = 'Toronto, ON'

# Look the address up with the Nominatim geocoding service.
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address, timeout=10)

# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on `location.latitude`.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [16]:
# Base map centred on the current `latitude` / `longitude` values.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Appearance shared by every marker on this map.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle per row of final_df, with a "<neighbourhood>, <borough>" popup.
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighbourhood']
for lat, lng, borough, neighbourhood in final_df[marker_columns].itertuples(index=False, name=None):
    label = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    folium.CircleMarker([lat, lng], popup=label, **marker_style).add_to(map_toronto)

map_toronto

# Downtown Toronto Visualized and Clustered

In [17]:
# Keep only the rows whose borough is Downtown Toronto and renumber them from 0.
is_downtown = final_df['Borough'].eq('Downtown Toronto')
DT_data = final_df.loc[is_downtown].reset_index(drop=True)
DT_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
1,M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
3,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
4,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


In [20]:
address = 'Downtown Toronto'

# Look the address up with the Nominatim geocoding service.
# NOTE: this rebinds `latitude` / `longitude`, which the later maps read.
geolocator = Nominatim(user_agent="DT_explorer")
location = geolocator.geocode(address, timeout=10)

# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on `location.latitude`.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Downtown Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Downtown Toronto are 43.6541737, -79.3808116451341.


In [22]:
# Base map centred on the current `latitude` / `longitude` values.
map_DT = folium.Map(location=[latitude, longitude], zoom_start=11)

# Appearance shared by every marker on this map.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle per row of DT_data; the popup shows the row's neighbourhood text.
marker_columns = ['Latitude', 'Longitude', 'Neighbourhood']
for lat, lng, label in DT_data[marker_columns].itertuples(index=False, name=None):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker([lat, lng], popup=label, **marker_style).add_to(map_DT)

map_DT

## Foursquare API Utilization

In [23]:
import os

# Foursquare credentials are read from the environment so that secrets are not
# stored in (or shared with) the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be hardcoded here has been exposed
# and should be revoked and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET')
if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError(
        'Set the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET '
        'environment variables before running this notebook.'
    )
VERSION = '20180605' # Foursquare API version

In [29]:
import json
# json_normalize is part of the top-level pandas namespace since pandas 1.0 and
# the pandas.io.json path is deprecated; fall back to it only on older releases.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# Coordinates and name of the first row of DT_data.
neighbourhood_latitude = DT_data.loc[0, 'Latitude'] # neighborhood latitude value
neighbourhood_longitude = DT_data.loc[0, 'Longitude'] # neighborhood longitude value
neighbourhood_name = DT_data.loc[0, 'Neighbourhood'] # neighborhood name

LIMIT = 100   # value sent as the `limit` query parameter
radius = 500  # value sent as the `radius` query parameter
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighbourhood_latitude, 
    neighbourhood_longitude, 
    radius, 
    LIMIT)

# Fail on HTTP errors (bad credentials, quota, ...) instead of handing an error
# payload to the cells that index into the response; the timeout avoids a hang.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()
results

# Helper that extracts the name of a venue's first category
def get_category_type(row):
    """Return the name of the first category of a venue record, or None.

    Parameters
    ----------
    row : mapping-like (e.g. dict or pandas Series)
        Must provide the category list under the key 'categories' or,
        failing that, 'venue.categories'. Each category is expected to be a
        mapping with a 'name' key.

    Returns
    -------
    The 'name' of the first category, or None when the category value is
    missing (not a list/tuple, e.g. NaN or None) or empty.

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; other errors propagate.
        categories_list = row['venue.categories']

    # A missing value has no len(); treat it like an empty list.
    if not isinstance(categories_list, (list, tuple)) or len(categories_list) == 0:
        return None
    return categories_list[0]['name']

# Build a small table from the venues in the first group of the API response.
venues = results['response']['groups'][0]['items']
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']

# Flatten the nested JSON records, then keep only the four columns listed above.
nearby_venues = json_normalize(venues).loc[:, filtered_columns]

# Reduce each row's category list to a single category name.
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# Keep only the part after the last dot: 'venue.location.lat' -> 'lat', etc.
nearby_venues.columns = [label.rsplit(".", 1)[-1] for label in nearby_venues.columns]
nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Rosedale Park,Playground,43.682328,-79.378934
1,Whitney Park,Park,43.682036,-79.373788
2,Alex Murray Parkette,Park,43.6783,-79.382773
3,Milkman's Lane,Trail,43.676352,-79.373842


# Explore Downtown

In [31]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint around each given location.

    Relies on the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    The name of each location is printed as it is processed.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Zipped together; each (name, lat, lng) triple is one query.
    radius : optional
        Value sent as the `radius` query parameter (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        is empty (but still has these columns) when nothing is found.
        'Venue Category' is None for a venue without categories.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    columns = ['Neighbourhood', 
               'Neighbourhood Latitude', 
               'Neighbourhood Longitude', 
               'Venue', 
               'Venue Latitude', 
               'Venue Longitude', 
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)

        # make the GET request; surface HTTP errors instead of failing later
        # with a KeyError on an error payload
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            # a venue may come without categories; do not crash on [0]
            categories = venue.get('categories') or []
            rows.append((
                name, 
                lat, 
                lng, 
                venue['name'], 
                venue['location']['lat'], 
                venue['location']['lng'],  
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps the schema intact even
    # when no venue was found (assigning 7 names to an empty frame raises).
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

# Collect the nearby venues for every row of DT_data.
DT_venues = getNearbyVenues(
    names=DT_data['Neighbourhood'],
    latitudes=DT_data['Latitude'],
    longitudes=DT_data['Longitude'],
)

Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront, Regent Park
Ryerson, Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide, King, Richmond
Harbourfront East, Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Harbord, University of Toronto
Chinatown, Grange Park, Kensington Market
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place, Underground city
Christie


# Analysis

In [46]:
# one hot encoding: one indicator column per venue category
dt_onehot = pd.get_dummies(DT_venues[['Venue Category']], prefix="", prefix_sep="")

# Put the neighbourhood name in the first column. insert() places it there
# directly and raises if a venue category is itself named 'Neighbourhood';
# the previous append-then-move-last-column approach would silently overwrite
# that category and move an unrelated column to the front.
dt_onehot.insert(0, 'Neighbourhood', DT_venues['Neighbourhood'])

# Mean of the indicators per neighbourhood = share of its venues per category.
dt_grouped = dt_onehot.groupby('Neighbourhood').mean().reset_index()

# Print the most frequent venue categories of each neighbourhood.
num_top_venues = 5
for hood in dt_grouped['Neighbourhood']:
    print("----"+hood+"----")
    # Transpose the neighbourhood's single row into (venue, freq) pairs.
    temp = dt_grouped[dt_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    # Drop the first pair (the neighbourhood name itself), then convert and
    # round in one chain; this avoids assigning into an .iloc slice, which
    # triggers pandas' SettingWithCopyWarning.
    temp = temp.iloc[1:].astype({'freq': float}).round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in `row`.

    The first entry of `row` is skipped, the remaining entries are sorted in
    descending order, and the index labels of the first `num_top_venues` of
    them are returned as a NumPy array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

# Build a table of the top venue categories per neighbourhood
num_top_venues = 10
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    rank = ind + 1
    # 11th-13th are irregular; otherwise a last digit of 1/2/3 selects
    # st/nd/rd and everything else gets 'th'. An explicit test replaces the
    # bare `except:` that used an IndexError as control flow.
    if rank % 100 in (11, 12, 13) or rank % 10 not in (1, 2, 3):
        suffix = 'th'
    else:
        suffix = indicators[rank % 10 - 1]
    columns.append('{}{} Most Common Venue'.format(rank, suffix))

# Rank the categories of every neighbourhood row of dt_grouped.
top_venues = [
    return_most_common_venues(dt_grouped.iloc[ind, :], num_top_venues)
    for ind in np.arange(dt_grouped.shape[0])
]

# create a new dataframe in one step; reindex pads with NaN when there are
# fewer categories than num_top_venues instead of failing on a shape mismatch
dt_venues_sorted = pd.DataFrame(top_venues, index=dt_grouped.index).reindex(columns=range(num_top_venues))
dt_venues_sorted.columns = columns[1:]
dt_venues_sorted.insert(0, 'Neighbourhood', dt_grouped['Neighbourhood'])

dt_venues_sorted.head()

----Adelaide, King, Richmond----
                 venue  freq
0          Coffee Shop  0.06
1                 Café  0.05
2           Steakhouse  0.04
3                  Bar  0.04
4  American Restaurant  0.04


----Berczy Park----
                venue  freq
0         Coffee Shop  0.07
1        Cocktail Bar  0.05
2      Farmers Market  0.04
3  Italian Restaurant  0.04
4              Bakery  0.04


----CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara----
              venue  freq
0   Airport Service  0.20
1  Airport Terminal  0.13
2    Airport Lounge  0.13
3     Boat or Ferry  0.07
4   Harbor / Marina  0.07


----Cabbagetown, St. James Town----
         venue  freq
0  Coffee Shop  0.09
1  Pizza Place  0.07
2         Café  0.05
3          Pub  0.05
4         Park  0.05


----Central Bay Street----
                venue  freq
0         Coffee Shop  0.16
1                Café  0.05
2  Italian Restaurant  0.05
3        Burger Joint  0.

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Steakhouse,Bar,American Restaurant,Thai Restaurant,Bakery,Burger Joint,Hotel,Cosmetics Shop
1,Berczy Park,Coffee Shop,Cocktail Bar,Seafood Restaurant,Italian Restaurant,Bakery,Steakhouse,Beer Bar,Cheese Shop,Café,Farmers Market
2,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Service,Airport Terminal,Airport Lounge,Harbor / Marina,Sculpture Garden,Boutique,Plane,Boat or Ferry,Airport Gate,Airport Food Court
3,"Cabbagetown, St. James Town",Coffee Shop,Pizza Place,Italian Restaurant,Pub,Park,Bakery,Café,Restaurant,Japanese Restaurant,Beer Store
4,Central Bay Street,Coffee Shop,Café,Italian Restaurant,Sandwich Place,Burger Joint,Middle Eastern Restaurant,Bubble Tea Shop,Bar,Bakery,Spa


# Clustering - Using K-Means (Unsupervised Learning)

In [49]:
kclusters = 5

# Feature matrix: dt_grouped without the neighbourhood name column.
# `columns=` replaces the positional axis argument, which pandas 2 rejects.
dt_grouped_clustering = dt_grouped.drop(columns='Neighbourhood')

# run k-means clustering; n_init is pinned so results and warnings do not
# depend on the scikit-learn version's default
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(dt_grouped_clustering)

# add clustering labels; drop a stale column first so this cell can be re-run
# (insert() raises if the column already exists)
if 'Cluster' in dt_venues_sorted.columns:
    dt_venues_sorted = dt_venues_sorted.drop(columns='Cluster')
dt_venues_sorted.insert(0, 'Cluster', kmeans.labels_)

# join the cluster label and top venues onto DT_data (which carries the
# latitude/longitude of each neighbourhood)
dt_merged = DT_data.join(dt_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

# Rows of DT_data with no match in dt_venues_sorted get a NaN cluster after the
# join; they cannot be coloured, so leave them off the map.
dt_plot = dt_merged.dropna(subset=['Cluster'])

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour each
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(dt_plot['Latitude'], dt_plot['Longitude'], dt_plot['Neighbourhood'], dt_plot['Cluster']):
    # the join can turn the label column into floats; list indices must be ints
    cluster = int(cluster)
    # separator added so the popup reads "<name> Cluster <n>"
    label = folium.Popup('{} Cluster {}'.format(poi, cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # index by the label itself; `cluster-1` relied on -1 wrapping around
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters