In [42]:
import numpy as np
import pandas as pd
import requests
import lxml.html as lh

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe


# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

from sklearn.cluster import KMeans

In [43]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

#Create a handle, page, to handle the contents of the website
page = requests.get(url)
#Store the contents of the website under doc
doc = lh.fromstring(page.content)
#Parse data that are stored between <tr>..</tr> of HTML
tr_elements = doc.xpath('//tr')

In [44]:
tr_elements = doc.xpath('//tr')
#Create empty list
col=[]
i=0
#For each row, store each first element (header) and an empty list
for t in tr_elements[0]:
    i+=1
    name=t.text_content()
    print ('%d:"%s"'%(i,name))
    col.append((name,[]))

1:"Postal Code
"
2:"Borough
"
3:"Neighbourhood
"


In [45]:
for j in range(1,len(tr_elements)):
    #T is our j'th row
    T=tr_elements[j]
    
    #If row is not of size 3, the //tr data is not from our table 
    if len(T)!=3:
        break
    
    #i is the index of our column
    i=0
    
    #Iterate through each element of the row
    for t in T.iterchildren():
        data=t.text_content() 
        #Check if row is empty
        if i>0:
        #Convert any numerical value to integers
            try:
                data=int(data)
            except:
                pass
        #Append the data to the empty list of the i'th column
        col[i][1].append(data)
        #Increment i for the next column
        i+=1

In [46]:
Dict={title:column for (title,column) in col}
df=pd.DataFrame(Dict)

In [47]:
df.head()

Unnamed: 0,Postal Code\n,Borough\n,Neighbourhood\n
0,M1A\n,Not assigned\n,Not assigned\n
1,M2A\n,Not assigned\n,Not assigned\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"


In [48]:
df.rename(columns = {'Postal Code\n':'postal code', 'Borough\n':'borough','Neighbourhood\n':'neighborhood'}, inplace = True)


In [49]:
df.replace('\n','', regex = True, inplace = True)

In [50]:
df.head(15)

Unnamed: 0,postal code,borough,neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


In [51]:
df.dtypes

postal code     object
borough         object
neighborhood    object
dtype: object

In [52]:
df['neighborhood'].str.strip()

0                                           Not assigned
1                                           Not assigned
2                                              Parkwoods
3                                       Victoria Village
4                              Regent Park, Harbourfront
                             ...                        
176                                         Not assigned
177                                         Not assigned
178    Mimico NW, The Queensway West, South of Bloor,...
179                                         Not assigned
180                                                     
Name: neighborhood, Length: 181, dtype: object

### How many rows are are not assigned a neigborhood and a borough? 

In [53]:
missing_boro_neigh = (df['neighborhood'] == 'Not assigned') & (df['borough'] == 'Not assigned')

In [54]:
missing_boro_neigh.count()

181

In [55]:
df['borough'].value_counts()

Not assigned             77
North York               24
Downtown Toronto         19
Scarborough              17
Etobicoke                12
Central Toronto           9
West Toronto              6
East York                 5
East Toronto              5
York                      5
Mississauga               1
Canadian postal codes     1
Name: borough, dtype: int64

#### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [56]:
# assign neighbohood name to borough if 'Not assigned'
df['neighborhood'] = np.where(df['neighborhood'] =='Not assigned' ,df['neighborhood'], df['borough'])


In [57]:
indexNames = df[df['borough'] == 'Not assigned' ].index

In [58]:
#df.drop(df[df.score < 50].index, inplace=True)
df.drop(indexNames, inplace=True)

In [59]:
df.shape

(104, 3)

##### Get the geospatial data

In [60]:
geo_df = pd.read_csv('http://cocl.us/Geospatial_data')


In [61]:
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [62]:
geo_df.dtypes

Postal Code     object
Latitude       float64
Longitude      float64
dtype: object

In [63]:
neighborhoods = df.merge(geo_df,how='inner', left_on = 'postal code', right_on = 'Postal Code')

In [64]:
neighborhoods.head()

Unnamed: 0,postal code,borough,neighborhood,Postal Code,Latitude,Longitude
0,M3A,North York,North York,M3A,43.753259,-79.329656
1,M4A,North York,North York,M4A,43.725882,-79.315572
2,M5A,Downtown Toronto,Downtown Toronto,M5A,43.65426,-79.360636
3,M6A,North York,North York,M6A,43.718518,-79.464763
4,M7A,Downtown Toronto,Downtown Toronto,M7A,43.662301,-79.389494


#### Clean up column names

In [65]:
neighborhoods.columns = neighborhoods.columns.str.strip()

In [66]:
#check values in latitude & longitude
print("Latitude range", neighborhoods['Latitude'].min(), neighborhoods['Latitude'].max())
print("Longitude range", neighborhoods['Longitude'].min(), neighborhoods['Longitude'].max())
 


Latitude range 43.60241370000001 43.836124700000006
Longitude range -79.61581899999999 -79.16049709999999


In [67]:
# check for nan in lat and long
print(neighborhoods['Latitude'].isnull().values.any())
print(neighborhoods['Longitude'].isnull().values.any())

False
False


In [68]:
# Format the latitude & longitude
neighborhoods['Latitude'] = pd.to_numeric(geo_df['Latitude'], errors = 'coerce', downcast = 'float')
neighborhoods['Longitude'] = pd.to_numeric(geo_df['Longitude'], errors = 'coerce', downcast = 'float')



In [69]:
neighborhoods

Unnamed: 0,postal code,borough,neighborhood,Postal Code,Latitude,Longitude
0,M3A,North York,North York,M3A,43.806686,-79.194351
1,M4A,North York,North York,M4A,43.784534,-79.160500
2,M5A,Downtown Toronto,Downtown Toronto,M5A,43.763573,-79.188713
3,M6A,North York,North York,M6A,43.770992,-79.216919
4,M7A,Downtown Toronto,Downtown Toronto,M7A,43.773136,-79.239479
...,...,...,...,...,...,...
98,M8X,Etobicoke,Etobicoke,M8X,43.706875,-79.518188
99,M4Y,Downtown Toronto,Downtown Toronto,M4Y,43.696320,-79.532242
100,M7Y,East Toronto,East Toronto,M7Y,43.688904,-79.554726
101,M8Y,Etobicoke,Etobicoke,M8Y,43.739418,-79.588440


In [70]:
toronto_data = neighborhoods[['borough','neighborhood','postal code','Latitude','Longitude']].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,borough,neighborhood,postal code,Latitude,Longitude
0,North York,North York,M3A,43.806686,-79.194351
1,North York,North York,M4A,43.784534,-79.1605
2,Downtown Toronto,Downtown Toronto,M5A,43.763573,-79.188713
3,North York,North York,M6A,43.770992,-79.216919
4,Downtown Toronto,Downtown Toronto,M7A,43.773136,-79.239479


In [71]:
# find the center of the data
mid_point_latitude = ((neighborhoods['Latitude'].min() + neighborhoods['Latitude'].max())/2)
mid_point_longitude = ((neighborhoods['Longitude'].min() + neighborhoods['Longitude'].max())/2)
print(mid_point_latitude, mid_point_longitude)


43.719268798828125 -79.3881607055664


In [72]:
map_toronto = folium.Map(location=[mid_point_latitude, mid_point_longitude], zoom_start=11)
map_toronto

### Create map around Toronto

In [73]:
import folium
# create map of Toronto using latitude and longitude values
latitude = neighborhoods['Latitude']
longitude = neighborhoods['Longitude']

#map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)


# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['borough'], neighborhoods['neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Define Foursquare Credentials and Version

In [74]:
CLIENT_ID = '450JZYHOXGKOWLVHKRM0BEGKHDY5XJL0Q1DHODCMLQRRNGC3' # your Foursquare ID
CLIENT_SECRET = 'MDC2EYL0Y3OVOJL3WJIJJKUT3M1ALJLIILOGA2CCNKNO23O2' # your Foursquare Secret
VERSION = '20200801' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 450JZYHOXGKOWLVHKRM0BEGKHDY5XJL0Q1DHODCMLQRRNGC3
CLIENT_SECRET:MDC2EYL0Y3OVOJL3WJIJJKUT3M1ALJLIILOGA2CCNKNO23O2


In [75]:
neighborhoods.loc[0, 'neighborhood']

'North York'

In [76]:
neighborhood_latitude = neighborhoods.loc[3, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = neighborhoods.loc[3, 'Longitude'] # neighborhood longitude value

neighborhood_name = neighborhoods.loc[3, 'neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of North York are 43.770992279052734, -79.2169189453125.


In [77]:
LIMIT = 100 # limit of number of venues returned by Foursquare API

radius = 500 # define radius

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display UR

'https://api.foursquare.com/v2/venues/explore?&client_id=450JZYHOXGKOWLVHKRM0BEGKHDY5XJL0Q1DHODCMLQRRNGC3&client_secret=MDC2EYL0Y3OVOJL3WJIJJKUT3M1ALJLIILOGA2CCNKNO23O2&v=20200801&ll=43.770992279052734,-79.2169189453125&radius=500&limit=100'

In [78]:
results = requests.get(url).json()
results

{'meta': {'code': 429,
  'errorType': 'quota_exceeded',
  'errorDetail': 'Quota exceeded',
  'requestId': '5f31b985c5ebcc30c4c11915'},
 'response': {}}

In [79]:
# tranform venues into a dataframe
dataframe = json_normalize(results)
dataframe.head()

  


Unnamed: 0,meta.code,meta.errorType,meta.errorDetail,meta.requestId
0,429,quota_exceeded,Quota exceeded,5f31b985c5ebcc30c4c11915


In [80]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [82]:
venues = results['response']['groups'][0]['items']


nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)
# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

KeyError: 0

###  Create a function to repeat the same process to all the neighborhoods in Toronto

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['neighborhood', 
                  'name'
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [None]:
nearby_venues.head()

In [None]:
toronto_venues = getNearbyVenues(names= toronto_data['neighborhood'],
                                   latitudes = toronto_data['Latitude'],
                                   longitudes= toronto_data['Longitude'])
                                  

In [None]:
print(toronto_venues.shape)
toronto_venues.head()

In [None]:
toronto_venues.groupby('neighborhood').count()

Let's find out how many unique categories can be curated from all the returned venues¶

In [None]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

In [None]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['neighborhood'] = toronto_venues['neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

In [None]:
toronto_grouped = toronto_onehot.groupby('neighborhood').mean().reset_index()
toronto_grouped

num_top_venues = 5

for hood in toronto_grouped['neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

### Put this into a dataframe

In [None]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Display top ten venues

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['neighborhood'] = toronto_grouped['neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Cluster Neighborhoods

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['neighborhood'] = toronto_grouped['neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

In [None]:
toronto_data = neighborhoods[neighborhoods['neighborhood'] == 'Toronto'].reset_index(drop=True)
toronto_data.head()

In [None]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

In [None]:
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [None]:
# add labels

#toronto_merged = toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = neighborhoods.join(neighborhoods_venues_sorted.set_index('neighborhood'), on='neighborhood')

toronto_merged.head() # check the last columns!

In [None]:
#create map
map_clusters = folium.Map(location=[mid_point_latitude, mid_point_longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [None]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

In [None]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

In [None]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

In [None]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]