In [1]:
import os
import warnings

import requests
from bs4 import BeautifulSoup
import pandas as pd
import geocoder
import folium
from sklearn.cluster import KMeans

In [2]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes and keep its first table.
# NOTE(review): the live page can change layout over time; consider pinning a specific revision.
url = r'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of parsing an error page further down
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table')
if table is None:
    # soup.find returns None when nothing matches; fail with a clear message
    raise ValueError('no table element found at ' + url)

In [3]:
# Build the column labels from the header cells of the table
def f(x):
    """Return the string x with every newline character removed."""
    return x.replace('\n', '')

headers = table.findAll('th')
header_list = [f(cell.text) for cell in headers]

In [4]:
# Parse the table body into a DataFrame.
# The data cells arrive as one flat list, so they are regrouped into rows whose
# width is the number of column labels rather than a hard-coded 3.
n_cols = len(header_list)
if n_cols == 0:
    raise ValueError('no column labels were found, cannot split the cells into rows')

table_data = table.findAll('td')

# Each row becomes a tuple; a trailing incomplete group of cells is ignored.
# Newlines are stripped while parsing, so no frame-wide clean-up pass is needed
# afterwards (DataFrame.applymap is deprecated in recent pandas releases).
data_list = [
    tuple(cell.text.replace('\n', '') for cell in table_data[start:start + n_cols])
    for start in range(0, len(table_data) - n_cols + 1, n_cols)
]

# Load the parsed rows into a dataframe
data_df = pd.DataFrame(data_list, columns=header_list)

In [5]:
# Neighborhood names that share a postcode are concatenated into one string.
# AB: the distinct (Postcode, Borough) pairs
AB = data_df[['Postcode', 'Borough']].drop_duplicates()
# C: comma-joined neighborhood names, indexed by Postcode
C = data_df.groupby('Postcode')['Neighborhood'].agg(','.join)

In [6]:
# ABC holds the combined result: each (Postcode, Borough) row of AB gains the
# joined neighborhood string that C stores under the same postcode
ABC = AB.join(C, on='Postcode', how='left')

In [7]:
# Clean up the 'Not assigned' placeholders in ABC.

# Drop the rows where the borough is not assigned (and any row holding a missing value)
cond1 = ABC['Borough'] == 'Not assigned'
ABC = ABC[~cond1].dropna(axis=0).reset_index(drop=True)

# If a row has a borough but a 'Not assigned' neighborhood, the neighborhood
# becomes the borough name. A single .loc assignment writes to ABC itself;
# chained indexing (ABC['Neighborhood'][i] = ...) may write to a temporary copy.
unassigned = ABC['Neighborhood'] == 'Not assigned'
ABC.loc[unassigned, 'Neighborhood'] = ABC.loc[unassigned, 'Borough']

## Question 1 - The shape of the DataFrame

In [8]:
# Show the (rows, columns) shape of the cleaned postcode table
print(ABC.shape)

(103, 3)


In [9]:
# Retrieve the GPS coordinates of each postcode.
# The geocoder package was tried but did not look promising, so the coordinates are read from a CSV instead.
geo_df = pd.read_csv('http://cocl.us/Geospatial_data')

In [10]:
# Put both frames in postcode order, with a fresh 0..n-1 index, ahead of combining them
ABC = ABC.sort_values('Postcode').reset_index(drop=True)
geo_df = geo_df.sort_values('Postal Code').reset_index(drop=True)

In [11]:
# ABCDE is the final container for this part: ABC plus the coordinates of each postcode.
# Rows are matched on the postcode value itself. Gluing the two frames side by side
# would pair rows purely by position and silently attach the wrong coordinates
# whenever the two postcode sets differ.
ABCDE = ABC.merge(
    geo_df,
    how='left',               # keep every ABC row, in ABC's order
    left_on='Postcode',
    right_on='Postal Code',   # both key columns are kept in the result
    validate='m:1',           # each postal code must appear at most once in geo_df
)

In [12]:
# 'Postal Code' repeats the information in 'Postcode', so only one of the two is kept
ABCDE = ABCDE.drop(columns='Postal Code')

## Question 2 - Full DataFrame

In [13]:
# Display the postcode table with its coordinates attached
ABCDE

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


In [14]:
# Clustering is restricted to the postcodes located in Downtown Toronto.
in_downtown = ABCDE['Borough'] == 'Downtown Toronto'
# Rows outside the borough are blanked out by where(), removed by dropna(),
# and the survivors are renumbered from 0
trt_df = ABCDE.where(in_downtown).dropna(axis=0).reset_index(drop=True)

In [15]:
# Draw a folium map and mark every Downtown Toronto postcode on it
map_trt = folium.Map(location=[43.65, -79.38], zoom_start=13)

downtown_points = zip(trt_df['Latitude'], trt_df['Longitude'], trt_df['Postcode'])
for lat, lon, code in downtown_points:
    # The postcode itself is the popup text
    label = folium.Popup(code, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        # NOTE(review): parse_html looks like a Popup option; confirm it has any
        # effect on CircleMarker before relying on it
        parse_html=False,
    )
    marker.add_to(map_trt)

# The map is the cell's output
map_trt

In [16]:
# Foursquare API configuration for the clustering step.
# The credentials are read from the environment so that no secret is stored in the
# notebook. Keys that were committed in earlier versions of this file should be
# treated as compromised and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn(
        'FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
        'Foursquare requests will not be authorised.'
    )
# Value sent as the "v" query parameter of every request
VERSION = '20191229'

In [17]:
# Search settings sent with every venue request:
# RADIUS is the "radius" query value, LIMIT the "limit" query value
RADIUS, LIMIT = 500, 100

In [18]:
# Retrieve the top recommended venues around each area (adapted from the course lab)
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint once per location.

    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT settings.

    Parameters
    ----------
    names, latitudes, longitudes : iterables consumed in parallel, one entry per location.
    radius : value sent as the "radius" query parameter (default 500).

    Returns
    -------
    list
        One inner list per location for which the response contained a venue
        group; locations without one are skipped. Each inner list holds tuples
        (name, lat, lon, venue name, venue lat, venue lng, category name), where
        the category name is None for a venue that has no categories.
    """
    venues_list = []
    for name, lat, lon in zip(names, latitudes, longitudes):
        print(name)  # progress indicator

        # Let requests build and encode the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lon),
            'radius': radius,
            'limit': LIMIT,
        }
        payload = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params=params,
            timeout=30,  # do not hang forever on a stalled connection
        ).json()

        # Best effort: skip a location whose response carries no venue group,
        # but only for that specific situation, and say so
        try:
            r_venues = payload['response']['groups'][0]['items']
        except (KeyError, IndexError, TypeError):
            print('  no venues returned for {}, skipping'.format(name))
            continue

        # keep only the relevant information for each nearby venue
        rows = []
        for item in r_venues:
            venue = item['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lon,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None,
            ))
        venues_list.append(rows)

    return venues_list

In [19]:
# Fetch the venues around every Downtown Toronto postcode.
# The configured RADIUS is passed explicitly so the search radius always follows
# the configuration cell rather than the function's own default.
trt_venue = getNearbyVenues(
    trt_df['Postcode'],
    trt_df['Latitude'],
    trt_df['Longitude'],
    radius=RADIUS,
)

M4W
M4X
M4Y
M5A
M5B
M5C
M5E
M5G
M5H
M5J
M5K
M5L
M5S
M5T
M5V
M5W
M5X
M6G
M9A


In [20]:
# Convert the nested venue list into a single dataframe.
# Column labels follow the order of the tuples built by getNearbyVenues:
# area name, area latitude, area longitude, venue name, venue latitude,
# venue longitude, venue category (latitude comes before longitude).
venue_columns = ['Name', 'lat', 'lon', 'venue', 'v_lat', 'v_lon', 'cat']

# Flatten first and build the frame once, instead of growing it with concat in a loop
venue_rows = [row for area_rows in trt_venue for row in area_rows]
venue_df = pd.DataFrame(venue_rows, columns=venue_columns)
# venue_df now contains all the venues found for each area, indexed 0..n-1

In [21]:
# Feature engineering: one-hot encode the category of each venue
trt_onehot = pd.get_dummies(venue_df['cat'])

In [22]:
# Attach each venue's area name so the one-hot rows can be grouped by area
trt_onehot['Name'] = venue_df['Name']

In [23]:
# Average the one-hot rows per area: each value is the share of that area's
# venues that fall in the category. 'Name' is turned back into a regular column.
trt_grouped = trt_onehot.groupby('Name').mean().reset_index()

In [24]:
# Cluster the areas on their venue-category frequencies.

num_cluster = 3

# Only the category frequencies are features. 'Name' identifies the area; 'Label',
# 'Longitude' and 'Latitude' are added to trt_grouped by later cells, so they are
# removed here too (when present) to keep a re-run of this cell from clustering on them.
data = (
    trt_grouped
    .drop(columns='Name')
    .drop(columns=['Label', 'Longitude', 'Latitude'], errors='ignore')
)
kmeans = KMeans(n_clusters = num_cluster, random_state=0)
clusters = kmeans.fit(data)
labels = clusters.labels_

In [25]:
# Store the cluster label computed for each area
trt_grouped['Label'] = labels

In [26]:
# Add the longitude and latitude of each area to trt_grouped.
# 'Name' holds the postcode that was passed to getNearbyVenues, so coordinates are
# looked up by postcode. Copying the first rows of trt_df by position would attach
# the wrong coordinates whenever a postcode without venues is missing from
# trt_grouped anywhere but at the end.
coords_by_postcode = trt_df.drop_duplicates('Postcode').set_index('Postcode')
trt_grouped['Longitude'] = trt_grouped['Name'].map(coords_by_postcode['Longitude'])
trt_grouped['Latitude'] = trt_grouped['Name'].map(coords_by_postcode['Latitude'])

In [28]:
# Finally, visualize the clusters on the map, one colour per cluster label

color_list = ['red', 'green', 'blue']

trt_clustermap = folium.Map(location=[43.65, -79.38], zoom_start=12)

cluster_points = zip(
    trt_grouped['Latitude'],
    trt_grouped['Longitude'],
    trt_grouped['Name'],
    trt_grouped['Label'],
)
for lat, lon, name, cluster in cluster_points:
    # The cluster label doubles as the index into color_list
    marker_color = color_list[cluster]
    label = folium.Popup(name + ", belongs to cluster " + str(cluster), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
    )
    marker.add_to(trt_clustermap)

# The map is the cell's output
trt_clustermap