In [277]:
import pandas as pd
import requests
import lxml.html as lh
%matplotlib inline
from pandas.io.json import json_normalize
import json
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors


# Download and clean data

In [3]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page=requests.get(url)
doc=lh.fromstring(page.content)

In [4]:
elements=doc.xpath('//tr')#table is stored in <tr>..</tr>#

In [5]:
[len(x) for x in elements[:10]] #check the length of first 10 rows#

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

In [6]:
list=[]
i=0
for j in elements[0]:
    i+=1
    name=j.text_content()
    print ('%d:"%s"'%(i,name))
    list.append((name,[]))

1:"Postcode"
2:"Borough"
3:"Neighbourhood
"


In [7]:
for e in range(1,len(elements)):
    T=elements[e]
    if len(T)!=3:
        break
    i=0
    for t in T.iterchildren():
        data=t.text_content()
        if i>0:
            try:
                data=int(data)
            except:
                pass
                
        list[i][1].append(data)
        i+=1

In [8]:
[len(x) for (title,x) in list]#the length of the columns#

[288, 288, 288]

In [9]:
dic={title:column for (title,column) in list}
df=pd.DataFrame(dic)

In [78]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [11]:
df['Neighbourhood\n'] = df['Neighbourhood\n'].map(lambda x: str(x)[:-1])

In [154]:
df.rename(columns={'Neighbourhood\n':'Neighbourhood'},inplace=True)

In [14]:
df=df[df.Borough!='Not assigned']#ignore the 'not assigned' in Borough#

In [15]:
df=df.reset_index(drop=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [16]:
df=df.groupby(['Postcode','Borough'], as_index=False).agg(','.join)# merge the Neighbourhood with the same Postcode#

In [17]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [18]:
df[df['Neighbourhood'].str.contains("Not assigned")] #sign the same value with Borough if Neighbourhood is Not assigned#

Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Not assigned


In [19]:
df=df.replace('Not assigned','Queen"s Park')

In [85]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [21]:
df.shape

(103, 3)

# The latitude and the longitude coordinates of each neighborhood.

In [114]:
Latitude=[]#use google map#
Longitude=[]
for index, row in df.iterrows():
    try:
        address=str(row['Postcode'])
        api_key = ""
        api_response = requests.get('https://maps.googleapis.com/maps/api/geocode/json?address={0}&key={1}'.format(address, api_key))
        api_response_dict = api_response.json()
        lat=api_response_dict['results'][0]['geometry']['location']['lat']
        lng=api_response_dict['results'][0]['geometry']['location']['lng']
        Latitude.append(lat)
        Longitude.append(lng)
    except Exception:
        pass      

In [141]:
df['Latitude']=pd.DataFrame(Latitude)
df['Longitude']=pd.DataFrame(Longitude)

In [153]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Clustering neignbourhood

In [157]:
print('The number of Borough is',len(df['Borough'].unique()))
print('The number of Neighbourhood is',df['Neighbourhood'].shape[0])


The number of Borough is 11
The number of Neighbourhood is 103


In [163]:
import folium
# create map of Toronto latitude and longitude
map = folium.Map(location=[43.653908,-79.384293], zoom_start=10)
# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map)  
    
map

In [208]:
# try the Borough of Downtown Toronto#
dt_df=df[df['Borough']=='Downtown Toronto'].reset_index(drop=True)
dt_df

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
1,M4X,Downtown Toronto,"Cabbagetown,St. James Town",43.667967,-79.367675
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
3,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
4,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937
5,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
6,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
7,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
8,M5H,Downtown Toronto,"Adelaide,King,Richmond",43.650571,-79.384568
9,M5J,Downtown Toronto,"Harbourfront East,Toronto Islands,Union Station",43.640816,-79.381752


In [209]:
dt_map = folium.Map(location=[43.653908, -79.384293], zoom_start=10)
# add markers to map
for lat, lng, borough, neighborhood in zip(dt_df['Latitude'], dt_df['Longitude'], dt_df['Borough'],dt_df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(dt_map)  
    
dt_map

In [168]:
# utilizing the Foursquare API#
CLIENT_ID = ''
CLIENT_SECRET = '' 
VERSION = '20180605'

In [182]:
# Use one neighbourhood#
df_df.loc[0,'Neighbourhood']

'Agincourt'

In [171]:
dt_latitude=dt_df.loc[0,'Latitude']
dt_longitude=dt_df.loc[0,'Longitude']
dt_name=dt_df.loc[0,'Neighbourhood']

In [204]:
LIMIT=100
radius=300
url='https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    dt_latitude,
    dt_longitude,
    radius,
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=ZNM2CPSH0J4OJZURLC2JKZMF5HWQXLZ0BHQJU52HHCYPCVES&client_secret=5UJFTLFOTYNYJJQTANLI3KNT0S2EV54MMU2JNCFLFB2B3Q2C&v=20180605&ll=43.679563,-79.377529&radius=300&limit=100'

In [205]:
res=requests.get(url).json()
res

{'meta': {'code': 200, 'requestId': '5d2e32cf429bfc0025a7f60d'},
 'response': {'headerLocation': 'Rosedale',
  'headerFullLocation': 'Rosedale, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.682263002700005,
    'lng': -79.37380262962137},
   'sw': {'lat': 43.6768629973, 'lng': -79.38125537037863}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '506cda8090e77869cff96174',
       'name': 'Park Drive Reservation Lands',
       'location': {'address': '200 Park Dr.',
        'crossStreet': 'at Mt. Pleasant Rd.',
        'lat': 43.679821977132995,
        'lng': -79.3777871131897,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.679821977132995,
          'lng': -79.3777871131897}],
   

In [194]:
def get_category_type(row):
    try:
        categories_list=row['categories']
    except:
        categories_list=row['venue.categories']
    if len(categories_list)==0:
        return None
    else:
        return categories_list[0]['name']

In [206]:
venues=json_normalize(res['response']['groups'][0]['items'])
filter_columns=['venue.name','venue.categories','venue.location.lat','venue.location.lng']
venues=venues.loc[:,filter_columns]
venues['venue.categories']=venues.apply(get_category_type,axis=1)


In [207]:
venues

Unnamed: 0,venue.name,venue.categories,venue.location.lat,venue.location.lng
0,Park Drive Reservation Lands,Park,43.679822,-79.377787
1,Mooredale House Pool,Pool,43.678674,-79.379311
2,Mooredale House,Building,43.678631,-79.380091
3,Mooredale Day Camp,Campground,43.678332,-79.380491


In [241]:
# Explore neighborhood in Downtown Toronto
def getVenues(names,latitude,longitude,radius=500):
    venues_list=[]
    for name, lat, lng in zip(names,latitude,longitude):
        url='https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)
       
        res=requests.get(url).json()['response']['groups'][0]['items']
        venues_list.append([(name,
                             lat,
                             lng,
                             r['venue']['name'],
                             r['venue']['location']['lat'],
                             r['venue']['location']['lng'],
                             r['venue']['categories'][0]['name']) for r in res])
    venues=pd.DataFrame([j for i in venues_list for j in i])
    venues.columns=['neighborhood','neighborhood lat','neighborhood lng','venue','venue lat','venue lng','venue category']
    return venues
        

In [242]:
dt_venues=getVenues(dt_df['Neighbourhood'],
                    dt_df['Latitude'],dt_df['Longitude'])

In [243]:
dt_venues.shape

(1293, 7)

In [244]:
dt_venues.head()

Unnamed: 0,neighborhood,neighborhood lat,neighborhood lng,venue,venue lat,venue lng,venue category
0,Rosedale,43.679563,-79.377529,Mooredale House,43.678631,-79.380091,Building
1,Rosedale,43.679563,-79.377529,Rosedale Park,43.682328,-79.378934,Playground
2,Rosedale,43.679563,-79.377529,Whitney Park,43.682036,-79.373788,Park
3,Rosedale,43.679563,-79.377529,Alex Murray Parkette,43.6783,-79.382773,Park
4,Rosedale,43.679563,-79.377529,Milkman's Lane,43.676352,-79.373842,Trail


In [245]:
dt_venues.groupby('neighborhood').count()

Unnamed: 0_level_0,neighborhood lat,neighborhood lng,venue,venue lat,venue lng,venue category
neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide,King,Richmond",100,100,100,100,100,100
Berczy Park,56,56,56,56,56,56
"CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara",16,16,16,16,16,16
"Cabbagetown,St. James Town",44,44,44,44,44,44
Central Bay Street,88,88,88,88,88,88
"Chinatown,Grange Park,Kensington Market",100,100,100,100,100,100
Christie,16,16,16,16,16,16
Church and Wellesley,87,87,87,87,87,87
"Commerce Court,Victoria Hotel",100,100,100,100,100,100
"Design Exchange,Toronto Dominion Centre",100,100,100,100,100,100


In [248]:
print('There are {} uniques categories'.format(len(dt_venues['venue category'].unique())))

There are 207 uniques categories


In [263]:
# Analyze each neighborhood
dt_feature=pd.get_dummies(dt_venues[['venue category']], prefix='',prefix_sep='')
dt_feature['neighborhood']=dt_venues['neighborhood']
col=dt_feature.columns.tolist()
column=col[-1:]+col[:-1]
dt_feature=dt_feature[column]
dt_feature.head()

Unnamed: 0,neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [264]:
dt_feature.shape

(1293, 208)

In [265]:
dt_group=dt_feature.groupby('neighborhood').mean().reset_index()
dt_group

Unnamed: 0,neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,"Adelaide,King,Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.017857,0.0,0.0,0.0,0.0,0.0
2,"CN Tower,Bathurst Quay,Island airport,Harbourf...",0.0,0.0,0.0625,0.0625,0.0625,0.125,0.125,0.125,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Cabbagetown,St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011364,...,0.0,0.0,0.0,0.0,0.011364,0.0,0.0,0.011364,0.0,0.011364
5,"Chinatown,Grange Park,Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.0,0.0,0.0,0.06,0.0,0.05,0.01,0.0,0.0
6,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Church and Wellesley,0.0,0.011494,0.0,0.0,0.0,0.0,0.0,0.0,0.011494,...,0.0,0.0,0.0,0.0,0.0,0.011494,0.011494,0.0,0.011494,0.011494
8,"Commerce Court,Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0
9,"Design Exchange,Toronto Dominion Centre",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,...,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.01,0.0,0.0


In [267]:
# Print each neighborhood with top 5 venues
num_venues=5
for i in dt_group['neighborhood']:
    print('-----'+i+'-----')
    tem=dt_group[dt_group['neighborhood']==i].T.reset_index()
    tem.columns=['venus','freq']
    tem=tem.iloc[1:]
    tem['freq']=tem['freq'].astype(float)
    tem=tem.round({'freq':2})
    tem=tem.sort_values('freq',ascending=False).reset_index(drop=True)
    print(tem.head(num_venues))
    print('\n')
    

-----Adelaide,King,Richmond-----
                 venus  freq
0          Coffee Shop  0.07
1                 Café  0.05
2                  Bar  0.04
3  American Restaurant  0.04
4           Steakhouse  0.04


-----Berczy Park-----
            venus  freq
0     Coffee Shop  0.07
1          Bakery  0.05
2    Cocktail Bar  0.05
3  Farmers Market  0.04
4        Beer Bar  0.04


-----CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara-----
              venus  freq
0    Airport Lounge  0.12
1   Airport Service  0.12
2  Airport Terminal  0.12
3   Harbor / Marina  0.06
4               Bar  0.06


-----Cabbagetown,St. James Town-----
                venus  freq
0         Coffee Shop  0.09
1  Italian Restaurant  0.05
2         Pizza Place  0.05
3              Bakery  0.05
4                 Pub  0.05


-----Central Bay Street-----
                venus  freq
0         Coffee Shop  0.15
1  Italian Restaurant  0.05
2                Café  0.05
3     

In [269]:
# Cluster neighborhood#
from sklearn.cluster import KMeans
k=5
dt_cluster=dt_group.drop('neighborhood',1)
model=KMeans(5,random_state=1).fit(dt_cluster)
model.labels_

array([0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 0, 4, 0, 0, 2, 0, 0, 0], dtype=int32)

In [271]:
dt_df.insert(0,'cluster label',model.labels_)

In [273]:
dt_df

Unnamed: 0,cluster label,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
1,0,M4X,Downtown Toronto,"Cabbagetown,St. James Town",43.667967,-79.367675
2,1,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
3,0,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
4,0,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937
5,0,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
6,3,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
7,0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
8,0,M5H,Downtown Toronto,"Adelaide,King,Richmond",43.650571,-79.384568
9,0,M5J,Downtown Toronto,"Harbourfront East,Toronto Islands,Union Station",43.640816,-79.381752


In [279]:
# visualize the resulting clusters
map_clusters = folium.Map(location=[43.653908, -79.384293], zoom_start=11)

# set color scheme for the clusters
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(dt_df['Latitude'], dt_df['Longitude'], dt_df['Neighbourhood'], dt_df['cluster label']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters