In [2]:

# !conda install -c conda-forge folium=0.5.0 --yes
# !conda install -c conda-forge geopy --yes
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows',500)
import json
import requests #
from pandas import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim
import folium # map rendering library
# !pip install lxml
import lxml
import time
print('Done!')

Done!


### Get Data from WiKi

In [305]:
# Get data from wiki
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
data = pd.read_html(url)

In [306]:
def quick_report(df_use,time_s):
    print('Time: {:.2f}\nShape: {}' .format(time.time()-time_s, df_use.shape))

In [317]:
ts=time.time()
df  = pd.DataFrame(data[0]) # new dataframe
df.dropna(axis=0, inplace=True) # Drop all NaN values from dataframe's rows - it's empty Neighborhood and 'Not assigned'Borough
df.reset_index(inplace=True) # reset index
df.drop(columns='index',inplace=True) # delete old index column
print(df.head())
quick_report(df,ts)

  Postal Code           Borough                                 Neighborhood
0         M3A        North York                                    Parkwoods
1         M4A        North York                             Victoria Village
2         M5A  Downtown Toronto                    Regent Park, Harbourfront
3         M6A        North York             Lawrence Manor, Lawrence Heights
4         M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government
Time: 0.02
Shape: (103, 3)


### Get Data Location from Nominatim

In [318]:
# create DataFrame from csv file with geopoints oh PostalCode 
adr='https://cocl.us/Geospatial_data'
adr_df=pd.read_csv(adr)
adr_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [319]:
ts=time.time()
# Combine both DF by 'Postal Code'
df=pd.merge(df,adr_df, on ='Postal Code', how = 'left')
print(quick_report(df,ts))
df.head()

Time: 0.04
Shape: (103, 5)
None


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [302]:
# Doesn't work GEOSERVICE
# geolocator = Nominatim(user_agent="foursquare_agent")
# df['Latitude']=float(0)
# df['Longitude']=float(0)
# for i in range(0, df.shape[0]):   
#     address =df['Postal Code'][i] + ', Canada, Toronto'
#     location = geolocator.geocode(address)
#     df.iloc[i,3]= location.latitude
#     df.iloc[i,4] =  location.longitude
#     print('The geograpical coordinate of {} are {}, {}.'.format(address, df.iloc[i,3], df.iloc[i,4]))

#### Create map of Toronto with Postal Code

In [None]:
address = 'Toronto, Canada'
geolocator = Nominatim(user_agent="toronto")
location = geolocator.geocode(address)
latitude,longitude = location.latitude,location.longitude

In [326]:
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=11)
# add markers Postal Code to map
for lat, lng, borough, postcode in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Postal Code']):
    label = '{}, {}'.format(postcode, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=6,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.65,
        parse_html=False).add_to(toronto_map)  
    
toronto_map

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


#### Ger more information about Toronto from Foursquare service

In [340]:
CLIENT_ID = '' # your Foursquare ID (deleted)
CLIENT_SECRET = '' # your Foursquare Secret (deleted)
VERSION = '20180605' # Foursquare API version

#### let's explore the top 20 venues that are around each Postal Code within a radius of 1000 meters.

In [616]:
LIMIT = 20
radius = 1000

In [617]:
# Function that will gathering information about category and venues
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
#         print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Postal Code', 
                  'Postal Code Latitude', 
                  'Postal Code Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [618]:
# Run and get info
ts=time.time()
toronto_venues = getNearbyVenues(names=df['Postal Code'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )
quick_report(df,ts)

Time: 19.49
Shape: (103, 5)


In [619]:
toronto_venues.shape

(1068, 7)

#### Let's look at dataframe and how much we have unique category  

In [620]:
print(toronto_venues.shape)
print(toronto_venues.head())
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

(1068, 7)
  Postal Code  Postal Code Latitude  Postal Code Longitude  \
0         M3A             43.753259             -79.329656   
1         M3A             43.753259             -79.329656   
2         M3A             43.753259             -79.329656   
3         M3A             43.753259             -79.329656   
4         M4A             43.725882             -79.315572   

                               Venue  Venue Latitude  Venue Longitude  \
0                    Brookbanks Park       43.751976       -79.332140   
1                     TTC stop #8380       43.752672       -79.326351   
2                      Variety Store       43.751974       -79.333114   
3  Corrosion Service Company Limited       43.752432       -79.334661   
4             Victoria Village Arena       43.723481       -79.315635   

               Venue Category  
0                        Park  
1                    Bus Stop  
2           Food & Drink Shop  
3  Construction & Landscaping  
4                H

## Analyze Each Postal Code

In [621]:
# one hot encoding and resize the dataframe by mean function
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add Postal Code column back to dataframe
toronto_onehot['Postal Code'] = toronto_venues['Postal Code']
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_grouped = toronto_onehot.groupby('Postal Code').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Postal Code,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Workshop,BBQ Joint,Baby Store,Bagel Shop,Bakery,Bank,Bar,Baseball Field,Basketball Court,Basketball Stadium,Beer Bar,Beer Store,Belgian Restaurant,Bike Shop,Bistro,Boat or Ferry,Bookstore,Boutique,Breakfast Spot,Brewery,Bridal Shop,Bubble Tea Shop,Burger Joint,Burrito Place,Bus Line,Bus Station,Bus Stop,Business Service,Butcher,Café,Cajun / Creole Restaurant,Camera Store,Candy Store,Caribbean Restaurant,Cheese Shop,Chinese Restaurant,...,Print Shop,Pub,Ramen Restaurant,Rental Car Location,Restaurant,River,Salad Place,Salon / Barbershop,Sandwich Place,Sculpture Garden,Seafood Restaurant,Shopping Mall,Shopping Plaza,Skate Park,Skating Rink,Smoke Shop,Smoothie Shop,Snack Place,Soccer Field,Spa,Speakeasy,Sporting Goods Shop,Sports Bar,Stadium,Stationery Store,Steakhouse,Summer Camp,Supermarket,Supplement Shop,Sushi Restaurant,Swim School,Tailor Shop,Taiwanese Restaurant,Tanning Salon,Tea Room,Thai Restaurant,Theater,Theme Restaurant,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.111111,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [622]:
num_top_venues = 10

for PostalCode in toronto_grouped['Postal Code']:
#     print("----"+PostalCode+"----")
    temp = toronto_grouped[toronto_grouped['Postal Code'] == PostalCode].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
#     print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
#     print('\n')

In [623]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:num_top_venues]

In [624]:
indicators = ['st', 'nd', 'rd']
# create columns according to number of top venues
columns = ['Postal Code']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
postal_venues_sorted = pd.DataFrame(columns=columns)
postal_venues_sorted['Postal Code'] = toronto_grouped['Postal Code']

for ind in np.arange(toronto_grouped.shape[0]):
    postal_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)


### Create K-Means Model and draw segments on map

In [625]:
kclusters = 9
toronto_grouped_clustering = toronto_grouped.drop('Postal Code', 1)
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)
# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([6, 7, 6, 2, 0, 8, 2, 0, 6, 2, 6, 0, 6, 6, 8, 6, 6, 6, 2, 2, 4, 6,
       0, 2, 2, 6, 2, 3, 0, 6, 6, 6, 6, 6, 6, 2, 6, 4, 6, 6, 2, 0, 6, 2,
       2, 3, 2, 3, 2, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 6, 2, 2, 2, 6,
       2, 2, 6, 6, 6, 3, 2, 0, 6, 2, 0, 2, 4, 2, 2, 2, 2, 2, 6, 2, 6, 3,
       1, 6, 2, 6, 1, 4, 6, 6, 6, 7], dtype=int32)

In [626]:
# add clustering labels
postal_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(postal_venues_sorted.set_index('Postal Code'), on='Postal Code')

# we have Postal Codfe without POI :(
toronto_merged['Cluster Labels'].replace(np.nan, kclusters, inplace=True)
toronto_merged.replace(np.nan, 'None', inplace=True)
toronto_merged['Cluster Labels'].value_counts()

In [628]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Postal Code'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Some comments about cluster that we have
#### Cluster 0 - Social places
#### Cluster 1 - Sport (Only Baseball Field)
#### Cluster 2 - Relax (Coffee Shop , Fitness)
#### Cluster 3 - Park + Airport
#### Cluster 4 - Park + Shop
#### Cluster 5 - One Garden
#### Cluster 6 - Food court
#### Cluster 7 - Drugstore and Bar
#### Cluster 8 - Playground
#### Cluster 9 - None POI

In [631]:
for i in range (0,kclusters+1):
    print('The Cluster is {}\nThe count of POI for this cluster is {} \nThe popular places in this points are:\n\n{}\n\n'.format(i,toronto_merged[toronto_merged['Cluster Labels'] == i].Borough.count(), 
                              toronto_merged[toronto_merged['Cluster Labels'] == i]['1st Most Common Venue'].value_counts(normalize=True).head(5)))
    print(toronto_merged.loc[toronto_merged['Cluster Labels'] == i, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]-9))]].head(5),'\n\n')    

The Cluster is 0
The count of POI for this cluster is 8 
The popular places in this points are:

Park             0.250
Bus Line         0.125
Bank             0.125
Grocery Store    0.125
Bus Stop         0.125
Name: 1st Most Common Venue, dtype: float64


         Borough  Cluster Labels 1st Most Common Venue
0     North York             0.0              Bus Stop
26   Scarborough             0.0                  Bank
31  West Toronto             0.0              Pharmacy
44   Scarborough             0.0              Bus Line
46    North York             0.0         Grocery Store 


The Cluster is 1
The count of POI for this cluster is 2 
The popular places in this points are:

Baseball Field    1.0
Name: 1st Most Common Venue, dtype: float64


        Borough  Cluster Labels 1st Most Common Venue
57   North York             1.0        Baseball Field
101   Etobicoke             1.0        Baseball Field 


The Cluster is 2
The count of POI for this cluster is 39 
The popular places in