### Crawling data from wiki page with Beautifulsoup lib

In [70]:
from bs4 import BeautifulSoup
import requests
import json
import pandas as pd
import numpy as np


def parse_html_table(url):
            req = requests.get(url)
            soup = BeautifulSoup(req.text, 'lxml')
            table = soup.find_all('table')[0] # Grab the first table
            n_columns = 0
            n_rows=0
            column_names = []
    
            # Find number of rows and columns
            # we also find the column titles if we can
            for row in table.find_all('tr'):
                
                # Determine the number of rows in the table
                td_tags = row.find_all('td')
                if len(td_tags) > 0:
                    n_rows+=1
                    if n_columns == 0:
                        # Set the number of columns for our table
                        n_columns = len(td_tags)
                        
                # Handle column names if we find them
                th_tags = row.find_all('th') 
                if len(th_tags) > 0 and len(column_names) == 0:
                    for th in th_tags:
                        column_names.append(str(th.get_text()).replace('\n',''))
        
                                            
            # Safeguard on Column Titles
            if len(column_names) > 0 and len(column_names) != n_columns:
                raise Exception("Column titles do not match the number of columns")
    
            # columns = column_names if len(column_names) > 0 else range(0,n_columns) => operating
            if len(column_names) > 0:
                columns = column_names
            else:
                columns = range(0,n_columns)
                
            df = pd.DataFrame(columns = columns,
                              index= range(0,n_rows))
            row_marker = 0
            for row in table.find_all('tr'):
                column_marker = 0
                columns = row.find_all('td')
                for column in columns:
                    value = str(column.get_text())
                    df.iat[row_marker,column_marker] = value.replace('\n','')
                    column_marker += 1
                if len(columns) > 0:
                    row_marker += 1
                    
            # Convert to float if possible
            for col in df:
                try:
                    df[col] = df[col].astype(float)
                except ValueError:
                    pass
            
            return df   




In [71]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
postal = parse_html_table(url)
postal.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### check a cell has a borough but a not assigned neighborhood, then the neighborhood will be the same as the borough

In [72]:
#check a cell has a borough but a not assigned neighborhood, then the neighborhood will be the same as the borough

def check_cell_assigned(df):
    NA_Borough = 0
    NA_Neighbor = 0
    
    for i, row in df.iterrows():
        if (row[1] != 'Not assigned') & (row[2] == 'Not assigned'):
            row[2] = row[1]
        elif (row[1] == 'Not assigned') & (row[2] == 'Not assigned'):
            NA_Borough +=1
            NA_Neighbor +=1

    return NA_Borough, NA_Neighbor

ans = check_cell_assigned(postal)
ans

(77, 77)

In [125]:
print(postal.shape, len(postal.index))

(287, 3) 287


In [75]:
data = postal[(postal['Borough'] != 'Not assigned')]
data.reset_index(inplace=True)
data = data.drop(['index'], axis=1)
print(data.shape)


(210, 3)


In [76]:
for x in data.columns:
    print(data[x].value_counts())

M9V    8
M8Y    8
M5V    7
M8Z    5
M4V    5
M9B    5
M1V    4
M9R    4
M6M    4
M9C    4
M8X    3
M8V    3
M1P    3
M1T    3
M1M    3
M6K    3
M1K    3
M1E    3
M5R    3
M2J    3
M1L    3
M3H    3
M5T    3
M5H    3
M6L    3
M5J    3
M1C    3
M6J    2
M4T    2
M1N    2
      ..
M7R    1
M4C    1
M7A    1
M9A    1
M9P    1
M4N    1
M5N    1
M1G    1
M6C    1
M5G    1
M9L    1
M3A    1
M2K    1
M2P    1
M4A    1
M3L    1
M4G    1
M7Y    1
M4P    1
M9W    1
M5A    1
M4R    1
M3M    1
M1X    1
M6E    1
M4E    1
M4Y    1
M4M    1
M1W    1
M2N    1
Name: Postcode, Length: 103, dtype: int64
Etobicoke           45
North York          38
Scarborough         37
Downtown Toronto    37
Central Toronto     17
West Toronto        13
York                 9
East Toronto         7
East York            6
Mississauga          1
Name: Borough, dtype: int64
St. James Town                     2
Runnymede                          2
Bloordale Gardens                  1
Cloverdale                         1
Gra

In [77]:
def convert_df(df):
    duplicate = []
    for i, row in df.iterrows():
        for j, r in df.loc[i+1: len(df.index)].iterrows():
            if (row[0] == r[0]) & (row[1] == r[1]) & (row[2] != r[2]):
                row[2] = row[2]+ ',' + r[2]
                duplicate.append(j)
                
                
    df = df.drop(df.index[duplicate], inplace=True)
    #df.reset_index(inplace=True)
    #df = df.drop(['index'], axis=1)
    return df
convert_df(data)
#print(data.shape)
#data.head(50)

In [79]:
geospatial_data = pd.read_csv('C:/Users/TRANTUANVU/Google Drive/Data Science book/Data science course/Course 9/Geospatial_Coordinates.csv')
geospatial_data

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [80]:
data['Latitude'] = "NaN"
data['Longitude'] = "NaN"
data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,,
1,M4A,North York,Victoria Village,,
2,M5A,Downtown Toronto,Harbourfront,,
3,M6A,North York,"Lawrence Heights,Lawrence Manor",,
5,M7A,Downtown Toronto,Queen's Park,,


In [81]:
for row in geospatial_data.iterrows():
    for i, x in data.iterrows():

        if row[1][0] == x[0]:
            #print('yes')
            x[3] = str(row[1][1]) 
            x[4] = str(row[1][2])
data.head(50)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7532586,-79.3296565
1,M4A,North York,Victoria Village,43.7258823,-79.31557159999998
2,M5A,Downtown Toronto,Harbourfront,43.6542599,-79.3606359
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.46476329999999
5,M7A,Downtown Toronto,Queen's Park,43.6623015,-79.3894938
6,M9A,Etobicoke,Islington Avenue,43.6678556,-79.53224240000002
7,M1B,Scarborough,"Rouge,Malvern",43.8066863,-79.19435340000001
9,M3B,North York,Don Mills North,43.7459058,-79.352188
10,M4B,East York,"Woodbine Gardens,Parkview Hill",43.7063972,-79.309937
12,M5B,Downtown Toronto,"Ryerson,Garden District",43.6571618,-79.37893709999999


In [82]:
data['Latitude'] = data['Latitude'].astype(float)
data['Longitude'] = data['Longitude'].astype(float)

In [83]:
data.dtypes

Postcode          object
Borough           object
Neighbourhood     object
Latitude         float64
Longitude        float64
dtype: object

In [84]:
data.apply(lambda x: sum(x.isnull()))

Postcode         0
Borough          0
Neighbourhood    0
Latitude         0
Longitude        0
dtype: int64

In [85]:
data['Borough'].value_counts()

North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
York                 5
East York            5
East Toronto         5
Mississauga          1
Name: Borough, dtype: int64

In [86]:
import geocoder # import geocoder
from geopy.geocoders import Nominatim

address = 'Toronto, ON, Canada'

geolocator = Nominatim(user_agent = 'to_explorer')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto, ON, Canada are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto, ON, Canada are 43.653963, -79.387207.


### Create a map of toronto with neighborhoods superimposed on top

In [87]:
import folium
# create map

toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, long, borough, neighborhood in zip(data['Latitude'], data['Longitude'], data['Borough'], data['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity = 0.7,
        parse_html = False).add_to(toronto)

    
toronto


### using Foursquare API, all venues data were imported into dataframe for toronto

In [88]:
CLIENT_ID = '3L2CWSCV2TK3V23XETBMVJ5URDN3RK5UZJQRPVMKV1DAJW03' 
CLIENT_SECRET = 'NDYIW4SXQKXO3TIZ4CFLX2NFHCOTXC4GSODPDVK3IQGC1D4A'
VERSION = '20190309'

In [89]:
data.loc[0, 'Neighbourhood']

'Parkwoods'

In [90]:
data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
5,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494


In [91]:
neighborhood_latitude = data.loc[0, 'Latitude']
neighborhood_longitude = data.loc[0, 'Longitude']
neighborhood_name = data.loc[0, 'Neighbourhood']

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, neighborhood_latitude, neighborhood_longitude))


Latitude and longitude values of Parkwoods are 43.7532586, -79.3296565.


### Get the top 100 venues that are in Marble Hill within a radius of 500 meters

In [92]:
limit = 100 # limit number of venues returned by Foursquare API
radius = 500 # define radius

url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET,
    VERSION,
    neighborhood_latitude,
    neighborhood_longitude,
    radius,
    limit
)

url

'https://api.foursquare.com/v2/venues/explore?client_id=3L2CWSCV2TK3V23XETBMVJ5URDN3RK5UZJQRPVMKV1DAJW03&client_secret=NDYIW4SXQKXO3TIZ4CFLX2NFHCOTXC4GSODPDVK3IQGC1D4A&v=20190309&ll=43.7532586,-79.3296565&radius=500&limit=100'

In [93]:
# get the request
res = requests.get(url).json()
res

{'meta': {'code': 200, 'requestId': '5e67268f1d67cb001b8af560'},
  'headerLocation': 'Parkwoods - Donalda',
  'headerFullLocation': 'Parkwoods - Donalda, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 3,
  'suggestedBounds': {'ne': {'lat': 43.757758604500005,
    'lng': -79.32343823984928},
   'sw': {'lat': 43.7487585955, 'lng': -79.33587476015072}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4e8d9dcdd5fbbbb6b3003c7b',
       'name': 'Brookbanks Park',
       'location': {'address': 'Toronto',
        'lat': 43.751976046055574,
        'lng': -79.33214044722958,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.751976046055574,
          'lng': -79.33214044722958}],
        'distance': 245,
        'cc': 'CA',
        'c

In [94]:
# extracting the category of the venue

def get_category_type(r):
    try:
        cat_list = r['categories']
    except:
        cat_list = r['venue.categories']
        
    if len(cat_list) == 0:
        return None
    else:
        return cat_list[0]['name']

In [95]:
from pandas.io.json import json_normalize
# cleaning the json and structure it inot pandas dataframe
venues = res['response']['groups'][0]['items']
nearby_venues = json_normalize(venues)

filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))
nearby_venues.head(10)

3 venues were returned by Foursquare.


Unnamed: 0,name,categories,lat,lng
0,Brookbanks Park,Park,43.751976,-79.33214
1,Variety Store,Food & Drink Shop,43.751974,-79.333114
2,TTC stop - 44 Valley Woods,Bus Stop,43.755402,-79.333741


### Explore Neighbourhoods in toronto

In [96]:
def get_nearby_venues(names,latitudes,longitudes, radius = 500):
    venues_lst = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        
        url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID, 
        CLIENT_SECRET,
        VERSION,
        neighborhood_latitude,
        neighborhood_longitude,
        radius,
        limit
        )
        
        res = requests.get(url).json()['response']['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_lst.append([(
            name, lat, lng, item['venue']['name'], 
            item['venue']['location']['lat'], 
            item['venue']['location']['lng'],  
            item['venue']['categories'][0]['name']) for item in res
        ])

        
        nearby_v = pd.DataFrame([item for venues_lst in venues_lst for item in venues_lst])
        
        nearby_v.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_v)

In [97]:
### all venues of each neighbourhood

toronto_venues = get_nearby_venues(names=data['Neighbourhood'], latitudes = data['Latitude'], longitudes = data['Longitude'])
toronto_venues.head(50)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Parkwoods,43.753259,-79.329656,TTC stop - 44 Valley Woods,43.755402,-79.333741,Bus Stop
3,Victoria Village,43.725882,-79.315572,Brookbanks Park,43.751976,-79.33214,Park
4,Victoria Village,43.725882,-79.315572,Variety Store,43.751974,-79.333114,Food & Drink Shop
5,Victoria Village,43.725882,-79.315572,TTC stop - 44 Valley Woods,43.755402,-79.333741,Bus Stop
6,Harbourfront,43.65426,-79.360636,Brookbanks Park,43.751976,-79.33214,Park
7,Harbourfront,43.65426,-79.360636,Variety Store,43.751974,-79.333114,Food & Drink Shop
8,Harbourfront,43.65426,-79.360636,TTC stop - 44 Valley Woods,43.755402,-79.333741,Bus Stop
9,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763,Brookbanks Park,43.751976,-79.33214,Park


In [98]:
# how many venues were returned for each neighborhood
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide,King,Richmond",3,3,3,3,3,3
Agincourt,3,3,3,3,3,3
"Agincourt North,L'Amoreaux East,Milliken,Steeles East",3,3,3,3,3,3
"Albion Gardens,Beaumond Heights,Humbergate,Jamestown,Mount Olive,Silverstone,South Steeles,Thistletown",3,3,3,3,3,3
"Alderwood,Long Branch",3,3,3,3,3,3
"Bathurst Manor,Downsview North,Wilson Heights",3,3,3,3,3,3
Bayview Village,3,3,3,3,3,3
"Bedford Park,Lawrence Manor East",3,3,3,3,3,3
Berczy Park,3,3,3,3,3,3
"Birch Cliff,Cliffside West",3,3,3,3,3,3


In [106]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 3 uniques categories.


### Analyze each neighborhood

In [107]:
#one hot-encoding

toronto_ohe = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

#add neighborhood column back to dataframe
toronto_ohe['Neighborhood'] = toronto_venues['Neighborhood']

#movie neighborhood column to the first column
fixed_columns = [toronto_ohe.columns[-1]] + list(toronto_ohe.columns[:-1])
toronto_ohe = toronto_ohe[fixed_columns]
                 
toronto_ohe.head()

Unnamed: 0,Neighborhood,Bus Stop,Food & Drink Shop,Park
0,Parkwoods,0,0,1
1,Parkwoods,0,1,0
2,Parkwoods,1,0,0
3,Victoria Village,0,0,1
4,Victoria Village,0,1,0


### grouping rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [108]:
toronto_grouped = toronto_ohe.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Bus Stop,Food & Drink Shop,Park
0,"Adelaide,King,Richmond",0.333333,0.333333,0.333333
1,Agincourt,0.333333,0.333333,0.333333
2,"Agincourt North,L'Amoreaux East,Milliken,Steel...",0.333333,0.333333,0.333333
3,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",0.333333,0.333333,0.333333
4,"Alderwood,Long Branch",0.333333,0.333333,0.333333
5,"Bathurst Manor,Downsview North,Wilson Heights",0.333333,0.333333,0.333333
6,Bayview Village,0.333333,0.333333,0.333333
7,"Bedford Park,Lawrence Manor East",0.333333,0.333333,0.333333
8,Berczy Park,0.333333,0.333333,0.333333
9,"Birch Cliff,Cliffside West",0.333333,0.333333,0.333333


In [109]:
#print each neighborhood along with the top 3 most common venues
num_top_venues = 3

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','frequency']
    temp = temp.iloc[1:]
    temp['freq'] = temp['frequency'].astype(float)
    temp = temp.round({'frequency': 2})
    print(temp.sort_values('frequency', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide,King,Richmond----
               venue frequency      freq
0           Bus Stop  0.333333  0.333333
1  Food & Drink Shop  0.333333  0.333333
2               Park  0.333333  0.333333


----Agincourt----
               venue frequency      freq
0           Bus Stop  0.333333  0.333333
1  Food & Drink Shop  0.333333  0.333333
2               Park  0.333333  0.333333


----Agincourt North,L'Amoreaux East,Milliken,Steeles East----
               venue frequency      freq
0           Bus Stop  0.333333  0.333333
1  Food & Drink Shop  0.333333  0.333333
2               Park  0.333333  0.333333


----Albion Gardens,Beaumond Heights,Humbergate,Jamestown,Mount Olive,Silverstone,South Steeles,Thistletown----
               venue frequency      freq
0           Bus Stop  0.333333  0.333333
1  Food & Drink Shop  0.333333  0.333333
2               Park  0.333333  0.333333


----Alderwood,Long Branch----
               venue frequency      freq
0           Bus Stop  0.333333  0.333333
1 

               venue frequency      freq
0           Bus Stop  0.333333  0.333333
1  Food & Drink Shop  0.333333  0.333333
2               Park  0.333333  0.333333


----First Canadian Place,Underground city----
               venue frequency      freq
0           Bus Stop  0.333333  0.333333
1  Food & Drink Shop  0.333333  0.333333
2               Park  0.333333  0.333333


----Flemingdon Park,Don Mills South----
               venue frequency      freq
0           Bus Stop  0.333333  0.333333
1  Food & Drink Shop  0.333333  0.333333
2               Park  0.333333  0.333333


----Forest Hill North,Forest Hill West----
               venue frequency      freq
0           Bus Stop  0.333333  0.333333
1  Food & Drink Shop  0.333333  0.333333
2               Park  0.333333  0.333333


----Glencairn----
               venue frequency      freq
0           Bus Stop  0.333333  0.333333
1  Food & Drink Shop  0.333333  0.333333
2               Park  0.333333  0.333333


----Guildwood,Morningsi

               venue frequency      freq
0           Bus Stop  0.333333  0.333333
1  Food & Drink Shop  0.333333  0.333333
2               Park  0.333333  0.333333


----Studio District----
               venue frequency      freq
0           Bus Stop  0.333333  0.333333
1  Food & Drink Shop  0.333333  0.333333
2               Park  0.333333  0.333333


----The Annex,North Midtown,Yorkville----
               venue frequency      freq
0           Bus Stop  0.333333  0.333333
1  Food & Drink Shop  0.333333  0.333333
2               Park  0.333333  0.333333


----The Beaches----
               venue frequency      freq
0           Bus Stop  0.333333  0.333333
1  Food & Drink Shop  0.333333  0.333333
2               Park  0.333333  0.333333


----The Beaches West,India Bazaar----
               venue frequency      freq
0           Bus Stop  0.333333  0.333333
1  Food & Drink Shop  0.333333  0.333333
2               Park  0.333333  0.333333


----The Danforth West,Riverdale----
          

In [110]:
# sort the venues in descending order.
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [111]:
# create the new dataframe and display the top 3 venues for each neighborhood.

num_top_venues = 3

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,"Adelaide,King,Richmond",Park,Food & Drink Shop,Bus Stop
1,Agincourt,Park,Food & Drink Shop,Bus Stop
2,"Agincourt North,L'Amoreaux East,Milliken,Steel...",Park,Food & Drink Shop,Bus Stop
3,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",Park,Food & Drink Shop,Bus Stop
4,"Alderwood,Long Branch",Park,Food & Drink Shop,Bus Stop


### Clustering neighborhood

In [112]:
# import k-means from clustering stage
from sklearn.cluster import KMeans 

# set number of clusters
kclusters = 3

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

  return_n_iter=True)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

### creating a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [113]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

data.rename(columns={'Neighbourhood':'Neighborhood'}, inplace = True)

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = data.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
neighborhoods_venues_sorted.head()


Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,0,"Adelaide,King,Richmond",Park,Food & Drink Shop,Bus Stop
1,0,Agincourt,Park,Food & Drink Shop,Bus Stop
2,0,"Agincourt North,L'Amoreaux East,Milliken,Steel...",Park,Food & Drink Shop,Bus Stop
3,0,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",Park,Food & Drink Shop,Bus Stop
4,0,"Alderwood,Long Branch",Park,Food & Drink Shop,Bus Stop


In [117]:
import matplotlib.cm as cm
import matplotlib.colors as colors
# create map
map_clusters = folium.Map(location = [latitude, longitude], zoom_start=10)

#set color scheme for the clusters

x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0,1,len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []

for lat, lng, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood']
                                  , toronto_merged['Cluster Labels']):
    
    label = folium.Popup(str(poi) + 'Cluster' + str(cluster), parse_html=True)
    folium.CircleMarker([lat, lng],
                       radius = 5,
                       popup=label,
                       color=rainbow[cluster-1],
                       fill=True,
                      fill_color=rainbow[cluster-1],
                       fill_opacity = 0.7).add_to(map_clusters)
    
map_clusters

### Cluster

In [121]:
# Cluster 1
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,North York,0,Park,Food & Drink Shop,Bus Stop
1,North York,0,Park,Food & Drink Shop,Bus Stop
2,Downtown Toronto,0,Park,Food & Drink Shop,Bus Stop
3,North York,0,Park,Food & Drink Shop,Bus Stop
5,Downtown Toronto,0,Park,Food & Drink Shop,Bus Stop
6,Etobicoke,0,Park,Food & Drink Shop,Bus Stop
7,Scarborough,0,Park,Food & Drink Shop,Bus Stop
9,North York,0,Park,Food & Drink Shop,Bus Stop
10,East York,0,Park,Food & Drink Shop,Bus Stop
12,Downtown Toronto,0,Park,Food & Drink Shop,Bus Stop
