# Importing libraries

In [1]:
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests
import json
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans 

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

!conda install -c conda-forge folium=0.5.0 --yes # folium setup
import folium # map rendering library

#!conda install -c conda-forge geopy --yes # geopy setup
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    altair-3.2.0               |           py36_0         770 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    openssl-1.1.1c             |       h516909a_0         2.1 MB  conda-forge
    ca-certificates-2019.6.16  |       hecc5488_0         145 KB  conda-forge
    certifi-2019.6.16          |           py36_1         149 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.3 MB

The following NEW packages will be 

# Loading Data

In [2]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
toronto_html=pd.read_html(url)
toronto_data=pd.DataFrame(toronto_html[0])
np.shape(toronto_data)

(288, 3)

In [3]:
toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


# Data Cleaning

In [4]:
#drop cells where "Borough" is not assigned
toronto_data=toronto_data[toronto_data['Borough']!='Not assigned'].reset_index(drop=True)
toronto_data.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [5]:
#Regroup rows with similar postcodes
toronto_data=toronto_data.groupby(['Postcode','Borough'])['Neighbourhood'].apply(','.join).reset_index() #we group bu the postcode and Borough and then transform the last column
toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [6]:
#changing 'Not asssigned' Neighbourhoods to Boroughs
not_assigned_index=toronto_data[toronto_data['Neighbourhood']=='Not assigned'].index.values.astype(int)[0] #index list of the rows where neighborhoods are not assigned 
toronto_data.where(toronto_data!='Not assigned',toronto_data['Borough'][not_assigned_index],inplace=True) #we replace these cells with their Borogh values
toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [7]:
np.shape(toronto_data)

(103, 3)

# Adding latitude and longitude to the dataframe

In [8]:
geo_latlng=pd.read_csv('https://cocl.us/Geospatial_data')
geo_latlng.rename(columns={'Postal Code': 'Postcode'},inplace=True) #we rename the postcode column so that we can merge this df with the toronto_data

In [9]:
toronto_data = pd.merge(toronto_data, geo_latlng )
toronto_data.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


## We will work only with boroughs that contain the word Toronto in their name (City Center)

In [10]:
toronto_data=toronto_data[toronto_data['Borough'].str.contains('Toronto')].reset_index()
toronto_data.drop(columns='index',inplace=True)

In [12]:
toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


# Exploring Neighborhoods

### Let's define Foursquare Credentials and Version

In [14]:
CLIENT_ID = 'SZJNCICIIXD5NVK3GKICZSYGTTDU024T51K0OTGTSQKHEWZT' # your Foursquare ID
CLIENT_SECRET = 'V5CNVAB535XIKVWSH1XLUSAX52WF1G33OEM4U40QNA42XB3Q' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: SZJNCICIIXD5NVK3GKICZSYGTTDU024T51K0OTGTSQKHEWZT
CLIENT_SECRET:V5CNVAB535XIKVWSH1XLUSAX52WF1G33OEM4U40QNA42XB3Q


#### Let's create a function that get the top 100 nearby venues within radius of 500m of each Neighborhood

In [39]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    LIMIT=100
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']  #the information is in the items key
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

###  Applying the function on each neighborhood and create a new dataframe called toronto_venues.

In [40]:
toronto_venues = getNearbyVenues(names=toronto_data['Neighbourhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Glen Stewart Ravine,43.6763,-79.294784,Other Great Outdoors
4,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood


### Now we are going to explore the neighborhoods that have the most restaurants because being near your competition may help with business marketing

### let's start by creating a new dataframe of Restaurants from these venues

In [45]:
toronto_restaurants=toronto_venues[toronto_venues['Venue Category'].str.contains('Restaurant')].reset_index(drop=True)
toronto_restaurants.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"The Danforth West,Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant
1,"The Danforth West,Riverdale",43.679557,-79.352188,Mezes,43.677962,-79.350196,Greek Restaurant
2,"The Danforth West,Riverdale",43.679557,-79.352188,Cafe Fiorentina,43.677743,-79.350115,Italian Restaurant
3,"The Danforth West,Riverdale",43.679557,-79.352188,Messini Authentic Gyros,43.677827,-79.350569,Greek Restaurant
4,"The Danforth West,Riverdale",43.679557,-79.352188,7 Numbers,43.677062,-79.353934,Italian Restaurant


### Let's see how many restaurants has each Neighborhood

In [76]:
restaurants_count=toronto_venues.groupby('Neighborhood').count()
restaurants_count

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide,King,Richmond",100,100,100,100,100,100
Berczy Park,57,57,57,57,57,57
"Brockton,Exhibition Place,Parkdale Village",23,23,23,23,23,23
Business Reply Mail Processing Centre 969 Eastern,16,16,16,16,16,16
"CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara",17,17,17,17,17,17
"Cabbagetown,St. James Town",44,44,44,44,44,44
Central Bay Street,85,85,85,85,85,85
"Chinatown,Grange Park,Kensington Market",100,100,100,100,100,100
Christie,16,16,16,16,16,16
Church and Wellesley,86,86,86,86,86,86


#### Now, let's cluster the neighorhoods using the number of restaurants in each neighborhood

In [81]:
kclusters=5 #number of clusters

#first we drop the neighbourhood column to fit the model
kmeans=KMeans(n_clusters=kclusters, random_state=0).fit(restaurants_count)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 4, 3, 3, 3, 1, 0, 0, 3, 0], dtype=int32)

## Let's see which cluster label corresponds to the neighborhoods with the biggest number of restaurants

In [133]:
#restaurants_count.insert(0, 'Cluster labels', kmeans.labels_)
restaurants_clustering=restaurants_count[list(restaurants_count.columns[0:2])]
restaurants_clustering.reset_index(inplace=True)
restaurants_clustering.sort_values(by='Neighborhood Latitude')

Unnamed: 0,Neighborhood,Cluster labels,Neighborhood Latitude
24,"Moore Park,Summerhill East",2,1
28,Roselawn,2,1
22,Lawrence Park,2,3
27,Rosedale,2,5
17,"Forest Hill North,Forest Hill West",2,5
35,The Beaches,2,6
12,Davisville North,2,7
15,"Dovercourt Village,Dufferin",3,15
26,"Parkdale,Roncesvalles",3,15
13,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",3,15


##### We can see that the Cluster 0 corresponds to what we want. It is the clsuter of neughborhoods that will offer the best cometition. Thus, the locations of this cluster are going to be the most interesting for us, as they will be the best places to open a restaurant

## Visualization

#### First Let's create a dataframe with the neighborhoods and their coordinates aswell as their cluster labels using the number of restaurants they have

In [136]:
restaurants_clustering=pd.merge(restaurants_clustering_competition,toronto_restaurants)
restaurants_clustering.head()

Unnamed: 0,Neighborhood,Cluster labels,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Adelaide,King,Richmond",0,43.650571,-79.384568,Rosalinda,43.650252,-79.385156,Vegetarian / Vegan Restaurant
1,"Adelaide,King,Richmond",0,43.650571,-79.384568,Bosk at Shangri-La,43.649023,-79.385826,Asian Restaurant
2,"Adelaide,King,Richmond",0,43.650571,-79.384568,Estiatorio Volos,43.650329,-79.384533,Greek Restaurant
3,"Adelaide,King,Richmond",0,43.650571,-79.384568,Cactus Club Cafe,43.649552,-79.381671,American Restaurant
4,"Adelaide,King,Richmond",0,43.650571,-79.384568,Noodle King,43.651706,-79.383046,Asian Restaurant


In [137]:
##Finding coordinates of Toronto City
address = 'Toronto, Canada'
geolocator = Nominatim(user_agent="canada_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [138]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

#set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]


# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(restaurants_clustering['Neighborhood Latitude'], restaurants_clustering['Neighborhood Longitude'], restaurants_clustering['Neighborhood'], restaurants_clustering['Cluster labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

# The red cluster corresponds to the neighborhoods with the biggest number of restaurants and those location are the best to open a new restaurant based on the aspect of competition