# Finding the best neighborhood for an Italian restaurant in Toronto

In [40]:
import pandas as pd
import numpy as np
from urllib.request import urlopen
from bs4 import BeautifulSoup
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn import preprocessing

## 1. Data Scraping

In [2]:
# Load table in Wikipedia page in pandas dataframe with BeautifulSoup
toronto_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = urlopen(toronto_url)
soup = BeautifulSoup(page, 'html.parser')
table = soup.find_all('table')[0] 
df = pd.read_html(str(table), header = 0)[0]
print('Dataframe size: {} rows and {} columns'.format(df.shape[0], df.shape[1]))
df.head()

Dataframe size: 289 rows and 3 columns


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## 2. Data Cleaning

In [3]:
# Drop all entries where Borough is not assigned
df = df[df['Borough'] != 'Not assigned']

# When Neighbourhood is not assigned, replace it with Borough
for index, row in df.iterrows():
    if (row['Neighbourhood'] == 'Not assigned'):
        df.loc[index, 'Neighbourhood'] = row['Borough']

# Combine all entries with the same postcode 
df_unique = df.groupby('Postcode', as_index = False).first()
for index, row in df_unique.iterrows():
    code = row['Postcode']    
    neighbourhoods = df.loc[df['Postcode'] == code]
    neighbourhoods_list = neighbourhoods['Neighbourhood'].values.tolist()    
    df_unique.loc[index, 'Neighbourhood'] = ', '.join(neighbourhoods_list)   
df = df_unique   

## 3. Get Location Data

In [4]:
import requests

In [5]:
# Get longitude and latitude from url
url = 'http://cocl.us/Geospatial_data'
page = urlopen(url)
location_df=pd.read_csv(page)
location_df.head()

# Iterate through all rows and get latitude and longitude of the postcode
for index, row in df.iterrows():   
    code = row['Postcode']
    location = location_df.loc[location_df['Postal Code'] == code]     
    df.loc[index, 'Latitude'] = location.values[0][1]
    df.loc[index, 'Longitude'] = location.values[0][2]

df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## 4. Visualize neighbourhoods of Toronto

In [6]:
# Import Folium library
!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library

Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    cryptography-2.4.2         |   py36h1ba5d50_0         618 KB
    openssl-1.1.1a             |    h14c3975_1000         4.0 MB  conda-forge
    libarchive-3.3.3           |       h5d8350f_5         1.5 MB
    grpcio-1.16.1              |   py36hf8bcb03_1         1.1 MB
    conda-4.6.2                |           py36_0         869 KB  conda-forge
    libssh2-1.8.0              |                1         239 KB  conda-forge
    python-3.6.8               |       h0371630_0        34.4 MB
    ------------------------------------------------------------
                                           Total:        42.6 MB

The following packages will be UPDATED:

    conda:        4.5.12-py36_1000         conda

In [7]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[43.719, -79.410], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

## 5. Get Foursquare data

In [8]:
# Define Foursquare credentials
CLIENT_ID = 'WDCQ3YMWUODKYTGPRXJVNCA1JMPW3XKYBNDYRH2JW5UHTEDR' # your Foursquare ID
CLIENT_SECRET = 'FQH0QVVLMNXGOS3JGGBVZ50Y1BYAVNE41C05WXYZQWOYDUFN' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: WDCQ3YMWUODKYTGPRXJVNCA1JMPW3XKYBNDYRH2JW5UHTEDR
CLIENT_SECRET:FQH0QVVLMNXGOS3JGGBVZ50Y1BYAVNE41C05WXYZQWOYDUFN


In [82]:
# Function to get the best venues in the neighbourhood
def getNearbyVenues(names, codes, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, code, lat, lng in zip(names, codes, latitudes, longitudes):
        #print(name)
        LIMIT = 200    
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name,
            code,
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood',
                             'Postal Code',
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [83]:
toronto_venues = getNearbyVenues(names=df['Neighbourhood'],
                                 codes=df['Postcode'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude'],                                     
                                  )
toronto_venues.head()


Unnamed: 0,Neighborhood,Postal Code,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge, Malvern",M1B,43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Highland Creek, Rouge Hill, Port Union",M1C,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
3,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
4,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711,Big Bite Burrito,43.766299,-79.19072,Mexican Restaurant


In [84]:
# Check for every venue if it is a Italian restaurant or restaurant in general
for index, row in toronto_venues.iterrows():
    if (row['Venue Category'] == 'Pizza Place' or row['Venue Category'] == 'Italian Restaurant'):
        toronto_venues.loc[index, 'Is Italian'] = 1
    else:
        toronto_venues.loc[index, 'Is Italian'] = 0
        
for index, row in toronto_venues.iterrows():
    if (row['Venue Category'].find('Restaurant') != -1 
        or row['Venue Category'] == 'Fried Chicken Joint' 
        or row['Venue Category'] == 'Noodle House'
        or row['Venue Category'] == 'Burger Joint'
        or row['Venue Category'] == 'Food Court'
        or row['Venue Category'] == 'Burrito Place'
        or row['Venue Category'] == 'Wings Joint'
        or row['Venue Category'] == 'Steakhouse'
        or row['Venue Category'] == 'Cafeteria'
        or row['Venue Category'] == 'Food Truck'
        or row['Venue Category'] == 'Fish & Chips Shop'
        or row['Venue Category'] == 'Taco Place'
        or row['Venue Category'] == 'Salad Place'
        or row['Venue Category'] == 'BBQ Joint'
        or row['Venue Category'] == 'Soup Place'
        or row['Venue Category'] == 'Food'
        or row['Venue Category'] == 'Sandwich Place' ):
        toronto_venues.loc[index, 'Is Restaurant'] = 1
    else:
        toronto_venues.loc[index, 'Is Restaurant'] = 0
        
toronto_venues.head()
            

Unnamed: 0,Neighborhood,Postal Code,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Is Italian,Is Restaurant
0,"Rouge, Malvern",M1B,43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant,0.0,1.0
1,"Highland Creek, Rouge Hill, Port Union",M1C,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar,0.0,0.0
2,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place,1.0,0.0
3,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store,0.0,0.0
4,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711,Big Bite Burrito,43.766299,-79.19072,Mexican Restaurant,0.0,1.0


In [97]:
# Group by neighborhood and count number of (Italian) restaurants 
venues_grouped = toronto_venues.groupby(['Neighborhood'], as_index = False).sum()
venues_grouped['Postal Code'] = 0

# Drop location of venue
venues_grouped.drop(['Venue Latitude'], inplace = True, axis = 1)
venues_grouped.drop(['Venue Longitude'], inplace = True, axis = 1)

# Replace neighborhood with original values
for index, row in venues_grouped.iterrows():
    venues_grouped.loc[index, 'Neighborhood Latitude'] = df.loc[df['Neighbourhood'] == venues_grouped.loc[index, 'Neighborhood'], 'Latitude'].values
    venues_grouped.loc[index, 'Neighborhood Longitude'] = df.loc[df['Neighbourhood'] == venues_grouped.loc[index, 'Neighborhood'], 'Longitude'].values
    venues_grouped.loc[index, 'Postal Code'] = df.loc[df['Neighbourhood'] == venues_grouped.loc[index, 'Neighborhood'], 'Postcode'].values
    
venues_grouped.rename(columns= {'Is Italian' : 'Italian Restaurants', 'Is Restaurant' : 'Restaurants'}, inplace = True)
venues_grouped.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Italian Restaurants,Restaurants,Postal Code
0,"Adelaide, King, Richmond",43.650571,-79.384568,2.0,37.0,M5H
1,Agincourt,43.7942,-79.262029,0.0,2.0,M1S
2,"Agincourt North, L'Amoreaux East, Milliken, St...",43.815252,-79.284577,0.0,0.0,M1V
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437,1.0,4.0,M9V
4,"Alderwood, Long Branch",43.602414,-79.543484,2.0,1.0,M8W


## 6. Cluster neighborhoods

In [98]:
# Define input data for the clustering algorithm 
# The location is also taken into account because people from adjacent neighborhoods are also prone to visit the restaurants
X = venues_grouped[['Neighborhood Latitude', 'Neighborhood Longitude', 'Italian Restaurants', 'Restaurants']]

scaler = preprocessing.StandardScaler()
scaled_df = scaler.fit_transform(X)

# Run K_mean algorithm with k = 5
k = 5
kmeans = KMeans(n_clusters=k).fit(X)
labels = kmeans.labels_

for index, row in venues_grouped.iterrows():
    venues_grouped.loc[index, 'Label'] = labels[index]  
    
# Show characteristics of every cluster
venues_grouped.groupby(['Label']).mean()     



Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Italian Restaurants,Restaurants
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,43.715968,-79.399869,0.275862,0.775862
1.0,43.652044,-79.379632,4.4,29.0
2.0,43.684875,-79.390244,3.0,15.0
3.0,43.701346,-79.403513,0.894737,5.894737
4.0,43.653253,-79.385975,2.2,37.4


As can be seen from the table, the best neighborhood for an Italian restaurant should be in cluster 4 because it has the highest number of restaurants, but still one of the lowest numbers of Italian Restaurants

## 8. Get best neighborhoods 

In [107]:
# Add labels to dataframe
venues_grouped['Label'] = labels
cluster4 = venues_grouped.loc[venues_grouped['Label'] == 4]
cluster4

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Italian Restaurants,Restaurants,Postal Code,Label
0,"Adelaide, King, Richmond",43.650571,-79.384568,2.0,37.0,M5H,4
20,"Chinatown, Grange Park, Kensington Market",43.653206,-79.400049,2.0,41.0,M5T,4
22,Church and Wellesley,43.66586,-79.38316,1.0,35.0,M4Y,4
27,"Commerce Court, Victoria Hotel",43.648198,-79.379817,4.0,36.0,M5L,4
43,"First Canadian Place, Underground city",43.648429,-79.38228,2.0,38.0,M5X,4


As can be seen, an above average number of restaurants can be found in these neighborhoods. Still, only few of them are Italian Restsaurants

## 9. Visualize clustered neighborhoods on the map

In [108]:
# create map
map_clusters = folium.Map(location=[43.719, -79.410], zoom_start=11)

# set color scheme for the clusters
kclusters = 5
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(venues_grouped['Neighborhood Latitude'], venues_grouped['Neighborhood Longitude'], venues_grouped['Neighborhood'], venues_grouped['Label']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(float(cluster)) -1],
        fill=True,
        fill_color=rainbow[int(float(cluster))-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters