# Project: Battle of the Neighborhoods

This notebook is for the capstone project in the IBM Applied Data Science specialisation.

### Week 1: Setting up the Jupyter notebook

In [1]:
import pandas as pd
import numpy as np

In [2]:
# 1st Assignment - Setup the Jupyter notebook on a hosted service
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


### Week 3: Segmenting and Clustering Neighborhoods in Toronto

#### Part 1: Storing Toronto Neighborhood Information into pandas DataFrame

In [3]:
import requests as req
from bs4 import BeautifulSoup as Soup

print("Imports done")

Imports done


In [4]:
# Fetch the postal-code table from Wikipedia and store it as a '#'-separated file
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
page = req.get(url, timeout=30)  # never wait indefinitely on a stalled connection
page.raise_for_status()          # fail loudly instead of parsing an HTTP error page
res = page.text
soup = Soup(res, 'html.parser')

# Use '#' as the separator since ',' already appears inside the neighborhood lists.
# The table text separates rows with '\n\n\n' and cells with '\n\n'.
csv_data = soup.table.get_text().replace('\n\n\n', '\n').replace('\n\n', '#')[1:]  # Ignore the first separator

# Write the data out as UTF-8 explicitly: pandas reads CSV files as UTF-8 by default,
# while open() would otherwise use the platform's locale encoding.
with open("toronto.csv", "w", encoding="utf-8") as f:
    f.write(csv_data)

In [5]:
# Read the scraped '#'-separated table into a DataFrame
df = pd.read_csv("toronto.csv", header=0, sep='#')

# Remove postal codes whose borough is "Not assigned"
df = df[df["Borough"] != "Not assigned"].reset_index(drop=True)

# Where the neighborhood is "Not assigned", fall back to the borough name.
# The placeholder must be spelled exactly as in the data ("Not assigned", lower-case
# 'a', as used for the borough filter above); comparing against "Not Assigned"
# never matches, so the fallback would silently do nothing.
df["Neighborhood"] = df["Neighborhood"].where(df["Neighborhood"] != "Not assigned", df["Borough"])

df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [6]:
df.shape

(103, 3)

#### Part 2: Fetching & Storing Neighborhood Coordinates into pandas DataFrame

In [7]:
# Read the CSV file of per-postal-code coordinates (Postal Code, Latitude, Longitude)
# into its own DataFrame, separate from df
lat_df = pd.read_csv("https://cocl.us/Geospatial_data")

lat_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [8]:
# Attach the coordinates to each neighborhood row by joining on "Postal Code"
df_by_code = df.set_index("Postal Code")
coords_by_code = lat_df.set_index("Postal Code")

# Left join on the shared index, then turn "Postal Code" back into a regular column
merged_df = df_by_code.join(coords_by_code, how="left").reset_index()

merged_df.head(20)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


#### Part 3: Clustering Neighborhoods in Toronto

Create `new_df` containing only the boroughs with "Toronto" in their name.

In [9]:
# Keep only the rows whose borough name contains "toronto" (case-insensitive)
is_toronto_borough = ["toronto" in borough.lower() for borough in merged_df["Borough"]]
new_df = merged_df[is_toronto_borough].reset_index(drop=True)

new_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [10]:
new_df.shape

(39, 5)

Checking to see how the neighborhoods are distributed amongst the boroughs

In [11]:
new_df.groupby("Borough").count()

Unnamed: 0_level_0,Postal Code,Neighborhood,Latitude,Longitude
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Central Toronto,9,9,9,9
Downtown Toronto,19,19,19,19
East Toronto,5,5,5,5
West Toronto,6,6,6,6


In [12]:
!pip install folium
import folium

print("Imported folium!")

Imported folium!


Show a map of Toronto in which the neighborhoods are colored by borough, so that neighborhoods belonging to the same borough can be told apart.

In [13]:
# Centre the map on the mean latitude/longitude of the selected postal codes
toronto_coords = new_df['Latitude'].mean(), new_df['Longitude'].mean()

# Outline colour used for the markers of each borough
borough_color = dict(zip(
    ["Central Toronto", "Downtown Toronto", "East Toronto", "West Toronto"],
    ["blue", "green", "orange", "purple"],
))

# Base map of Toronto
map_toronto = folium.Map(location=list(toronto_coords), zoom_start=12)

# One circle marker per postal code
marker_columns = new_df[['Latitude', 'Longitude', 'Borough', 'Neighborhood']]
for lat, lng, borough, neighborhoods in marker_columns.itertuples(index=False, name=None):
    # Only the first neighborhood of the postal code is shown, to keep the popup readable
    first_neighborhood = neighborhoods.split(',')[0]
    label = folium.Popup('{}: [{}]'.format(first_neighborhood, borough), parse_html=True)

    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=borough_color[borough],
        fill=True,
        fill_color="grey",
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

Storing FourSquare API credentials in a hidden cell for future use

In [14]:
# The code was removed by Watson Studio for sharing.

Define functions that wrap around Foursquare API calls for ease of use moving forward.

In [15]:
import json

def foursquare_explore(lat, long, query, lim=100, rad=250, timeout=30):
    """
        Call the Foursquare v2 `venues/explore` endpoint around a coordinate.

        Parameters:
            lat, long: latitude / longitude, sent as the `ll` request parameter
            query:     search term, sent as the `query` request parameter
            lim:       value of the `limit` request parameter
            rad:       value of the `radius` request parameter
            timeout:   seconds to wait for the HTTP request before giving up

        Returns the `response` member of the decoded JSON payload.

        Raises ValueError when the payload's `meta.code` is not 200.
        Uses the module-level CLIENT_ID / CLIENT_SECRET credentials.
    """
    url = 'https://api.foursquare.com/v2/venues/explore'

    params = dict(
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET,
        v='20180620',
        ll=f'{lat},{long}',
        query=query,
        limit=lim,
        radius=rad
    )

    # Without a timeout a stalled connection would block the notebook forever
    resp = req.get(url=url, params=params, timeout=timeout)
    data = json.loads(resp.text)

    meta = data.get('meta', {})
    if meta.get('code') != 200:
        # Surface whatever detail the payload carries so failures are diagnosable
        detail = meta.get('errorDetail') or meta.get('errorType') or f"HTTP {resp.status_code}"
        raise ValueError(f"Unable to fetch data from FourSquare API: {detail}")

    return data['response']

In [16]:
def parse_category(row):
    """
        Return the name of the first category of a venue record, or None.

        `row` is any mapping-like object with a `.get` method (dict or pandas
        Series). The category list is looked up under "categories" first and
        under "venue.categories" otherwise.

        Returns None when no category list is present, when it is empty, or
        when the stored value is not a list at all (e.g. NaN / None cells for
        records that lack the field), instead of raising TypeError on `len()`.
    """
    result = row.get("categories")
    if not isinstance(result, (list, tuple)):
        # Key missing or holding a placeholder such as NaN/None: try the flattened key
        result = row.get("venue.categories")

    if not isinstance(result, (list, tuple)) or not result:
        return None

    return result[0]['name']

Let's get a list of restaurants for each neighborhood. We will use this to get data on the distribution of the different types of cuisine available in each neighborhood.

In [17]:
# `pandas.json_normalize` is the public entry point since pandas 1.0; the
# `pandas.io.json.json_normalize` import path is deprecated. Fall back to the old
# location only on pandas versions that predate the top-level function.
try:
    json_normalize = pd.json_normalize
except AttributeError:
    from pandas.io.json import json_normalize


def expand_df_with_categories(df):
    """
        This function will expand the input df such that each row
        generates many rows; 1 row per venue

        For every row, the Foursquare explore endpoint is queried for
        "restaurant" around the row's "Latitude"/"Longitude", and the row is
        repeated once per returned venue with that venue's category name
        (or None) in a new "Categories" column. Rows with no venues produce
        no output rows. The original index labels are kept on the repeats.

        The result is assembled in a single step rather than by appending to
        a DataFrame inside the loop: `DataFrame.append` is quadratic when used
        that way and was removed in pandas 2.0.
    """
    categories_per_row = []

    for _, row in df.iterrows():
        lat = row["Latitude"]
        long = row["Longitude"]

        explore_result = foursquare_explore(lat, long, "restaurant")

        spec = explore_result["groups"][0]["items"]
        explore_df = json_normalize(spec)

        # Iterating rows directly also copes with an empty result set
        categories_per_row.append([parse_category(venue) for _, venue in explore_df.iterrows()])

    # Repeat every input row once per venue found for it (positional, so a
    # non-unique input index is not a problem)
    repeats = np.asarray([len(categories) for categories in categories_per_row], dtype=int)
    ret_df = df.iloc[np.repeat(np.arange(len(df)), repeats)].copy()
    ret_df["Categories"] = [category for categories in categories_per_row for category in categories]

    return ret_df
        

In [18]:
# Create a new df with all the categories added on to it.
# NOTE: this issues one Foursquare API call per row of new_df and takes a long
#     time, so avoid re-running this cell accidentally.
new2_df = expand_df_with_categories(new_df)

In [19]:
print(new2_df.shape)
new2_df.head(10)

(535, 6)


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Categories
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Breakfast Spot
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Food Truck
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Gastropub
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Sandwich Place
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Mediterranean Restaurant
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,Sushi Restaurant
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,Thai Restaurant
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,Café
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,Sandwich Place
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,Burrito Place


In [20]:
# One-hot encode the restaurant category of every venue row
category_frame = new2_df[["Categories"]]
one_hot_df = pd.get_dummies(category_frame, prefix="", prefix_sep="")
one_hot_df["Neighborhood"] = new2_df["Neighborhood"]

# Rotate the last column (Neighborhood) to the front to make it easier to see
cols = one_hot_df.columns.tolist()
cols = cols[-1:] + cols[:-1]
one_hot_df = one_hot_df[cols]

print(one_hot_df.shape)
one_hot_df.head()

(535, 75)


Unnamed: 0,Neighborhood,American Restaurant,Arepa Restaurant,Argentinian Restaurant,Asian Restaurant,BBQ Joint,Bagel Shop,Bakery,Belgian Restaurant,Bistro,...,Snack Place,Soup Place,Steakhouse,Sushi Restaurant,Taco Place,Thai Restaurant,Theme Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wings Joint
0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Average the one-hot columns per neighborhood: each value becomes the share of
# that neighborhood's venues falling in the category
per_neighborhood = one_hot_df.groupby(by=['Neighborhood'])
new3_df = per_neighborhood.mean().reset_index()

print(new3_df.shape)
new3_df.head()

(29, 75)


Unnamed: 0,Neighborhood,American Restaurant,Arepa Restaurant,Argentinian Restaurant,Asian Restaurant,BBQ Joint,Bagel Shop,Bakery,Belgian Restaurant,Bistro,...,Snack Place,Soup Place,Steakhouse,Sushi Restaurant,Taco Place,Thai Restaurant,Theme Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wings Joint
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.038462,0.0,0.0,0.0,...,0.0,0.0,0.0,0.038462,0.0,0.0,0.0,0.0,0.0,0.0
4,Christie,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Let's now cluster these neighborhoods using k-means.

In [22]:
from sklearn.cluster import KMeans

# k-means starts from random centroids: fix the seed so that cluster ids (and the
# colours derived from them on the map) are reproducible across re-runs, and state
# n_init explicitly rather than relying on a version-dependent default.
km = KMeans(n_clusters=6, random_state=0, n_init=10).fit(new3_df.drop(columns="Neighborhood"))

km.labels_[:5]

array([4, 1, 1, 1, 0], dtype=int32)

In [23]:
# Pair every clustered neighborhood with its k-means label, indexed by neighborhood name
new4_df = (
    new3_df[["Neighborhood"]]
    .assign(Cluster=km.labels_)
    .set_index("Neighborhood")
)

print(new4_df.shape)
new4_df.head()

(29, 1)


Unnamed: 0_level_0,Cluster
Neighborhood,Unnamed: 1_level_1
Berczy Park,4
"Brockton, Parkdale Village, Exhibition Place",1
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",1
Central Bay Street,1
Christie,0


In [24]:
# Add cluster data to new_df and store in new5_df.
# If nbhood didn't have results returned from foursquare, set cluster to "6"
# (one past the k-means labels 0..5).
# Series.map looks each neighborhood up in new4_df's index and yields NaN for the
# ones that are absent; this always produces a plain integer column, whereas a
# row-wise apply returning a one-row Series for some rows and a scalar for others
# depends on pandas' result inference.
new5_df = new_df.copy()
new5_df['Cluster'] = new5_df['Neighborhood'].map(new4_df['Cluster']).fillna(6).astype(int)

print(new5_df.shape)
new5_df.head()

(39, 6)


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,1
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,1
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,1
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,1
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,6


Now let's map these neighborhoods and color them based on their cluster.

In [25]:
# Centre the map on the mean latitude/longitude of the selected postal codes
toronto_coords = new_df['Latitude'].mean(), new_df['Longitude'].mean()

# Outline colour per cluster id, looked up by position (ids 0-5 from k-means, 6 = no venue data)
cluster_color = ["blue", "green", "yellow", "orange", "purple", "red", "brown"]  # Needs 7 colors

# Base map of Toronto
map_toronto = folium.Map(location=list(toronto_coords), zoom_start=12)

# One circle marker per postal code, outlined in its cluster's colour
marker_columns = new5_df[['Latitude', 'Longitude', 'Borough', 'Neighborhood', 'Cluster']]
for lat, lng, borough, neighborhoods, cluster in marker_columns.itertuples(index=False, name=None):
    # Only the first neighborhood of the postal code is shown, to keep the popup readable
    first_neighborhood = neighborhoods.split(',')[0]
    label = folium.Popup('{}: [{}]'.format(first_neighborhood, borough), parse_html=True)

    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=cluster_color[cluster],
        fill=True,
        fill_color="white",
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto