## IBM Coursera Capstone
This notebook contains my code for the final project of the Applied Data Science Course

In [2]:
import pandas as pd
import numpy as np

In [3]:
from bs4 import BeautifulSoup
import requests

### This iQuestion 1. 

I used both Pandas and BeautifulSoup to retrieve the table

In [4]:
# Reading table using Pandas
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
tables = pd.read_html(url)

In [5]:
tables[0].head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [6]:
# I also practiced web scraping using Beautiful Soup
headers = {'User-Agent': 'Mozilla/5.0'}

In [7]:
response = requests.get(url, headers = headers)

In [8]:
#Check status code to ensure it has properly connected to web page 200 is connected
response.status_code

200

In [9]:
soup = BeautifulSoup(response.content, 'html.parser')


In [10]:
#Found table name by using inspect on webpage
suburb_table = soup.find_all('table', class_ ='wikitable sortable')

In [11]:
#This will grab all tables on page
len(suburb_table)

1

In [12]:
suburb_table = suburb_table[0]

In [13]:
#This ensures I have the correct type
type(suburb_table)

bs4.element.Tag

In [14]:
#Loops through tr (Rows) and td (elements) and saves each to list. Then puts the list into a dataframe
l = []
table_rows = suburb_table.find_all('tr')

for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text.strip() for tr in td if tr.text.strip()]
    if row:
        l.append(row)
    
df = pd.DataFrame(l, columns=["Postcode", "Borough", "Neighborhood"])

In [15]:
# Drops values with 'Not assigned' 
df.drop(df[df['Neighborhood'] == 'Not assigned'].index, inplace=True)


In [16]:
df = df.groupby(['Postcode', 'Borough'])['Neighborhood'].apply(', '.join).reset_index()


In [17]:
df.shape

(102, 3)

### Question 2

I used the csv file and merged in the data

In [19]:
# The code was removed by Watson Studio for sharing.

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [20]:
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [21]:
# Merge the two dataframes
toronto_df = pd.merge(df, df_data_0, left_on='Postcode', right_on='Postal Code')

In [22]:
# Drop the extra Postal code column
toronto_df.drop('Postal Code', axis=1, inplace=True)

In [23]:
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### Question 3

In [24]:
import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    ca-certificates-2019.9.11  |       hecc5488_0         144 KB  conda-forge
    certifi-2019.9.11          |           py36_0         147 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1.20.0-py_0       conda-forge

The following packages will be UPDATED:

    cer

In [107]:
# The code was removed by Watson Studio for sharing.

In [88]:
# This function will loop through neighborhoods and make a request to Foursquare for nearby venues 
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

I have chosen the most populated and least populated postcodes to compare.

In [42]:
hl = ['M1B', 'M2N', 'M1V', 'M9V', 'M2J', 'M5K', 'M5L', 'M5W', 'M5X', 'M7A'] 
high_low = toronto_df[toronto_df['Postcode'].isin(hl)]


In [52]:
high_low_venue = getNearbyVenues(names=high_low['Neighborhood'],
                                   latitudes=high_low['Latitude'],
                                   longitudes=high_low['Longitude']
                                  )

Rouge, Malvern
Agincourt North, L'Amoreaux East, Milliken, Steeles East
Fairview, Henry Farm, Oriole
Willowdale South
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Stn A PO Boxes 25 The Esplanade
First Canadian Place, Underground city
Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown


In [53]:
high_low_venue.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Agincourt North, L'Amoreaux East, Milliken, Steeles East",2,2,2,2,2,2
"Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown",8,8,8,8,8,8
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
"Design Exchange, Toronto Dominion Centre",100,100,100,100,100,100
"Fairview, Henry Farm, Oriole",64,64,64,64,64,64
"First Canadian Place, Underground city",100,100,100,100,100,100
"Rouge, Malvern",1,1,1,1,1,1
Stn A PO Boxes 25 The Esplanade,99,99,99,99,99,99
Willowdale South,34,34,34,34,34,34


In [89]:
# one hot encoding
toronto_onehot = pd.get_dummies(high_low_venue[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = high_low_venue['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,Antique Shop,Art Gallery,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Bagel Shop,Bakery,Bank,...,Theater,Toy / Game Store,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store
0,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Agincourt North, L'Amoreaux East, Milliken, St...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Fairview, Henry Farm, Oriole",0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,"Fairview, Henry Farm, Oriole",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [90]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,American Restaurant,Antique Shop,Art Gallery,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Bagel Shop,Bakery,Bank,...,Theater,Toy / Game Store,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store
0,"Agincourt North, L'Amoreaux East, Milliken, St...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0
2,"Commerce Court, Victoria Hotel",0.04,0.0,0.01,0.0,0.01,0.0,0.0,0.03,0.0,...,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.01,0.0,0.0
3,"Design Exchange, Toronto Dominion Centre",0.04,0.0,0.01,0.0,0.02,0.0,0.0,0.02,0.0,...,0.01,0.0,0.01,0.01,0.0,0.0,0.0,0.01,0.0,0.0
4,"Fairview, Henry Farm, Oriole",0.015625,0.0,0.0,0.0,0.015625,0.0,0.0,0.03125,0.015625,...,0.015625,0.015625,0.0,0.0,0.015625,0.0,0.0,0.0,0.015625,0.0625
5,"First Canadian Place, Underground city",0.03,0.0,0.01,0.0,0.03,0.0,0.0,0.03,0.0,...,0.01,0.0,0.01,0.01,0.0,0.0,0.0,0.01,0.0,0.0
6,"Rouge, Malvern",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Stn A PO Boxes 25 The Esplanade,0.010101,0.010101,0.020202,0.0,0.0,0.010101,0.010101,0.030303,0.0,...,0.0,0.0,0.0,0.010101,0.0,0.0,0.0,0.0,0.0,0.0
8,Willowdale South,0.0,0.0,0.0,0.029412,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.029412,0.0,0.0,0.0


In [91]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [92]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Agincourt North, L'Amoreaux East, Milliken, St...",Playground,Park,Clothing Store,Cocktail Bar,Coffee Shop,Comfort Food Restaurant,Concert Hall,Cosmetics Shop,Creperie,Deli / Bodega
1,"Albion Gardens, Beaumond Heights, Humbergate, ...",Pharmacy,Sandwich Place,Grocery Store,Video Store,Pizza Place,Beer Store,Fried Chicken Joint,Fast Food Restaurant,Department Store,Deli / Bodega
2,"Commerce Court, Victoria Hotel",Coffee Shop,Café,Hotel,American Restaurant,Restaurant,Italian Restaurant,Gym,Gastropub,Seafood Restaurant,Steakhouse
3,"Design Exchange, Toronto Dominion Centre",Coffee Shop,Café,Hotel,American Restaurant,Restaurant,Gastropub,Steakhouse,Seafood Restaurant,Italian Restaurant,Bar
4,"Fairview, Henry Farm, Oriole",Clothing Store,Coffee Shop,Fast Food Restaurant,Women's Store,Japanese Restaurant,Tea Room,Bakery,Bank,Dessert Shop,Liquor Store


In [103]:
# set number of clusters
kclusters = 4

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 3, 0, 0, 0, 0, 1, 0, 0], dtype=int32)

In [104]:
# add clustering labels
#neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = high_low

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,1,Fast Food Restaurant,Women's Store,Electronics Store,Cocktail Bar,Coffee Shop,Comfort Food Restaurant,Concert Hall,Cosmetics Shop,Creperie,Deli / Bodega
14,M1V,Scarborough,"Agincourt North, L'Amoreaux East, Milliken, St...",43.815252,-79.284577,2,Playground,Park,Clothing Store,Cocktail Bar,Coffee Shop,Comfort Food Restaurant,Concert Hall,Cosmetics Shop,Creperie,Deli / Bodega
18,M2J,North York,"Fairview, Henry Farm, Oriole",43.778517,-79.346556,0,Clothing Store,Coffee Shop,Fast Food Restaurant,Women's Store,Japanese Restaurant,Tea Room,Bakery,Bank,Dessert Shop,Liquor Store
22,M2N,North York,Willowdale South,43.77012,-79.408493,0,Ramen Restaurant,Coffee Shop,Sandwich Place,Café,Pizza Place,Sushi Restaurant,Ice Cream Shop,Restaurant,Juice Bar,Bubble Tea Shop
60,M5K,Downtown Toronto,"Design Exchange, Toronto Dominion Centre",43.647177,-79.381576,0,Coffee Shop,Café,Hotel,American Restaurant,Restaurant,Gastropub,Steakhouse,Seafood Restaurant,Italian Restaurant,Bar
61,M5L,Downtown Toronto,"Commerce Court, Victoria Hotel",43.648198,-79.379817,0,Coffee Shop,Café,Hotel,American Restaurant,Restaurant,Italian Restaurant,Gym,Gastropub,Seafood Restaurant,Steakhouse
69,M5W,Downtown Toronto,Stn A PO Boxes 25 The Esplanade,43.646435,-79.374846,0,Coffee Shop,Café,Restaurant,Seafood Restaurant,Hotel,Italian Restaurant,Beer Bar,Bakery,Japanese Restaurant,Creperie
70,M5X,Downtown Toronto,"First Canadian Place, Underground city",43.648429,-79.38228,0,Coffee Shop,Café,Hotel,Restaurant,Steakhouse,Seafood Restaurant,Gastropub,Japanese Restaurant,Deli / Bodega,American Restaurant
100,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437,3,Pharmacy,Sandwich Place,Grocery Store,Video Store,Pizza Place,Beer Store,Fried Chicken Joint,Fast Food Restaurant,Department Store,Deli / Bodega


In [105]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

The more populated areas are clustered as 0, while the less populated areas are seperated in their own clusters.