# IBM Applied Data Science Capstone
###### This notebook is used for the final capstone project of the IBM Applied Data Science Certificate program on Coursera

### Part 1 - Getting Started (Week 1)


In [1]:
import pandas as pd
import numpy as np

In [2]:
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


### Part 2 - Scraping Location Data and Clustering (Week 3)


##### Import libraries

In [3]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

##### Request html and process it into a Pandas dataframe

In [4]:
# Request html and read it as a BeautifulSoup object
with urlopen("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M") as page:  
    
    mybytes = page.read()

    html = mybytes.decode("utf8")
    
soup = BeautifulSoup(html, 'lxml')

# Find the table 
table = soup.find_all('table')[0]

# Instantiate three empty lists to hold values
postcode = []
borough = []
neighborhood = []

# Find every row in the table except for the first which holds the column names
trs = table.find_all('tr')[1:]

# Append each table element to its respective list
for tr in trs:
    tds = tr.find_all('td')
    postcode.append(tds[0].get_text())
    borough.append(tds[1].get_text())
    neighborhood.append(tds[2].get_text())
    
# Use the lists to create a dataframe
df_dict = {'PostalCode': postcode, 'Borough': borough, 'Neighborhood': neighborhood}
df = pd.DataFrame(df_dict)
df = df[['PostalCode', 'Borough', 'Neighborhood']]

display(df.describe())

display(df.head())

Unnamed: 0,PostalCode,Borough,Neighborhood
count,288,288,288
unique,180,12,209
top,M8Y,Not assigned,Not assigned\n
freq,8,77,78


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


##### Data Cleaning

In [5]:
# Delete rows where a Postal Code is not assigned (Borough == 'Not assigned')
df = df[df['Borough'] != 'Not assigned']

# Delete '\n' at the end of every Neighborhood
df['Neighborhood'] = df['Neighborhood'].apply(lambda x: x[:-1])

# Check which Neighborhood has 'Not assigned'
display(df[df['Neighborhood'] == 'Not assigned'])

# Assign Neighborhood = Borough where Neighborhood == 'Not assigned'
df['Neighborhood'].where(df['Neighborhood'] != 'Not assigned', df['Borough'], inplace = True)

# Check if Neighborhood is filled correctly
display(df[df['Neighborhood'] == 'Not assigned'])
print('Not assigned' in df['Neighborhood'].where(df['Neighborhood'] != 'Not assigned', df['Borough']).tolist())

# Join Neighborhoods together where Postal Code is the same
df = df.groupby(['PostalCode', 'Borough']).agg(lambda x: ', '.join(x)).reset_index()
display(df.head())

Unnamed: 0,PostalCode,Borough,Neighborhood
8,M7A,Queen's Park,Not assigned


Unnamed: 0,PostalCode,Borough,Neighborhood


False


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [6]:
print('The dataframe has shape', df.shape)

The dataframe has shape (103, 3)


### Getting the Longitude and Latitude of the Neighborhoods

##### Import library

In [7]:
!conda install -c conda-forge geocoder --yes
import geocoder
print('Successfully installed and imported')

Solving environment: done

# All requested packages already installed.

Successfully installed and imported


##### Getting longtitude and latitude

In [8]:
# Have a list of the PostalCodes
postal_codes = df['PostalCode'].tolist()

# Instantiate two empty lists to hold longitute and latitude
longitude = []
latitude = []

# Loop through the postal_codes
for postal_code in postal_codes:

    # initialize the variable to None
    lat_lng_coords = None

    # loop until getting the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
    
    # Append the data to the lists
    latitude.append(lat_lng_coords[0])
    longitude.append(lat_lng_coords[1])

##### Appending longitude and latitude to the dataframe


In [9]:
df['Latitude'] = latitude
df['Longitude'] = longitude
display(df)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.811525,-79.195517
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.785730,-79.158750
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.765690,-79.175256
3,M1G,Scarborough,Woburn,43.768359,-79.217590
4,M1H,Scarborough,Cedarbrae,43.769688,-79.239440
5,M1J,Scarborough,Scarborough Village,43.743125,-79.231750
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.726245,-79.263670
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.713133,-79.285055
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.723575,-79.234976
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.696665,-79.260163


### Clustering the Neighborhoods

##### Import libraries

In [10]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium
import requests
print('Package installed and imported successfully')

Solving environment: done

# All requested packages already installed.

Package installed and imported successfully


In [11]:
# The code was removed by Watson Studio for sharing.

##### Getting the Toronto dataframe

In [12]:
toronto = df[df['Borough'].str.contains('Toronto')]
display(toronto)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676845,-79.295225
41,M4K,East Toronto,"The Danforth West, Riverdale",43.683262,-79.35512
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.667965,-79.314673
43,M4M,East Toronto,Studio District,43.662766,-79.33483
44,M4N,Central Toronto,Lawrence Park,43.72816,-79.387085
45,M4P,Central Toronto,Davisville North,43.712815,-79.388526
46,M4R,Central Toronto,North Toronto West,43.714523,-79.40696
47,M4S,Central Toronto,Davisville,43.703395,-79.385964
48,M4T,Central Toronto,"Moore Park, Summerhill East",43.690655,-79.383561
49,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686083,-79.402335


##### Getting the venues near each neighborhood

In [13]:
# function to get venues near a neighborhood
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=100):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

toronto_venues = getNearbyVenues(names=toronto['Neighborhood'],
                                   latitudes=toronto['Latitude'],
                                   longitudes=toronto['Longitude']
                                  )



The Beaches
The Danforth West, Riverdale
The Beaches West, India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park, Summerhill East
Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront, Regent Park
Ryerson, Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide, King, Richmond
Harbourfront East, Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North, Forest Hill West
The Annex, North Midtown, Yorkville
Harbord, University of Toronto
Chinatown, Grange Park, Kensington Market
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place, Underground city
Christie
Dovercourt Village, Dufferin
Little Portugal, Trinity
Brockton, Exhibition Place, Parkdale Village
High Park, The 

In [14]:
# Checking number of venues for each neighborhood
display(toronto_venues.groupby('Neighborhood').count())
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Berczy Park,63,63,63,63,63,63
"Brockton, Exhibition Place, Parkdale Village",67,67,67,67,67,67
Business Reply Mail Processing Centre 969 Eastern,100,100,100,100,100,100
"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",68,68,68,68,68,68
"Cabbagetown, St. James Town",43,43,43,43,43,43
Central Bay Street,100,100,100,100,100,100
"Chinatown, Grange Park, Kensington Market",94,94,94,94,94,94
Christie,9,9,9,9,9,9
Church and Wellesley,82,82,82,82,82,82


There are 212 uniques categories.


##### Getting the frequency of each type venue in each neighborhood

In [15]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = toronto_onehot.columns.tolist()
fixed_columns.remove('Neighborhood')
fixed_columns.insert(0, 'Neighborhood')
toronto_onehot = toronto_onehot[fixed_columns]

# group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

display(toronto_grouped.head())
toronto_grouped.shape

Unnamed: 0,Neighborhood,Adult Boutique,Afghan Restaurant,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.03,0.0,0.01,0.0,0.0,0.03,0.0,...,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.015873,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.015873,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.014925,0.0,0.014925,0.0,0.0,...,0.0,0.0,0.0,0.029851,0.0,0.014925,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.03,0.0,0.01,0.0,0.0,0.02,0.0,...,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014706,0.0,...,0.0,0.0,0.014706,0.014706,0.0,0.0,0.0,0.0,0.0,0.014706


(37, 212)

##### Create dataframe with the ten most common venue in each neighborhood

In [16]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Hotel,Coffee Shop,Café,Japanese Restaurant,Burger Joint,Restaurant,Breakfast Spot,Steakhouse,Deli / Bodega,Bakery
1,Berczy Park,Coffee Shop,Cocktail Bar,Restaurant,Italian Restaurant,Steakhouse,Café,Beer Bar,Seafood Restaurant,Cheese Shop,Bakery
2,"Brockton, Exhibition Place, Parkdale Village",Coffee Shop,Café,Furniture / Home Store,Bar,Sandwich Place,Restaurant,Hotel,Supermarket,Gym,Beer Bar
3,Business Reply Mail Processing Centre 969 Eastern,Coffee Shop,Hotel,Bar,Steakhouse,Café,American Restaurant,Italian Restaurant,Pub,Pizza Place,Japanese Restaurant
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Coffee Shop,Italian Restaurant,Bar,Restaurant,Gym / Fitness Center,Café,Sandwich Place,Speakeasy,Park,Pub


##### Clustering

In [17]:
from sklearn.cluster import KMeans

# set number of cluster
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3], dtype=int32)

In [18]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
37,M4E,East Toronto,The Beaches,43.676845,-79.295225,0.0,Health Food Store,Pub,Trail,Other Great Outdoors,Yoga Studio,Ethiopian Restaurant,Food & Drink Shop,Food,Flower Shop,Fish Market
41,M4K,East Toronto,"The Danforth West, Riverdale",43.683262,-79.35512,0.0,Bus Line,Discount Store,Park,Grocery Store,Fast Food Restaurant,Yoga Studio,Falafel Restaurant,Food Court,Food & Drink Shop,Food
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.667965,-79.314673,0.0,Park,Burger Joint,Ice Cream Shop,Movie Theater,Sushi Restaurant,Pub,Italian Restaurant,Steakhouse,Liquor Store,Sandwich Place
43,M4M,East Toronto,Studio District,43.662766,-79.33483,3.0,Bakery,Diner,Italian Restaurant,Sushi Restaurant,Café,Brewery,Pizza Place,Gastropub,Bar,Coffee Shop
44,M4N,Central Toronto,Lawrence Park,43.72816,-79.387085,2.0,Bus Line,Swim School,Yoga Studio,Falafel Restaurant,Food Court,Food & Drink Shop,Food,Flower Shop,Fish Market,Fish & Chips Shop


In [19]:
# Set the neighborhood with cluster label of nan to 5
toronto_merged['Cluster Labels'].fillna(5, inplace=True)
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].astype('int')

##### Generating the map

In [21]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
display(map_clusters)