# Coursera 
## Applied data science capstone
### Week 3 final assignment

In [1]:
pip install wikipedia

Note: you may need to restart the kernel to use updated packages.


In [5]:
# Import packages
import wikipedia
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# Extract the html code of the page
html_code = wikipedia.page('List of postal codes of Canada: M').html()

# Create a beautifulSoup object where the table is
soup_object = BeautifulSoup(html_code,'lxml').find('table',{'class':'wikitable sortable'})

# Initialize an empty list to be populated
Table = []

# Scrape the html code to populate the list
for string in soup_object.stripped_strings: Table.append(string)

# Rearrange the list into an array and finally into a dataframe
Table = pd.DataFrame(np.array(Table).reshape((-1,3)))

# Rename Table Columns names
Table.columns = Table.iloc[0]

# Drop spurious first line
Table.drop(0, inplace=True)

# Ignore cells with a borough that is Not assigned.
Table = Table[Table.Borough!='Not assigned']

# Combine Neighbourhoods having the same Postal Code
if Table['Postal Code'].is_unique : print('Postal Codes are unique')

# Check not assigned neighbourhoods
if not(any(Table.Neighbourhood == 'Not Assigned')) : print('All neighbourhoods assigned')  

# Reset indexes
Table.reset_index(drop=True, inplace=True)

# Print the shape
print('The scraped table has',Table.shape[0],'rows.')

# Print the table
Table

Postal Codes are unique
All neighbourhoods assigned
The scraped table has 103 rows.


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."



# Part two
## Getting the geographical coordinates

In [182]:
# Import packages to get the csv
import requests
import io

# Request to the web address
conn = requests.get("https://cocl.us/Geospatial_data").content
# Read the CSV file from the web
coordinates = pd.read_csv(io.StringIO(conn.decode('utf-8')))
# Merge informations
Table = Table.merge(coordinates, left_on='Postal Code', right_on='Postal Code')
# Print the table
Table.head(6)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude_x,Longitude_x,Latitude_y,Longitude_y
0,M3A,North York,Parkwoods,43.753259,-79.329656,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242,43.667856,-79.532242


# Part 3
## Clustering Toronto

In [11]:
# Import packages
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

In [63]:
# create map of New York using latitude and longitude values
TorontoMap = folium.Map(location=[43.65, -79.45], zoom_start=10.5)
# add markers to map
colors=pd.DataFrame({'B': Table['Borough'].unique(), 'C': np.random.randint(low=0, high=0xFFFFFF, size= Table['Borough'].unique().size)})
colors.set_index('B', inplace=True)
for idx in Table.index :
    label = '{}, {}'.format(Table.iloc[idx,2], Table.iloc[idx,1])
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [Table.iloc[idx,3], Table.iloc[idx,4]],
        radius=5,
        popup=label,
        color = "#%06x" %colors.loc[Table.iloc[idx,1]][0],
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(TorontoMap)  

TorontoMap


## Define basic parameters and function for Forsquare API

In [65]:
# Set Foursquare API
CLIENT_ID = '02QBHIMZSD0O05JYQX4MBB0KOZR1KH3V2TQ34PV0CH1FKBEU' 
CLIENT_SECRET = 'NNGC5E03TOWAXLIWW1ERT03PH5T3YFFN1124KLOM31F5AE44'
VERSION = '20180605' 
LIMIT = 200 
RADIUS = 500

# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']


## Define a new dataframe containing all the Toronto venues' information

In [66]:
# Install packages
from pandas.io.json import json_normalize 

# Define an empty dataframe
TorontoVenues = pd.DataFrame(index=[],columns=['name','categories','lat','lng','Neighbourhood'])

# Itreate over postal codes
for idx in Table.index:
    # create URL
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID, 
        CLIENT_SECRET, 
        VERSION, 
        Table.iloc[idx,3], 
        Table.iloc[idx,4],  
        RADIUS, 
        LIMIT)
    # get results
    results = requests.get(url).json()
    # Extract venues information
    venues = results['response']['groups'][0]['items']
    # flatten JSON format
    venues = json_normalize(venues) 
    # Check existence condition
    if venues.empty: continue
    # filter columns 
    venues = venues.loc[:, ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']]
    # filter the category for each row
    venues['venue.categories'] = venues.apply(get_category_type, axis=1)
    # clean columns
    venues.columns = [col.split(".")[-1] for col in venues.columns]
    # attach Postal Code informations
    venues['Neighbourhood'] = Table.iloc[idx,2]
    # append to the overall Toronto dataframe
    TorontoVenues = TorontoVenues.append(venues)

# Reset index
TorontoVenues.reset_index(drop=True, inplace=True)
# Print Table
TorontoVenues.head()

  venues = json_normalize(venues)


Unnamed: 0,name,categories,lat,lng,Neighbourhood
0,Brookbanks Park,Park,43.751976,-79.33214,Parkwoods
1,Variety Store,Food & Drink Shop,43.751974,-79.333114,Parkwoods
2,Victoria Village Arena,Hockey Arena,43.723481,-79.315635,Victoria Village
3,Tim Hortons,Coffee Shop,43.725517,-79.313103,Victoria Village
4,Portugril,Portuguese Restaurant,43.725819,-79.312785,Victoria Village


## Explore Venues in Toronto

In [67]:
# Print the database size
print('We listed', TorontoVenues.shape[0],'venues around the city' )
# Print the number of unique categories
print('There are {} uniques categories.'.format(len(TorontoVenues['categories'].unique())))
# Print the number of Neighbourhood
print('There are {} uniques Neighbourhoods.'.format(len(TorontoVenues['Neighbourhood'].unique())))
# Print the top 5 Neighbourhoods per venues
pd.DataFrame(TorontoVenues.groupby('Neighbourhood').count()['name']).sort_values('name',ascending=False).head(5)

We listed 2125 venues around the city
There are 264 uniques categories.
There are 95 uniques Neighbourhoods.


Unnamed: 0_level_0,name
Neighbourhood,Unnamed: 1_level_1
"Toronto Dominion Centre, Design Exchange",100
"Harbourfront East, Union Station, Toronto Islands",100
"Garden District, Ryerson",100
"First Canadian Place, Underground city",100
"Commerce Court, Victoria Hotel",100


In [425]:
# Print the top 5 diffuse venues in Toronto
pd.DataFrame(TorontoVenues.groupby('categories').count()['name']).sort_values('name',ascending=False).head(5)

Unnamed: 0_level_0,name
categories,Unnamed: 1_level_1
Coffee Shop,183
Café,96
Restaurant,65
Park,51
Pizza Place,48


## Cluster neighbourhoods

In [193]:
# Initializa a full dataframe for neighbourhoods venues
NBdb = pd.DataFrame()
# Cicle over neighbourhoods and count venues per categories
for NB in TorontoVenues['Neighbourhood'].unique():
    df = pd.DataFrame(TorontoVenues.loc[TorontoVenues['Neighbourhood']==NB].groupby('categories').count()['name'])
    df = df.rename(columns={'name': NB})
    NBdb = NBdb.append(df.T)
# Replace NaNs with 0 and convert to integers
NBdb = NBdb.fillna(0).astype(int)
# Print the Dataframe head
NBdb.head(5)

Unnamed: 0,Food & Drink Shop,Park,Coffee Shop,French Restaurant,Hockey Arena,Portuguese Restaurant,Antique Shop,Art Gallery,Bakery,Bank,...,Health & Beauty Service,Martial Arts School,Sake Bar,Strip Club,Theme Restaurant,Auto Workshop,Skate Park,Hardware Store,Social Club,Wings Joint
Parkwoods,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Victoria Village,0,0,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Regent Park, Harbourfront",0,3,7,1,0,0,1,1,3,1,...,0,0,0,0,0,0,0,0,0,0
"Lawrence Manor, Lawrence Heights",0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Queen's Park, Ontario Provincial Government",0,1,6,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


### Perform k-means clustering

In [194]:
# set number of clusters
kclusters = 3
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(NBdb)
# add clustering labels
NBdb.insert(0, 'Cluster Labels', kmeans.labels_)
# Print the Dataframe head
NBdb.head(5)

Unnamed: 0,Cluster Labels,Food & Drink Shop,Park,Coffee Shop,French Restaurant,Hockey Arena,Portuguese Restaurant,Antique Shop,Art Gallery,Bakery,...,Health & Beauty Service,Martial Arts School,Sake Bar,Strip Club,Theme Restaurant,Auto Workshop,Skate Park,Hardware Store,Social Club,Wings Joint
Parkwoods,2,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Victoria Village,2,0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Regent Park, Harbourfront",1,0,3,7,1,0,0,1,1,3,...,0,0,0,0,0,0,0,0,0,0
"Lawrence Manor, Lawrence Heights",2,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Queen's Park, Ontario Provincial Government",1,0,1,6,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [195]:
# create map
TorontoClusterMap = folium.Map(location=[43.65, -79.45], zoom_start=10.5)

# add markers to map
colors=pd.DataFrame({'L': np.arange(kclusters), 'C': np.random.randint(low=0, high=0xFFFFFF, size=kclusters)})
colors.set_index('L', inplace=True)
for idx in Table.index :
    if Table.iloc[idx,2] in NBdb.index :
        label = '{}, {}'.format(Table.iloc[idx,2], Table.iloc[idx,1])
        label = folium.Popup(label, parse_html=True)
        folium.CircleMarker(
            [Table.iloc[idx,3], Table.iloc[idx,4]],
            radius=5,
            popup=label,
            color = "#%06x" %colors.loc[NBdb.loc[Table.iloc[idx,2]][0]][0],
            fill=True,
            fill_color="#%06x" %colors.loc[NBdb.loc[Table.iloc[idx,2]][0]][0],
            fill_opacity=0.7,
            parse_html=False).add_to(TorontoClusterMap)  
        
TorontoClusterMap

### Explore what differenziate clusters 

In [240]:
TotFeat=pd.DataFrame()

for L in np.arange(kclusters):
    
    df = pd.DataFrame(NBdb.loc[NBdb['Cluster Labels'] == L].sum())
    df = df.drop(['Cluster Labels'])
    df = df.sort_values(by=[0],ascending=False).head(5).T
    df['Cluster Labels'] = L
    TotFeat = TotFeat.append(df)
    
TotFeat.set_index('Cluster Labels', inplace=True)   
TotFeat

Unnamed: 0_level_0,Coffee Shop,Café,Restaurant,Hotel,Italian Restaurant,Japanese Restaurant,Park,Pizza Place,Grocery Store,Sandwich Place
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,94,43.0,29.0,28.0,22.0,,,,,
1,66,42.0,26.0,,21.0,19.0,,,,
2,23,,,,,,27.0,21.0,17.0,15.0
