# Part 1:Web Scrape from Wikipedia

In [153]:
from bs4 import BeautifulSoup
from lxml import html
import requests
import pandas as pd  
import numpy as np
import json
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
#!pip install folium
import folium


In [2]:
#retrieving html from link and converting to soup object
site = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
content = site.content
soup = BeautifulSoup(content)

In [3]:
#only storing the indiviual table entries into a new list. Excluding table entries including the word "Not" to remove "Not Assigned" neighborhoods/
lines = [i for table in soup.find_all('table')[:1] for line in table for i in line if "Not" not in str(i)]     

In [4]:
#Extracting data and storing it in dictionary for easy conversion to dataframe
data = {'Postal_Code':[], 'Borough':[],'Neighborhood':[]}
for i in range(len(lines)):
    zone_list = str(lines[i]).split('<td>')
    if len(zone_list) > 1:
        data['Postal_Code'].append(str(lines[i]).split('<td>')[1].rstrip('\n</td>\n'))
        data['Borough'].append(str(lines[i]).split('<td>')[2].rstrip('\n</td>\n'))
        data['Neighborhood'].append(str(lines[i]).split('<td>')[3].rstrip('\n</td></tr>'))

In [156]:
df = pd.DataFrame(data)
df.rename(columns = {'Postal_Code':'Postal Code'}, inplace = True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfron"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Governmen"


In [157]:
df.shape

(103, 3)

# Part 2: Adding latitude and longitude to dataframe from 'Geospatial_Coordinates.csv' file

### Importing csv using IBM functions 

In [8]:
# The code was removed by Watson Studio for sharing.

In [9]:
#joining both the web scraped dataframe and the latitude and longitude dataframe
final = pd.merge(df,lat_long,on = 'Postal Code')

In [10]:
final.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfron",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Governmen",43.662301,-79.389494


In [11]:
final.shape

(103, 5)

# Part 3: Map Visualization of Data

### Let's take a look at only the neighborhoods in Toronto 

In [12]:
toronto = final[final['Borough'] == 'Downtown Toronto']
toronto

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfron",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Governmen",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Stree,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
36,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752
42,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576


### Getting Coordinates

In [16]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


### Map of Toronto with Neighborhoods superimposed on top

In [17]:
toronto_map =  folium.Map(location=[latitude,longitude], zoom_startss = 10)

for lat, long, borough, neighborhood in zip(toronto['Latitude'], toronto['Longitude'], toronto['Borough'], toronto['Neighborhood']):
    label_string = f'{borough},{neighborhood}'
    label = folium.Popup(label_string, parse_html=True)
    folium.CircleMarker(
        [lat,long],
        popup = label,
        color = 'blue',
        fill=True,
        fill_color = '#3186cc',
        fill_opacity = .7,
        parse_html = False
    
    ).add_to(toronto_map)


In [18]:
toronto_map

# Part 4: FourSquare Venue Data
##### Making API calls, and creating onehot-encoded dataframe for later clustering 

In [19]:
CLIENT_ID = '5CKIKDUK5VHQNELLTKUFNNOHKVISYJ2MZOS2JMYCVDUTH321' # your Foursquare ID
CLIENT_SECRET = 'XUOXALH4INIFZ000T42YVRIUPAPWBGFGOAWV4S2IJMOUM3WY' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 5CKIKDUK5VHQNELLTKUFNNOHKVISYJ2MZOS2JMYCVDUTH321
CLIENT_SECRET:XUOXALH4INIFZ000T42YVRIUPAPWBGFGOAWV4S2IJMOUM3WY


In [83]:
def get_category(response, i):
    return response['groups'][0]['items'][i]['venue']['categories'][0]['name']

def get_name(response, i):
    return response['groups'][0]['items'][i]['venue']['name']
    
def get_coordinates(response, i):
    return (response['groups'][0]['items'][i]['venue']['location']['lat'],response['groups'][0]['items'][i]['venue']['location']['lng'])
            
            

            


### Getting venue info from API

In [84]:

radius = 500
LIMIT = 100
venue_list = []

#loops through each neighborhood
for i in range(toronto.shape[0]):
    neighborhood = toronto.iloc[i,-3]
    neighborhood_latitude = toronto.iloc[i,-2]
    neighborhood_longitude = toronto.iloc[i,-1]
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
    
    #response with venues for each neighborhood
    response = requests.get(url).json()['response']
    
    #loops through each venue in response, and saves venue info to dictionary
    for venue in range(len(response['groups'][0]['items'])):
        venue_dict = {}
        venue_dict['neighborhood'] = neighborhood
        venue_dict['category'] = get_category(response, venue)
        venue_dict['name'] = get_name(response, venue)
        venue_dict['lat'] = get_coordinates(response, venue)[0]
        venue_dict['long'] = get_coordinates(response, venue)[1]
        venue_list.append(venue_dict)



      

In [25]:
#venue dictionary is turned into dataframe
venues_df = pd.DataFrame(venue_list)
venues_df.head(10)

Unnamed: 0,category,lat,long,name,neighborhood
0,Bakery,43.653447,-79.362017,Roselle Desserts,"Regent Park, Harbourfron"
1,Coffee Shop,43.653559,-79.361809,Tandem Coffee,"Regent Park, Harbourfron"
2,Breakfast Spot,43.653947,-79.361149,Morning Glory Cafe,"Regent Park, Harbourfron"
3,Distribution Center,43.653249,-79.358008,Cooper Koo Family YMCA,"Regent Park, Harbourfron"
4,Spa,43.654735,-79.359874,Body Blitz Spa East,"Regent Park, Harbourfron"
5,Restaurant,43.656369,-79.35698,Impact Kitchen,"Regent Park, Harbourfron"
6,Park,43.655618,-79.356211,Corktown Common,"Regent Park, Harbourfron"
7,Gym / Fitness Center,43.653313,-79.359725,The Extension Room,"Regent Park, Harbourfron"
8,Historic Site,43.650244,-79.359323,The Distillery Historic District,"Regent Park, Harbourfron"
9,Breakfast Spot,43.655675,-79.364503,Figs Breakfast & Lunch,"Regent Park, Harbourfron"


### One-hot encoding

In [90]:
#one-hot encoding
toronto_onehot = pd.get_dummies(venues_df, columns = ['category'] ).drop(columns = ['lat','long','name'])
toronto_cat = toronto_onehot.groupby('neighborhood').mean()
toronto_cat

Unnamed: 0_level_0,category_Accessories Store,category_Afghan Restaurant,category_Airport,category_Airport Food Court,category_Airport Gate,category_Airport Lounge,category_Airport Service,category_Airport Terminal,category_American Restaurant,category_Antique Shop,...,category_Theater,category_Theme Restaurant,category_Trail,category_Train Station,category_Vegetarian / Vegan Restaurant,category_Video Game Store,category_Vietnamese Restaurant,category_Wine Bar,category_Women's Store,category_Yoga Studio
neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.017544,0.0,0.0,0.0,0.0,0.0
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airpo",0.0,0.0,0.0625,0.0625,0.0625,0.125,0.1875,0.0625,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Central Bay Stree,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.014706,0.0,0.0,0.0,0.0,0.014706
Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Church and Wellesley,0.0,0.013699,0.0,0.0,0.0,0.0,0.0,0.0,0.013699,0.0,...,0.013699,0.013699,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027397
"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0
"First Canadian Place, Underground city",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,...,0.01,0.0,0.0,0.01,0.01,0.0,0.0,0.01,0.0,0.0
"Garden District, Ryerson",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.02,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0
"Harbourfront East, Union Station, Toronto Islands",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.0,0.0,0.01,0.01,0.0,0.0,0.01,0.0,0.0
"Kensington Market, Chinatown, Grange Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.035088,0.0,0.052632,0.017544,0.0,0.0


### Getting top 10 venues for each neighborhood

In [159]:
#getting top 10 venues for each neighborhood
top_10 = pd.DataFrame()
for neigh in toronto_cat.index:
    top_10 = top_10.append(pd.DataFrame([list(toronto_cat.loc[neigh,:].nlargest(10).index)]))
top_10['neighborhood'] = toronto_cat.index
top_10.rename(columns = {
                        0:'1st Most Popular Venue',
                        1:'2nd Most Popular Venue',
                        2:'3rd Most Popular Venue',
                        3:'4th Most Popular Venue',
                        4:'5th Most Popular Venue',
                        5:'6th Most Popular Venue',
                        6:'7th Most Popular Venue',
                        7:'8th Most Popular Venue',
                        8:'9th Most Popular Venue',
                        9:'10th Most Popular Venue'},
                        inplace = True)

#reorganizing order or column names 
top_10 = top_10[['neighborhood','1st Most Popular Venue', '2nd Most Popular Venue',
       '3rd Most Popular Venue', '4th Most Popular Venue',
       '5th Most Popular Venue', '6th Most Popular Venue',
       '7th Most Popular Venue', '8th Most Popular Venue',
       '9th Most Popular Venue', '10th Most Popular Venue']]
top_10.reset_index(drop = True, inplace = True)
top_10.head()

Unnamed: 0,neighborhood,1st Most Popular Venue,2nd Most Popular Venue,3rd Most Popular Venue,4th Most Popular Venue,5th Most Popular Venue,6th Most Popular Venue,7th Most Popular Venue,8th Most Popular Venue,9th Most Popular Venue,10th Most Popular Venue
0,Berczy Park,category_Coffee Shop,category_Cocktail Bar,category_Bakery,category_Beer Bar,category_Café,category_Cheese Shop,category_Restaurant,category_Seafood Restaurant,category_Art Gallery,category_BBQ Joint
1,"CN Tower, King and Spadina, Railway Lands, Har...",category_Airport Service,category_Airport Lounge,category_Airport,category_Airport Food Court,category_Airport Gate,category_Airport Terminal,category_Bar,category_Boat or Ferry,category_Boutique,category_Coffee Shop
2,Central Bay Stree,category_Coffee Shop,category_Café,category_Italian Restaurant,category_Dessert Shop,category_Sandwich Place,category_Bar,category_Bubble Tea Shop,category_Burger Joint,category_Ice Cream Shop,category_Japanese Restaurant
3,Christie,category_Grocery Store,category_Café,category_Park,category_Athletics & Sports,category_Baby Store,category_Candy Store,category_Coffee Shop,category_Diner,category_Italian Restaurant,category_Nightclub
4,Church and Wellesley,category_Coffee Shop,category_Japanese Restaurant,category_Sushi Restaurant,category_Restaurant,category_Gastropub,category_Gay Bar,category_Hotel,category_Mediterranean Restaurant,category_Men's Store,category_Pub


# Part 5: Clustering 

#### DataFrame for Clustering

In [139]:
toronto_clustering = toronto_cat.reset_index().drop('neighborhood', axis = 1)

#### Preparing model

In [147]:
clusters = 4
kmod = KMeans(n_clusters = clusters, random_state = 0).fit(toronto_clustering)
kmod.labels_

array([0, 3, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
      dtype=int32)

#### Merging DataFrames to visualize information, and also adding cluster labels

In [148]:
toronto_merged = pd.merge(toronto,top_5,left_on='Neighborhood', right_on = 'neighborhood').sort_values('Neighborhood')
toronto_merged['Cluster'] = kmod.labels_
toronto_merged = toronto_merged[['Postal Code', 'Borough', 'Neighborhood','Cluster', 'Latitude', 'Longitude',
       'neighborhood', '1st Most Popular Venue', '2nd Most Popular Venue',
       '3rd Most Popular Venue', '4th Most Popular Venue',
       '5th Most Popular Venue', '6th Most Popular Venue',
       '7th Most Popular Venue', '8th Most Popular Venue',
       '9th Most Popular Venue', '10th Most Popular Venue']]

In [149]:
toronto_merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Cluster,Latitude,Longitude,neighborhood,1st Most Popular Venue,2nd Most Popular Venue,3rd Most Popular Venue,4th Most Popular Venue,5th Most Popular Venue,6th Most Popular Venue,7th Most Popular Venue,8th Most Popular Venue,9th Most Popular Venue,10th Most Popular Venue
4,M5E,Downtown Toronto,Berczy Park,0,43.644771,-79.373306,Berczy Park,category_Coffee Shop,category_Cocktail Bar,category_Bakery,category_Beer Bar,category_Café,category_Cheese Shop,category_Restaurant,category_Seafood Restaurant,category_Art Gallery,category_BBQ Joint
13,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",3,43.628947,-79.39442,"CN Tower, King and Spadina, Railway Lands, Har...",category_Airport Service,category_Airport Lounge,category_Airport,category_Airport Food Court,category_Airport Gate,category_Airport Terminal,category_Bar,category_Boat or Ferry,category_Boutique,category_Coffee Shop
5,M5G,Downtown Toronto,Central Bay Stree,0,43.657952,-79.387383,Central Bay Stree,category_Coffee Shop,category_Café,category_Italian Restaurant,category_Dessert Shop,category_Sandwich Place,category_Bar,category_Bubble Tea Shop,category_Burger Joint,category_Ice Cream Shop,category_Japanese Restaurant
6,M6G,Downtown Toronto,Christie,2,43.669542,-79.422564,Christie,category_Grocery Store,category_Café,category_Park,category_Athletics & Sports,category_Baby Store,category_Candy Store,category_Coffee Shop,category_Diner,category_Italian Restaurant,category_Nightclub
18,M4Y,Downtown Toronto,Church and Wellesley,0,43.66586,-79.38316,Church and Wellesley,category_Coffee Shop,category_Japanese Restaurant,category_Sushi Restaurant,category_Restaurant,category_Gastropub,category_Gay Bar,category_Hotel,category_Mediterranean Restaurant,category_Men's Store,category_Pub


#### Create Cluster Map

In [155]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(clusters)
ys = [i + x + (i*x)**2 for i in range(clusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters