# <center>Segmenting and Clustering </center>
## <center>Neighborhoods in Toronto</center>

## Code to scrape Wikipedia page

In [402]:
# import required libraries
import requests
import lxml.html as lh
import pandas as pd

In [403]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = requests.get(url)
page # request is successful if responce is 200

<Response [200]>

Store the contents of the website under doc and parse data that are stored between tr  of HTML

In [404]:
doc = lh.fromstring(page.content)
tr_elements = doc.xpath('//tr')

#Check the length of the first 12 rows. All rows should have 3 columns
[len(T) for T in tr_elements[:12]]

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

## Parse Table Header
Next, let’s parse the first row as our header.

In [405]:
tr_elements = doc.xpath('//tr')
#Create empty list
col=[]
i=0
#For each row, store each first element (header) and an empty list
for t in tr_elements[0]:
    i+=1
    name=t.text_content()
    print('%d:"%s"'%(i,name))
    col.append((name,[]))

1:"Postcode"
2:"Borough"
3:"Neighbourhood
"


## Creating Pandas DataFrame
Each header is appended to a tuple along with an empty list.

In [406]:
#Since out first row is the header, data is stored on the second row onwards
for j in range(1,len(tr_elements)):
    #T is our j'th row
    T=tr_elements[j]
    
    #If row is not of size 10, the //tr data is not from our table 
    if len(T)!=3:
        break
    
    #i is the index of our column
    i=0
    
    #Iterate through each element of the row
    for t in T.iterchildren():
        data=t.text_content() 
        #Check if row is empty
        if i>0:
        #Convert any numerical value to integers
            try:
                data=int(data)
            except:
                pass
        #Append the data to the empty list of the i'th column
        col[i][1].append(data)
        #Increment i for the next column
        i+=1

## Create the DataFrame

In [407]:
Dict={title:column for (title,column) in col}
df=pd.DataFrame(Dict)

In [408]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


Ignore cells with a borough that is Not assigned.

In [409]:
df = df[df.Borough != 'Not assigned']
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n
5,M5A,Downtown Toronto,Regent Park\n
6,M6A,North York,Lawrence Heights\n


In [410]:
df.shape

(211, 3)

While scrapping the page, name of columns had \n (new line character) so clean the column names of dataframe

In [411]:
df.columns = ['Postcode', 'Borough', 'Neighborhood']

Group By columns where Postcode and Borough values are same. Join value of Neighbourhood column with ,

In [412]:
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n
5,M5A,Downtown Toronto,Regent Park\n
6,M6A,North York,Lawrence Heights\n


In [413]:
df = df['Neighborhood'].groupby([df.Postcode, df.Borough]).apply(lambda Neighborhood: ''.join(Neighborhood.to_string(index=False))).str.replace('(\\n)', '').reset_index()

In [414]:
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,Rouge\n Malvern\n
1,M1C,Scarborough,Highland Creek\n Rouge Hill\n Port Un...
2,M1E,Scarborough,Guildwood\n Morningside\n West Hill\n
3,M1G,Scarborough,Woburn\n
4,M1H,Scarborough,Cedarbrae\n


In [415]:
df = df[df.Neighborhood != 'Not assigned']

In [416]:
df.shape

(103, 3)

### Load Geospatial Cordinates csv file

In [443]:
# The code was removed by Watson Studio for sharing.

In [444]:


df_geo = pd.read_csv(body)
df_geo.columns = ['Postalcode','Latitude','Longitude']
df_geo.head()

Unnamed: 0,Postalcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


<b><u>Merge Neighborhood data with location data</u><b>

In [419]:
neighborhoods = pd.merge(left=df,right=df_geo, left_on='Postcode', right_on='Postalcode')

In [420]:
neighborhoods.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Postalcode,Latitude,Longitude
0,M1B,Scarborough,Rouge\n Malvern\n,M1B,43.806686,-79.194353
1,M1C,Scarborough,Highland Creek\n Rouge Hill\n Port Un...,M1C,43.784535,-79.160497
2,M1E,Scarborough,Guildwood\n Morningside\n West Hill\n,M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn\n,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae\n,M1H,43.773136,-79.239476


<b><u>Remove duplicate PostalCode column</u></b>

In [421]:
neighborhoods = neighborhoods.drop('Postalcode', 1)
neighborhoods.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Rouge\n Malvern\n,43.806686,-79.194353
1,M1C,Scarborough,Highland Creek\n Rouge Hill\n Port Un...,43.784535,-79.160497
2,M1E,Scarborough,Guildwood\n Morningside\n West Hill\n,43.763573,-79.188711
3,M1G,Scarborough,Woburn\n,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae\n,43.773136,-79.239476


In [422]:
neighborhoods.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Rouge\n Malvern\n,43.806686,-79.194353
1,M1C,Scarborough,Highland Creek\n Rouge Hill\n Port Un...,43.784535,-79.160497
2,M1E,Scarborough,Guildwood\n Morningside\n West Hill\n,43.763573,-79.188711
3,M1G,Scarborough,Woburn\n,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae\n,43.773136,-79.239476


In [423]:
#!conda install -c conda-forge folium=0.5.0 --yes

In [424]:
#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab


# Clustering Toronoto Neighborhood data

In [425]:
import folium 
from geopy.geocoders import Nominatim
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

In [426]:
toronto_data = neighborhoods[neighborhoods.Borough.str.contains('Toronto')]
toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches\n,43.676357,-79.293031
41,M4K,East Toronto,The Danforth West\n Riverdale\n,43.679557,-79.352188
42,M4L,East Toronto,The Beaches West\n India Bazaar\n,43.668999,-79.315572
43,M4M,East Toronto,Studio District\n,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park\n,43.72802,-79.38879


In [427]:
#Find Toronto Location

In [428]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="CN_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Manhattan are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Manhattan are 43.653963, -79.387207.


In [429]:
# create map of Manhattan using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [430]:
## 

In [431]:
# The code was removed by Watson Studio for sharing.

In [432]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Get Venues of all Neighborhood

In [433]:
toronto_venues = getNearbyVenues(names=toronto_data['Neighborhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )

 The Beaches\n
 The Danforth West\n         Riverdale\n
 The Beaches West\n     India Bazaar\n
 Studio District\n
 Lawrence Park\n
 Davisville North\n
 North Toronto West\n
 Davisville\n
      Moore Park\n Summerhill East\n
       Deer Park\n  Forest Hill SE\n       Rathnelly\n      South Hill\n Summerhill West\n
 Rosedale\n
    Cabbagetown\n St. James Town\n
 Church and Wellesley\n
 Harbourfront\n  Regent Park\n
         Ryerson\n Garden District\n
 St. James Town\n
 Berczy Park\n
 Central Bay Street\n
 Adelaide\n     King\n Richmond\n
 Harbourfront East\n   Toronto Islands\n     Union Station\n
         Design Exchange\n Toronto Dominion Centre\n
 Commerce Court\n Victoria Hotel\n
 Roselawn\n
 Forest Hill North\n  Forest Hill West\n
     The Annex\n North Midtown\n     Yorkville\n
               Harbord\n University of Toronto\n
         Chinatown\n       Grange Park\n Kensington Market\n
          CN Tower\n     Bathurst Quay\n    Island airport\n Harbourfront West\n  King and Spadi

In [434]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")



# move neighborhood column to the first column

toronto_onehot.drop(labels=['Neighborhood'], axis=1,inplace = True)
toronto_onehot.insert(0, 'Neighborhood', toronto_venues['Neighborhood'])


In [435]:
toronto_onehot.shape

(831, 184)

In [436]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()


In [437]:
toronto_grouped.shape

(38, 184)

In [438]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [439]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Harbord\n University of Toronto\n,Café,Restaurant,Bar,Italian Restaurant,Japanese Restaurant,Bookstore,Bakery,Dessert Shop,Sushi Restaurant,Sandwich Place
1,CN Tower\n Bathurst Quay\n Is...,Airport Service,Airport Lounge,Airport Terminal,Airport,Bar,Coffee Shop,Sculpture Garden,Boutique,Boat or Ferry,Harbor / Marina
2,High Park\n The Junction South\n,Bar,Mexican Restaurant,Café,Thai Restaurant,Flea Market,Arts & Crafts Store,Bakery,Speakeasy,Diner,Fried Chicken Joint
3,Brockton\n Exhibition Place\n Parkdal...,Café,Breakfast Spot,Coffee Shop,Gym,Bakery,Sandwich Place,Burrito Place,Restaurant,Stadium,Italian Restaurant
4,Chinatown\n Grange Park\n Kensi...,Café,Mexican Restaurant,Vietnamese Restaurant,Caribbean Restaurant,Comfort Food Restaurant,Cocktail Bar,Cheese Shop,Coffee Shop,Organic Grocery,Noodle House


## Clustering of Neighborhood using K-means method

In [440]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 3, 0], dtype=int32)

In [441]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
37,M4E,East Toronto,The Beaches\n,43.676357,-79.293031,0,Trail,Health Food Store,Pub,Yoga Studio,Dance Studio,Falafel Restaurant,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Discount Store
41,M4K,East Toronto,The Danforth West\n Riverdale\n,43.679557,-79.352188,0,Greek Restaurant,Ice Cream Shop,Italian Restaurant,Yoga Studio,Brewery,Bookstore,Juice Bar,Spa,Restaurant,Diner
42,M4L,East Toronto,The Beaches West\n India Bazaar\n,43.668999,-79.315572,0,Sandwich Place,Park,Gym,Coffee Shop,Burger Joint,Burrito Place,Pub,Pizza Place,Movie Theater,Steakhouse
43,M4M,East Toronto,Studio District\n,43.659526,-79.340923,0,Café,Coffee Shop,Italian Restaurant,Bakery,Sandwich Place,Bar,Fish Market,Bookstore,Seafood Restaurant,Yoga Studio
44,M4N,Central Toronto,Lawrence Park\n,43.72802,-79.38879,4,Bus Line,Park,Swim School,Dance Studio,Falafel Restaurant,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Discount Store,Diner


Display Venues clustering

In [442]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

<i>End of Assignment</i>

<B>About Author:</B><hr>
<B>Imtiyaz Alamshah, IT Project Manager (PMP)</B>