### Week 1 - Segmenting and Clustering Neighborhoods in Toronto

In [None]:
import requests
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

import folium
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim

#### Web-scrape Canada's list of postal codes from its Wikipedia page

In [None]:
# Wikipedia article listing Canadian postal codes for the "M" prefix;
# the raw HTTP response is parsed in a later cell.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url=url)

In [None]:
# Parse the downloaded HTML with the built-in parser and keep the first
# <table> element found in the document.
soup = BeautifulSoup(markup=response.text, features='html.parser')
table = soup.find(name='table')

In [None]:
# Build one record per table cell; cells whose <span> text is exactly
# 'Not assigned' are skipped.
content_table = []

for td in table.findAll('td'):
    if td.span.text == 'Not assigned':
        continue

    # The postal code is the first three characters of the cell's <b> text.
    postal_code = td.b.text[:3]

    # Text before the first '(' is the borough; the segment right after it
    # is the neighbourhood list, with ' /' separators turned into commas.
    pieces = td.span.text.split('(')
    hoods = pieces[1].strip(')')
    hoods = hoods.replace(' /', ',').replace(')', ' ').strip(' ')

    content_table.append({
        'Postalcode': postal_code,
        'Borough': pieces[0],
        'Neighborhood': hoods,
    })

In [None]:
# Turn the scraped records into a DataFrame and preview the first rows.
df = pd.DataFrame(data=content_table)
df.head(5)

In [None]:
# Map specific malformed borough strings (name fused with extra text in the
# scraped cell) to readable labels. Only exact matches are replaced.
borough_fixes = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    # Fixed: the replacement label was misspelled as 'Etobivoke'.
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}
df['Borough'] = df['Borough'].replace(borough_fixes)

In [None]:
# Report (rows, columns) of the cleaned table.
print(f'Shape of the dataframe {df.shape}')

#### Order the dataframe to match the "Geospatial_Coordinates.csv" file

In [None]:
# Sort by postal code (ascending) and renumber the rows from zero.
order_df = df.sort_values('Postalcode').reset_index(drop=True)
order_df.head()

In [None]:
# Load the coordinates file from the working directory with a fresh 0..n-1 index.
coordinates_df = pd.read_csv('Geospatial_Coordinates.csv').reset_index(drop=True)

In [None]:
# Join on the postal code itself instead of gluing the two tables together by
# row position: a missing, extra or re-ordered row in either table would
# otherwise pair a neighbourhood with another code's coordinates without any
# error. A left join keeps every scraped row; codes absent from the
# coordinates file get NaN latitude/longitude.
Toronto_df = order_df.merge(
    coordinates_df,
    how='left',
    left_on='Postalcode',
    right_on='Postal Code',
)

# The coordinates file's key column duplicates 'Postalcode' after the join.
Toronto_df = Toronto_df.drop(columns=['Postal Code'])
Toronto_df.head()

#### Narrowing my clustering/segmentation task to the Borough with the most Neighborhoods

In [None]:
# Count the non-null entries in every column for each borough.
borough_groups = Toronto_df.groupby(by='Borough')
borough_groups.count()

In [None]:
# Keep only the rows whose borough is exactly 'North York' and renumber them.
is_north_york = Toronto_df['Borough'] == 'North York'
northYork_df = Toronto_df.loc[is_north_york].reset_index(drop=True)
northYork_df.head(n=24)

In [None]:
# Look up a reference point for North York; latitude/longitude are reused
# below as the centre of the folium maps.
geolocator = Nominatim(user_agent='tor_explorer')
location = geolocator.geocode('North York, TOR')
latitude, longitude = location.latitude, location.longitude
print(f'Coordinates for North York in Toronto are the following: {latitude} / {longitude}')

In [None]:
# Base map centred on the geocoded North York coordinates.
map_northYork = folium.Map(location=[latitude, longitude], zoom_start=11)

# Appearance shared by every neighbourhood marker.
marker_style = {
    'radius': 4,
    'color': 'blue',
    'fill': True,
    'fill_color': '#3186cc',
    'fill_opacity': 0.7,
    'parse_html': False,
}

# One circle per neighbourhood, with the neighbourhood name as its popup.
hood_points = zip(northYork_df['Latitude'], northYork_df['Longitude'], northYork_df['Neighborhood'])
for lat, lng, hood_name in hood_points:
    popup = folium.Popup(hood_name, parse_html=True)
    marker = folium.CircleMarker([lat, lng], popup=popup, **marker_style)
    marker.add_to(map_northYork)

map_northYork

#### Initialize Foursquare API credentials

In [None]:
import os

# SECURITY NOTE(review): these Foursquare credentials are committed in plain
# text. Rotate them and supply the real values through the environment
# variables below; the literals remain only as a fallback so the notebook keeps
# running unchanged when the variables are not set.
CLIENT_ID = os.environ.get(
    'FOURSQUARE_CLIENT_ID',
    'NKIICUFP3F2YR2CNU31ATDOOGFMZE2MDEHC3ZORF3C0EUSUI',
)
CLIENT_SECRET = os.environ.get(
    'FOURSQUARE_CLIENT_SECRET',
    'WMNZ2GATYAXECUDL4YZX0AEIMUBY4EHYNMYKALOU0FY3QAZR',
)
VERSION = '20200615'  # sent as the `v` query parameter of every request
LIMIT = 100  # sent as the `limit` query parameter of every request

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect the Foursquare venues around each neighbourhood in one table.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Parallel sequences walked together with ``zip``: a neighbourhood
        label and the coordinates used as the search centre.
    radius : int, default 500
        Sent as the ``radius`` query parameter (metres according to the
        Foursquare documentation -- confirm).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        has these columns even when no venue was returned at all.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT, and
    prints each neighbourhood name as it is processed.
    """
    columns = [
        'Neighborhood',
        'Neighborhood Latitude',
        'Neighborhood Longitude',
        'Venue',
        'Venue Latitude',
        'Venue Longitude',
        'Venue Category',
    ]

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        # A timeout keeps one stalled request from hanging the whole run.
        payload = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params=params,
            timeout=30,
        ).json()

        # The JSON envelope key is lowercase 'response'.
        items = payload['response']['groups'][0]['items']

        for item in items:
            venue = item['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # A venue with an empty category list gets None, not an IndexError.
                categories[0]['name'] if categories else None,
            ))

    # Passing `columns` here (rather than assigning them afterwards) also
    # works when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Query venues for every North York neighbourhood (default search radius).
venue_query = {
    'names': northYork_df['Neighborhood'],
    'latitudes': northYork_df['Latitude'],
    'longitudes': northYork_df['Longitude'],
}
northYork_venues = getNearbyVenues(**venue_query)

In [None]:
# Size of the venue table, followed by a preview of its first rows.
venue_table_shape = northYork_venues.shape
print(venue_table_shape)
northYork_venues.head()

In [None]:
# Number of distinct venue categories (a missing value counts as one category).
category_count = northYork_venues['Venue Category'].nunique(dropna=False)
print(f'There are {category_count} uniques categories')

In [None]:
# one hot encoding: one indicator column per venue category
northYork_onehot = pd.get_dummies(northYork_venues[['Venue Category']], prefix="", prefix_sep="")

# If a venue category happens to be named 'Neighborhood', its indicator column
# would be overwritten by the label column added below; rename it first.
if 'Neighborhood' in northYork_onehot.columns:
    northYork_onehot = northYork_onehot.rename(
        columns={'Neighborhood': 'Neighborhood (Venue Category)'}
    )

# add the neighborhood label as the first column
northYork_onehot.insert(0, 'Neighborhood', northYork_venues['Neighborhood'])
northYork_onehot.head(5)

In [None]:
# Average each category indicator per neighbourhood, i.e. the share of a
# neighbourhood's venues that fall in each category.
per_hood = northYork_onehot.groupby('Neighborhood')
northYork_grouped = per_hood.mean().reset_index()
northYork_grouped

In [None]:
# (rows, columns) of the per-neighbourhood frequency table
n_rows, n_cols = northYork_grouped.shape
print((n_rows, n_cols))

In [None]:
# Print the five most frequent venue categories of every neighbourhood.
num_top_venues = 5

for hood in northYork_grouped['Neighborhood']:
    print(f"----{hood}----")

    # Transpose this neighbourhood's row into a two-column (venue, freq) table.
    is_hood = northYork_grouped['Neighborhood'] == hood
    freq_table = northYork_grouped[is_hood].T.reset_index()
    freq_table.columns = ['venue', 'freq']

    # The first transposed row is the 'Neighborhood' label itself, not a frequency.
    freq_table = freq_table.iloc[1:]
    freq_table['freq'] = freq_table['freq'].astype(float)
    freq_table = freq_table.round({'freq': 2})

    ranked = freq_table.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first element of ``row`` is skipped, the remaining values are sorted
    in descending order, and the index labels of the first
    ``num_top_venues`` entries are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues: ranks 1-3 take their own
# ordinal suffix, every later rank takes 'th' (chosen explicitly instead of
# through a bare `except:` that would hide unrelated errors)
columns = ['Neighborhood']
for rank in range(1, num_top_venues + 1):
    suffix = indicators[rank - 1] if rank <= len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(rank, suffix))

# create a new dataframe with one row per neighbourhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = northYork_grouped['Neighborhood']

# fill each row with that neighbourhood's top categories, most common first
for pos in range(northYork_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[pos, 1:] = return_most_common_venues(
        northYork_grouped.iloc[pos, :], num_top_venues
    )

neighborhoods_venues_sorted.head()

In [None]:
# set number of clusters
kclusters = 5

# Feature matrix: every category-frequency column, without the label column.
# (The original stored this under a different name than the one passed to
# KMeans, and used the positional `axis` argument that pandas 2.0 removed.)
northYork_grouped_clustering = northYork_grouped.drop(columns='Neighborhood')

# run k-means clustering; random_state is fixed so re-runs give the same labels
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(northYork_grouped_clustering)

# check cluster labels generated for the first rows of the dataframe
kmeans.labels_[0:10]

In [None]:
# add clustering labels as the first column; when the cell is re-run the column
# already exists and insert() would raise, so overwrite it in place instead
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# attach the cluster label and top venues to each North York neighbourhood row
# (which carries latitude/longitude); rows without a match get NaN in the
# joined columns
northYork_merged = northYork_df.join(
    neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood'
)

northYork_merged.head()  # check the last columns!

In [None]:
import colorsys

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced hue per cluster, as hex
# strings (built with the standard library; the names `cm` and `colors` used
# previously were never imported)
rainbow = []
for cluster_id in range(kclusters):
    red, green, blue = colorsys.hsv_to_rgb(cluster_id / kclusters, 1.0, 1.0)
    rainbow.append('#{:02x}{:02x}{:02x}'.format(
        round(red * 255), round(green * 255), round(blue * 255)
    ))

# add markers to the map; rows with no cluster label (NaN) are skipped because
# they have no colour to draw with
labelled = northYork_merged.dropna(subset=['Cluster Labels'])
for lat, lon, poi, cluster in zip(labelled['Latitude'], labelled['Longitude'],
                                  labelled['Neighborhood'], labelled['Cluster Labels']):
    # a column that contains NaN is stored as float; list indexing needs an int
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [None]:
def cluster_examination(num_cluster, dataframe):
    """Return the rows of ``dataframe`` that belong to one cluster.

    Rows are kept when their 'Cluster Labels' value equals ``num_cluster``.
    The result holds the column at position 1 followed by every column from
    position 5 to the end.
    """
    in_cluster = dataframe['Cluster Labels'] == num_cluster
    kept_positions = [1, *range(5, dataframe.shape[1])]
    return dataframe.loc[in_cluster, dataframe.columns[kept_positions]]

In [None]:
# Inspect the neighbourhoods assigned to cluster label 1.
northYork_cluster1 = cluster_examination(num_cluster=1, dataframe=northYork_merged)
northYork_cluster1.head(5)