<h3 style = 'color : #000080'><u>Part 1 : Create and Clean Data Frame</u></h3>

In [1]:
#importing numpy and pandas
import numpy as np
import pandas as pd

In [2]:
#import matplotlib
import matplotlib.cm as cm
import matplotlib.colors as colors

In [3]:
# Scrape the Wikipedia page of Toronto ("M") postal codes with pandas.
# read_html returns a list with one DataFrame per HTML table found on the page.
list_of_tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

In [4]:
# Confirm that read_html returned a list of tables rather than a single DataFrame
type(list_of_tables)

list

In [5]:
# The first table in the list is the one holding the postal code, borough and
# neighborhood columns used in the rest of the notebook.
df = list_of_tables[0]

In [6]:
# Preview the first rows of the scraped table
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [7]:
# Count the missing (NaN) values in each column
df.isnull().sum()

Postal code      0
Borough          0
Neighborhood    77
dtype: int64

In [8]:
# Summary (count / unique / top / freq) of every column before cleaning
df.describe()

Unnamed: 0,Postal code,Borough,Neighborhood
count,180,180,103
unique,180,11,98
top,M2M,Not assigned,Downsview
freq,1,77,4


In [9]:
# Remove the postal codes that have no borough assigned. Those rows carry the
# placeholder "Not assigned" in "Borough" and a missing "Neighborhood" (a
# neighborhood cannot exist without a borough), so both conditions are
# filtered explicitly.
# A new frame is built instead of dropping in place: the scraped table kept in
# list_of_tables stays untouched and the cell can be re-run safely.
df = (
    df.loc[df['Borough'] != 'Not assigned']
      .dropna(subset=['Neighborhood'])
      .copy()
)

In [10]:
# Check the result of the drop (the original row labels are still in place here)
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [11]:
# Renumber the rows 0..n-1 after the drop. The result is reassigned rather
# than applied in place, so no other reference to the frame is mutated.
df = df.reset_index(drop = True)

In [12]:
# Summary of every column after the unassigned rows were removed
df.describe()

Unnamed: 0,Postal code,Borough,Neighborhood
count,103,103,103
unique,103,10,98
top,M2M,North York,Downsview
freq,1,24,4


In [13]:
# Preview the cleaned table with its renumbered index
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [14]:
# Replace the "/" separator between neighborhood names with ",".
# regex=False requests a plain substring replacement explicitly, so the result
# does not depend on the pandas version's default for `regex`.
df['Neighborhood'] = df['Neighborhood'].str.replace('/', ',', regex=False)
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


In [15]:
# Report the (rows, columns) shape of the cleaned DataFrame
print("Shape of DataFrame is : ",df.shape)

Shape of DataFrame is :  (103, 3)


<h3 style = 'color : #000080'><u>Part 2 : Fetching Latitude and Longitude</u></h3>

In [16]:
# Geocoder lookups kept failing, so the coordinates of each postal code are
# loaded from the CSV file recommended as the alternative.
lat_lng = pd.read_csv("http://cocl.us/Geospatial_data")

In [17]:
# Preview the postal code / latitude / longitude table
lat_lng.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [18]:
# Left-merge the coordinates onto df, matching df's "Postal code" column
# against lat_lng's "Postal Code" column (the two names differ only in case).
df = df.merge(lat_lng, how = 'left', left_on = 'Postal code', right_on = 'Postal Code')

In [19]:
# Drop the "Postal Code" key column that came from lat_lng; it duplicates
# "Postal code". errors='ignore' keeps the cell safe to re-run once the column
# is already gone, and reassigning avoids mutating the frame in place.
df = df.drop(columns = 'Postal Code', errors = 'ignore')

In [20]:
# Preview the table with the Latitude and Longitude columns attached
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494


<h3 style = 'color : #000080'><u>Part 3 : Exploring Neighborhoods in Toronto</u></h3>

In [21]:
# Keep only the postal codes whose borough name contains "Toronto",
# renumbering the rows of the resulting subset from 0.
df_toronto = (
    df.loc[df['Borough'].str.contains('Toronto')]
      .reset_index(drop = True)
)

In [22]:
# Preview the Toronto-only subset
df_toronto.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [23]:
# Report the (rows, columns) shape of the Toronto-only subset
print('Shape of df_toronto: ',df_toronto.shape)

Shape of df_toronto:  (39, 5)


In [24]:
#import Geopy
import geopy

In [25]:
#import Nominatim
from geopy.geocoders import Nominatim

In [26]:
# Look up the geographical coordinates of Toronto with Nominatim.
# An explicit timeout is set because the public Nominatim service can be slow
# to answer.
geo = Nominatim(user_agent = 'my_application', timeout = 10)
location_toronto = geo.geocode('Toronto')

# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location_toronto is None:
    raise ValueError("Nominatim returned no result for 'Toronto'")

lat = location_toronto.latitude
long = location_toronto.longitude
print('Geographical coordinates of Toronto are {} and {}.'.format(lat,long))

Graphical coordinates of Toronto are 43.6534817 and -79.3839347.


In [27]:
#import Folium
import folium

In [28]:
# Map of the Toronto neighborhoods, centred on the city coordinates.
map_toronto = folium.Map(location = [lat,long], zoom_start= 11.5)

# The loop deliberately uses its own variable names: reusing `lat` here would
# overwrite the Toronto latitude that is read again when later maps are centred.
for nb_lat, nb_lng, nb_name in zip(df_toronto['Latitude'],df_toronto['Longitude'],df_toronto['Neighborhood']):
    folium.CircleMarker(
        [nb_lat, nb_lng],
        radius = 5,
        popup = folium.Popup(nb_name, parse_html = True),
        color = 'blue',
        fill = True,
        fill_color = '#3186CC',
        fill_opacity = 0.7).add_to(map_toronto)

map_toronto

<h4 style = 'color : #FF9933'>Foursquare Credentials</h4>

In [29]:
# Foursquare API credentials are read from the environment so that no secret
# is stored in the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel; a missing variable
# raises KeyError naming it.
# NOTE: the key pair that used to be hardcoded in this cell has been exposed
# and should be revoked and regenerated.
import os

client_id = os.environ['FOURSQUARE_CLIENT_ID']
client_secret = os.environ['FOURSQUARE_CLIENT_SECRET']
version = '20200411'  # value sent as the `v` parameter of every request

Fetching top 10 venues near <b>St. James Town</b> Neighborhood within 200 meters.

In [30]:
limit = 10    # maximum number of venues to request
radius = 200  # search radius in meters

# Select the neighborhood by name instead of by a hardcoded row position, so
# the request still targets St. James Town if the row order of df_toronto
# changes. An IndexError here means the neighborhood is not in df_toronto.
st_james_town = df_toronto.loc[df_toronto['Neighborhood'] == 'St. James Town'].iloc[0]

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&limit={}&radius={}'.format(
        client_id,
        client_secret,
        version,
        st_james_town['Latitude'],
        st_james_town['Longitude'],
        limit,
        radius)

In [31]:
#import requests
import requests as rq

In [33]:
# Fetch the venues around St. James Town. The timeout keeps the cell from
# hanging on an unresponsive server, and raise_for_status() reports HTTP
# errors instead of letting an error payload be treated as data.
response = rq.get(url, timeout = 10)
response.raise_for_status()
st_james_town_result = response.json()

In [34]:
# Inspect the raw JSON answer to see where the venue fields are located
st_james_town_result

{'meta': {'code': 200, 'requestId': '5e93111c660a9f001bf7926b'},
 'response': {'headerLocation': 'St. Lawrence',
  'headerFullLocation': 'St. Lawrence, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 9,
  'suggestedBounds': {'ne': {'lat': 43.6532939018, 'lng': -79.37293481442669},
   'sw': {'lat': 43.6496938982, 'lng': -79.37790098557332}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '574ad72238fa943556d93b8e',
       'name': 'Gyu-Kaku Japanese BBQ',
       'location': {'address': '81 Church St',
        'crossStreet': 'at Adelaide St E',
        'lat': 43.651422275497914,
        'lng': -79.37504693687086,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.651422275497914,
          'lng': -79.37504693687086}],
        'dis

Creating a function that will help to fetch data for all the Neighborhoods.

In [35]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=10):
    """Fetch the venues Foursquare recommends around each neighborhood.

    The notebook-level ``client_id``, ``client_secret`` and ``version`` are
    sent with every request.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Neighborhood names and their coordinates; consumed together with
        ``zip``, so iteration stops at the shortest one.
    radius : int, default 500
        Value sent as the ``radius`` parameter of the request.
    limit : int, default 10
        Value sent as the ``limit`` parameter of the request. It is an
        explicit argument so the function no longer depends on a ``limit``
        variable having been defined by an earlier cell.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty (but keeps these columns) when no venue is returned at all.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # make the GET request; the query string is built by requests
        response = rq.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': client_id,
                'client_secret': client_secret,
                'v': version,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=10,
        )
        response.raise_for_status()
        venues_results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in venues_results:
            venue = v['venue']
            # a venue without any category gets None instead of raising IndexError
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # passing the column names to the constructor also works for zero rows,
    # where assigning `.columns` afterwards would raise a length mismatch
    return pd.DataFrame(rows, columns=columns)

In [37]:
# Collect the nearby venues of every Toronto neighborhood
venues = getNearbyVenues(
    names = df_toronto['Neighborhood'],
    latitudes = df_toronto['Latitude'],
    longitudes = df_toronto['Longitude'],
)

In [38]:
# Report the size of the venues table and preview its first rows
print("Shape of Venues DF: ",venues.shape)
venues.head()

Shape of Venues DF:  (346, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park , Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park , Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park , Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park , Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Regent Park , Harbourfront",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot


In [39]:
# Count how many venues fall into each category
venues['Venue Category'].value_counts()

Coffee Shop                      26
Café                             24
Restaurant                       17
Park                             15
Italian Restaurant               12
Pub                               9
Gym                               7
Mexican Restaurant                6
Bakery                            6
Dessert Shop                      6
Bar                               5
Pizza Place                       5
Brewery                           5
Ice Cream Shop                    5
Sushi Restaurant                  5
Breakfast Spot                    5
American Restaurant               4
Middle Eastern Restaurant         4
Burrito Place                     4
Cocktail Bar                      4
Yoga Studio                       4
Greek Restaurant                  4
Gastropub                         4
Tea Room                          4
Vegetarian / Vegan Restaurant     4
Bookstore                         4
Grocery Store                     3
Liquor Store                

In [40]:
# One-hot encode the venue category with get_dummies: one indicator column
# per category, named exactly after the category (empty prefix and separator).
toronto_onehot = pd.get_dummies(venues[['Venue Category']],prefix = "", prefix_sep = "")

In [41]:
# Attach the neighborhood of each venue so the indicators can be grouped by it.
# NOTE(review): if a venue category were itself named "Neighborhood", this
# assignment would overwrite that indicator column; confirm no such category exists.
toronto_onehot['Neighborhood'] = venues['Neighborhood']  

In [42]:
# Average the indicator columns per neighborhood, then turn the group key back
# into a regular "Neighborhood" column with reset_index().
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

In [43]:
# (rows, columns): one row per neighborhood; the columns are "Neighborhood"
# plus one per venue category
toronto_grouped.shape

(39, 121)

Let's write a function that sorts the venue categories of a neighborhood in descending order of frequency.

In [44]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first entry of ``row`` is skipped; the remaining entries are ordered
    from the largest value to the smallest and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [45]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Build the column names: 'Neighborhood', then '1st Most Common Venue',
# '2nd ...', '3rd ...', '4th ...' and so on. The suffix is picked with an
# explicit bounds check; the former bare `except` would also have hidden
# unrelated errors.
columns = ['Neighborhood']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# Rank the venue categories of every neighborhood, then build the dataframe
# in one step instead of filling an empty frame row by row.
top_venues = [
    return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    for ind in range(toronto_grouped.shape[0])
]
neighborhoods_venues_sorted = pd.DataFrame(top_venues, columns=columns[1:])
neighborhoods_venues_sorted.insert(0, 'Neighborhood', toronto_grouped['Neighborhood'].values)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Cocktail Bar,Farmers Market,Concert Hall,Museum,Restaurant,Liquor Store,Coffee Shop,Park,French Restaurant,Vegetarian / Vegan Restaurant
1,"Brockton , Parkdale Village , Exhibition Place",Coffee Shop,Café,Furniture / Home Store,Pet Store,Breakfast Spot,Gym,Italian Restaurant,Climbing Gym,Bar,Dessert Shop
2,Business reply mail Processing CentrE,Pizza Place,Auto Workshop,Garden Center,Restaurant,Burrito Place,Skate Park,Brewery,Farmers Market,Fast Food Restaurant,Comic Shop
3,"CN Tower , King and Spadina , Railway Lands , ...",Airport Lounge,Airport,Bar,Airport Food Court,Airport Gate,Airport Terminal,Boutique,Plane,Harbor / Marina,Distribution Center
4,Central Bay Street,Coffee Shop,Gastropub,Italian Restaurant,Modern European Restaurant,Middle Eastern Restaurant,Bubble Tea Shop,Park,Food,Department Store,Furniture / Home Store


In [46]:
#import KMeans
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

# Feature matrix: every column of toronto_grouped except the name column.
# The `columns=` keyword is used because pandas 2.x no longer accepts the
# axis of DataFrame.drop as a positional argument.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering; n_init is given explicitly so the number of
# initialisations is fixed rather than left to the library default
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

# check cluster labels generated for the first ten rows of the dataframe
kmeans.labels_[0:10]

array([0, 1, 1, 1, 1, 4, 0, 4, 1, 1])

Create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [47]:
# Add the cluster label of each neighborhood as the first column. When the
# column already exists it is overwritten instead, because DataFrame.insert
# raises on a duplicate column and would make this cell fail on a re-run.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Attach the cluster label and the top venues to every Toronto postal code.
toronto_merged = df_toronto.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

# A neighborhood with no venues has no match in the join and gets NaN labels,
# which also turns the column into floats. Drop such rows and restore integer
# labels so they can be used to index the colour list when drawing the map.
toronto_merged = (
    toronto_merged
    .dropna(subset=['Cluster Labels'])
    .astype({'Cluster Labels': int})
)

toronto_merged.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636,1,Breakfast Spot,Spa,Bakery,Pub,Restaurant,Coffee Shop,Distribution Center,Historic Site,Park,Fish & Chips Shop
1,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494,1,Coffee Shop,Beer Bar,Distribution Center,Mexican Restaurant,Creperie,Park,Burrito Place,Yoga Studio,Arts & Crafts Store,Asian Restaurant
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,1,Music Venue,Plaza,Burger Joint,Burrito Place,Ramen Restaurant,Café,Tea Room,Theater,Clothing Store,Comic Shop
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,1,Coffee Shop,Gym,Italian Restaurant,Restaurant,Middle Eastern Restaurant,BBQ Joint,Japanese Restaurant,Food Truck,Cosmetics Shop,Gastropub
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,1,Trail,Pub,Health Food Store,Yoga Studio,Distribution Center,Comic Shop,Concert Hall,Convenience Store,Cosmetics Shop,Creperie


Visualizing the clusters produced by k-means clustering

In [48]:
# Create the map centred on Toronto. The coordinates are read from the
# geocoding result itself: the bare name `lat` is reused as a loop variable
# elsewhere in the notebook and may no longer hold the city latitude.
map_clusters = folium.Map(
    location=[location_toronto.latitude, location_toronto.longitude],
    zoom_start=12)

# set color scheme for the clusters: one colour per cluster, evenly spaced
# over the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for nb_lat, nb_lon, neigh, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    # rows without a cluster label cannot be coloured; skip them
    if pd.isna(cluster):
        continue
    cluster = int(cluster)
    label = folium.Popup(str(neigh) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [nb_lat, nb_lon],
        radius=5,
        popup=label,
        # cluster k uses the k-th colour directly (the previous `cluster-1`
        # relied on negative-index wraparound for cluster 0)
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters