# IBM Capstone Project: Segmenting and Clustering Suburbs in Melbourne

## 1) Data Acquisition and Preparation

In [None]:
import pandas as pd
import numpy as np
import requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

!pip install xlrd
!conda install -c conda-forge geopy --yes
!conda install -c conda-forge folium=0.5.0 --yes

from geopy.geocoders import Nominatim
import folium

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import matplotlib.colors as colors



# import k-means from clustering stage
from scipy.spatial.distance import cdist
from sklearn import metrics 
from sklearn.cluster import KMeans

#!pip install bs4
#from bs4 import BeautifulSoup

print('Libraries imported.')

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.
Collecting xlrd
  Downloading xlrd-1.2.0-py2.py3-none-any.whl (103 kB)
[K     |████████████████████████████████| 103 kB 3.5 MB/s eta 0:00:01
[?25hInstalling collected packages: xlrd
Successfully installed xlrd-1.2.0
Collecting package metadata (current_repodata.json): done
Solving environment: \ 

### a) List of Melbourne Suburbs and Corresponding Post Codes

In [None]:
# Load the suburb reference table from a local CSV. Later cells read its
# 'Suburb', 'Latitude' and 'Longitude' columns.
melb_suburb = pd.read_csv('melbourne_suburbs.csv')

# `Index.str.strip()` returns a new Index, so the result has to be assigned back;
# calling it without assignment leaves any stray whitespace in the header names,
# which breaks column lookups and the later merge on 'Suburb'.
melb_suburb.columns = melb_suburb.columns.str.strip()

# `reset_index()` also returns a copy. drop=True renumbers the rows without
# adding a spurious 'index' column.
melb_suburb = melb_suburb.reset_index(drop=True)
melb_suburb


### b) Suburb Population (will be used to choose which suburbs to analyze)

In [None]:
# Load the population forecast workbook into a dataframe.
# NOTE: every clean-up step below mutates the frame in place, so this cell is
# not idempotent -- re-run it from the read_excel line, never partially.
melb_population_forecast = pd.read_excel('Forecast_Pop_By_Area.xls')

# Removing irrelevant columns and rows:
# 1) discard every row that contains at least one missing value;
melb_population_forecast.dropna(axis=0, inplace=True)
# 2) use the first remaining row as the column header, then remove that row from the data;
melb_population_forecast.columns = melb_population_forecast.iloc[0]
melb_population_forecast.drop(melb_population_forecast.index[0], axis=0, inplace=True)
# 3) remove four more rows by position. These drops are order-dependent: each one
#    shifts the positions seen by the next call (index[11] appears twice and removes
#    two different rows), so the statements must not be reordered.
#    NOTE(review): presumably these are total/aggregate or non-suburb rows -- confirm
#    against the workbook.
melb_population_forecast.drop(melb_population_forecast.index[6], axis=0, inplace=True)
melb_population_forecast.drop(melb_population_forecast.index[11], axis=0, inplace=True)
melb_population_forecast.drop(melb_population_forecast.index[11], axis=0, inplace=True)
melb_population_forecast.drop(melb_population_forecast.index[0], axis=0, inplace=True)
# 4) drop the columns that are not needed. Note the mixed label types: '2016' is a
#    string while the later years are floats. Exactly three columns must remain for
#    the rename below to succeed.
melb_population_forecast.drop(['Total change', '2016', 2026.0, 2031.0, 2036.0, 2041.0], axis=1,inplace=True)

# Give the three remaining columns readable names.
header_names = ['Suburb', '2021 Population Forecast', 'Avg. Annual % Change']
melb_population_forecast.columns = header_names
#melb_population_forecast['Suburb'] = melb_population_forecast.astype('str')

# Sorting the table based on Forecasted Population 2021 (largest first):
melb_population_forecast.sort_values(['2021 Population Forecast'], inplace=True, ascending=False)

melb_population_forecast

In [None]:
# Final population table: suburb and 2021 forecast only.
# A non-mutating drop keeps this cell idempotent. The previous in-place drop raised
# KeyError when the cell was run a second time, and `melb_pop_forecast_final` was
# merely another name for `melb_population_forecast` rather than its own frame.
melb_pop_forecast_final = melb_population_forecast.drop(columns='Avg. Annual % Change')

melb_pop_forecast_final

Based on the population data above, we shall focus on the 10 most populated suburbs:

### c) Merging the above dataframes: 

In [None]:
# The ten suburbs selected from the population ranking above.
suburb_list = [
    'Melbourne (CBD)',
    'Southbank',
    'Carlton',
    'North Melbourne',
    'Docklands',
    'Kensington',
    'Parkville',
    'East Melbourne',
    'South Yarra',
    'Port Melbourne',
]

# Keep only the population rows that belong to those suburbs ...
is_selected_suburb = melb_pop_forecast_final['Suburb'].isin(suburb_list)
top_10_sub = melb_pop_forecast_final[is_selected_suburb]

# ... and left-join the suburb reference data onto them by suburb name, so every
# selected suburb is kept even if it has no match in `melb_suburb`.
melb_merged = pd.merge(top_10_sub, melb_suburb, how='left', on='Suburb')
melb_merged

melb_merged: to be used later for further analysis.

## 2) Exploring Melbourne

 First, let's take a look at ALL the suburbs!

#### Visualizing Melbourne Suburbs using the Folium Library:

In [None]:
# Converting Melbourne address to coordinates:
address = 'Melbourne, AU'

geolocator = Nominatim(user_agent="Melbourne_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match. Fail here with a clear
# message instead of an AttributeError on `location.latitude` below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

# These two values centre every map drawn later in the notebook.
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Melbourne are {}, {}.'.format(latitude, longitude))

In [None]:
# Base map centred on the geocoded Melbourne coordinates.
map_melb = folium.Map(location=[latitude, longitude], zoom_start=12)

# One blue circle marker per suburb; its popup shows the suburb name.
suburb_points = zip(melb_suburb['Latitude'], melb_suburb['Longitude'], melb_suburb['Suburb'])
for suburb_lat, suburb_lng, suburb_name in suburb_points:
    marker = folium.CircleMarker(
        [suburb_lat, suburb_lng],
        radius=5,
        popup=folium.Popup(suburb_name, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_melb)

map_melb

#### Utilizing Foursquare APIs to Explore Venues around Melbourne

In [None]:
import os

# Foursquare credentials are read from the environment so that secrets are never
# stored in the notebook or in its saved outputs. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be hardcoded in this cell has been
# exposed wherever the notebook was shared and should be revoked/rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

# Stop early with an actionable message rather than failing later inside an API call.
if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError(
        'Foursquare credentials are missing: set the FOURSQUARE_CLIENT_ID and '
        'FOURSQUARE_CLIENT_SECRET environment variables.'
    )

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# Never echo the secret itself; only confirm that one was loaded.
print('CLIENT_SECRET:' + '<hidden>')

In [None]:
# Ask Foursquare for up to LIMIT venues within `radius` metres of the geocoded
# Melbourne coordinates. LIMIT is also read by getNearbyVenues() further down.
LIMIT = 100
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore'
explore_params = {
    'client_id': CLIENT_ID,
    'client_secret': CLIENT_SECRET,
    'v': VERSION,
    'll': '{},{}'.format(latitude, longitude),
    'radius': radius,
    'limit': LIMIT,
}

# Print only the endpoint: the fully formatted URL embeds the client secret and
# would leak it into the saved notebook output.
print(url)

# A timeout prevents the cell from hanging forever, and raise_for_status() turns
# an HTTP error (bad credentials, quota exceeded, ...) into an immediate exception
# instead of an error payload silently stored in `results`.
response = requests.get(url, params=explore_params, timeout=30)
response.raise_for_status()
results = response.json()

#### Creating a function to get nearby venues around the suburbs in Melbourne

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint around each location and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location to explore; they are consumed
        together with ``zip``.
    radius : int, default 500
        Search radius passed to the API for every location.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Suburb', 'Suburb Latitude',
        'Suburb Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude' and
        'Venue Category'. The frame is empty (but still has these columns) when
        no venue is returned at all.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.

    Notes
    -----
    Reads the notebook-level names CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    columns = ['Suburb',
               'Suburb Latitude',
               'Suburb Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and encode the query string; a timeout keeps one
        # stalled call from blocking the whole notebook.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        response.raise_for_status()

        # A location with no results may come back without any group.
        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0].get('items', []) if groups else []

        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            if not categories:
                # The analysis is category-driven, so a venue without a category
                # carries no information here (and used to raise IndexError).
                continue
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name']))

    # Passing `columns` to the constructor also works for zero rows, whereas
    # assigning seven names to an empty frame raises a length-mismatch ValueError.
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

#### Calling the Function:

In [None]:
# Collect the venues around every suburb of the reference table into one dataframe.
melb_venues = getNearbyVenues(
    names=melb_suburb['Suburb'],
    latitudes=melb_suburb['Latitude'],
    longitudes=melb_suburb['Longitude'],
)

# Dimensions of the result as (rows, columns).
print(melb_venues.shape)

# Number of distinct venue categories that were returned.
melb_unique_categories = melb_venues['Venue Category'].unique()
print('\n\nThere are {} uniques categories.'.format(len(melb_unique_categories)))

In [None]:
# Display the venues dataframe built above (one row per venue returned for a suburb).
melb_venues

####   

#### The results show that there are 196 unique categories. Let's look at the frequency of each category:

In [None]:
# Print how many venues fall into each category (value_counts lists the most frequent first).
print(melb_venues['Venue Category'].value_counts())

#### Let's thus create a DataFrame consisting of the 10 Most Frequently Occurring Venue Categories:

In [None]:
# Ten most frequent venue categories as a small table: the category becomes a
# regular column and its count is stored under 'Frequency'.
melb_top_venue_cat = (
    melb_venues['Venue Category']
    .value_counts()
    .iloc[:10]
    .to_frame(name='Frequency')
    .reset_index()
    .rename(index=str, columns={"index": "Venue Category"})
)

melb_top_venue_cat

#### Here's what it looks like on a Bar Chart:

In [None]:
import seaborn as sns

# Bar chart of the ten most frequent venue categories, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(18, 7))
sns.barplot(data=melb_top_venue_cat, x="Venue Category", y="Frequency", ax=ax)

# Tilt the category names so that long labels do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=30)
ax.set_title('Top 10 Venue Categories in Melbourne', fontsize=15)
ax.set_xlabel("Venue Category", fontsize=15)
ax.set_ylabel("Frequency", fontsize=15)

# Save before show(): the figure is written to the working directory.
plt.savefig("Most_Freq_Venues.png", dpi=300)
plt.show()

As expected, the most common venue categories are Cafés and Coffee Shops! Melburnians do love their coffee!

#### In fact, we can also compare the number of Cafes/Coffee Shops in each suburb:

In [None]:
# creating a dataframe of all cafes around Melbourne
# One pattern defines what counts as a café / coffee shop. na=False treats a
# missing category as "not a café" instead of producing an unusable NaN mask.
cafe_pattern = 'Coffee Shop|Café|Cafe'
is_cafe = melb_venues['Venue Category'].str.contains(cafe_pattern, na=False)
top_melb_cafe = melb_venues[is_cafe].reset_index(drop=True)

print(top_melb_cafe.shape)

# `top_melb_cafe` already holds only café venues, so the number per suburb is just
# the group size. The previous version re-filtered every group with a narrower
# pattern (without the unaccented 'Cafe'), so rows kept by the filter above could
# silently be left out of the count and the totals disagreed with the printed shape.
compare = top_melb_cafe.groupby('Suburb')['Venue Category'].count()
compare_df = compare.to_frame().reset_index()
compare_df.columns = ['Suburb', 'No of Cafes/Coffee Shops']

# Number the rows from 1 for display.
compare_df.index = np.arange(1, len(compare_df)+1)

In [None]:
# Café counts as a plain list, printed under the chart for reference.
list_cafe_no = compare_df['No of Cafes/Coffee Shops'].to_list()


fig = plt.figure(figsize=(20,9))
bar = sns.barplot(x='Suburb', y='No of Cafes/Coffee Shops', data=compare_df)

bar.set_ylabel("No of Cafes/Coffee Shops", fontsize=14)
bar.set_xlabel("Melbourne Suburbs", fontsize=14)
bar.set_xticklabels(bar.get_xticklabels(),rotation=40)
# `compare_df` is built from the venues of every suburb in the reference table,
# not only the top 10. The title used to be a copy of the later top-10 chart's
# title and therefore mislabelled this figure.
bar.set_title("Number of Cafes/Coffee Shops in Each Melbourne Suburb", fontsize=14)
#plt.savefig("No_of_Cafes_in_Melb.png", dpi=240)
plt.show()
print (list_cafe_no)

##   

## 3) Exploring the Coffee Scene in the Top 10 Suburbs

### Now, let's focus on the more populated suburbs listed earlier and create a dataframe of all cafes around the Top 10 Suburbs:

In [None]:
# Run the venue lookup again, this time for the top-10 suburbs only.
top_sub_venues = getNearbyVenues(
    names=melb_merged['Suburb'],
    latitudes=melb_merged['Latitude'],
    longitudes=melb_merged['Longitude'],
)

# Dimensions of the result as (rows, columns).
print(top_sub_venues.shape)

# Distinct venue categories found around these suburbs.
top_sub_unique_categories = top_sub_venues['Venue Category'].unique()
print('\n\nThere are {} uniques categories.'.format(len(top_sub_unique_categories)))

In [None]:
# Display every venue returned for the Top 10 Suburbs. getNearbyVenues() was called
# with its default radius of 500, so each suburb is searched within that radius.
top_sub_venues

#### Since we're looking for a good location to open a cafe, let's create a dataframe to concentrate just on that:

In [None]:
# Venues around the top 10 suburbs whose category name mentions a café or coffee shop.
cafe_rows = top_sub_venues['Venue Category'].str.contains('Coffee Shop|Café|Cafe')
top_sub_cafe = top_sub_venues.loc[cafe_rows].reset_index(drop=True)

print(top_sub_cafe.shape)
top_sub_cafe

**Assumption: The client is interested in opening a cafe that emphasizes the quality of its coffee. Thus, in this project, Cafés and Coffee Shops are considered to be in the same category.**

#### Let's visualize these cafes on the map

In [None]:
map_cafes = folium.Map(location=[latitude, longitude], zoom_start=13)

# Setting colour schemes for the venues:
suburbs = ['Melbourne (CBD)', 'Southbank', 'Carlton', 'North Melbourne', 'Docklands', 'Kensington', 'Parkville', 'East Melbourne', 'South Yarra', 'Port Melbourne']

# CircleMarker colours are handed to the browser as CSS colours, so every entry
# must be a valid CSS colour name. 'darkpurple' is a folium *icon* colour but not a
# CSS colour, so it is replaced with 'indigo'.
rainbow = ['red', 'blue', 'darkred', 'orange', 'green', 'darkgreen', 'cadetblue', 'purple', 'pink', 'indigo']

# Explicit suburb -> colour lookup. The previous `rainbow[suburbs.index(sub)-1]`
# was off by one: the first suburb wrapped around to the last (invalid) colour.
suburb_colours = dict(zip(suburbs, rainbow))

# add markers to map
for lat, lon, poi, sub in zip(top_sub_cafe['Venue Latitude'],
                              top_sub_cafe['Venue Longitude'],
                              top_sub_cafe['Venue Category'],
                              top_sub_cafe['Suburb']):
    label = folium.Popup(str(poi) + ' ' + str(sub), parse_html=True)
    # Fall back to grey instead of raising if a suburb is missing from the list.
    marker_colour = suburb_colours.get(sub, 'gray')
    folium.CircleMarker(
        [lat, lon],
        radius=7,
        popup=label,
        color=marker_colour,
        fill=True,
        fill_color=marker_colour,
        fill_opacity=0.3).add_to(map_cafes)

map_cafes

In [None]:
# Print how many of the café venues around the top suburbs fall into each category.
print(top_sub_cafe['Venue Category'].value_counts())

These suburbs have a total of 53 Cafes and 23 Coffee Shops

###    

### Let's Find Out the Top 10 Venues for Each of the Top Suburbs:

#### Using One Hot Encoding to get more info on the venue categories:

In [None]:
# One indicator column per venue category; the empty prefix keeps the bare
# category names as column names.
melb_onehot = pd.get_dummies(top_sub_venues[['Venue Category']], prefix="", prefix_sep="")

# Attach the suburb of each venue. The new column is appended at the far right ...
melb_onehot['Suburb'] = top_sub_venues['Suburb']

# ... so rotate the column order by one position to bring it to the front.
*category_columns, suburb_column = melb_onehot.columns
melb_onehot = melb_onehot[[suburb_column] + category_columns]
print("Shape: ", melb_onehot.shape)

In [None]:
# Average every indicator column per suburb. Each value is then the share of that
# suburb's venues which belong to the category.
melb_grouped = melb_onehot.groupby('Suburb', as_index=False).mean()
print("Shape: ", melb_grouped.shape)
melb_grouped

#### Displaying top 10 venues: 

In [None]:
# Writing function to sort venues in descending order: 
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest entries of `row`.

    The first element of `row` is skipped; the notebook passes rows of
    `melb_grouped`, whose first entry is the suburb name rather than a frequency.
    The labels are returned as a NumPy array ordered from the largest value
    downwards. If fewer entries exist than requested, all of them are returned.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [None]:
num_top_venues = 10

# Ordinal suffixes for ranks 1-3; every later rank takes 'th'.
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
# The suffix is chosen with an explicit bounds check. The previous bare `except:`
# relied on an IndexError for ranks above 3 and would also have hidden any
# unrelated error (including KeyboardInterrupt).
columns = ['Suburb']
for ind in np.arange(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe
top_sub_venues_sorted = pd.DataFrame(columns=columns)
top_sub_venues_sorted['Suburb'] = melb_grouped['Suburb']

# Fill each suburb's row with its most frequent venue categories, best first.
for ind in np.arange(melb_grouped.shape[0]):
    top_sub_venues_sorted.iloc[ind, 1:] = return_most_common_venues(melb_grouped.iloc[ind, :], num_top_venues)

top_sub_venues_sorted

By showing which categories are popular in each suburb, we can potentially identify a gap or a need for a specific type of service!

#### Furthermore, we can compare the number of Cafes/Coffee Shops in each suburb:

In [None]:
# `top_sub_cafe` already contains only café / coffee-shop venues, so the number per
# suburb is simply the group size. The previous version re-filtered every group with
# a narrower pattern ('Coffee Shop|Café', without the unaccented 'Cafe'), so rows
# kept by the filter that built `top_sub_cafe` could silently be left out of the count.
melb_top_cafe = top_sub_cafe.groupby('Suburb')['Venue Category'].count()
melb_top_cafe_df = melb_top_cafe.to_frame().reset_index()
melb_top_cafe_df.columns = ['Suburb', 'No of Cafes/Coffee Shops']

# Number the rows from 1 for display.
melb_top_cafe_df.index = np.arange(1, len(melb_top_cafe_df)+1)

In [None]:
# Café counts per suburb as a plain list, printed for reference.
list_cafe_no = melb_top_cafe_df['No of Cafes/Coffee Shops'].tolist()
print (list_cafe_no)

# Bar chart of the same counts, drawn on an explicit Axes.
fig, bar = plt.subplots(figsize=(12, 8))
sns.barplot(data=melb_top_cafe_df, x='Suburb', y='No of Cafes/Coffee Shops', ax=bar)

bar.set_title("Number of Cafes/Cofee Shops in Each of the Top Suburbs", fontsize=14)
bar.set_xlabel("Top 10 Suburbs in Melbourne", fontsize=14)
bar.set_ylabel("No of Cafes/Coffee Shops", fontsize=14)
# Tilt the suburb names so that they do not overlap.
bar.set_xticklabels(bar.get_xticklabels(), rotation=40)
plt.show()

**Note that some suburbs may not appear on the bar chart. This is because the popular spots returned by the Foursquare API depend on the foot traffic at the time the API call is made. Therefore, we may get slightly different popular venues at different times of the day.**

###   

## 4) Clustering the suburbs using K-Means

### Before clustering, we need to determine the best k value using Elbow Method: 

In [None]:
# Feature matrix for clustering: the per-suburb category frequencies without the
# label column. `columns=` is used because passing the axis positionally
# (`drop('Suburb', 1)`) is no longer accepted by pandas 2.0 and later.
melb_grouped_clustering = melb_grouped.drop(columns='Suburb')

distortions = []
inertias = []
mapping1 = {}
mapping2 = {}

# KMeans cannot form more clusters than there are samples, so the upper bound is
# capped by the number of suburbs that actually have venue data.
n_samples = melb_grouped_clustering.shape[0]
K = range(1, min(10, n_samples + 1))
for k in K:
    # Fit once per k (the model used to be fitted twice with identical settings).
    kmeanModel = KMeans(n_clusters=k, random_state=0).fit(melb_grouped_clustering)

    # Distortion: mean distance from each suburb to its nearest cluster centre.
    nearest_centre_distance = np.min(
        cdist(melb_grouped_clustering, kmeanModel.cluster_centers_, 'euclidean'), axis=1)
    distortion = sum(nearest_centre_distance) / n_samples

    distortions.append(distortion)
    inertias.append(kmeanModel.inertia_)

    # The same values keyed by k, for the printed tables in the next cells.
    mapping1[k] = distortion
    mapping2[k] = kmeanModel.inertia_

In [None]:
# Visualizing the results:
# a) distortion for every candidate k, first as a table ...
for k_value, distortion in mapping1.items():
    print('{} : {}'.format(k_value, distortion))

# ... then as a line chart in which the "elbow" can be read off.
fig, ax = plt.subplots()
ax.plot(K, distortions, 'bx-')
ax.set_xlabel('Values of K')
ax.set_ylabel('Distortion')
ax.set_title('The Elbow Method using Distortion')
plt.show()



In [None]:
# b) inertia for every candidate k, first as a table ...
for k_value, inertia in mapping2.items():
    print('{} : {}'.format(k_value, inertia))

# ... then as a line chart in which the "elbow" can be read off.
fig, ax = plt.subplots()
ax.plot(K, inertias, 'bx-')
ax.set_xlabel('Values of K')
ax.set_ylabel('Inertia')
ax.set_title('The Elbow Method using Inertia')
plt.show()


Based on the evaluation above, the elbow is located at K=4. We shall thus perform K-Means clustering with 4 clusters.

### Now we can perform K-Means Clustering

In [None]:
kclusters = 4

# `columns=` is used because passing the axis positionally (`drop('Suburb', 1)`)
# is no longer accepted by pandas 2.0 and later.
melb_grouped_clustering = melb_grouped.drop(columns='Suburb')

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(melb_grouped_clustering)
print ("Check Cluster labels :", kmeans.labels_[0:10]) # checking cluster labels for each row of the dataframe


# add clustering labels
# DataFrame.insert raises ValueError when the column already exists, which made
# this cell fail on a second run. Overwrite the labels in that case so the cell
# stays re-runnable.
if 'Cluster Labels' in top_sub_venues_sorted.columns:
    top_sub_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    top_sub_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [None]:
# Combine the suburb coordinates in `melb_merged` with each suburb's cluster label
# and most common venues. The lookup is keyed by suburb name, and every row of
# `melb_merged` is kept.
venue_profile_by_suburb = top_sub_venues_sorted.set_index('Suburb')
melb_sub_coord_cluster = melb_merged.join(venue_profile_by_suburb, on='Suburb')
melb_sub_coord_cluster

### Visualize resulting clusters:

#### Option 1:

In [None]:
# create map
map_clusters1 = folium.Map(location=[latitude, longitude], zoom_start=11, tiles='cartodbpositron')

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(melb_sub_coord_cluster['Latitude'], melb_sub_coord_cluster['Longitude'], melb_sub_coord_cluster['Suburb'], melb_sub_coord_cluster['Cluster Labels']):
    # A suburb for which no venues were returned has no cluster label after the
    # join (NaN). It cannot be coloured, so it is skipped instead of crashing.
    if pd.isna(cluster):
        continue
    # A single NaN turns the whole label column into floats; list indices must be ints.
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ', Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=label,
        # `cluster-1` is kept so the colours match the other cluster map
        # (label 0 wraps around to the last colour).
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters1)

map_clusters1

#### Option 2: Cluster Map with Radius of Clusters Representing No of Cafes/Coffeeshops in Each Suburb

In [None]:
# create map
map_clusters2 = folium.Map(location=[latitude, longitude], zoom_start=11, tiles='cartodbpositron')

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Number of cafés / coffee shops per suburb, computed from `top_sub_cafe`. This
# replaces a hand-typed list of counts that silently went stale whenever the
# Foursquare results changed and had to stay aligned with a second suburb list.
cafe_counts = top_sub_cafe['Suburb'].value_counts()

# add markers to the map
for lat, lon, poi, cluster in zip(melb_sub_coord_cluster['Latitude'], melb_sub_coord_cluster['Longitude'], melb_sub_coord_cluster['Suburb'], melb_sub_coord_cluster['Cluster Labels']):
    # A suburb for which no venues were returned has no cluster label after the
    # join (NaN). It cannot be coloured, so it is skipped instead of crashing.
    if pd.isna(cluster):
        continue
    # A single NaN turns the whole label column into floats; list indices must be ints.
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ', Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        # Marker size is proportional to the café count. A suburb without any
        # café gets radius 0, exactly as with the former hard-coded zeros.
        radius=float(cafe_counts.get(poi, 0)) * 0.5,
        popup=label,
        # `cluster-1` is kept so the colours match the other cluster map
        # (label 0 wraps around to the last colour).
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters2)

map_clusters2

### Examine the Clusters:

We will examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, we can then assign a name to each cluster.

In [None]:
def total_venue_categories(df):
    """Count how often each venue category occurs in the 'Most Common Venue' columns of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ranking columns whose names end in 'Most Common Venue'
        (e.g. '1st Most Common Venue' ... '10th Most Common Venue'). Any number
        of such columns is accepted; all other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Columns 'Venue Category' and 'Venue Count', most frequent category
        first, limited to the top 10 rows. Empty when `df` has no ranking column.
    """
    venue_columns = [col for col in df.columns if str(col).endswith('Most Common Venue')]
    if not venue_columns:
        return pd.DataFrame(columns=['Venue Category', 'Venue Count'])

    # Stack all ranking columns into one long Series. pd.concat replaces the former
    # chain of Series.append calls, a method that was removed in pandas 2.0.
    df_all_venues = pd.concat([df[col] for col in venue_columns])

    df_cnt = df_all_venues.value_counts().to_frame().reset_index()
    df_cnt.columns = ['Venue Category', 'Venue Count']

    # value_counts() already orders the rows by descending count, so no further
    # sort is needed (the old sort_values call discarded its result anyway).
    #return the top 10 venue categories
    return df_cnt.head(10)

#### Cluster 1

In [None]:
# Rows of the suburbs that were assigned cluster label 0.
in_cluster0 = melb_sub_coord_cluster['Cluster Labels'] == 0

# Columns to show: the column at position 1 plus everything from position 5 onwards.
# NOTE(review): the selection is positional -- confirm that position 1 is the
# column intended for display.
cluster0_columns = melb_sub_coord_cluster.columns[[1] + list(range(5, melb_sub_coord_cluster.shape[1]))]
Melb_Cluster0 = melb_sub_coord_cluster.loc[in_cluster0, cluster0_columns]

print ("No of Suburbs in Cluster Label 0: %d" % len(Melb_Cluster0))
Melb_Cluster0

In [None]:
# Tally the venue categories across all suburbs that carry cluster label 0.
cluster0_suburbs = melb_sub_coord_cluster[melb_sub_coord_cluster['Cluster Labels'] == 0]
t0 = total_venue_categories(cluster0_suburbs)

# Horizontal bar chart: one bar per venue category, its length being the count.
y_pos = np.arange(len(t0['Venue Category']))
fig, ax = plt.subplots()
ax.barh(y_pos, t0['Venue Count'], align='center')
ax.set_yticks(y_pos)
ax.set_yticklabels(t0['Venue Category'])
ax.set_xlabel('Number of venue categories')
ax.set_title('Venue categories in cluster 0')

plt.show()



#### Cluster 2

In [None]:
# Rows of the suburbs that were assigned cluster label 1.
in_cluster1 = melb_sub_coord_cluster['Cluster Labels'] == 1

# Columns to show: the column at position 1 plus everything from position 5 onwards.
# NOTE(review): the selection is positional -- confirm that position 1 is the
# column intended for display.
cluster1_columns = melb_sub_coord_cluster.columns[[1] + list(range(5, melb_sub_coord_cluster.shape[1]))]
Melb_Cluster1 = melb_sub_coord_cluster.loc[in_cluster1, cluster1_columns]

print ("No of Suburbs in Cluster Label 1: %d" % len(Melb_Cluster1))
Melb_Cluster1

#### Cluster 3

In [None]:
# Rows of the suburbs that were assigned cluster label 2.
in_cluster2 = melb_sub_coord_cluster['Cluster Labels'] == 2

# Columns to show: the column at position 1 plus everything from position 5 onwards.
# NOTE(review): the selection is positional -- confirm that position 1 is the
# column intended for display.
cluster2_columns = melb_sub_coord_cluster.columns[[1] + list(range(5, melb_sub_coord_cluster.shape[1]))]
Melb_Cluster2 = melb_sub_coord_cluster.loc[in_cluster2, cluster2_columns]

print ("No of Suburbs in Cluster Label 2: %d" % len(Melb_Cluster2))
Melb_Cluster2

#### Cluster 4

In [None]:
# Rows of the suburbs that were assigned cluster label 3.
in_cluster3 = melb_sub_coord_cluster['Cluster Labels'] == 3

# Columns to show: the column at position 1 plus everything from position 5 onwards.
# NOTE(review): the selection is positional -- confirm that position 1 is the
# column intended for display.
cluster3_columns = melb_sub_coord_cluster.columns[[1] + list(range(5, melb_sub_coord_cluster.shape[1]))]
Melb_Cluster3 = melb_sub_coord_cluster.loc[in_cluster3, cluster3_columns]

print ("No of Suburbs in Cluster Label 3: %d" % len(Melb_Cluster3))
Melb_Cluster3

In [None]:
# Tally the venue categories across all suburbs that carry cluster label 3.
cluster3_suburbs = melb_sub_coord_cluster[melb_sub_coord_cluster['Cluster Labels'] == 3]
t3 = total_venue_categories(cluster3_suburbs)

# Horizontal bar chart: one bar per venue category, its length being the count.
y_pos = np.arange(len(t3['Venue Category']))
fig, ax = plt.subplots()
ax.barh(y_pos, t3['Venue Count'], align='center')
ax.set_yticks(y_pos)
ax.set_yticklabels(t3['Venue Category'])
ax.set_xlabel('Number of venue categories')
ax.set_title('Venue categories in cluster 3')

plt.show()