# K-Means Clustering of Municipalities Based on Venue Categories in Miami-Dade County


*The Capstone Project of IBM Professional Certificate Courses at Coursera*

**Kangrui Lu**

**Libraries imported**

In [1]:
import requests
import pandas as pd
import numpy as np 
import random 
import json
import time
import matplotlib.cm as cm
import matplotlib.colors as colors

from IPython.display import Image 
from IPython.core.display import HTML 
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
import folium
#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim
print('Folium installed')
print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2020.6.20          |   py36h9f0ad1d_0         151 KB  conda-forge
    altair-4.1.0               |             py_1         614 KB  conda-forge
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    ca-certificates-2020.6.20  |       hecda079_0         145 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    branca-0.4.1               |             py_0          26 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ------------------------------------------------------------
                       

**List of municipalities in Miami-Dade**

In [2]:
# Fetch the GeoJSON dataset (quietly) and save it as miami_data.json for the cells below.
!wget -q -O 'miami_data.json' https://opendata.arcgis.com/datasets/b671f3bc435d40928bf22b5f44410600_0.geojson

In [3]:
# Read the downloaded file and parse its JSON text into Python objects.
with open('miami_data.json') as geojson_file:
    miami_data = json.loads(geojson_file.read())

In [4]:
# Keep only the list stored under the 'features' key; each entry is read below via its 'properties'.
miami_municipality = miami_data['features']

Construct a DataFrame of the Miami-Dade municipalities

In [5]:
# Start from an empty frame that only declares the two columns to be filled.
column_names = ['MuniID','Municipality'] 
df_MIA = pd.DataFrame(columns=column_names)

In [6]:
# Collect one record per GeoJSON feature and build the frame in a single call.
# Appending row by row copies the whole frame on every iteration, and
# DataFrame.append no longer exists in pandas 2.0.
municipality_records = [
    {
        'MuniID': feature['properties']['MUNICID'],
        'Municipality': feature['properties']['NAME'],
    }
    for feature in miami_municipality
]
# Passing the column list keeps both columns present even if no feature was read.
df_MIA = pd.DataFrame(municipality_records, columns=['MuniID', 'Municipality'])

df_MIA.head()

Unnamed: 0,MuniID,Municipality
0,5,MIAMI SPRINGS
1,23,NORTH BAY VILLAGE
2,32,MIAMI LAKES
3,24,KEY BISCAYNE
4,26,VIRGINIA GARDENS


**Foursquare Credentials**

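This section of code was later removed for privacy reasons. Before running the cells below, define `CLIENT_ID`, `CLIENT_SECRET`, `VERSION` and `LIMIT` here; `getNearbyVenues` reads all four.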

**Get coordinates for those municipalities**

Use the loop below to geocode every municipality and store the coordinates in `df_MIA`. Note that several municipalities end up with NaN latitudes. The cells that follow drop the affected rows and append replacements whose coordinates were looked up individually with `geocode`.

In [None]:
# Look up coordinates for every municipality name and store them in df_MIA.
geolocator = Nominatim(user_agent="foursquare_agent")
for row_label, municipality in df_MIA['Municipality'].items():
    location = geolocator.geocode(municipality)
    # Pause between requests so the geocoding service is not hammered.
    time.sleep(2)
    if location is None:
        # geocode() returns None when nothing matches; record NaN instead of
        # crashing on `.latitude` so the row can be repaired later.
        df_MIA.at[row_label, 'lat'] = np.nan
        df_MIA.at[row_label, 'lon'] = np.nan
        continue
    df_MIA.at[row_label, 'lat'] = location.latitude
    df_MIA.at[row_label, 'lon'] = location.longitude

In [10]:
# Remove the rows at positions 30 through 34 (picked by position, dropped by label).
rows_to_drop = [df_MIA.index[position] for position in range(30, 35)]
df_MIA = df_MIA.drop(rows_to_drop)

In [11]:
# The coordinates below were looked up one address at a time with this snippet:
#address = 'GOLDEN BEACH, FL'
#geolocator = Nominatim(user_agent="foursquare_agent")
#location = geolocator.geocode(address)
#latitude = location.latitude
#longitude = location.longitude
#print(latitude, longitude)

row_30 = {'MuniID':'18', 'Municipality':'EL PORTAL', 'lat':25.8553739, 'lon':-80.193103}
row_31 = {'MuniID':'15', 'Municipality':'WEST MIAMI', 'lat':25.7632181, 'lon':-80.2958939}
row_32 = {'MuniID':'3', 'Municipality':'CORAL GABLES', 'lat':25.72149, 'lon':-80.2683838}
row_33 = {'MuniID':'19', 'Municipality':'GOLDEN BEACH', 'lat':25.9660825, 'lon':-80.12097619912299}
# pd.concat is used because DataFrame.append was removed in pandas 2.0;
# ignore_index renumbers the rows 0..n-1 as before.
df_MIA = pd.concat(
    [df_MIA, pd.DataFrame([row_30, row_31, row_32, row_33])],
    ignore_index=True,
)

In [13]:
# Sanity check: report the frame's dimensions, then preview the first rows.
print(df_MIA.shape)
df_MIA.head()

(34, 4)


Unnamed: 0,MuniID,Municipality,lat,lon
0,5,MIAMI SPRINGS,25.821848,-80.292708
1,23,NORTH BAY VILLAGE,25.846207,-80.153935
2,32,MIAMI LAKES,25.911621,-80.321275
3,24,KEY BISCAYNE,25.696835,-80.163526
4,26,VIRGINIA GARDENS,25.810376,-80.302273


**Explore venues in these municipalities**

In [14]:
def getNearbyVenues(names, latitudes, longitudes, radius=5000):
    """Query the Foursquare explore endpoint around each location and tabulate the venues.

    Relies on the notebook-level names CLIENT_ID, CLIENT_SECRET, VERSION and
    LIMIT, which must be defined before this function is called.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Label and coordinates of each location to explore; consumed in
        parallel with ``zip``.
    radius : int, optional
        Value sent as the ``radius`` query parameter (default 5000).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns Municipality, Muni_Lat,
        Muni_Lon, Venue, Venue_Lat, Venue_Lon and Venue_Category. When no
        venue is returned the frame is empty but still has these columns.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    venue_columns = ['Municipality',
                     'Muni_Lat',
                     'Muni_Lon',
                     'Venue',
                     'Venue_Lat',
                     'Venue_Lon',
                     'Venue_Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        response = requests.get(url)
        # Surface credential/quota problems as an HTTP error instead of a
        # confusing KeyError while digging through the payload below.
        response.raise_for_status()

        # A location with no recommendations may come back without groups.
        groups = response.json()["response"].get('groups') or []
        results = groups[0]['items'] if groups else []

        for v in results:
            venue = v['venue']
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # Some venues carry no category; store None rather than
                # failing on categories[0].
                categories[0]['name'] if categories else None))

    # Naming the columns at construction time also works for zero rows, where
    # assigning seven names to an empty frame would raise.
    return pd.DataFrame(venue_rows, columns=venue_columns)

In [None]:
# Explore venues around every municipality using the default search radius.
miami_venues = getNearbyVenues(names=df_MIA['Municipality'],
                                   latitudes=df_MIA['lat'],
                                   longitudes=df_MIA['lon']
                                  )

MIAMI SPRINGS
NORTH BAY VILLAGE
MIAMI LAKES
KEY BISCAYNE
VIRGINIA GARDENS
MIAMI BEACH
NORTH MIAMI BEACH
AVENTURA
SOUTH MIAMI
HIALEAH
OPA-LOCKA
HIALEAH GARDENS
BISCAYNE PARK
SURFSIDE
DORAL
PINECREST
INDIAN CREEK VILLAGE
SUNNY ISLES BEACH
MIAMI
SWEETWATER
MIAMI GARDENS
HOMESTEAD
BAY HARBOR ISLANDS
FLORIDA CITY


In [None]:
# Report how many venue rows were collected, then preview the first few.
print(miami_venues.shape)
miami_venues.head()

In [None]:
# Count the non-null entries per column for each municipality, i.e. how many venues came back for it.
miami_venues.groupby('Municipality').count()

Frequency of occurrence of each category by municipality

In [None]:
# One indicator column per venue category (no prefix on the column names).
miami_onehot = pd.get_dummies(
    miami_venues[['Venue_Category']],
    prefix="",
    prefix_sep="",
)
# Attach the municipality label, then move that last column to the front.
miami_onehot['Municipality'] = miami_venues['Municipality']
fixed_columns = [miami_onehot.columns[-1], *miami_onehot.columns[:-1]]
miami_onehot = miami_onehot[fixed_columns]

# Averaging the indicators per municipality yields each category's share of its venues.
per_municipality = miami_onehot.groupby('Municipality')
miami_grouped = per_municipality.mean().reset_index()
miami_grouped.head()

Top 10 venues of each municipality

In [None]:
def most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in ``row``, skipping its first entry.

    Parameters
    ----------
    row : pandas.Series
        The first element is ignored; the remaining values are ranked.
    num_top_venues : int
        Maximum number of labels to return.

    Returns
    -------
    numpy.ndarray
        Index labels ordered from the largest value to the smallest, cut to
        at most ``num_top_venues`` entries.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[:num_top_venues]

num_top_venues = 10

# Ordinal suffixes for the first three ranks; every later rank uses 'th'.
indicators = ['st', 'nd', 'rd']

# Build the column labels: '1st Most Common Venue', '2nd ...', ..., '10th ...'.
columns = ['Municipality']
for rank in range(1, num_top_venues + 1):
    # Pick the suffix explicitly rather than relying on a bare `except`
    # around an out-of-range lookup, which would also hide unrelated errors.
    suffix = indicators[rank - 1] if rank <= len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(rank, suffix))

miamimuni_venues_sorted = pd.DataFrame(columns=columns)
miamimuni_venues_sorted['Municipality'] = miami_grouped['Municipality']

# Fill each municipality's row with its top categories, most frequent first.
for ind in np.arange(miami_grouped.shape[0]):
    miamimuni_venues_sorted.iloc[ind, 1:] = most_common_venues(miami_grouped.iloc[ind, :], num_top_venues)

miamimuni_venues_sorted.head()

**K-Means Clustering**

In [None]:
kclusters = 5

# Feature matrix: the per-category frequencies without the municipality label.
# The keyword form is required because pandas 2.0 no longer accepts the axis
# as a positional argument to drop().
miami_grouped_clustering = miami_grouped.drop(columns='Municipality')
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(miami_grouped_clustering)

# insert() raises if the column already exists, so remove a label column left
# over from a previous run of this cell before adding the fresh one.
if 'Cluster Labels' in miamimuni_venues_sorted.columns:
    miamimuni_venues_sorted = miamimuni_venues_sorted.drop(columns='Cluster Labels')
miamimuni_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# join() keeps every row of df_MIA; a municipality that has no row in the
# venue table gets NaN in the label and venue columns.
miami_merged = df_MIA.join(miamimuni_venues_sorted.set_index('Municipality'), on='Municipality')

miami_merged.head()

Examine the clusters

In [None]:
# Rows labelled 0: keep column 1 plus everything from column 5 onward, then
# flip the table so each selected row becomes a column.
cluster1 = (
    miami_merged[miami_merged['Cluster Labels'] == 0]
    .iloc[:, [1] + list(range(5, miami_merged.shape[1]))]
    .T
)
cluster1

In [None]:
# Rows labelled 1: keep column 1 plus everything from column 5 onward, then
# flip the table so each selected row becomes a column.
cluster2 = (
    miami_merged[miami_merged['Cluster Labels'] == 1]
    .iloc[:, [1] + list(range(5, miami_merged.shape[1]))]
    .T
)
cluster2

In [None]:
# Rows labelled 2: keep column 1 plus everything from column 5 onward, then
# flip the table so each selected row becomes a column.
cluster3 = (
    miami_merged[miami_merged['Cluster Labels'] == 2]
    .iloc[:, [1] + list(range(5, miami_merged.shape[1]))]
    .T
)
cluster3

In [None]:
# Rows labelled 3: keep column 1 plus everything from column 5 onward, then
# flip the table so each selected row becomes a column.
cluster4 = (
    miami_merged[miami_merged['Cluster Labels'] == 3]
    .iloc[:, [1] + list(range(5, miami_merged.shape[1]))]
    .T
)
cluster4

In [None]:
# Rows labelled 4: keep column 1 plus everything from column 5 onward, then
# flip the table so each selected row becomes a column.
cluster5 = (
    miami_merged[miami_merged['Cluster Labels'] == 4]
    .iloc[:, [1] + list(range(5, miami_merged.shape[1]))]
    .T
)
cluster5

**Overview of Miami-Dade population by municipality**

In [None]:
# Load the population-by-city CSV straight from its raw GitHub URL and preview it.
url = 'https://raw.githubusercontent.com/KL98/Coursera_Capstone/master/MiamiDade_populationbycity.csv'
miami_pop = pd.read_csv(url)
miami_pop.head()

In [None]:
# Strip the thousands separators so the population strings can be cast to int.
population_digits = miami_pop['2010 Population'].str.replace(',', '')
miami_pop = (
    miami_pop
    .assign(**{'2010 Population': population_digits})
    .astype({'2010 Population': int})
    .rename(columns={"Incorporated Community": "Municipality", "2010 Population": "Population"})
    .drop(columns=['Designation'])
)
# Upper-case the names so they match the spelling used in df_MIA.
miami_pop['Municipality'] = miami_pop['Municipality'].str.upper()

# Left merge: every municipality in df_MIA is kept, with or without a population match.
miami_pop_coor = df_MIA.merge(miami_pop, on='Municipality', how='left')
miami_pop_coor.head()

In [None]:
# Drop the row currently at position 16.
# NOTE(review): presumably the municipality that found no population match in the merge — confirm before re-running, since the position is hard-coded.
miami_pop_coor = miami_pop_coor.drop(miami_pop_coor.index[16])

In [None]:
# Persist the integer conversion: astype() returns a new frame, so without the
# assignment the Population column would keep its previous dtype.
miami_pop_coor = miami_pop_coor.astype({'Population': 'int64'})
miami_pop_coor.dtypes

Geographic Choropleth Map

In [None]:
# Download the GeoJSON that the choropleth below uses as geo_data and save it as miami.json.
!wget --quiet https://opendata.arcgis.com/datasets/bd523e71861749959a7f12c9d0388d1c_0.geojson -O miami.json

In [None]:
# Path of the downloaded GeoJSON file; passed to the choropleth as geo_data.
miami_geo = r'miami.json'

In [None]:
# Six evenly spaced bin edges from the smallest to the largest population.
threshold_scale = np.linspace(miami_pop_coor['Population'].min(),
                              miami_pop_coor['Population'].max(),
                              6, dtype=int)
threshold_scale = threshold_scale.tolist()
# Nudge the top edge up so the maximum population falls inside the last bin.
threshold_scale[-1] = threshold_scale[-1] + 1

miami_pop_map = folium.Map(location=[25.7617, -80.1918], zoom_start=10)
miami_pop_map.choropleth(
    geo_data=miami_geo,
    data=miami_pop_coor,
    columns=['Municipality', 'Population'],
    key_on='feature.properties.NAME',
    threshold_scale=threshold_scale,
    fill_color='YlOrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Population by Municipality'
)

# Explicit colour per cluster label. These are the same colours the markers
# had before, when label 0 only reached "pink" through a negative list index.
cluster_colors = {0: "pink", 1: "blue", 2: "grey", 3: "green", 4: "purple"}

for lat, lon, poi, cluster in zip(miami_merged['lat'], miami_merged['lon'], miami_merged['Municipality'], miami_merged['Cluster Labels']):
    # Municipalities without venue data have no label after the left join;
    # they cannot be coloured by cluster, so they get no marker.
    if pd.isna(cluster):
        continue
    # A single missing label turns the whole column into floats; cast back so
    # the colour lookup works and the popup reads "Cluster 0", not "Cluster 0.0".
    cluster = int(cluster)
    marker_color = cluster_colors[cluster]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(miami_pop_map)

miami_pop_map