In [1]:
import numpy as np
import pandas as pd
import requests

from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim

from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import folium
%matplotlib inline

## Get Jakarta Districts

In [2]:
# Download the Indonesian Wikipedia page listing Jakarta's districts and parse it.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
html_data = requests.get(
    'https://id.wikipedia.org/wiki/Daftar_kecamatan_dan_kelurahan_di_Daerah_Khusus_Ibukota_Jakarta',
    timeout=30,
)
html_data.raise_for_status()
soup = BeautifulSoup(html_data.text, "html.parser")

In [3]:
# Every <table> element found in the parsed page.
tables = soup.find_all('table')

# for each city part, get districts
# City parts listed in the order of the tables they are read from:
# the first name maps to tables[1], the last one to tables[6].
city_parts = [
    "Central Jakarta",
    "North Jakarta",
    "East Jakarta",
    "South Jakarta",
    "West Jakarta",
    "Thousand Islands",
]
jakarta_tables = {
    part: tables[position]
    for position, part in enumerate(city_parts, start=1)
}

In [4]:
# Build one record per district from the per-city tables.
jakarta_district = []

for part_name, part_table in jakarta_tables.items():
    for table_row in part_table.find_all('tr'):
        cells = table_row.find_all('td')
        # Only rows with exactly four <td> cells are read as district rows.
        if len(cells) != 4:
            continue
        code_text, name_text, count_text = (cell.text.strip('\n') for cell in cells[:3])
        jakarta_district.append({
            "code": code_text,
            "name": name_text,
            "city": part_name,
            "n_subdistricts": count_text,
        })

# Column order follows the key order of the records above.
district_data = pd.DataFrame(jakarta_district)

In [5]:
district_data.head()

Unnamed: 0,code,name,city,n_subdistricts
0,31.71.05,Cempaka Putih,Central Jakarta,3
1,31.71.01,Gambir,Central Jakarta,6
2,31.71.08,Johar Baru,Central Jakarta,4
3,31.71.03,Kemayoran,Central Jakarta,8
4,31.71.06,Menteng,Central Jakarta,5


In [6]:
district_data[district_data['city'] == "Thousand Islands"]

Unnamed: 0,code,name,city,n_subdistricts
42,31.01.01,Kepulauan Seribu Utara,Thousand Islands,3
43,31.01.02,Kepulauan Seribu Selatan,Thousand Islands,3


In [7]:
district_data.shape

(44, 4)

## Get latitude and longitude for each district

In [8]:
# Geocode every district with Nominatim. Districts that cannot be resolved fall
# back to the coordinates of Jakarta as a whole.
geolocator = Nominatim(user_agent="jakartapp")
jakarta = geolocator.geocode('Jakarta')
if jakarta is None:
    # Without this result there is neither a fallback coordinate nor a map centre.
    raise RuntimeError("Could not geocode 'Jakarta'; no fallback coordinates available")

latitude = []
longitude = []

# Iterate over just the two columns the query needs. This cell adds columns to
# district_data, so unpacking whole rows would fail with "too many values to
# unpack" if the cell were run a second time.
for name, city in zip(district_data['name'], district_data['city']):
    location = geolocator.geocode(name + ", " + city)
    if location is None:  # location not found, use jakarta whole geoloc
        location = jakarta
    latitude.append(location.latitude)
    longitude.append(location.longitude)

district_data['latitude'] = latitude
district_data['longitude'] = longitude

In [9]:
district_data.head()

Unnamed: 0,code,name,city,n_subdistricts,latitude,longitude
0,31.71.05,Cempaka Putih,Central Jakarta,3,-6.181214,106.868548
1,31.71.01,Gambir,Central Jakarta,6,-6.176684,106.830653
2,31.71.08,Johar Baru,Central Jakarta,4,-6.186206,106.857134
3,31.71.03,Kemayoran,Central Jakarta,8,-6.162546,106.85689
4,31.71.06,Menteng,Central Jakarta,5,-6.195026,106.832224


## Add Population Density to data

In [10]:
# data source: https://data.go.id/dataset/jumlah-penduduk-wajib-ktp-dki-jakarta
# Load the 2019 CSV from the local data/ directory and preview its first rows.
# NOTE(review): the dataset slug ("jumlah penduduk wajib KTP") suggests `jumlah` is a
# head count of residents required to hold an ID card rather than a density -- confirm.
density = pd.read_csv("data/2019_population_density.csv")
density.head()

Unnamed: 0,tahun,nama_provinsi,nama_kabupaten/kota,nama_kecamatan,nama_kelurahan,jenis_kelamin,jumlah
0,2019,PROVINSI DKI JAKARTA,KAB.ADM.KEP.SERIBU,KEP. SERIBU UTR,P. PANGGANG,Laki-laki,2474
1,2019,PROVINSI DKI JAKARTA,KAB.ADM.KEP.SERIBU,KEP. SERIBU UTR,P. KELAPA,Laki-laki,2486
2,2019,PROVINSI DKI JAKARTA,KAB.ADM.KEP.SERIBU,KEP. SERIBU UTR,P. HARAPAN,Laki-laki,934
3,2019,PROVINSI DKI JAKARTA,KAB.ADM.KEP.SERIBU,KEP. SERIBU SLT,P. UNTUNG JAWA,Laki-laki,847
4,2019,PROVINSI DKI JAKARTA,KAB.ADM.KEP.SERIBU,KEP. SERIBU SLT,P. TIDUNG,Laki-laki,2038


In [11]:
density["nama_kecamatan"].unique()

array(['KEP. SERIBU UTR', 'KEP. SERIBU SLT', 'GAMBIR', 'SAWAH BESAR',
       'KEMAYORAN', 'SENEN', 'CEMPAKA PUTIH', 'MENTENG', 'TANAH ABANG',
       'JOHAR BARU', 'PENJARINGAN', 'TANJUNG PRIOK', 'KOJA', 'CILINCING',
       'PADEMANGAN', 'KELAPA GADING', 'CENGKARENG', 'GROGOL PETAMBURAN',
       'TAMAN SARI', 'TAMBORA', 'KEBON JERUK', 'KALI DERES', 'PALMERAH',
       'KEMBANGAN', 'TEBET', 'SETIA BUDI', 'MAMPANG PRAPATAN',
       'PASAR MINGGU', 'KEBAYORAN LAMA', 'CILANDAK', 'KEBAYORAN BARU',
       'PANCORAN', 'JAGAKARSA', 'PESANGGRAHAN', 'MATRAMAN', 'PULO GADUNG',
       'JATINEGARA', 'KRAMAT JATI', 'PASAR REBO', 'CAKUNG', 'DUREN SAWIT',
       'MAKASAR', 'CIRACAS', 'CIPAYUNG'], dtype=object)

In [12]:
# Total `jumlah` per district (nama_kecamatan), returned as a flat frame with the
# columns nama_kecamatan and jumlah.
# GroupBy.sum() takes no column argument (its first parameter is numeric_only),
# so the column to aggregate is selected explicitly before summing.
district_density = density.groupby("nama_kecamatan", as_index=False)["jumlah"].sum()
district_density.head()

Unnamed: 0,nama_kecamatan,jumlah
0,CAKUNG,399252
1,CEMPAKA PUTIH,75267
2,CENGKARENG,406806
3,CILANDAK,166128
4,CILINCING,302248


In [13]:
# change format of district names
# The CSV spells district names in upper case. These are the names whose target
# spelling is not simply the title-cased CSV spelling.
special_dnames = {
    "KEP. SERIBU UTR": "Kepulauan Seribu Utara",    # thousand islands
    "KEP. SERIBU SLT": "Kepulauan Seribu Selatan",  # thousand islands
    "KALI DERES": "Kalideres",
    "SETIA BUDI": "Setiabudi",
}

# Exactly one output name is produced per input row, so the list always has the
# same length as the column it replaces. Names missing from the override table
# are title-cased. Whitespace is collapsed only for the table lookup.
new_form_dnames = [
    special_dnames.get(" ".join(d_names.split()), d_names.lower().title())
    for d_names in district_density['nama_kecamatan'].values
]

district_density['nama_kecamatan'] = new_form_dnames

In [14]:
# Rename the two columns in place; "name" is the key used later for merging.
column_names = {"nama_kecamatan": "name", "jumlah": "population_density"}
district_density.rename(columns=column_names, inplace=True)

In [15]:
district_density.head()

Unnamed: 0,name,population_density
0,Cakung,399252
1,Cempaka Putih,75267
2,Cengkareng,406806
3,Cilandak,166128
4,Cilincing,302248


In [16]:
district_density['name'].values

array(['Cakung', 'Cempaka Putih', 'Cengkareng', 'Cilandak', 'Cilincing',
       'Cipayung', 'Ciracas', 'Duren Sawit', 'Gambir',
       'Grogol Petamburan', 'Jagakarsa', 'Jatinegara', 'Johar Baru',
       'Kalideres', 'Kebayoran Baru', 'Kebayoran Lama', 'Kebon Jeruk',
       'Kelapa Gading', 'Kemayoran', 'Kembangan',
       'Kepulauan Seribu Selatan', 'Kepulauan Seribu Utara', 'Koja',
       'Kramat Jati', 'Makasar', 'Mampang Prapatan', 'Matraman',
       'Menteng', 'Pademangan', 'Palmerah', 'Pancoran', 'Pasar Minggu',
       'Pasar Rebo', 'Penjaringan', 'Pesanggrahan', 'Pulo Gadung',
       'Sawah Besar', 'Senen', 'Setiabudi', 'Taman Sari', 'Tambora',
       'Tanah Abang', 'Tanjung Priok', 'Tebet'], dtype=object)

In [17]:
district_density.shape

(44, 2)

### Merge District Density and District Data

In [18]:
# find difference of district names in both dataframes
def _names_missing_from(candidates, reference):
    """Return the values of `candidates` that do not occur in `reference`, in order."""
    reference_values = reference.values
    return [value for value in candidates.values if value not in reference_values]


# Both lists are empty when the two frames contain the same district names.
print(_names_missing_from(district_density['name'], district_data['name']))
print(_names_missing_from(district_data['name'], district_density['name']))

[]
[]


In [19]:
# Attach each district's population total to its row of district_data.
# validate="one_to_one" makes the merge fail loudly if a district name is
# duplicated on either side, instead of silently multiplying rows.
complete_district_data = district_data.merge(district_density, on="name", validate="one_to_one")

In [20]:
complete_district_data.shape

(44, 7)

In [21]:
complete_district_data.head()

Unnamed: 0,code,name,city,n_subdistricts,latitude,longitude,population_density
0,31.71.05,Cempaka Putih,Central Jakarta,3,-6.181214,106.868548,75267
1,31.71.01,Gambir,Central Jakarta,6,-6.176684,106.830653,76192
2,31.71.08,Johar Baru,Central Jakarta,4,-6.186206,106.857134,105332
3,31.71.03,Kemayoran,Central Jakarta,8,-6.162546,106.85689,191652
4,31.71.06,Menteng,Central Jakarta,5,-6.195026,106.832224,68584


## Cluster goodness of putting a mall
If a district has few subdistricts (small `n_subdistricts`), then we can assume that the district is smaller and therefore has little land available to build a mall.<br>
If a district has a small `population_density`, there is a smaller chance of the mall becoming known.

In [22]:
# Group the districts into five clusters on (n_subdistricts, population_density).
# The features are cast to float explicitly so the model never depends on an
# implicit conversion of the scraped values.
# NOTE: the features are not scaled, so the much larger population values
# dominate the distances between districts.
cluster_features = complete_district_data[["n_subdistricts", "population_density"]].astype(float)

# random_state and n_init are pinned so that re-running the notebook reproduces
# the same cluster labels. Later cells refer to clusters by their label.
clustering = KMeans(n_clusters=5, n_init=10, random_state=42)
clusters = clustering.fit_predict(cluster_features)

In [23]:
complete_district_data['cluster'] = clusters
complete_district_data.head()

Unnamed: 0,code,name,city,n_subdistricts,latitude,longitude,population_density,cluster
0,31.71.05,Cempaka Putih,Central Jakarta,3,-6.181214,106.868548,75267,3
1,31.71.01,Gambir,Central Jakarta,6,-6.176684,106.830653,76192,3
2,31.71.08,Johar Baru,Central Jakarta,4,-6.186206,106.857134,105332,3
3,31.71.03,Kemayoran,Central Jakarta,8,-6.162546,106.85689,191652,0
4,31.71.06,Menteng,Central Jakarta,5,-6.195026,106.832224,68584,3


In [24]:
clustering.cluster_centers_

array([[6.63636364e+00, 1.78498636e+05],
       [6.50000000e+00, 3.41867000e+05],
       [3.00000000e+00, 9.93900000e+03],
       [5.64285714e+00, 1.02946143e+05],
       [6.36363636e+00, 2.38026818e+05]])

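From the centroids above (first column: mean `n_subdistricts`, second column: mean `population_density`) we can see that:
- Smallest districts, smallest population density: cluster 2
- Smaller districts, smaller population density: cluster 3
- Average districts, average population density: cluster 4
- Bigger districts, bigger population density: cluster 0
- Biggest population density, with about as many subdistricts as clusters 0 and 4: cluster 1 -- Best places to build a Mall

Note that the cluster numbers are arbitrary labels assigned by KMeans; what matters is the centroid values, not the number itself.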

In [29]:
# Cluster labels are arbitrary, so pick the cluster whose centroid has the
# largest population_density (second feature column) instead of hard-coding a label.
best_cluster = clustering.cluster_centers_[:, 1].argmax()
complete_district_data[complete_district_data['cluster'] == best_cluster]['name'].values

array(['Cilincing', 'Tanjung Priok', 'Cakung', 'Duren Sawit',
       'Cengkareng', 'Kalideres'], dtype=object)

In [26]:
# Marker colour for each cluster, indexed by cluster label (0 to 4).
cluster_color = ['red', 'green', 'blue', 'yellow', 'cyan']

In [27]:
# visualize clusters
# Map centred on the geocoded position of Jakarta, with one circle marker per
# district coloured by its cluster label.
map_jakarta = folium.Map(location=[jakarta.latitude, jakarta.longitude], zoom_start=10)

# Select the marker fields by column name instead of by position.
marker_columns = complete_district_data[['name', 'city', 'latitude', 'longitude', 'cluster']]

for name, city, lat, long, cluster in marker_columns.values:
    marker_color = cluster_color[cluster]
    label = folium.Popup(name + ", " + city + ", Cluster " + str(cluster), parse_html=True)
    marker = folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7)
    marker.add_to(map_jakarta)

map_jakarta

<b>Conclusion</b>: Best districts to build malls:
'Cilincing', 'Tanjung Priok', 'Cakung', 'Duren Sawit','Cengkareng', 'Kalideres'