# Coursera Capstone Final Project
This project aims to determine which locations would be most suitable for someone who wants to sell pizza.

## 1. Import Libraries

In [1]:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import pandas as pd 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from geopy.geocoders import Nominatim

import requests
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import folium
from folium import plugins

## 2. Scrape the Wikipedia webpage and put into Dataframe
The Wikipedia page contains a list of the small districts of Bandung City, Indonesia. In Indonesia, every city is divided into districts: the larger districts are called ***Kecamatan*** and the smaller districts are called ***Kelurahan***. One Kecamatan can contain three or more Kelurahan.

In [2]:
# download the Wikipedia page that lists the districts of Bandung City
url = 'https://id.wikipedia.org/wiki/Daftar_kecamatan_dan_kelurahan_di_Kota_Bandung'
# the context manager closes the HTTP response once the body has been read,
# instead of leaving the connection open for the rest of the session
with urllib.request.urlopen(url) as req:
    article = req.read().decode()

In [3]:
# parse the downloaded HTML and collect every <table> element on the page
soup = BeautifulSoup(article, 'html.parser')
tables = soup.find_all('table')

In [4]:
# find the table whose first five header cells match the district listing
expected_headings = ['Kode Kemendagri', 'Kecamatan', 'Jumlah Kelurahan', 'Status', 'Daftar Kelurahan']
for table in tables:
    headings = [th.text.strip() for th in table.find_all('th')]
    if headings[:5] == expected_headings:
        break
else:
    # without this guard the loop would silently leave `table` bound to the
    # last table on the page and the following cells would parse the wrong data
    raise ValueError('No table with the expected headings was found on the page')

print(headings)

['Kode Kemendagri', 'Kecamatan', 'Jumlah Kelurahan', 'Status', 'Daftar Kelurahan', '', 'TOTAL', '151', '', '']


In [5]:
# column names for the scraped table; the last column is stored as
# 'Kelurahan' rather than the page's 'Daftar Kelurahan'
headings = [
    'Kode Kemendagri',
    'Kecamatan',
    'Jumlah Kelurahan',
    'Status',
    'Kelurahan',
]
# start from an empty dataframe that only carries those columns
df = pd.DataFrame(columns=headings)
df

Unnamed: 0,Kode Kemendagri,Kecamatan,Jumlah Kelurahan,Status,Kelurahan


In [6]:
# collect one record per table row, then build the dataframe in a single step.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and it
# copied the whole frame on every iteration.
records = []
for tr in table.find_all('tr'):
    tds = tr.find_all('td')
    if not tds:
        # rows without any <td> cell (e.g. header rows) carry no data
        continue
    # a data row is expected to have exactly five cells; anything else raises
    kode, kecamatan, jumlah_kelurahan, status, daftarkelurahan = [td.text.strip() for td in tds]
    records.append({
        'Kode Kemendagri': kode,
        'Kecamatan': kecamatan,
        'Jumlah Kelurahan': jumlah_kelurahan,
        'Status': status,
        'Kelurahan': daftarkelurahan,
    })

# rebuilding the frame (instead of appending to it) also makes re-running
# this cell idempotent
df = pd.DataFrame(
    records,
    columns=['Kode Kemendagri', 'Kecamatan', 'Jumlah Kelurahan', 'Status', 'Kelurahan'],
)

In [7]:
# keep only the Kecamatan and Kelurahan columns
unused_columns = ['Kode Kemendagri', 'Jumlah Kelurahan', 'Status']
df.drop(columns=unused_columns, inplace=True)
df.head()

Unnamed: 0,Kecamatan,Kelurahan
0,Andir,Campaka\nCiroyom\nDunguscariang\nGaruda\nKebon...
1,Astana Anyar,Cibadak\nKaranganyar\nKarasak\nNyengseret\nPan...
2,Antapani,Antapani Kidul\nAntapani Kulon\nAntapani Tenga...
3,Arcamanik,Cisaranten Bina Harapan\nCisaranten Endah\nCis...
4,Babakan Ciparay,Babakan\nBabakanciparay\nCirangrang\nMargahayu...


In [8]:
# turn the newline-separated Kelurahan text of each row into a list of names
kelurahan_lists = df['Kelurahan'].str.split("\n")
df['Kelurahan'] = kelurahan_lists
df.head()

Unnamed: 0,Kecamatan,Kelurahan
0,Andir,"[Campaka, Ciroyom, Dunguscariang, Garuda, Kebo..."
1,Astana Anyar,"[Cibadak, Karanganyar, Karasak, Nyengseret, Pa..."
2,Antapani,"[Antapani Kidul, Antapani Kulon, Antapani Teng..."
3,Arcamanik,"[Cisaranten Bina Harapan, Cisaranten Endah, Ci..."
4,Babakan Ciparay,"[Babakan, Babakanciparay, Cirangrang, Margahay..."


In [9]:
# reshape the per-Kecamatan lists into one row per (Kecamatan, Kelurahan) pair
new_df = (
    df.Kelurahan.apply(pd.Series)                        # one column per list position
    .merge(df, left_index=True, right_index=True)        # bring Kecamatan back alongside
    .drop(columns=["Kelurahan"])                         # the list column is no longer needed
    .melt(id_vars=['Kecamatan'], value_name="Kelurahan") # wide -> long
    .drop(columns="variable")                            # list position is irrelevant
    .dropna()                                            # padding for shorter lists
    .sort_values(by=['Kecamatan'])
    .reset_index(drop=True)
)
new_df.head()

Unnamed: 0,Kecamatan,Kelurahan
0,Andir,Campaka
1,Andir,Ciroyom
2,Andir,Garuda
3,Andir,Kebonjeruk
4,Andir,Maleber


## 3. Combine The Dataframe from wikipedia and coordinate Dataframe
I will use the coordinates of each Kelurahan (small district) head office, because I could not find another dataset with the longitude and latitude of each district.

In [10]:
# load the head-office coordinates (and elevation) of every Kelurahan
coordinates_csv = 'koordinat-dan-ketinggian-kantor-kelurahan-di-kota-bandung-2014.csv'
coordinates = pd.read_csv(coordinates_csv)
coordinates.head()

Unnamed: 0,Kecamatan,Kelurahan,Lintang Selatan,Bujur Timur,Ketinggian (dpl)
0,Bandung Kulon,Gempolsari,-6.92911,107.55907,696
1,Bandung Kulon,Cigondewah Kaler,-6.93411,107.56361,700
2,Bandung Kulon,Cigondewah Kidul,-6.94386,107.56005,686
3,Bandung Kulon,Cigondewah Rahayu,-6.94889,107.56314,683
4,Bandung Kulon,Caringin,-6.92727,107.57698,702


In [11]:
# attach coordinates to each Kelurahan; rows without a match are discarded
df_merged = (
    pd.merge(new_df, coordinates, how='left', on='Kelurahan')
    .dropna()
    .drop(columns="Kecamatan_y")  # keep the Kecamatan name from the Wikipedia data
)
# rename positionally, then discard the elevation column
df_merged.columns = ['Kecamatan', 'Kelurahan', 'Latitude', 'Longitude', 'Height']
df_merged = df_merged.drop(columns="Height")
df_merged.head()

Unnamed: 0,Kecamatan,Kelurahan,Latitude,Longitude
0,Andir,Campaka,-6.89787,107.56314
1,Andir,Ciroyom,-6.91295,107.58617
2,Andir,Garuda,-6.91596,107.57656
3,Andir,Kebonjeruk,-6.91908,107.60107
4,Andir,Maleber,-6.90734,107.57344


## 4. Plot Bandung City map

In [12]:
# find the coordinate of Bandung City
address = 'Bandung, Indonesia'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match; fail with a
    # clear message instead of an AttributeError on the next line
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Bandung are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Bandung are -6.9344694, 107.6049539.


In [72]:
# base map centred on the geocoded coordinate of Bandung
map_bandung = folium.Map(location=[latitude, longitude], zoom_start=12.5)

# one circle marker per Kelurahan head office, with a "<kelurahan>, <kecamatan>" popup
district_points = zip(
    df_merged['Latitude'],
    df_merged['Longitude'],
    df_merged['Kelurahan'],
    df_merged['Kecamatan'],
)
for lat, lng, kelurahan, kecamatan in district_points:
    label = folium.Popup('{}, {}'.format(kelurahan, kecamatan), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_bandung)

map_bandung

## 5. Get FourSquare's Data

In [15]:
import os

# Foursquare API credentials.
# NOTE(security): these keys are committed in plain text and printed below, so
# they should be treated as leaked: rotate them and supply the new values via
# the FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET environment variables.
# The literals are kept only as a fallback so the notebook still runs as-is.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'WL3I3GC2FHSNC5UGPMYC3C5L1KMAFZPXAZCTMBRPZBBWGADA')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'J1VJFCE4UWUFRP550352VT1CWWANOV0Q21VJOJY0WLTJNAQU')
VERSION = '20200530'  # sent as the `v` parameter of every API request

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: WL3I3GC2FHSNC5UGPMYC3C5L1KMAFZPXAZCTMBRPZBBWGADA
CLIENT_SECRET:J1VJFCE4UWUFRP550352VT1CWWANOV0Q21VJOJY0WLTJNAQU


In [58]:
# Function for finding each pizza seller around each Kelurahan head office and return into dataframe
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=100):
    """Collect Foursquare 'pizza' venues around each given coordinate.

    One ``venues/explore`` request is sent per (latitude, longitude) pair,
    using the module-level ``CLIENT_ID``, ``CLIENT_SECRET`` and ``VERSION``.
    A location whose request fails or whose response has an unexpected shape
    is reported on stdout and skipped, so it contributes no rows.

    Parameters
    ----------
    names : iterable
        Label of each location; used only in diagnostic messages.
    latitudes, longitudes : iterable
        Coordinates of each location, iterated in lockstep with ``names``.
    radius : int, default 500
        Forwarded as the ``radius`` parameter of the request.
    LIMIT : int, default 100
        Forwarded as the ``limit`` parameter of the request.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        is empty (but still has these columns) when nothing was found.
    """
    search_query = 'pizza'
    columns = ['Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # let requests build and URL-encode the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'query': search_query,
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request
        try:
            payload = requests.get(endpoint, params=params).json()
        except (requests.RequestException, ValueError) as err:
            print("request error", name, err)
            print("-------------------------------------------")
            continue

        try:
            items = payload["response"]['groups'][0]['items']
        except (KeyError, IndexError, TypeError):
            # report the payload already received instead of sending a second
            # request, and skip the location so that results from a previous
            # iteration are never appended again
            print("group error", name, payload)
            print("-------------------------------------------")
            continue

        # keep only the relevant information for each nearby venue
        for item in items:
            try:
                venue = item['venue']
                # a venue without any category is kept, with category None,
                # rather than discarding every venue of this location
                categories = venue.get('categories') or [{}]
                venue_rows.append((
                    venue['name'],
                    venue['location']['lat'],
                    venue['location']['lng'],
                    categories[0].get('name'),
                ))
            except (KeyError, TypeError):
                print("malformed venue skipped", item)
                print("-------------------------------------------")

    # passing the columns explicitly keeps the schema even with zero rows
    return pd.DataFrame(venue_rows, columns=columns)

In [59]:
# query Foursquare around the head office of every Kelurahan
bandung_venues = getNearbyVenues(
    df_merged['Kelurahan'],
    df_merged['Latitude'],
    df_merged['Longitude'],
)

-------------------------------------------


In [64]:
# Drop duplicates in case one venue was collected from multiple Kelurahan.
# A venue is identified by its full coordinate pair: de-duplicating on the
# latitude alone would also drop distinct venues that share a latitude.
final_df = (
    bandung_venues
    .sort_values('Venue Latitude')
    .drop_duplicates(subset=['Venue Latitude', 'Venue Longitude'], keep='first')
    .reset_index(drop=True)
)
print(final_df.shape)
final_df.head()

(108, 4)


Unnamed: 0,Venue,Venue Latitude,Venue Longitude,Venue Category
0,J-qeeys pizza,-6.945992,107.660345,Pizza Place
1,Pizza Hut,-6.94594,107.641876,Pizza Place
2,Martabak Asia,-6.943416,107.667167,Pizza Place
3,Warung Nasi Suryalaya,-6.943411,107.623078,Pizza Place
4,Pizza Hut Buah Batu,-6.94338,107.674318,Pizza Place


## 6. Create Heat Map From Pizza Seller Data

In [65]:
# base map centred on the geocoded coordinate of Bandung
map_pizza = folium.Map(location=[latitude, longitude], zoom_start=12.5)

# one circle marker per pizza seller, with a "<venue>, <category>" popup
seller_points = zip(
    final_df['Venue Latitude'],
    final_df['Venue Longitude'],
    final_df['Venue'],
    final_df['Venue Category'],
)
for lat, lng, venue, kategori in seller_points:
    label = folium.Popup('{}, {}'.format(venue, kategori), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_pizza)

# overlay a heat map built from the same venue coordinates
hm_data = final_df[["Venue Latitude", "Venue Longitude"]]
heat_layer = plugins.HeatMap(hm_data, radius=50, blur=30)
map_pizza.add_child(heat_layer)
map_pizza

In [66]:
# number of venues per Foursquare category, most frequent first
category_counts = final_df['Venue Category'].value_counts()
category_counts.to_frame()

Unnamed: 0,Venue Category
Pizza Place,75
Café,7
Coffee Shop,4
Bakery,4
Steakhouse,3
American Restaurant,2
Eastern European Restaurant,2
Bar,2
French Restaurant,1
Lounge,1


In [67]:
# compare the number of distinct venue names with the number of venues
distinct_names = len(final_df['Venue'].unique())
total_venues = final_df.shape[0]
print('There is '+ str(distinct_names) + ' different name of venue from ' + str(total_venues) + ' total venues')

There is 91 different name of venue from 108 total venues


In [68]:
# the ten venue names that occur most often in the collected data
venue_name_counts = final_df["Venue"].value_counts().to_frame()
venue_name_counts.head(10)

Unnamed: 0,Venue
Pizza Hut,9
PHD (Pizza Hut Delivery),5
Domino's Pizza,3
Ngopi Doeloe,2
PizzaHut,2
Magic Pizza,2
Bober Cafe,1
Fakultas Kedokteran,1
Tizi's Restaurant & Bar,1
Pizza Hermes,1
