In [1]:
import pandas as pd
import json
import geopy
from bs4 import BeautifulSoup

## Hawker Centres

In [2]:
#Load the hawker centre geojson file
#GeoJSON is UTF-8 by specification, so decode it explicitly instead of relying on the platform default encoding
with open('./hawker-centres/hawker-centres-geojson.geojson', encoding='utf-8') as jsonfile:
    data = json.load(jsonfile)

In [3]:
#Display the whole parsed GeoJSON dict to inspect its structure
#(the individual entries live under data['features'])
data

{'type': 'FeatureCollection',
 'crs': {'type': 'name',
  'properties': {'name': 'urn:ogc:def:crs:OGC:1.3:CRS84'}},
 'features': [{'type': 'Feature',
   'properties': {'Name': 'kml_1',
    'Description': '<center><table><tr><th colspan=\'2\' align=\'center\'><em>Attributes</em></th></tr><tr bgcolor="#E3E3F3"> <th>ADDRESSBLOCKHOUSENUMBER</th> <td>22</td> </tr><tr bgcolor=""> <th>LATITUDE</th> <td>1.3352899599999999</td> </tr><tr bgcolor="#E3E3F3"> <th>STATUS</th> <td>Existing</td> </tr><tr bgcolor=""> <th>CLEANINGSTARTDATE</th> <td>10/12/2018</td> </tr><tr bgcolor="#E3E3F3"> <th>ADDRESSUNITNUMBER</th> <td></td> </tr><tr bgcolor=""> <th>ADDRESSFLOORNUMBER</th> <td></td> </tr><tr bgcolor="#E3E3F3"> <th>NO_OF_FOOD_STALLS</th> <td>61</td> </tr><tr bgcolor=""> <th>HYPERLINK</th> <td></td> </tr><tr bgcolor="#E3E3F3"> <th>REGION</th> <td>Central</td> </tr><tr bgcolor=""> <th>LONGITUDE</th> <td>103.8570633</td> </tr><tr bgcolor="#E3E3F3"> <th>INFO_ON_CO_LOCATORS</th> <td></td> </tr><tr bgcolor="

In [4]:
#Break the first hawker centre's description into one chunk per <th> title
first_description = data['features'][0]['properties']['Description']
first_description.split('<th>')

['<center><table><tr><th colspan=\'2\' align=\'center\'><em>Attributes</em></th></tr><tr bgcolor="#E3E3F3"> ',
 'ADDRESSBLOCKHOUSENUMBER</th> <td>22</td> </tr><tr bgcolor=""> ',
 'LATITUDE</th> <td>1.3352899599999999</td> </tr><tr bgcolor="#E3E3F3"> ',
 'STATUS</th> <td>Existing</td> </tr><tr bgcolor=""> ',
 'CLEANINGSTARTDATE</th> <td>10/12/2018</td> </tr><tr bgcolor="#E3E3F3"> ',
 'ADDRESSUNITNUMBER</th> <td></td> </tr><tr bgcolor=""> ',
 'ADDRESSFLOORNUMBER</th> <td></td> </tr><tr bgcolor="#E3E3F3"> ',
 'NO_OF_FOOD_STALLS</th> <td>61</td> </tr><tr bgcolor=""> ',
 'HYPERLINK</th> <td></td> </tr><tr bgcolor="#E3E3F3"> ',
 'REGION</th> <td>Central</td> </tr><tr bgcolor=""> ',
 'LONGITUDE</th> <td>103.8570633</td> </tr><tr bgcolor="#E3E3F3"> ',
 'INFO_ON_CO_LOCATORS</th> <td></td> </tr><tr bgcolor=""> ',
 'NO_OF_MARKET_STALLS</th> <td>59</td> </tr><tr bgcolor="#E3E3F3"> ',
 'AWARDED_DATE</th> <td></td> </tr><tr bgcolor=""> ',
 'LANDYADDRESSPOINT</th> <td>35268.64</td> </tr><tr bgcolor="

In [5]:
#Parse the first hawker centre's description with Beautiful Soup
#Name the parser explicitly: otherwise bs4 picks whichever parser happens to be installed and warns about it
text = data['features'][0]['properties']['Description']
soup = BeautifulSoup(text, 'html.parser')

In [6]:
#Retrieve the th tags (the field titles) and show the first five
soup.find_all('th')[:5]

[<th align="center" colspan="2"><em>Attributes</em></th>,
 <th>ADDRESSBLOCKHOUSENUMBER</th>,
 <th>LATITUDE</th>,
 <th>STATUS</th>,
 <th>CLEANINGSTARTDATE</th>]

In [7]:
#Retrieve the td tags (the field values) and show the first five
soup.find_all('td')[:5]

[<td>22</td>,
 <td>1.3352899599999999</td>,
 <td>Existing</td>,
 <td>10/12/2018</td>,
 <td></td>]

<div class="alert alert-block alert-info">
Comparing these values with the full entry above, every title is a <strong>th</strong> tag and its value is the <strong>td</strong> tag one index position behind it, because the first <strong>th</strong> is the table caption and has no value.
</div>

In [8]:
#Show the name, latitude and longitude titles next to their values for the first hawker centre
titles = soup.find_all('th')
values = soup.find_all('td')

#Each value sits one position before its title; the last pair is printed without the extra blank line
for title_pos, trailer in ((18, ('\n',)), (2, ('\n',)), (10, ())):
    print(titles[title_pos].text)
    print(values[title_pos - 1].text, *trailer)

NAME
Toa Payoh Lorong 7 Blk 22 (Kim Keat Palm Market and Food Centre) 

LATITUDE
1.3352899599999999 

LONGITUDE
103.8570633


In [9]:
#Count the entries under data['features'] (one entry per hawker centre)
len(data['features'])

120

In [10]:
def _description_fields(html):
    """Map each <th> title of a description table to the text of the <td> in the same row.

    Looking values up by title (instead of by td position) keeps the extraction
    correct even if an entry lists its fields in a different order.
    """
    #Name the parser explicitly so the result does not depend on which parsers are installed
    table = BeautifulSoup(html, 'html.parser')
    fields = {}
    for row in table.find_all('tr'):
        title, value = row.find('th'), row.find('td')
        #The caption row ("Attributes") has a th but no td, so it is skipped
        if title is not None and value is not None:
            fields[title.text.strip()] = value.text
    return fields


def _to_float(raw):
    """Convert a coordinate string to a float; return None when it is missing or not numeric."""
    try:
        return float(raw)
    except (TypeError, ValueError):
        return None


#Create lists to append the corresponding information to
hawker_name = []
hawker_latitude = []
hawker_longitude = []

#iterate through the entries and parse the text
for feature in data['features']:
    fields = _description_fields(feature['properties']['Description'])

    #A missing field is recorded as None rather than stopping the loop
    hawker_name.append(fields.get('NAME'))
    hawker_latitude.append(_to_float(fields.get('LATITUDE')))
    hawker_longitude.append(_to_float(fields.get('LONGITUDE')))

#Compile the information in a DataFrame
hawker_latlong = pd.DataFrame({'hawker_name':hawker_name,'hawker_latitude':hawker_latitude,'hawker_longitude':hawker_longitude})

In [11]:
#Preview the first rows of the compiled DataFrame
hawker_latlong.head()

Unnamed: 0,hawker_name,hawker_latitude,hawker_longitude
0,Toa Payoh Lorong 7 Blk 22 (Kim Keat Palm Marke...,1.33529,103.857063
1,Our Tampines Hub,1.352983,103.940231
2,Pasir Ris Central Hawker Centre,1.373318,103.951364
3,Hougang Street 21 Blk 209 (Kovan Hougang Marke...,1.359079,103.885925
4,Redhill Lane Blk 79 (Redhill Market),1.28791,103.818398


In [12]:
#Check for null values
#NOTE: isnull() only counts None/NaN, so an empty-string name is not reported here
hawker_latlong.isnull().sum()

hawker_name         0
hawker_latitude     0
hawker_longitude    0
dtype: int64

In [13]:
#Summarise the numeric columns to spot unusual values/variance
hawker_latlong.describe()

Unnamed: 0,hawker_latitude,hawker_longitude
count,120.0,120.0
mean,1.259586,98.651973
std,0.292444,22.727277
min,0.0,0.0
25%,1.29306,103.802406
50%,1.316994,103.843845
75%,1.341776,103.880271
max,1.44399,103.988182


<div class="alert alert-block alert-danger">
    The minimum latitude and longitude are both 0, which cannot be a real location for a hawker centre in Singapore.
</div>

In [14]:
#List the entries whose latitude is exactly 0
hawker_latlong[hawker_latlong['hawker_latitude']==0]

Unnamed: 0,hawker_name,hawker_latitude,hawker_longitude
49,,0.0,0.0
71,Woodleigh Village Hawker Centre,0.0,0.0
78,Bukit Canberra Hawker Centre,0.0,0.0
97,Anchorvale Village Hawker Centre,0.0,0.0
98,Fernvale Hawker Centre,0.0,0.0
99,Punggol Town Hub Hawker Centre,0.0,0.0


In [15]:
#Take a look at one of the zero-coordinate entries (position 49, the one listed without a name)
data['features'][49]['properties']['Description'].split('<th>')

['<center><table><tr><th colspan=\'2\' align=\'center\'><em>Attributes</em></th></tr><tr bgcolor="#E3E3F3"> ',
 'ADDRESSBLOCKHOUSENUMBER</th> <td>38A</td> </tr><tr bgcolor=""> ',
 'LATITUDE</th> <td>0</td> </tr><tr bgcolor="#E3E3F3"> ',
 'STATUS</th> <td>Under Construction</td> </tr><tr bgcolor=""> ',
 'CLEANINGSTARTDATE</th> <td>#N/A</td> </tr><tr bgcolor="#E3E3F3"> ',
 'ADDRESSUNITNUMBER</th> <td></td> </tr><tr bgcolor=""> ',
 'ADDRESSFLOORNUMBER</th> <td></td> </tr><tr bgcolor="#E3E3F3"> ',
 'NO_OF_FOOD_STALLS</th> <td>0</td> </tr><tr bgcolor=""> ',
 'HYPERLINK</th> <td></td> </tr><tr bgcolor="#E3E3F3"> ',
 'REGION</th> <td></td> </tr><tr bgcolor=""> ',
 'LONGITUDE</th> <td>0</td> </tr><tr bgcolor="#E3E3F3"> ',
 'INFO_ON_CO_LOCATORS</th> <td>hawker centre/branch office/supermarket/shops/flats</td> </tr><tr bgcolor=""> ',
 'NO_OF_MARKET_STALLS</th> <td>0</td> </tr><tr bgcolor="#E3E3F3"> ',
 'AWARDED_DATE</th> <td>11/5/2015</td> </tr><tr bgcolor=""> ',
 'LANDYADDRESSPOINT</th> <td>310

<div class="alert alert-block alert-warning">
    The site is still under construction and has not even been named yet. Check the status of the other, named entries.
</div>

In [16]:
#Iterate through the affected entries (the DataFrame index is used as the position in data['features'])
for num in list(hawker_latlong[hawker_latlong['hawker_latitude']==0].index):
    text = data['features'][num]['properties']['Description']
    #Name the parser explicitly so the result does not depend on which parsers are installed
    soup = BeautifulSoup(text, 'html.parser')

    #Print the status (the third td in the description table)
    print('status: ',soup.find_all('td')[2].text)

status:  Under Construction
status:  Under Construction
status:  Under Construction
status:  Under Construction
status:  Under Construction
status:  Under Construction


<div class="alert alert-block alert-warning">
    All of them are still under construction, so we'll drop them.
</div>

In [17]:
#Remove the rows with a latitude of 0, i.e. the hawker centres that are still under construction
under_construction = hawker_latlong.index[hawker_latlong['hawker_latitude']==0]
hawker_latlong.drop(index=under_construction, inplace=True)

In [18]:
#Re-run the summary statistics to confirm the zero coordinates are gone
hawker_latlong.describe()

Unnamed: 0,hawker_latitude,hawker_longitude
count,114.0,114.0
mean,1.32588,103.844182
std,0.037271,0.05699
min,1.27266,103.697374
25%,1.302355,103.808046
50%,1.320323,103.845428
75%,1.344818,103.883722
max,1.44399,103.988182


In [19]:
import matplotlib.pyplot as plt
import mplleaflet

In [20]:
#Draw the hawker centres as a longitude/latitude scatter and hand the figure to mplleaflet for display
fig, ax = plt.subplots(figsize=(8,8))
ax.scatter(
    x=hawker_latlong['hawker_longitude'],
    y=hawker_latlong['hawker_latitude'],
    c='k',
    s=40,
    alpha=0.5,
    edgecolors='k',
    linewidths=2,
)

mplleaflet.display(fig=fig)



In [21]:
#Save the information to a csv file
#to_csv is called with its defaults, so the DataFrame index is written as the first (unnamed) column
hawker_latlong.to_csv('./hawker_latlong.csv')

## Supermarkets

In [22]:
#Load the supermarket geojson file
#GeoJSON is UTF-8 by specification, so decode it explicitly instead of relying on the platform default encoding
with open('./supermarkets/supermarkets-geojson.geojson', encoding='utf-8') as jsonfile:
    sm = json.load(jsonfile)

In [23]:
#Display the whole parsed supermarket GeoJSON dict to inspect its structure
sm

{'type': 'FeatureCollection',
 'crs': {'type': 'name',
  'properties': {'name': 'urn:ogc:def:crs:OGC:1.3:CRS84'}},
 'features': [{'type': 'Feature',
   'properties': {'Name': 'kml_1',
    'Description': '<center><table><tr><th colspan=\'2\' align=\'center\'><em>Attributes</em></th></tr><tr bgcolor="#E3E3F3"> <th>LIC_NAME</th> <td>LI LI CHENG SUPERMARKET (PUNGGOL) PTE. LTD.</td> </tr><tr bgcolor=""> <th>BLK_HOUSE</th> <td>273C</td> </tr><tr bgcolor="#E3E3F3"> <th>STR_NAME</th> <td>PUNGGOL PLACE</td> </tr><tr bgcolor=""> <th>UNIT_NO</th> <td>884</td> </tr><tr bgcolor="#E3E3F3"> <th>POSTCODE</th> <td>823273</td> </tr><tr bgcolor=""> <th>LIC_NO</th> <td>NE12I65N000</td> </tr><tr bgcolor="#E3E3F3"> <th>INC_CRC</th> <td>3DE8AF6E76F9D3D4</td> </tr><tr bgcolor=""> <th>FMEL_UPD_D</th> <td>20171129183653</td> </tr></table></center>'},
   'geometry': {'type': 'Point',
    'coordinates': [103.901262393433, 1.40230300615945, 0.0]}},
  {'type': 'Feature',
   'properties': {'Name': 'kml_2',
    'Desc

In [24]:
#Check what a single supermarket entry looks like (the first feature)
sm['features'][0]

{'type': 'Feature',
 'properties': {'Name': 'kml_1',
  'Description': '<center><table><tr><th colspan=\'2\' align=\'center\'><em>Attributes</em></th></tr><tr bgcolor="#E3E3F3"> <th>LIC_NAME</th> <td>LI LI CHENG SUPERMARKET (PUNGGOL) PTE. LTD.</td> </tr><tr bgcolor=""> <th>BLK_HOUSE</th> <td>273C</td> </tr><tr bgcolor="#E3E3F3"> <th>STR_NAME</th> <td>PUNGGOL PLACE</td> </tr><tr bgcolor=""> <th>UNIT_NO</th> <td>884</td> </tr><tr bgcolor="#E3E3F3"> <th>POSTCODE</th> <td>823273</td> </tr><tr bgcolor=""> <th>LIC_NO</th> <td>NE12I65N000</td> </tr><tr bgcolor="#E3E3F3"> <th>INC_CRC</th> <td>3DE8AF6E76F9D3D4</td> </tr><tr bgcolor=""> <th>FMEL_UPD_D</th> <td>20171129183653</td> </tr></table></center>'},
 'geometry': {'type': 'Point',
  'coordinates': [103.901262393433, 1.40230300615945, 0.0]}}

In [25]:
#Retrieve the latitude and longitude of the first supermarket
#The coordinates list holds the longitude first, then the latitude
first_coordinates = sm['features'][0]['geometry']['coordinates']
longitude, latitude = first_coordinates[0], first_coordinates[1]
print('Latitude: ',latitude)
print('Longitude: ',longitude)

Latitude:  1.40230300615945
Longitude:  103.901262393433


In [26]:
#Retrieve the name of the first supermarket
#Name the parser explicitly: otherwise bs4 picks whichever parser happens to be installed and warns about it
soup = BeautifulSoup(sm['features'][0]['properties']['Description'], 'html.parser')
#th[0] is the "Attributes" caption, so the first title (th[1]) pairs with the first value (td[0])
print(soup.find_all('th')[1].text)
print(soup.find_all('td')[0].text)

LIC_NAME
LI LI CHENG SUPERMARKET (PUNGGOL) PTE. LTD.


In [27]:
#Count the entries under sm['features'] (one entry per supermarket)
len(sm['features'])

526

In [28]:
def _sm_coordinate(feature, position):
    """Return one coordinate of a feature as a float, or None when it is missing or not numeric."""
    try:
        return float(feature['geometry']['coordinates'][position])
    except (KeyError, IndexError, TypeError, ValueError):
        return None


def _sm_first_value(feature):
    """Return the text of the first <td> in a feature's description table, or None when there is none."""
    try:
        description = feature['properties']['Description']
        #Name the parser explicitly so the result does not depend on which parsers are installed
        return BeautifulSoup(description, 'html.parser').find_all('td')[0].text
    except (KeyError, IndexError, TypeError):
        return None


#Create lists to append the relevant information to
sm_name = []
sm_latitude = []
sm_longitude = []

#iterate through the entries and retrieve the name,latitude and longitude of each supermarket
for feature in sm['features']:
    #The coordinates list holds the longitude at position 0 and the latitude at position 1
    sm_latitude.append(_sm_coordinate(feature, 1))
    sm_longitude.append(_sm_coordinate(feature, 0))

    #The supermarket name (LIC_NAME) is the first value in the description table
    sm_name.append(_sm_first_value(feature))

#Save the retrieved information as a DataFrame
sm_latlong = pd.DataFrame({'supermarket_name':sm_name,'supermarket_latitude':sm_latitude,'supermarket_longitude':sm_longitude})

In [29]:
#Preview the first rows of the supermarket DataFrame
sm_latlong.head()

Unnamed: 0,supermarket_name,supermarket_latitude,supermarket_longitude
0,LI LI CHENG SUPERMARKET (PUNGGOL) PTE. LTD.,1.402303,103.901262
1,SHENG SIONG SUPERMARKET PTE LTD,1.314239,103.870914
2,COLD STORAGE SINGAPORE (1983) PTE LTD,1.373321,103.886366
3,COLD STORAGE SINGAPORE (1983) PTE LTD,1.332959,103.914942
4,YES SUPERMARKET PTE LTD,1.353453,103.95301


In [30]:
#Check for null values
#NOTE: isnull() only counts None/NaN, so an empty-string name would not be reported here
sm_latlong.isnull().sum()

supermarket_name         0
supermarket_latitude     0
supermarket_longitude    0
dtype: int64

In [31]:
#Summarise the numeric columns to spot unusual values/variance
sm_latlong.describe()

Unnamed: 0,supermarket_latitude,supermarket_longitude
count,526.0,526.0
mean,1.346642,103.832177
std,0.047382,0.074858
min,1.24715,103.625765
25%,1.311964,103.779166
50%,1.334972,103.839522
75%,1.375659,103.888471
max,1.461526,104.003578


In [32]:
#Draw the supermarkets as a longitude/latitude scatter and hand the figure to mplleaflet for display
fig, ax = plt.subplots(figsize=(8,8))
ax.scatter(
    x=sm_latlong['supermarket_longitude'],
    y=sm_latlong['supermarket_latitude'],
    c='blue',
    alpha=0.5,
    edgecolors='k',
)

mplleaflet.display(fig=fig)

In [33]:
#Save the information to a csv file
#to_csv is called with its defaults, so the DataFrame index is written as the first (unnamed) column
sm_latlong.to_csv('./sm_latlong.csv')