In [10]:
import os

import pandas as pd
from geopy.distance import geodesic
from tqdm import tqdm



In [11]:
# The input files live on Google Drive. On a fresh Colab runtime the Drive is not
# mounted yet, so mount it here (only when needed) to keep "Restart & Run All" working.
if not os.path.isdir('/content/drive/MyDrive'):
    from google.colab import drive
    drive.mount('/content/drive')

# Resale transactions that already carry geocoded latitude/longitude columns.
# low_memory=False makes pandas read the file in one pass instead of chunked dtype inference.
hdb_df = pd.read_csv('/content/drive/MyDrive/FYP docs/data/hdb_coordinates_final.csv', low_memory=False)

In [12]:
# Drop the leading column of the CSV (presumably a saved index column - TODO confirm).
# The guard makes this cell idempotent: a blind positional drop would remove 'month'
# (the first real column, see the info() output below) if the cell were run twice.
if hdb_df.columns[0] != 'month':
    hdb_df = hdb_df.drop(columns=hdb_df.columns[0])

In [13]:
# Print the column names, dtypes and non-null counts of the resale table
hdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 919408 entries, 0 to 919407
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   month                919408 non-null  object 
 1   town                 919408 non-null  object 
 2   flat_type            919408 non-null  object 
 3   block                919408 non-null  object 
 4   street_name          919408 non-null  object 
 5   storey_range         919408 non-null  object 
 6   floor_area_sqm       919408 non-null  float64
 7   flat_model           919408 non-null  object 
 8   lease_commence_date  919408 non-null  int64  
 9   resale_price         919408 non-null  float64
 10  remaining_lease      210358 non-null  object 
 11  address              919408 non-null  object 
 12  latitude             919408 non-null  float64
 13  longitude            919408 non-null  float64
dtypes: float64(4), int64(1), object(9)
memory usage: 98.2+ MB


In [14]:
# Reference point for the CBD distance feature, as (latitude, longitude).
# Stored as floats so they can be used numerically without conversion;
# existing float(cbd_lat) / float(cbd_long) calls keep working unchanged.
# NOTE(review): the exact landmark these coordinates refer to is not stated - confirm.
cbd_lat = 1.287953
cbd_long = 103.851784

In [15]:
# Amenity location tables, loaded in a fixed order: schools, shops, hawkers, MRT
schools_df, shops_df, hawkers_df, mrt_df = (
    pd.read_csv(amenity_csv)
    for amenity_csv in (
        '/content/drive/MyDrive/FYP docs/data/schools_address.csv',
        '/content/drive/MyDrive/FYP docs/data/shops_address.csv',
        '/content/drive/MyDrive/FYP docs/data/hawkers_address.csv',
        '/content/drive/MyDrive/FYP docs/data/mrt_address.csv',
    )
)

In [16]:
# Print the column names, dtypes and non-null counts of the MRT station table
mrt_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   station_code  144 non-null    object 
 1   latitude      144 non-null    float64
 2   longitude     144 non-null    float64
 3   station_name  144 non-null    object 
dtypes: float64(2), object(2)
memory usage: 4.6+ KB


In [17]:
# Distances only depend on location, so work on the distinct (latitude, longitude)
# pairs instead of every transaction row.
min_dist_df = (
    hdb_df.loc[:, ['latitude', 'longitude']]
    .drop_duplicates()
    .reset_index(drop=True)
)
min_dist_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9730 entries, 0 to 9729
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   latitude   9730 non-null   float64
 1   longitude  9730 non-null   float64
dtypes: float64(2)
memory usage: 152.2 KB


In [18]:
def calculate_nearest_amenity_distances(min_dist_df, amenity_df, amenity_lat_col, amenity_lon_col, prefix):
    """
    Calculate the geodesic distance (in metres) to the nearest amenity for each coordinate.

    The distances are stored in a new ``nearest_{prefix}_distance`` column.
    ``min_dist_df`` is modified in place and the same object is returned.

    :param min_dist_df: DataFrame with unique HDB coordinates in 'latitude' and 'longitude' columns.
    :param amenity_df: DataFrame with amenity locations.
    :param amenity_lat_col: Column name for the latitude of the amenity.
    :param amenity_lon_col: Column name for the longitude of the amenity.
    :param prefix: Prefix for the column name to store results.
    :return: ``min_dist_df`` with the nearest-distance column added.
    :raises ValueError: If ``amenity_df`` has no rows while there are coordinates to process.
    """
    distance_col = f'nearest_{prefix}_distance'

    # Loop-invariant work: pull the amenity coordinates out once as plain tuples
    # instead of re-running amenity_df.iterrows() for every HDB coordinate.
    amenity_coords = list(zip(amenity_df[amenity_lat_col], amenity_df[amenity_lon_col]))

    if not amenity_coords and len(min_dist_df) > 0:
        raise ValueError(f"No amenity locations supplied for '{prefix}'; cannot compute nearest distance")

    hdb_coords = zip(min_dist_df['latitude'], min_dist_df['longitude'])

    # Brute-force nearest neighbour: exact geodesic distance to every amenity, keep the minimum
    nearest_distances = [
        min(geodesic(coord, amenity_coord).meters for amenity_coord in amenity_coords)
        for coord in tqdm(hdb_coords, total=len(min_dist_df), desc=f"Processing nearest {prefix} distance")
    ]

    # Assign positionally (not by index label) so the result is correct for any index,
    # and only after every distance was computed so a failure leaves no half-filled column.
    min_dist_df[distance_col] = pd.Series(nearest_distances, dtype='float64').to_numpy()

    return min_dist_df



In [19]:
# Nearest supermarket distance for every unique HDB coordinate
min_dist_df = calculate_nearest_amenity_distances(
    min_dist_df,
    amenity_df=shops_df,
    amenity_lat_col='latitude',
    amenity_lon_col='longitude',
    prefix='supermarket',
)


Processing nearest supermarket distance: 100%|██████████| 9730/9730 [27:37<00:00,  5.87it/s]


In [20]:
# Nearest school distance for every unique HDB coordinate
min_dist_df = calculate_nearest_amenity_distances(
    min_dist_df,
    amenity_df=schools_df,
    amenity_lat_col='latitude',
    amenity_lon_col='longitude',
    prefix='school',
)

Processing nearest school distance: 100%|██████████| 9730/9730 [15:46<00:00, 10.28it/s]


In [21]:
# Nearest MRT station distance for every unique HDB coordinate
min_dist_df = calculate_nearest_amenity_distances(
    min_dist_df,
    amenity_df=mrt_df,
    amenity_lat_col='latitude',
    amenity_lon_col='longitude',
    prefix='mrt',
)

Processing nearest mrt distance: 100%|██████████| 9730/9730 [06:43<00:00, 24.10it/s]


In [22]:
# Nearest hawker centre distance for every unique HDB coordinate
min_dist_df = calculate_nearest_amenity_distances(
    min_dist_df,
    amenity_df=hawkers_df,
    amenity_lat_col='latitude',
    amenity_lon_col='longitude',
    prefix='hawkers',
)

Processing nearest hawkers distance: 100%|██████████| 9730/9730 [04:58<00:00, 32.63it/s]


In [23]:
# Geodesic distance (metres) from each unique HDB coordinate to the fixed CBD point
cbd_coord = (float(cbd_lat), float(cbd_long))
min_dist_df['cbd_distance'] = [
    geodesic((hdb_lat, hdb_lon), cbd_coord).meters
    for hdb_lat, hdb_lon in zip(min_dist_df['latitude'], min_dist_df['longitude'])
]

In [24]:
# Check that every distance column was added and contains no nulls
min_dist_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9730 entries, 0 to 9729
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   latitude                      9730 non-null   float64
 1   longitude                     9730 non-null   float64
 2   nearest_supermarket_distance  9730 non-null   float64
 3   nearest_school_distance       9730 non-null   float64
 4   nearest_mrt_distance          9730 non-null   float64
 5   nearest_hawkers_distance      9730 non-null   float64
 6   cbd_distance                  9730 non-null   float64
dtypes: float64(7)
memory usage: 532.2 KB


In [25]:
# Preview the first rows of the per-coordinate distance table
min_dist_df.head()

Unnamed: 0,latitude,longitude,nearest_supermarket_distance,nearest_school_distance,nearest_mrt_distance,nearest_hawkers_distance,cbd_distance
0,1.366558,103.841624,0.0,104.434561,781.53096,0.0,8765.00756
1,1.366197,103.841505,42.113407,145.364227,800.632183,42.113407,8727.095898
2,1.369197,103.841667,223.170055,212.713849,620.310216,161.753049,9053.827676
3,1.368446,103.844516,228.426423,290.002626,584.672255,383.648599,8937.226741
4,1.366824,103.836491,84.90235,203.72022,513.091791,473.219425,8885.621966


In [26]:
# Attach the per-coordinate distances back onto every transaction row.
coord_keys = ['latitude', 'longitude']
distance_cols = [col for col in min_dist_df.columns if col not in coord_keys]

hdb_df = (
    hdb_df
    # Remove distance columns left by a previous run of this cell; otherwise the
    # merge would create '_x' / '_y' duplicates instead of replacing them.
    .drop(columns=distance_cols, errors='ignore')
    # validate='many_to_one' raises if min_dist_df has duplicate coordinates,
    # which would silently multiply transaction rows.
    .merge(min_dist_df, on=coord_keys, how='left', validate='many_to_one')
)

In [27]:
# Confirm the row count is unchanged and the distance columns are present after the merge
hdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 919408 entries, 0 to 919407
Data columns (total 19 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   month                         919408 non-null  object 
 1   town                          919408 non-null  object 
 2   flat_type                     919408 non-null  object 
 3   block                         919408 non-null  object 
 4   street_name                   919408 non-null  object 
 5   storey_range                  919408 non-null  object 
 6   floor_area_sqm                919408 non-null  float64
 7   flat_model                    919408 non-null  object 
 8   lease_commence_date           919408 non-null  int64  
 9   resale_price                  919408 non-null  float64
 10  remaining_lease               210358 non-null  object 
 11  address                       919408 non-null  object 
 12  latitude                      919408 non-nul

In [28]:
# Save the enriched table to the current working directory (relative path), without the index
hdb_df.to_csv('hdb_df_min_dist.csv', index=False)

In [29]:
# Save a second copy under the Google Drive mount point, without the index.
# NOTE(review): this path spells the folder 'My Drive' while the input paths use
# 'MyDrive' - confirm both resolve to the same Drive folder.
hdb_df.to_csv('/content/drive/My Drive/hdb_df_min_dist.csv', index=False)

In [30]:
# Mount Google Drive at /content/drive.
# NOTE(review): earlier cells already read from and write to /content/drive, so on a
# fresh runtime the mount has to happen before them - consider moving this cell to the top.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
