In [1]:
import pandas as pd
from geopy.distance import geodesic
from tqdm import tqdm



In [3]:
# Load the HDB resale records that already carry latitude/longitude columns.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, so each column gets a single consistently inferred dtype.
hdb_df = pd.read_csv('/content/hdb_coordinates_final.csv', low_memory=False)

In [4]:
# Discard the first column of the loaded frame.
# NOTE: this cell is not idempotent -- it rebinds hdb_df, so running it a
# second time removes the next column as well. Re-run the load cell first.
hdb_df = hdb_df.drop(columns=hdb_df.columns[:1])

In [None]:
# Inspect column names, dtypes and non-null counts of the resale frame.
hdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 919408 entries, 0 to 919407
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   month                919408 non-null  object 
 1   town                 919408 non-null  object 
 2   flat_type            919408 non-null  object 
 3   block                919408 non-null  object 
 4   street_name          919408 non-null  object 
 5   storey_range         919408 non-null  object 
 6   floor_area_sqm       919408 non-null  float64
 7   flat_model           919408 non-null  object 
 8   lease_commence_date  919408 non-null  int64  
 9   resale_price         919408 non-null  float64
 10  remaining_lease      210358 non-null  object 
 11  address              919408 non-null  object 
 12  latitude             919408 non-null  float64
 13  longitude            919408 non-null  float64
dtypes: float64(4), int64(1), object(9)
memory usage: 98.2+ MB


In [None]:
# Amenity location tables (schools, shops, hawker centres, MRT stations),
# loaded in the same order as the names on the left-hand side.
schools_df, shops_df, hawkers_df, mrt_df = map(
    pd.read_csv,
    (
        '/content/schools_address.csv',
        '/content/shops_address.csv',
        '/content/hawkers_address.csv',
        '/content/mrt_address.csv',
    ),
)

In [None]:
# Check the schools table schema; the distance function below reads its
# 'latitude' and 'longitude' columns.
schools_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 346 entries, 0 to 345
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   school_name  346 non-null    object 
 1   address      346 non-null    object 
 2   postal_code  346 non-null    int64  
 3   mrt_desc     346 non-null    object 
 4   bus_desc     346 non-null    object 
 5   latitude     346 non-null    float64
 6   longitude    346 non-null    float64
dtypes: float64(2), int64(1), object(4)
memory usage: 19.0+ KB


In [None]:
# Many resale records share the same coordinate pair, so keep one row per
# distinct (latitude, longitude) and compute distances only for those.
min_dist_df = (
    hdb_df.loc[:, ['latitude', 'longitude']]
    .drop_duplicates(ignore_index=True)
)
min_dist_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9730 entries, 0 to 9729
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   latitude   9730 non-null   float64
 1   longitude  9730 non-null   float64
dtypes: float64(2)
memory usage: 152.2 KB


In [None]:
def calculate_nearest_amenity_distances(min_dist_df, amenity_df, amenity_lat_col, amenity_lon_col, prefix):
    """
    Calculate the geodesic distance (in meters) to the nearest amenity for each coordinate.

    The result is stored in a column named ``nearest_{prefix}_distance`` on
    ``min_dist_df`` itself (the frame is modified in place, and an existing
    column of that name is overwritten); the same frame is also returned.

    :param min_dist_df: DataFrame with 'latitude' and 'longitude' columns, one row per HDB coordinate.
    :param amenity_df: DataFrame with amenity locations.
    :param amenity_lat_col: Column name for the latitude of the amenity.
    :param amenity_lon_col: Column name for the longitude of the amenity.
    :param prefix: Prefix for the column name to store results.
    :return: ``min_dist_df`` with the distance column added.
    :raises ValueError: If ``amenity_df`` has no rows while there are coordinates to process.
    """
    column_name = f'nearest_{prefix}_distance'

    # The amenity coordinates are identical for every HDB coordinate, so pull
    # them out once instead of re-running DataFrame.iterrows() in the inner loop.
    amenity_coords = list(zip(amenity_df[amenity_lat_col], amenity_df[amenity_lon_col]))

    # Fail early with a clear message (and before touching the frame) rather
    # than letting min() raise on an empty sequence half-way through.
    if not amenity_coords and len(min_dist_df) > 0:
        raise ValueError(f"amenity_df is empty; cannot compute nearest {prefix} distance")

    hdb_coords = zip(min_dist_df['latitude'], min_dist_df['longitude'])
    nearest_distances = [
        min(geodesic(coord, amenity_coord).meters for amenity_coord in amenity_coords)
        for coord in tqdm(hdb_coords, total=min_dist_df.shape[0], desc=f"Processing nearest {prefix} distance")
    ]

    # Assign by position so the result is correct even when the index has
    # repeated labels; the explicit dtype keeps the column float64 when empty.
    min_dist_df[column_name] = pd.Series(nearest_distances, dtype='float64').to_numpy()

    return min_dist_df



In [None]:
# Nearest supermarket: distance from every unique HDB coordinate to the closest
# row of shops_df, stored in the 'nearest_supermarket_distance' column.
min_dist_df = calculate_nearest_amenity_distances(
    min_dist_df=min_dist_df,
    amenity_df=shops_df,
    amenity_lat_col='latitude',
    amenity_lon_col='longitude',
    prefix='supermarket',
)


Processing nearest supermarket distance: 100%|██████████| 9730/9730 [28:32<00:00,  5.68it/s]


In [None]:
# Nearest school: adds the 'nearest_school_distance' column.
min_dist_df = calculate_nearest_amenity_distances(
    min_dist_df=min_dist_df,
    amenity_df=schools_df,
    amenity_lat_col='latitude',
    amenity_lon_col='longitude',
    prefix='school',
)

Processing nearest school distance: 100%|██████████| 9730/9730 [16:16<00:00,  9.97it/s]


In [None]:
# Nearest MRT station: adds the 'nearest_mrt_distance' column.
min_dist_df = calculate_nearest_amenity_distances(
    min_dist_df=min_dist_df,
    amenity_df=mrt_df,
    amenity_lat_col='latitude',
    amenity_lon_col='longitude',
    prefix='mrt',
)

Processing nearest mrt distance: 100%|██████████| 9730/9730 [07:31<00:00, 21.54it/s]


In [None]:
# Nearest hawker centre: adds the 'nearest_hawkers_distance' column.
min_dist_df = calculate_nearest_amenity_distances(
    min_dist_df=min_dist_df,
    amenity_df=hawkers_df,
    amenity_lat_col='latitude',
    amenity_lon_col='longitude',
    prefix='hawkers',
)

Processing nearest hawkers distance: 100%|██████████| 9730/9730 [05:09<00:00, 31.40it/s]


In [None]:
# Verify that each amenity run added its nearest_*_distance column and that
# no coordinate was left without a value.
min_dist_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9730 entries, 0 to 9729
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   latitude                      9730 non-null   float64
 1   longitude                     9730 non-null   float64
 2   nearest_supermarket_distance  9730 non-null   float64
 3   nearest_school_distance       9730 non-null   float64
 4   nearest_mrt_distance          9730 non-null   float64
 5   nearest_hawkers_distance      9730 non-null   float64
dtypes: float64(6)
memory usage: 456.2 KB


In [None]:
# Attach the per-coordinate distances back onto every resale record.
# validate='many_to_one' makes pandas raise if a coordinate pair appears more
# than once in min_dist_df, which would otherwise silently duplicate resale rows.
# NOTE: this cell rebinds hdb_df, so running it twice merges the distance
# columns a second time (pandas then suffixes the clashing names with _x/_y).
hdb_df = hdb_df.merge(
    min_dist_df,
    on=['latitude', 'longitude'],
    how='left',
    validate='many_to_one',
)

In [None]:
# Confirm the merge kept the original row count and added the distance columns.
hdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 919408 entries, 0 to 919407
Data columns (total 18 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   month                         919408 non-null  object 
 1   town                          919408 non-null  object 
 2   flat_type                     919408 non-null  object 
 3   block                         919408 non-null  object 
 4   street_name                   919408 non-null  object 
 5   storey_range                  919408 non-null  object 
 6   floor_area_sqm                919408 non-null  float64
 7   flat_model                    919408 non-null  object 
 8   lease_commence_date           919408 non-null  int64  
 9   resale_price                  919408 non-null  float64
 10  remaining_lease               210358 non-null  object 
 11  address                       919408 non-null  object 
 12  latitude                      919408 non-nul

In [None]:
# The output path lives on Google Drive, so the drive must be mounted before
# writing; doing it here keeps the notebook runnable top-to-bottom on a fresh
# kernel (mounting an already-mounted drive is harmless).
from google.colab import drive
drive.mount('/content/drive')
hdb_df.to_csv('/content/drive/My Drive/hdb_df_min_dist.csv', index=False)

In [None]:
# Mount Google Drive at /content/drive (Colab only).
# NOTE(review): the export cell writes under '/content/drive/My Drive', so this
# mount has to exist before that write happens.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
