## PLEASE READ: DATA DOWNLOAD

Ensure you have downloaded the `OSM Data Files` and the `VIC Localities Shape File` folder from our Google drive into the `/data/map` directory. 

----

## Import Libraries 

In [3]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Point
from sklearn.preprocessing import LabelEncoder
import os

## Read in Files

#### OSM Shape Files

In [4]:
# Read in the shape files as a Geo dataframe 
# "..._area_..." - indicates features drawn as areas
# "..._line_..." - indicates features drawn as lines/points

landuse_gdf = gpd.read_file('../../data/map/OSM Data Files/gis_osm_landuse_a_free_1.shp')
roads_gdf = gpd.read_file('../../data/map/OSM Data Files/gis_osm_roads_free_1.shp')

# Dataframes which contain features of areas and points/lines
natural_area_gdf = gpd.read_file('../../data/map/OSM Data Files/gis_osm_natural_a_free_1.shp')
natural_line_gdf = gpd.read_file('../../data/map/OSM Data Files/gis_osm_natural_free_1.shp')

pofw_area_gdf = gpd.read_file('../../data/map/OSM Data Files/gis_osm_pofw_a_free_1.shp')
pofw_line_gdf = gpd.read_file('../../data/map/OSM Data Files/gis_osm_pofw_free_1.shp')

pois_area_gdf = gpd.read_file('../../data/map/OSM Data Files/gis_osm_pois_a_free_1.shp')
pois_line_gdf = gpd.read_file('../../data/map/OSM Data Files/gis_osm_pois_free_1.shp')

#### Other Files

In [5]:
vic_suburbs = gpd.read_file('../../data/map/VIC Localities Shape File/gda2020_vicgrid/esrishape/whole_of_dataset/victoria/VMADMIN/LOCALITY_POLYGON.shp')
properties_df = pd.read_csv('../../data/raw/scraped_properties.csv')

#### View the Dataframes

In [6]:
# OSM Dataframes
gdfs = {
    'landuse_gdf': landuse_gdf,
    'roads_gdf': roads_gdf,
    'natural_area_gdf': natural_area_gdf,
    'natural_line_gdf': natural_line_gdf,
    'pofw_area_gdf': pofw_area_gdf,
    'pofw_line_gdf': pofw_line_gdf,
    'pois_area_gdf': pois_area_gdf,
    'pois_line_gdf': pois_line_gdf
}

# View the details of each dataframe
for name, gdf in gdfs.items():
    print(f"\n---------------------------- {name} ----------------------------")
    # rename feature column
    gdf.rename(columns={'fclass': 'feature_class'}, inplace=True)
    print("Shape:", gdf.shape)
    print("Columns:", gdf.columns)
    print("Schema:")
    print(gdf.dtypes)
    print("First 5 rows:")
    print(gdf.head())
    print("Unique feauture values:")
    print(gdf['feature_class'].unique())


---------------------------- landuse_gdf ----------------------------
Shape: (490185, 5)
Columns: Index(['osm_id', 'code', 'feature_class', 'name', 'geometry'], dtype='object')
Schema:
osm_id             object
code                int32
feature_class      object
name               object
geometry         geometry
dtype: object
First 5 rows:
    osm_id  code feature_class                           name  \
0  4018601  7202          park                  Anderson Park   
1  4018602  7202          park                      Gold Park   
2  4061402  7202          park                    Queens Park   
3  4085520  7202          park             Ollie Webb Reserve   
4  4282955  7202          park  Brisbane City Botanic Gardens   

                                            geometry  
0  POLYGON ((151.96054 -27.5783, 151.96054 -27.57...  
1  POLYGON ((151.95151 -27.58105, 151.95153 -27.5...  
2  POLYGON ((151.95787 -27.56136, 151.95788 -27.5...  
3  POLYGON ((150.99548 -33.82277, 150.99557 -

------

# Preprocessing

#### Vic Suburbs Dataframe

In [7]:
# Rename 'GAZLOC' to gazetted locality
vic_suburbs['gazetted_locality'] = vic_suburbs['GAZLOC'].str.lower()

# Filter the dataframe to only include the gazetted localities and their geometries
vic_suburbs_filtered = vic_suburbs[['gazetted_locality', 'geometry']]

vic_suburbs_filtered.head()

Unnamed: 0,gazetted_locality,geometry
0,mollongghip,"POLYGON ((2417354.185 2445906.94, 2417260.386 ..."
1,north blackwood,"POLYGON ((2445160.214 2452798.651, 2445239.392..."
2,basalt,"POLYGON ((2421860.482 2465074.51, 2421860.053 ..."
3,llanelly,"POLYGON ((2394406.798 2527031.044, 2394351.249..."
4,murrabit west,"POLYGON ((2397518.057 2666586.745, 2397393.211..."


#### OSM Dataframes

In [8]:
# Stack Area and Point/Line Files into One File using the concat function
natural_gdf = pd.concat([natural_area_gdf, natural_line_gdf], ignore_index=True)
pofw_gdf = pd.concat([pofw_area_gdf, pofw_line_gdf], ignore_index=True)
pois_gdf = pd.concat([pois_area_gdf, pois_line_gdf], ignore_index=True)

In [9]:
print("\n----------------------------- Natural Combined GeoDataFrame -----------------------------")
print(natural_gdf.shape)
print(natural_gdf.head())

print("\n----------------------------- POFW Combined GeoDataFrame -----------------------------")
print(pofw_gdf.shape)
print(pofw_gdf.head())

print("\n----------------------------- POIs Combined GeoDataFrame -----------------------------")
print(pois_gdf.shape)
print(pois_gdf.head())


----------------------------- Natural Combined GeoDataFrame -----------------------------
(477883, 5)
    osm_id  code feature_class              name  \
0  4042219  4141         beach  Freshwater Beach   
1  4042355  4141         beach     Dee Why Beach   
2  4042505  4141         beach     Bilgola Beach   
3  4044016  4141         beach  Warriewood Beach   
4  4335014  4141         beach       Whale Beach   

                                            geometry  
0  POLYGON ((151.28911 -33.78233, 151.28913 -33.7...  
1  POLYGON ((151.29679 -33.75261, 151.29681 -33.7...  
2  POLYGON ((151.32709 -33.6466, 151.32718 -33.64...  
3  POLYGON ((151.30826 -33.68926, 151.30879 -33.6...  
4  POLYGON ((151.33089 -33.61265, 151.33106 -33.6...  

----------------------------- POFW Combined GeoDataFrame -----------------------------
(6901, 5)
     osm_id  code       feature_class  \
0   4463645  3101  christian_anglican   
1   8746568  3100           christian   
2   9923245  3102  christian_cath

## Filter for the Areas within the Victoria Region

In [10]:
# Ensure all OSM files are geodataframes and convert every gdf to the correct CRS
vic_suburbs_gdf = vic_suburbs.to_crs(epsg=4326)
gdfs = {
    'landuse_gdf': gpd.GeoDataFrame(landuse_gdf).to_crs(epsg=4326),
    'roads_gdf': gpd.GeoDataFrame(roads_gdf).to_crs(epsg=4326),
    'natural_gdf': gpd.GeoDataFrame(natural_gdf).to_crs(epsg=4326),
    'pofw_gdf': gpd.GeoDataFrame(pofw_gdf).to_crs(epsg=4326),
    'pois_gdf': gpd.GeoDataFrame(pois_gdf).to_crs(epsg=4326)
}

# Spatial join and filter features that fall within suburb geometries (based on areas that intersect)
def filter_vic_region(gdf, suburbs_gdf):
    filtered_gdf = gpd.sjoin(gdf, suburbs_gdf, how='inner', predicate='intersects')
    return filtered_gdf

# Filter each gdf and store in filtered_gdfs
filtered_gdfs = {}
for name, gdf in gdfs.items():
    filtered_gdfs[name] = filter_vic_region(gdf, vic_suburbs_gdf)

# Display the filtered gdfs
for name, gdf in filtered_gdfs.items():
    print(f"\n----------------------------- Filtered {name} -----------------------------")
    print("Shape after filtering:", gdf.shape)
    print(gdf.head())


----------------------------- Filtered landuse_gdf -----------------------------
Shape after filtering: (142310, 17)
     osm_id  code feature_class                      name  \
61  4394426  7202          park                      None   
62  4396215  7202          park  Scotchmans Creek Reserve   
63  4396215  7201        forest  Scotchmans Creek Reserve   
64  4396376  7202          park  Scotchmans Creek Reserve   
65  4472473  7202          park            Bogong Reserve   

                                             geometry  index_right        UFI  \
61  POLYGON ((145.14888 -37.87904, 145.14899 -37.8...         1815  767125259   
62  POLYGON ((145.14631 -37.88405, 145.14649 -37.8...         1815  767125259   
63  POLYGON ((145.14631 -37.88405, 145.14649 -37.8...         1815  767125259   
64  POLYGON ((145.14971 -37.88515, 145.15003 -37.8...         1815  767125259   
65  POLYGON ((145.15571 -37.88091, 145.15578 -37.8...         1815  767125259   

          PFI       LOCALITY

## Filter Features

In [11]:
# Keep the columns of feature_class, gazetted_locality and geometry in each gdf but in pois_gdf also keep the code
for name, gdf in filtered_gdfs.items():
    if name == 'pois_gdf':
        columns = [col for col in ['feature_class', 'gazetted_locality', 'geometry', 'code'] if col in gdf.columns]
    else:
        columns = [col for col in ['feature_class', 'gazetted_locality', 'geometry'] if col in gdf.columns]
        
    filtered_gdfs[name] = gdf[columns]

In [12]:
# View the unique features in the dataframes
for name, gdf in filtered_gdfs.items():
    print(f"\n---------------------------- {name} ----------------------------")
    print(gdf.shape)
    print("Features:")
    print(gdf['feature_class'].unique())
    print(gdf.head())


---------------------------- landuse_gdf ----------------------------
(142310, 3)
Features:
['park' 'forest' 'cemetery' 'industrial' 'retail' 'residential'
 'recreation_ground' 'nature_reserve' 'commercial' 'allotments' 'grass'
 'quarry' 'military' 'vineyard' 'scrub' 'farmyard' 'farmland' 'meadow'
 'heath' 'orchard']
   feature_class gazetted_locality  \
61          park     glen waverley   
62          park     glen waverley   
63        forest     glen waverley   
64          park     glen waverley   
65          park     glen waverley   

                                             geometry  
61  POLYGON ((145.14888 -37.87904, 145.14899 -37.8...  
62  POLYGON ((145.14631 -37.88405, 145.14649 -37.8...  
63  POLYGON ((145.14631 -37.88405, 145.14649 -37.8...  
64  POLYGON ((145.14971 -37.88515, 145.15003 -37.8...  
65  POLYGON ((145.15571 -37.88091, 145.15578 -37.8...  

---------------------------- roads_gdf ----------------------------
(852590, 3)
Features:
['primary' 'tertiary' 's

#### Select only the relevant features from each dataframe 

Land Use

In [13]:
# Convert geodataframe to a dataframe 
landuse_df = filtered_gdfs['landuse_gdf']

# landuse features to include 
landuse_features = ['park', 'forest', 'industrial', 'retail', 'residential', 'nature_reserve', 'commercial', 'farmland']

# Filter the dataframe to only include the 'landuse_features'
landuse_df = landuse_df[landuse_df['feature_class'].isin(landuse_features)]

# Change 'residential' to be 'residential land' to avoid name conflicts
landuse_df.loc[:, 'feature_class'] = landuse_df['feature_class'].replace({'residential': 'residential_areas', 'commercial': 'commercial_areas', 
'retail': 'retail_areas', 'industrial': 'industrial_areas'})

print(landuse_df.shape)
print(landuse_df.head())

(80210, 3)
   feature_class gazetted_locality  \
61          park     glen waverley   
62          park     glen waverley   
63        forest     glen waverley   
64          park     glen waverley   
65          park     glen waverley   

                                             geometry  
61  POLYGON ((145.14888 -37.87904, 145.14899 -37.8...  
62  POLYGON ((145.14631 -37.88405, 145.14649 -37.8...  
63  POLYGON ((145.14631 -37.88405, 145.14649 -37.8...  
64  POLYGON ((145.14971 -37.88515, 145.15003 -37.8...  
65  POLYGON ((145.15571 -37.88091, 145.15578 -37.8...  


Roads

In [14]:
# Convert geodataframe to a dataframe 
roads_df = filtered_gdfs['roads_gdf']

# roads features to include 
roads_features = ['primary', 'tertiary', 'secondary', 'residential',
                'cycleway', 'motorway', 'pedestrian','path', 'footway', 'track']

# Filter the dataframe to only include the 'roads_features'
roads_df = roads_df[roads_df['feature_class'].isin(roads_features)]

# primary', 'tertiary', and 'secondary' = 'main roads' 
roads_df.loc[:, 'feature_class'] = roads_df['feature_class'].replace({'primary': 'main_roads', 'tertiary': 'main_roads', 
'secondary': 'main_roads', 'residential': 'residential_roads', 'pedestrian': 'walking_paths', 'path': 'walking_paths',
'footway': 'walking_paths'})

print(roads_df.shape)
print(roads_df.head())

(595165, 3)
     feature_class gazetted_locality  \
1880    main_roads     hawthorn east   
1881    main_roads          hawthorn   
1882    main_roads          hawthorn   
1883    main_roads          hawthorn   
1884    main_roads          hawthorn   

                                               geometry  
1880  LINESTRING (145.05555 -37.83032, 145.05561 -37...  
1881  LINESTRING (145.03561 -37.82122, 145.03562 -37...  
1882  LINESTRING (145.03701 -37.8206, 145.03736 -37....  
1883  LINESTRING (145.03971 -37.82313, 145.03972 -37...  
1884  LINESTRING (145.02378 -37.82126, 145.02384 -37...  


Nature

In [15]:
# Convert geodataframe to a dataframe 
natural_df = filtered_gdfs['natural_gdf']

# roads features to include 
nature_features = ['beach', 'cliff', 'spring', 'peak']

# Filter the dataframe to only include the 'nature_features'
natural_df = natural_df[natural_df['feature_class'].isin(nature_features)]

print(natural_df.shape)
print(natural_df.head())

(3238, 3)
    feature_class gazetted_locality  \
96          beach           ventnor   
102         beach      smiths beach   
112         beach          sorrento   
132         beach          brighton   
136         beach           ventnor   

                                              geometry  
96   POLYGON ((145.20014 -38.51713, 145.2008 -38.51...  
102  POLYGON ((145.25259 -38.50482, 145.25299 -38.5...  
112  POLYGON ((144.72577 -38.34608, 144.72597 -38.3...  
132  POLYGON ((144.98556 -37.9264, 144.98559 -37.92...  
136  POLYGON ((145.14956 -38.51159, 145.14985 -38.5...  


Places of Worship

In [16]:
pofw_categories = {
     "churches": ['christian', 'christian_anglican', 'christian_catholic', 'christian_lutheran', 
                      'christian_orthodox', 'christian_evangelical', 'christian_protestant', 'christian_methodist'],
    "mosques": ['muslim', 'muslim_sunni', 'muslim_shia'],
    "synagogues": ['jewish'],
    "buddhist_temples": ['buddhist'],
    "sikh_gurdwaras": ['sikh'],
    "hindu_temples": ['hindu'],
    "taoist_temples": ['taoist']
}

# Convert geodataframe to a dataframe 
pofw_df = filtered_gdfs['pofw_gdf']

# map each feature to the corresponding category using this reverse mapping dictionary
feature_mapper = {feature: category for category, features in pofw_categories.items() for feature in features}

# Create a new column called categories which acts as a tag for each feature
pofw_df['categories'] = pofw_df['feature_class'].map(feature_mapper)

print(pofw_df.shape)
print(pofw_df.head())

(1684, 4)
        feature_class gazetted_locality  \
0  christian_anglican     glen waverley   
1           christian     glen waverley   
2  christian_catholic            coburg   
3  christian_anglican         melbourne   
4  christian_lutheran         southbank   

                                            geometry categories  
0  POLYGON ((145.17019 -37.8913, 145.17048 -37.88...   churches  
1  POLYGON ((145.14844 -37.87265, 145.14847 -37.8...   churches  
2  POLYGON ((144.9525 -37.75004, 144.95256 -37.74...   churches  
3  POLYGON ((144.96736 -37.81662, 144.96739 -37.8...   churches  
4  POLYGON ((144.96685 -37.82063, 144.96688 -37.8...   churches  


Places of Interest

In [17]:
# As defined in the data dictionary, define categories based on the first 2 numbers of the code column
def poi_categories(code):
    # The code column is an int, convert to string
    code_str = str(code)
    if code_str.startswith('20'):
        return 'public_facilities'
    elif code_str.startswith('21'):
        return 'healthcare'
    elif code_str.startswith('22'):
        return 'culture_and_leisure'
    elif code_str.startswith('23'):
        return 'food_and_beverage'
    elif code_str.startswith('24'):
        return 'accommodation'
    elif code_str.startswith('25'):
        return 'shopping_and_retail'
    elif code_str.startswith('26'):
        return 'financial_institutions'
    elif code_str.startswith('27'):
        return 'tourism_and_attractions'

# Convert geodataframe to a dataframe 
pois_df = filtered_gdfs['pois_gdf']

# Apply the function based on the code column
pois_df['categories'] = pois_df['code'].apply(poi_categories)

# Drop the code column
pois_df.drop(columns=['code'], inplace=True)

print(pois_df.shape)
print(pois_df.head())

(135592, 4)
    feature_class gazetted_locality  \
95           park     glen waverley   
96           park     glen waverley   
97           park     glen waverley   
98  sports_centre     glen waverley   
99        college     glen waverley   

                                             geometry           categories  
95  POLYGON ((145.14888 -37.87904, 145.14899 -37.8...  culture_and_leisure  
96  POLYGON ((145.14631 -37.88405, 145.14649 -37.8...  culture_and_leisure  
97  POLYGON ((145.14971 -37.88515, 145.15003 -37.8...  culture_and_leisure  
98  POLYGON ((145.15462 -37.88899, 145.15481 -37.8...  culture_and_leisure  
99  POLYGON ((145.17156 -37.89345, 145.17195 -37.8...    public_facilities  


## Removing Null Values

In [18]:
# List of the OSM dataframes
dfs = [pois_df, pofw_df, roads_df, natural_df, landuse_df]
df_names = ['pois_df', 'pofw_df', 'roads_df', 'natural_df', 'landuse_df']

# Check for null values in each column of the dataframes 
for df, name in zip(dfs, df_names):
    print(f"Null values in {name}:")
    # Find the number of null vals in each col
    null_columns = df.isnull().sum()
    # Filter these cols for the null vals 
    null_columns = null_columns[null_columns > 0]
    print(null_columns if not null_columns.empty else "No null values in any column.")
    print("\n")
    # Drop any rows with null vals 
    df.dropna(inplace=True)

Null values in pois_df:
categories    22456
dtype: int64


Null values in pofw_df:
No null values in any column.


Null values in roads_df:
No null values in any column.


Null values in natural_df:
No null values in any column.


Null values in landuse_df:
No null values in any column.




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


#### Save Dataframes to Raw Folder

In [19]:
# Define the directory path
directory_path = '../../data/raw/urban_landmarks_shp'

# Create the directory if it doesn't exist
if not os.path.exists(directory_path):
    os.makedirs(directory_path)

# Save all dfs as a gdf
for df, name in zip(dfs, df_names):
    gdf = gpd.GeoDataFrame(df)
    gdf.to_file(f"{directory_path.rstrip('/')}/{name}.gpkg", driver='GPKG')

print("All saved!")

  ogr_write(


All saved!


-------

# Feature Engineering 

In [20]:
def landmarks_feature_engineering(df, df_name, feature_col='feature_class', suburb_col='gazetted_locality'):
    """
    This function firstly finds the mode feature from the 'feature_class' column for each gazetted_locality. Then 
    for each feature in the 'feature_class' column, it creates a column which stores the counts for that feature 
    based on each suburb. This is done using a pivot table. The function returns a dataframe storing this 
    pivot table with the first column being each gazetted_locality and the consequent columns storing the counts 
    for each feature with the last column storing the feature mode. 
    """
    
    # Find the mode of 'feature_class' for each 'gazetted_locality'
    mode_df = df.groupby(suburb_col)[feature_col].agg(lambda x: x.mode().iloc[0]).reset_index()
    mode_df.rename(columns={feature_col: f"{df_name}_mode"}, inplace=True)
    
    # Create a pivot table to get counts of each 'feature_class' for each 'gazetted_locality'
    counts_df = pd.pivot_table(df, 
                               index=suburb_col, 
                               columns=feature_col, 
                               aggfunc='size', 
                               fill_value=0).reset_index()
    
    # Merge the mode column with the pivot table
    feature_df = pd.merge(counts_df, mode_df, on=suburb_col)
    
    # Return the final result
    return feature_df

In [21]:
landuse_features = landmarks_feature_engineering(landuse_df, df_name='landuse')
roads_features = landmarks_feature_engineering(roads_df, df_name='roads')
nature_features = landmarks_feature_engineering(natural_df, df_name='nature')
pois_features = landmarks_feature_engineering(pois_df, df_name='pois', feature_col='categories')

Places of Worship

In [22]:
# Count the number of places of worship in each GAZLOC
pofw_count = pofw_df.groupby('gazetted_locality').size().reset_index(name='pofw_count')

# Find the most common religous places by category
pofw_mode = pofw_df.groupby('gazetted_locality')['categories'].agg(lambda x: x.mode().iloc[0]).reset_index()

# Merge the most common category with the gazetted_locality counts
pofw_features = pd.merge(pofw_count, pofw_mode, on='gazetted_locality')

# Rename columns for clarity
pofw_features.rename(columns={'categories': 'pofw_mode'}, inplace=True)

print(pofw_features.head())

  gazetted_locality  pofw_count pofw_mode
0        abbotsford           4  churches
1         albanvale           1   mosques
2       albert park           3  churches
3            albion           3  churches
4         alexandra           2  churches


#### Distance Features

In [23]:
# Extract the suburbs from the address column of the properties dataframe 
properties_df['suburb'] = properties_df['address'].str.extract(r',\s*([A-Za-z\s]+)\s*VIC', expand=False)
properties_df['suburb'] = properties_df['suburb'].str.strip().str.lower()

# Extract the coordinates to a geometry column 
properties_df[['latitude', 'longitude']] = properties_df['coordinates'].str.split(',', expand=True).astype(float)

# Convert properties_df to a gdf
properties_gdf = gpd.GeoDataFrame(
    properties_df, 
    geometry=gpd.points_from_xy(properties_df['longitude'], properties_df['latitude']),
    crs="EPSG:4326"  # Assuming WGS84 coordinate system
)

properties_gdf.head()

Unnamed: 0,url,address,rental_price,rooms,parking,features,desc,availability,bond,property_type,coordinates,suburb,latitude,longitude,geometry
0,https://www.domain.com.au/4012-22-24-jane-bell...,"4012/22-24 Jane Bell Lane, Melbourne VIC 3000",$530 pw,"['1 Bed', '1 Bath']",['1 Parking'],[],This UNFURNISHED one-bedroom apartment is sure...,Available Now,$2303,Apartment / Unit / Flat,"-37.8102191,144.966267",melbourne,-37.810219,144.966267,POINT (144.96627 -37.81022)
1,https://www.domain.com.au/125-131-lonsdale-str...,"125/131 Lonsdale Street, Melbourne VIC 3000",$500 per week,"['1 Bed', '1 Bath']",['− Parking'],[],This inner city spacious and modern 1 bedroom ...,"Available fromMonday, 16th September 2024",$2173,Apartment / Unit / Flat,"-37.810779,144.9685513",melbourne,-37.810779,144.968551,POINT (144.96855 -37.81078)
2,https://www.domain.com.au/2004-618-lonsdale-st...,"2004/618 Lonsdale Street, Melbourne VIC 3000",$470 pw,"['1 Bed', '1 Bath']",['− Parking'],[],Step inside this never been lived in a one-bed...,Available Now,$2042,Apartment / Unit / Flat,"-37.81441450000001,144.9539107",melbourne,-37.814415,144.953911,POINT (144.95391 -37.81441)
3,https://www.domain.com.au/906-238-flinders-str...,"906/238 Flinders Street, Melbourne VIC 3000",$400 per week,"['1 Bed', '1 Bath']",['− Parking'],[],This furnished large size studio has a lot to ...,"Available fromSaturday, 28th September 2024",$1738,Apartment / Unit / Flat,"-37.8175167,144.9664983",melbourne,-37.817517,144.966498,POINT (144.9665 -37.81752)
4,https://www.domain.com.au/715-39-lonsdale-st-m...,"715/39 Lonsdale St, Melbourne VIC 3000",$520 per week,"['2 Beds', '1 Bath']",['− Parking'],[],This fully furnished apartment is in central l...,"Available fromMonday, 30th September 2024",$2260,Apartment / Unit / Flat,"-37.8099061,144.9711071",melbourne,-37.809906,144.971107,POINT (144.97111 -37.80991)


In [24]:
# Convert suburb names and feature_class to lowercase
pois_gdf = filtered_gdfs['pois_gdf']
pois_gdf['gazetted_locality'] = pois_gdf['gazetted_locality'].str.lower()
pois_gdf['feature_class'] = pois_gdf['feature_class'].str.lower()

# Filter the gdf for the feature_class features of interest 
distance_features = [
    'hospital', 'police', 'fire_station', 'kindergarten', 'supermarket',
    'mall', 'library', 'park', 'hotel', 'restaurant']

# For the polygom geometry, calculate the centroid 
pois_gdf['geometry'] = pois_gdf['geometry'].apply(lambda geom: geom.centroid if geom.geom_type == 'Polygon' else geom)

distance_pois = pois_gdf[pois_gdf['feature_class'].isin(distance_features)]

In [25]:
# Convert crs to EPSG:28355 (GDA94 / MGA zone 55) for Melbourne, Australia to get distances in metres
properties_gdf = properties_gdf.to_crs(epsg=28355)
distance_pois = distance_pois.to_crs(epsg=28355)

In [26]:
# Initlaise a list to store the distance from each property to the feature 
distances_list = []

for idx, property_row in properties_gdf.iterrows():
    # Retrieve property geometry and suburb
    property_geom = property_row['geometry']
    property_suburb = property_row['suburb']
    
    # Match on the same subrub 
    suburb_pois = distance_pois[distance_pois['gazetted_locality'] == property_suburb]
    
    if not suburb_pois.empty:
        # based on the feature_class feature, calculate the distance (m) from the property to each POIS
        distances = suburb_pois.copy()
        distances['distance'] = distances['geometry'].apply(lambda x: property_geom.distance(x))
        
        # In the list, store the distance with the corresponding suburb and POIS
        for _, row in distances.iterrows():
            distances_list.append({
                'gazetted_locality': property_suburb,
                'feature_class': row['feature_class'],
                'distance_m': row['distance']  # Distance unit: metres
            })

# List to dataframe 
distances_df = pd.DataFrame(distances_list)

# Distance unit should be in km
distances_df['distance_km'] = distances_df['distance_m'] / 1000

# Calculate the average distance of each suburb to each feature_class feature (km)
average_distances = distances_df.groupby(['gazetted_locality', 'feature_class'])['distance_km'].mean().reset_index()

print(average_distances.head()) 

  gazetted_locality feature_class  distance_km
0        abbotsford         hotel     0.608855
1        abbotsford  kindergarten     1.002205
2        abbotsford       library     1.353868
3        abbotsford          mall     1.093969
4        abbotsford          park     1.129524


In [27]:
# Pivot the average_distances dataframe so each feature_class is a column
avg_dist_pivot = average_distances.pivot_table(
    index='gazetted_locality',
    columns='feature_class',  
    values='distance_km' 
).reset_index()

# Rename the distance columns according to each feature 
avg_dist_pivot.columns = ['gazetted_locality'] + [f'distance_to_{col}' for col in avg_dist_pivot.columns[1:]]

# Fill all NaN values with 0 to signify no distance 
avg_dist_pivot.fillna(0, inplace=True)

avg_dist_pivot

Unnamed: 0,gazetted_locality,distance_to_fire_station,distance_to_hospital,distance_to_hotel,distance_to_kindergarten,distance_to_library,distance_to_mall,distance_to_park,distance_to_police,distance_to_restaurant,distance_to_supermarket
0,abbotsford,0.000000,0.000000,0.608855,1.002205,1.353868,1.093969,1.129524,1.239521,1.141601,1.083238
1,aberfeldie,0.000000,0.000000,0.000000,0.749018,0.680236,0.000000,0.707653,0.000000,0.689880,0.725835
2,aintree,0.000000,0.000000,0.000000,0.702519,0.000000,0.000000,1.024680,0.000000,0.566349,0.573841
3,aireys inlet,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.729931,0.000000,0.517279,0.000000
4,airport west,0.000000,0.000000,0.974140,0.523928,0.000000,1.194920,0.704623,0.000000,1.416179,0.933527
...,...,...,...,...,...,...,...,...,...,...,...
706,yarram,1.190234,0.281745,1.098570,0.000000,1.170774,1.065549,1.395001,0.597102,0.000000,1.051537
707,yarraville,0.000000,0.000000,0.000000,0.953363,0.000000,0.000000,1.139454,0.000000,1.032864,0.937960
708,yarrawonga,1.648800,2.078682,2.333090,1.587461,2.248337,0.000000,1.981987,2.907501,5.249796,2.255740
709,yea,0.441206,0.732877,0.515069,0.460175,0.460139,0.000000,0.702149,0.472234,0.456256,0.460624


#### Create Final Feature Set

In [28]:
# Use an outer join to merge all the seperate feature dataframes on the 'gazetted_locality' column
urban_landmarks_features = landuse_features.merge(roads_features, on='gazetted_locality', how='outer')
urban_landmarks_features = urban_landmarks_features.merge(nature_features, on='gazetted_locality', how='outer')
urban_landmarks_features = urban_landmarks_features.merge(pois_features, on='gazetted_locality', how='outer')
urban_landmarks_features = urban_landmarks_features.merge(pofw_features, on='gazetted_locality', how='outer')
urban_landmarks_features = urban_landmarks_features.merge(avg_dist_pivot, on='gazetted_locality', how='outer')

# Fill Nan vals with 0
urban_landmarks_features.fillna(0, inplace=True)

print(urban_landmarks_features.head())
print(urban_landmarks_features.columns)

  gazetted_locality  commercial_areas  farmland  forest  industrial_areas  \
0          abbeyard               0.0       0.0     8.0               0.0   
1        abbotsford              17.0       4.0     2.0               6.0   
2        aberfeldie               0.0       0.0     0.0               0.0   
3         aberfeldy               0.0       0.0     2.0               0.0   
4           acheron               0.0       0.0     3.0               0.0   

   nature_reserve  park  residential_areas  retail_areas       landuse_mode  \
0             7.0   0.0                0.0           0.0             forest   
1             1.0  15.0               53.0          10.0  residential_areas   
2             0.0   6.0                2.0           3.0               park   
3             0.0   0.0                0.0           0.0             forest   
4             1.0   0.0                0.0           0.0             forest   

   ...  distance_to_fire_station  distance_to_hospital  distan

#### Apply Numerical Indexing to the Mode Columns

In [29]:
# Find all mode columns which end in'_mode'
mode_cols = [col for col in urban_landmarks_features.columns if col.endswith('_mode')]

mode_encoder = {}  # Store the encoders for later reverse transformations

for col in mode_cols:
    encoder = LabelEncoder()

    # Ensure all mode columns are strings 
    urban_landmarks_features[col] = urban_landmarks_features[col].astype(str)
    
    # Numerically encode each column
    urban_landmarks_features[col] = encoder.fit_transform(urban_landmarks_features[col])
    mode_encoder[col] = encoder  

#### Save Final Feature Set

In [30]:
# Save df to a CSV file
urban_landmarks_features.to_csv('../../data/curated/urban_landmarks.csv', index=False)

print("All Saved!")

All Saved!
