## Import Libraries 

In [1]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Point

## Read in Files

#### OSM Shape Files

In [2]:
# Read in the shape files as a Geo dataframe 
# "..._area_..." - indicates features drawn as areas
# "..._line_..." - indicates features drawn as lines/points

landuse_gdf = gpd.read_file('/Users/promaali/Desktop/Desktop Documents/Unimelb 2024/Sem 2/Applied Data Science/A2/Australia_OSM/gis_osm_landuse_a_free_1.shp')
roads_gdf = gpd.read_file('/Users/promaali/Desktop/Desktop Documents/Unimelb 2024/Sem 2/Applied Data Science/A2/Australia_OSM/gis_osm_roads_free_1.shp')

# Dataframes which contain features of areas and points/lines
natural_area_gdf = gpd.read_file('/Users/promaali/Desktop/Desktop Documents/Unimelb 2024/Sem 2/Applied Data Science/A2/Australia_OSM/gis_osm_natural_a_free_1.shp')
natural_line_gdf = gpd.read_file('/Users/promaali/Desktop/Desktop Documents/Unimelb 2024/Sem 2/Applied Data Science/A2/Australia_OSM/gis_osm_natural_free_1.shp')

pofw_area_gdf = gpd.read_file('/Users/promaali/Desktop/Desktop Documents/Unimelb 2024/Sem 2/Applied Data Science/A2/Australia_OSM/gis_osm_pofw_a_free_1.shp')
pofw_line_gdf = gpd.read_file('/Users/promaali/Desktop/Desktop Documents/Unimelb 2024/Sem 2/Applied Data Science/A2/Australia_OSM/gis_osm_pofw_free_1.shp')

pois_area_gdf = gpd.read_file('/Users/promaali/Desktop/Desktop Documents/Unimelb 2024/Sem 2/Applied Data Science/A2/Australia_OSM/gis_osm_pois_a_free_1.shp')
pois_line_gdf = gpd.read_file('/Users/promaali/Desktop/Desktop Documents/Unimelb 2024/Sem 2/Applied Data Science/A2/Australia_OSM/gis_osm_pois_free_1.shp')

#### Other Files

In [3]:
vic_suburbs = gpd.read_file("/Users/promaali/Desktop/Desktop Documents/Unimelb 2024/Sem 2/Applied Data Science/A2/Vic_Localities/gda2020_vicgrid/esrishape/whole_of_dataset/victoria/VMADMIN/LOCALITY_POLYGON.shp")
properties_df = pd.read_csv('../data/raw/scraped_properties.csv')

#### View the Dataframes

In [7]:
# OSM Dataframes
gdfs = {
    'landuse_gdf': landuse_gdf,
    'roads_gdf': roads_gdf,
    'natural_area_gdf': natural_area_gdf,
    'natural_line_gdf': natural_line_gdf,
    'pofw_area_gdf': pofw_area_gdf,
    'pofw_line_gdf': pofw_line_gdf,
    'pois_area_gdf': pois_area_gdf,
    'pois_line_gdf': pois_line_gdf
}

# View the details of each dataframe
for name, gdf in gdfs.items():
    print(f"\n---------------------------- {name} ----------------------------")
    # rename feature column
    gdf.rename(columns={'fclass': 'feature_class'}, inplace=True)
    print("Shape:", gdf.shape)
    print("Columns:", gdf.columns)
    print("Schema:")
    print(gdf.dtypes)
    print("First 5 rows:")
    print(gdf.head())
    print("Unique feauture values:")
    print(gdf['feature_class'].unique())


---------------------------- landuse_gdf ----------------------------
Shape: (490185, 5)
Columns: Index(['osm_id', 'code', 'feature_class', 'name', 'geometry'], dtype='object')
Schema:
osm_id             object
code                int32
feature_class      object
name               object
geometry         geometry
dtype: object
First 5 rows:
    osm_id  code feature_class                           name  \
0  4018601  7202          park                  Anderson Park   
1  4018602  7202          park                      Gold Park   
2  4061402  7202          park                    Queens Park   
3  4085520  7202          park             Ollie Webb Reserve   
4  4282955  7202          park  Brisbane City Botanic Gardens   

                                            geometry  
0  POLYGON ((151.96054 -27.5783, 151.96054 -27.57...  
1  POLYGON ((151.95151 -27.58105, 151.95153 -27.5...  
2  POLYGON ((151.95787 -27.56136, 151.95788 -27.5...  
3  POLYGON ((150.99548 -33.82277, 150.99557 -

In [8]:
# Vic suburbs dataframe 
vic_suburbs.head()

Unnamed: 0,UFI,PFI,LOCALITY,GAZLOC,VICNAMESID,TASK_ID,PFI_CR,UFI_OLD,UFI_CR,LABEL_USE_,geometry
0,831486758,205409448,MOLLONGGHIP,MOLLONGGHIP,102101,,2005-09-27,802469033,2024-02-19,5,"POLYGON ((2417354.185 2445906.94, 2417260.386 ..."
1,831486747,205409756,NORTH BLACKWOOD,NORTH BLACKWOOD,102373,,2005-09-27,802469023,2024-02-19,5,"POLYGON ((2445160.214 2452798.651, 2445239.392..."
2,831486715,205410021,BASALT,BASALT,100297,,2005-09-27,468966022,2024-02-19,5,"POLYGON ((2421860.482 2465074.51, 2421860.053 ..."
3,835229245,205408908,LLANELLY,LLANELLY,101863,,2005-09-27,468965512,2024-03-12,5,"POLYGON ((2394406.798 2527031.044, 2394351.249..."
4,468964855,205409589,MURRABIT WEST,MURRABIT WEST,102236,,2005-09-27,314043656,2013-10-24,5,"POLYGON ((2397518.057 2666586.745, 2397393.211..."


In [None]:
### Should i save to raw folder ??

------

# Preprocessing

## Stack Area and Point/Line Files into One File

In [24]:
# Use the concat function to do this 
natural_gdf = pd.concat([natural_area_gdf, natural_line_gdf], ignore_index=True)
pofw_gdf = pd.concat([pofw_area_gdf, pofw_line_gdf], ignore_index=True)
pois_gdf = pd.concat([pois_area_gdf, pois_line_gdf], ignore_index=True)

In [25]:
print("\n----------------------------- Natural Combined GeoDataFrame -----------------------------")
print(natural_gdf.shape)
print(natural_gdf.head())

print("\n----------------------------- POFW Combined GeoDataFrame -----------------------------")
print(pofw_gdf.shape)
print(pofw_gdf.head())

print("\n----------------------------- POIs Combined GeoDataFrame -----------------------------")
print(pois_gdf.shape)
print(pois_gdf.head())


----------------------------- Natural Combined GeoDataFrame -----------------------------
(477883, 5)
    osm_id  code fclass              name  \
0  4042219  4141  beach  Freshwater Beach   
1  4042355  4141  beach     Dee Why Beach   
2  4042505  4141  beach     Bilgola Beach   
3  4044016  4141  beach  Warriewood Beach   
4  4335014  4141  beach       Whale Beach   

                                            geometry  
0  POLYGON ((151.28911 -33.78233, 151.28913 -33.7...  
1  POLYGON ((151.29679 -33.75261, 151.29681 -33.7...  
2  POLYGON ((151.32709 -33.6466, 151.32718 -33.64...  
3  POLYGON ((151.30826 -33.68926, 151.30879 -33.6...  
4  POLYGON ((151.33089 -33.61265, 151.33106 -33.6...  

----------------------------- POFW Combined GeoDataFrame -----------------------------
(6901, 5)
     osm_id  code              fclass  \
0   4463645  3101  christian_anglican   
1   8746568  3100           christian   
2   9923245  3102  christian_catholic   
3  13127644  3101  christian_angli

## Filter for the Areas within the Victoria Region

In [26]:
# Ensure all OSM files are geodataframes and convert every gdf to the correct CRS
vic_suburbs_gdf = vic_suburbs.to_crs(epsg=4326)
gdfs = {
    'landuse_gdf': gpd.GeoDataFrame(landuse_gdf).to_crs(epsg=4326),
    'roads_gdf': gpd.GeoDataFrame(roads_gdf).to_crs(epsg=4326),
    'natural_gdf': gpd.GeoDataFrame(natural_gdf).to_crs(epsg=4326),
    'pofw_gdf': gpd.GeoDataFrame(pofw_gdf).to_crs(epsg=4326),
    'pois_gdf': gpd.GeoDataFrame(pois_gdf).to_crs(epsg=4326)
}

# Spatial join and filter features that fall within suburb geometries (based on areas that intersect)
def filter_vic_region(gdf, suburbs_gdf):
    filtered_gdf = gpd.sjoin(gdf, suburbs_gdf, how='inner', predicate='intersects')
    return filtered_gdf

# Filter each gdf and store in filtered_gdfs
filtered_gdfs = {}
for name, gdf in gdfs.items():
    filtered_gdfs[name] = filter_vic_region(gdf, vic_suburbs_gdf)

# Display the filtered gdfs
for name, gdf in filtered_gdfs.items():
    print(f"\n----------------------------- Filtered {name} -----------------------------")
    print("Shape after filtering:", gdf.shape)
    print(gdf.head())


----------------------------- Filtered landuse_gdf -----------------------------
Shape after filtering: (142310, 16)
     osm_id  code  fclass                      name  \
61  4394426  7202    park                      None   
62  4396215  7202    park  Scotchmans Creek Reserve   
63  4396215  7201  forest  Scotchmans Creek Reserve   
64  4396376  7202    park  Scotchmans Creek Reserve   
65  4472473  7202    park            Bogong Reserve   

                                             geometry  index_right        UFI  \
61  POLYGON ((145.14888 -37.87904, 145.14899 -37.8...         1815  767125259   
62  POLYGON ((145.14631 -37.88405, 145.14649 -37.8...         1815  767125259   
63  POLYGON ((145.14631 -37.88405, 145.14649 -37.8...         1815  767125259   
64  POLYGON ((145.14971 -37.88515, 145.15003 -37.8...         1815  767125259   
65  POLYGON ((145.15571 -37.88091, 145.15578 -37.8...         1815  767125259   

          PFI       LOCALITY         GAZLOC  VICNAMESID  TASK_ID

## Filter Features

In [27]:
# Keep the columns of fclass, name and geometry in each gdf
for name, gdf in filtered_gdfs.items():
    columns = [col for col in ['fclass', 'GAZLOC', 'geometry'] if col in gdf.columns]
    filtered_gdfs[name] = gdf[columns]

In [28]:
# View the unique features in the dataframes
for name, gdf in filtered_gdfs.items():
    print(f"\n---------------------------- {name} ----------------------------")
    print(gdf.shape)
    print("Features:")
    print(gdf['fclass'].unique())
    print(gdf.head())


---------------------------- landuse_gdf ----------------------------
(142310, 3)
Features:
['park' 'forest' 'cemetery' 'industrial' 'retail' 'residential'
 'recreation_ground' 'nature_reserve' 'commercial' 'allotments' 'grass'
 'quarry' 'military' 'vineyard' 'scrub' 'farmyard' 'farmland' 'meadow'
 'heath' 'orchard']
    fclass         GAZLOC                                           geometry
61    park  GLEN WAVERLEY  POLYGON ((145.14888 -37.87904, 145.14899 -37.8...
62    park  GLEN WAVERLEY  POLYGON ((145.14631 -37.88405, 145.14649 -37.8...
63  forest  GLEN WAVERLEY  POLYGON ((145.14631 -37.88405, 145.14649 -37.8...
64    park  GLEN WAVERLEY  POLYGON ((145.14971 -37.88515, 145.15003 -37.8...
65    park  GLEN WAVERLEY  POLYGON ((145.15571 -37.88091, 145.15578 -37.8...

---------------------------- roads_gdf ----------------------------
(852590, 3)
Features:
['primary' 'tertiary' 'secondary' 'residential' 'trunk' 'motorway_link'
 'cycleway' 'motorway' 'trunk_link' 'service' 'unclassi

#### Select only the relevant features from each dataframe 

Land Use

In [29]:
# Convert geodataframe to a dataframe 
landuse_df = filtered_gdfs['landuse_gdf']

# landuse features to include 
landuse_features = ['park', 'forest', 'industrial', 'retail', 'residential', 'nature_reserve', 'commercial', 'farmland']

# Filter the dataframe to only include the 'landuse_features'
landuse_df = landuse_df[landuse_df['fclass'].isin(landuse_features)]

# Change 'residential' to be 'residential land' to avoid name conflicts
landuse_df.loc[:, 'fclass'] = landuse_df['fclass'].replace({'residential': 'residential areas', 'commercial': 'commercial areas', 
'retail': 'retail areas', 'industrial': 'industrial areas'})

print(landuse_df.shape)
print(landuse_df.head())

(80210, 3)
    fclass         GAZLOC                                           geometry
61    park  GLEN WAVERLEY  POLYGON ((145.14888 -37.87904, 145.14899 -37.8...
62    park  GLEN WAVERLEY  POLYGON ((145.14631 -37.88405, 145.14649 -37.8...
63  forest  GLEN WAVERLEY  POLYGON ((145.14631 -37.88405, 145.14649 -37.8...
64    park  GLEN WAVERLEY  POLYGON ((145.14971 -37.88515, 145.15003 -37.8...
65    park  GLEN WAVERLEY  POLYGON ((145.15571 -37.88091, 145.15578 -37.8...


Roads

In [30]:
# Convert geodataframe to a dataframe 
roads_df = filtered_gdfs['roads_gdf']

# roads features to include 
roads_features = ['primary', 'tertiary', 'secondary', 'residential',
                'cycleway', 'motorway', 'pedestrian','path', 'footway', 'track']

# Filter the dataframe to only include the 'roads_features'
roads_df = roads_df[roads_df['fclass'].isin(roads_features)]

# primary', 'tertiary', and 'secondary' = 'main roads' 
roads_df.loc[:, 'fclass'] = roads_df['fclass'].replace({'primary': 'main roads', 'tertiary': 'main roads', 
'secondary': 'main roads', 'residential': 'residential roads', 'pedestrian': 'walking paths', 'path': 'walking paths',
'footway': 'walking paths'})

print(roads_df.shape)
print(roads_df.head())

(595165, 3)
          fclass         GAZLOC  \
1880  main roads  HAWTHORN EAST   
1881  main roads       HAWTHORN   
1882  main roads       HAWTHORN   
1883  main roads       HAWTHORN   
1884  main roads       HAWTHORN   

                                               geometry  
1880  LINESTRING (145.05555 -37.83032, 145.05561 -37...  
1881  LINESTRING (145.03561 -37.82122, 145.03562 -37...  
1882  LINESTRING (145.03701 -37.8206, 145.03736 -37....  
1883  LINESTRING (145.03971 -37.82313, 145.03972 -37...  
1884  LINESTRING (145.02378 -37.82126, 145.02384 -37...  


Nature

In [31]:
# Convert geodataframe to a dataframe 
natural_df = filtered_gdfs['natural_gdf']

# roads features to include 
nature_features = ['beach', 'cliff', 'spring', 'peak']

# Filter the dataframe to only include the 'nature_features'
natural_df = natural_df[natural_df['fclass'].isin(nature_features)]

print(natural_df.shape)
print(natural_df.head())

(3238, 3)
    fclass        GAZLOC                                           geometry
96   beach       VENTNOR  POLYGON ((145.20014 -38.51713, 145.2008 -38.51...
102  beach  SMITHS BEACH  POLYGON ((145.25259 -38.50482, 145.25299 -38.5...
112  beach      SORRENTO  POLYGON ((144.72577 -38.34608, 144.72597 -38.3...
132  beach      BRIGHTON  POLYGON ((144.98556 -37.9264, 144.98559 -37.92...
136  beach       VENTNOR  POLYGON ((145.14956 -38.51159, 145.14985 -38.5...


Places of Worship

In [32]:
pofw_categories = {
     "Christianity": ['christian', 'christian_anglican', 'christian_catholic', 'christian_lutheran', 
                      'christian_orthodox', 'christian_evangelical', 'christian_protestant', 'christian_methodist'],
    "Islam": ['muslim', 'muslim_sunni', 'muslim_shia'],
    "Judaism": ['jewish'],
    "Buddhism": ['buddhist'],
    "Sikhism": ['sikh'],
    "Hinduism": ['hindu'],
    "Taoism": ['taoist']
}

# Convert geodataframe to a dataframe 
pofw_df = filtered_gdfs['pofw_gdf']

# map each feature to the corresponding category using this reverse mapping dictionary
feature_mapper = {feature: category for category, features in pofw_categories.items() for feature in features}

# Create a new column called categories which acts as a tag for each feature
pofw_df['categories'] = pofw_df['fclass'].map(feature_mapper)

print(pofw_df.shape)
print(pofw_df.head())

(1684, 4)
               fclass         GAZLOC  \
0  christian_anglican  GLEN WAVERLEY   
1           christian  GLEN WAVERLEY   
2  christian_catholic         COBURG   
3  christian_anglican      MELBOURNE   
4  christian_lutheran      SOUTHBANK   

                                            geometry    categories  
0  POLYGON ((145.17019 -37.8913, 145.17048 -37.88...  Christianity  
1  POLYGON ((145.14844 -37.87265, 145.14847 -37.8...  Christianity  
2  POLYGON ((144.9525 -37.75004, 144.95256 -37.74...  Christianity  
3  POLYGON ((144.96736 -37.81662, 144.96739 -37.8...  Christianity  
4  POLYGON ((144.96685 -37.82063, 144.96688 -37.8...  Christianity  


Places of Interest

In [33]:
# For each feature in the POIS dataframe, manually create categories for them
pois_categories = {
    "Public Amenities and Services": ['hospital', 'police', 'courthouse', 'clinic', 'pharmacy', 'doctors', 'nursing_home', 'dentist', 'embassy', 'post_office', 'fire_station', 'community_centre', 'library', 'wastewater_plant'],
    "Educational Institutions": ['school', 'college', 'university', 'kindergarten'],
    "Recreational Facilities": ['park', 'sports_centre', 'stadium', 'zoo', 'museum', 'cinema', 'theatre', 'arts_centre', 'nightclub', 'theme_park', 'ice_rink', 'swimming_pool', 'playground', 'dog_park', 'golf_course', 'camp_site', 'picnic_site', 'observation_tower', 'tower', 'chalet', 'alpine_hut', 'hunting_stand'],
    "Food and Beverage": ['restaurant', 'cafe', 'pub', 'fast_food', 'bar', 'bakery', 'food_court', 'beverages', 'biergarten'],
    "Shopping and Retail": ['mall', 'supermarket', 'department_store', 'convenience', 'furniture_shop', 'outdoor_shop', 'computer_shop', 'stationery', 'car_dealership', 'car_rental', 'shoe_shop', 'florist', 'clothes', 'gift_shop', 'toy_shop', 'sports_shop', 'greengrocer', 'beauty_shop', 'newsagent', 'laundry', 'optician', 'jeweller', 'mobile_phone_shop'],
    "Tourism and Attractions": ['attraction', 'monument', 'castle', 'fort', 'memorial', 'ruins', 'archaeological', 'viewpoint', 'lighthouse', 'tourist_info'],
    "Transportation": ['caravan_site', 'car_wash', 'bicycle_shop', 'bicycle_rental', 'car_sharing'],
    "Utility and General Services": ['atm', 'bank', 'post_box', 'waste_basket', 'drinking_water', 'telephone', 'kiosk', 'vending_any', 'vending_machine', 'vending_parking', 'toilet', 'camera_surveillance'],
    "Nature and Outdoors": ['garden_centre', 'water_works', 'water_tower', 'water_well', 'track', 'fountain', 'bench', 'windmill', 'battlefield', 'wayside_shrine', 'wayside_cross'],
    "Recycling and Waste Management": ['recycling', 'recycling_glass', 'recycling_paper', 'recycling_metal', 'recycling_clothes'],
    "Accommodation": ['hotel', 'motel', 'guesthouse', 'hostel']
}

# Convert geodataframe to a dataframe 
pois_df = filtered_gdfs['pois_gdf']

# map each feature to the corresponding category using this reverse mapping dictionary
feature_mapper = {feature: category for category, features in pois_categories.items() for feature in features}

# Create a new column called categories which acts as a tag for each feature
pois_df['categories'] = pois_df['fclass'].map(feature_mapper)

print(pois_df.shape)
print(pois_df.head())

(135592, 4)
           fclass         GAZLOC  \
95           park  GLEN WAVERLEY   
96           park  GLEN WAVERLEY   
97           park  GLEN WAVERLEY   
98  sports_centre  GLEN WAVERLEY   
99        college  GLEN WAVERLEY   

                                             geometry  \
95  POLYGON ((145.14888 -37.87904, 145.14899 -37.8...   
96  POLYGON ((145.14631 -37.88405, 145.14649 -37.8...   
97  POLYGON ((145.14971 -37.88515, 145.15003 -37.8...   
98  POLYGON ((145.15462 -37.88899, 145.15481 -37.8...   
99  POLYGON ((145.17156 -37.89345, 145.17195 -37.8...   

                  categories  
95   Recreational Facilities  
96   Recreational Facilities  
97   Recreational Facilities  
98   Recreational Facilities  
99  Educational Institutions  


## Removing Null Values

In [34]:
# List of the OSM dataframes
dfs = [pois_df, pofw_df, roads_df, natural_df, landuse_df]
df_names = ['pois_df', 'pofw_df', 'roads_df', 'natural_df', 'landuse_df']

# Check for null values in each column of the dataframes 
for df, name in zip(dfs, df_names):
    print(f"Null values in {name}:")
    # Find the number of null vals in each col
    null_columns = df.isnull().sum()
    # Filter these cols for the null vals 
    null_columns = null_columns[null_columns > 0]
    print(null_columns if not null_columns.empty else "No null values in any column.")
    print("\n")
    # Drop any rows with null vals 
    df.dropna(inplace=True)

Null values in pois_df:
categories    29635
dtype: int64


Null values in pofw_df:
No null values in any column.


Null values in roads_df:
No null values in any column.


Null values in natural_df:
No null values in any column.


Null values in landuse_df:
No null values in any column.




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


#### Save Dataframes to Raw Folder

In [35]:
directory_path = '../data/raw/'

# Save all dfs as a gdf
for df, name in zip(dfs, df_names):
    gdf = gpd.GeoDataFrame(df)
    gdf.to_file(f"{directory_path.rstrip('/')}/{name}.gpkg", driver='GPKG')

print("All saved!")

All saved!


-------

# Feature Engineering 

In [36]:
def landmarks_feature_engineering(df, df_name, fclass_column='fclass', GAZLOC_column='GAZLOC'):
    """
    This function firstly finds the mode feature from the 'fclass' column for each GAZLOC. Then 
    for each feature in the 'fclass' column, it creates a column which stores the counts for that feature 
    based on each suburb. This is done using a pivot table. The function returns a dataframe storing this 
    pivot table with the first column being each GAZLOC and the consequent columns storing the counts 
    for each feature with the last column storing the feature mode. 
    """
    
    # Find the mode of 'fclass' for each GAZLOC
    mode_df = df.groupby(GAZLOC_column)[fclass_column].agg(lambda x: x.mode().iloc[0]).reset_index()
    mode_df.rename(columns={fclass_column: f"{df_name} mode"}, inplace=True)
    
    # Create a pivot table to get counts of each 'fclass' for each 'GAZLOC'
    counts_df = pd.pivot_table(df, 
                               index=GAZLOC_column, 
                               columns=fclass_column, 
                               aggfunc='size', 
                               fill_value=0).reset_index()
    
    # Merge the mode column with the pivot table
    feature_df = pd.merge(counts_df, mode_df, on=GAZLOC_column)
    
    # Return the final result
    return feature_df

In [37]:
landuse_features = landmarks_feature_engineering(landuse_df, df_name='landuse')
roads_features = landmarks_feature_engineering(roads_df, df_name='roads')
nature_features = landmarks_feature_engineering(natural_df, df_name='nature')
pois_features = landmarks_feature_engineering(pois_df, df_name='pois', fclass_column='categories')

Places of Worship

In [38]:
# Count the number of places of worship in each GAZLOC
pofw_count = pofw_df.groupby('GAZLOC').size().reset_index(name='pofw count')

# Find the most common religous places by category
pofw_mode = pofw_df.groupby('GAZLOC')['categories'].agg(lambda x: x.mode().iloc[0]).reset_index()

# Merge the most common category with the GAZLOC counts
pofw_features = pd.merge(pofw_count, pofw_mode, on='GAZLOC')

# Rename columns for clarity
pofw_features.rename(columns={'categories': 'pofw mode'}, inplace=True)

print(pofw_features.head())

        GAZLOC  pofw count     pofw mode
0   ABBOTSFORD           4  Christianity
1    ALBANVALE           1         Islam
2  ALBERT PARK           3  Christianity
3       ALBION           3  Christianity
4    ALEXANDRA           2  Christianity


#### Distance Features

In [62]:
# Extract the suburbs from the address column of the properties dataframe 
properties_df['suburb'] = properties_df['address'].str.extract(r',\s*([A-Za-z\s]+)\s*VIC', expand=False)
properties_df['suburb'] = properties_df['suburb'].str.strip().str.lower()

# Extract the coordinates to a geometry column 
properties_df[['latitude', 'longitude']] = properties_df['coordinates'].str.split(',', expand=True).astype(float)

# Convert properties_df to a gdf
properties_gdf = gpd.GeoDataFrame(
    properties_df, 
    geometry=gpd.points_from_xy(properties_df['longitude'], properties_df['latitude']),
    crs="EPSG:4326"  # Assuming WGS84 coordinate system
)

properties_gdf.head()

Unnamed: 0,url,address,rental_price,rooms,parking,features,desc,availability,bond,property_type,coordinates,suburb,latitude,longitude,geometry
0,https://www.domain.com.au/4012-22-24-jane-bell...,"4012/22-24 Jane Bell Lane, Melbourne VIC 3000",$530 pw,"['1 Bed', '1 Bath']",['1 Parking'],[],This UNFURNISHED one-bedroom apartment is sure...,Available Now,$2303,Apartment / Unit / Flat,"-37.8102191,144.966267",melbourne,-37.810219,144.966267,POINT (144.96627 -37.81022)
1,https://www.domain.com.au/125-131-lonsdale-str...,"125/131 Lonsdale Street, Melbourne VIC 3000",$500 per week,"['1 Bed', '1 Bath']",['− Parking'],[],This inner city spacious and modern 1 bedroom ...,"Available fromMonday, 16th September 2024",$2173,Apartment / Unit / Flat,"-37.810779,144.9685513",melbourne,-37.810779,144.968551,POINT (144.96855 -37.81078)
2,https://www.domain.com.au/2004-618-lonsdale-st...,"2004/618 Lonsdale Street, Melbourne VIC 3000",$470 pw,"['1 Bed', '1 Bath']",['− Parking'],[],Step inside this never been lived in a one-bed...,Available Now,$2042,Apartment / Unit / Flat,"-37.81441450000001,144.9539107",melbourne,-37.814415,144.953911,POINT (144.95391 -37.81441)
3,https://www.domain.com.au/906-238-flinders-str...,"906/238 Flinders Street, Melbourne VIC 3000",$400 per week,"['1 Bed', '1 Bath']",['− Parking'],[],This furnished large size studio has a lot to ...,"Available fromSaturday, 28th September 2024",$1738,Apartment / Unit / Flat,"-37.8175167,144.9664983",melbourne,-37.817517,144.966498,POINT (144.9665 -37.81752)
4,https://www.domain.com.au/715-39-lonsdale-st-m...,"715/39 Lonsdale St, Melbourne VIC 3000",$520 per week,"['2 Beds', '1 Bath']",['− Parking'],[],This fully furnished apartment is in central l...,"Available fromMonday, 30th September 2024",$2260,Apartment / Unit / Flat,"-37.8099061,144.9711071",melbourne,-37.809906,144.971107,POINT (144.97111 -37.80991)


In [63]:
# Convert suburb names and fclass to lowercase
pois_gdf = filtered_gdfs['pois_gdf']
pois_gdf['GAZLOC'] = pois_gdf['GAZLOC'].str.lower()
pois_gdf['fclass'] = pois_gdf['fclass'].str.lower()

# Filter the gdf for the fclass features of interest 
distance_features = [
    'hospital', 'holice', 'fire_station', 'kindergarten', 'supermarket',
    'mall', 'library', 'park', 'hotel', 'restaurant']

# For the polygom geometry, calculate the centroid 
pois_gdf['geometry'] = pois_gdf['geometry'].apply(lambda geom: geom.centroid if geom.geom_type == 'Polygon' else geom)

distance_pois = pois_gdf[pois_gdf['fclass'].isin(distance_features)]

In [None]:
# Convert crs to EPSG:28355 (GDA94 / MGA zone 55) for Melbourne, Australia to get distances in metres
properties_gdf = properties_gdf.to_crs(epsg=28355)
distance_pois = distance_pois.to_crs(epsg=28355)

In [69]:
# Initlaise a list to store the distance from each property to the feature 
distances_list = []

for idx, property_row in properties_gdf.iterrows():
    # Retrieve property geometry and suburb
    property_geom = property_row['geometry']
    property_suburb = property_row['suburb']
    
    # Match on the same subrub 
    suburb_pois = distance_pois[distance_pois['GAZLOC'] == property_suburb]
    
    if not suburb_pois.empty:
        # based on the fclass feature, calculate the distance (m) from the property to each POIS
        distances = suburb_pois.copy()
        distances['distance'] = distances['geometry'].apply(lambda x: property_geom.distance(x))
        
        # In the list, store the distance with the corresponding suburb and POIS
        for _, row in distances.iterrows():
            distances_list.append({
                'suburb': property_suburb,
                'fclass': row['fclass'],
                'distance_m': row['distance']  # Distance unit: metres
            })

# List to dataframe 
distances_df = pd.DataFrame(distances_list)

# Distance unit should be in km
distances_df['distance_km'] = distances_df['distance_m'] / 1000

# Calculate the average distance of each suburb to each fclass feature (km)
average_distances = distances_df.groupby(['suburb', 'fclass'])['distance_km'].mean().reset_index()

print(average_distances.head())

Example suburb from properties_gdf: ['melbourne' nan 'east melbourne' 'west melbourne' 'southbank']
Example GAZLOC from distance_pois: ['glen waverley' 'mount waverley' 'clayton' 'ringwood east'
 'east melbourne']
Processing property in suburb: melbourne
Number of POIs in suburb 'melbourne': 616
Processing property in suburb: melbourne
Number of POIs in suburb 'melbourne': 616
Processing property in suburb: melbourne
Number of POIs in suburb 'melbourne': 616
Processing property in suburb: melbourne
Number of POIs in suburb 'melbourne': 616
Processing property in suburb: melbourne
Number of POIs in suburb 'melbourne': 616
Processing property in suburb: melbourne
Number of POIs in suburb 'melbourne': 616
Processing property in suburb: melbourne
Number of POIs in suburb 'melbourne': 616
Processing property in suburb: melbourne
Number of POIs in suburb 'melbourne': 616
Processing property in suburb: melbourne
Number of POIs in suburb 'melbourne': 616
Processing property in suburb: melbourn

#### Create Final Feature Set

In [22]:
# Use an outer join to merge all the seperate feature dataframes on the 'GAZLOC' column
urban_landmarks_features = landuse_features.merge(roads_features, on='GAZLOC', how='outer')
urban_landmarks_features = urban_landmarks_features.merge(nature_features, on='GAZLOC', how='outer')
urban_landmarks_features = urban_landmarks_features.merge(pois_features, on='GAZLOC', how='outer')
urban_landmarks_features = urban_landmarks_features.merge(pofw_features, on='GAZLOC', how='outer')

# Fill Nan vals with 0
urban_landmarks_features.fillna(0, inplace=True)

print(urban_landmarks_features.head())
print(urban_landmarks_features.columns)

       GAZLOC  commercial areas  farmland  forest  industrial areas  \
0    ABBEYARD               0.0       0.0     8.0               0.0   
1  ABBOTSFORD              17.0       4.0     2.0               6.0   
2  ABERFELDIE               0.0       0.0     0.0               0.0   
3   ABERFELDY               0.0       0.0     2.0               0.0   
4     ACHERON               0.0       0.0     3.0               0.0   

   nature_reserve  park  residential areas  retail areas       landuse mode  \
0             7.0   0.0                0.0           0.0             forest   
1             1.0  15.0               53.0          10.0  residential areas   
2             0.0   6.0                2.0           3.0               park   
3             0.0   0.0                0.0           0.0             forest   
4             1.0   0.0                0.0           0.0             forest   

   ...  Public Amenities and Services  Recreational Facilities  \
0  ...                          

#### Save Final Feature Set

In [23]:
# Save df to a CSV file
urban_landmarks_features.to_csv('../data/raw/urban_landmarks_features.csv', index=False)

print("All Saved!")

All Saved!
