### This notebook combines all of the collected datasets into a single table

Created by Yuecheng Wang 14-09-2024
Edited by Wanyu and Ran 25-09-2024

First, use the SA2 shapefile to assign each property to its SA2 area, then attach that area's population and income.

In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.geometry import Point

In [2]:
# --- Input locations (all relative to this notebook) ---

# Scraped property listings (already preprocessed)
property_data_path = "../../data/raw/domain/all_properties_preprocessed.csv"

# Boundary shapefiles used for the spatial joins
SA2_shapefile_path = "../../data/raw/ABS_SA2/SA2_2021_AUST_GDA2020.shp"
LGA_shapefile_path = "../../data/raw/ABS_LGA/LGA_2024_AUST_GDA94.shp"

# 2024-2027 forecast tables
# NOTE(review): the crime forecast is read from the ABS_population folder - confirm this location is intended.
population_path = "../../data/raw/ABS_population/population_forecast_2024_2027.csv"
income_path = "../../data/raw/Past_income_population_preprocessed/income_forecast_2024_2027.csv"
crime_path = "../../data/raw/ABS_population/crime_forecast_2024_2027.csv"

In [3]:
# Read the property listings as a plain table; this frame is the one that is
# later turned into point geometries for the spatial joins.
# (The same CSV is read a second time into `property` further down, with
# explicit NA handling, purely for inspection.)
property_data = pd.read_csv(property_data_path)

# Read the SA2 and LGA boundary layers as GeoDataFrames.
SA2_gdf = gpd.read_file(SA2_shapefile_path)
LGA_gdf = gpd.read_file(LGA_shapefile_path)

In [4]:
# Load the income forecast. Empty cells, single-space cells, 'NA' and 'NaN'
# are all read as missing values.
try:
    income = pd.read_csv(
        income_path,
        encoding='utf-8',
        na_values=['', ' ', 'NA', 'NaN']
    )
except Exception as e:
    # Report the failure, then re-raise. Swallowing the error would leave
    # `income` undefined (or stale from an earlier kernel run), and the
    # problem would only surface later as a confusing downstream failure.
    print("Error loading income data:", e)
    raise

In [5]:
# Load the population forecast. Empty cells, single-space cells, 'NA' and
# 'NaN' are all read as missing values.
try:
    population = pd.read_csv(
        population_path,
        encoding='utf-8',
        na_values=['', ' ', 'NA', 'NaN']
    )
except Exception as e:
    # Report the failure, then re-raise. Swallowing the error would leave
    # `population` undefined (or stale from an earlier kernel run), and the
    # problem would only surface later as a confusing downstream failure.
    print("Error loading population data:", e)
    raise

In [6]:
# Load the crime forecast. Empty cells, single-space cells, 'NA' and 'NaN'
# are all read as missing values.
try:
    crime = pd.read_csv(
        crime_path,
        encoding='utf-8',
        na_values=['', ' ', 'NA', 'NaN']
    )
except Exception as e:
    # Report the failure, then re-raise. Swallowing the error would leave
    # `crime` undefined (or stale from an earlier kernel run), and the
    # problem would only surface later as a confusing downstream failure.
    print("Error loading crime data:", e)
    raise

In [7]:
# Load the property listings with explicit NA handling. Empty cells,
# single-space cells, 'NA' and 'NaN' are all read as missing values.
# NOTE: this rebinds the builtin name `property` at notebook scope; the name
# is kept because the inspection cell further down reads it.
try:
    property = pd.read_csv(
        property_data_path,
        encoding='utf-8',
        na_values=['', ' ', 'NA', 'NaN']
    )
except Exception as e:
    # Report the failure, then re-raise. Swallowing the error would leave
    # `property` bound to the builtin (or to a stale frame from an earlier
    # kernel run), and the inspection cell would fail in a confusing way.
    print("Error loading property data:", e)
    raise

In [8]:
# Preview the first rows of the population forecast.
print(population.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
population.info()

# Check specific columns for unexpected NaN values
print("Missing values per column:\n", population.isna().sum())

     area code  population 2024  population 2025  population 2026  \
0  201011001.0     20032.475851     21029.078703     21988.267894   
1  201011002.0     11697.541717     11598.683352     11511.000536   
2  201011005.0      7336.281699      7348.711900      7360.345192   
3  201011006.0     13936.277761     14964.719544     15955.738508   
4  201011007.0      4304.529655      4340.805762      4376.830079   

   population 2027  
0     22911.447978  
1     11433.229915  
2      7371.232667  
3     16910.696389  
4      4412.604354  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522 entries, 0 to 521
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   area code        522 non-null    float64
 1   population 2024  522 non-null    float64
 2   population 2025  522 non-null    float64
 3   population 2026  522 non-null    float64
 4   population 2027  522 non-null    float64
dtypes: float64(5)
memory usage: 

In [9]:
# Preview the first rows of the income forecast.
print(income.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
income.info()

# Check specific columns for unexpected NaN values
print("Missing values per column:\n", income.isna().sum())

      SA2 Code  2024 income  2025 income  2026 income  2027 income
0  201011001.0      11602.4      12177.3      12752.2      13327.1
1  201011002.0       7666.2       7683.5       7700.8       7718.1
2  201011005.0       4499.6       4545.1       4590.6       4636.1
3  201011006.0       7418.2       7849.3       8280.4       8711.5
4  201011007.0       2767.8       2808.0       2848.2       2888.4
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522 entries, 0 to 521
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   SA2 Code     522 non-null    float64
 1   2024 income  522 non-null    float64
 2   2025 income  522 non-null    float64
 3   2026 income  522 non-null    float64
 4   2027 income  522 non-null    float64
dtypes: float64(5)
memory usage: 20.5 KB
None
Missing values per column:
 SA2 Code       0
2024 income    0
2025 income    0
2026 income    0
2027 income    0
dtype: int64


In [10]:
# Preview the first rows of the crime forecast.
print(crime.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
crime.info()

# Check specific columns for unexpected NaN values
print("Missing values per column:\n", crime.isna().sum())

  Local Government Area   crime 2024   crime 2025   crime 2026   crime 2027
0                Alpine  2501.430246  2472.826850  2466.194663  2464.656876
1                Ararat  8671.402588  8478.599199  8510.051545  8504.920670
2              Ballarat  7761.059695  7919.863535  7951.434057  7957.710340
3               Banyule  4633.858403  4750.109729  4786.479893  4797.858591
4            Bass Coast  5127.500868  5158.036806  5165.290127  5167.013037
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81 entries, 0 to 80
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Local Government Area  81 non-null     object 
 1   crime 2024             81 non-null     float64
 2   crime 2025             81 non-null     float64
 3   crime 2026             81 non-null     float64
 4   crime 2027             81 non-null     float64
dtypes: float64(4), object(1)
memory usage: 3.3+ KB
None
Missing values per co

In [11]:
# Preview the first rows of the property listings.
print(property.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
property.info()

# Check specific columns for unexpected NaN values
print("Missing values per column:\n", property.isna().sum())

   Unnamed: 0                                            Address    Cost  \
0           0        8/90 Hambleton Street, Middle Park VIC 3206   410.0   
1           1            3/33 Bevan Street, Albert Park VIC 3206   550.0   
2           3        7/7-9 Faussett Street, Albert Park VIC 3206   490.0   
3           4                               Albert Park VIC 3206  1280.0   
4           5  214/363 Beaconsfield Parade, Middle Park VIC 3206   350.0   

  Property Type  Bedrooms  Bathrooms   Latitude   Longitude  \
0     Apartment         1          1 -37.847553  144.960477   
1     Apartment         1          1 -37.839959  144.956373   
2     Apartment         1          1 -37.841670  144.955332   
3     Apartment         1          1 -37.843861  144.951454   
4     Apartment         1          1 -37.854035  144.961308   

  Closest Gov Secondary School Gov Secondary Distance Age under 20 Age 20-39  \
0          Albert Park College            1.2 km away          36%       15%   
1   

In [13]:
# The area codes were parsed as floats (e.g. 201011001.0). Rewrite them as
# plain digit strings so they can be matched against the shapefile's SA2 codes.
def _area_code_as_text(code):
    """Return the code as an integer-valued string, or '' when it is missing."""
    if pd.isna(code):
        return ''
    return str(int(code))


population['area code'] = population['area code'].apply(_area_code_as_text)

# Show a few of the converted codes as a sanity check
first_population_codes = population['area code'].unique()[:10]
print("Corrected 'SA2  code' values:", first_population_codes)


Corrected 'SA2  code' values: ['201011001' '201011002' '201011005' '201011006' '201011007' '201011008'
 '201011481' '201011482' '201011483' '201011484']


In [14]:
# The SA2 codes were parsed as floats (e.g. 201011001.0). Rewrite them as
# plain digit strings so they can be matched against the shapefile's SA2 codes.
def _sa2_code_as_text(code):
    """Return the code as an integer-valued string, or '' when it is missing."""
    if pd.isna(code):
        return ''
    return str(int(code))


income['SA2 Code'] = income['SA2 Code'].apply(_sa2_code_as_text)

# Show a few of the converted codes as a sanity check
first_income_codes = income['SA2 Code'].unique()[:10]
print("Corrected 'SA2  code' values:", first_income_codes)


Corrected 'SA2  code' values: ['201011001' '201011002' '201011005' '201011006' '201011007' '201011008'
 '201011481' '201011482' '201011483' '201011484']


In [15]:
# Turn every property into a point geometry so it can be spatially joined with
# the SA2 boundaries. gpd.points_from_xy builds all points in one vectorised
# call instead of constructing a shapely Point per row through DataFrame.apply.
# The geometry is handed straight to the GeoDataFrame constructor, so no
# 'geometry' column is assigned on `property_data` by this cell.
# NOTE(review): the points are labelled with the SA2 layer's CRS, which assumes
# Longitude/Latitude are already expressed in that CRS - confirm.
property_gdf = gpd.GeoDataFrame(
    property_data,
    geometry=gpd.points_from_xy(property_data['Longitude'], property_data['Latitude']),
    crs=SA2_gdf.crs,
)

In [120]:
# Spatial left join: keep every property and attach the attributes of the SA2
# polygon that the property point lies within.
mapped_properties = gpd.sjoin(
    left_df=property_gdf,
    right_df=SA2_gdf,
    how='left',
    predicate='within',
)

# Store the SA2 code as text so it can be matched against the string codes of
# the population and income tables.
sa2_codes_as_text = mapped_properties['SA2_CODE21'].astype(str)
mapped_properties['SA2_CODE21'] = sa2_codes_as_text

print(mapped_properties.head())

   Unnamed: 0                                            Address    Cost  \
0           0        8/90 Hambleton Street, Middle Park VIC 3206   410.0   
1           1            3/33 Bevan Street, Albert Park VIC 3206   550.0   
2           3        7/7-9 Faussett Street, Albert Park VIC 3206   490.0   
3           4                               Albert Park VIC 3206  1280.0   
4           5  214/363 Beaconsfield Parade, Middle Park VIC 3206   350.0   

  Property Type  Bedrooms  Bathrooms   Latitude   Longitude  \
0     Apartment         1          1 -37.847553  144.960477   
1     Apartment         1          1 -37.839959  144.956373   
2     Apartment         1          1 -37.841670  144.955332   
3     Apartment         1          1 -37.843861  144.951454   
4     Apartment         1          1 -37.854035  144.961308   

  Closest Gov Secondary School Gov Secondary Distance  ... SA4_CODE21  \
0          Albert Park College            1.2 km away  ...        206   
1          Albert 

In [121]:
# List the columns after the SA2 spatial join to see which shapefile attributes were attached.
mapped_properties.columns

Index(['Unnamed: 0', 'Address', 'Cost', 'Property Type', 'Bedrooms',
       'Bathrooms', 'Latitude', 'Longitude', 'Closest Gov Secondary School',
       'Gov Secondary Distance', 'Age under 20', 'Age 20-39', 'Age 40-59',
       'Age 60+', 'Postcode', 'geometry', 'index_right', 'SA2_CODE21',
       'SA2_NAME21', 'CHG_FLAG21', 'CHG_LBL21', 'SA3_CODE21', 'SA3_NAME21',
       'SA4_CODE21', 'SA4_NAME21', 'GCC_CODE21', 'GCC_NAME21', 'STE_CODE21',
       'STE_NAME21', 'AUS_CODE21', 'AUS_NAME21', 'AREASQKM21', 'LOCI_URI21'],
      dtype='object')

In [122]:
# Attach the population forecast to every property through its SA2 code.
# The left join keeps properties whose SA2 code has no forecast (their
# population columns are NaN).
# validate='many_to_one' makes pandas raise a MergeError if an SA2 code occurs
# more than once in `population`, instead of silently duplicating property
# rows; the position-based distance join later on relies on the row count
# staying unchanged.
merged_data = mapped_properties.merge(
    population,
    left_on='SA2_CODE21',
    right_on='area code',
    how='left',
    validate='many_to_one',
)

In [123]:
# Attach the income forecast to every property through its SA2 code.
# The left join keeps properties whose SA2 code has no forecast (their income
# columns are NaN).
# validate='many_to_one' makes pandas raise a MergeError if an SA2 code occurs
# more than once in `income`, instead of silently duplicating property rows;
# the position-based distance join later on relies on the row count staying
# unchanged.
# NOTE: this cell reassigns `merged_data` from itself, so it must be run
# exactly once after the population merge.
merged_data = merged_data.merge(
    income,
    left_on='SA2_CODE21',
    right_on='SA2 Code',
    how='left',
    validate='many_to_one',
)

In [124]:
# List the columns after the population and income merges.
merged_data.columns

Index(['Unnamed: 0', 'Address', 'Cost', 'Property Type', 'Bedrooms',
       'Bathrooms', 'Latitude', 'Longitude', 'Closest Gov Secondary School',
       'Gov Secondary Distance', 'Age under 20', 'Age 20-39', 'Age 40-59',
       'Age 60+', 'Postcode', 'geometry', 'index_right', 'SA2_CODE21',
       'SA2_NAME21', 'CHG_FLAG21', 'CHG_LBL21', 'SA3_CODE21', 'SA3_NAME21',
       'SA4_CODE21', 'SA4_NAME21', 'GCC_CODE21', 'GCC_NAME21', 'STE_CODE21',
       'STE_NAME21', 'AUS_CODE21', 'AUS_NAME21', 'AREASQKM21', 'LOCI_URI21',
       'area code', 'population 2024', 'population 2025', 'population 2026',
       'population 2027', 'SA2 Code', '2024 income', '2025 income',
       '2026 income', '2027 income'],
      dtype='object')

In [125]:
# Columns left over from the SA2 spatial join (shapefile attributes, the point
# geometry, the join index) plus the two lookup keys, none of which are kept
# as features. SA2_CODE21 itself is retained.
drop_columns = [
    'geometry', 'index_right',
    'SA2_NAME21', 'CHG_FLAG21', 'CHG_LBL21',
    'SA3_CODE21', 'SA3_NAME21',
    'SA4_CODE21', 'SA4_NAME21',
    'GCC_CODE21', 'GCC_NAME21',
    'STE_CODE21', 'STE_NAME21',
    'AUS_CODE21', 'AUS_NAME21',
    'AREASQKM21', 'LOCI_URI21',
    'area code', 'SA2 Code',
]

# Remove them and display what remains
filtered_data = merged_data.drop(columns=drop_columns)
filtered_data.columns

Index(['Unnamed: 0', 'Address', 'Cost', 'Property Type', 'Bedrooms',
       'Bathrooms', 'Latitude', 'Longitude', 'Closest Gov Secondary School',
       'Gov Secondary Distance', 'Age under 20', 'Age 20-39', 'Age 40-59',
       'Age 60+', 'Postcode', 'SA2_CODE21', 'population 2024',
       'population 2025', 'population 2026', 'population 2027', '2024 income',
       '2025 income', '2026 income', '2027 income'],
      dtype='object')

In [126]:
# Rebuild the point geometry (it was dropped after the SA2 join) so the
# properties can be spatially joined with the LGA boundaries.
# gpd.points_from_xy builds all points in one vectorised call instead of
# constructing a shapely Point per row through DataFrame.apply. The geometry is
# handed straight to the GeoDataFrame constructor, so no 'geometry' column is
# assigned on `filtered_data` by this cell.
# NOTE(review): the same coordinates were labelled with SA2_gdf.crs for the
# first join and are labelled with LGA_gdf.crs here; the shapefile names
# suggest GDA2020 vs GDA94 - confirm the two layers share a CRS or reproject.
property_gdf_filtered = gpd.GeoDataFrame(
    filtered_data,
    geometry=gpd.points_from_xy(filtered_data['Longitude'], filtered_data['Latitude']),
    crs=LGA_gdf.crs,
)

In [127]:
# Spatial left join: keep every property and attach the attributes of the LGA
# polygon that the property point lies within.
filtered_data = gpd.sjoin(
    left_df=property_gdf_filtered,
    right_df=LGA_gdf,
    how='left',
    predicate='within',
)

# Store the LGA name as text so it can be matched against the crime table.
lga_names_as_text = filtered_data['LGA_NAME24'].astype(str)
filtered_data['LGA_NAME24'] = lga_names_as_text

filtered_data.columns

Index(['Unnamed: 0', 'Address', 'Cost', 'Property Type', 'Bedrooms',
       'Bathrooms', 'Latitude', 'Longitude', 'Closest Gov Secondary School',
       'Gov Secondary Distance', 'Age under 20', 'Age 20-39', 'Age 40-59',
       'Age 60+', 'Postcode', 'SA2_CODE21', 'population 2024',
       'population 2025', 'population 2026', 'population 2027', '2024 income',
       '2025 income', '2026 income', '2027 income', 'geometry', 'index_right',
       'LGA_CODE24', 'LGA_NAME24', 'STE_CODE21', 'STE_NAME21', 'AUS_CODE21',
       'AUS_NAME21', 'AREASQKM', 'LOCI_URI21'],
      dtype='object')

In [128]:
# Normalise both join keys before merging: strip surrounding whitespace from
# the LGA names on the property side and on the crime side (the crime key is
# also cast to text first).
filtered_data['LGA_NAME24'] = filtered_data['LGA_NAME24'].str.strip()
crime['Local Government Area'] = (
    crime['Local Government Area']
    .astype(str)
    .str.strip()
)


In [129]:
# Attach the crime forecast to every property through its LGA name.
# The left join keeps properties whose LGA name has no match in `crime`
# (their crime columns are NaN).
# validate='many_to_one' makes pandas raise a MergeError if an LGA name occurs
# more than once in `crime`, instead of silently duplicating property rows;
# the position-based distance join later on relies on the row count staying
# unchanged.
filtered_merge = filtered_data.merge(
    crime,
    left_on='LGA_NAME24',
    right_on='Local Government Area',
    how='left',
    validate='many_to_one',
)

In [130]:
# List the columns after the crime merge.
filtered_merge.columns

Index(['Unnamed: 0', 'Address', 'Cost', 'Property Type', 'Bedrooms',
       'Bathrooms', 'Latitude', 'Longitude', 'Closest Gov Secondary School',
       'Gov Secondary Distance', 'Age under 20', 'Age 20-39', 'Age 40-59',
       'Age 60+', 'Postcode', 'SA2_CODE21', 'population 2024',
       'population 2025', 'population 2026', 'population 2027', '2024 income',
       '2025 income', '2026 income', '2027 income', 'geometry', 'index_right',
       'LGA_CODE24', 'LGA_NAME24', 'STE_CODE21', 'STE_NAME21', 'AUS_CODE21',
       'AUS_NAME21', 'AREASQKM', 'LOCI_URI21', 'Local Government Area',
       'crime 2024', 'crime 2025', 'crime 2026', 'crime 2027'],
      dtype='object')

In [131]:
# Columns left over from the LGA spatial join (shapefile attributes, the point
# geometry, the join index) plus both name keys. LGA_CODE24 is retained.
drop_columns = [
    'geometry', 'index_right',
    'LGA_NAME24',
    'STE_CODE21', 'STE_NAME21',
    'AUS_CODE21', 'AUS_NAME21',
    'AREASQKM', 'LOCI_URI21',
    'Local Government Area',
]

# Remove them and display what remains
filtered_merge = filtered_merge.drop(columns=drop_columns)
filtered_merge.columns

Index(['Unnamed: 0', 'Address', 'Cost', 'Property Type', 'Bedrooms',
       'Bathrooms', 'Latitude', 'Longitude', 'Closest Gov Secondary School',
       'Gov Secondary Distance', 'Age under 20', 'Age 20-39', 'Age 40-59',
       'Age 60+', 'Postcode', 'SA2_CODE21', 'population 2024',
       'population 2025', 'population 2026', 'population 2027', '2024 income',
       '2025 income', '2026 income', '2027 income', 'LGA_CODE24', 'crime 2024',
       'crime 2025', 'crime 2026', 'crime 2027'],
      dtype='object')

In [132]:
# Summarise dtypes and non-null counts of the combined table.
filtered_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6283 entries, 0 to 6282
Data columns (total 29 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Unnamed: 0                    6283 non-null   int64  
 1   Address                       6283 non-null   object 
 2   Cost                          6283 non-null   float64
 3   Property Type                 6283 non-null   object 
 4   Bedrooms                      6283 non-null   int64  
 5   Bathrooms                     6283 non-null   int64  
 6   Latitude                      6283 non-null   float64
 7   Longitude                     6283 non-null   float64
 8   Closest Gov Secondary School  5468 non-null   object 
 9   Gov Secondary Distance        5468 non-null   object 
 10  Age under 20                  6210 non-null   object 
 11  Age 20-39                     6210 non-null   object 
 12  Age 40-59                     6210 non-null   object 
 13  Age

# Combine the data with all API-calculated distances to locations

In [133]:
# (output column name, CSV path) pairs for the distance files; the order here
# is the order in which the columns are appended to the combined table.
_distance_sources = [
    ("CBD Distance", "../../data/raw/domain/cbd_distance.csv"),
    ("Train Distance", "../../data/raw/domain/train_distance.csv"),
    ("Electricity Distance", "../../data/raw/domain/elec_distance.csv"),
    ("Hospital Distance", "../../data/raw/domain/hospital_distance.csv"),
    ("Library Distance", "../../data/raw/domain/lib_distance.csv"),
    ("Park Distance", "../../data/raw/domain/park_distance.csv"),
    ("Tourist Attraction Distance", "../../data/raw/domain/tour_distance.csv"),
    ("Grocery Distance", "../../data/raw/domain/shop_distance.csv"),
]
distance_files = dict(_distance_sources)

In [134]:
# Append one column per distance file to the combined table.
# A file is only used when its row count equals that of `filtered_merge`; its
# first column is then assigned under the configured column name. Files with a
# different row count are skipped with a warning.
# NOTE(review): this presumably relies on every distance CSV having the same
# row order as the property table - confirm.
for column_name, file_path in distance_files.items():
    distance_table = pd.read_csv(file_path)

    # Guard: skip files that do not line up with the combined table
    if len(distance_table) != len(filtered_merge):
        print(f"Warning: File {file_path} has a different number of rows and was not added.")
        continue

    filtered_merge[column_name] = distance_table.iloc[:, 0]


In [135]:
# Preview the first rows of the combined table after the distance columns were added.
print(filtered_merge.head())

   Unnamed: 0                                            Address    Cost  \
0           0        8/90 Hambleton Street, Middle Park VIC 3206   410.0   
1           1            3/33 Bevan Street, Albert Park VIC 3206   550.0   
2           3        7/7-9 Faussett Street, Albert Park VIC 3206   490.0   
3           4                               Albert Park VIC 3206  1280.0   
4           5  214/363 Beaconsfield Parade, Middle Park VIC 3206   350.0   

  Property Type  Bedrooms  Bathrooms   Latitude   Longitude  \
0     Apartment         1          1 -37.847553  144.960477   
1     Apartment         1          1 -37.839959  144.956373   
2     Apartment         1          1 -37.841670  144.955332   
3     Apartment         1          1 -37.843861  144.951454   
4     Apartment         1          1 -37.854035  144.961308   

  Closest Gov Secondary School Gov Secondary Distance  ...   crime 2026  \
0          Albert Park College            1.2 km away  ...  8881.956296   
1          Alb

In [136]:
# Write the combined per-property table to CSV; index=False leaves the
# DataFrame index out of the file.
filtered_merge.to_csv("../../data/raw/individual_property_unpreprocessed.csv", index=False)