### This notebook combines all of the datasets we have collected into a single table

Created by Yuecheng Wang 14-09-2024
Edited by Wanyu and Ran 25-09-2024

First, use the SA2 shapefile to assign each property the population and income of its SA2 area.

In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.geometry import Point

In [2]:
# Input locations, all relative to the directory this notebook runs from.

# Preprocessed property listings (domain data).
property_data_path = "../../data/raw/domain/all_properties_preprocessed.csv"

# ABS boundary shapefiles used for the spatial joins.
LGA_shapefile_path = "../../data/raw/ABS_LGA/LGA_2024_AUST_GDA94.shp"
SA2_shapefile_path = "../../data/raw/ABS_SA2/SA2_2021_AUST_GDA2020.shp"

# Tabular attributes that get merged onto the properties.
crime_path = "../../data/raw/Crime/Crime_info.csv"
population_path = "../../data/raw/Past_income_population_preprocessed/past_population_data_filtered.csv"
income_path = "../../data/raw/Past_income_population_preprocessed/income_2020_filtered.csv"

In [3]:
# Read the inputs needed for the spatial joins: the property table with
# pandas and the two boundary layers with geopandas.
property_data = pd.read_csv(filepath_or_buffer=property_data_path)
LGA_gdf = gpd.read_file(LGA_shapefile_path)
SA2_gdf = gpd.read_file(SA2_shapefile_path)

LGA | current crime | crime growth rate

In [4]:
# Load the income table. The extra `na_values` make empty / single-space cells
# and the literal strings 'NA' / 'NaN' parse as missing.
# NOTE(review): the yearly income columns come back as `object` dtype because
# the values carry thousands separators (e.g. '7,117'); consider passing
# `thousands=','` if numeric values are needed downstream.
try:
    income = pd.read_csv(
        income_path,
        encoding='utf-8',
        na_values=['', ' ', 'NA', 'NaN']
    )
except Exception as e:
    # Report the failure, then re-raise: swallowing it would leave `income`
    # undefined and only surface later as an unrelated-looking NameError.
    print("Error loading income data:", e)
    raise

In [5]:
# Load the population table. The extra `na_values` make empty / single-space
# cells and the literal strings 'NA' / 'NaN' parse as missing.
try:
    population = pd.read_csv(
        population_path,
        encoding='utf-8',
        na_values=['', ' ', 'NA', 'NaN']
    )
except Exception as e:
    # Report the failure, then re-raise: swallowing it would leave
    # `population` undefined and only surface later as a NameError.
    print("Error loading population data:", e)
    raise

In [6]:
# Load the crime table. The extra `na_values` make empty / single-space cells
# and the literal strings 'NA' / 'NaN' parse as missing.
try:
    crime = pd.read_csv(
        crime_path,
        encoding='utf-8',
        na_values=['', ' ', 'NA', 'NaN']
    )
except Exception as e:
    # Report the failure, then re-raise: swallowing it would leave `crime`
    # undefined and only surface later as a NameError.
    print("Error loading crime data:", e)
    raise

In [20]:
# Load the property table with explicit missing-value markers.
# NOTE(review): the name `property` shadows Python's builtin of the same name,
# and the same file is already loaded as `property_data`; the name is kept
# here because the inspection cell below refers to it.
try:
    property = pd.read_csv(
        property_data_path,
        encoding='utf-8',
        na_values=['', ' ', 'NA', 'NaN']
    )
except Exception as e:
    # Report the failure, then re-raise: swallowing it would leave the frame
    # undefined (or silently fall back to the builtin `property`).
    print("Error loading property data:", e)
    raise

In [7]:
# Preview the first rows of the population table.
print(population.head())
# `DataFrame.info()` prints its own report and returns None, so it is called
# bare; wrapping it in print() would add a stray "None" line to the output.
population.info()

# Check specific columns for unexpected NaN values
print("Missing values per column:\n", population.isna().sum())

      SA2 code  population 2001  population 2002  population 2003  \
0  201011001.0           5756.0           6092.0           6293.0   
1  201011002.0          11497.0          11708.0          12015.0   
2  201011005.0           5320.0           5399.0           5557.0   
3  201011006.0           4154.0           4225.0           4371.0   
4  201011007.0           3317.0           3378.0           3411.0   

   population 2004  population 2005  population 2006  population 2007  \
0           6480.0           6648.0           6761.0           7034.0   
1          12189.0          12269.0          12356.0          12408.0   
2           5620.0           5857.0           6037.0           6131.0   
3           4465.0           4704.0           5041.0           5206.0   
4           3473.0           3508.0           3542.0           3594.0   

   population 2008  population 2009  ...  population 2014  population 2015  \
0           7272.0           7614.0  ...          10338.0          1

In [8]:
# Preview the first rows of the income table.
print(income.head())
# `DataFrame.info()` prints its own report and returns None, so it is called
# bare; wrapping it in print() would add a stray "None" line to the output.
income.info()

# Check specific columns for unexpected NaN values
print("Missing values per column:\n", income.isna().sum())

    SA2 Code 2016 income 2017 income 2018 income 2019 income 2020 income
0  201011001       7,117       7,558       7,987       8,665       9,438
1  201011002       7,465       7,587       7,592       7,646       7,522
2  201011005       4,114       4,196       4,250       4,267       4,306
3  201011006       4,086       4,315       4,723       5,270       5,764
4  201011007       2,407       2,506       2,582       2,554       2,584
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522 entries, 0 to 521
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   SA2 Code     522 non-null    int64 
 1   2016 income  522 non-null    object
 2   2017 income  522 non-null    object
 3   2018 income  522 non-null    object
 4   2019 income  522 non-null    object
 5   2020 income  522 non-null    object
dtypes: int64(1), object(5)
memory usage: 24.6+ KB
None
Missing values per column:
 SA2 Code       0
2016 income    0
2017 income 

In [9]:
# Preview the first rows of the crime table.
print(crime.head())
# `DataFrame.info()` prints its own report and returns None, so it is called
# bare; wrapping it in print() would add a stray "None" line to the output.
crime.info()

# Check specific columns for unexpected NaN values
print("Missing values per column:\n", crime.isna().sum())

   Year Year ending       Police Region Local Government Area  \
0  2024       March  1 North West Metro               Banyule   
1  2024       March  1 North West Metro              Brimbank   
2  2024       March  1 North West Metro               Darebin   
3  2024       March  1 North West Metro           Hobsons Bay   
4  2024       March  1 North West Metro                  Hume   

   Incidents Recorded  Rate per 100,000 population  
0                6168                  4633.858403  
1               13401                  6762.297792  
2               10916                  6848.291671  
3                4660                  4909.951604  
4               14031                  5188.707896  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 870 entries, 0 to 869
Data columns (total 6 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Year                         870 non-null    int64  
 1   Year end

In [21]:
# Preview the first rows of the property table.
print(property.head())
# `DataFrame.info()` prints its own report and returns None, so it is called
# bare; wrapping it in print() would add a stray "None" line to the output.
property.info()

# Check specific columns for unexpected NaN values
print("Missing values per column:\n", property.isna().sum())

                                       Address                       Cost  \
0  8/90 Hambleton Street, Middle Park VIC 3206                    $410.00   
1      3/33 Bevan Street, Albert Park VIC 3206           $550.00 per week   
2  8/90 Hambleton Street, Middle Park VIC 3206                    $410.00   
3  7/7-9 Faussett Street, Albert Park VIC 3206  $490 Per Week I Furnished   
4                         Albert Park VIC 3206            $1,280 per week   

  Property Type  Bedrooms  Bathrooms   Latitude   Longitude  \
0     Apartment         1          1 -37.847553  144.960477   
1     Apartment         1          1 -37.839959  144.956373   
2     Apartment         1          1 -37.847553  144.960477   
3     Apartment         1          1 -37.841670  144.955332   
4     Apartment         1          1 -37.843861  144.951454   

  Closest Gov Secondary School Gov Secondary Distance Age under 20 Age 20-39  \
0          Albert Park College            1.2 km away          36%       15%  

In [10]:
# The population file stores the SA2 code as a float (e.g. 201011001.0).
# Derive a text key in a new 'SA2 Code' column: the integer digits only, with
# missing codes mapped to an empty string.
population['SA2 Code'] = population['SA2 code'].map(
    lambda code: '' if pd.isna(code) else str(int(code))
)

# Show the first ten distinct keys as a sanity check.
print("Corrected 'SA2  code' values:", population['SA2 Code'].unique()[:10])


Corrected 'SA2  code' values: ['201011001' '201011002' '201011005' '201011006' '201011007' '201011008'
 '201011481' '201011482' '201011483' '201011484']


In [11]:
# Overwrite the income table's 'SA2 Code' (read as int64) with its text form
# so it can be matched against the string SA2 codes on the property side.
# Missing codes, if any, become an empty string.
income['SA2 Code'] = income['SA2 Code'].map(
    lambda code: '' if pd.isna(code) else str(int(code))
)

# Show the first ten distinct keys as a sanity check.
print("Corrected 'SA2  code' values:", income['SA2 Code'].unique()[:10])


Corrected 'SA2  code' values: ['201011001' '201011002' '201011005' '201011006' '201011007' '201011008'
 '201011481' '201011482' '201011483' '201011484']


In [12]:
# Build point geometries from the coordinates so the properties can be
# spatially joined to the SA2 polygons. `points_from_xy` is vectorised and
# replaces a row-by-row `apply` that built one shapely Point per row.
# NOTE(review): `crs=` only labels the points with the SA2 layer's CRS; it does
# not reproject them. This assumes Longitude/Latitude are already expressed in
# (or close enough to) that CRS — confirm the datum of the source coordinates.
property_data['geometry'] = gpd.points_from_xy(
    property_data['Longitude'], property_data['Latitude']
)
property_gdf = gpd.GeoDataFrame(property_data, geometry='geometry', crs=SA2_gdf.crs)

In [13]:
# Attach SA2 attributes to each property. The left join keeps every property,
# and 'within' matches a point to the polygon that contains it.
mapped_properties = property_gdf.sjoin(SA2_gdf, how='left', predicate='within')

# Store the SA2 code as text so it can be compared with the string keys
# prepared on the population and income tables.
mapped_properties['SA2_CODE21'] = mapped_properties['SA2_CODE21'].astype(str)

print(mapped_properties.head())

                                       Address                       Cost  \
0  8/90 Hambleton Street, Middle Park VIC 3206                    $410.00   
1      3/33 Bevan Street, Albert Park VIC 3206           $550.00 per week   
2  8/90 Hambleton Street, Middle Park VIC 3206                    $410.00   
3  7/7-9 Faussett Street, Albert Park VIC 3206  $490 Per Week I Furnished   
4                         Albert Park VIC 3206            $1,280 per week   

  Property Type  Bedrooms  Bathrooms   Latitude   Longitude  \
0     Apartment         1          1 -37.847553  144.960477   
1     Apartment         1          1 -37.839959  144.956373   
2     Apartment         1          1 -37.847553  144.960477   
3     Apartment         1          1 -37.841670  144.955332   
4     Apartment         1          1 -37.843861  144.951454   

  Closest Gov Secondary School Gov Secondary Distance Age under 20  ...  \
0          Albert Park College            1.2 km away          36%  ...   
1       

In [14]:
# List the columns available after the SA2 spatial join.
mapped_properties.columns

Index(['Address', 'Cost', 'Property Type', 'Bedrooms', 'Bathrooms', 'Latitude',
       'Longitude', 'Closest Gov Secondary School', 'Gov Secondary Distance',
       'Age under 20', 'Age 20-39', 'Age 40-59', 'Age 60+', 'Postcode',
       'geometry', 'index_right', 'SA2_CODE21', 'SA2_NAME21', 'CHG_FLAG21',
       'CHG_LBL21', 'SA3_CODE21', 'SA3_NAME21', 'SA4_CODE21', 'SA4_NAME21',
       'GCC_CODE21', 'GCC_NAME21', 'STE_CODE21', 'STE_NAME21', 'AUS_CODE21',
       'AUS_NAME21', 'AREASQKM21', 'LOCI_URI21'],
      dtype='object')

In [15]:
# Left-join the population table onto the properties. The property side is
# keyed by 'SA2_CODE21', the population side by its string 'SA2 Code'.
merged_data = mapped_properties.merge(
    right=population,
    how='left',
    left_on='SA2_CODE21',
    right_on='SA2 Code',
)

In [16]:
# Left-join the income table onto the already population-enriched properties,
# again matching 'SA2_CODE21' against the income table's string 'SA2 Code'.
# NOTE: this cell reassigns `merged_data` from itself, so running it a second
# time without re-running the population merge would merge income twice.
merged_data = merged_data.merge(
    right=income,
    how='left',
    left_on='SA2_CODE21',
    right_on='SA2 Code',
)

In [19]:
# List the columns available after merging population and income.
merged_data.columns

Index(['Address', 'Cost', 'Property Type', 'Bedrooms', 'Bathrooms', 'Latitude',
       'Longitude', 'Closest Gov Secondary School', 'Gov Secondary Distance',
       'Age under 20', 'Age 20-39', 'Age 40-59', 'Age 60+', 'Postcode',
       'geometry', 'index_right', 'SA2_CODE21', 'SA2_NAME21', 'CHG_FLAG21',
       'CHG_LBL21', 'SA3_CODE21', 'SA3_NAME21', 'SA4_CODE21', 'SA4_NAME21',
       'GCC_CODE21', 'GCC_NAME21', 'STE_CODE21', 'STE_NAME21', 'AUS_CODE21',
       'AUS_NAME21', 'AREASQKM21', 'LOCI_URI21', 'SA2 code', 'population 2001',
       'population 2002', 'population 2003', 'population 2004',
       'population 2005', 'population 2006', 'population 2007',
       'population 2008', 'population 2009', 'population 2010',
       'population 2011', 'population 2012', 'population 2013',
       'population 2014', 'population 2015', 'population 2016',
       'population 2017', 'population 2018', 'population 2019',
       'population 2020', 'population 2021', 'population 2022',
       'popul

In [23]:
# Features to keep from the SA2-level merge: the property attributes, the SA2
# code, population for 2016-2023 and income for 2016-2020.
selected_columns = [
    'Address', 'Cost', 'Property Type', 'Bedrooms', 'Bathrooms', 'Latitude',
    'Longitude', 'Closest Gov Secondary School', 'Gov Secondary Distance',
    'Age under 20', 'Age 20-39', 'Age 40-59', 'Age 60+', 'Postcode',
    'SA2_CODE21',
    *(f'population {year}' for year in range(2016, 2024)),
    *(f'{year} income' for year in range(2016, 2021)),
]

# Take an independent copy restricted to those columns, then display it.
filtered_data = merged_data[selected_columns].copy()
filtered_data

Unnamed: 0,Address,Cost,Property Type,Bedrooms,Bathrooms,Latitude,Longitude,Closest Gov Secondary School,Gov Secondary Distance,Age under 20,...,population 2019,population 2020,population 2021,population 2022,population 2023,2016 income,2017 income,2018 income,2019 income,2020 income
0,"8/90 Hambleton Street, Middle Park VIC 3206",$410.00,Apartment,1,1,-37.847553,144.960477,Albert Park College,1.2 km away,36%,...,17081.0,16955.0,16011.0,16177.0,16861.0,10997,11489,11682,11683,11068
1,"3/33 Bevan Street, Albert Park VIC 3206",$550.00 per week,Apartment,1,1,-37.839959,144.956373,Albert Park College,0.9 km away,28%,...,17081.0,16955.0,16011.0,16177.0,16861.0,10997,11489,11682,11683,11068
2,"8/90 Hambleton Street, Middle Park VIC 3206",$410.00,Apartment,1,1,-37.847553,144.960477,Albert Park College,1.2 km away,36%,...,17081.0,16955.0,16011.0,16177.0,16861.0,10997,11489,11682,11683,11068
3,"7/7-9 Faussett Street, Albert Park VIC 3206",$490 Per Week I Furnished,Apartment,1,1,-37.841670,144.955332,Albert Park College,0.8 km away,20%,...,17081.0,16955.0,16011.0,16177.0,16861.0,10997,11489,11682,11683,11068
4,Albert Park VIC 3206,"$1,280 per week",Apartment,1,1,-37.843861,144.951454,Albert Park College,0.4 km away,17%,...,17081.0,16955.0,16011.0,16177.0,16861.0,10997,11489,11682,11683,11068
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7454,"4 Unaipon Street, Leneva VIC 3691",$620 per week,House,4,2,-36.161611,146.887448,Wodonga Senior Secondary College,3.8 km away,,...,11372.0,12046.0,12790.0,13365.0,13878.0,5683,6118,6465,7021,7466
7455,"1 Chatham Road, Leneva VIC 3691",$600 pw,House,4,2,-36.160540,146.893057,Wodonga Senior Secondary College,3.8 km away,,...,11372.0,12046.0,12790.0,13365.0,13878.0,5683,6118,6465,7021,7466
7456,"8 Roycroft Street, Baranduda VIC 3691",$620 per week,House,4,2,-36.179518,146.937635,Wodonga Senior Secondary College,7.8 km away,,...,11372.0,12046.0,12790.0,13365.0,13878.0,5683,6118,6465,7021,7466
7457,"20 WOODBRIDGE STREET, Killara VIC 3691",$800,House,4,2,-36.140167,146.942511,Wodonga Senior Secondary College,5.8 km away,,...,11372.0,12046.0,12790.0,13365.0,13878.0,5683,6118,6465,7021,7466


In [24]:
# Rebuild point geometries on the filtered table so it can be spatially joined
# to the LGA polygons. `points_from_xy` is vectorised and replaces a
# row-by-row `apply` that built one shapely Point per row.
# NOTE(review): `crs=` only labels the points with the LGA layer's CRS; it does
# not reproject them. The same coordinates were earlier labelled with the SA2
# layer's CRS — confirm which datum the source coordinates actually use.
filtered_data['geometry'] = gpd.points_from_xy(
    filtered_data['Longitude'], filtered_data['Latitude']
)
property_gdf_filtered = gpd.GeoDataFrame(filtered_data, geometry='geometry', crs=LGA_gdf.crs)

In [27]:
# Attach LGA attributes to each property. The left join keeps every property,
# and 'within' matches a point to the polygon that contains it.
filtered_data = property_gdf_filtered.sjoin(LGA_gdf, how='left', predicate='within')

# Store the LGA name as text; it is the key used to merge the crime table.
filtered_data['LGA_NAME24'] = filtered_data['LGA_NAME24'].astype(str)

filtered_data.columns

Index(['Address', 'Cost', 'Property Type', 'Bedrooms', 'Bathrooms', 'Latitude',
       'Longitude', 'Closest Gov Secondary School', 'Gov Secondary Distance',
       'Age under 20', 'Age 20-39', 'Age 40-59', 'Age 60+', 'Postcode',
       'SA2_CODE21', 'population 2016', 'population 2017', 'population 2018',
       'population 2019', 'population 2020', 'population 2021',
       'population 2022', 'population 2023', '2016 income', '2017 income',
       '2018 income', '2019 income', '2020 income', 'geometry', 'index_right',
       'LGA_CODE24', 'LGA_NAME24', 'STE_CODE21', 'STE_NAME21', 'AUS_CODE21',
       'AUS_NAME21', 'AREASQKM', 'LOCI_URI21'],
      dtype='object')

In [29]:
# Normalise the LGA names on both sides of the upcoming merge: cast the crime
# names to text, then trim surrounding whitespace from both key columns.
crime['Local Government Area'] = (
    crime['Local Government Area'].astype(str).str.strip()
)
filtered_data['LGA_NAME24'] = filtered_data['LGA_NAME24'].str.strip()


In [30]:
# Left-join the crime table onto the properties by LGA name.
# NOTE: the crime table has several rows per LGA (one per reporting year, as
# the preview of the result shows), so each property row is repeated once per
# matching crime row. Names that do not match exactly yield NaN crime columns.
filtered_merge = filtered_data.merge(
    right=crime,
    how='left',
    left_on='LGA_NAME24',
    right_on='Local Government Area',
)

In [32]:
# List the columns available after merging the crime table.
filtered_merge.columns

Index(['Address', 'Cost', 'Property Type', 'Bedrooms', 'Bathrooms', 'Latitude',
       'Longitude', 'Closest Gov Secondary School', 'Gov Secondary Distance',
       'Age under 20', 'Age 20-39', 'Age 40-59', 'Age 60+', 'Postcode',
       'SA2_CODE21', 'population 2016', 'population 2017', 'population 2018',
       'population 2019', 'population 2020', 'population 2021',
       'population 2022', 'population 2023', '2016 income', '2017 income',
       '2018 income', '2019 income', '2020 income', 'geometry', 'index_right',
       'LGA_CODE24', 'LGA_NAME24', 'STE_CODE21', 'STE_NAME21', 'AUS_CODE21',
       'AUS_NAME21', 'AREASQKM', 'LOCI_URI21', 'Year', 'Year ending',
       'Police Region', 'Local Government Area', 'Incidents Recorded',
       'Rate per 100,000 population'],
      dtype='object')

In [33]:
# Columns kept in the final table, grouped by where they come from.
final_columns = [
    # property attributes
    'Address', 'Cost', 'Property Type', 'Bedrooms', 'Bathrooms', 'Latitude',
    'Longitude', 'Closest Gov Secondary School', 'Gov Secondary Distance',
    'Age under 20', 'Age 20-39', 'Age 40-59', 'Age 60+', 'Postcode',
    # SA2-level attributes
    'SA2_CODE21',
    *(f'population {year}' for year in range(2016, 2024)),
    *(f'{year} income' for year in range(2016, 2021)),
    # LGA code and crime attributes
    'LGA_CODE24', 'Year', 'Year ending', 'Incidents Recorded',
]

# Take an independent copy restricted to those columns.
final_data = filtered_merge[final_columns].copy()

In [34]:
# Preview the first five rows of the final table.
final_data.head(5)

Unnamed: 0,Address,Cost,Property Type,Bedrooms,Bathrooms,Latitude,Longitude,Closest Gov Secondary School,Gov Secondary Distance,Age under 20,...,population 2023,2016 income,2017 income,2018 income,2019 income,2020 income,LGA_CODE24,Year,Year ending,Incidents Recorded
0,"8/90 Hambleton Street, Middle Park VIC 3206",$410.00,Apartment,1,1,-37.847553,144.960477,Albert Park College,1.2 km away,36%,...,16861.0,10997,11489,11682,11683,11068,25900,2024.0,March,10048.0
1,"8/90 Hambleton Street, Middle Park VIC 3206",$410.00,Apartment,1,1,-37.847553,144.960477,Albert Park College,1.2 km away,36%,...,16861.0,10997,11489,11682,11683,11068,25900,2023.0,March,9809.0
2,"8/90 Hambleton Street, Middle Park VIC 3206",$410.00,Apartment,1,1,-37.847553,144.960477,Albert Park College,1.2 km away,36%,...,16861.0,10997,11489,11682,11683,11068,25900,2022.0,March,9136.0
3,"8/90 Hambleton Street, Middle Park VIC 3206",$410.00,Apartment,1,1,-37.847553,144.960477,Albert Park College,1.2 km away,36%,...,16861.0,10997,11489,11682,11683,11068,25900,2021.0,March,10172.0
4,"8/90 Hambleton Street, Middle Park VIC 3206",$410.00,Apartment,1,1,-37.847553,144.960477,Albert Park College,1.2 km away,36%,...,16861.0,10997,11489,11682,11683,11068,25900,2020.0,March,9956.0


With this done, now combine it with all of the API-calculated distances to nearby locations :D
Note: to be finished on Thursday morning. Be aware that the crime data contains multiple years of records (2020-2024).

In [13]:
# Output column name -> path of the CSV that supplies that column. All files
# sit in the same folder and follow the "<stem>_distance.csv" naming pattern.
distance_files = {
    label: f"../../data/raw/domain/{stem}_distance.csv"
    for label, stem in (
        ("CBD Distance", "cbd"),
        ("Train Distance", "train"),
        ("Electricity Distance", "elec"),
        ("Hospital Distance", "hospital"),
        ("Library Distance", "lib"),
        ("Park Distance", "park"),
        ("Tourist Attraction Distance", "tour"),
        ("Grocery Distance", "shop"),
    )
}

In [14]:
# Append one distance column per file to `filtered_data`.
for column_name, file_path in distance_files.items():
    # Load the CSV file; only its first column is used.
    distance_frame = pd.read_csv(file_path)

    # The files carry no join key, so rows can only be paired by position.
    # Skip any file whose row count does not match the property table.
    if len(distance_frame) != len(filtered_data):
        print(f"Warning: File {file_path} has a different number of rows and was not added.")
        continue

    # Assign the raw values rather than the Series: a Series would be aligned
    # on index labels, which silently produces NaN or misplaced values whenever
    # `filtered_data` does not have a default 0..n-1 index.
    filtered_data[column_name] = distance_frame.iloc[:, 0].to_numpy()




In [15]:
# Preview the first rows after the distance columns have been added.
print(filtered_data.head())

                                             Address    Cost  Bedrooms  \
0        8/90 Hambleton Street, Middle Park VIC 3206   410.0         1   
1            3/33 Bevan Street, Albert Park VIC 3206   550.0         1   
2        7/7-9 Faussett Street, Albert Park VIC 3206   490.0         1   
3                               Albert Park VIC 3206  1280.0         1   
4  214/363 Beaconsfield Parade, Middle Park VIC 3206   350.0         1   

   Bathrooms Closest Gov Secondary School Gov Secondary Distance Age under 20  \
0          1          Albert Park College            1.2 km away          36%   
1          1          Albert Park College            0.9 km away          28%   
2          1          Albert Park College            0.8 km away          20%   
3          1          Albert Park College            0.4 km away          17%   
4          1          Albert Park College            1.6 km away          19%   

  Age 20-39 Age 40-59 Age 60+  ...  SA2  code  income_2020     2023 

In [16]:
# Write the assembled table to disk without the index column.
# NOTE(review): this persists `filtered_data`, not `final_data`, so the crime
# columns selected into `final_data` are not part of this file — confirm that
# this is intended.
filtered_data.to_csv(path_or_buf="../../data/raw/individual_property.csv", index=False)