### This notebook combines all the data we have collected into a single dataset

Created by Yuecheng Wang 14-09-2024

First, use the SA2 shapefile to assign each property the SA2 area it falls in, together with that area's population and income.

In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.geometry import Point

In [3]:
# Input locations, all relative to this notebook's directory.
# Domain property listings
property_data_path = "../../data/raw/domain/all_postcodes.csv"
# Merged population / income table (curated)
population_income_path = "../../data/curated/merged_population_income_data.csv"
# ABS SA2 boundary shapefile
SA2_shapefile_path = "../../data/raw/ABS_SA2/SA2_2021_AUST_GDA2020.shp"

In [4]:
# Load the population / income table.
# Blank strings and the 'NA' / 'NaN' tokens are read as missing values.
try:
    population_income = pd.read_csv(
        population_income_path,
        encoding='utf-8',
        na_values=['', ' ', 'NA', 'NaN']
    )
except Exception as e:
    print("Error loading population income data:", e)
    # Re-raise: every later cell needs `population_income`, so carrying on would
    # only surface as a NameError further down (or silently reuse a stale
    # variable left in the kernel by an earlier run).
    raise

In [5]:
# Preview the first rows of the table
print(population_income.head())
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
population_income.info()

# Check specific columns for unexpected NaN values
print("Missing values per column:\n", population_income.isna().sum())

   GCCSA  SA4 Code  SA3 Code    SA2  code Region Type         Region     2021  \
0  2RVIC     201.0   20101.0  201011001.0         SA2      Alfredton  16841.0   
1  2RVIC     201.0   20101.0  201011002.0         SA2       Ballarat  12071.0   
2  2RVIC     201.0   20101.0  201011005.0         SA2      Buninyong   7229.0   
3  2RVIC     201.0   20101.0  201011006.0         SA2      Delacombe  10648.0   
4  2RVIC     201.0   20101.0  201011007.0         SA2  Smythes Creek   4211.0   

           2026          2031          2036     2023 income_2020  
0  20756.256163  23604.443836  26060.320807  18997.0      69,111  
1  11698.293593  11803.430603  11985.992387  11809.0      83,800  
2   7372.079773   7685.113372   8028.887243   7323.0      67,558  
3  15915.186041  20475.587469  24965.202439  12869.0      58,548  
4   4312.098530   4457.413406   4725.467837   4268.0      65,068  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 524 entries, 0 to 523
Data columns (total 12 columns):
 #   C

In [6]:
# Fix SA2 code being float issue: turn e.g. 201011001.0 into the string '201011001'.
# Missing codes become the empty string.
# The column is coerced to numeric first so the cell can be re-run safely: after
# one run it already holds strings (including ''), and calling int('') directly
# would raise ValueError on the second run.
sa2_numeric = pd.to_numeric(population_income['SA2  code'], errors='coerce')
population_income['SA2  code'] = sa2_numeric.map(
    lambda code: '' if pd.isna(code) else str(int(code))
)

# Inspect the corrected 'SA2  code' values
print("Corrected 'SA2  code' values:", population_income['SA2  code'].unique()[:10])


Corrected 'SA2  code' values: ['201011001' '201011002' '201011005' '201011006' '201011007' '201011008'
 '201011481' '201011482' '201011483' '201011484']


In [7]:
# Read the two remaining inputs: the property listings (CSV) and the
# SA2 boundaries (shapefile).
property_data = pd.read_csv(filepath_or_buffer=property_data_path)
SA2_gdf = gpd.read_file(SA2_shapefile_path)

In [8]:
# Prepare the properties for the spatial join with the shapefile by giving each
# row a point geometry (x = Longitude, y = Latitude).
# points_from_xy builds all points in one vectorised call instead of creating a
# Point per row through DataFrame.apply.
property_data['geometry'] = gpd.points_from_xy(
    property_data['Longitude'],
    property_data['Latitude'],
)

# NOTE(review): the points are labelled with the shapefile's CRS, not reprojected
# into it. This assumes Longitude/Latitude are already expressed in that CRS -
# confirm against the source of the coordinates.
property_gdf = gpd.GeoDataFrame(property_data, geometry='geometry', crs=SA2_gdf.crs)

In [9]:
# Spatial join: attach to every property the attributes of the SA2 polygon it
# lies within. how='left' keeps properties that fall inside no polygon.
# The SA2 code is then cast to str so it can be matched against the string
# codes in population_income.
mapped_properties = gpd.sjoin(
    property_gdf,
    SA2_gdf,
    how='left',
    predicate='within',
).assign(SA2_CODE21=lambda joined: joined['SA2_CODE21'].astype(str))

# Preview both sides of the upcoming merge
print(mapped_properties.head())
print(population_income.head())

                                         Address  \
0     901/22-40 Wills Street, Melbourne VIC 3000   
1       1207/270 King Street, Melbourne VIC 3000   
2  5809/442 ELIZABETH STREET, Melbourne VIC 3000   
3   2112/80 A'beckett Street, Melbourne VIC 3000   
4   1210/81 A'beckett Street, Melbourne VIC 3000   

                               Cost  Bedrooms  Bathrooms  \
0                     $600 per week       1.0        1.0   
1                     $720 per week       2.0        2.0   
2  $850 Per Week ( Fully Furnished)       2.0        1.0   
3                     $700 per week       2.0        2.0   
4                       $650 weekly       2.0        1.0   

                  Coordinates Closest Gov Secondary School  \
0  [-37.8107551, 144.9570001]       University High School   
1  [-37.8136918, 144.9548583]       University High School   
2  [-37.8084101, 144.9607759]       University High School   
3  [-37.8089991, 144.9610792]       University High School   
4   [-37.8092536

In [10]:
# Bring the population and income columns onto each property row by matching
# the shapefile's SA2 code against the table's SA2 code.
# Left join: properties without a matching code are kept.
merged_data = mapped_properties.merge(
    population_income,
    how='left',
    left_on='SA2_CODE21',
    right_on='SA2  code',
)

In [11]:
# Preview the first five merged rows
print(merged_data.head(n=5))

                                         Address  \
0     901/22-40 Wills Street, Melbourne VIC 3000   
1       1207/270 King Street, Melbourne VIC 3000   
2  5809/442 ELIZABETH STREET, Melbourne VIC 3000   
3   2112/80 A'beckett Street, Melbourne VIC 3000   
4   1210/81 A'beckett Street, Melbourne VIC 3000   

                               Cost  Bedrooms  Bathrooms  \
0                     $600 per week       1.0        1.0   
1                     $720 per week       2.0        2.0   
2  $850 Per Week ( Fully Furnished)       2.0        1.0   
3                     $700 per week       2.0        2.0   
4                       $650 weekly       2.0        1.0   

                  Coordinates Closest Gov Secondary School  \
0  [-37.8107551, 144.9570001]       University High School   
1  [-37.8136918, 144.9548583]       University High School   
2  [-37.8084101, 144.9607759]       University High School   
3  [-37.8089991, 144.9610792]       University High School   
4   [-37.8092536

In [12]:
# Columns carried forward into the final per-property table.
selected_columns = [
    # listing details
    'Address',
    'Cost',
    'Bedrooms',
    'Bathrooms',
    'Coordinates',
    # nearest government secondary school
    'Closest Gov Secondary School',
    'Gov Secondary Distance',
    # age-group columns
    'Age under 20',
    'Age 20-39',
    'Age 40-59',
    'Age 60+',
    # location
    'Postcode',
    'Latitude',
    'Longitude',
    # columns taken from the population / income table
    'SA2  code',
    'income_2020',
    '2023',
    '2026',
]

# Keep only those columns; .copy() gives an independent frame so columns can be
# added later without touching merged_data.
filtered_data = merged_data[selected_columns].copy()

In [13]:
# Show the first five rows (head() defaults to n=5)
filtered_data.head()

Unnamed: 0,Address,Cost,Bedrooms,Bathrooms,Coordinates,Closest Gov Secondary School,Gov Secondary Distance,Age under 20,Age 20-39,Age 40-59,Age 60+,Postcode,latitude,longitude,SA2 code,income_2020,2023,2026
0,"901/22-40 Wills Street, Melbourne VIC 3000",$600 per week,1.0,1.0,"[-37.8107551, 144.9570001]",University High School,1.5 km away,8%,77%,12%,3%,3000,-37.810755,144.957,206041504,44257,21566.0,24536.663914
1,"1207/270 King Street, Melbourne VIC 3000",$720 per week,2.0,2.0,"[-37.8136918, 144.9548583]",University High School,1.9 km away,6%,83%,10%,1%,3000,-37.813692,144.954858,206041505,59201,20027.0,23543.514398
2,"5809/442 ELIZABETH STREET, Melbourne VIC 3000",$850 Per Week ( Fully Furnished),2.0,1.0,"[-37.8084101, 144.9607759]",University High School,1.3 km away,3%,90%,7%,0%,3000,-37.80841,144.960776,206041504,44257,21566.0,24536.663914
3,"2112/80 A'beckett Street, Melbourne VIC 3000",$700 per week,2.0,2.0,"[-37.8089991, 144.9610792]",University High School,1.4 km away,3%,90%,7%,0%,3000,-37.808999,144.961079,206041504,44257,21566.0,24536.663914
4,"1210/81 A'beckett Street, Melbourne VIC 3000",$650 weekly,2.0,1.0,"[-37.8092536, 144.961181]",University High School,1.4 km away,6%,79%,12%,3%,3000,-37.809254,144.961181,206041504,44257,21566.0,24536.663914


With this done, combine it with all the API-calculated distances to locations :D

In [14]:
# Output column name -> CSV file providing that column's values.
# Insertion order is kept, so columns are appended in the order listed here.
distance_files = dict([
    ("CBD Distance", "../../data/raw/domain/cbd_distance.csv"),
    ("Train Distance", "../../data/raw/domain/train_distance.csv"),
    ("Electricity Distance", "../../data/raw/domain/elec_distance.csv"),
    ("Hospital Distance", "../../data/raw/domain/hospital_distance.csv"),
    ("Library Distance", "../../data/raw/domain/lib_distance.csv"),
    ("Park Distance", "../../data/raw/domain/park_distance.csv"),
    ("Tourist Attraction Distance", "../../data/raw/domain/tour_distance.csv"),
    ("Grocery Distance", "../../data/raw/domain/shop_distance.csv"),
])

In [15]:
# Join: append one distance column per file, matching rows to properties by
# position (row i of the file belongs to row i of filtered_data).
for column_name, file_path in distance_files.items():
    # Load the CSV file
    new_column_data = pd.read_csv(file_path)

    # Positional matching only makes sense when the row counts agree;
    # otherwise report the file and leave the column out.
    if len(new_column_data) != len(filtered_data):
        print(f"Warning: File {file_path} has a different number of rows and was not added.")
        continue

    # Take the file's first column and assign its raw values rather than the
    # Series itself: assigning a Series aligns on index labels, which would
    # misplace values or insert NaN if filtered_data's index were ever not a
    # default 0..n-1 range.
    filtered_data[column_name] = new_column_data.iloc[:, 0].to_numpy()


In [16]:
# Preview the first five rows now that the distance columns are attached
print(filtered_data.head(n=5))

                                         Address  \
0     901/22-40 Wills Street, Melbourne VIC 3000   
1       1207/270 King Street, Melbourne VIC 3000   
2  5809/442 ELIZABETH STREET, Melbourne VIC 3000   
3   2112/80 A'beckett Street, Melbourne VIC 3000   
4   1210/81 A'beckett Street, Melbourne VIC 3000   

                               Cost  Bedrooms  Bathrooms  \
0                     $600 per week       1.0        1.0   
1                     $720 per week       2.0        2.0   
2  $850 Per Week ( Fully Furnished)       2.0        1.0   
3                     $700 per week       2.0        2.0   
4                       $650 weekly       2.0        1.0   

                  Coordinates Closest Gov Secondary School  \
0  [-37.8107551, 144.9570001]       University High School   
1  [-37.8136918, 144.9548583]       University High School   
2  [-37.8084101, 144.9607759]       University High School   
3  [-37.8089991, 144.9610792]       University High School   
4   [-37.8092536

In [17]:
# Write the combined per-property table to disk; index=False leaves the row
# index out of the CSV.
filtered_data.to_csv(path_or_buf="../../data/raw/individual_property.csv", index=False)