In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
from pyspark.sql import functions as F  #filtering
import geopandas as gpd
import folium
import pandas as pd
# starting a Spark session
spark = (
    SparkSession.builder.appName('PropertyFirstCleaning')
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

# read the parquet dataset
property = spark.read.parquet('../data/landing/property_data/property_table_01.parquet')

your 131072x1 screen size is bogus. expect trouble
24/09/12 14:52:36 WARN Utils: Your hostname, LAPTOP-KOQUIUN resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/09/12 14:52:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/12 14:52:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/09/12 14:52:38 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

In [2]:
# Get the number of rows and columns
row_count = property.count()
column_count = len(property.columns)

# Return the shape of the cleaned DataFrame
property_shape = (row_count, column_count)
property_shape

(432315, 10)

In [3]:
# Get the number of duplicate rows directly
duplicate_count = property.count() - property.dropDuplicates().count()
duplicate_count

                                                                                

0

In [4]:
# Get the number of NA values for each column
na_counts = property.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in property.columns])
na_counts.show()

+----+-----------+-------------+---------------------+--------+------+-----------------+-------------+---------------+------------------+
|Year|Year ending|Police Region|Local Government Area|Postcode|Suburb|Location Division|Property Item|Number of Items|Value of Items ($)|
+----+-----------+-------------+---------------------+--------+------+-----------------+-------------+---------------+------------------+
|   0|          0|            0|                    0|       0|     0|                0|            0|              0|                 0|
+----+-----------+-------------+---------------------+--------+------+-----------------+-------------+---------------+------------------+



As we can see, there are no duplicates and NULL values on the dataset.

In [5]:
property.limit(5)

Year,Year ending,Police Region,Local Government Area,Postcode,Suburb,Location Division,Property Item,Number of Items,Value of Items ($)
2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Car Accessories,14,2040.0
2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Cash/Document,42,51750.0
2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Clothing,9,2140.0
2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Electrical Applia...,24,13310.0
2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Food,1,200.0


In [6]:
from pyspark.sql import functions as F

# Create a filter condition to check if any column contains 'VICTORIA' or None
condition = F.lit(False)
for col in property.columns:
    condition = condition | F.lower(F.col(col)).isin(F.lower(F.lit('Victoria')), None)

# Apply the condition to filter the DataFrame
victoria_none_check = property.filter(condition)

victoria_none_check.show(1)

+----+-----------+-------------+---------------------+--------+--------+-----------------+---------------+---------------+------------------+
|Year|Year ending|Police Region|Local Government Area|Postcode|  Suburb|Location Division|  Property Item|Number of Items|Value of Items ($)|
+----+-----------+-------------+---------------------+--------+--------+-----------------+---------------+---------------+------------------+
|2024|      March|     Victoria|             Victoria|Victoria|Victoria|    1 Residential|Car Accessories|           6636|        1659629.97|
+----+-----------+-------------+---------------------+--------+--------+-----------------+---------------+---------------+------------------+
only showing top 1 row



In [7]:
# Apply the condition to filter the DataFrame and count the results
victoria_none_count = property.filter(condition).count()

# Show the count of rows that match the condition
print(f"Number of rows containing 'Victoria' or None: {victoria_none_count}")

Number of rows containing 'Victoria' or None: 989


                                                                                

In [8]:
victoria_none_count / row_count

0.00228768374911812

In [9]:
# Create a filter condition to check if any column contains 'Victoria' (case-insensitive)
condition = F.lit(False)

for col in property.columns:
    condition = condition | F.lower(F.col(col)).contains(F.lower(F.lit('Victoria')))

# Filter out rows where any column contains 'Victoria'
property = property.filter(~condition)

# Show the resulting DataFrame without rows containing 'Victoria'
property.show(1)

# Optionally, count the remaining rows
remaining_count = property.count()
print(f"Number of rows remaining after removing rows containing 'Victoria': {remaining_count}")

+----+-----------+------------------+---------------------+--------+-------+-----------------+---------------+---------------+------------------+
|Year|Year ending|     Police Region|Local Government Area|Postcode| Suburb|Location Division|  Property Item|Number of Items|Value of Items ($)|
+----+-----------+------------------+---------------------+--------+-------+-----------------+---------------+---------------+------------------+
|2024|      March|1 North West Metro|              Banyule|    3079|Ivanhoe|    1 Residential|Car Accessories|             14|            2040.0|
+----+-----------+------------------+---------------------+--------+-------+-----------------+---------------+---------------+------------------+
only showing top 1 row

Number of rows remaining after removing rows containing 'Victoria': 431294


                                                                                

In [10]:
# Filter for a specific suburb, e.g., 'Arcadia'
specific_suburb = "Ivanhoe"
suburb_check = property[property['Suburb'] == specific_suburb]

# Show the result
suburb_check

Year,Year ending,Police Region,Local Government Area,Postcode,Suburb,Location Division,Property Item,Number of Items,Value of Items ($)
2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Car Accessories,14,2040.0
2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Cash/Document,42,51750.0
2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Clothing,9,2140.0
2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Electrical Applia...,24,13310.0
2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Food,1,200.0
2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Furniture,1,1000.0
2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Garden Items,10,5253.0
2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Jewellery,79,140802.0
2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Personal Property,54,38369.0
2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Photographic Equip,3,700.0


In [11]:
# Clean and convert the 'Value of Items ($)' and 'Number of Items' columns to numeric
property = property.withColumn("Value of Items ($)", F.regexp_replace(F.col("Value of Items ($)"), "[$,]", "").cast("float"))
property = property.withColumn("Number of Items", F.col("Number of Items").cast("int"))

# Filter out instances where 'Value of Items ($)' is 0
filtered_property_df = property.filter(property["Value of Items ($)"] > 0)

# Calculate the sum of 'Value of Items ($)' and 'Number of Items' for each 'Postcode'
average_value_by_postcode = filtered_property_df.groupBy("Postcode").agg(
    (F.sum("Value of Items ($)") / F.sum("Number of Items")).alias("Average Value Per Item ($)")
)

# Convert to Pandas for easier viewing
average_value_by_postcode_summary = average_value_by_postcode.toPandas()

# Adjust the index to start from 1
average_value_by_postcode_summary.index = average_value_by_postcode_summary.index + 1

# Show the resulting summary
average_value_by_postcode_summary

                                                                                

Unnamed: 0,Postcode,Average Value Per Item ($)
1,3959,1102.127216
2,3414,2064.451777
3,3015,665.748732
4,3858,823.703448
5,3517,2851.723577
...,...,...
688,3419,794.301733
689,3093,1447.499927
690,3033,752.596557
691,3715,1341.936842


### Grouping

In [12]:
# Count the occurrences of each 'Property Item'
property_item_counts = property.groupBy("Property Item").count().orderBy("Property Item", ascending=True)

# Convert to Pandas for easier viewing (optional, if the dataset is small enough to fit in memory)
property_item_summary = property_item_counts.toPandas()

# Adjust the index to start from 1
property_item_summary.index = property_item_summary.index + 1

# Rename columns for better readability
property_item_summary.columns = ['Property Item', 'Count']

property_item_summary

                                                                                

Unnamed: 0,Property Item,Count
1,Car Accessories,31320
2,Cash/Document,30208
3,Cigarettes/Liquor,16660
4,Clothing,19486
5,Domestic Pets,4036
6,Electrical Appliances,31240
7,Explosives,70
8,Firearms/Ammunition,4067
9,Food,13790
10,Furniture,9936


## Shape File

In [13]:
# sf stands for shape file
sf = gpd.read_file("../data/landing/boundaries/Victoria/vic_dist_boundaries.shp")

sf.head()

Unnamed: 0,sa2_code,sa2_name,chg_flag,chg_lbl,sa3_code,sa3_name,sa4_code,sa4_name,gcc_code,gcc_name,ste_code,ste_name,aus_code,aus_name,areasqkm,loci_uri,geometry
0,201011001,Alfredton,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,52.7109,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((143.78281 -37.56667, 143.75557 -37.5..."
1,201011002,Ballarat,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,12.3787,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((143.81896 -37.55583, 143.81644 -37.5..."
2,201011005,Buninyong,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,51.5855,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((143.8417 -37.61597, 143.84175 -37.61..."
3,201011006,Delacombe,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,34.1607,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((143.75049 -37.5912, 143.75044 -37.59..."
4,201011007,Smythes Creek,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,104.7274,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((143.73295 -37.62334, 143.73262 -37.6..."


In [14]:
# For exact match search
exact_match = sf[sf['sa2_name'].str.lower() == 'ivanhoe east - eaglemont']
print(exact_match[['sa2_name', 'geometry']])

                     sa2_name  \
223  Ivanhoe East - Eaglemont   

                                              geometry  
223  POLYGON ((145.05176 -37.76678, 145.05188 -37.7...  


### Merge

In [15]:
# Convert the PySpark DataFrame to Pandas
property_pandas = property.toPandas()

# Perform the left join on 'suburb' from property and 'sa2_name' from sf
merged_df = property_pandas.merge(sf[['sa2_name', 'geometry']], 
                                  left_on='Suburb', right_on='sa2_name', 
                                  how='left')

# Convert the merged DataFrame back to a GeoDataFrame
merged_gdf = gpd.GeoDataFrame(merged_df, geometry='geometry')

# Display the merged GeoDataFrame
merged_gdf.head()

                                                                                

Unnamed: 0,Year,Year ending,Police Region,Local Government Area,Postcode,Suburb,Location Division,Property Item,Number of Items,Value of Items ($),sa2_name,geometry
0,2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Car Accessories,14,2040.0,Ivanhoe,"POLYGON ((145.02852 -37.76136, 145.02856 -37.7..."
1,2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Cash/Document,42,51750.0,Ivanhoe,"POLYGON ((145.02852 -37.76136, 145.02856 -37.7..."
2,2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Clothing,9,2140.0,Ivanhoe,"POLYGON ((145.02852 -37.76136, 145.02856 -37.7..."
3,2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Electrical Appliances,24,13310.0,Ivanhoe,"POLYGON ((145.02852 -37.76136, 145.02856 -37.7..."
4,2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Food,1,200.0,Ivanhoe,"POLYGON ((145.02852 -37.76136, 145.02856 -37.7..."


In [16]:
merged_gdf.length


  merged_gdf.length


0         0.160206
1         0.160206
2         0.160206
3         0.160206
4         0.160206
            ...   
431289         NaN
431290         NaN
431291         NaN
431292         NaN
431293         NaN
Length: 431294, dtype: float64

In [17]:
# Check if any rows have missing geometry (NaN values in the 'geometry' column)
missing_geometry = merged_gdf[merged_gdf['geometry'].isnull()]

# Count how many suburbs are missing geometry
missing_geometry_count = missing_geometry.shape[0]

# Show the first few rows of suburbs without geometry
# missing_geometry.head(20), missing_geometry_count
missing_geometry.head(5)

Unnamed: 0,Year,Year ending,Police Region,Local Government Area,Postcode,Suburb,Location Division,Property Item,Number of Items,Value of Items ($),sa2_name,geometry
45,2024,March,1 North West Metro,Banyule,3079,Ivanhoe East,1 Residential,Cash/Document,10,187.0,,
46,2024,March,1 North West Metro,Banyule,3079,Ivanhoe East,1 Residential,Clothing,2,450.0,,
47,2024,March,1 North West Metro,Banyule,3079,Ivanhoe East,1 Residential,Jewellery,3,5000.0,,
48,2024,March,1 North West Metro,Banyule,3079,Ivanhoe East,1 Residential,Other,8,2500.0,,
49,2024,March,1 North West Metro,Banyule,3079,Ivanhoe East,1 Residential,Power Tools,3,1350.0,,


In [18]:
# Check if any rows have missing geometry (NaN values in the 'geometry' column)
missing_geometry = merged_gdf[merged_gdf['geometry'].isnull()]

# Count how many suburbs are missing geometry
missing_geometry_count = missing_geometry.shape[0]

# Return the suburb names for the rows with missing geometry
missing_suburbs = missing_geometry['Suburb'].unique()

# Show the missing suburbs and the count of missing geometries
missing_suburbs, missing_geometry_count


(array(['Ivanhoe East', 'Bellfield', 'Heidelberg Heights', ...,
        'Swanwater', 'Tanwood', 'Chinangin'], dtype=object),
 297977)

In [19]:
# Step 4: Load the suburb_df Parquet file
suburb_df = pd.read_parquet('../data/landing/suburb_match/suburb_match.parquet')

# Filter the DataFrame to only include rows where the state is 'VIC'
suburb_df = suburb_df[suburb_df['state'] == 'VIC']

# Display the first 10 rows of the filtered DataFrame
suburb_df.head(1)

Unnamed: 0,id,postcode,locality,state,long,lat,dc,type,status,sa3,...,altitude,chargezone,phn_code,phn_name,lgaregion,lgacode,electorate,electoraterating,sed_code,sed_name
6202,4746,3000,MELBOURNE,VIC,144.982585,-37.814437,CITY DELIVERY CENTRE,Delivery Area,Updated 17-Mar-2024 AUSPOST,20604.0,...,27.332188,V1,PHN201,North Western Melbourne,Melbourne,24600.0,Melbourne,Inner Metropolitan,24703.0,Melbourne (Northern Metropolitan)


In [20]:
suburb_df.columns

Index(['id', 'postcode', 'locality', 'state', 'long', 'lat', 'dc', 'type',
       'status', 'sa3', 'sa3name', 'sa4', 'sa4name', 'region', 'Lat_precise',
       'Long_precise', 'SA1_CODE_2021', 'SA1_NAME_2021', 'SA2_CODE_2021',
       'SA2_NAME_2021', 'SA3_CODE_2021', 'SA3_NAME_2021', 'SA4_CODE_2021',
       'SA4_NAME_2021', 'RA_2011', 'RA_2016', 'RA_2021', 'RA_2021_NAME',
       'MMM_2015', 'MMM_2019', 'ced', 'altitude', 'chargezone', 'phn_code',
       'phn_name', 'lgaregion', 'lgacode', 'electorate', 'electoraterating',
       'sed_code', 'sed_name'],
      dtype='object')

In [38]:
# Example: Searching for an exact match of 'Ivanhoe East - Eaglemont' in the 'sa2_name_2021' column
exact_sa2_name = suburb_df[suburb_df['SA2_NAME_2021'] == 'Ivanhoe East - Eaglemont']
exact_sa2_name.head(20)

Unnamed: 0,id,postcode,locality,state,long,lat,dc,type,status,sa3,...,altitude,chargezone,phn_code,phn_name,lgaregion,lgacode,electorate,electoraterating,sed_code,sed_name
6393,4651,3079,Ivanhoe,VIC,145.048573,-37.772675,HEIDELBERG WEST DEL CENTRE,Delivery Area,Updated 6-Feb-2020,20901.0,...,,V1,PHN202,Eastern Melbourne,Banyule,20660.0,Jagajaga,Outer Metropolitan,23802.0,Ivanhoe (North-Eastern Metropolitan)
6394,4652,3079,Ivanhoe East,VIC,145.048573,-37.772675,HEIDELBERG WEST DEL CENTRE,Delivery Area,Updated 6-Feb-2020,20901.0,...,,V1,PHN202,Eastern Melbourne,Banyule,20660.0,Jagajaga,Outer Metropolitan,23802.0,Ivanhoe (North-Eastern Metropolitan)
6395,4653,3079,Ivanhoe North,VIC,145.048573,-37.772675,HEIDELBERG WEST DEL CENTRE,Delivery Area,Updated 6-Feb-2020,20901.0,...,,V1,PHN202,Eastern Melbourne,Banyule,20660.0,Jagajaga,Outer Metropolitan,23802.0,Ivanhoe (North-Eastern Metropolitan)


In [22]:
import pandas as pd

# Assuming suburb_df and missing_geometry are already loaded

# Clean the locality column in suburb_df for consistency
suburb_df['locality'] = suburb_df['locality'].str.strip().str.title()

# Create a dictionary to map 'locality' to 'SA2_NAME_2021'
locality_to_sa2_mapping = suburb_df.set_index('locality')['SA2_NAME_2021'].to_dict()

# Replace the 'Suburb' values in missing_geometry with corresponding 'SA2_NAME_2021'
missing_geometry['sa2_name'] = missing_geometry['Suburb'].map(locality_to_sa2_mapping)

# Count how many suburbs were matched
matched_count = missing_geometry['sa2_name'].notna().sum()
print(f"Missing geometry suburbs matched with SA2_NAME_2021: {matched_count}")

# Display the first few rows of missing_geometry after updating SA2 names
print(missing_geometry[['Year', 'Police Region', 'Local Government Area', 'Postcode', 'Suburb', 'sa2_name', 'geometry']].head())

# Optionally, you can create a new DataFrame with only matched rows
# missing_geometry_with_sa2 = missing_geometry.dropna(subset=['sa2_name'])

# Check for unmatched suburbs
unmatched_suburbs = missing_geometry[missing_geometry['sa2_name'].isna()]['Suburb'].unique()
print(f"\nNumber of unmatched suburbs: {len(unmatched_suburbs)}")
print("First few unmatched suburbs:", unmatched_suburbs)

Missing geometry suburbs matched with SA2_NAME_2021: 290965
    Year       Police Region Local Government Area Postcode        Suburb  \
45  2024  1 North West Metro               Banyule     3079  Ivanhoe East   
46  2024  1 North West Metro               Banyule     3079  Ivanhoe East   
47  2024  1 North West Metro               Banyule     3079  Ivanhoe East   
48  2024  1 North West Metro               Banyule     3079  Ivanhoe East   
49  2024  1 North West Metro               Banyule     3079  Ivanhoe East   

                    sa2_name geometry  
45  Ivanhoe East - Eaglemont     None  
46  Ivanhoe East - Eaglemont     None  
47  Ivanhoe East - Eaglemont     None  
48  Ivanhoe East - Eaglemont     None  
49  Ivanhoe East - Eaglemont     None  

Number of unmatched suburbs: 17
First few unmatched suburbs: ['Melbourne' 'Shepparton' 'Mansfield' 'Bandiana' 'McMahons Creek'
 'McKinnon' 'Dandenong' 'Dandenong South' 'McCrae' 'Bakery Hill'
 'McMillans' 'McKenzie Creek' 'Mildura' 'McK

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [23]:
# Clean the locality column in suburb_df for consistency
suburb_df['locality'] = suburb_df['locality'].str.strip().str.title()

# Create a dictionary to map 'locality' to 'SA2_NAME_2021'
locality_to_sa2_mapping = suburb_df.set_index('locality')['SA2_NAME_2021'].to_dict()

# Replace the 'Suburb' values in missing_geometry with corresponding 'SA2_NAME_2021'
missing_geometry['sa2_name'] = missing_geometry['Suburb'].map(locality_to_sa2_mapping)

# Count how many suburbs were matched initially
initial_matched_count = missing_geometry['sa2_name'].notna().sum()
print(f"Initially matched suburbs: {initial_matched_count}")

# Manual matching dictionary for unmatched suburbs
manual_matches = {
    'Melbourne': 'North Melbourne',
    'Shepparton': 'Shepparton Surrounds - East',
    'Bandiana': 'Wodonga',
    'McMahons Creek': 'Yarra Valley',
    'McKinnon': 'Bentleigh East - South',
    'Dandenong': 'Mount Dandenong - Olinda',
    'Dandenong South': 'Mount Dandenong - Olinda',
    'McCrae': 'Rosebud - McCrae',
    'Bakery Hill': 'Sebastopol - Redan',
    'McMillans': 'Gannawarra',
    'Spring Hill': 'Seymour Surrounds',
    'McKenzie Creek': 'Southern Grampians',
    'Inglewood': 'Loddon',
    'Mildura': 'Mildura - South',
    'McKenzie Hill': 'Castlemaine Surrounds',
    'Kingston': 'Castlemaine Surrounds',
    'McLoughlins Beach': 'Yarram',
    'McIntyre': 'Loddon',
    'Murray-sunset': 'Mildura Surrounds',
    'Mansfield': 'Mansfield (Vic.)'
    # Add more manual matches here as needed
}

# Apply manual matches
missing_geometry.loc[missing_geometry['sa2_name'].isna(), 'sa2_name'] = missing_geometry.loc[missing_geometry['sa2_name'].isna(), 'Suburb'].map(manual_matches)

# Final count of matched suburbs
final_matched_count = missing_geometry['sa2_name'].notna().sum()
print(f"Final count of matched suburbs: {final_matched_count}")

# Display the first few rows of missing_geometry after all matching
#print(missing_geometry[['Year', 'Police Region', 'Local Government Area', 'Postcode', 'Suburb', 'sa2_name', 'geometry']].head())

# Check for any remaining unmatched suburbs
unmatched_suburbs = missing_geometry[missing_geometry['sa2_name'].isna()]['Suburb'].unique()
print(f"\nNumber of remaining unmatched suburbs: {len(unmatched_suburbs)}")
if len(unmatched_suburbs) > 0:
    print("Remaining unmatched suburbs:", unmatched_suburbs)
else:
    print("All suburbs have been matched!")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Initially matched suburbs: 290965
Final count of matched suburbs: 297977

Number of remaining unmatched suburbs: 0
All suburbs have been matched!


In [24]:
missing_geometry.head(1)

Unnamed: 0,Year,Year ending,Police Region,Local Government Area,Postcode,Suburb,Location Division,Property Item,Number of Items,Value of Items ($),sa2_name,geometry
45,2024,March,1 North West Metro,Banyule,3079,Ivanhoe East,1 Residential,Cash/Document,10,187.0,Ivanhoe East - Eaglemont,


In [25]:
# Example: Searching for an exact match of 'Ivanhoe East - Eaglemont' in the 'sa2_name_2021' column
exact_sa2_name = missing_geometry[missing_geometry['sa2_name'] == 'Kensington (Vic.)']
exact_sa2_name.head(1)

Unnamed: 0,Year,Year ending,Police Region,Local Government Area,Postcode,Suburb,Location Division,Property Item,Number of Items,Value of Items ($),sa2_name,geometry
4994,2024,March,1 North West Metro,Melbourne,3031,Kensington,1 Residential,Car Accessories,13,2243.0,Kensington (Vic.),


In [26]:
'''# Step 1: Merge the dataframes
missing_merged_df = missing_geometry.merge(sf[['sa2_name', 'geometry']], on='sa2_name', how='left', suffixes=('', '_sf'))

# Step 2: Update the geometry column
missing_geometry['geometry'] = missing_merged_df['geometry_sf']

# Step 3: Verify the update
print("Number of null geometries:")
print(missing_geometry['geometry'].isnull().sum())

print("\nSample of updated geometries:")
print(missing_geometry['geometry'].head())

# Step 4: Check the columns in the final dataframe
print("\nColumns in the updated missing_geometry dataframe:")
print(missing_geometry.columns)

# Optional: If you want to remove the original geometry column (if it existed)
if 'geometry_x' in missing_geometry.columns:
    missing_geometry = missing_geometry.drop(columns=['geometry_x'])

print("\nFinal columns in missing_geometry:")
print(missing_geometry.columns)'''

'# Step 1: Merge the dataframes\nmissing_merged_df = missing_geometry.merge(sf[[\'sa2_name\', \'geometry\']], on=\'sa2_name\', how=\'left\', suffixes=(\'\', \'_sf\'))\n\n# Step 2: Update the geometry column\nmissing_geometry[\'geometry\'] = missing_merged_df[\'geometry_sf\']\n\n# Step 3: Verify the update\nprint("Number of null geometries:")\nprint(missing_geometry[\'geometry\'].isnull().sum())\n\nprint("\nSample of updated geometries:")\nprint(missing_geometry[\'geometry\'].head())\n\n# Step 4: Check the columns in the final dataframe\nprint("\nColumns in the updated missing_geometry dataframe:")\nprint(missing_geometry.columns)\n\n# Optional: If you want to remove the original geometry column (if it existed)\nif \'geometry_x\' in missing_geometry.columns:\n    missing_geometry = missing_geometry.drop(columns=[\'geometry_x\'])\n\nprint("\nFinal columns in missing_geometry:")\nprint(missing_geometry.columns)'

In [27]:
# Step 1: Merge the dataframes
missing_merged_df = missing_geometry.merge(sf[['sa2_name', 'geometry']], on='sa2_name', how='left', suffixes=('', '_sf'))

# Step 2: Update the geometry column
missing_geometry['geometry'] = missing_merged_df['geometry_sf']

# Step 3: Verify the update
print(missing_geometry['geometry'].isnull().sum())
print(missing_geometry['geometry'].head())

93212
45    POLYGON ((145.03287 -37.74091, 145.0328 -37.74...
46    POLYGON ((145.03287 -37.74091, 145.0328 -37.74...
47    POLYGON ((145.03287 -37.74091, 145.0328 -37.74...
48    POLYGON ((145.03287 -37.74091, 145.0328 -37.74...
49    POLYGON ((145.03287 -37.74091, 145.0328 -37.74...
Name: geometry, dtype: geometry


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [28]:
'''missing_merged_df.head()'''

'missing_merged_df.head()'

In [29]:
'''# Step 1: Merge the dataframes
missing_merged_df = missing_geometry.merge(sf[['sa2_name', 'geometry']], on='sa2_name', how='left', suffixes=('', '_sf'))

# Step 2: Update the geometry column
missing_geometry['geometry'] = missing_merged_df['geometry_sf']

# Step 3: Filter the unmatched instances
unmatched_instances = missing_geometry[missing_geometry['geometry'].isnull()]

# Step 4: Get the distinct sa2_name values for unmatched instances
distinct_unmatched_sa2_names = unmatched_instances['sa2_name'].unique()

# Show the distinct sa2_name values that were not matched with geometry
print("Distinct sa2_name values that are not matched with geometry:")
print(distinct_unmatched_sa2_names)'''

'# Step 1: Merge the dataframes\nmissing_merged_df = missing_geometry.merge(sf[[\'sa2_name\', \'geometry\']], on=\'sa2_name\', how=\'left\', suffixes=(\'\', \'_sf\'))\n\n# Step 2: Update the geometry column\nmissing_geometry[\'geometry\'] = missing_merged_df[\'geometry_sf\']\n\n# Step 3: Filter the unmatched instances\nunmatched_instances = missing_geometry[missing_geometry[\'geometry\'].isnull()]\n\n# Step 4: Get the distinct sa2_name values for unmatched instances\ndistinct_unmatched_sa2_names = unmatched_instances[\'sa2_name\'].unique()\n\n# Show the distinct sa2_name values that were not matched with geometry\nprint("Distinct sa2_name values that are not matched with geometry:")\nprint(distinct_unmatched_sa2_names)'

In [30]:
# Step 1: Create sets of sa2_names from both dataframes
missing_sa2names = set(missing_geometry['sa2_name'])
sf_sa2names = set(sf['sa2_name'])

# Step 2: Find sa2_names that are in missing_geometry but not in sf
unmatched_names = missing_sa2names - sf_sa2names

print(f"Number of unmatched sa2_names: {len(unmatched_names)}")

if len(unmatched_names) > 0:
    print("\nSample of unmatched sa2_names:")
    print(list(unmatched_names)[:10])

    # Step 3: Detailed comparison for unmatched names
    for name in list(unmatched_names)[:10]:
        close_matches = [sf_name for sf_name in sf_sa2names if name.lower() == sf_name.lower()]
        if close_matches:
            print(f"\nPossible match for '{name}':")
            print(close_matches)

# Step 4: Check for leading/trailing whitespace
whitespace_issues_missing = missing_geometry[missing_geometry['sa2_name'].str.strip() != missing_geometry['sa2_name']]
whitespace_issues_sf = sf[sf['sa2_name'].str.strip() != sf['sa2_name']]

print(f"\nNames with leading/trailing whitespace in missing_geometry: {len(whitespace_issues_missing)}")
print(f"Names with leading/trailing whitespace in sf: {len(whitespace_issues_sf)}")

# Step 5: Check for slight differences in names
def find_close_matches(name, name_list, max_distance=3):
    return [n for n in name_list if abs(len(n) - len(name)) <= max_distance and sum(c1 != c2 for c1, c2 in zip(name, n)) <= max_distance]

sample_unmatched = list(unmatched_names)[:10]
for name in sample_unmatched:
    close_matches = find_close_matches(name, sf_sa2names)
    if close_matches:
        print(f"\nPossible close matches for '{name}':")
        print(close_matches)

# Step 6: Check data types
print("\nData type of sa2_name in missing_geometry:", missing_geometry['sa2_name'].dtype)
print("Data type of sa2_name in sf:", sf['sa2_name'].dtype)

# Step 7: Perform the merge and check results
merged_df = missing_geometry.merge(sf[['sa2_name', 'geometry']], on='sa2_name', how='left', indicator=True)

print("\nMerge results:")
print(merged_df['_merge'].value_counts())

unmatched_after_merge = merged_df[merged_df['_merge'] == 'left_only']
print(f"\nNumber of unmatched rows after merge: {len(unmatched_after_merge)}")

if len(unmatched_after_merge) > 0:
    print("\nSample of unmatched sa2_names after merge:")
    print(unmatched_after_merge['sa2_name'].head())

Number of unmatched sa2_names: 0

Names with leading/trailing whitespace in missing_geometry: 0
Names with leading/trailing whitespace in sf: 0

Data type of sa2_name in missing_geometry: object
Data type of sa2_name in sf: object

Merge results:
_merge
both          297977
left_only          0
right_only         0
Name: count, dtype: int64

Number of unmatched rows after merge: 0


In [31]:
# Step 1: Merge the dataframes to include geometry (this was already done correctly)
missing_merged_df = missing_geometry.merge(sf[['sa2_name', 'geometry']], on='sa2_name', how='left', suffixes=('', '_sf'))

# Step 2: Update the geometry column
missing_geometry['geometry'] = missing_merged_df['geometry_sf']

# Step 3: Display the fully matched DataFrame (since there are no unmatched rows)
print("Fully matched DataFrame:")
missing_geometry.head(100)  # Display the first few rows of the fully matched DataFrame

# Optionally, save the fully matched DataFrame to a CSV for further inspection
# missing_geometry.to_csv("fully_matched_df.csv", index=False)


Fully matched DataFrame:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Unnamed: 0,Year,Year ending,Police Region,Local Government Area,Postcode,Suburb,Location Division,Property Item,Number of Items,Value of Items ($),sa2_name,geometry
45,2024,March,1 North West Metro,Banyule,3079,Ivanhoe East,1 Residential,Cash/Document,10,187.0,Ivanhoe East - Eaglemont,"POLYGON ((145.03287 -37.74091, 145.0328 -37.74..."
46,2024,March,1 North West Metro,Banyule,3079,Ivanhoe East,1 Residential,Clothing,2,450.0,Ivanhoe East - Eaglemont,"POLYGON ((145.03287 -37.74091, 145.0328 -37.74..."
47,2024,March,1 North West Metro,Banyule,3079,Ivanhoe East,1 Residential,Jewellery,3,5000.0,Ivanhoe East - Eaglemont,"POLYGON ((145.03287 -37.74091, 145.0328 -37.74..."
48,2024,March,1 North West Metro,Banyule,3079,Ivanhoe East,1 Residential,Other,8,2500.0,Ivanhoe East - Eaglemont,"POLYGON ((145.03287 -37.74091, 145.0328 -37.74..."
49,2024,March,1 North West Metro,Banyule,3079,Ivanhoe East,1 Residential,Power Tools,3,1350.0,Ivanhoe East - Eaglemont,"POLYGON ((145.03287 -37.74091, 145.0328 -37.74..."
...,...,...,...,...,...,...,...,...,...,...,...,...
185,2024,March,1 North West Metro,Banyule,3083,Bundoora,3 Other,Other,4,3054.0,Bundoora - West,"POLYGON ((145.06122 -37.726, 145.0614 -37.7249..."
186,2024,March,1 North West Metro,Banyule,3083,Bundoora,3 Other,Power Tools,1,300.0,Bundoora - West,"POLYGON ((145.06122 -37.726, 145.0614 -37.7249..."
187,2024,March,1 North West Metro,Banyule,3083,Bundoora,3 Other,Tv/Vcr,1,658.0,Bundoora - West,"POLYGON ((145.06122 -37.726, 145.0614 -37.7249..."
188,2024,March,1 North West Metro,Banyule,3083,Bundoora,All Locations,Car Accessories,43,1965.0,Bundoora - West,"POLYGON ((145.06122 -37.726, 145.0614 -37.7249..."


In [32]:
# Check if any rows have null geometry after the merge
null_geometry_count = missing_geometry['geometry'].isnull().sum()
print(f"Number of rows with null geometry: {null_geometry_count}")


Number of rows with null geometry: 93212


In [33]:
# Check the first few rows where geometry is null
null_geometry_rows = missing_geometry[missing_geometry['geometry'].isnull()]
null_geometry_rows.head()

Unnamed: 0,Year,Year ending,Police Region,Local Government Area,Postcode,Suburb,Location Division,Property Item,Number of Items,Value of Items ($),sa2_name,geometry
297977,2017,March,1 North West Metro,Hume,3059,Greenvale,2 Community,Clothing,6,402.0,Greenvale - Bulla,
297978,2017,March,1 North West Metro,Hume,3059,Greenvale,2 Community,Electrical Appliances,14,1705.0,Greenvale - Bulla,
297979,2017,March,1 North West Metro,Hume,3059,Greenvale,2 Community,Garden Items,1,5000.0,Greenvale - Bulla,
297980,2017,March,1 North West Metro,Hume,3059,Greenvale,2 Community,Other,9,6960.0,Greenvale - Bulla,
297981,2017,March,1 North West Metro,Hume,3059,Greenvale,2 Community,Personal Property,18,1848.0,Greenvale - Bulla,


In [34]:
# Example: Searching for an exact match of 'Ivanhoe East - Eaglemont' in the 'sa2_name_2021' column
exact_sa2_name = missing_geometry[missing_geometry['sa2_name'] == 'Greenvale - Bulla']
exact_sa2_name.head()

Unnamed: 0,Year,Year ending,Police Region,Local Government Area,Postcode,Suburb,Location Division,Property Item,Number of Items,Value of Items ($),sa2_name,geometry
3407,2024,March,1 North West Metro,Hume,3059,Greenvale,1 Residential,Car Accessories,20,1070.0,Greenvale - Bulla,"POLYGON ((144.57114 -37.67301, 144.57256 -37.6..."
3408,2024,March,1 North West Metro,Hume,3059,Greenvale,1 Residential,Cash/Document,18,204414.0,Greenvale - Bulla,"POLYGON ((144.5927 -37.85777, 144.59031 -37.85..."
3409,2024,March,1 North West Metro,Hume,3059,Greenvale,1 Residential,Cigarettes/Liquor,4,598.0,Greenvale - Bulla,"POLYGON ((144.5927 -37.85777, 144.59031 -37.85..."
3410,2024,March,1 North West Metro,Hume,3059,Greenvale,1 Residential,Clothing,13,11576.0,Greenvale - Bulla,"POLYGON ((144.5927 -37.85777, 144.59031 -37.85..."
3411,2024,March,1 North West Metro,Hume,3059,Greenvale,1 Residential,Electrical Appliances,40,60767.0,Greenvale - Bulla,"POLYGON ((144.5927 -37.85777, 144.59031 -37.85..."


In [35]:
# Get unique sa2_names from both dataframes
missing_sa2names = set(missing_geometry['sa2_name'])
sf_sa2names = set(sf['sa2_name'])

# Find sa2_names in missing_geometry but not in sf
only_in_missing = missing_sa2names - sf_sa2names

# Find sa2_names in sf but not in missing_geometry
only_in_sf = sf_sa2names - missing_sa2names

print("sa2_names in missing_geometry but not in sf:")
print(list(only_in_missing))
print("\nNumber of unmatched sa2_names in missing_geometry:", len(only_in_missing))

print("\nsa2_names in sf but not in missing_geometry:")
print(list(only_in_sf))
print("\nNumber of unmatched sa2_names in sf:", len(only_in_sf))

# Optional: Check for any sa2_names that are None or empty strings
none_or_empty_in_missing = [name for name in missing_geometry['sa2_name'] if pd.isna(name) or name == '']
none_or_empty_in_sf = [name for name in sf['sa2_name'] if pd.isna(name) or name == '']

print("\nNumber of None or empty sa2_names in missing_geometry:", len(none_or_empty_in_missing))
print("Number of None or empty sa2_names in sf:", len(none_or_empty_in_sf))

sa2_names in missing_geometry but not in sf:
[]

Number of unmatched sa2_names in missing_geometry: 0

sa2_names in sf but not in missing_geometry:
['Essendon - East', 'Alphington - Fairfield', 'French Island', 'Parkville', 'Clyde North - South', 'Ballarat North - Invermay', 'Charlemont', 'Eltham', 'Creswick - Clunes', 'Craigieburn - South', 'Colac', 'Greensborough', 'Canadian - Mount Clear', 'Ascot Vale', 'Yarraville', 'Ivanhoe', 'Springvale South', 'Leopold', 'Bendigo', 'Lynbrook - Lyndhurst', 'Carrum Downs', 'Melbourne CBD - East', 'South Yarra - North', 'Mooroopna', 'Craigieburn - North West', 'Noble Park North', 'Gowanbrae', 'Ballarat East - Warrenheip', 'Tarneit - North', 'Southbank (West) - South Wharf', 'Mill Park - North', 'Altona Meadows', 'Coburg - East', 'Lysterfield', 'Dandenong North', 'Mount Martha', 'Monbulk - Silvan', 'Forest Hill', 'Narre Warren North', 'Clyde North - North', 'Kew East', 'Maiden Gully', 'Berwick - North', 'Braybrook', 'Springvale', 'Horsham Surrounds'

In [36]:
# Check the number of rows in each dataframe
print(f"Number of rows in missing_geometry: {len(missing_geometry)}")
print(f"Number of rows in sf: {len(sf)}")
print(f"Number of rows in missing_merged_df: {len(missing_merged_df)}")

# Check if there are any rows where geometry is null in sf
null_geometries_sf = sf['geometry'].isnull().sum()
print(f"\nNumber of rows with null geometries in sf: {null_geometries_sf}")

# Check a sample of rows where geometry is null after the merge
print("\nSample of rows with null geometries after merge:")
print(missing_merged_df[missing_merged_df['geometry'].isnull()].head())

# Check if the merge created duplicate columns
print("\nColumns in missing_merged_df:")
print(missing_merged_df.columns)

# Check if there are any rows where geometry is null in sf
null_geometries = sf['geometry'].isnull().sum()
print(f"\nNumber of rows with null geometries in missing_geometry: {null_geometries}")

# If there's a 'geometry_sf' column, compare it with 'geometry'
if 'geometry_sf' in missing_merged_df.columns:
    null_geometries_sf_merged = missing_merged_df['geometry_sf'].isnull().sum()
    print(f"\nNumber of rows with null geometries in geometry_sf column: {null_geometries_sf_merged}")
    
    # Check if geometries are in geometry_sf instead of geometry
    print("\nSample of geometry_sf column:")
    print(missing_merged_df['geometry_sf'].head())

# Check if the geometries are being overwritten during the merge
if 'geometry' in missing_geometry.columns:
    print("\nNumber of non-null geometries in original missing_geometry:")
    print(missing_geometry['geometry'].notnull().sum())

# Perform a new merge with a different suffix for the geometry column
new_merged_df = missing_geometry.merge(sf[['sa2_name', 'geometry']], on='sa2_name', how='left', suffixes=('', '_new'))

print("\nNumber of null geometries in new merge:")
print(new_merged_df['geometry_new'].isnull().sum())

print("\nSample of new merged geometries:")
print(new_merged_df['geometry_new'].head())

Number of rows in missing_geometry: 297977
Number of rows in sf: 522
Number of rows in missing_merged_df: 297977

Number of rows with null geometries in sf: 0

Sample of rows with null geometries after merge:
        Year Year ending       Police Region Local Government Area Postcode  \
204765  2017       March  1 North West Metro                  Hume     3059   
204766  2017       March  1 North West Metro                  Hume     3059   
204767  2017       March  1 North West Metro                  Hume     3059   
204768  2017       March  1 North West Metro                  Hume     3059   
204769  2017       March  1 North West Metro                  Hume     3059   

           Suburb Location Division          Property Item  Number of Items  \
204765  Greenvale       2 Community               Clothing                6   
204766  Greenvale       2 Community  Electrical Appliances               14   
204767  Greenvale       2 Community           Garden Items                1   


In [37]:
import pandas as pd

# Step 1: Ensure 'sa2_name' is populated in missing_geometry from previous steps
print("Missing geometry columns:", missing_geometry.columns)
print("Sample rows from missing_geometry before merge:", missing_geometry.head())

# Step 2: Perform the merge operation, matching 'sa2_name' to get geometry from sf
final_missing_geometry_matched = missing_geometry.merge(
    sf[['sa2_name', 'geometry']], 
    on='sa2_name', 
    how='left'
)

# Step 3: Ensure 'geometry' is present after the merge
print("Columns after merge:", final_missing_geometry_matched.columns)
print("Sample rows after merge:", final_missing_geometry_matched.head())

# Step 4: Update geometry in missing_geometry DataFrame
missing_geometry['geometry'] = final_missing_geometry_matched['geometry']

# Final count of rows with geometry after the match
final_geometry_count = missing_geometry['geometry'].notna().sum()
print(f"\nFinal count of rows with geometry: {final_geometry_count}")


Missing geometry columns: Index(['Year', 'Year ending', 'Police Region', 'Local Government Area',
       'Postcode', 'Suburb', 'Location Division', 'Property Item',
       'Number of Items', 'Value of Items ($)', 'sa2_name', 'geometry'],
      dtype='object')
Sample rows from missing_geometry before merge:     Year Year ending       Police Region Local Government Area Postcode  \
45  2024       March  1 North West Metro               Banyule     3079   
46  2024       March  1 North West Metro               Banyule     3079   
47  2024       March  1 North West Metro               Banyule     3079   
48  2024       March  1 North West Metro               Banyule     3079   
49  2024       March  1 North West Metro               Banyule     3079   

          Suburb Location Division  Property Item  Number of Items  \
45  Ivanhoe East     1 Residential  Cash/Document               10   
46  Ivanhoe East     1 Residential       Clothing                2   
47  Ivanhoe East     1 Resident

KeyError: 'geometry'

In [31]:
import pandas as pd
import geopandas as gpd

# Assuming property, sf, and suburb_df are already loaded

# Convert the PySpark DataFrame to Pandas
property_pandas = property.toPandas()

# Step 1: Perform the initial merge on 'suburb' from property and 'sa2_name' from sf to get geometry
merged_gdf = property_pandas.merge(sf[['sa2_name', 'geometry']], 
                                   left_on='Suburb', right_on='sa2_name', 
                                   how='left')

# Convert the merged DataFrame to a GeoDataFrame
merged_gdf = gpd.GeoDataFrame(merged_gdf, geometry='geometry')

# Step 2: Check which rows are missing geometry
missing_geometry = merged_gdf[merged_gdf['geometry'].isnull()]

# Step 3: Clean the locality column in suburb_df and map missing geometry rows
suburb_df['locality'] = suburb_df['locality'].str.strip().str.title()

# Step 4: Create a dictionary to map 'locality' to 'SA2_NAME_2021'
locality_to_sa2_mapping = suburb_df.set_index('locality')['SA2_NAME_2021'].to_dict()

# Step 5: Map missing 'Suburb' values to their corresponding 'SA2_NAME_2021'
merged_gdf.loc[merged_gdf['sa2_name'].isna(), 'sa2_name'] = merged_gdf.loc[merged_gdf['sa2_name'].isna(), 'Suburb'].map(locality_to_sa2_mapping)

# Step 6: For the rows where 'sa2_name' was updated, match with the sf data to get geometry
rows_to_match = merged_gdf[merged_gdf['geometry'].isnull() & merged_gdf['sa2_name'].notna()]

# Step 7: Merge these rows with the sf to get the geometry based on the newly updated 'sa2_name'
final_merged_gdf = rows_to_match.merge(sf[['sa2_name', 'geometry']], 
                                       on='sa2_name', 
                                       how='left')

# Step 8: Update the geometry for rows that were matched
merged_gdf.update(final_merged_gdf)

# Step 9: Check for any remaining unmatched suburbs after the final matching
remaining_unmatched = merged_gdf[merged_gdf['geometry'].isnull()]['Suburb'].unique()

print(f"Remaining unmatched suburbs: {len(remaining_unmatched)}")
if len(remaining_unmatched) > 0:
    print("Unmatched suburbs after final matching:", remaining_unmatched)
else:
    print("All suburbs have been successfully matched with geometry.")

# Ensure the result is still a GeoDataFrame
merged_gdf = gpd.GeoDataFrame(merged_gdf, geometry='geometry')

# Display the final GeoDataFrame
print("\nFinal GeoDataFrame with geometry:")
print(merged_gdf.head())

  merged_gdf.update(final_merged_gdf)


Remaining unmatched suburbs: 2485
Unmatched suburbs after final matching: ['Heidelberg Heights' 'Bundoora' 'Eaglemont' ... 'McMillans' 'Callawadda'
 'Swanwater']

Final GeoDataFrame with geometry:
   Year Year ending       Police Region Local Government Area Postcode  \
0  2024       March  1 North West Metro               Banyule     3079   
1  2024       March  1 North West Metro               Banyule     3079   
2  2024       March  1 North West Metro               Banyule     3079   
3  2024       March  1 North West Metro               Banyule     3079   
4  2024       March  1 North West Metro               Banyule     3079   

         Suburb Location Division  Property Item  Number of Items  \
0  Ivanhoe East     1 Residential  Cash/Document               10   
1  Ivanhoe East     1 Residential       Clothing                2   
2  Ivanhoe East     1 Residential      Jewellery                3   
3  Ivanhoe East     1 Residential          Other                8   
4  Ivanhoe Ea

In [149]:
# Step 4: Replace the 'Suburb' values in missing_geometry with corresponding 'SA2_NAME_2021'
missing_geometry['sa2_name'] = missing_geometry['Suburb'].map(locality_to_sa2_mapping)

# Step 5: Check if any unmatched suburbs remain after the mapping
unmatched_suburbs = missing_geometry[missing_geometry['sa2_name'].isnull()]

# Check if there are any unmatched suburbs
if unmatched_suburbs.empty:
    print("All missing geometry suburbs have been successfully matched.")
else:
    print(f"There are {len(unmatched_suburbs)} suburbs that could not be matched with SA2_NAME_2021.")
    print("Unmatched suburbs:")
    print(unmatched_suburbs['Suburb'].unique())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


There are 6858 suburbs that could not be matched with SA2_NAME_2021.
Unmatched suburbs:
['Melbourne' 'Shepparton' 'Bandiana' 'McMahons Creek' 'McKinnon'
 'Dandenong' 'Dandenong South' 'McCrae' 'Bakery Hill' 'McMillans'
 'Spring Hill' 'McKenzie Creek' 'Inglewood' 'Mildura' 'McKenzie Hill'
 'Kingston' 'McLoughlins Beach' 'McIntyre' 'Murray-sunset']


In [150]:
# Step 5: Filter Victorian suburbs and clean 'locality' in suburb_df
suburb_df_vic = suburb_df[suburb_df['state'] == 'VIC'].copy()
suburb_df_vic['locality'] = suburb_df_vic['locality'].str.strip().str.upper()

# Step 6: Create a mapping from 'locality' to 'SA2_NAME_2021'
locality_to_sa2_mapping = suburb_df_vic.set_index('locality')['SA2_NAME_2021'].to_dict()

# Step 7: Replace unmatched suburbs with SA2_NAME_2021 using the mapping
property_pandas['Suburb'] = property_pandas['Suburb'].str.upper()
property_pandas['Suburb'] = property_pandas['Suburb'].replace(locality_to_sa2_mapping)

# Step 8: Perform the second merge to get geometry for the previously unmatched suburbs
second_merge_df = property_pandas.merge(
    sf[['sa2_name', 'geometry']],
    left_on='Suburb',
    right_on='sa2_name',
    how='left'
)

# Step 9: Combine the results of the first and second merges, focusing on specific columns
combined_df = pd.concat([merged_df[merged_df['geometry'].notnull()], second_merge_df], ignore_index=True)
final_df = combined_df[property_pandas.columns.tolist() + ['sa2_name', 'geometry']]

# Step 10: Identify remaining unmatched suburbs
remaining_unmatched_df = final_df[final_df['geometry'].isnull()]
remaining_unmatched_suburbs = remaining_unmatched_df['Suburb'].unique()

print(f"Remaining unmatched suburbs: {len(remaining_unmatched_suburbs)}")
print("Sample of remaining unmatched suburbs:", remaining_unmatched_suburbs[:5])

# Step 11: Convert to a GeoDataFrame with relevant columns
final_merged_gdf = gpd.GeoDataFrame(final_df, geometry='geometry', crs=sf.crs)

# Display the final GeoDataFrame
final_merged_gdf.head()

KeyboardInterrupt: 

In [56]:
# Specify the suburb you want to search for (e.g., 'Ivanhoe')
suburb_to_search = 'Ivanhoe East'

# Filter the final_merged_gdf for the specific suburb
filtered_gdf = final_merged_gdf[final_merged_gdf['Suburb'].str.contains(suburb_to_search, case=False, na=False)]

# Display the results
print(f"Results for '{suburb_to_search}':")
filtered_gdf


Results for 'Ivanhoe East':


Unnamed: 0,Year,Year ending,Police Region,Local Government Area,Postcode,Suburb,Location Division,Property Item,Number of Items,Value of Items ($),sa2_name_x,geometry
0,2024,March,1 North West Metro,Banyule,3079,Ivanhoe East - Eaglemont,1 Residential,Car Accessories,14,2040.0,Ivanhoe,"POLYGON ((145.05176 -37.76678, 145.05188 -37.7..."
1,2024,March,1 North West Metro,Banyule,3079,Ivanhoe East - Eaglemont,1 Residential,Cash/Document,42,51750.0,Ivanhoe,"POLYGON ((145.05176 -37.76678, 145.05188 -37.7..."
2,2024,March,1 North West Metro,Banyule,3079,Ivanhoe East - Eaglemont,1 Residential,Clothing,9,2140.0,Ivanhoe,"POLYGON ((145.05176 -37.76678, 145.05188 -37.7..."
3,2024,March,1 North West Metro,Banyule,3079,Ivanhoe East - Eaglemont,1 Residential,Electrical Appliances,24,13310.0,Ivanhoe,"POLYGON ((145.05176 -37.76678, 145.05188 -37.7..."
4,2024,March,1 North West Metro,Banyule,3079,Ivanhoe East - Eaglemont,1 Residential,Food,1,200.0,Ivanhoe,"POLYGON ((145.05176 -37.76678, 145.05188 -37.7..."
...,...,...,...,...,...,...,...,...,...,...,...,...
387518,2015,March,1 North West Metro,Banyule,3079,Ivanhoe East - Eaglemont,All Locations,Personal Property,24,9850.0,,"POLYGON ((145.05176 -37.76678, 145.05188 -37.7..."
387519,2015,March,1 North West Metro,Banyule,3079,Ivanhoe East - Eaglemont,All Locations,Power Tools,28,2380.0,,"POLYGON ((145.05176 -37.76678, 145.05188 -37.7..."
387625,2015,March,1 North West Metro,Banyule,3084,Ivanhoe East - Eaglemont,2 Community,Other,2,1160.0,,"POLYGON ((145.05176 -37.76678, 145.05188 -37.7..."
387626,2015,March,1 North West Metro,Banyule,3084,Ivanhoe East - Eaglemont,All Locations,Cash/Document,1,600.0,,"POLYGON ((145.05176 -37.76678, 145.05188 -37.7..."


In [41]:
final_merged_gdf.length


  final_merged_gdf.length


0         0.160206
1         0.160206
2         0.160206
3         0.160206
4         0.160206
            ...   
564606    5.388948
564607    5.526044
564608    5.388948
564609    5.388948
564610    8.821326
Length: 564611, dtype: float64

In [42]:
# Check for duplicates in sa2_name in the shapefile (sf)
duplicate_sa2 = sf[sf.duplicated(subset='sa2_name', keep=False)]
print(f"Duplicate SA2 names in shapefile: {duplicate_sa2.shape[0]}")

# Display duplicate SA2 names, if any
if duplicate_sa2.shape[0] > 0:
    print(duplicate_sa2['sa2_name'].unique())


Duplicate SA2 names in shapefile: 0


In [43]:
merged_gdf.head(20)

Unnamed: 0,Year,Year ending,Police Region,Local Government Area,Postcode,Suburb,Location Division,Property Item,Number of Items,Value of Items ($),sa2_name,geometry
0,2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Car Accessories,14,2040.0,Ivanhoe,"POLYGON ((145.02852 -37.76136, 145.02856 -37.7..."
1,2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Cash/Document,42,51750.0,Ivanhoe,"POLYGON ((145.02852 -37.76136, 145.02856 -37.7..."
2,2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Clothing,9,2140.0,Ivanhoe,"POLYGON ((145.02852 -37.76136, 145.02856 -37.7..."
3,2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Electrical Appliances,24,13310.0,Ivanhoe,"POLYGON ((145.02852 -37.76136, 145.02856 -37.7..."
4,2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Food,1,200.0,Ivanhoe,"POLYGON ((145.02852 -37.76136, 145.02856 -37.7..."
5,2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Furniture,1,1000.0,Ivanhoe,"POLYGON ((145.02852 -37.76136, 145.02856 -37.7..."
6,2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Garden Items,10,5253.0,Ivanhoe,"POLYGON ((145.02852 -37.76136, 145.02856 -37.7..."
7,2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Jewellery,79,140802.0,Ivanhoe,"POLYGON ((145.02852 -37.76136, 145.02856 -37.7..."
8,2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Personal Property,54,38369.0,Ivanhoe,"POLYGON ((145.02852 -37.76136, 145.02856 -37.7..."
9,2024,March,1 North West Metro,Banyule,3079,Ivanhoe,1 Residential,Photographic Equip,3,700.0,Ivanhoe,"POLYGON ((145.02852 -37.76136, 145.02856 -37.7..."


In [44]:
'''# Step 1: Convert PySpark DataFrame to Pandas for easier handling
property_df = property.toPandas()

# Step 2: Perform initial merge of property_df and sf based on 'Suburb' and 'sa2_name'
initial_merged_df = property_df.merge(sf[['sa2_name', 'geometry']], 
                                      left_on='Suburb', right_on='sa2_name', 
                                      how='left')

# Step 3: Identify distinct unmatched suburbs (those with missing geometry after the initial merge)
unmatched_suburbs_before = initial_merged_df[initial_merged_df['geometry'].isnull()]['Suburb'].unique()
print(f"Distinct unmatched suburbs before lookup: {len(unmatched_suburbs_before)}")
print("Unmatched suburbs before lookup:", unmatched_suburbs_before)

# Step 4: Load the suburb_df Parquet file
suburb_df = pd.read_parquet('../data/landing/suburb_match/suburb_match.parquet')

# Step 5: Filter only Victorian suburbs from suburb_df
suburb_df_vic = suburb_df[suburb_df['state'] == 'VIC'].copy()

# Step 6: Clean the locality data (in suburb_df) to standardize
suburb_df_vic['locality'] = suburb_df_vic['locality'].str.strip().str.title()

# Step 7: Create a dictionary to map 'locality' to 'SA2_NAME_2021'
locality_to_sa2_mapping = suburb_df_vic.set_index('locality')['SA2_NAME_2021'].to_dict()

# Step 8: Replace unmatched suburbs in the property DataFrame using the mapping
property_df['Suburb'] = property_df['Suburb'].replace(locality_to_sa2_mapping)

# Step 9: Perform a second merge with the sf DataFrame to find geometry for the renamed suburbs
final_merged_df = property_df.merge(sf[['sa2_name', 'geometry']], 
                                    left_on='Suburb', right_on='sa2_name', 
                                    how='left')

# Step 10: Identify distinct remaining unmatched suburbs (those with missing geometry)
unmatched_suburbs_after = final_merged_df[final_merged_df['geometry'].isnull()]['Suburb'].unique()

# Step 11: Output results
print(f"Distinct unmatched suburbs after lookup: {len(unmatched_suburbs_after)}")
if len(unmatched_suburbs_after) > 0:
    print("Remaining unmatched suburbs:", unmatched_suburbs_after)
else:
    print("All suburbs have been successfully matched with geometry.")

# Step 12: The final merged DataFrame with geometry
# final_merged_df.head()'''

'# Step 1: Convert PySpark DataFrame to Pandas for easier handling\nproperty_df = property.toPandas()\n\n# Step 2: Perform initial merge of property_df and sf based on \'Suburb\' and \'sa2_name\'\ninitial_merged_df = property_df.merge(sf[[\'sa2_name\', \'geometry\']], \n                                      left_on=\'Suburb\', right_on=\'sa2_name\', \n                                      how=\'left\')\n\n# Step 3: Identify distinct unmatched suburbs (those with missing geometry after the initial merge)\nunmatched_suburbs_before = initial_merged_df[initial_merged_df[\'geometry\'].isnull()][\'Suburb\'].unique()\nprint(f"Distinct unmatched suburbs before lookup: {len(unmatched_suburbs_before)}")\nprint("Unmatched suburbs before lookup:", unmatched_suburbs_before)\n\n# Step 4: Load the suburb_df Parquet file\nsuburb_df = pd.read_parquet(\'../data/landing/suburb_match/suburb_match.parquet\')\n\n# Step 5: Filter only Victorian suburbs from suburb_df\nsuburb_df_vic = suburb_df[suburb_df[\'st

In [45]:
# Step 1: Convert PySpark DataFrame to Pandas for easier handling
property_df = property.toPandas()

# Step 2: Ensure suburb names in property_df are in uppercase for consistent matching
property_df['Suburb'] = property_df['Suburb'].str.upper()

# Step 3: Perform initial merge of property_df and sf based on 'Suburb' and 'sa2_name'
initial_merged_df = property_df.merge(sf[['sa2_name', 'geometry']], 
                                      left_on='Suburb', right_on='sa2_name', 
                                      how='left')

# Step 4: Identify distinct unmatched suburbs (those with missing geometry after the initial merge)
unmatched_suburbs_before = initial_merged_df[initial_merged_df['geometry'].isnull()]['Suburb'].unique()
print(f"Distinct unmatched suburbs before lookup: {len(unmatched_suburbs_before)}")
print("Unmatched suburbs before lookup:", unmatched_suburbs_before)

# Step 5: Load the suburb_df Parquet file
suburb_df = pd.read_parquet('../data/landing/suburb_match/suburb_match.parquet')

# Step 6: Filter only Victorian suburbs from suburb_df and ensure locality names are uppercase
suburb_df_vic = suburb_df[suburb_df['state'] == 'VIC'].copy()
suburb_df_vic['locality'] = suburb_df_vic['locality'].str.upper()

# Step 7: Create a dictionary to map 'locality' to 'SA2_NAME_2021'
locality_to_sa2_mapping = suburb_df_vic.set_index('locality')['SA2_NAME_2021'].to_dict()

# Step 8: Replace unmatched suburbs in the property DataFrame using the mapping
property_df['Suburb'] = property_df['Suburb'].replace(locality_to_sa2_mapping)

# Step 9: Perform a second merge with the sf DataFrame to find geometry for the renamed suburbs
final_merged_df = property_df.merge(sf[['sa2_name', 'geometry']], 
                                    left_on='Suburb', right_on='sa2_name', 
                                    how='left')

# Step 10: Identify distinct remaining unmatched suburbs (those with missing geometry)
unmatched_suburbs_after = final_merged_df[final_merged_df['geometry'].isnull()]['Suburb'].unique()

# Step 11: Output results
print(f"Distinct unmatched suburbs after lookup: {len(unmatched_suburbs_after)}")
if len(unmatched_suburbs_after) > 0:
    print("Remaining unmatched suburbs:", unmatched_suburbs_after)
else:
    print("All suburbs have been successfully matched with geometry.")

# Step 12: The final merged DataFrame with geometry
final_merged_df.head()

                                                                                

Distinct unmatched suburbs before lookup: 2748
Unmatched suburbs before lookup: ['IVANHOE' 'IVANHOE EAST' 'BELLFIELD' ... 'SWANWATER' 'TANWOOD'
 'CHINANGIN']
Distinct unmatched suburbs after lookup: 1
Remaining unmatched suburbs: [None]


Unnamed: 0,Year,Year ending,Police Region,Local Government Area,Postcode,Suburb,Location Division,Property Item,Number of Items,Value of Items ($),sa2_name,geometry
0,2024,March,1 North West Metro,Banyule,3079,Ivanhoe East - Eaglemont,1 Residential,Car Accessories,14,2040.0,Ivanhoe East - Eaglemont,"POLYGON ((145.05176 -37.76678, 145.05188 -37.7..."
1,2024,March,1 North West Metro,Banyule,3079,Ivanhoe East - Eaglemont,1 Residential,Cash/Document,42,51750.0,Ivanhoe East - Eaglemont,"POLYGON ((145.05176 -37.76678, 145.05188 -37.7..."
2,2024,March,1 North West Metro,Banyule,3079,Ivanhoe East - Eaglemont,1 Residential,Clothing,9,2140.0,Ivanhoe East - Eaglemont,"POLYGON ((145.05176 -37.76678, 145.05188 -37.7..."
3,2024,March,1 North West Metro,Banyule,3079,Ivanhoe East - Eaglemont,1 Residential,Electrical Appliances,24,13310.0,Ivanhoe East - Eaglemont,"POLYGON ((145.05176 -37.76678, 145.05188 -37.7..."
4,2024,March,1 North West Metro,Banyule,3079,Ivanhoe East - Eaglemont,1 Residential,Food,1,200.0,Ivanhoe East - Eaglemont,"POLYGON ((145.05176 -37.76678, 145.05188 -37.7..."


In [46]:
# Filter for a specific suburb, e.g., 'Arcadia'
specific_suburb = "Ivanhoe"
suburb_check = final_merged_df[final_merged_df['sa2_name'] == specific_suburb]

# Show the result
suburb_check

Unnamed: 0,Year,Year ending,Police Region,Local Government Area,Postcode,Suburb,Location Division,Property Item,Number of Items,Value of Items ($),sa2_name,geometry


In [47]:
'''# Step 1: Convert PySpark DataFrame to Pandas for easier handling
property_df = property.toPandas()

# Step 2: Perform initial merge of property_df and sf based on 'Suburb' and 'sa2_name'
initial_merged_df = property_df.merge(sf[['sa2_name', 'geometry']], 
                                      left_on='Suburb', right_on='sa2_name', 
                                      how='left')

# Step 3: Identify unmatched suburbs (those with missing geometry after the initial merge)
unmatched_suburbs_before = initial_merged_df[initial_merged_df['geometry'].isnull()]['Suburb'].unique()
print(f"Unmatched suburbs before lookup: {len(unmatched_suburbs_before)}")

# Step 4: Load the suburb_df Parquet file
suburb_df = pd.read_parquet('../data/landing/suburb_match/suburb_match.parquet')

# Step 5: Filter only Victorian suburbs from suburb_df
suburb_df_vic = suburb_df[suburb_df['state'] == 'VIC'].copy()

# Step 6: Clean the locality data (in suburb_df) to standardize
suburb_df_vic['locality'] = suburb_df_vic['locality'].str.strip().str.title()

# Step 7: Create a dictionary to map 'locality' to 'SA2_NAME_2021'
locality_to_sa2_mapping = suburb_df_vic.set_index('locality')['SA2_NAME_2021'].to_dict()

# Step 8: Replace unmatched suburbs in the property DataFrame using the mapping
property_df['Suburb'] = property_df['Suburb'].replace(locality_to_sa2_mapping)

# Step 9: Perform a second merge with the sf DataFrame to find geometry for the renamed suburbs
final_merged_df = property_df.merge(sf[['sa2_name', 'geometry']], 
                                    left_on='Suburb', right_on='sa2_name', 
                                    how='left')

# Step 10: Identify any remaining unmatched suburbs (those with missing geometry)
unmatched_suburbs_after = final_merged_df[final_merged_df['geometry'].isnull()]['Suburb'].unique()

# Step 11: Output results
print(f"Unmatched suburbs after lookup: {len(unmatched_suburbs_after)}")
if len(unmatched_suburbs_after) > 0:
    print("Remaining unmatched suburbs:", unmatched_suburbs_after)
else:
    print("All suburbs have been successfully matched with geometry.")

# Step 12: The final merged DataFrame with geometry
final_merged_df.head()'''

'# Step 1: Convert PySpark DataFrame to Pandas for easier handling\nproperty_df = property.toPandas()\n\n# Step 2: Perform initial merge of property_df and sf based on \'Suburb\' and \'sa2_name\'\ninitial_merged_df = property_df.merge(sf[[\'sa2_name\', \'geometry\']], \n                                      left_on=\'Suburb\', right_on=\'sa2_name\', \n                                      how=\'left\')\n\n# Step 3: Identify unmatched suburbs (those with missing geometry after the initial merge)\nunmatched_suburbs_before = initial_merged_df[initial_merged_df[\'geometry\'].isnull()][\'Suburb\'].unique()\nprint(f"Unmatched suburbs before lookup: {len(unmatched_suburbs_before)}")\n\n# Step 4: Load the suburb_df Parquet file\nsuburb_df = pd.read_parquet(\'../data/landing/suburb_match/suburb_match.parquet\')\n\n# Step 5: Filter only Victorian suburbs from suburb_df\nsuburb_df_vic = suburb_df[suburb_df[\'state\'] == \'VIC\'].copy()\n\n# Step 6: Clean the locality data (in suburb_df) to standa

In [48]:
import pandas as pd

# Load the CSV file containing Locality and SA2 NAME 2021
suburb_to_sa2_df = pd.read_csv('../data/landing/suburb_match/suburb_match.csv')

suburb_to_sa2_df.head()

Unnamed: 0,id,postcode,locality,state,long,lat,dc,type,status,sa3,...,altitude,chargezone,phn_code,phn_name,lgaregion,lgacode,electorate,electoraterating,sed_code,sed_name
0,230,200,ANU,ACT,149.119,-35.2777,,,Updated 3-Dec-2022,,...,,N2,,,Unincorporated ACT,89399.0,Durack,,,
1,21820,200,Australian National University,ACT,149.1189,-35.2777,,,Updated 3-Dec-2022,,...,,N2,,,Unincorporated ACT,89399.0,Durack,,,
2,232,800,DARWIN,NT,130.83668,-12.458684,,,Updated 3-Dec-2022,70101.0,...,,NT1,PHN701,Northern Territory,Darwin Waterfront Precinct,71150.0,Solomon,Inner Metropolitan,70022.0,Port Darwin
3,24049,800,DARWIN CITY,NT,130.83668,-12.458684,,,Updated 3-Dec-2022,70101.0,...,,NT1,PHN701,Northern Territory,Darwin Waterfront Precinct,71150.0,Solomon,Inner Metropolitan,70022.0,Port Darwin
4,233,801,DARWIN,NT,130.83668,-12.458684,,,Updated 3-Dec-2022,70101.0,...,,NT1,PHN701,,Darwin,71000.0,Lingiari,Rural,,


In [49]:
suburb_to_sa2_df.columns

Index(['id', 'postcode', 'locality', 'state', 'long', 'lat', 'dc', 'type',
       'status', 'sa3', 'sa3name', 'sa4', 'sa4name', 'region', 'Lat_precise',
       'Long_precise', 'SA1_CODE_2021', 'SA1_NAME_2021', 'SA2_CODE_2021',
       'SA2_NAME_2021', 'SA3_CODE_2021', 'SA3_NAME_2021', 'SA4_CODE_2021',
       'SA4_NAME_2021', 'RA_2011', 'RA_2016', 'RA_2021', 'RA_2021_NAME',
       'MMM_2015', 'MMM_2019', 'ced', 'altitude', 'chargezone', 'phn_code',
       'phn_name', 'lgaregion', 'lgacode', 'electorate', 'electoraterating',
       'sed_code', 'sed_name'],
      dtype='object')

In [50]:
# 1. Identify unmatched suburbs with missing geometry
unmatched_suburbs = merged_gdf[merged_gdf['geometry'].isnull()]['Suburb'].unique()

# Convert to Parquet format
suburb_to_sa2_df.to_parquet('../data/landing/suburb_match/suburb_match.parquet')

# Load the Parquet file containing Locality and SA2 NAME 2021
suburb_to_sa2_df = pd.read_parquet('../data/landing/suburb_match/suburb_match.parquet')

# Assuming there is a column in the Parquet file that specifies the state, e.g., 'state'
suburb_to_sa2_df_vic = suburb_to_sa2_df[suburb_to_sa2_df['state'] == 'VIC']

# Clean the suburb data
suburb_to_sa2_df_vic['locality'] = suburb_to_sa2_df_vic['locality'].str.strip().str.title()
suburb_to_sa2_df_vic.drop_duplicates(subset=['locality'], inplace=True)

# 3. Map unmatched suburbs to SA2 names using the Parquet data
locality_to_sa2_mapping = suburb_to_sa2_df_vic.set_index('locality')['SA2_NAME_2021'].to_dict()

# Replace unmatched suburbs in the merged_gdf with corresponding SA2 namesfff 
merged_gdf['Suburb'] = merged_gdf['Suburb'].replace(locality_to_sa2_mapping)

# 5. Use the SA2 names to find the corresponding geometry from the shapefile
# Merge based on the SA2 names (after the substitution)
final_merged_gdf = merged_gdf.merge(sf[['sa2_name', 'geometry']], 
                                    left_on='Suburb', right_on='sa2_name', 
                                    how='left')

# Check the updated GeoDataFrame to ensure the geometries are updated
final_merged_gdf.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  suburb_to_sa2_df_vic['locality'] = suburb_to_sa2_df_vic['locality'].str.strip().str.title()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  suburb_to_sa2_df_vic.drop_duplicates(subset=['locality'], inplace=True)


Unnamed: 0,Year,Year ending,Police Region,Local Government Area,Postcode,Suburb,Location Division,Property Item,Number of Items,Value of Items ($),sa2_name_x,geometry_x,sa2_name_y,geometry_y
0,2024,March,1 North West Metro,Banyule,3079,Ivanhoe East - Eaglemont,1 Residential,Car Accessories,14,2040.0,Ivanhoe,"POLYGON ((145.02852 -37.76136, 145.02856 -37.7...",Ivanhoe East - Eaglemont,"POLYGON ((145.05176 -37.76678, 145.05188 -37.7..."
1,2024,March,1 North West Metro,Banyule,3079,Ivanhoe East - Eaglemont,1 Residential,Cash/Document,42,51750.0,Ivanhoe,"POLYGON ((145.02852 -37.76136, 145.02856 -37.7...",Ivanhoe East - Eaglemont,"POLYGON ((145.05176 -37.76678, 145.05188 -37.7..."
2,2024,March,1 North West Metro,Banyule,3079,Ivanhoe East - Eaglemont,1 Residential,Clothing,9,2140.0,Ivanhoe,"POLYGON ((145.02852 -37.76136, 145.02856 -37.7...",Ivanhoe East - Eaglemont,"POLYGON ((145.05176 -37.76678, 145.05188 -37.7..."
3,2024,March,1 North West Metro,Banyule,3079,Ivanhoe East - Eaglemont,1 Residential,Electrical Appliances,24,13310.0,Ivanhoe,"POLYGON ((145.02852 -37.76136, 145.02856 -37.7...",Ivanhoe East - Eaglemont,"POLYGON ((145.05176 -37.76678, 145.05188 -37.7..."
4,2024,March,1 North West Metro,Banyule,3079,Ivanhoe East - Eaglemont,1 Residential,Food,1,200.0,Ivanhoe,"POLYGON ((145.02852 -37.76136, 145.02856 -37.7...",Ivanhoe East - Eaglemont,"POLYGON ((145.05176 -37.76678, 145.05188 -37.7..."


In [51]:
# Perform the merge, and rename the columns properly to avoid confusion
final_merged_gdf = merged_gdf.merge(sf[['sa2_name', 'geometry']], 
                                    left_on='Suburb', right_on='sa2_name', 
                                    how='left')

# Drop the redundant geometry_x and rename geometry_y to geometry
final_merged_gdf = final_merged_gdf.drop(columns=['geometry_x', 'sa2_name_y'])
final_merged_gdf = final_merged_gdf.rename(columns={'geometry_y': 'geometry'})

# Now, check the updated GeoDataFrame to ensure the geometries are updated
print("Columns in final_merged_gdf after cleaning:", final_merged_gdf.columns)

# 6. Identify remaining unmatched suburbs with missing geometry after the merge
unmatched_suburbs_after = final_merged_gdf[final_merged_gdf['geometry'].isnull()]['Suburb'].unique()
print(f"Unmatched suburbs after SA2 mapping: {len(unmatched_suburbs_after)}")

# Print unmatched suburbs, if any
if len(unmatched_suburbs_after) > 0:
    print("Remaining unmatched suburbs:")
    print(unmatched_suburbs_after)
else:
    print("All unmatched suburbs have been successfully mapped.")

Columns in final_merged_gdf after cleaning: Index(['Year', 'Year ending', 'Police Region', 'Local Government Area',
       'Postcode', 'Suburb', 'Location Division', 'Property Item',
       'Number of Items', 'Value of Items ($)', 'sa2_name_x', 'geometry'],
      dtype='object')
Unmatched suburbs after SA2 mapping: 10
Remaining unmatched suburbs:
[None 'McMahons Creek' 'McKinnon' 'McCrae' 'McMillans' 'McKenzie Creek'
 'McKenzie Hill' 'McLoughlins Beach' 'McIntyre' 'Murray-sunset']
