# DATA2001 Assignment - Team 3 🧠📊

**Group Name**: Data Princesses

**Team Members**: Michella Krishna, Yujia Liu, Katherine Kong

**Region Focus**: City and Inner South, North Sydney and Hornsby, Inner West  


## 📚 Table of Contents

1. [Project Overview](#project-overview)  
2. [Dataset Overview](#dataset-overview)  
3. [Data Preparation (All Regions)](#data-preparation)  
4. [SA4 Zone Analyses](#sa4-analyses)  
    4.1 [City and Inner South](#city-and-inner-south)  
    4.2 [North Sydney and Hornsby](#north-sydney-and-hornsby)        
    4.3 [Inner West](#inner-west)  
5. [Score Comparison Across SA4 Zones](#score-comparison)  
6. [Correlation with Income](#correlation-analysis)  
7. [Conclusion and Reflection](#conclusion)


## 🔎 Project Overview <a name="project-overview"></a>


## 🧪 Data Preparation (All Regions) <a name="data-preparation"></a>

In [65]:
import pandas as pd

# Load the dataset
businesses = pd.read_csv("Businesses.csv")

# Normalise column names: lowercase, with spaces and hyphens replaced by
# underscores. regex=False keeps both replacements literal.
businesses.columns = (
    businesses.columns
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace("-", "_", regex=False)
)

# Give the per-band business-count columns short names with a "b_" prefix
businesses = businesses.rename(columns={
    "0_to_50k_businesses": "b_0_50k",
    "50k_to_200k_businesses": "b_50k_200k",
    "200k_to_2m_businesses": "b_200k_2m",
    "2m_to_5m_businesses": "b_2m_5m",
    "5m_to_10m_businesses": "b_5m_10m",
    "10m_or_more_businesses": "b_10m_more"
})

# Preview cleaned column names
print("Cleaned columns:", businesses.columns.tolist())

# Confirm data types and nulls.
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
businesses.info()
print(businesses.isnull().sum())

# Export the cleaned dataset (index=False: do not write the row index)
businesses.to_csv("b_cleaned.csv", index=False)

# Last expression of the cell: display the first rows
businesses.head()

Cleaned columns: ['industry_code', 'industry_name', 'sa2_code', 'sa2_name', 'b_0_50k', 'b_50k_200k', 'b_200k_2m', 'b_2m_5m', 'b_5m_10m', 'b_10m_more', 'total_businesses']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12217 entries, 0 to 12216
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   industry_code     12217 non-null  object
 1   industry_name     12217 non-null  object
 2   sa2_code          12217 non-null  int64 
 3   sa2_name          12217 non-null  object
 4   b_0_50k           12217 non-null  int64 
 5   b_50k_200k        12217 non-null  int64 
 6   b_200k_2m         12217 non-null  int64 
 7   b_2m_5m           12217 non-null  int64 
 8   b_5m_10m          12217 non-null  int64 
 9   b_10m_more        12217 non-null  int64 
 10  total_businesses  12217 non-null  int64 
dtypes: int64(8), object(3)
memory usage: 1.0+ MB
None
industry_code       0
industry_name       0
sa2_code            0
s

Unnamed: 0,industry_code,industry_name,sa2_code,sa2_name,b_0_50k,b_50k_200k,b_200k_2m,b_2m_5m,b_5m_10m,b_10m_more,total_businesses
0,A,"Agriculture, Forestry and Fishing",101021007,Braidwood,136,92,63,4,0,0,296
1,A,"Agriculture, Forestry and Fishing",101021008,Karabar,6,3,0,0,0,0,9
2,A,"Agriculture, Forestry and Fishing",101021009,Queanbeyan,6,4,3,0,0,3,15
3,A,"Agriculture, Forestry and Fishing",101021010,Queanbeyan - East,0,3,0,0,0,0,3
4,A,"Agriculture, Forestry and Fishing",101021012,Queanbeyan West - Jerrabomberra,7,4,5,0,0,0,16


In [50]:
# Load the data
population = pd.read_csv("Population.csv")

# Clean column names: lowercase, then replace hyphens and spaces with
# underscores. regex=False makes both replacements literal, independent of
# the pandas version's default for `regex`.
population.columns = (
    population.columns
    .str.lower()
    .str.replace("-", "_", regex=False)
    .str.replace(" ", "_", regex=False)
)

# Show cleaned columns
print("Cleaned columns:", population.columns.tolist())

# Confirm data types and nulls.
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
population.info()
print(population.isnull().sum())

# Export cleaned dataset (index=False: do not write the row index)
population.to_csv("p_cleaned.csv", index=False)

# Last expression of the cell: display the first rows
population.head()

Cleaned columns: ['sa2_code', 'sa2_name', '0_4_people', '5_9_people', '10_14_people', '15_19_people', '20_24_people', '25_29_people', '30_34_people', '35_39_people', '40_44_people', '45_49_people', '50_54_people', '55_59_people', '60_64_people', '65_69_people', '70_74_people', '75_79_people', '80_84_people', '85_and_over_people', 'total_people']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373 entries, 0 to 372
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   sa2_code            373 non-null    int64 
 1   sa2_name            373 non-null    object
 2   0_4_people          373 non-null    int64 
 3   5_9_people          373 non-null    int64 
 4   10_14_people        373 non-null    int64 
 5   15_19_people        373 non-null    int64 
 6   20_24_people        373 non-null    int64 
 7   25_29_people        373 non-null    int64 
 8   30_34_people        373 non-null    int64 
 9   35_39_people  

Unnamed: 0,sa2_code,sa2_name,0_4_people,5_9_people,10_14_people,15_19_people,20_24_people,25_29_people,30_34_people,35_39_people,...,45_49_people,50_54_people,55_59_people,60_64_people,65_69_people,70_74_people,75_79_people,80_84_people,85_and_over_people,total_people
0,102011028,Avoca Beach - Copacabana,424,522,623,552,386,222,306,416,...,572,602,570,520,464,369,226,142,70,7530
1,102011029,Box Head - MacMasters Beach,511,666,702,592,461,347,420,535,...,749,749,794,895,863,925,603,331,264,11052
2,102011030,Calga - Kulnura,200,225,258,278,274,227,214,286,...,325,436,422,397,327,264,190,100,75,4748
3,102011031,Erina - Green Point,683,804,880,838,661,502,587,757,...,859,882,901,930,917,1065,976,773,1028,14803
4,102011032,Gosford - Springfield,1164,1044,1084,1072,1499,1864,1750,1520,...,1330,1241,1377,1285,1166,949,664,476,537,21346


In [46]:
# Load the income dataset
df = pd.read_csv('Income.csv')

# Normalise column names in a single chain: lowercase, then spaces and
# hyphens replaced by underscores. regex=False keeps the replacements
# literal, independent of the pandas version's default for `regex`.
df.columns = (
    df.columns
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('-', '_', regex=False)
)

# Drop every row that contains at least one null value (modifies df in place)
df.dropna(inplace=True)

# Last expression of the cell: display the first rows of the cleaned DataFrame
df.head()

Unnamed: 0,sa2_code21,sa2_name,earners,median_age,median_income,mean_income
0,101021007,Braidwood,2467,51,46640,68904
1,101021008,Karabar,5103,42,65564,69672
2,101021009,Queanbeyan,7028,39,63528,69174
3,101021010,Queanbeyan - East,3398,39,66148,74162
4,101021012,Queanbeyan West - Jerrabomberra,8422,44,78630,91981


In [55]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon, MultiPolygon
from geoalchemy2 import Geometry, WKTElement
import matplotlib.pyplot as plt

In [57]:
# Read the primary catchments shapefile into a GeoDataFrame.
# NOTE: the path points at a user-specific Desktop folder; adjust it as needed.
catchments_primary = gpd.read_file(
    "~/Desktop/DATA2001/DATA2001-A/catchments/catchments_primary.shp"
)

# Inspect the structure: print the column index, then display the first rows
print(catchments_primary.columns)
catchments_primary.head()

Index(['USE_ID', 'CATCH_TYPE', 'USE_DESC', 'ADD_DATE', 'KINDERGART', 'YEAR1',
       'YEAR2', 'YEAR3', 'YEAR4', 'YEAR5', 'YEAR6', 'YEAR7', 'YEAR8', 'YEAR9',
       'YEAR10', 'YEAR11', 'YEAR12', 'PRIORITY', 'geometry'],
      dtype='object')


Unnamed: 0,USE_ID,CATCH_TYPE,USE_DESC,ADD_DATE,KINDERGART,YEAR1,YEAR2,YEAR3,YEAR4,YEAR5,YEAR6,YEAR7,YEAR8,YEAR9,YEAR10,YEAR11,YEAR12,PRIORITY,geometry
0,2838,PRIMARY,Parklea PS,20181210,Y,Y,Y,Y,Y,Y,Y,N,N,N,N,N,N,,"POLYGON ((150.93564 -33.71612, 150.93715 -33.7..."
1,2404,PRIMARY,Lindfield EPS,20211219,Y,Y,Y,Y,Y,Y,Y,N,N,N,N,N,N,,"POLYGON ((151.18336 -33.74748, 151.18443 -33.7..."
2,4393,PRIMARY,Carlingford WPS,20220223,Y,Y,Y,Y,Y,Y,Y,N,N,N,N,N,N,,"POLYGON ((151.04518 -33.77303, 151.04526 -33.7..."
3,4615,PRIMARY,Caddies Ck PS,20181210,Y,Y,Y,Y,Y,Y,Y,N,N,N,N,N,N,,"POLYGON ((150.92567 -33.7296, 150.92602 -33.72..."
4,3918,PRIMARY,Killara PS,20211219,Y,Y,Y,Y,Y,Y,Y,N,N,N,N,N,N,,"POLYGON ((151.15379 -33.75586, 151.15404 -33.7..."


In [59]:
# Read the secondary catchments shapefile into a GeoDataFrame.
# NOTE: the path points at a user-specific Desktop folder; adjust it as needed.
catchments_secondary = gpd.read_file(
    "~/Desktop/DATA2001/DATA2001-A/catchments/catchments_secondary.shp"
)

# Inspect the structure: print the column index, then display the first rows
print(catchments_secondary.columns)
catchments_secondary.head()

Index(['USE_ID', 'CATCH_TYPE', 'USE_DESC', 'ADD_DATE', 'KINDERGART', 'YEAR1',
       'YEAR2', 'YEAR3', 'YEAR4', 'YEAR5', 'YEAR6', 'YEAR7', 'YEAR8', 'YEAR9',
       'YEAR10', 'YEAR11', 'YEAR12', 'PRIORITY', 'geometry'],
      dtype='object')


Unnamed: 0,USE_ID,CATCH_TYPE,USE_DESC,ADD_DATE,KINDERGART,YEAR1,YEAR2,YEAR3,YEAR4,YEAR5,YEAR6,YEAR7,YEAR8,YEAR9,YEAR10,YEAR11,YEAR12,PRIORITY,geometry
0,8503,HIGH_COED,Billabong HS,20200507,N,N,N,N,N,N,N,Y,Y,Y,Y,Y,Y,,"POLYGON ((146.67182 -35.31444, 146.6893 -35.31..."
1,8266,HIGH_COED,James Fallon HS,20200507,N,N,N,N,N,N,N,Y,Y,Y,Y,Y,Y,,"POLYGON ((147.08734 -35.86271, 147.10413 -35.8..."
2,8505,HIGH_COED,Murray HS,20200507,N,N,N,N,N,N,N,Y,Y,Y,Y,Y,Y,,"POLYGON ((146.81448 -35.78341, 146.8125 -35.79..."
3,8458,HIGH_COED,Kingswood HS,20201016,N,N,N,N,N,N,N,Y,Y,Y,Y,Y,Y,,"MULTIPOLYGON (((150.686 -33.74031, 150.68631 -..."
4,8559,HIGH_COED,Jamison HS,20201016,N,N,N,N,N,N,N,Y,Y,Y,Y,Y,Y,,"POLYGON ((150.69513 -33.75627, 150.68936 -33.7..."


In [61]:
# Read the future catchments shapefile into a GeoDataFrame.
# NOTE: the path points at a user-specific Desktop folder; adjust it as needed.
catchments_future = gpd.read_file(
    "~/Desktop/DATA2001/DATA2001-A/catchments/catchments_future.shp"
)

# Inspect the structure: print the column index, then display the first rows
print(catchments_future.columns)
catchments_future.head()

Index(['USE_ID', 'CATCH_TYPE', 'USE_DESC', 'ADD_DATE', 'KINDERGART', 'YEAR1',
       'YEAR2', 'YEAR3', 'YEAR4', 'YEAR5', 'YEAR6', 'YEAR7', 'YEAR8', 'YEAR9',
       'YEAR10', 'YEAR11', 'YEAR12', 'geometry'],
      dtype='object')


Unnamed: 0,USE_ID,CATCH_TYPE,USE_DESC,ADD_DATE,KINDERGART,YEAR1,YEAR2,YEAR3,YEAR4,YEAR5,YEAR6,YEAR7,YEAR8,YEAR9,YEAR10,YEAR11,YEAR12,geometry
0,8416,HIGH_COED,Ku-ring-gai HS,20230114,0,0,0,0,0,0,0,2024,2024,2024,2024,2024,2024,"POLYGON ((151.19849 -33.5399, 151.19945 -33.54..."
1,8161,HIGH_BOYS,Randwick BHS,20200220,0,0,0,0,0,0,0,2024,2024,2024,2024,2024,2024,"POLYGON ((151.27152 -33.91402, 151.27152 -33.9..."
2,8539,HIGH_COED,SSC Blackwattle Bay,20220609,0,0,0,0,0,0,0,0,0,0,0,2024,2024,"POLYGON ((151.15292 -33.83939, 151.16144 -33.8..."
3,8400,HIGH_COED,St Ives HS,20230114,0,0,0,0,0,0,0,2024,2024,2024,2024,2024,2024,"POLYGON ((151.17794 -33.6982, 151.17859 -33.69..."
4,8555,HIGH_COED,Rose Bay SC,20200220,0,0,0,0,0,0,0,2024,2024,2024,2024,2024,2024,"POLYGON ((151.28072 -33.83287, 151.28095 -33.8..."
