# DATA2001 Assignment - Team 3 🧠📊

**Group Name**: Data Princesses

**Team Members**: Michella Krishna, Yujia Liu, Katherine Kong

**Region Focus**: City and Inner South, North Sydney and Hornsby, Inner West  


## 📚 Table of Contents

1. [Project Overview](#project-overview)  
2. [Dataset Overview](#dataset-overview)  
3. [Data Preparation (All Regions)](#data-preparation)  
4. [SA4 Zone Analyses](#sa4-analyses)  
    4.1 [City and Inner South](#city-and-inner-south)  
    4.2 [North Sydney and Hornsby](#north-sydney-and-hornsby)        
    4.3 [Inner West](#inner-west)  
6. [Score Comparison Across SA4 Zones](#score-comparison)  
7. [Correlation with Income](#correlation-analysis)  
8. [Conclusion and Reflection](#conclusion)


## 🔎 Project Overview <a name="project-overview"></a>


## 🧪 Data Preparation (All Regions)

In [5]:
import pandas as pd

# Load the dataset
businesses = pd.read_csv("Businesses.csv")

# Clean column names: lowercase and replace special characters
businesses.columns = (
    businesses.columns
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace("-", "_", regex=False)
)

businesses = businesses.rename(columns={
    "0_to_50k_businesses": "b_0_50k",
    "50k_to_200k_businesses": "b_50k_200k",
    "200k_to_2m_businesses": "b_200k_2m",
    "2m_to_5m_businesses": "b_2m_5m",
    "5m_to_10m_businesses": "b_5m_10m",
    "10m_or_more_businesses": "b_10m_more"
})

# Preview cleaned column names
print("Cleaned columns:", businesses.columns.tolist())

# Confirm data types and nulls (should be fine)
print(businesses.info())
print(businesses.isnull().sum())

# Export the cleaned dataset
businesses.to_csv("b_cleaned.csv", index=False)

businesses.head()

Cleaned columns: ['industry_code', 'industry_name', 'sa2_code', 'sa2_name', 'b_0_50k', 'b_50k_200k', 'b_200k_2m', 'b_2m_5m', 'b_5m_10m', 'b_10m_more', 'total_businesses']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12217 entries, 0 to 12216
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   industry_code     12217 non-null  object
 1   industry_name     12217 non-null  object
 2   sa2_code          12217 non-null  int64 
 3   sa2_name          12217 non-null  object
 4   b_0_50k           12217 non-null  int64 
 5   b_50k_200k        12217 non-null  int64 
 6   b_200k_2m         12217 non-null  int64 
 7   b_2m_5m           12217 non-null  int64 
 8   b_5m_10m          12217 non-null  int64 
 9   b_10m_more        12217 non-null  int64 
 10  total_businesses  12217 non-null  int64 
dtypes: int64(8), object(3)
memory usage: 1.0+ MB
None
industry_code       0
industry_name       0
sa2_code            0
s

Unnamed: 0,industry_code,industry_name,sa2_code,sa2_name,b_0_50k,b_50k_200k,b_200k_2m,b_2m_5m,b_5m_10m,b_10m_more,total_businesses
0,A,"Agriculture, Forestry and Fishing",101021007,Braidwood,136,92,63,4,0,0,296
1,A,"Agriculture, Forestry and Fishing",101021008,Karabar,6,3,0,0,0,0,9
2,A,"Agriculture, Forestry and Fishing",101021009,Queanbeyan,6,4,3,0,0,3,15
3,A,"Agriculture, Forestry and Fishing",101021010,Queanbeyan - East,0,3,0,0,0,0,3
4,A,"Agriculture, Forestry and Fishing",101021012,Queanbeyan West - Jerrabomberra,7,4,5,0,0,0,16


In [6]:
# Load the data
population = pd.read_csv("Population.csv")

# Clean column names: lowercase, replace hyphens and spaces with underscores
population.columns = (
    population.columns
    .str.lower()
    .str.replace("-", "_")
    .str.replace(" ", "_")
)

# Define new column names
new_columns = {
    'sa2_code': 'sa2_code',
    'sa2_name': 'sa2_name',
    '0_4_people': 'age_0_4',
    '5_9_people': 'age_5_9',
    '10_14_people': 'age_10_14',
    '15_19_people': 'age_15_19',
    '20_24_people': 'age_20_24',
    '25_29_people': 'age_25_29',
    '30_34_people': 'age_30_34',
    '35_39_people': 'age_35_39',
    '40_44_people': 'age_40_44',
    '45_49_people': 'age_45_49',
    '50_54_people': 'age_50_54',
    '55_59_people': 'age_55_59',
    '60_64_people': 'age_60_64',
    '65_69_people': 'age_65_69',
    '70_74_people': 'age_70_74',
    '75_79_people': 'age_75_79',
    '80_84_people': 'age_80_84',
    '85_and_over_people': 'age_85_over',
    'total_people': 'total'
}

# Rename columns
population.rename(columns=new_columns, inplace=True)

# Show renamed columns
print("Renamed columns:", population.columns.tolist())

# Confirm data types and nulls
print(population.info())
print(population.isnull().sum())

# Export cleaned and renamed dataset
population.to_csv("p_cleaned.csv", index=False)

# Preview the data
print(population.head())


Renamed columns: ['sa2_code', 'sa2_name', 'age_0_4', 'age_5_9', 'age_10_14', 'age_15_19', 'age_20_24', 'age_25_29', 'age_30_34', 'age_35_39', 'age_40_44', 'age_45_49', 'age_50_54', 'age_55_59', 'age_60_64', 'age_65_69', 'age_70_74', 'age_75_79', 'age_80_84', 'age_85_over', 'total']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373 entries, 0 to 372
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   sa2_code     373 non-null    int64 
 1   sa2_name     373 non-null    object
 2   age_0_4      373 non-null    int64 
 3   age_5_9      373 non-null    int64 
 4   age_10_14    373 non-null    int64 
 5   age_15_19    373 non-null    int64 
 6   age_20_24    373 non-null    int64 
 7   age_25_29    373 non-null    int64 
 8   age_30_34    373 non-null    int64 
 9   age_35_39    373 non-null    int64 
 10  age_40_44    373 non-null    int64 
 11  age_45_49    373 non-null    int64 
 12  age_50_54    373 non-null    int6

In [7]:
# Load your dataset
df = pd.read_csv('Income.csv')

# 1. Rename column names to lowercase
df.columns = df.columns.str.lower()

# 2. Replace spaces with underscores
df.columns = df.columns.str.replace(' ', '_')

# 3. Replace hyphens with underscores
df.columns = df.columns.str.replace('-', '_')

# 4. Drop rows with null values
df.dropna(inplace=True)

# View the cleaned DataFrame
df.head()

Unnamed: 0,sa2_code21,sa2_name,earners,median_age,median_income,mean_income
0,101021007,Braidwood,2467,51,46640,68904
1,101021008,Karabar,5103,42,65564,69672
2,101021009,Queanbeyan,7028,39,63528,69174
3,101021010,Queanbeyan - East,3398,39,66148,74162
4,101021012,Queanbeyan West - Jerrabomberra,8422,44,78630,91981


In [8]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon, MultiPolygon
from geoalchemy2 import Geometry, WKTElement
import matplotlib.pyplot as plt

In [9]:
# Adjust path as needed
catchments_primary = gpd.read_file("~/Desktop/DATA2001/DATA2001-A/catchments/catchments_primary.shp")

catchments_primary.columns = catchments_primary.columns.str.lower()

# Preview structure
print(catchments_primary.columns)
catchments_primary.head()

DataSourceError: C:\Users\uwhit/Desktop/DATA2001/DATA2001-A/catchments/catchments_primary.shp: No such file or directory

In [None]:
# Adjust path as needed
catchments_secondary = gpd.read_file("~/Desktop/DATA2001/DATA2001-A/catchments/catchments_secondary.shp")

catchments_secondary.columns = catchments_secondary.columns.str.lower()

# Preview structure
print(catchments_secondary.columns)
catchments_secondary.head()

In [None]:
# Adjust path as needed
catchments_future = gpd.read_file("~/Desktop/DATA2001/DATA2001-A/catchments/catchments_future.shp")

catchments_future.columns = catchments_future.columns.str.lower()

# Preview structure
print(catchments_future.columns)
catchments_future.head()