In [64]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon, MultiPolygon
from geoalchemy2 import Geometry, WKTElement
import matplotlib.pyplot as plt

In [65]:

# Load the data
population = pd.read_csv("Population.csv")

# Clean column names: lowercase, replace hyphens with underscores
population.columns = (
    population.columns
    .str.lower()
    .str.replace("-", "_")
    .str.replace(" ", "_")
)

# Show cleaned columns
print("Cleaned columns:", population.columns.tolist())

# Confirm data types and nulls (should be fine)
print(population.info())
print(population.isnull().sum())

Cleaned columns: ['sa2_code', 'sa2_name', '0_4_people', '5_9_people', '10_14_people', '15_19_people', '20_24_people', '25_29_people', '30_34_people', '35_39_people', '40_44_people', '45_49_people', '50_54_people', '55_59_people', '60_64_people', '65_69_people', '70_74_people', '75_79_people', '80_84_people', '85_and_over_people', 'total_people']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373 entries, 0 to 372
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   sa2_code            373 non-null    int64 
 1   sa2_name            373 non-null    object
 2   0_4_people          373 non-null    int64 
 3   5_9_people          373 non-null    int64 
 4   10_14_people        373 non-null    int64 
 5   15_19_people        373 non-null    int64 
 6   20_24_people        373 non-null    int64 
 7   25_29_people        373 non-null    int64 
 8   30_34_people        373 non-null    int64 
 9   35_39_people  

In [66]:
# This loads the POIs from the REST service directly
poi_url = "https://maps.six.nsw.gov.au/arcgis/rest/services/public/NSW_POI/MapServer/0/query?where=1%3D1&outFields=*&f=geojson"

pois = gpd.read_file(poi_url)
print(pois.columns)
pois.head()

Index(['objectid', 'topoid', 'poigroup', 'poitype', 'poiname', 'poilabel',
       'poilabeltype', 'poialtlabel', 'poisourcefeatureoid', 'accesscontrol',
       'startdate', 'enddate', 'lastupdate', 'msoid', 'centroidid',
       'shapeuuid', 'changetype', 'processstate', 'urbanity', 'geometry'],
      dtype='object')


Unnamed: 0,objectid,topoid,poigroup,poitype,poiname,poilabel,poilabeltype,poialtlabel,poisourcefeatureoid,accesscontrol,startdate,enddate,lastupdate,msoid,centroidid,shapeuuid,changetype,processstate,urbanity,geometry
0,1,500000000,9,Mine - Underground,,Mine - Underground,GENERIC,,157,1,1628668563000,32503680000000,1628668617000,233046,,729e2b57-0cd4-3f70-90fa-9dce09e34a8e,I,,S,POINT (152.12202 -31.10616)
1,2,500005504,3,Lookout,KUNDERANG LOOKOUT,KUNDERANG LOOKOUT,NAMED,,56,1,1285588392000,32503680000000,1285588392535,83091,,d88a28a8-c572-3992-995f-d26a274aea18,I,,S,POINT (152.29869 -31.02148)
2,3,500005505,3,Lookout,FALLS LOOKOUT,FALLS LOOKOUT,NAMED,,56,1,1285588392000,32503680000000,1285588392535,83691,,21b476d2-6519-3e28-8b19-1526fcb9652f,I,,S,POINT (152.33786 -31.01576)
3,4,500005507,3,Lookout,MCCOYS LOOKOUT,MCCOYS LOOKOUT,NAMED,,56,1,1285588392000,32503680000000,1285588392535,83380,,016d69b2-6530-39e7-89a6-6e3054df55ac,I,,S,POINT (152.34181 -31.01897)
4,5,500012781,3,Picnic Area,WILSON RIVER PICNIC AREA,WILSON RIVER PICNIC AREA,NAMED,,62,1,1608714678000,32503680000000,1608714706360,231054,,49ad26c8-609e-3aa0-b4ad-51459b43ab51,M,,S,POINT (152.47882 -31.20754)


In [67]:
# Load stops.txt
stops = pd.read_csv('Stops.txt')

# Preview
stops.head()

# Some cleaning
# Drop rows where stop_lat or stop_lon is missing
stops = stops.dropna(subset=['stop_lat', 'stop_lon'])

# We want to only select the stops inside three areas: 
# Inner West, North Sydney and Hornsby, City and Inner South

# No column name telling which region a stop belongs to
# So: use the stop's latitude and longitude (stop_lat, stop_lon) to filter our disired data

In [68]:
sa2 = gpd.read_filesa2 = gpd.read_file("SA2_2021_AUST_SHP_GDA2020\\SA2_2021_AUST_GDA2020.shp")


print(sa2.columns)
sa2.head()

sa2_sydney = sa2[sa2['GCC_NAME21'] == 'Greater Sydney']

# Create geometry column for stops
stops['geometry'] = stops.apply(lambda row: Point(row['stop_lon'], row['stop_lat']), axis=1)

# Turn stops into a GeoDataFrame (same CRS)
stops_gdf = gpd.GeoDataFrame(stops, geometry='geometry', crs="EPSG:4283")

stops_gdf = stops_gdf.to_crs(epsg=7844)

stops_with_sa2 = gpd.sjoin(stops_gdf, sa2_sydney, how='left', predicate='within')

# print(stops_with_sa2.head(10)) #debug

target_regions = [
    'Sydney - Inner West',
    'Sydney - North Sydney and Hornsby',
    'Sydney - City and Inner South'
]

filtered_stops = stops_with_sa2[stops_with_sa2['SA4_NAME21'].isin(target_regions)]

filtered_stops = filtered_stops.dropna(subset=['SA4_NAME21'])

columns_to_keep = [
    'stop_id', 'stop_name', 'stop_lat', 'stop_lon',
    'wheelchair_boarding', 'SA2_CODE21', 'SA2_NAME21', 'SA4_NAME21', 'geometry'
]

filtered_stops = filtered_stops[columns_to_keep]

filtered_stops.columns = filtered_stops.columns.str.lower()

filtered_stops.to_csv('s_filtered.csv', index=False)


Index(['SA2_CODE21', 'SA2_NAME21', 'CHG_FLAG21', 'CHG_LBL21', 'SA3_CODE21',
       'SA3_NAME21', 'SA4_CODE21', 'SA4_NAME21', 'GCC_CODE21', 'GCC_NAME21',
       'STE_CODE21', 'STE_NAME21', 'AUS_CODE21', 'AUS_NAME21', 'AREASQKM21',
       'LOCI_URI21', 'geometry'],
      dtype='object')


In [69]:
# Adjust path as needed
catchments_primary = gpd.read_file("catchments\\catchments_primary.shp")

# Preview structure
print(catchments_primary.columns)
catchments_primary.head()

Index(['USE_ID', 'CATCH_TYPE', 'USE_DESC', 'ADD_DATE', 'KINDERGART', 'YEAR1',
       'YEAR2', 'YEAR3', 'YEAR4', 'YEAR5', 'YEAR6', 'YEAR7', 'YEAR8', 'YEAR9',
       'YEAR10', 'YEAR11', 'YEAR12', 'PRIORITY', 'geometry'],
      dtype='object')


Unnamed: 0,USE_ID,CATCH_TYPE,USE_DESC,ADD_DATE,KINDERGART,YEAR1,YEAR2,YEAR3,YEAR4,YEAR5,YEAR6,YEAR7,YEAR8,YEAR9,YEAR10,YEAR11,YEAR12,PRIORITY,geometry
0,2838,PRIMARY,Parklea PS,20181210,Y,Y,Y,Y,Y,Y,Y,N,N,N,N,N,N,,"POLYGON ((150.93564 -33.71612, 150.93715 -33.7..."
1,2404,PRIMARY,Lindfield EPS,20211219,Y,Y,Y,Y,Y,Y,Y,N,N,N,N,N,N,,"POLYGON ((151.18336 -33.74748, 151.18443 -33.7..."
2,4393,PRIMARY,Carlingford WPS,20220223,Y,Y,Y,Y,Y,Y,Y,N,N,N,N,N,N,,"POLYGON ((151.04518 -33.77303, 151.04526 -33.7..."
3,4615,PRIMARY,Caddies Ck PS,20181210,Y,Y,Y,Y,Y,Y,Y,N,N,N,N,N,N,,"POLYGON ((150.92567 -33.7296, 150.92602 -33.72..."
4,3918,PRIMARY,Killara PS,20211219,Y,Y,Y,Y,Y,Y,Y,N,N,N,N,N,N,,"POLYGON ((151.15379 -33.75586, 151.15404 -33.7..."


In [70]:
# Adjust path as needed
catchments_secondary = gpd.read_file("catchments\\catchments_secondary.shp")

# Preview structure
print(catchments_secondary.columns)
catchments_secondary.head()

Index(['USE_ID', 'CATCH_TYPE', 'USE_DESC', 'ADD_DATE', 'KINDERGART', 'YEAR1',
       'YEAR2', 'YEAR3', 'YEAR4', 'YEAR5', 'YEAR6', 'YEAR7', 'YEAR8', 'YEAR9',
       'YEAR10', 'YEAR11', 'YEAR12', 'PRIORITY', 'geometry'],
      dtype='object')


Unnamed: 0,USE_ID,CATCH_TYPE,USE_DESC,ADD_DATE,KINDERGART,YEAR1,YEAR2,YEAR3,YEAR4,YEAR5,YEAR6,YEAR7,YEAR8,YEAR9,YEAR10,YEAR11,YEAR12,PRIORITY,geometry
0,8503,HIGH_COED,Billabong HS,20200507,N,N,N,N,N,N,N,Y,Y,Y,Y,Y,Y,,"POLYGON ((146.67182 -35.31444, 146.6893 -35.31..."
1,8266,HIGH_COED,James Fallon HS,20200507,N,N,N,N,N,N,N,Y,Y,Y,Y,Y,Y,,"POLYGON ((147.08734 -35.86271, 147.10413 -35.8..."
2,8505,HIGH_COED,Murray HS,20200507,N,N,N,N,N,N,N,Y,Y,Y,Y,Y,Y,,"POLYGON ((146.81448 -35.78341, 146.8125 -35.79..."
3,8458,HIGH_COED,Kingswood HS,20201016,N,N,N,N,N,N,N,Y,Y,Y,Y,Y,Y,,"MULTIPOLYGON (((150.686 -33.74031, 150.68631 -..."
4,8559,HIGH_COED,Jamison HS,20201016,N,N,N,N,N,N,N,Y,Y,Y,Y,Y,Y,,"POLYGON ((150.69513 -33.75627, 150.68936 -33.7..."


In [71]:
# Adjust path as needed
catchments_future = gpd.read_file("catchments\\catchments_future.shp")

# Preview structure
print(catchments_future.columns)
catchments_future.head()

Index(['USE_ID', 'CATCH_TYPE', 'USE_DESC', 'ADD_DATE', 'KINDERGART', 'YEAR1',
       'YEAR2', 'YEAR3', 'YEAR4', 'YEAR5', 'YEAR6', 'YEAR7', 'YEAR8', 'YEAR9',
       'YEAR10', 'YEAR11', 'YEAR12', 'geometry'],
      dtype='object')


Unnamed: 0,USE_ID,CATCH_TYPE,USE_DESC,ADD_DATE,KINDERGART,YEAR1,YEAR2,YEAR3,YEAR4,YEAR5,YEAR6,YEAR7,YEAR8,YEAR9,YEAR10,YEAR11,YEAR12,geometry
0,8416,HIGH_COED,Ku-ring-gai HS,20230114,0,0,0,0,0,0,0,2024,2024,2024,2024,2024,2024,"POLYGON ((151.19849 -33.5399, 151.19945 -33.54..."
1,8161,HIGH_BOYS,Randwick BHS,20200220,0,0,0,0,0,0,0,2024,2024,2024,2024,2024,2024,"POLYGON ((151.27152 -33.91402, 151.27152 -33.9..."
2,8539,HIGH_COED,SSC Blackwattle Bay,20220609,0,0,0,0,0,0,0,0,0,0,0,2024,2024,"POLYGON ((151.15292 -33.83939, 151.16144 -33.8..."
3,8400,HIGH_COED,St Ives HS,20230114,0,0,0,0,0,0,0,2024,2024,2024,2024,2024,2024,"POLYGON ((151.17794 -33.6982, 151.17859 -33.69..."
4,8555,HIGH_COED,Rose Bay SC,20200220,0,0,0,0,0,0,0,2024,2024,2024,2024,2024,2024,"POLYGON ((151.28072 -33.83287, 151.28095 -33.8..."


In [72]:
from sqlalchemy import create_engine, text
import psycopg2
import psycopg2.extras
import json
import os
import pandas as pd

In [73]:


credentials = os.path.expanduser("\\Users\\uwhit\\data2001Tut\\Credentials.json")


def pgconnect(credential_filepath, db_schema="public"):
    with open(credential_filepath) as f:
        db_conn_dict = json.load(f)
        host       = db_conn_dict['host']
        db_user    = db_conn_dict['user']
        db_pw      = db_conn_dict['password']
        default_db = "sydney_analysis"  ##### The right database
        port       = db_conn_dict['port']
        try:
            db = create_engine(f'postgresql+psycopg2://{db_user}:{db_pw}@{host}:{port}/{default_db}', echo=False)
            conn = db.connect()
            print('Connected successfully.')
        except Exception as e:
            print("Unable to connect to the database.")
            print(e)
            db, conn = None, None
        return db,conn

def query(conn, sqlcmd, args=None, df=True):
    result = pd.DataFrame() if df else None
    try:
        if df:
            result = pd.read_sql_query(sqlcmd, conn, params=args)
        else:
            result = conn.execute(text(sqlcmd), args).fetchall()
            result = result[0] if len(result) == 1 else result
    except Exception as e:
        print("Error encountered: ", e, sep='\n')
    return result

In [74]:
db, conn = pgconnect(credentials)

Connected successfully.


In [75]:
import geopandas as gpd

# Define shapefile paths
base_path = "\\Users\\uwhit\\DATA2001-A\\catchments"
shapefiles = {
    "catchments_primary": "catchments_primary.shp",
    "catchments_secondary": "catchments_secondary.shp",
    "catchments_future": "catchments_future.shp"
}

# Loop through the catchment files and upload to PGadmin
for table_name, shp_file in shapefiles.items():
    print(f"Processing {table_name}...")
    gdf = gpd.read_file(os.path.expanduser(os.path.join(base_path, shp_file)))
    gdf.columns = [col.lower() for col in gdf.columns]  
    gdf.to_postgis(name=table_name, con=db, if_exists="replace", index=False)
    print(f"✅ Uploaded {table_name} successfully.")

print(f"Using DB: {db.url.database}")

Processing catchments_primary...
✅ Uploaded catchments_primary successfully.
Processing catchments_secondary...
✅ Uploaded catchments_secondary successfully.
Processing catchments_future...
✅ Uploaded catchments_future successfully.
Using DB: sydney_analysis


In [76]:
sa2_path = "SA2_2021_AUST_SHP_GDA2020\\SA2_2021_AUST_GDA2020.shp"
sa2_gdf = gpd.read_file(os.path.expanduser(sa2_path))

#Keep only useful columns
sa2_gdf.columns = [col.lower() for col in sa2_gdf.columns]
columns_to_keep = ["sa2_code21", "sa2_name21", "sa4_name21", "geometry"]
sa2_gdf = sa2_gdf[columns_to_keep]

# Filter rows by target SA4 regions
target_sa4 = [
    "Sydney - Inner West",
    "Sydney - North Sydney and Hornsby",
    "Sydney - City and Inner South"
]
sa2_gdf = sa2_gdf[sa2_gdf["sa4_name21"].isin(target_sa4)]

# SUpload to PostgreSQL
sa2_gdf.to_postgis(name="sa2_regions", con=db, if_exists="replace", index=False)

print(f"✅ Cleaned and uploaded 'sa2_regions' with {len(sa2_gdf)} rows.")
sa2_gdf.head()

✅ Cleaned and uploaded 'sa2_regions' with 74 rows.


Unnamed: 0,sa2_code21,sa2_name21,sa4_name21,geometry
343,117011320,Banksmeadow,Sydney - City and Inner South,"POLYGON ((151.20807 -33.95405, 151.20817 -33.9..."
344,117011321,Botany,Sydney - City and Inner South,"POLYGON ((151.18965 -33.94813, 151.18919 -33.9..."
345,117011323,Pagewood - Hillsdale - Daceyville,Sydney - City and Inner South,"POLYGON ((151.22312 -33.92869, 151.22189 -33.9..."
346,117011324,Port Botany Industrial,Sydney - City and Inner South,"POLYGON ((151.22091 -33.96895, 151.22066 -33.9..."
347,117011325,Sydney Airport,Sydney - City and Inner South,"POLYGON ((151.17103 -33.927, 151.17167 -33.926..."


# Task 2 - North Sydney & Hornsby

In [78]:
northSydney_gdf = gpd.read_postgis("""
    SELECT *
    FROM sa2_regions
    WHERE sa4_name21 = 'Sydney - North Sydney and Hornsby'
""", conn, geom_col="geometry")

In [79]:
import requests
import io
import geopandas as gpd

def get_pois_from_bbox(minx, miny, maxx, maxy):
    url = "https://maps.six.nsw.gov.au/arcgis/rest/services/public/NSW_POI/MapServer/0/query"
    params = {
        "f": "geojson",
        "geometryType": "esriGeometryEnvelope",
        "geometry": f"{minx},{miny},{maxx},{maxy}",
        "spatialRel": "esriSpatialRelIntersects",
        "outFields": "*",
        "inSR": "4283",
        "outSR": "4283"
    }
    try:
        response = requests.get(url, params=params)
        response.raise_for_status()
        return gpd.read_file(io.StringIO(response.text))
    except Exception as e:
        print(f"❌ Error fetching POIs for bbox {minx},{miny},{maxx},{maxy}:", e)
        return None


In [80]:
import time
from shapely.geometry import shape

all_pois = []

for _, row in northSydney_gdf.iterrows():
    sa2 = row["sa2_name21"]
    minx, miny, maxx, maxy = row["geometry"].bounds
    print(f"📦 Fetching POIs for {sa2}...")
    
    pois = get_pois_from_bbox(minx, miny, maxx, maxy)
    if pois is not None and not pois.empty:
        pois["sa2_name"] = sa2
        all_pois.append(pois)
    
    time.sleep(1)  # be nice to the server


📦 Fetching POIs for Chatswood (West) - Lane Cove North...
📦 Fetching POIs for St Leonards - Naremburn...
📦 Fetching POIs for Artarmon...
📦 Fetching POIs for Castle Cove - Northbridge...
📦 Fetching POIs for Chatswood - East...
📦 Fetching POIs for Greenwich - Riverview...
📦 Fetching POIs for Lane Cove...
📦 Fetching POIs for Willoughby...
📦 Fetching POIs for Asquith - Mount Colah...
📦 Fetching POIs for Berowra - Brooklyn - Cowan...
📦 Fetching POIs for Normanhurst - Thornleigh - Westleigh...
📦 Fetching POIs for Hornsby - East...
📦 Fetching POIs for Hornsby - West...
📦 Fetching POIs for Wahroonga (West) - Waitara...
📦 Fetching POIs for Gordon - Killara...
📦 Fetching POIs for Lindfield - Roseville...
📦 Fetching POIs for Pymble...
📦 Fetching POIs for St Ives...
📦 Fetching POIs for Turramurra...
📦 Fetching POIs for Wahroonga (East) - Warrawee...
📦 Fetching POIs for Cremorne - Cammeray...
📦 Fetching POIs for Crows Nest - Waverton...
📦 Fetching POIs for Neutral Bay - Kirribilli...
📦 Fetching POI

In [81]:
from geopandas import GeoDataFrame

if all_pois:
    all_pois_gdf = gpd.GeoDataFrame(pd.concat(all_pois, ignore_index=True), crs="EPSG:4283")
    all_pois_gdf.to_postgis("pois_northSydney", con=db, if_exists="replace", index=False)
    print(f"✅ Uploaded {len(all_pois_gdf)} POIs.")
else:
    print("⚠️ No POIs collected.")

✅ Uploaded 5650 POIs.


In [82]:
conn.rollback()


# Task 3 - North Sydney & Hornsby

In [84]:
sql_stops = """
SELECT sa2_code21,
       sa2_name21,
       COUNT(*) AS n_stops
FROM   "Stops"
WHERE  sa4_name21 = 'Sydney - North Sydney and Hornsby'
GROUP BY sa2_code21,
         sa2_name21;
"""

stop_counts = pd.read_sql(sql_stops, conn)
stop_counts.head()


ProgrammingError: (psycopg2.errors.UndefinedTable) relation "Stops" does not exist
LINE 5: FROM   "Stops"
               ^

[SQL: 
SELECT sa2_code21,
       sa2_name21,
       COUNT(*) AS n_stops
FROM   "Stops"
WHERE  sa4_name21 = 'Sydney - North Sydney and Hornsby'
GROUP BY sa2_code21,
         sa2_name21;
]
(Background on this error at: https://sqlalche.me/e/20/f405)

In [None]:
northsyd_query = """
SELECT sa2_code21, sa2_name21, geometry
FROM sa2_regions
WHERE sa4_name21 = 'Sydney - North Sydney and Hornsby ';
"""

sa2_northsyd = gpd.read_postgis(northsyd_query, conn, geom_col='geometry')
sa2_northsyd.crs = "EPSG:4283"

In [None]:
sql_primary = "SELECT * FROM catchments_primary;"
sql_second = "SELECT * FROM catchments_secondary;"
sql_future = "SELECT * FROM catchments_future;"



In [None]:
primary = gpd.read_postgis(sql_primary, conn, geom_col='geometry')
secondary= gpd.read_postgis(sql_second, conn, geom_col='geometry')
future = gpd.read_postgis(sql_future, conn, geom_col='geometry')

primary["school_type"] = "primary"
secondary["school_type"] = "secondary"
future["school_type"] = "future"

In [None]:
all_schools_combined = pd.concat(
    [primary, secondary, future],
    ignore_index=True
)

schools_sa2 = gpd.sjoin(
    all_schools_combined,
    sa2_northsyd,
    predicate="intersects"
)

In [None]:
school_counts_sa2 = schools_sa2.groupby(
    ["sa2_code21", "sa2_name21", "school_type"]
).size().unstack(fill_value=0).reset_index()

school_counts_sa2["total_schools"] = (
    school_counts_sa2.get("primary", 0) +
    school_counts_sa2.get("secondary", 0) +
    school_counts_sa2.get("future", 0)
)

school_counts_sa2.head()

In [None]:
sql_population = """
SELECT 
    "sa2_code",
    "age_5_9",
    "age_10_14 ",
    "age_15_19"
FROM "Population"
WHERE "sa2_code" IS NOT NULL;
"""

In [None]:
population_youth = pd.read_sql(sql_population, conn)
population_youth["youth_total"] = (
    population_youth["age_5_9"] +
    population_youth["age_10_14 "] +
    population_youth["age_15_19"]
)
population_youth.head()

In [None]:
population_youth["sa2_code"] = population_youth["sa2_code"].astype(str)
schools_with_youth = school_counts_by_sa2.merge(
    population_youth,
    left_on="sa2_code21",
    right_on="sa2_code",
    how="left"
)

In [None]:
schools_with_youth["schools_per_1000_youth"] = (
    schools_with_youth["total_schools"] / schools_with_youth["youth_total"]
) * 1000
schools_with_youth.head()

In [103]:
from sqlalchemy import text
sql_business_counts = """
SELECT sa2_code,
       sa2_name,
       SUM(total_businesses) AS n_businesses
FROM "Businesses"
WHERE "industry_code " = 'G'
GROUP BY sa2_code, sa2_name
ORDER BY n_businesses DESC;
"""



In [None]:
business_counts = pd.read_sql_query(text(sql_business_counts), conn)
business_counts.head()

In [None]:
sql_population_total = """
SELECT "sa2_code",
       "total"
FROM "Population"
"""


In [None]:
population_total = pd.read_sql_query(text(sql_population_total), conn)

business_pop = business_counts.merge(
    population_total, on="sa2_code", how="left"
)

business_pop["businesses_per_1000"] = (
    business_pop["n_businesses"] / business_pop["total"]
) * 1000
business_pop.head()

In [None]:
combined = stop_counts.merge(
    schools_with_youth[["sa2_code", "schools_per_1000_youth"]],
    left_on="sa2_code21", right_on="sa2_code", how="left"
)

combined = combined.merge(
    library_counts.rename(columns={"sa2_name": "sa2_name21"}),
    on="sa2_name21", how="left"
)

combined = combined.merge(
    park_counts.rename(columns={"sa2_name": "sa2_name21"}),
    on="sa2_name21", how="left"
)

combined = combined.merge(
    business_with_pop.rename(columns={"sa2_name": "sa2_name21"}),
    on="sa2_name21", how="left"
)

combined = combined.rename(columns={
    "n_stops": "stops",
    "schools_per_1000_youth": "schools",
    "businesses_per_1000": "businesses"
})

combined = combined.fillna(0)

combined.head()

In [None]:
# Compute z-scores
combined["z_stops"] = (combined["stops"] - combined["stops"].mean()) / combined["stops"].std()
combined["z_schools"] = (combined["schools"] - combined["schools"].mean()) / combined["schools"].std()
combined["z_businesses"] = (combined["businesses"] - combined["businesses"].mean()) / combined["businesses"].std()

In [None]:
# Total z-score
combined["z_total"] = (
    combined["z_stops"] +
    combined["z_schools"] +
    combined["z_businesses"]
)
combined = combined.drop(columns=["sa2_code_x", "sa2_code_y", "n_businesses", "total"])
combined.head()

In [None]:
import numpy as np
final_scores = combined
final_scores["real_score"] = 1 / (1 + np.exp(-final_scores["z_total"]))
final_scores = final_scores.reset_index(drop=True)
final_scores

In [None]:
#Task 4

In [None]:
final_scores.sort_values("real_score", ascending=False)

In [None]:
import matplotlib.pyplot as plt

# Bar chart: top 10 regions
top10 = final_scores.sort_values("real_score", ascending=False).head(10)
plt.figure(figsize=(10, 6))
plt.barh(top10["sa2_name21"], top10["real_score"], color="skyblue")
plt.xlabel("Real Score")
plt.title("Top 10 SA2 Regions by Real Score")
plt.gca().invert_yaxis()
plt.show()

In [None]:
import seaborn as sns

# Correlation matrix between z components and real score
score_corr = final_scores[["z_stops", "z_schools", "z_businesses", "real_score"]].corr()

plt.figure(figsize=(8, 6))
sns.heatmap(score_corr, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Between Components and Real Score")
plt.show()

In [None]:
income = pd.read_csv("i_cleaned.csv")

final_scores["sa2_code21"] = final_scores["sa2_code21"].astype(str)
income["sa2_code"] = income["sa2_code"].astype(str)

income_joined = final_scores.merge(income, left_on="sa2_code21", right_on="sa2_code", how="left")

import matplotlib.pyplot as plt
# Scatterplot
plt.figure(figsize=(8, 6))
plt.scatter(income_joined["median_income"], income_joined["real_score"], alpha=0.7)
plt.xlabel("Median Income")
plt.ylabel("Real Score")
plt.title("Real Score vs Median Income")
plt.grid(True)
plt.show()

# Correlation coefficient
income_corr = income_joined["real_score"].corr(income_joined["median_income"])
print("correlation coeffient:", income_corr)

In [2]:
map_visual = northSydney_gdf.merge(final_scores, on="sa2_code21", how="left")
fig, ax = plt.subplots(figsize=(10, 8))
map_visual.plot(
    column="real_score",
    cmap="YlGnBu",
    linewidth=0.5,
    edgecolor="pink",
    legend=True,
    ax=ax
)
ax.set_title("Real Score by SA2 Region for North Sydney", fontsize=14)
ax.axis("off")
plt.tight_layout()
plt.show()

NameError: name 'northSydney_gdf' is not defined