# Liveability Preprocessing

In [None]:
import os
import pandas as pd
import requests
import geopandas as gpd
from shapely.geometry import shape
from shapely import wkt

In [None]:
# Location of the curated, merged CSV (relative to this notebook)
csv_path = '../data/curated/sa2_rent/merged_final.csv'

# Read that CSV into the DataFrame used by the cells below
merged_df = pd.read_csv(filepath_or_buffer=csv_path)

In [None]:
# Preview the first rows of the 'geometry' column exactly as loaded from the CSV
print(merged_df['geometry'].head())

In [None]:

# Columns of interest: the suburb identifier, its geometry, and the March 2024
# count/median pair for every dwelling category (flats with 1-3 bedrooms,
# houses with 2-4 bedrooms). Adjust the names to match your dataset.
selected_columns = ['Suburbs', 'geometry'] + [
    f'MARCH2024{statistic}_{dwelling}'
    for dwelling in ('FLAT1', 'FLAT2', 'FLAT3', 'HOUSE2', 'HOUSE3', 'HOUSE4')
    for statistic in ('COUNT', 'MEDIAN')
]

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Assuming merged_df is your combined DataFrame with suburbs and rent data
# Let's make sure you have the correct column names for your plot

# Define a function to plot the top 5 and bottom 5 for a specific column
def plot_top_bottom(data, column, title, xlabel):
    """Plot the five smallest and five largest positive values of ``column``.

    Two horizontal bar charts are drawn side by side: the bottom 5 (red) on
    the left and the top 5 (green) on the right, labelled by ``'Suburbs'``.

    Args:
        data: DataFrame containing a ``'Suburbs'`` column and ``column``.
        column: Name of the column to rank.
        title: Text appended to the "Bottom 5" / "Top 5" panel titles.
        xlabel: Label for the value axis of both panels.

    Returns:
        None. If no row has a positive numeric value in ``column`` a message
        is printed and nothing is plotted.

    Note:
        ``data[column]`` is coerced to numeric and written back into the
        frame that was passed in, so the caller's DataFrame is modified
        (non-numeric entries become NaN).
    """
    # Ensure the column is numeric (this writes back into the caller's frame)
    data[column] = pd.to_numeric(data[column], errors='coerce')

    # NaN compares False against 0, so this single mask drops both missing
    # and non-positive values.
    ranked = data.loc[data[column] > 0, ['Suburbs', column]].sort_values(by=column)

    # Nothing to rank: skip instead of handing empty frames to the plotting code
    if ranked.empty:
        print(f"No positive values found in '{column}'; skipping plot.")
        return

    bottom_5 = ranked.head(5)
    top_5 = ranked.tail(5)

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

    # Left panel: lowest values
    bottom_5.plot(kind='barh', x='Suburbs', y=column, ax=axes[0], color='red', legend=False)
    axes[0].set_title(f"Bottom 5 {title}")
    axes[0].set_xlabel(xlabel)
    axes[0].set_ylabel("Suburbs")

    # Right panel: highest values
    top_5.plot(kind='barh', x='Suburbs', y=column, ax=axes[1], color='green', legend=False)
    axes[1].set_title(f"Top 5 {title}")
    axes[1].set_xlabel(xlabel)
    axes[1].set_ylabel("Suburbs")

    plt.tight_layout()
    plt.show()

# Map each property-type label to its (median column, count column) pair.
# 'All Properties' uses the *_RENT columns; the rest follow one naming scheme.
property_types = {
    'All Properties': ('MARCH2024MEDIAN_RENT', 'MARCH2024COUNT_RENT'),
}
property_types.update({
    label: (f'MARCH2024MEDIAN_{code}', f'MARCH2024COUNT_{code}')
    for label, code in [
        ('1 Bedroom Flat', 'FLAT1'),
        ('2 Bedroom Flat', 'FLAT2'),
        ('3 Bedroom Flat', 'FLAT3'),
        ('2 Bedroom House', 'HOUSE2'),
        ('3 Bedroom House', 'HOUSE3'),
        ('4 Bedroom House', 'HOUSE4'),
    ]
})

# For every property type draw two figures: median rents, then rental counts
for property_type, (median_col, count_col) in property_types.items():
    print(f"Plotting for {property_type}...")

    median_title = f"Median Rent Prices ({property_type})"
    plot_top_bottom(merged_df, median_col, median_title, "Median Rent Price")

    count_title = f"Rent Counts ({property_type})"
    plot_top_bottom(merged_df, count_col, count_title, "Rent Count")

# Getting Stations

In [None]:
import requests
import pandas as pd

# Bounding box used for Victoria, in Overpass order (south, west, north, east)
min_lat, min_lon, max_lat, max_lon = -39.2, 140.96, -33.9, 150.03

# Overpass query: union of station nodes tagged as operated by Metro Trains
# Melbourne, belonging to the Public Transport Victoria network, operated by
# V/Line (several spellings), or flagged as a train station via public_transport.
overpass_query = f"""
[out:json];
(
  node["railway"="station"]["operator"="Metro Trains Melbourne"]({min_lat},{min_lon},{max_lat},{max_lon});
  node["railway"="station"]["network"="Public Transport Victoria"]({min_lat},{min_lon},{max_lat},{max_lon});
  node["railway"="station"]["operator"~"V/Line|V/line|VLine"]({min_lat},{min_lon},{max_lat},{max_lon});
  node["public_transport"="station"]["train"="yes"]({min_lat},{min_lon},{max_lat},{max_lon});
);
out body;
"""

# Send the request to the Overpass API
overpass_url = "http://overpass-api.de/api/interpreter"
response = requests.get(overpass_url, params={'data': overpass_query})

if response.status_code != 200:
    print(f"Error: Received status code {response.status_code}")
    print("Response text:", response.text)
else:
    try:
        data = response.json()
    except ValueError as e:
        print("Error decoding JSON:", e)
        print("Response text:", response.text)
    else:
        # Only reached when decoding succeeded, so a failed decode can no
        # longer fall through and reuse a stale `data` from an earlier cell.
        stations = []
        for element in data['elements']:
            if 'lat' in element and 'lon' in element:
                tags = element.get('tags', {})
                stations.append({
                    'lat': element['lat'],
                    'lon': element['lon'],
                    'name': tags.get('name', 'Unknown'),
                    'operator': tags.get('operator', 'Unknown')
                })

        # Explicit columns keep the CSV header intact even when nothing matched
        stations_df = pd.DataFrame(stations, columns=['lat', 'lon', 'name', 'operator'])

        # Save the data to a CSV file
        stations_df.to_csv('../data/overpass_amenities/metro_train_stations_victoria.csv', index=False)

        print("Train station data (Metro Trains Melbourne) saved to 'metro_train_stations_victoria.csv'")

# Getting Woolworths, Coles, Aldi and Other Fresh Food Shops

In [None]:
import requests
import pandas as pd

# Bounding box used for Victoria, in Overpass order (south, west, north, east)
min_lat, min_lon, max_lat, max_lon = -39.2, 140.96, -33.9, 150.03

# Overpass query: nodes tagged as supermarket, greengrocer, grocery or
# convenience shops inside the bounding box.
overpass_query = f"""
[out:json];
(
  node["shop"="supermarket"]({min_lat},{min_lon},{max_lat},{max_lon});
  node["shop"="greengrocer"]({min_lat},{min_lon},{max_lat},{max_lon});
  node["shop"="grocery"]({min_lat},{min_lon},{max_lat},{max_lon});
  node["shop"="convenience"]({min_lat},{min_lon},{max_lat},{max_lon});
);
out body;
"""

# Send the request to the Overpass API
overpass_url = "http://overpass-api.de/api/interpreter"
response = requests.get(overpass_url, params={'data': overpass_query})

if response.status_code != 200:
    print(f"Error: Received status code {response.status_code}")
    print("Response text:", response.text)
else:
    try:
        data = response.json()
    except ValueError as e:
        print("Error decoding JSON:", e)
        print("Response text:", response.text)
    else:
        # Only reached when decoding succeeded, so a failed decode can no
        # longer fall through and reuse a stale `data` from an earlier cell.
        fresh_food_places = []
        for element in data['elements']:
            if 'lat' in element and 'lon' in element:
                tags = element.get('tags', {})
                fresh_food_places.append({
                    'lat': element['lat'],
                    'lon': element['lon'],
                    'name': tags.get('name', 'Unknown'),
                    'brand': tags.get('brand', 'Unknown'),
                    'shop': tags.get('shop', 'Unknown')
                })

        # Explicit columns keep the CSV header intact even when nothing matched
        fresh_food_df = pd.DataFrame(
            fresh_food_places, columns=['lat', 'lon', 'name', 'brand', 'shop']
        )

        # Save the data to a CSV file
        fresh_food_df.to_csv('../data/overpass_amenities/fresh_food_locations_victoria.csv', index=False)

        print("Fresh food location data saved to 'fresh_food_locations_victoria.csv'")

# Public Schools

In [None]:
import requests
import pandas as pd

# Bounding box used for Victoria, in Overpass order (south, west, north, east)
min_lat, min_lon, max_lat, max_lon = -39.2, 140.96, -33.9, 150.03

# Overpass query: anything tagged amenity=school (nodes, ways, relations) or
# building=school (nodes, ways). `out center` makes Overpass attach a single
# representative point to ways and relations.
overpass_query = f"""
[out:json];
(
  node["amenity"="school"]({min_lat},{min_lon},{max_lat},{max_lon});
  way["amenity"="school"]({min_lat},{min_lon},{max_lat},{max_lon});
  relation["amenity"="school"]({min_lat},{min_lon},{max_lat},{max_lon});
  node["building"="school"]({min_lat},{min_lon},{max_lat},{max_lon});
  way["building"="school"]({min_lat},{min_lon},{max_lat},{max_lon});
);
out center;
"""

# Send the request to the Overpass API
overpass_url = "http://overpass-api.de/api/interpreter"
response = requests.get(overpass_url, params={'data': overpass_query})

if response.status_code != 200:
    print(f"Error: Received status code {response.status_code}")
    print("Response text:", response.text)
else:
    try:
        data = response.json()
    except ValueError as e:
        print("Error decoding JSON:", e)
        print("Response text:", response.text)
    else:
        # Only reached when decoding succeeded, so a failed decode can no
        # longer fall through and reuse a stale `data` from an earlier cell.
        schools = []
        for element in data['elements']:
            # Nodes carry lat/lon directly; with `out center`, ways and
            # relations carry them under 'center'. Reading only the top level
            # would silently drop every way and relation the query asked for.
            position = element.get('center', element)
            if 'lat' in position and 'lon' in position:
                tags = element.get('tags', {})
                schools.append({
                    'lat': position['lat'],
                    'lon': position['lon'],
                    'name': tags.get('name', 'Unknown'),
                    'operator': tags.get('operator', 'Unknown'),
                    'amenity': tags.get('amenity', 'Unknown')
                })

        # Explicit columns keep the CSV header intact even when nothing matched
        schools_df = pd.DataFrame(
            schools, columns=['lat', 'lon', 'name', 'operator', 'amenity']
        )

        # Save the data to a CSV file
        schools_df.to_csv('../data/overpass_amenities/public_schools_victoria.csv', index=False)

        print("School location data saved to 'public_schools_victoria.csv'")

# Childcare services 

In [None]:
import requests
import pandas as pd

# Bounding box used for Victoria, in Overpass order (south, west, north, east)
min_lat, min_lon, max_lat, max_lon = -39.2, 140.96, -33.9, 150.03

# Overpass query: nodes and ways whose amenity matches kindergarten/childcare
# or whose social_facility is day_care. `out center` attaches a representative
# point to each way.
overpass_query = f"""
[out:json];
(
  node["amenity"~"kindergarten|childcare"]({min_lat},{min_lon},{max_lat},{max_lon});
  way["amenity"~"kindergarten|childcare"]({min_lat},{min_lon},{max_lat},{max_lon});
  node["social_facility"="day_care"]({min_lat},{min_lon},{max_lat},{max_lon});
  way["social_facility"="day_care"]({min_lat},{min_lon},{max_lat},{max_lon});
);
out center;
"""

# Send the request to the Overpass API
overpass_url = "http://overpass-api.de/api/interpreter"
response = requests.get(overpass_url, params={'data': overpass_query})

if response.status_code != 200:
    print(f"Error: Received status code {response.status_code}")
    print("Response text:", response.text)
else:
    try:
        data = response.json()
    except ValueError as e:
        print("Error decoding JSON:", e)
        print("Response text:", response.text)
    else:
        # Only reached when decoding succeeded, so a failed decode can no
        # longer fall through and reuse a stale `data` from an earlier cell.
        childcare_services = []
        for element in data['elements']:
            # Nodes carry lat/lon directly; with `out center`, ways carry them
            # under 'center'. Reading only the top level would drop every way.
            position = element.get('center', element)
            if 'lat' in position and 'lon' in position:
                tags = element.get('tags', {})
                childcare_services.append({
                    'lat': position['lat'],
                    'lon': position['lon'],
                    'name': tags.get('name', 'Unknown'),
                    'amenity': tags.get('amenity', 'Unknown')
                })

        # Explicit columns keep the CSV header intact even when nothing matched
        childcare_df = pd.DataFrame(
            childcare_services, columns=['lat', 'lon', 'name', 'amenity']
        )

        # Save the data to a CSV file
        childcare_df.to_csv('../data/overpass_amenities/childcare_services_victoria.csv', index=False)

        print("Childcare service data saved to 'childcare_services_victoria.csv'")

# Health Services

In [None]:
import requests
import pandas as pd

# Bounding box used for Victoria, in Overpass order (south, west, north, east)
min_lat, min_lon, max_lat, max_lon = -39.2, 140.96, -33.9, 150.03

# Overpass query: nodes and ways whose `amenity` or `healthcare` tag matches a
# hospital, clinic, doctor, dentist or pharmacy. `out center` attaches a
# representative point to each way.
overpass_query = f"""
[out:json];
(
  node["amenity"~"hospital|clinic|doctors|dentist|pharmacy"]({min_lat},{min_lon},{max_lat},{max_lon});
  way["amenity"~"hospital|clinic|doctors|dentist|pharmacy"]({min_lat},{min_lon},{max_lat},{max_lon});
  node["healthcare"~"hospital|clinic|doctor|dentist|pharmacy"]({min_lat},{min_lon},{max_lat},{max_lon});
  way["healthcare"~"hospital|clinic|doctor|dentist|pharmacy"]({min_lat},{min_lon},{max_lat},{max_lon});
);
out center;
"""

# Send the request to the Overpass API
overpass_url = "http://overpass-api.de/api/interpreter"
response = requests.get(overpass_url, params={'data': overpass_query})

if response.status_code != 200:
    print(f"Error: Received status code {response.status_code}")
    print("Response text:", response.text)
else:
    try:
        data = response.json()
    except ValueError as e:
        print("Error decoding JSON:", e)
        print("Response text:", response.text)
    else:
        # Only reached when decoding succeeded, so a failed decode can no
        # longer fall through and reuse a stale `data` from an earlier cell.
        health_services = []
        for element in data['elements']:
            # Nodes carry lat/lon directly; with `out center`, ways carry them
            # under 'center'. Reading only the top level would drop every way.
            position = element.get('center', element)
            if 'lat' in position and 'lon' in position:
                tags = element.get('tags', {})
                health_services.append({
                    'lat': position['lat'],
                    'lon': position['lon'],
                    'name': tags.get('name', 'Unknown'),
                    'amenity': tags.get('amenity', 'Unknown')
                })

        # Explicit columns keep the CSV header intact even when nothing matched
        health_services_df = pd.DataFrame(
            health_services, columns=['lat', 'lon', 'name', 'amenity']
        )

        # Save the data to a CSV file
        health_services_df.to_csv('../data/overpass_amenities/health_services_victoria.csv', index=False)

        print("Health services data saved to 'health_services_victoria.csv'")

# Sports and Recreation Centers

In [None]:
import requests
import pandas as pd

# Bounding box used for Victoria, in Overpass order (south, west, north, east)
min_lat, min_lon, max_lat, max_lon = -39.2, 140.96, -33.9, 150.03

# Overpass query: nodes and ways tagged leisure=fitness_centre or
# sport=fitness, plus nodes whose name looks like a gym but which are not
# already tagged leisure=fitness_centre. `out center` attaches a
# representative point to each way.
overpass_query = f"""
[out:json];
(
  node["leisure"="fitness_centre"]({min_lat},{min_lon},{max_lat},{max_lon});
  way["leisure"="fitness_centre"]({min_lat},{min_lon},{max_lat},{max_lon});
  node["sport"="fitness"]({min_lat},{min_lon},{max_lat},{max_lon});
  way["sport"="fitness"]({min_lat},{min_lon},{max_lat},{max_lon});
  node["name"~"Fitness|Gym|Health Club|24/7"](if:t["leisure"]!="fitness_centre")({min_lat},{min_lon},{max_lat},{max_lon});
);
out center;
"""

# Send the request to the Overpass API
overpass_url = "http://overpass-api.de/api/interpreter"
response = requests.get(overpass_url, params={'data': overpass_query})

if response.status_code != 200:
    print(f"Error: Received status code {response.status_code}")
    print("Response text:", response.text)
else:
    try:
        data = response.json()
    except ValueError as e:
        print("Error decoding JSON:", e)
        print("Response text:", response.text)
    else:
        # Only reached when decoding succeeded, so a failed decode can no
        # longer fall through and reuse a stale `data` from an earlier cell.
        recreation_facilities = []
        for element in data['elements']:
            # Nodes carry lat/lon directly; with `out center`, ways carry them
            # under 'center'. Reading only the top level would drop every way.
            position = element.get('center', element)
            if 'lat' in position and 'lon' in position:
                tags = element.get('tags', {})
                recreation_facilities.append({
                    'lat': position['lat'],
                    'lon': position['lon'],
                    'name': tags.get('name', 'Unknown'),
                    # NOTE(review): the query filters on leisure/sport/name,
                    # not amenity, so this column may mostly be 'Unknown'.
                    'amenity': tags.get('amenity', 'Unknown')
                })

        # Explicit columns keep the CSV header intact even when nothing matched
        recreation_facilities_df = pd.DataFrame(
            recreation_facilities, columns=['lat', 'lon', 'name', 'amenity']
        )

        # Save the data to a CSV file
        recreation_facilities_df.to_csv('../data/overpass_amenities/recreation_facilities_victoria.csv', index=False)

        print("Sport and recreation facilities data saved to 'recreation_facilities_victoria.csv'")

# Convenience Stores

In [None]:
import requests
import pandas as pd

# Bounding box used for Victoria, in Overpass order (south, west, north, east)
min_lat, min_lon, max_lat, max_lon = -39.2, 140.96, -33.9, 150.03

# Overpass query: nodes and ways whose shop tag matches
# convenience/grocery/supermarket, plus amenity=marketplace. `out center`
# attaches a representative point to each way.
overpass_query = f"""
[out:json];
(
  node["shop"~"convenience|grocery|supermarket"]({min_lat},{min_lon},{max_lat},{max_lon});
  way["shop"~"convenience|grocery|supermarket"]({min_lat},{min_lon},{max_lat},{max_lon});
  node["amenity"="marketplace"]({min_lat},{min_lon},{max_lat},{max_lon});
  way["amenity"="marketplace"]({min_lat},{min_lon},{max_lat},{max_lon});
);
out center;
"""

# Send the request to the Overpass API
overpass_url = "http://overpass-api.de/api/interpreter"
response = requests.get(overpass_url, params={'data': overpass_query})

if response.status_code != 200:
    print(f"Error: Received status code {response.status_code}")
    print("Response text:", response.text)
else:
    try:
        data = response.json()
    except ValueError as e:
        print("Error decoding JSON:", e)
        print("Response text:", response.text)
    else:
        # Only reached when decoding succeeded, so a failed decode can no
        # longer fall through and reuse a stale `data` from an earlier cell.
        convenience_stores = []
        for element in data['elements']:
            # Nodes carry lat/lon directly; with `out center`, ways carry them
            # under 'center'. Reading only the top level would drop every way.
            position = element.get('center', element)
            if 'lat' in position and 'lon' in position:
                tags = element.get('tags', {})
                convenience_stores.append({
                    'lat': position['lat'],
                    'lon': position['lon'],
                    'name': tags.get('name', 'Unknown'),
                    'shop': tags.get('shop', 'Unknown')
                })

        # Explicit columns keep the CSV header intact even when nothing matched
        convenience_stores_df = pd.DataFrame(
            convenience_stores, columns=['lat', 'lon', 'name', 'shop']
        )

        # Save the data to a CSV file
        convenience_stores_df.to_csv('../data/overpass_amenities/convenience_stores_victoria.csv', index=False)

        print("Convenience stores data saved to 'convenience_stores_victoria.csv'")

# Adding to Suburbs Dataframe

In [None]:
import geopandas as gpd
from shapely.geometry import Point

# Load the suburb (locality) shapefile
suburbs_gdf = gpd.read_file("../data/mappings/GDA94/vic_localities.shp")

# Reproject to WGS84 (EPSG:4326) so the polygons share a CRS with the amenity
# points, which are stored as lon/lat.
suburbs_gdf = suburbs_gdf.to_crs(epsg=4326)

# Centroids computed directly on lon/lat degrees are inaccurate (geopandas
# warns about it), so compute them in a projected CRS (EPSG:3111,
# GDA94 / Vicgrid) and convert the resulting points back to EPSG:4326.
suburbs_gdf['centroid'] = (
    suburbs_gdf.geometry.to_crs(epsg=3111).centroid.to_crs(epsg=4326)
)


In [None]:
# Load the train stations data
train_stations_df = pd.read_csv('../data/overpass_amenities/metro_train_stations_victoria.csv')

# Convert the CSV rows into point geometries (lon/lat, EPSG:4326)
train_stations_gdf = gpd.GeoDataFrame(
    train_stations_df,
    geometry=[Point(xy) for xy in zip(train_stations_df.lon, train_stations_df.lat)],
    crs="EPSG:4326"
)

# Build a 7.5 km buffer around each suburb centroid.
# This must happen in a projected CRS: in EPSG:4326 the buffer distance would
# be read as degrees, not metres. EPSG:3111 (GDA94 / Vicgrid) uses metres.
suburb_buffers = suburbs_gdf[['LOC_NAME', 'geometry']].to_crs("EPSG:3111")
suburb_buffers['geometry'] = suburb_buffers.geometry.centroid.buffer(7500)  # 7.5 km = 7500 metres

# Join the stations against the buffers themselves. Joining against
# suburbs_gdf would test the suburb polygons (its active geometry) and ignore
# the buffers entirely.
train_stations_in_buffers = gpd.sjoin(
    train_stations_gdf.to_crs("EPSG:3111"), suburb_buffers, how="inner", predicate="within"
)

# Count the stations that fall inside each suburb's buffer
train_station_counts = train_stations_in_buffers.groupby('LOC_NAME').size().reset_index(name='train_station_count')

# Remove leftover helper columns, and any count from a previous run of this
# cell, so that re-running does not produce *_x / *_y duplicates.
suburbs_gdf = suburbs_gdf.drop(columns=['centroid', 'buffer', 'train_station_count'], errors='ignore')

# Merge the counts back into the suburb GeoDataFrame
suburbs_gdf = suburbs_gdf.merge(train_station_counts, how='left', on='LOC_NAME')

# Suburbs with no station in range have no row in the counts table
suburbs_gdf['train_station_count'] = suburbs_gdf['train_station_count'].fillna(0)

# Display the updated suburbs GeoDataFrame with train station counts
print(suburbs_gdf[['LOC_NAME', 'train_station_count']])

In [None]:
# Load the CSV with fresh food locations
fresh_food_df = pd.read_csv('../data/overpass_amenities/fresh_food_locations_victoria.csv')

# Convert the CSV rows into point geometries (lon/lat, EPSG:4326)
fresh_food_gdf = gpd.GeoDataFrame(
    fresh_food_df,
    geometry=[Point(xy) for xy in zip(fresh_food_df.lon, fresh_food_df.lat)],
    crs="EPSG:4326"
)

# Build a 7.5 km buffer around each suburb centroid.
# This must happen in a projected CRS: in EPSG:4326 the buffer distance would
# be read as degrees, not metres. EPSG:3111 (GDA94 / Vicgrid) uses metres.
suburb_buffers = suburbs_gdf[['LOC_NAME', 'geometry']].to_crs("EPSG:3111")
suburb_buffers['geometry'] = suburb_buffers.geometry.centroid.buffer(7500)  # 7.5 km = 7500 metres

# Join the locations against the buffers themselves. Joining against
# suburbs_gdf would test the suburb polygons (its active geometry) and ignore
# the buffers entirely.
fresh_food_in_buffers = gpd.sjoin(
    fresh_food_gdf.to_crs("EPSG:3111"), suburb_buffers, how="inner", predicate="within"
)

# Count the fresh food locations that fall inside each suburb's buffer
fresh_food_counts = fresh_food_in_buffers.groupby('LOC_NAME').size().reset_index(name='fresh_food_count')

# Remove leftover helper columns, and any count from a previous run of this
# cell, so that re-running does not produce *_x / *_y duplicates.
suburbs_gdf = suburbs_gdf.drop(columns=['centroid', 'buffer', 'fresh_food_count'], errors='ignore')

# Merge the counts back into the suburb GeoDataFrame
suburbs_gdf = suburbs_gdf.merge(fresh_food_counts, how='left', on='LOC_NAME')

# Suburbs with no fresh food location in range have no row in the counts table
suburbs_gdf['fresh_food_count'] = suburbs_gdf['fresh_food_count'].fillna(0)

# Display the updated suburbs GeoDataFrame with fresh food counts
print(suburbs_gdf[['LOC_NAME', 'fresh_food_count']])

In [None]:
# Load the CSV with childcare locations
childcare_df = pd.read_csv('../data/overpass_amenities/childcare_services_victoria.csv')

# Convert the CSV rows into point geometries (lon/lat, EPSG:4326)
childcare_gdf = gpd.GeoDataFrame(
    childcare_df,
    geometry=[Point(xy) for xy in zip(childcare_df.lon, childcare_df.lat)],
    crs="EPSG:4326"
)

# Build a 7.5 km buffer around each suburb centroid.
# This must happen in a projected CRS: in EPSG:4326 the buffer distance would
# be read as degrees, not metres. EPSG:3111 (GDA94 / Vicgrid) uses metres.
suburb_buffers = suburbs_gdf[['LOC_NAME', 'geometry']].to_crs("EPSG:3111")
suburb_buffers['geometry'] = suburb_buffers.geometry.centroid.buffer(7500)  # 7.5 km = 7500 metres

# Join the locations against the buffers themselves. Joining against
# suburbs_gdf would test the suburb polygons (its active geometry) and ignore
# the buffers entirely.
childcare_in_buffers = gpd.sjoin(
    childcare_gdf.to_crs("EPSG:3111"), suburb_buffers, how="inner", predicate="within"
)

# Count the childcare locations that fall inside each suburb's buffer
childcare_counts = childcare_in_buffers.groupby('LOC_NAME').size().reset_index(name='childcare_count')

# Remove leftover helper columns, and any count from a previous run of this
# cell, so that re-running does not produce *_x / *_y duplicates.
suburbs_gdf = suburbs_gdf.drop(columns=['centroid', 'buffer', 'childcare_count'], errors='ignore')

# Merge the counts back into the suburb GeoDataFrame
suburbs_gdf = suburbs_gdf.merge(childcare_counts, how='left', on='LOC_NAME')

# Suburbs with no childcare location in range have no row in the counts table
suburbs_gdf['childcare_count'] = suburbs_gdf['childcare_count'].fillna(0)

# Display the updated suburbs GeoDataFrame with childcare counts
print(suburbs_gdf[['LOC_NAME', 'childcare_count']])

In [None]:
# Load the CSV with health services locations
health_df = pd.read_csv('../data/overpass_amenities/health_services_victoria.csv')

# Convert the CSV rows into point geometries (lon/lat, EPSG:4326)
health_gdf = gpd.GeoDataFrame(
    health_df,
    geometry=[Point(xy) for xy in zip(health_df.lon, health_df.lat)],
    crs="EPSG:4326"
)

# Build a 7.5 km buffer around each suburb centroid.
# This must happen in a projected CRS: in EPSG:4326 the buffer distance would
# be read as degrees, not metres. EPSG:3111 (GDA94 / Vicgrid) uses metres.
suburb_buffers = suburbs_gdf[['LOC_NAME', 'geometry']].to_crs("EPSG:3111")
suburb_buffers['geometry'] = suburb_buffers.geometry.centroid.buffer(7500)  # 7.5 km = 7500 metres

# Join the services against the buffers themselves. Joining against
# suburbs_gdf would test the suburb polygons (its active geometry) and ignore
# the buffers entirely.
health_in_buffers = gpd.sjoin(
    health_gdf.to_crs("EPSG:3111"), suburb_buffers, how="inner", predicate="within"
)

# Count the health services that fall inside each suburb's buffer
health_counts = health_in_buffers.groupby('LOC_NAME').size().reset_index(name='health_count')

# Remove leftover helper columns, and any count from a previous run of this
# cell, so that re-running does not produce *_x / *_y duplicates.
suburbs_gdf = suburbs_gdf.drop(columns=['centroid', 'buffer', 'health_count'], errors='ignore')

# Merge the counts back into the suburb GeoDataFrame
suburbs_gdf = suburbs_gdf.merge(health_counts, how='left', on='LOC_NAME')

# Suburbs with no health service in range have no row in the counts table
suburbs_gdf['health_count'] = suburbs_gdf['health_count'].fillna(0)

# Display the updated suburbs GeoDataFrame with health counts
print(suburbs_gdf[['LOC_NAME', 'health_count']])

In [None]:
# Load the CSV with recreation facilities locations
recreation_df = pd.read_csv('../data/overpass_amenities/recreation_facilities_victoria.csv')

# Convert the CSV rows into point geometries (lon/lat, EPSG:4326)
recreation_gdf = gpd.GeoDataFrame(
    recreation_df,
    geometry=[Point(xy) for xy in zip(recreation_df.lon, recreation_df.lat)],
    crs="EPSG:4326"
)

# Build a 7.5 km buffer around each suburb centroid.
# This must happen in a projected CRS: in EPSG:4326 the buffer distance would
# be read as degrees, not metres. EPSG:3111 (GDA94 / Vicgrid) uses metres.
suburb_buffers = suburbs_gdf[['LOC_NAME', 'geometry']].to_crs("EPSG:3111")
suburb_buffers['geometry'] = suburb_buffers.geometry.centroid.buffer(7500)  # 7.5 km = 7500 metres

# Join the facilities against the buffers themselves. Joining against
# suburbs_gdf would test the suburb polygons (its active geometry) and ignore
# the buffers entirely.
recreation_in_buffers = gpd.sjoin(
    recreation_gdf.to_crs("EPSG:3111"), suburb_buffers, how="inner", predicate="within"
)

# Count the recreation facilities that fall inside each suburb's buffer
recreation_counts = recreation_in_buffers.groupby('LOC_NAME').size().reset_index(name='recreation_count')

# Remove leftover helper columns, and any count from a previous run of this
# cell, so that re-running does not produce *_x / *_y duplicates.
suburbs_gdf = suburbs_gdf.drop(columns=['centroid', 'buffer', 'recreation_count'], errors='ignore')

# Merge the counts back into the suburb GeoDataFrame
suburbs_gdf = suburbs_gdf.merge(recreation_counts, how='left', on='LOC_NAME')

# Suburbs with no recreation facility in range have no row in the counts table
suburbs_gdf['recreation_count'] = suburbs_gdf['recreation_count'].fillna(0)

# Display the updated suburbs GeoDataFrame with recreation counts
print(suburbs_gdf[['LOC_NAME', 'recreation_count']])

In [None]:
# Load the CSV with convenience store locations
convenience_stores_df = pd.read_csv('../data/overpass_amenities/convenience_stores_victoria.csv')

# Convert the CSV rows into point geometries (lon/lat, EPSG:4326)
convenience_stores_gdf = gpd.GeoDataFrame(
    convenience_stores_df,
    geometry=[Point(xy) for xy in zip(convenience_stores_df.lon, convenience_stores_df.lat)],
    crs="EPSG:4326"
)

# Build a 7.5 km buffer around each suburb centroid.
# This must happen in a projected CRS: in EPSG:4326 the buffer distance would
# be read as degrees, not metres. EPSG:3111 (GDA94 / Vicgrid) uses metres.
suburb_buffers = suburbs_gdf[['LOC_NAME', 'geometry']].to_crs("EPSG:3111")
suburb_buffers['geometry'] = suburb_buffers.geometry.centroid.buffer(7500)  # 7.5 km = 7500 metres

# Join the stores against the buffers themselves. Joining against
# suburbs_gdf would test the suburb polygons (its active geometry) and ignore
# the buffers entirely.
convenience_stores_in_buffers = gpd.sjoin(
    convenience_stores_gdf.to_crs("EPSG:3111"), suburb_buffers, how="inner", predicate="within"
)

# Count the convenience stores that fall inside each suburb's buffer
convenience_store_counts = convenience_stores_in_buffers.groupby('LOC_NAME').size().reset_index(name='convenience_store_count')

# Remove leftover helper columns, and any count from a previous run of this
# cell, so that re-running does not produce *_x / *_y duplicates.
suburbs_gdf = suburbs_gdf.drop(columns=['centroid', 'buffer', 'convenience_store_count'], errors='ignore')

# Merge the counts back into the suburb GeoDataFrame
suburbs_gdf = suburbs_gdf.merge(convenience_store_counts, how='left', on='LOC_NAME')

# Suburbs with no convenience store in range have no row in the counts table
suburbs_gdf['convenience_store_count'] = suburbs_gdf['convenience_store_count'].fillna(0)

# Display the updated suburbs GeoDataFrame with convenience store counts
print(suburbs_gdf[['LOC_NAME', 'convenience_store_count']])

In [None]:
import pandas as pd
import geopandas as gpd
from fuzzywuzzy import process

# Load the preprocessed schools dataset
education_data = pd.read_csv('../data/overpass_amenities/schools_preprocessed.csv')  # Update the path to your dataset

# Step 1: Aggregate the per-category school counts into a single column
education_data['number_of_schools'] = (education_data['num_primary'] +
                                        education_data['num_secondary_public'] +
                                        education_data['num_secondary_private'] +
                                        education_data['num_secondary_catholic'])

# Step 2: Keep only the relevant columns. The explicit copy makes the column
# assignments below act on an independent frame instead of a slice.
education_data = education_data[['suburb', 'number_of_schools']].copy()

# Prepare suburb names for matching
education_data['suburb'] = education_data['suburb'].str.lower()  # Convert to lowercase

# Ensure suburbs_gdf is a GeoDataFrame and prepare for matching
if not isinstance(suburbs_gdf, gpd.GeoDataFrame):
    suburbs_gdf = gpd.GeoDataFrame(suburbs_gdf, geometry='geometry')

suburbs_gdf['LOC_NAME'] = suburbs_gdf['LOC_NAME'].str.lower()  # Convert to lowercase


# Step 3: Fuzzy match suburb names and create a mapping
def fuzzy_match(row, choices, scorer):
    """Return the best fuzzy match for ``row['suburb']``, or None.

    Args:
        row: Row with a ``'suburb'`` entry.
        choices: Candidate names to match against.
        scorer: fuzzywuzzy scoring function passed to ``process.extractOne``.

    Returns:
        The best-scoring candidate when its score is at least 80, otherwise
        None. Missing or blank names, and an empty candidate list, also give
        None.
    """
    name = row['suburb']
    # Missing names arrive as NaN (a float); do not hand them to the matcher
    if not isinstance(name, str) or not name.strip():
        return None
    best = process.extractOne(name, choices, scorer=scorer)
    # extractOne returns None when there is nothing to choose from
    if best is None:
        return None
    match, score = best[0], best[1]
    return match if score >= 80 else None  # Adjust threshold as necessary


# Candidate locality names (missing names excluded)
suburb_choices = suburbs_gdf['LOC_NAME'].dropna().unique()
education_data['matched_suburb'] = education_data.apply(
    fuzzy_match,
    choices=suburb_choices,
    scorer=process.fuzz.token_sort_ratio,
    axis=1
)

# Several school suburbs can map to the same locality. Sum them to one row per
# locality first; merging the raw rows would duplicate that locality in
# suburbs_gdf.
school_counts = (
    education_data.dropna(subset=['matched_suburb'])
    .groupby('matched_suburb', as_index=False)['number_of_schools']
    .sum()
)

# Drop a count left by a previous run of this cell so the merge does not
# produce *_x / *_y duplicates.
suburbs_gdf = suburbs_gdf.drop(columns=['number_of_schools'], errors='ignore')

# Step 4: Left join so every suburb is kept, matched or not
suburbs_gdf = suburbs_gdf.merge(
    school_counts,
    left_on='LOC_NAME',
    right_on='matched_suburb',
    how='left'
)

# Step 5: Suburbs with no match get 0 schools
suburbs_gdf['number_of_schools'] = suburbs_gdf['number_of_schools'].fillna(0)

# Convert the number_of_schools to integer
suburbs_gdf['number_of_schools'] = suburbs_gdf['number_of_schools'].astype(int)

# Step 6: Remove the helper columns used for matching
suburbs_gdf = suburbs_gdf.drop(columns=[
    'suburb',  # Original suburb name
    'matched_suburb'  # Column used for matching
], errors='ignore')  # errors='ignore' because 'suburb' may not be present

# suburbs_gdf now has the additional column 'number_of_schools'

In [None]:
# Display the first rows of education_data for a quick visual check
education_data.head()

In [None]:
# Display the first rows of suburbs_gdf for a quick visual check
suburbs_gdf.head()

In [None]:
import pandas as pd
from fuzzywuzzy import process

# Step 2: Prepare for fuzzy matching
# Ensure both columns are in a comparable format
merged_df['Suburbs'] = merged_df['Suburbs'].str.strip().str.title()
suburbs_gdf['LOC_NAME'] = suburbs_gdf['LOC_NAME'].str.strip().str.title()

# Step 3: Create a function to find the best match
def get_best_match(suburb, choices):
    """Return ``(best_match, score)`` for ``suburb`` among ``choices``.

    Args:
        suburb: Name to look up; non-string values (e.g. NaN) are unmatched.
        choices: Sequence of candidate names.

    Returns:
        A ``(match, score)`` tuple from ``process.extractOne``. When no match
        can be produced (non-string input, empty ``choices``, or the matcher
        returning nothing) the result is ``(None, 0)``, so callers can always
        unpack it and a score threshold filters the row out.
    """
    if not isinstance(suburb, str) or len(choices) == 0:
        return None, 0

    best = process.extractOne(suburb, choices)
    if best is None:
        return None, 0

    # Index rather than unpack: dict-like choices yield (match, score, key).
    return best[0], best[1]

# Step 4: Apply fuzzy matching
# Build the candidate list once instead of once per row. Duplicates and NaN
# are removed because they add work without changing the best match.
loc_name_choices = suburbs_gdf['LOC_NAME'].dropna().unique().tolist()
best_matches = [get_best_match(name, loc_name_choices) for name in merged_df['Suburbs']]

# Separate list comprehensions (rather than zip(*...)) also work on an empty frame.
merged_df['Best_Match'] = [match for match, _ in best_matches]
merged_df['Match_Score'] = [score for _, score in best_matches]

# Step 5: Set a threshold for matching (e.g., 80% confidence)
threshold = 80
matched_suburbs = merged_df[merged_df['Match_Score'] >= threshold]

# Step 6: Merge with the suburbs GeoDataFrame using the best matches
suburbs_gdf_merged = suburbs_gdf.merge(
    matched_suburbs,
    how='inner',  # Only keep rows where there's a match
    left_on='LOC_NAME',  # Suburb identifier in the GeoDataFrame
    right_on='Best_Match'  # Best matched suburb identifier in the rental data
)

# Optional: Drop the helper columns that were only needed for the join
suburbs_gdf_merged = suburbs_gdf_merged.drop(columns=['Suburbs', 'Best_Match', 'Match_Score'])

# Step 7: Inspect the merged DataFrame
print(suburbs_gdf_merged)

In [None]:
# Merge-suffixed duplicate columns to discard; the '_x' geometry is the one kept.
columns_to_delete = [
    'LC_PLY_PID_x', 'LOC_PID_x', 'DT_CREATE_x', 'LOC_CLASS_x', 'STATE_x',
    'LC_PLY_PID_y', 'LOC_PID_y', 'DT_CREATE_y', 'LOC_CLASS_y', 'STATE_y',
    'geometry_y',
]

# Raw rental column name -> readable label.
column_renames = {
    'MARCH2024COUNT_FLAT1': '1 Bedroom Flat Count',
    'MARCH2024MEDIAN_FLAT1': '1 Bedroom Flat Median',
    'MARCH2024COUNT_FLAT2': '2 Bedroom Flat Count',
    'MARCH2024MEDIAN_FLAT2': '2 Bedroom Flat Median',
    'MARCH2024COUNT_FLAT3': '3 Bedroom Flat Count',
    'MARCH2024MEDIAN_FLAT3': '3 Bedroom Flat Median',
    'MARCH2024COUNT_HOUSE2': '2 Bedroom House Count',
    'MARCH2024MEDIAN_HOUSE2': '2 Bedroom House Median',
    'MARCH2024COUNT_HOUSE3': '3 Bedroom House Count',
    'MARCH2024MEDIAN_HOUSE3': '3 Bedroom House Median',
    'MARCH2024COUNT_HOUSE4': '4 Bedroom House Count',
    'MARCH2024MEDIAN_HOUSE4': '4 Bedroom House Median',
}

# Amenity counts carried through to the final frame.
amenity_columns = [
    'train_station_count',
    'fresh_food_count',
    'childcare_count',
    'health_count',
    'recreation_count',
    'convenience_store_count',
    'number_of_schools',
]

# Final column order: identifier, geometry, rental labels, amenity counts.
final_columns = ['Suburb', 'geometry_x', *column_renames.values(), *amenity_columns]

# Drop the duplicates, then apply the suburb and rental renames in one pass.
suburbs_gdf_merged = (
    suburbs_gdf_merged
    .drop(columns=columns_to_delete)
    .rename(columns={'LOC_NAME_x': 'Suburb', **column_renames})
)

# Keep only the final columns and give the geometry column its plain name.
suburbs_gdf_merged = suburbs_gdf_merged[final_columns].rename(columns={'geometry_x': 'geometry'})

# Inspect the cleaned DataFrame
print(suburbs_gdf_merged.head())

In [None]:
# NOTE: a notebook cell only auto-displays its final expression, so the value
# of this head() call is discarded; only the column listing below is shown.
suburbs_gdf_merged.head()
# List the columns present after the clean-up.
print(suburbs_gdf_merged.columns)

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Step 1: Columns fed to the min-max scaler: amenity counts first...
columns_to_normalize = [
    'train_station_count', 'fresh_food_count', 'childcare_count', 'health_count',
    'recreation_count', 'convenience_store_count', 'number_of_schools',
]
# ...followed by the rental count/median pair for each dwelling type.
columns_to_normalize.extend([
    '1 Bedroom Flat Count', '1 Bedroom Flat Median',
    '2 Bedroom Flat Count', '2 Bedroom Flat Median',
    '3 Bedroom Flat Count', '3 Bedroom Flat Median',
    '2 Bedroom House Count', '2 Bedroom House Median',
    '3 Bedroom House Count', '3 Bedroom House Median',
    '4 Bedroom House Count', '4 Bedroom House Median',
])

# Scale every selected column into the scaler's default [0, 1] range.
scaler = MinMaxScaler()
normalized_values = scaler.fit_transform(suburbs_gdf_merged[columns_to_normalize])

# Wrap the scaled array in a DataFrame labelled with the source column names.
normalized_df = pd.DataFrame(data=normalized_values, columns=columns_to_normalize)

# Per-demographic weights applied to the min-max normalised columns.
# Keys of each inner dict are column names; values are multipliers.
# NOTE: the weights within a scheme are not normalised to sum to 1, and the
# rent medians carry positive weights, so a higher normalised median rent
# raises the score.
weighting_schemes = {
    'families_with_kids': {
        'train_station_count': 0.03,  # Low emphasis on train stations
        'fresh_food_count': 0.08,
        'childcare_count': 0.20,
        'health_count': 0.05,
        'recreation_count': 0.10,
        'convenience_store_count': 0.05,
        'number_of_schools': 0.4,
        '1 Bedroom Flat Count': 0.02,
        '1 Bedroom Flat Median': 0.15,  # Medians weighted well above listing counts
        '2 Bedroom Flat Count': 0.02,
        '2 Bedroom Flat Median': 0.20,  # Weight grows with flat size
        '3 Bedroom Flat Count': 0.02,
        '3 Bedroom Flat Median': 0.25,  # Weight grows with flat size
        '2 Bedroom House Count': 0.03,
        '2 Bedroom House Median': 0.25,  # Medians weighted well above listing counts
        '3 Bedroom House Count': 0.03,
        '3 Bedroom House Median': 0.30,  # Weight grows with house size
        '4 Bedroom House Count': 0.02,
        '4 Bedroom House Median': 0.35,  # Largest weight among the rental columns
    },
    'working_professionals': {
        'train_station_count': 0.15,  # Higher emphasis on transportation
        'fresh_food_count': 0.08,
        'childcare_count': 0.01,
        'health_count': 0.10,
        'recreation_count': 0.10,
        'convenience_store_count': 0.10,
        'number_of_schools': 0.2,
        '1 Bedroom Flat Count': 0.05,
        '1 Bedroom Flat Median': 0.20,  # Medians weighted well above listing counts
        '2 Bedroom Flat Count': 0.02,
        '2 Bedroom Flat Median': 0.15,
        '3 Bedroom Flat Count': 0.02,
        '3 Bedroom Flat Median': 0.20,
        '2 Bedroom House Count': 0.02,
        '2 Bedroom House Median': 0.15,
        '3 Bedroom House Count': 0.02,
        '3 Bedroom House Median': 0.20,
        '4 Bedroom House Count': 0.02,
        '4 Bedroom House Median': 0.20,
    },
    'elderly': {
        'train_station_count': 0.02,
        'fresh_food_count': 0.15,
        'childcare_count': 0.01,
        'health_count': 0.35,  # Higher weight for health services
        'recreation_count': 0.10,
        'convenience_store_count': 0.20,
        'number_of_schools': 0.05,
        '1 Bedroom Flat Count': 0.03,
        '1 Bedroom Flat Median': 0.15,
        '2 Bedroom Flat Count': 0.02,
        '2 Bedroom Flat Median': 0.15,
        '3 Bedroom Flat Count': 0.02,
        '3 Bedroom Flat Median': 0.15,
        '2 Bedroom House Count': 0.02,
        '2 Bedroom House Median': 0.20,
        '3 Bedroom House Count': 0.02,
        '3 Bedroom House Median': 0.25,
        '4 Bedroom House Count': 0.02,
        '4 Bedroom House Median': 0.1,  # Lowest median weight in this scheme
    }
}

# Function to calculate livability score based on demographic
def calculate_livability_score(df, demographic):
    """Add a weighted ``'livability_score'`` column to ``df`` and return it.

    The columns listed in the module-level ``columns_to_normalize`` are
    min-max scaled with the module-level ``scaler`` (which is re-fitted on
    ``df``), multiplied by the weights of the chosen scheme, and summed per row.

    Args:
        df: Frame containing every column in ``columns_to_normalize``.
        demographic: Key into ``weighting_schemes``.

    Returns:
        The same ``df`` object, modified in place. Calling this again on the
        same frame overwrites the previous ``'livability_score'`` values, so
        pass a copy to keep scores for several demographics side by side.

    Raises:
        KeyError: If ``demographic`` is not a key of ``weighting_schemes``.
    """
    weights = pd.Series(weighting_schemes[demographic])

    # Reuse df's own index so the score assignment below aligns row-for-row.
    # A fresh RangeIndex would misalign (or produce NaN) for any frame that
    # has been filtered or sorted.
    normalized_df = pd.DataFrame(
        scaler.fit_transform(df[columns_to_normalize]),
        columns=columns_to_normalize,
        index=df.index,
    )

    # Multiplication aligns weights to columns by name; the row sum skips NaN.
    df['livability_score'] = (normalized_df * weights).sum(axis=1)

    return df

# Example usage
# calculate_livability_score writes the score into the frame it receives and
# returns that same frame. Scoring the shared frame three times would leave all
# three results pointing at one object holding only the last scores, so the
# first two demographics are scored on copies.
suburbs_with_family_scores = calculate_livability_score(suburbs_gdf_merged.copy(), 'families_with_kids')
suburbs_with_professional_scores = calculate_livability_score(suburbs_gdf_merged.copy(), 'working_professionals')
# The last call stays in place: later cells read suburbs_gdf_merged['livability_score'],
# which therefore keeps holding the 'elderly' scores as before.
suburbs_with_elderly_scores = calculate_livability_score(suburbs_gdf_merged, 'elderly')

# Optional: Sort by livability score
suburbs_family_sorted = suburbs_with_family_scores.sort_values(by='livability_score', ascending=False)
suburbs_professional_sorted = suburbs_with_professional_scores.sort_values(by='livability_score', ascending=False)
suburbs_elderly_sorted = suburbs_with_elderly_scores.sort_values(by='livability_score', ascending=False)

In [None]:
import pandas as pd
from tabulate import tabulate

# Take the ten highest-scoring suburbs from each pre-sorted demographic frame.
top_families = suburbs_family_sorted.nlargest(10, 'livability_score')
top_professionals = suburbs_professional_sorted.nlargest(10, 'livability_score')
top_elderly = suburbs_elderly_sorted.nlargest(10, 'livability_score')

# Print one heading and one psql-style table per demographic.
for heading, ranking in (
    ("Top 10 Livability Scores for Families with Kids:", top_families),
    ("\nTop 10 Livability Scores for Working Professionals:", top_professionals),
    ("\nTop 10 Livability Scores for the Elderly:", top_elderly),
):
    print(heading)
    print(tabulate(ranking[['Suburb', 'livability_score']], headers='keys', tablefmt='psql', showindex=False))

In [None]:
import pandas as pd
import geopandas as gpd
import folium
from tabulate import tabulate
import pickle

# Take the 11 highest-scoring rows of suburbs_family_sorted
top_families = suburbs_family_sorted.nlargest(11, 'livability_score')

# Display the top 11 for Families with Kids
print("Top 11 Livability Scores for Families with Kids:")
print(tabulate(top_families[['Suburb', 'livability_score']], headers='keys', tablefmt='psql', showindex=False))

# Create a Folium map centered on Melbourne
m = folium.Map(location=[-37.8136, 144.9631], zoom_start=10)  # Centered on Melbourne

# Define a single color for the markers
marker_color = 'blue'  # Change this to your preferred color

# Add markers for the rows in positions 2 to 11.
# NOTE(review): the top-scoring row is skipped on purpose and the remaining
# rows are labelled Rank 1-10; the reason is not stated in this notebook.
for idx in range(1, len(top_families)):
    row = top_families.iloc[idx]
    suburb_name = row['Suburb']
    suburb_score = row['livability_score']

    # Look the polygon up by name in suburbs_gdf. Skip the suburb instead of
    # raising IndexError when no locality carries that name.
    matching_geoms = suburbs_gdf.loc[suburbs_gdf['LOC_NAME'] == suburb_name].geometry
    if matching_geoms.empty:
        print(f"Skipping {suburb_name}: no geometry found in suburbs_gdf")
        continue
    suburb_geom = matching_geoms.iloc[0]

    # Marker position: centroid of the suburb polygon
    centroid = suburb_geom.centroid

    # Displayed rank equals the position after dropping the first row
    rank = idx

    # Create a marker for the suburb with its rank
    folium.Marker(
        location=[centroid.y, centroid.x],
        popup=f"Rank {rank}: {suburb_name} - {suburb_score:.2f} livability score",
        icon=folium.Icon(color=marker_color, icon='info-sign')
    ).add_to(m)

    # Highlight the suburb on the map (with a transparent fill)
    folium.GeoJson(
        suburb_geom,
        style_function=lambda x: {
            'fillColor': 'lightgray',
            'color': 'black',
            'weight': 2,
            'fillOpacity': 0.4,  # Semi-transparent highlight
        },
    ).add_to(m)

display(m)

# Make sure the output folder exists before writing the HTML file.
map_path = '../data/curated/liveability/top10_livemap.html'
os.makedirs(os.path.dirname(map_path), exist_ok=True)
m.save(map_path)



In [None]:
# Number of rows in the merged suburb/rental frame.
print(len(suburbs_gdf_merged))

In [None]:
import geopandas as gpd
import folium
from folium import LinearColormap
from IPython.display import display, HTML

# Define the suburb names
suburbs_list = [
    'Box Hill', 'Melbourne CBD - North', 'Doncaster', 'Glen Waverley - East',
    'Glen Waverley - West', 'Balwyn North', 'Box Hill North', 'Balwyn',
    'Wheelers Hill', 'Doncaster East - South', 'Mount Waverley - North',
    'Mount Waverley - South', 'Melbourne CBD - West', 'Doncaster East - North',
    'Carlton', 'Wantirna South', 'Blackburn', 'Vermont South',
    'Templestowe', 'Docklands', 'Glen Waverley'
]

# Ensure suburbs_gdf_merged is a GeoDataFrame
if not isinstance(suburbs_gdf_merged, gpd.GeoDataFrame):
    suburbs_gdf_merged = gpd.GeoDataFrame(suburbs_gdf_merged, geometry='geometry')

# Folium expects WGS84 lon/lat. Label the data when it has no CRS (assumes the
# coordinates already are lon/lat, as before); reproject when it carries a
# different CRS, which set_crs alone would reject with a ValueError.
if suburbs_gdf_merged.crs is None:
    suburbs_gdf_merged = suburbs_gdf_merged.set_crs("EPSG:4326")
else:
    suburbs_gdf_merged = suburbs_gdf_merged.to_crs("EPSG:4326")

# Filter the GeoDataFrame to include only the listed suburbs
result_suburbs = suburbs_gdf_merged[suburbs_gdf_merged['Suburb'].isin(suburbs_list)]

# Sort by livability score for better clarity
result_suburbs = result_suburbs.sort_values('livability_score', ascending=False)

# Base map centred on Melbourne.
m = folium.Map(location=[-37.8136, 144.9631], zoom_start=10)

# Colour ramp spanning the score range of the selected suburbs.
selected_scores = result_suburbs['livability_score']
color_map = LinearColormap(
    colors=['#f9d0c8', '#bd6877', '#4c4557'],
    vmin=selected_scores.min(),
    vmax=selected_scores.max(),
)

# One shaded polygon plus one centroid marker per selected suburb.
for suburb_geom, suburb_label, suburb_score in zip(
    result_suburbs['geometry'],
    result_suburbs['Suburb'],
    result_suburbs['livability_score'],
):
    # The default argument pins this suburb's score to its own style function.
    folium.GeoJson(
        suburb_geom,
        style_function=lambda _feature, score=suburb_score: {
            'fillColor': color_map(score),
            'color': 'black',
            'weight': 2,
            'fillOpacity': 0.6,
        },
    ).add_to(m)

    marker_point = suburb_geom.centroid
    folium.Marker(
        location=[marker_point.y, marker_point.x],
        popup=f"{suburb_label}: {suburb_score:.2f} livability score",
        icon=folium.Icon(color='blue'),
    ).add_to(m)

# Attach the colour legend to the map.
color_map.add_to(m)

# Prepare the DataFrame for display.
# Take an explicit copy: formatting a column on a slice of result_suburbs
# triggers pandas' SettingWithCopyWarning and may not behave as intended.
styled_table = result_suburbs[['Suburb', 'livability_score']].copy()

# Render the scores as two-decimal strings for display
styled_table['livability_score'] = styled_table['livability_score'].map('{:.2f}'.format)

# Create an HTML table string
html_table = styled_table.to_html(
    index=False,
    classes='table table-striped',
    border=0,
    justify='center',
    col_space=100,
)

# Add custom CSS for styling
styled_html = f'''
<style>
.table {{
    width: 100%;
    border-collapse: collapse;
}}
.table th, .table td {{
    border: 1px solid black;
    padding: 8px;
    text-align: center;
}}
.table th {{
    background-color: #0f0f0f;
    font-weight: bold;
    text-align: left;
}}
</style>
{html_table}
'''

# Display the styled table
display(HTML(styled_html))

# Display the map
display(m)  # This will render the map in the notebook

In [None]:
import geopandas as gpd
import pandas as pd

# Ensure suburbs_gdf_merged is a GeoDataFrame
if not isinstance(suburbs_gdf_merged, gpd.GeoDataFrame):
    suburbs_gdf_merged = gpd.GeoDataFrame(suburbs_gdf_merged, geometry='geometry')

# Sort the suburbs by livability_score in descending order
suburbs_gdf_sorted = suburbs_gdf_merged[['Suburb', 'livability_score']].sort_values(by='livability_score', ascending=False)

# Remove the first row.
# NOTE(review): the top-scoring suburb is dropped before ranking; the reason is
# not stated in this notebook. The copy makes the slice an independent frame,
# so adding the Rank column below does not trigger SettingWithCopyWarning.
suburbs_gdf_sorted = suburbs_gdf_sorted.iloc[1:].copy()

# Add a rank column to assign ranks based on the sorted livability scores
suburbs_gdf_sorted['Rank'] = range(1, len(suburbs_gdf_sorted) + 1)

# Save the result to a CSV file, with ranks included
csv_file_path = "../data/curated/liveability/output_suburbs_livability_ranked.csv"  # Set the path where you want to save the CSV
os.makedirs(os.path.dirname(csv_file_path), exist_ok=True)  # to_csv does not create missing folders
suburbs_gdf_sorted.to_csv(csv_file_path, index=False)

# Confirmation message
print(f"CSV file has been saved to {csv_file_path}")

# No Rental

In [None]:
# List the columns currently present on suburbs_gdf.
print(suburbs_gdf.columns)

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Step 1: Normalize the amenity columns (no rental columns in this variant).
# 'number_of_schools' is included because every weighting scheme in this cell
# assigns it a weight; a weighted column that is not normalised here is
# silently left out of the score.
columns_to_normalize = [
    'train_station_count', 'fresh_food_count', 'childcare_count', 'health_count',
    'recreation_count', 'convenience_store_count', 'number_of_schools'
]

scaler = MinMaxScaler()
normalized_values = scaler.fit_transform(suburbs_gdf[columns_to_normalize])

# Create a DataFrame with the normalized values
normalized_df = pd.DataFrame(normalized_values, columns=columns_to_normalize)

# Per-demographic weights for the amenity-only (no rental) score.
# Keys of each inner dict are column names; values are multipliers.
# NOTE: the weights within a scheme are not normalised to sum to 1.
weighting_schemes = {
    'families_with_kids': {
        'train_station_count': 0.03,  # Low emphasis on train stations
        'fresh_food_count': 0.08,
        'childcare_count': 0.20,
        'health_count': 0.05,
        'recreation_count': 0.10,
        'convenience_store_count': 0.05,
        'number_of_schools': 0.4,  # Largest weight in this scheme
    },
    'working_professionals': {
        'train_station_count': 0.15,  # Higher emphasis on transportation
        'fresh_food_count': 0.08,
        'childcare_count': 0.01,
        'health_count': 0.10,
        'recreation_count': 0.10,
        'convenience_store_count': 0.10,
        'number_of_schools':0.2,
    },
    'elderly': {
        'train_station_count': 0.02,
        'fresh_food_count': 0.15,
        'childcare_count': 0.01,
        'health_count': 0.35,  # Higher weight for health services
        'recreation_count': 0.10,
        'convenience_store_count': 0.20,
        'number_of_schools': 0.05,
    }
}

# Function to calculate livability score based on demographic
def calculate_livability_score(df, demographic):
    """Add a weighted ``'livability_score'`` column to ``df`` and return it.

    The columns listed in the module-level ``columns_to_normalize`` are
    min-max scaled with the module-level ``scaler`` (which is re-fitted on
    ``df``), multiplied by the weights of the chosen scheme, and summed per row.
    A weight whose column is not in ``columns_to_normalize`` contributes
    nothing to the score.

    Args:
        df: Frame containing every column in ``columns_to_normalize``.
        demographic: Key into ``weighting_schemes``.

    Returns:
        The same ``df`` object, modified in place. Calling this again on the
        same frame overwrites the previous ``'livability_score'`` values, so
        pass a copy to keep scores for several demographics side by side.

    Raises:
        KeyError: If ``demographic`` is not a key of ``weighting_schemes``.
    """
    weights = pd.Series(weighting_schemes[demographic])

    # Reuse df's own index so the score assignment below aligns row-for-row.
    # A fresh RangeIndex would misalign (or produce NaN) for any frame that
    # has been filtered or sorted.
    normalized_df = pd.DataFrame(
        scaler.fit_transform(df[columns_to_normalize]),
        columns=columns_to_normalize,
        index=df.index,
    )

    # Multiplication aligns weights to columns by name; the row sum skips NaN.
    df['livability_score'] = (normalized_df * weights).sum(axis=1)

    return df

# Example usage
# calculate_livability_score writes the score into the frame it receives and
# returns that same frame. Scoring the shared frame three times would leave all
# three results pointing at one object holding only the last scores, so the
# first two demographics are scored on copies.
suburbs_with_family_scores = calculate_livability_score(suburbs_gdf.copy(), 'families_with_kids')
suburbs_with_professional_scores = calculate_livability_score(suburbs_gdf.copy(), 'working_professionals')
# The last call stays in place: later cells read suburbs_gdf['livability_score'],
# which therefore keeps holding the 'elderly' scores as before.
suburbs_with_elderly_scores = calculate_livability_score(suburbs_gdf, 'elderly')

# Optional: Sort by livability score
suburbs_family_sorted = suburbs_with_family_scores.sort_values(by='livability_score', ascending=False)
suburbs_professional_sorted = suburbs_with_professional_scores.sort_values(by='livability_score', ascending=False)
suburbs_elderly_sorted = suburbs_with_elderly_scores.sort_values(by='livability_score', ascending=False)

In [None]:
import pandas as pd
from tabulate import tabulate

# Take the ten highest-scoring suburbs from each pre-sorted demographic frame.
top_families = suburbs_family_sorted.nlargest(10, 'livability_score')
top_professionals = suburbs_professional_sorted.nlargest(10, 'livability_score')
top_elderly = suburbs_elderly_sorted.nlargest(10, 'livability_score')

# Print one heading and one psql-style table per demographic.
for heading, ranking in (
    ("Top 10 Livability Scores for Families with Kids:", top_families),
    ("\nTop 10 Livability Scores for Working Professionals:", top_professionals),
    ("\nTop 10 Livability Scores for the Elderly:", top_elderly),
):
    print(heading)
    print(tabulate(ranking[['LOC_NAME', 'livability_score']], headers='keys', tablefmt='psql', showindex=False))

In [None]:
# List the columns currently present on suburbs_gdf.
print(suburbs_gdf.columns)

In [None]:
import geopandas as gpd
import pandas as pd
from shapely.ops import nearest_points
from shapely.geometry import Point
import folium
from folium import LinearColormap
from IPython.display import display, HTML

def find_closest_suburb(point, gdf):
    """Return the index label of the geometry in ``gdf`` nearest to ``point``.

    Distances come from ``gdf.geometry.distance(point)``; the label of the
    smallest one is returned.
    """
    distances = gdf.geometry.distance(point)
    nearest_label = distances.idxmin()
    return nearest_label

# Check for the geometry column first: the GeoDataFrame constructor below
# needs it, so testing afterwards could never report the intended error.
if 'geometry' not in suburbs_gdf.columns:
    raise KeyError("The 'geometry' column is missing from suburbs_gdf.")

# Ensure suburbs_gdf is a GeoDataFrame
if not isinstance(suburbs_gdf, gpd.GeoDataFrame):
    suburbs_gdf = gpd.GeoDataFrame(suburbs_gdf, geometry='geometry')

# Folium expects WGS84 lon/lat. Label the data when it has no CRS (assumes the
# coordinates already are lon/lat, as before); reproject when it carries a
# different CRS, which set_crs alone would reject with a ValueError.
if suburbs_gdf.crs is None:
    suburbs_gdf = suburbs_gdf.set_crs("EPSG:4326")
else:
    suburbs_gdf = suburbs_gdf.to_crs("EPSG:4326")

# Rank all suburbs by whatever is currently stored in 'livability_score'
# (rank 1 = highest score; ties share the lowest rank of their group)
suburbs_gdf['Overall_Rank'] = suburbs_gdf['livability_score'].rank(ascending=False, method='min')

# List of suburbs from the image
suburbs_list = [
    'Mickleham', 'Yuroke', 'Tarneit', 'Wollert', 'Rockbank',
    'Mount Cottrell', 'Cranbourne East', 'Truganina', 'Clyde North',
    'Cranbourne West', 'Craigieburn', 'Cobblebank', 'Strathtulloh',
    'Lynbrook', 'Lyndhurst', 'Pakenham', 'Cranbourne'
]

# Keep the listed suburbs that exist in suburbs_gdf.
# A name absent from suburbs_gdf has no geometry to measure distances from, so
# a nearest-suburb fallback cannot work for it (indexing the empty selection
# raised IndexError). Such names are reported and skipped instead.
known_loc_names = set(suburbs_gdf['LOC_NAME'].dropna())
missing_suburbs = [suburb for suburb in suburbs_list if suburb not in known_loc_names]
if missing_suburbs:
    print(f"Suburbs not found in suburbs_gdf and skipped: {missing_suburbs}")

result_suburbs = [suburb for suburb in suburbs_list if suburb in known_loc_names]

# Remove duplicates (keeping list order) and take the first 10 of the selected list
result_suburbs = list(dict.fromkeys(result_suburbs))[:10]

# Columns shown for the selected suburbs, with Overall_Rank next to the name.
display_columns = [
    'LOC_NAME', 'Overall_Rank', 'livability_score', 'train_station_count',
    'fresh_food_count', 'childcare_count',
    'health_count', 'recreation_count', 'geometry',
]

# Selected suburbs only, best score first, at most ten rows.
is_selected = suburbs_gdf['LOC_NAME'].isin(result_suburbs)
filtered_gdf = suburbs_gdf[is_selected]
filtered_gdf = filtered_gdf.sort_values('livability_score', ascending=False).head(10)
filtered_gdf = filtered_gdf[display_columns]

# Base map centred on Melbourne.
m = folium.Map(location=[-37.8136, 144.9631], zoom_start=10)

# Colour ramp from zero up to the best score among the selected suburbs.
color_map = LinearColormap(
    colors=['#f9d0c8', '#bd6877', '#4c4557'],
    vmin=0,
    vmax=filtered_gdf['livability_score'].max(),
)

# One fully opaque polygon plus one centroid marker per selected suburb.
for suburb_geom, suburb_score in zip(filtered_gdf['geometry'], filtered_gdf['livability_score']):
    # The default argument pins this suburb's score to its own style function.
    folium.GeoJson(
        suburb_geom,
        style_function=lambda _feature, score=suburb_score: {
            'fillColor': color_map(score),
            'color': 'black',
            'weight': 2,
            'fillOpacity': 1,
        },
    ).add_to(m)

    marker_point = suburb_geom.centroid
    folium.Marker(
        location=[marker_point.y, marker_point.x],
        icon=folium.Icon(color='blue'),
    ).add_to(m)

# Attach the colour legend to the map.
color_map.add_to(m)

# Prepare the DataFrame for display.
# Take an explicit copy: formatting a column on a slice of filtered_gdf
# triggers pandas' SettingWithCopyWarning and may not behave as intended.
styled_table = filtered_gdf[['LOC_NAME', 'Overall_Rank', 'livability_score', 'train_station_count',
                              'fresh_food_count', 'childcare_count',
                              'health_count', 'recreation_count']].copy()

# Render the scores as two-decimal strings for display
styled_table['livability_score'] = styled_table['livability_score'].map('{:.2f}'.format)

# Create a HTML table string
html_table = styled_table.to_html(
    index=False,
    classes='table table-striped',
    border=0,
    justify='center',
    col_space=100,
)

# Add custom CSS for styling
styled_html = f'''
<style>
.table {{
    width: 100%;
    border-collapse: collapse;
}}
.table th, .table td {{
    border: 1px solid black;
    padding: 8px;
    text-align: center;
}}
.table th {{
    background-color: #0f0f0f;
    font-weight: bold;
    text-align: center;
}}
</style>
{html_table}
'''

# Display the styled table
display(HTML(styled_html))

# Display the map
display(m)  # This will render the map in the notebook

In [None]:
import geopandas as gpd
import folium
from folium import LinearColormap
from IPython.display import display

# Check for the geometry column first: the GeoDataFrame constructor below
# needs it, so testing afterwards could never report the intended error.
if 'geometry' not in suburbs_gdf.columns:
    raise KeyError("The 'geometry' column is missing from suburbs_gdf.")

# Ensure suburbs_gdf is a GeoDataFrame
if not isinstance(suburbs_gdf, gpd.GeoDataFrame):
    suburbs_gdf = gpd.GeoDataFrame(suburbs_gdf, geometry='geometry')

# Folium expects WGS84 lon/lat. Label the data when it has no CRS (assumes the
# coordinates already are lon/lat, as before); reproject when it carries a
# different CRS, which set_crs alone would reject with a ValueError.
if suburbs_gdf.crs is None:
    suburbs_gdf = suburbs_gdf.set_crs("EPSG:4326")
else:
    suburbs_gdf = suburbs_gdf.to_crs("EPSG:4326")

# Rank all suburbs by livability_score (rank 1 = highest; ties share the lowest rank)
suburbs_gdf['Overall_Rank'] = suburbs_gdf['livability_score'].rank(ascending=False, method='min')

# Filter top 10 most livable suburbs
top_10_suburbs = suburbs_gdf.sort_values('livability_score', ascending=False).head(10)

# Base map centred on Melbourne.
m = folium.Map(location=[-37.8136, 144.9631], zoom_start=10)

# Colour ramp covering the full score range across all suburbs.
all_scores = suburbs_gdf['livability_score']
color_map = LinearColormap(
    colors=['#f9d0c8', '#bd6877', '#4c4557'],
    vmin=all_scores.min(),
    vmax=all_scores.max(),
)

# Shade every suburb by its score and attach a popup with its name and score.
for suburb_geom, loc_name, suburb_score in zip(
    suburbs_gdf['geometry'],
    suburbs_gdf['LOC_NAME'],
    suburbs_gdf['livability_score'],
):
    # The default argument pins this suburb's score to its own style function.
    geo_json = folium.GeoJson(
        suburb_geom,
        style_function=lambda _feature, score=suburb_score: {
            'fillColor': color_map(score),
            'color': 'black',
            'weight': 1,
            'fillOpacity': 0.7,
        },
    )
    folium.Popup(f"{loc_name}: {suburb_score:.2f}").add_to(geo_json)
    geo_json.add_to(m)

# Green centroid markers for the ten highest-scoring suburbs.
for suburb_geom, loc_name, suburb_score, overall_rank in zip(
    top_10_suburbs['geometry'],
    top_10_suburbs['LOC_NAME'],
    top_10_suburbs['livability_score'],
    top_10_suburbs['Overall_Rank'],
):
    marker_point = suburb_geom.centroid
    folium.Marker(
        location=[marker_point.y, marker_point.x],
        popup=f"Top {int(overall_rank)}: {loc_name} (Score: {suburb_score:.2f})",
        icon=folium.Icon(color='green', icon='info-sign'),
    ).add_to(m)

# Caption the legend and attach it to the map.
color_map.caption = 'Livability Score of Melbourne Suburbs'
color_map.add_to(m)

# Display the map
display(m)