In [7]:
import pandas as pd
import geopandas as gpd

# Directories
data_dir = '../data/'
landing_dir = data_dir + 'landing/'
raw_dir = data_dir + 'raw/'
curated_dir = data_dir + 'curated/'

In [36]:

def feat_sf (shapefile, feature_name, feat_type = None, feat_subtypes = None):
    """
    Cleaning shapefiles and dataframes for features we want.

    Args:
        shapefile (gpd.Geodataframe or pd.dataframe): the file with information on neighbourhood features
        feature_name (str): name of the feature
        feat_type (str or list, optional): any specific types of feature we want. Defaults to None.
        feat_subtypes (list, optional): feature subtypes, for example, a chicken is a subtype of a bird . Defaults to None.

    Raises:
        ValueError: feature name is wrong and not mentioned

    Returns:
        gpd.Geodataframe or pd.dataframe: the cleaned shapefile or dataframe
    """
    
    #Removing irrelevant features
    if feature_name in ("shopping", "parks", "hospital") and feat_type is not None and feat_subtypes is not None:
        # We only want features in VIC
        filtered_sf = shapefile[shapefile['STATE'] == "VIC"]
        filtered_sf = filtered_sf[filtered_sf['FTYPE'] == feat_type]
        filtered_sf = filtered_sf[filtered_sf['FEATSUBTYP'].isin(feat_subtypes)]
        
    elif feature_name == "train_station":
        filtered_sf = shapefile[shapefile['STATUS'] == "Active"]
        # Renaming columns for ease of use for future functions
        filtered_sf = filtered_sf.rename(columns={'STATION': 'NAME'})
    
    elif feature_name in ("primary_school", "secondary_school") and feat_type is not None:
        filtered_sf = shapefile[shapefile['School_Type'].isin(feat_type)]
        # Renaming columns for ease of use for future functions
        filtered_sf = filtered_sf.rename(columns={'School_Name': 'NAME'})
        filtered_sf = filtered_sf.rename(columns={'Y': 'latitude'})
        filtered_sf = filtered_sf.rename(columns={'X': 'longitude'})
        filtered_sf = filtered_sf.dropna(subset=['latitude', 'longitude']).copy()
        
        # As the df for school data is just a dataframe, we do not need to convert polygons into coordinates
        return filtered_sf.reset_index(drop=True)
    else:
        # Handle cases where feature_name does not match any known types
        raise ValueError("Invalid feature_name provided.")
        
    # Setting shapefile format
    filtered_sf['geometry'] = filtered_sf['geometry'].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
    
    # Creating an array of centroids of polygons in the feature shapefiles
    filtered_sf['centroid'] = filtered_sf['geometry'].centroid.apply(lambda geom: (geom.y, geom.x))
    filtered_sf['latitude'] = filtered_sf['centroid'].apply(lambda coord: coord[0])
    filtered_sf['longitude'] = filtered_sf['centroid'].apply(lambda coord: coord[1])
    filtered_sf = filtered_sf.dropna(subset=['latitude', 'longitude']).copy()

    return filtered_sf.reset_index(drop=True)  
    
foi_sf = gpd.read_file(f"{landing_dir}FOI/GEOMARK_POLYGON.shp")

shopping_type = "commercial facility"
shopping_feature = "shopping"
shopping_labels = ["shopping precinct", "shopping centre"]

shopping_sf = feat_sf(foi_sf, shopping_feature, shopping_type, shopping_labels)

parks_type = "reserve"
parks_feature = "parks"
parks_labels = ["park", "conservation park", "gardens", "national park", "city square"]

parks_sf = feat_sf(foi_sf, parks_feature, parks_type, parks_labels)

hospital_type = "hospital"
hospital_feature = "hospital"
hospital_labels = ["hospital complex"]

hospital_sf = feat_sf(foi_sf, hospital_feature, hospital_type, hospital_labels)



  filtered_sf['centroid'] = filtered_sf['geometry'].centroid.apply(lambda geom: (geom.y, geom.x))

  filtered_sf['centroid'] = filtered_sf['geometry'].centroid.apply(lambda geom: (geom.y, geom.x))

  filtered_sf['centroid'] = filtered_sf['geometry'].centroid.apply(lambda geom: (geom.y, geom.x))


In [39]:
sa2_path = f'{data_dir}sa2_shapefile/SA2_2021_AUST_GDA2020.shp'
gdf = gpd.read_file(sa2_path)
victoria_gdf = gdf[gdf['STE_CODE21'] == '2']
victoria_gdf = victoria_gdf.to_crs(epsg=4326)

def feature_vic_merge(feature_gdf, victoria_gdf, feature_name):
    # Perform a spatial join to assign each {feature_name} to its respective SA2 area (use 'predicate' instead of 'op')
    features_in_sa2 = gpd.sjoin(feature_gdf, victoria_gdf, how='inner', predicate='within')

    # Count the number of features in each SA2 area
    features_per_sa2 = features_in_sa2.groupby('SA2_NAME21').size().reset_index(name=f'{feature_name}_count')

    # Merge the {feature_name} counts with the Victoria SA2 GeoDataFrame
    victoria_gdf = victoria_gdf.merge(features_per_sa2, on='SA2_NAME21', how='left')

    # Fill NaN values with 0 (areas with no features)
    victoria_gdf[f'{feature_name}_count'] = victoria_gdf[f'{feature_name}_count'].fillna(0)
    return victoria_gdf.to_crs(epsg=4326)

shopping_victoria_gdf = feature_vic_merge(shopping_sf, victoria_gdf, shopping_feature)
parks_victoria_gdf = feature_vic_merge(parks_sf, victoria_gdf, parks_feature)
hospital_victoria_gdf = feature_vic_merge(hospital_sf, victoria_gdf, hospital_feature)

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: +proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs + ...
Right CRS: EPSG:4326

  features_in_sa2 = gpd.sjoin(feature_gdf, victoria_gdf, how='inner', predicate='within')
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: +proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs + ...
Right CRS: EPSG:4326

  features_in_sa2 = gpd.sjoin(feature_gdf, victoria_gdf, how='inner', predicate='within')
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: +proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs + ...
Right CRS: EPSG:4326

  features_in_sa2 = gpd.sjoin(feature_gdf, victoria_gdf, how='inner', predicate='within')


In [40]:
shopping_victoria_gdf.to_csv(f"{data_dir}{shopping_feature}_count.csv", index=False)
parks_victoria_gdf.to_csv(f"{data_dir}{parks_feature}_count.csv", index=False)
hospital_victoria_gdf.to_csv(f"{data_dir}{hospital_feature}_count.csv", index=False)