In [1]:
import os
import geopandas as gpd
import folium
from folium.plugins import MarkerCluster
from pyspark.sql import SparkSession

# Relative path to the FOI_POINT shapefile
shp_point_path = "../../data/landing/three_external/extracted/ll_gda2020/esrishape/whole_of_dataset/victoria/VMFEAT/FOI_POINT.shp"

# Load the shapefile
gdf = gpd.read_file(shp_point_path)

# Keep only the 'library' and 'tourist attraction' subfeatures
multipoint_gdf = gdf[gdf['FEATSUBTYP'].isin(['library', 'tourist attraction'])]

# Reproject to the projected coordinate system EPSG:3111 so that centroids are
# computed on planar coordinates rather than on raw longitude/latitude
multipoint_gdf = multipoint_gdf.to_crs(epsg=3111)

# Centroid of each geometry (reduces a MultiPoint to a single representative point)
multipoint_gdf['centroid'] = multipoint_gdf.geometry.centroid

# Reproject the centroids back to GDA2020 (EPSG:7844) once, then read both
# coordinates from that single result instead of reprojecting per coordinate
centroid_lonlat = multipoint_gdf['centroid'].to_crs(epsg=7844)
multipoint_gdf['latitude'] = centroid_lonlat.y
multipoint_gdf['longitude'] = centroid_lonlat.x

# Count and print the number of rows with a missing 'NAME'
name_missing = multipoint_gdf['NAME'].isna()
empty_name_count = name_missing.sum()
print(f"Number of rows with empty NAME: {empty_name_count}")

# Give unnamed records a placeholder name matching their subfeature type.
# The mask taken above is reused: the two fills target disjoint subtypes, so the
# first fill cannot change which rows the second one selects.
multipoint_gdf.loc[name_missing & (multipoint_gdf['FEATSUBTYP'] == 'library'), 'NAME'] = 'Unnamed Library'
multipoint_gdf.loc[name_missing & (multipoint_gdf['FEATSUBTYP'] == 'tourist attraction'), 'NAME'] = 'Unnamed Tourist Attraction'

# Keep only the columns needed downstream
multipoint_gdf_cleaned = multipoint_gdf[['NAME', 'FEATSUBTYP', 'latitude', 'longitude']]


Number of rows with empty NAME: 11


## preprocessing step

In [2]:
# Column groups used by the checks below
key_cols = ['NAME', 'latitude', 'longitude']
coord_cols = ['latitude', 'longitude']

# 1. Report rows that repeat an earlier (NAME, latitude, longitude) combination,
#    then keep only the first occurrence of each combination
is_duplicate = multipoint_gdf_cleaned.duplicated(subset=key_cols)
duplicate_count = is_duplicate.sum()
print(f"Number of duplicate rows: {duplicate_count}")
multipoint_gdf_cleaned = multipoint_gdf_cleaned.loc[~is_duplicate]

# 2. Report missing values per key column, then discard rows lacking a coordinate
missing_values_count = multipoint_gdf_cleaned[key_cols].isna().sum()
print(f"Missing values:\n{missing_values_count}")
multipoint_gdf_cleaned = multipoint_gdf_cleaned.dropna(subset=coord_cols)

# 3. Coordinates must satisfy -90 <= latitude <= 90 and -180 <= longitude <= 180.
#    Rows with NaN coordinates were removed in step 2, so "not in range" here
#    means strictly outside those bounds.
lat_in_range = multipoint_gdf_cleaned['latitude'].between(-90, 90)
lon_in_range = multipoint_gdf_cleaned['longitude'].between(-180, 180)
coords_in_range = lat_in_range & lon_in_range

invalid_lat_lon = multipoint_gdf_cleaned[~coords_in_range]
print(f"Number of invalid latitude/longitude rows: {len(invalid_lat_lon)}")

# Keep only the rows whose coordinates are in range
multipoint_gdf_cleaned = multipoint_gdf_cleaned[coords_in_range]

Number of duplicate rows: 0
Missing values:
NAME         0
latitude     0
longitude    0
dtype: int64
Number of invalid latitude/longitude rows: 0


## save data

In [4]:
# Output directory (relative path); created on first use, reused if it already exists
save_data_dir = "../../data/raw/three_external"
os.makedirs(save_data_dir, exist_ok=True)

# Write one CSV per subfeature type: 'library' and 'tourist attraction'
for feature in ['library', 'tourist attraction']:
    # Rows belonging to the current subfeature type
    is_feature = multipoint_gdf_cleaned['FEATSUBTYP'].eq(feature)
    feature_gdf = multipoint_gdf_cleaned.loc[is_feature]

    # File is named after the subfeature type; the index is not written
    save_path = os.path.join(save_data_dir, feature + '_data.csv')
    feature_gdf.to_csv(save_path, index=False)
    print(f"Data for {feature} saved to {save_path}")

Data for library saved to ../../data/raw/three_external/library_data.csv
Data for tourist attraction saved to ../../data/raw/three_external/tourist attraction_data.csv


## show on map

In [6]:
# Directory that holds the per-feature CSV files
save_data_dir = "../../data/raw/three_external"

# Start a SparkSession for this application, or reuse one that is already running
spark_builder = SparkSession.builder.appName("Library and Tourist Attraction Mapping")
spark = spark_builder.getOrCreate()

# Function to generate and display the map (combining PySpark and Folium)
def generate_map(spark_df, feature_type, zoom_start=10):
    """Build a Folium map with one clustered marker per row of ``spark_df``.

    Args:
        spark_df: DataFrame exposing ``toPandas()`` (a PySpark DataFrame in this
            notebook) with ``NAME``, ``FEATSUBTYP``, ``latitude`` and
            ``longitude`` columns.
        feature_type: Label shown in the map title.
        zoom_start: Initial zoom level of the map. Defaults to 10.

    Returns:
        The populated ``folium.Map``.

    Raises:
        ValueError: If ``spark_df`` contains no rows, since the map centre is
            the mean of the coordinates and is undefined without data.
    """
    # Local import so the function carries its own dependency
    from html import escape

    # Folium works on in-memory data, so collect the Spark rows into pandas
    pandas_df = spark_df.toPandas()
    if pandas_df.empty:
        raise ValueError(f"No rows to plot for {feature_type!r}")

    # Centre the map on the average position of all points
    center_lat = pandas_df['latitude'].mean()
    center_lon = pandas_df['longitude'].mean()
    map_object = folium.Map(location=[center_lat, center_lon], zoom_start=zoom_start)

    # Cluster the markers so dense areas stay readable
    marker_cluster = MarkerCluster().add_to(map_object)

    # Iterate column-wise instead of building a Series per row with iterrows()
    rows = zip(
        pandas_df['NAME'],
        pandas_df['FEATSUBTYP'],
        pandas_df['latitude'],
        pandas_df['longitude'],
    )
    for name, subtype, lat, lon in rows:
        # The popup string is rendered as HTML, so data values are escaped;
        # only the <br> separator is intended as markup
        popup_text = f"Name: {escape(str(name))}<br>Type: {escape(str(subtype))}"
        folium.Marker(
            location=[lat, lon],
            popup=popup_text,
            icon=folium.Icon(icon="info-sign", color="blue")
        ).add_to(marker_cluster)

    # Add a title to the map (the label is escaped for the same reason as the popups)
    title_html = (
        '<h3 align="center" style="font-size:16px">'
        f'<b>{escape(str(feature_type))} Distribution Map</b></h3>'
    )
    map_object.get_root().html.add_child(folium.Element(title_html))

    return map_object

# Read the CSV file of each feature type ('library', 'tourist attraction') and
# display its map. The try/finally guarantees the SparkSession is stopped even
# if reading a file or building a map raises.
try:
    for feature in ['library', 'tourist attraction']:
        # Read the CSV file into a Spark DataFrame (header row, inferred column types)
        csv_file_path = os.path.join(save_data_dir, f'{feature}_data.csv')
        spark_df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

        # Generate the map for this feature type
        map_object = generate_map(spark_df, feature)

        # Display the map in Jupyter Notebook (no need to save)
        display(map_object)
finally:
    # Stop the SparkSession
    spark.stop()



your 131072x1 screen size is bogus. expect trouble
24/09/07 19:14:02 WARN Utils: Your hostname, LAPTOP-1H9MAQ2V resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/09/07 19:14:02 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/07 19:14:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
