# Apache Sedona Example Notebook

This notebook demonstrates basic Apache Sedona functionality for spatial data processing.

## Setup and Initialization

In [None]:
from pyspark.sql import SparkSession
from sedona.register import SedonaRegistrator
from sedona.utils import SedonaKryoRegistrator, KryoSerializer
import geopandas as gpd
import matplotlib.pyplot as plt
import folium

In [None]:
# Initialize Spark with Sedona.
#
# Maven coordinates fetched at session start-up:
#   * sedona-spark-shaded - Sedona core/SQL plus the Python bridge in one jar
#     (the separate sedona-python-adapter artifact is not needed alongside it).
#   * sedona-viz          - visualization extensions.
#   * geotools-wrapper    - GeoTools classes the shaded jar expects to find on
#     the classpath (CRS handling, shapefile readers, ...).
SEDONA_PACKAGES = ",".join([
    "org.apache.sedona:sedona-spark-shaded-3.0_2.12:1.4.1",
    "org.apache.sedona:sedona-viz-3.0_2.12:1.4.1",
    "org.datasyslab:geotools-wrapper:1.4.0-28.2",
])

# Kryo serialization with Sedona's registrator is configured so Spark can
# serialize geometry objects. `getName` is read as an attribute, not called.
spark = (
    SparkSession.builder
    .appName("SedonaExample")
    .config("spark.serializer", KryoSerializer.getName)
    .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
    .config("spark.jars.packages", SEDONA_PACKAGES)
    .getOrCreate()
)

# Register Sedona's ST_* SQL functions on this session.
# NOTE(review): newer Sedona releases favour SedonaContext.create(spark) over
# SedonaRegistrator - confirm before upgrading the Sedona version.
SedonaRegistrator.registerAll(spark)

## Creating Spatial Data

In [None]:
# Create sample spatial data: 100 points with random lon/lat over the whole
# globe (longitude in [-180, 180), latitude in [-90, 90)) and a random integer
# value in [0, 100).
#
# * Every RAND call gets its own fixed seed so a Restart & Run All produces the
#   same points again (assuming the same Spark partitioning - TODO confirm on
#   your cluster).
# * The name is derived from the row id, so it is unique per point; later cells
#   use `name` to tell points apart.
sample_points = spark.sql("""
    SELECT
        ST_Point(CAST(RAND(42) * 360 - 180 AS DECIMAL(10,6)),
                 CAST(RAND(43) * 180 - 90 AS DECIMAL(10,6))) as geometry,
        CAST(RAND(44) * 100 AS INT) as value,
        'point_' || CAST(id AS STRING) as name
    FROM range(100)
""")

sample_points.show(5)

## Spatial Operations

In [None]:
# Build the filter region: a single-row frame holding a polygon created from
# the envelope (-10, -10) .. (10, 10).
bounding_box = spark.sql("""
    SELECT ST_PolygonFromEnvelope(-10.0, -10.0, 10.0, 10.0) as bbox
""")

# Expose both frames to Spark SQL under the view names the query below uses.
for view_name, frame in (("points", sample_points), ("bbox", bounding_box)):
    frame.createOrReplaceTempView(view_name)

# Keep only the points lying within the polygon, and unpack their coordinates
# into plain longitude / latitude columns for the plotting cells.
filtered_points = spark.sql("""
    SELECT p.*, ST_X(p.geometry) as longitude, ST_Y(p.geometry) as latitude
    FROM points p, bbox b
    WHERE ST_Within(p.geometry, b.bbox)
""")

# Report how many points survived the filter, then show them.
print(f"Total points: {sample_points.count()}")
print(f"Points within bounding box: {filtered_points.count()}")
filtered_points.show()

## Spatial Analysis

In [None]:
# Find the 10 closest pairs of points in the `points` view.
#
# The join condition uses `<` rather than `!=`: with `!=` every pair shows up
# twice, as (A, B) and as (B, A), so the "top 10" would only contain 5 distinct
# pairs. `<` keeps exactly one ordering of each pair and still drops self-pairs.
# Points are told apart by `name`, so two points sharing a name are never
# paired with each other (this was already the case with `!=`).
#
# NOTE(review): ST_Distance works on the raw coordinates, so for lon/lat input
# the result is presumably in degrees, not metres - confirm against the Sedona
# function reference before interpreting the numbers.
distance_analysis = spark.sql("""
    SELECT
        p1.name as point1,
        p2.name as point2,
        ST_Distance(p1.geometry, p2.geometry) as distance
    FROM points p1
    CROSS JOIN points p2
    WHERE p1.name < p2.name
    ORDER BY distance
    LIMIT 10
""")

print("Closest point pairs:")
distance_analysis.show()

## Visualization

In [None]:
# Pull the filtered points onto the driver as a pandas DataFrame; the plotting
# cells read it under the name `points_pdf`.
points_pdf = filtered_points.toPandas()

# Scatter the points by position, coloured by their `value` column.
fig, ax = plt.subplots(figsize=(10, 6))
markers = ax.scatter(
    points_pdf['longitude'],
    points_pdf['latitude'],
    c=points_pdf['value'],
    cmap='viridis',
    alpha=0.7,
)
fig.colorbar(markers, ax=ax, label='Value')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title('Spatial Points Distribution')
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Create an interactive map with Folium.
#
# `m` starts as None and is only replaced by a map when there are points to
# draw. The cell ends with a bare top-level `m`: Jupyter renders only the last
# top-level expression of a cell, so an `m` placed inside the `if` body would
# never be displayed. When `m` is None nothing is rendered.
m = None

if len(points_pdf) > 0:
    # Centre the map on the mean position of the points being shown.
    center_lat = points_pdf['latitude'].mean()
    center_lon = points_pdf['longitude'].mean()

    m = folium.Map(location=[center_lat, center_lon], zoom_start=6)

    # One red circle marker per point; folium expects [lat, lon] order.
    for _, row in points_pdf.iterrows():
        folium.CircleMarker(
            location=[row['latitude'], row['longitude']],
            radius=5,
            popup=f"Name: {row['name']}<br>Value: {row['value']}",
            color='red',
            fill=True,
            fillColor='red'
        ).add_to(m)

    # Outline the bounding box as [[south, west], [north, east]].
    folium.Rectangle(
        bounds=[[-10, -10], [10, 10]],
        color='blue',
        fill=False,
        popup='Bounding Box'
    ).add_to(m)
else:
    print("No points found within the bounding box to display.")

m

## Cleanup

In [None]:
# Stop the Spark session created in the setup cell. Cells that use `spark`
# cannot be re-run after this without re-running the setup cell first.
spark.stop()