In [1]:
import os

import pyspark.sql.functions as f
from pyspark.sql import SparkSession
from sedona.spark import SedonaContext

In [2]:
# Maven coordinates handed to Spark through `spark.jars.packages`.
additional_packages = [
    'org.apache.hadoop:hadoop-aws:3.3.4',
    'org.apache.hadoop:hadoop-client-api:3.3.4',
    'org.apache.hadoop:hadoop-common:3.3.4',
    'org.apache.sedona:sedona-spark-3.5_2.12:1.7.0',
    'org.datasyslab:geotools-wrapper:1.7.0-28.5',
    # NOTE(review): the "-3.4" suffix looks like a Spark 3.4 build while the
    # Sedona artifact above targets Spark 3.5 -- confirm the intended version.
    'uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.4'
]

# Build the session used by every cell below.
# The S3A credentials and endpoint are read from the environment so that real
# secrets never have to be committed with the notebook; the fallbacks are the
# values this notebook originally hard-coded (presumably a local S3-compatible
# service on port 9000 -- confirm).
# `config` keeps its original name for compatibility, although it holds the
# object returned by `getOrCreate()` rather than a plain configuration.
config = (
    SedonaContext.builder()
    .config("spark.driver.memory", "5G")
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
    )
    .config(
        "spark.hadoop.fs.s3a.access.key",
        os.environ.get("SEDONA_S3_ACCESS_KEY", "sedona"),
    )
    .config(
        "spark.hadoop.fs.s3a.secret.key",
        os.environ.get("SEDONA_S3_SECRET_KEY", "sedona_password"),
    )
    .config(
        "spark.hadoop.fs.s3a.endpoint",
        os.environ.get("SEDONA_S3_ENDPOINT", "http://localhost:9000"),
    )
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.jars.packages", ",".join(additional_packages))
    .getOrCreate()
)

# Wrap the session with SedonaContext.create; the result is what the
# remaining cells use as `sedona`.
sedona = SedonaContext.create(config)

24/12/18 19:27:46 WARN Utils: Your hostname, Pawels-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.0.115 instead (on interface en0)
24/12/18 19:27:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/Users/pawelkocinski/Desktop/projects/apache-sedona-book/.venv/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/pawelkocinski/.ivy2/cache
The jars for the packages stored in: /Users/pawelkocinski/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
org.apache.hadoop#hadoop-client-api added as a dependency
org.apache.hadoop#hadoop-common added as a dependency
org.apache.sedona#sedona-spark-3.5_2.12 added as a dependency
org.datasyslab#geotools-wrapper added as a dependency
uk.co.gresearch.spark#spark-extension_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-14865d4a-2a1a-453f-8c16-5cfe8b2eda56;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found org.xerial.snappy#snappy-java;1.1.8.2 in local-m2-cache
	found org.apache.hadoop#hadoop-common;3.3.4 in central
	found org.apache.hadoop.thirdparty#hadoop-s

# CSV With WKT

In [3]:
# Load a CSV file with a header row and convert the text in its `wkt` column
# to a geometry via ST_GeomFromText (the column keeps the name `wkt`).
(
    sedona.read
    .format("csv")
    .option("header", "true")
    .option("quote", '"')
    .load("s3a://sedona/vector/csv_wkt/wkt_file.csv")
    .withColumn("wkt", f.expr("ST_GeomFromText(wkt)"))
    .show()
)


24/12/18 19:28:08 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
[Stage 4:>                                                          (0 + 1) / 1]

+-------+--------------------+
| osm_id|                 wkt|
+-------+--------------------+
|4326379|LINESTRING (21.01...|
|4326702|LINESTRING (20.98...|
|4308966|LINESTRING (20.99...|
|4311409|LINESTRING (21.00...|
|4315349|LINESTRING (21.00...|
|4317242|LINESTRING (20.98...|
+-------+--------------------+



                                                                                

# Shapefile

In [4]:
# Read the shapefile directory and preview the id and geometry columns.
(
    sedona.read
    .format("shapefile")
    .load("s3a://sedona/vector/shapefile")
    .select("osm_id", "geometry")
    .show(5)
)

[Stage 6:>                                                          (0 + 1) / 1]

+-------+--------------------+
| osm_id|            geometry|
+-------+--------------------+
|4307220|LINESTRING (21.01...|
|4307329|LINESTRING (20.99...|
|4307330|LINESTRING (20.99...|
|4308966|LINESTRING (20.99...|
|4308968|LINESTRING (21.01...|
+-------+--------------------+
only showing top 5 rows



                                                                                

# GeoJSON

In [5]:
# Read a single multi-line GeoJSON document as-is; the output below shows one
# row for the whole FeatureCollection, with the features held in an array.
(
    sedona.read
    .format("geojson")
    .option("multiLine", "true")
    .load("s3a://sedona/vector/geojson/masovia_roads.geojson")
    .show(5)
)

                                                                                

+--------------------+--------------------+-------------+-----------------+
|                 crs|            features|         name|             type|
+--------------------+--------------------+-------------+-----------------+
|{{urn:ogc:def:crs...|[{MULTILINESTRING...|masovia_roads|FeatureCollection|
+--------------------+--------------------+-------------+-----------------+



In [6]:
# Same file as above, but explode the `features` array into one row per
# feature and pull out the geometry and the `osm_id` property.
(
    sedona.read
    .format("geojson")
    .option("multiLine", "true")
    .load("s3a://sedona/vector/geojson/masovia_roads.geojson")
    .selectExpr("explode(features) as features")
    .select("features.geometry", "features.properties.osm_id")
    .show(5)
)


[Stage 10:>                                                         (0 + 1) / 1]

+--------------------+-------+
|            geometry| osm_id|
+--------------------+-------+
|MULTILINESTRING (...|4307220|
|MULTILINESTRING (...|4307329|
|MULTILINESTRING (...|4307330|
|MULTILINESTRING (...|4308966|
|MULTILINESTRING (...|4308968|
+--------------------+-------+
only showing top 5 rows



                                                                                

# Distributed GeoJSON

In [7]:
# Read GeoJSON from a directory path without the multiLine option; here
# `geometry` and `properties` are selected directly, with no explode step.
(
    sedona.read
    .format("geojson")
    .load("s3a://sedona/vector/geojson_distributes")
    .selectExpr("geometry", "properties.osm_id")
    .show(5)
)

                                                                                

+--------------------+-------+
|            geometry| osm_id|
+--------------------+-------+
|MULTILINESTRING (...|4311413|
|MULTILINESTRING (...|4315349|
|MULTILINESTRING (...|4307220|
|MULTILINESTRING (...|4307329|
|MULTILINESTRING (...|4307330|
+--------------------+-------+
only showing top 5 rows



# GeoPackage

In [8]:
# With showMetadata enabled the reader returns the GeoPackage table listing;
# show each table's name, data type and SRS id.
(
    sedona.read
    .format("geopackage")
    .option("showMetadata", "true")
    .load("s3a://sedona/vector/geopackage")
    .select("table_name", "data_type", "srs_id")
    .show()
)

[Stage 13:>                                                         (0 + 1) / 1]

+--------------------+---------+------+
|          table_name|data_type|srs_id|
+--------------------+---------+------+
|gis_osm_roads_free_1| features|  4326|
+--------------------+---------+------+



                                                                                

In [9]:
# Read one named table from the GeoPackage and preview its id and geometry.
(
    sedona.read
    .format("geopackage")
    .option("tableName", "gis_osm_roads_free_1")
    .load("s3a://sedona/vector/geopackage")
    .select("osm_id", "geom")
    .show(5)
)


+-------+--------------------+
| osm_id|                geom|
+-------+--------------------+
|4307220|MULTILINESTRING (...|
|4307329|MULTILINESTRING (...|
|4307330|MULTILINESTRING (...|
|4308966|MULTILINESTRING (...|
|4308968|MULTILINESTRING (...|
+-------+--------------------+
only showing top 5 rows



# GeoParquet

In [10]:
# Read the GeoParquet dataset and preview the id and geometry columns.
(
    sedona.read
    .format("geoparquet")
    .load("s3a://sedona/vector/geoparquet")
    .select("osm_id", "geometry")
    .show(5)
)




+-------+--------------------+
| osm_id|            geometry|
+-------+--------------------+
|4307220|LINESTRING (21.01...|
|4307329|LINESTRING (20.99...|
|4307330|LINESTRING (20.99...|
|4308966|LINESTRING (20.99...|
|4308968|LINESTRING (21.01...|
+-------+--------------------+
only showing top 5 rows



                                                                                

In [None]:
# Rewrite the places dataset ordered by a precision-5 geohash of its geometry.
# The output path is overwritten on every run; the query cell further down
# reads from this path, so this cell has to be run first.
(
    sedona.read
    .format("geoparquet")
    .load("s3a://sedona/vector/overture_places")
    .withColumn("geohash", f.expr("ST_GeoHash(geometry, 5)"))
    .orderBy("geohash")
    .write
    .mode("overwrite")
    .format("geoparquet")
    .save("s3a://sedona/vector/overture_places_sorted")
)



In [17]:
# Count places per primary brand name inside an envelope, but only print the
# physical plan: explain() is used here to inspect how the ST_Within filter
# is applied to the GeoParquet scan.
(
    sedona.read
    .format("geoparquet")
    .load("s3a://sedona/vector/overture_places_sorted")
    .where("ST_Within(geometry, ST_PolygonFromEnvelope(-123.425407, 35.749839, -118.763583, 41.806623))")
    .selectExpr("brand.names.primary as name")
    .groupBy("name")
    .count()
    .explain()
)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[name#2295], functions=[count(1)])
   +- Exchange hashpartitioning(name#2295, 200), ENSURE_REQUIREMENTS, [plan_id=285]
      +- HashAggregate(keys=[name#2295], functions=[partial_count(1)])
         +- Project [brand#2276.names.primary AS name#2295]
            +- Filter (isnotnull(geometry#2265) AND  **org.apache.spark.sql.sedona_sql.expressions.ST_Within**  )
               +- FileScan geoparquet [geometry#2265,brand#2276] Batched: false, DataFilters: [isnotnull(geometry#2265),  **org.apache.spark.sql.sedona_sql.expressions.ST_Within**  ], Format: GeoParquet with spatial filter [geometry INTERSECTS POLYGON ((-123.425407 35.749839, -123.425407 ..., Location: InMemoryFileIndex(1 paths)[s3a://sedona/vector/overture_places_sorted], PartitionFilters: [], PushedFilters: [IsNotNull(geometry)], ReadSchema: struct<geometry:binary,brand:struct<names:struct<primary:string>>>


