In [1]:
import os

import geopandas as gpd
from pyspark.sql import SparkSession

from sedona.spark import SedonaContext
from sedona.maps.SedonaKepler import SedonaKepler
from shapely.geometry import Polygon
from shapely.wkt import dumps

In [2]:
additional_packages = [
    'org.apache.hadoop:hadoop-aws:3.3.4',
    'org.apache.hadoop:hadoop-client-api:3.3.4',
    'org.apache.hadoop:hadoop-common:3.3.4',
    'org.apache.sedona:sedona-spark-3.5_2.12:1.7.0',
    'org.datasyslab:geotools-wrapper:1.7.0-28.5',
    'uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.4'
]

config = (
    SedonaContext.builder()
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \
    .config("spark.hadoop.fs.s3a.access.key", "sedona") \
    .config("spark.hadoop.fs.s3a.secret.key", "sedona_password") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://localhost:9000") \
    .config(
      "spark.hadoop.fs.s3a.aws.credentials.provider",
      "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")\
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")\
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.jars.packages", ",".join(additional_packages))\
    .config("spark.driver.memory", "6G")
    .getOrCreate()
)

sedona = SedonaContext.create(config)

24/12/18 19:43:28 WARN Utils: Your hostname, Pawels-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.0.115 instead (on interface en0)
24/12/18 19:43:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/Users/pawelkocinski/Desktop/projects/apache-sedona-book/.venv/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/pawelkocinski/.ivy2/cache
The jars for the packages stored in: /Users/pawelkocinski/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
org.apache.hadoop#hadoop-client-api added as a dependency
org.apache.hadoop#hadoop-common added as a dependency
org.apache.sedona#sedona-spark-3.5_2.12 added as a dependency
org.datasyslab#geotools-wrapper added as a dependency
uk.co.gresearch.spark#spark-extension_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-dafdfd1b-d9ad-4feb-9c81-d3fcaf2edc4e;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found org.xerial.snappy#snappy-java;1.1.8.2 in local-m2-cache
	found org.apache.hadoop#hadoop-common;3.3.4 in central
	found org.apache.hadoop.thirdparty#hadoop-s

In [3]:
nyx_taxi = sedona\
    .read\
    .format("parquet")\
    .load("s3a://sedona/analysis/nyc_yellow")

24/12/18 19:43:50 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


In [4]:
def visualize_map(df, name):
    orig_html = str(SedonaKepler.create_map(df, name)._repr_html_(),'utf-8')
    
    better_html = orig_html
    import base64
    b64 = base64.b64encode(better_html.encode('utf-8'))
    src = f"data:text/html;base64,{b64.decode('utf-8')}"
    base64d_html = f'<iframe src="{src}" style="width:95%; height: 600px">'
    return base64d_html


In [5]:
# analysis

In [6]:
explore = nyx_taxi\
    .where("pickup_longitude IS NOT NULL AND pickup_latitude IS NOT NULL")\
    .where("dropoff_longitude IS NOT NULL AND dropoff_latitude IS NOT NULL")\
    .selectExpr(
        "ST_POINT(CAST(pickup_longitude AS DOUBLE), CAST(pickup_latitude AS DOUBLE)) AS geom"
    ).sample(0.00001)

In [7]:
explore.cache().count()

                                                                                

163

In [8]:
import IPython
IPython.display.HTML(visualize_map(explore, "map"))

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


In [9]:
nyx_taxi_with_coordinates = nyx_taxi\
    .where("pickup_longitude IS NOT NULL AND pickup_latitude IS NOT NULL")\
    .where("dropoff_longitude IS NOT NULL AND dropoff_latitude IS NOT NULL")\
    .selectExpr(
        "CAST(total_amount AS DECIMAL(24, 10)) AS total_amount",
        "ST_POINT(CAST(pickup_longitude AS DOUBLE), CAST(pickup_latitude AS DOUBLE)) AS pickup_geom",
        "ST_POINT(CAST(dropoff_longitude AS DOUBLE), CAST(dropoff_latitude AS DOUBLE)) AS dropoff_geom"
    ).createOrReplaceTempView("taxi")

In [10]:
polygon = {"type":"Polygon","coordinates":[[[-74.11902177180524,40.83533792558916],[-74.22186537559533,40.649655372448066],[-73.75749417169013,40.60368516322427],[-73.73730411934669,40.69323888136599],[-73.8988245380961,40.90133870775822],[-74.12557435672555,40.914249930541786],[-74.11780895197751,40.83556976599659],[-74.11902177180524,40.83533792558916]]]}

In [11]:
polygon_wkt = dumps(Polygon(polygon["coordinates"][0]))

In [12]:
sedona.sql(
f"""
    SELECT * 
    FROM taxi
    WHERE ST_Within(dropoff_geom, ST_GeomFromText('{polygon_wkt}')) 
    AND ST_Within(pickup_geom, ST_GeomFromText('{polygon_wkt}'))
    AND total_amount > 0
"""
).createOrReplaceTempView("taxi_cleaned")

In [13]:
most_popular_pickup = sedona.sql(
    """
    WITH h3_index AS (
        SELECT 
            ST_H3CellIDs(pickup_geom, 8, true)[0] AS h3_id 
        FROM taxi_cleaned
    )
    SELECT 
        h3_id,
        count(h3_id) AS cnt,
        ST_H3ToGeom(array(h3_id))[0] AS geom
    FROM h3_index
    GROUP BY h3_id
    ORDER BY count(h3_id) DESC
    """
)

In [14]:
most_popular_pickup.cache()

DataFrame[h3_id: bigint, cnt: bigint, geom: udt]

In [15]:
most_popular_pickup.count()

                                                                                

1040

In [16]:
map_data = visualize_map(most_popular_pickup, "most-popular-pickup")
IPython.display.HTML(map_data)

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


In [17]:
most_popular_routes = sedona.sql("""
    WITH h3_indexes AS (
        SELECT 
            ST_H3CellIDs(pickup_geom, 8, true)[0] AS h3_p_id,
            ST_H3CellIDs(dropoff_geom, 8, true)[0] AS h3_d_id
        FROM taxi_cleaned
    ),
    indexed AS (
        SELECT 
            CASE 
                WHEN h3_p_id > h3_d_id 
                THEN CONCAT(h3_d_id, ' ', h3_p_id) 
                ELSE CONCAT(h3_p_id, ' ', h3_d_id) 
            END AS id,
            h3_p_id,
            h3_d_id
        FROM h3_indexes
    ),
    groupped AS (
        SELECT 
            id,
            count(id) AS cnt,
            first(h3_p_id) AS h3_p_id,
            first(h3_d_id) AS h3_d_id
        FROM indexed
        GROUP BY id
    )
    SELECT 
        h3_p_id,
        h3_d_id,
        ST_MakeLine(
            ST_Centroid(ST_H3ToGeom(array(h3_p_id))[0]),
            ST_Centroid(ST_H3ToGeom(array(h3_d_id))[0])
        ) AS geom,
        cnt
    FROM groupped
    ORDER BY cnt DESC
    LIMIT 20

""")

In [18]:
most_popular_routes.cache()

DataFrame[h3_p_id: bigint, h3_d_id: bigint, geom: udt, cnt: bigint]

In [19]:
most_popular_routes.count()

                                                                                

20

In [20]:
map_data = visualize_map(most_popular_routes, "Most popular routes")
IPython.display.HTML(map_data)

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


In [21]:
most_popular_routes.select("h3_p_id", "h3_d_id", "cnt").show()

+------------------+------------------+------+
|           h3_p_id|           h3_d_id|   cnt|
+------------------+------------------+------+
|613229524246593535|613229524185776127|127870|
|613229524244496383|613229524185776127|117873|
|613229524240302079|613229524246593535| 94500|
|613229524240302079|613229524185776127| 89051|
|613229524185776127|613229524173193215| 87539|
|613229524244496383|613229551345991679| 82697|
|613229524173193215|613229524181581823| 79790|
|613229524181581823|613229524185776127| 78218|
|613229524244496383|613229524246593535| 77396|
|613229524240302079|613229524181581823| 76718|
|613229524242399231|613229524185776127| 74493|
|613229522950553599|613229524248690687| 72973|
|613229524248690687|613229524240302079| 72923|
|613229551345991679|613229524185776127| 72650|
|613229522948456447|613229524248690687| 69012|
|613229524240302079|613229524244496383| 69002|
|613229524240302079|613229524242399231| 68983|
|613229522948456447|613229522950553599| 68348|
|613229524240