In [1]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "8",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1"
    }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3839,application_1732639283265_3779,pyspark,busy,Link,Link,,
3840,application_1732639283265_3780,pyspark,idle,Link,Link,,
3843,application_1732639283265_3783,pyspark,idle,Link,Link,,
3850,application_1732639283265_3790,pyspark,idle,Link,Link,,
3855,application_1732639283265_3795,pyspark,busy,Link,Link,,
3861,application_1732639283265_3801,pyspark,idle,Link,Link,,
3869,application_1732639283265_3809,pyspark,busy,Link,Link,,


In [2]:
import time
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import col, year, count, when, desc, sum, to_timestamp, row_number, regexp_replace, expr, asc
from pyspark.sql.types import DecimalType
from pyspark.sql import functions as F
from sedona.spark import *

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3872,application_1732639283265_3812,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# Query 5: proximity of crimes to police stations.
# `getOrCreate()` hands back the notebook's already-running session when one
# exists; the Sedona context is then built on top of that session.
APP_NAME = "Police Station/Crimes Proximity"
spark = (
    SparkSession.builder
    .appName(APP_NAME)
    .getOrCreate()
)
sedona = SedonaContext.create(spark)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# --- Crime records -----------------------------------------------------------
# Two CSV exports (2010-2019 and 2020-present), each read with a header row
# and an inferred schema.
d1 = spark.read.options(header=True, inferSchema=True).csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
)
d2 = spark.read.options(header=True, inferSchema=True).csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"
)

# `union` stacks the frames by column position, so the result carries d1's
# column names. Rows sitting on "Null Island" (LAT == 0 and LON == 0) are
# dropped before a point geometry is built from (LON, LAT).
# NOTE(review): the 'AREA ' column name really does end with a space here --
# presumably that is how the 2010-2019 header spells it; confirm before "fixing".
crime_data = (
    d1.union(d2)
    .filter(~((F.col("LAT") == 0) & (F.col("LON") == 0)))
    .withColumn("crime_geometry", ST_Point(F.col("LON"), F.col("LAT")))
    .select("crime_geometry", "AREA ", "DR_NO")
)

# --- Police stations ---------------------------------------------------------
# X / Y are renamed to PS_LON / PS_LAT, turned into a point geometry, and only
# the geometry plus the division name are kept.
police_stations = (
    spark.read.options(header=True, inferSchema=True)
    .csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv")
    .withColumnRenamed("X", "PS_LON")
    .withColumnRenamed("Y", "PS_LAT")
    .withColumn("ps_geometry", ST_Point(F.col("PS_LON"), F.col("PS_LAT")))
    .select("ps_geometry", "DIVISION")
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
def query5(crime_data, police_stations):
    """Print, per police division, the number of crimes nearest to it and their mean distance.

    Every crime is paired with every police station, the closest station is
    kept for each crime, and the crimes are then grouped by that station's
    division. The table is printed in full, ordered by crime count descending.

    Args:
        crime_data: DataFrame with a ``crime_geometry`` point column and a
            ``DR_NO`` column identifying the crime record.
        police_stations: DataFrame with a ``ps_geometry`` point column and a
            ``DIVISION`` column.

    Returns:
        None. The result is written to stdout via ``DataFrame.show``.
    """
    # Pair each crime with each station and compute the spherical distance,
    # divided by 1000 and stored as kilometres.
    joined_df = crime_data.crossJoin(police_stations).withColumn(
        "distance_km",
        ST_DistanceSphere(F.col("crime_geometry"), F.col("ps_geometry")) / 1000
    )

    # Rank the stations of each crime by distance; rank 1 is the nearest one.
    # NOTE(review): this assumes DR_NO identifies exactly one crime row across
    # both source files -- duplicated DR_NO values would be collapsed to a
    # single "nearest" row. TODO confirm.
    window_spec = Window.partitionBy("DR_NO").orderBy(asc("distance_km"))
    joined_df_with_rank = joined_df.withColumn("rank", F.row_number().over(window_spec))
    nearest_station_df = joined_df_with_rank.filter(F.col("rank") == 1).select(
        "DR_NO", "DIVISION", "distance_km"
    )

    result_df = nearest_station_df.groupBy("DIVISION").agg(
        F.count("*").alias("#"),
        F.mean("distance_km").alias("average_distance")
    ).orderBy(F.col("#").desc())

    # `count()` and `show()` are two separate Spark actions. Without caching,
    # each one re-executes the whole cross join + window pipeline. Caching the
    # small aggregated result lets `count()` materialise it once and `show()`
    # read it back.
    result_df = result_df.cache()
    try:
        result_df.show(n=result_df.count(), truncate=False)
    finally:
        # Release the cached blocks even if displaying the result fails.
        result_df.unpersist()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# Time one end-to-end run of Query 5. `perf_counter` is a monotonic,
# high-resolution clock, so the measured interval cannot be skewed by
# wall-clock adjustments the way a `time.time()` difference can.
start = time.perf_counter()
query5(crime_data, police_stations)
end = time.perf_counter()
print("Execution Time:", end - start)

# Report the executor settings the session is actually running with, so the
# timing above can be tied to a concrete resource configuration.
conf = spark.sparkContext.getConf()
print("Executor Instances:", conf.get("spark.executor.instances"))
print("Executor Memory:", conf.get("spark.executor.memory"))
print("Executor Cores:", conf.get("spark.executor.cores"))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+------+------------------+
|DIVISION        |#     |average_distance  |
+----------------+------+------------------+
|HOLLYWOOD       |224340|2.076263960178718 |
|VAN NUYS        |210134|2.9533697428197803|
|SOUTHWEST       |188901|2.191398805780884 |
|WILSHIRE        |185996|2.5926655329787645|
|77TH STREET     |171827|1.7165449719700954|
|OLYMPIC         |170897|1.7236036971780955|
|NORTH HOLLYWOOD |167854|2.6430060941415645|
|PACIFIC         |161359|3.850070655307917 |
|CENTRAL         |153871|0.9924764374568908|
|RAMPART         |152736|1.5345341879190122|
|SOUTHEAST       |152176|2.4218662158881803|
|WEST VALLEY     |138643|3.035671216314069 |
|TOPANGA         |138217|3.296954841755553 |
|FOOTHILL        |134896|4.250921708424982 |
|HARBOR          |126747|3.702561599356521 |
|HOLLENBECK      |115837|2.680181237706819 |
|WEST LOS ANGELES|115781|2.7924572890341226|
|NEWTON          |111110|1.6346357397097429|
|NORTHEAST       |108109|3.6236655246040868|
|MISSION  