In [1]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "2",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1"
    }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
254,application_1765289937462_0251,pyspark,idle,Link,Link,,
260,application_1765289937462_0257,pyspark,idle,Link,Link,,
274,application_1765289937462_0271,pyspark,idle,Link,Link,,
284,application_1765289937462_0281,pyspark,idle,Link,Link,,
294,application_1765289937462_0291,pyspark,idle,Link,Link,,
299,application_1765289937462_0296,pyspark,idle,Link,Link,,


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, to_date, count, sum as _sum, corr, desc, lit
from pyspark.sql.types import IntegerType, StringType, StructType, StructField, DoubleType, FloatType

# Sedona Imports
from sedona.register import SedonaRegistrator
from sedona.utils import SedonaKryoRegistrator, KryoSerializer

spark = SparkSession.builder \
    .appName("Query 4 execution") \
    .config("spark.serializer", KryoSerializer.getName) \
    .config("spark.kryo.registrator", SedonaKryoRegistrator.getName) \
    .config("spark.sql.extensions", "org.apache.spark.sql.sedona_sql.io.SedonaSqlWrapper") \
    .getOrCreate()
SedonaRegistrator.registerAll(spark)

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
301,application_1765289937462_0298,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

True

In [3]:
# Preparing the data
Crime_data_schema = StructType([
    StructField("DR_NO", IntegerType()),
    StructField("Date Rptd", StringType()),
    StructField("DATE OCC", StringType()),
    StructField("TIME OCC", IntegerType()),
    StructField("AREA", IntegerType()),
    StructField("AREA NAME", StringType()),
    StructField("Rpt Dist No", IntegerType()),
    StructField("Part 1-2", IntegerType()),
    StructField("Crm Cd", IntegerType()),
    StructField("Crm Cd Desc", StringType()),
    StructField("Mocodes", StringType()),
    StructField("Vict Age", IntegerType()),
    StructField("Vict Sex", StringType()),
    StructField("Vict Descent", StringType()),
    StructField("Premis Cd", IntegerType()),
    StructField("Premis Desc", StringType()),
    StructField("Weapon Used Cd", IntegerType()),
    StructField("Weapon Desc", StringType()),
    StructField("Status", StringType()),
    StructField("Status Desc", StringType()),
    StructField("Crm Cd 1", IntegerType()),
    StructField("Crm Cd 2", IntegerType()),
    StructField("Crm Cd 3", IntegerType()),
    StructField("Crm Cd 4", IntegerType()),
    StructField("LOCATION", StringType()),
    StructField("Cross Street", StringType()),
    StructField("LAT", FloatType()),
    StructField("LON", FloatType()),
])

Recent_crime_data_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv", \
                                      header = True, \
                                      schema = Crime_data_schema)
Older_crime_data_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv", \
                                     header = True, \
                                     schema = Crime_data_schema)
Crime_df = Recent_crime_data_df.union(Older_crime_data_df)

# Load stations
stations_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Police_Stations.csv",
                                 header=True,
                                 inferSchema=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Implementation using 1 core, 2 GB RAM

import time
from pyspark.sql import functions as F
from pyspark.sql.window import Window
spark.catalog.clearCache()
# Clean crime data (remove Null Island)

crimes_filtered = Crime_df.filter(
    (F.col("LAT").isNotNull()) &
    (F.col("LON").isNotNull()) &
    (F.col("LAT") != 0.0) &
    (F.col("LON") != 0.0)
)

# Create geometry column
crime_geom = crimes_filtered.withColumn(
    "crime_geom",
    F.expr("ST_Point(CAST(LON AS DOUBLE), CAST(LAT AS DOUBLE))")
)

# Prepare police stations

stations = stations_df.select(
    F.col("DIVISION").alias("division"),
    F.col("X").cast("double").alias("station_lon"),
    F.col("Y").cast("double").alias("station_lat")
)

stations_geom = stations.withColumn(
    "station_geom",
    F.expr("ST_Point(station_lon, station_lat)")
)

# Cross join + compute distances

joined = crime_geom.crossJoin(F.broadcast(stations_geom)) \
    .withColumn(
        "distance",
        F.expr("ST_Distance(crime_geom, station_geom)")
    )

joined.explain("formatted")

# Window → nearest police station per crime

w = Window.partitionBy("DR_NO").orderBy(F.col("distance").asc())

nearest_station = joined.withColumn(
    "rn",
    F.row_number().over(w)
).filter(F.col("rn") == 1)

# Aggregate results per police division

result = nearest_station.groupBy("division") \
    .agg(
        F.round(F.avg("distance"), 3).alias("average_distance"),
        F.count("*").alias("#")
    ) \
    .select("division", "average_distance", "#") \
    .orderBy(F.col("#").desc())

# Execute + Measure time

start = time.time()
result.show(21, truncate=False)
end = time.time()

print(f"\nExecution time: {end - start:.2f} seconds")

# Explain final execution plan

result.explain("formatted")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

== Physical Plan ==
AdaptiveSparkPlan (13)
+- Project (12)
   +- BroadcastNestedLoopJoin Cross BuildRight (11)
      :- Union (7)
      :  :- Project (3)
      :  :  +- Filter (2)
      :  :     +- Scan csv  (1)
      :  +- Project (6)
      :     +- Filter (5)
      :        +- Scan csv  (4)
      +- BroadcastExchange (10)
         +- Project (9)
            +- Scan csv  (8)


(1) Scan csv 
Output [28]: [DR_NO#24, Date Rptd#25, DATE OCC#26, TIME OCC#27, AREA#28, AREA NAME#29, Rpt Dist No#30, Part 1-2#31, Crm Cd#32, Crm Cd Desc#33, Mocodes#34, Vict Age#35, Vict Sex#36, Vict Descent#37, Premis Cd#38, Premis Desc#39, Weapon Used Cd#40, Weapon Desc#41, Status#42, Status Desc#43, Crm Cd 1#44, Crm Cd 2#45, Crm Cd 3#46, Crm Cd 4#47, LOCATION#48, Cross Street#49, LAT#50, LON#51]
Batched: false
Location: InMemoryFileIndex [s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv]
PushedFilters: [IsNotNull(LAT), IsNotNull(LON), Not(EqualTo(LAT,

In [4]:
# Implementation using 2 cores, 4 GB RAM

import time
from pyspark.sql import functions as F
from pyspark.sql.window import Window
spark.catalog.clearCache()
# Clean crime data (remove Null Island)

crimes_filtered = Crime_df.filter(
    (F.col("LAT").isNotNull()) &
    (F.col("LON").isNotNull()) &
    (F.col("LAT") != 0.0) &
    (F.col("LON") != 0.0)
)

# Create geometry column
crime_geom = crimes_filtered.withColumn(
    "crime_geom",
    F.expr("ST_Point(CAST(LON AS DOUBLE), CAST(LAT AS DOUBLE))")
)

# Prepare police stations

stations = stations_df.select(
    F.col("DIVISION").alias("division"),
    F.col("X").cast("double").alias("station_lon"),
    F.col("Y").cast("double").alias("station_lat")
)

stations_geom = stations.withColumn(
    "station_geom",
    F.expr("ST_Point(station_lon, station_lat)")
)

# Cross join + compute distances

joined = crime_geom.crossJoin(F.broadcast(stations_geom)) \
    .withColumn(
        "distance",
        F.expr("ST_Distance(crime_geom, station_geom)")
    )

joined.explain("formatted")

# Window → nearest police station per crime

w = Window.partitionBy("DR_NO").orderBy(F.col("distance").asc())

nearest_station = joined.withColumn(
    "rn",
    F.row_number().over(w)
).filter(F.col("rn") == 1)

# Aggregate results per police division

result = nearest_station.groupBy("division") \
    .agg(
        F.round(F.avg("distance"), 3).alias("average_distance"),
        F.count("*").alias("#")
    ) \
    .select("division", "average_distance", "#") \
    .orderBy(F.col("#").desc())

# Execute + Measure time

start = time.time()
result.show(21, truncate=False)
end = time.time()

print(f"\nExecution time: {end - start:.2f} seconds")

# Explain final execution plan

result.explain("formatted")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

== Physical Plan ==
AdaptiveSparkPlan (13)
+- Project (12)
   +- BroadcastNestedLoopJoin Cross BuildRight (11)
      :- Union (7)
      :  :- Project (3)
      :  :  +- Filter (2)
      :  :     +- Scan csv  (1)
      :  +- Project (6)
      :     +- Filter (5)
      :        +- Scan csv  (4)
      +- BroadcastExchange (10)
         +- Project (9)
            +- Scan csv  (8)


(1) Scan csv 
Output [28]: [DR_NO#24, Date Rptd#25, DATE OCC#26, TIME OCC#27, AREA#28, AREA NAME#29, Rpt Dist No#30, Part 1-2#31, Crm Cd#32, Crm Cd Desc#33, Mocodes#34, Vict Age#35, Vict Sex#36, Vict Descent#37, Premis Cd#38, Premis Desc#39, Weapon Used Cd#40, Weapon Desc#41, Status#42, Status Desc#43, Crm Cd 1#44, Crm Cd 2#45, Crm Cd 3#46, Crm Cd 4#47, LOCATION#48, Cross Street#49, LAT#50, LON#51]
Batched: false
Location: InMemoryFileIndex [s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv]
PushedFilters: [IsNotNull(LAT), IsNotNull(LON), Not(EqualTo(LAT,

In [4]:
# Implementation using 4 cores, 8 GB RAM

import time
from pyspark.sql import functions as F
from pyspark.sql.window import Window
spark.catalog.clearCache()
# Clean crime data (remove Null Island)

crimes_filtered = Crime_df.filter(
    (F.col("LAT").isNotNull()) &
    (F.col("LON").isNotNull()) &
    (F.col("LAT") != 0.0) &
    (F.col("LON") != 0.0)
)

# Create geometry column
crime_geom = crimes_filtered.withColumn(
    "crime_geom",
    F.expr("ST_Point(CAST(LON AS DOUBLE), CAST(LAT AS DOUBLE))")
)

# Prepare police stations

stations = stations_df.select(
    F.col("DIVISION").alias("division"),
    F.col("X").cast("double").alias("station_lon"),
    F.col("Y").cast("double").alias("station_lat")
)

stations_geom = stations.withColumn(
    "station_geom",
    F.expr("ST_Point(station_lon, station_lat)")
)

# Cross join + compute distances

joined = crime_geom.crossJoin(F.broadcast(stations_geom)) \
    .withColumn(
        "distance",
        F.expr("ST_Distance(crime_geom, station_geom)")
    )

joined.explain("formatted")

# Window → nearest police station per crime

w = Window.partitionBy("DR_NO").orderBy(F.col("distance").asc())

nearest_station = joined.withColumn(
    "rn",
    F.row_number().over(w)
).filter(F.col("rn") == 1)

# Aggregate results per police division

result = nearest_station.groupBy("division") \
    .agg(
        F.round(F.avg("distance"), 3).alias("average_distance"),
        F.count("*").alias("#")
    ) \
    .select("division", "average_distance", "#") \
    .orderBy(F.col("#").desc())

# Execute + Measure time

start = time.time()
result.show(21, truncate=False)
end = time.time()

print(f"\nExecution time: {end - start:.2f} seconds")

# Explain final execution plan

result.explain("formatted")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

== Physical Plan ==
AdaptiveSparkPlan (13)
+- Project (12)
   +- BroadcastNestedLoopJoin Cross BuildRight (11)
      :- Union (7)
      :  :- Project (3)
      :  :  +- Filter (2)
      :  :     +- Scan csv  (1)
      :  +- Project (6)
      :     +- Filter (5)
      :        +- Scan csv  (4)
      +- BroadcastExchange (10)
         +- Project (9)
            +- Scan csv  (8)


(1) Scan csv 
Output [28]: [DR_NO#24, Date Rptd#25, DATE OCC#26, TIME OCC#27, AREA#28, AREA NAME#29, Rpt Dist No#30, Part 1-2#31, Crm Cd#32, Crm Cd Desc#33, Mocodes#34, Vict Age#35, Vict Sex#36, Vict Descent#37, Premis Cd#38, Premis Desc#39, Weapon Used Cd#40, Weapon Desc#41, Status#42, Status Desc#43, Crm Cd 1#44, Crm Cd 2#45, Crm Cd 3#46, Crm Cd 4#47, LOCATION#48, Cross Street#49, LAT#50, LON#51]
Batched: false
Location: InMemoryFileIndex [s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv]
PushedFilters: [IsNotNull(LAT), IsNotNull(LON), Not(EqualTo(LAT,