In [1]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "4",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1"
    }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
663,application_1761923966900_0675,pyspark,idle,Link,Link,,
665,application_1761923966900_0677,pyspark,idle,Link,Link,,


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType
from pyspark.sql.functions import col, to_date, year, count, desc, rank, sum as _sum, round, split, explode, length, trim
import time

spark = SparkSession \
    .builder \
    .appName("Query 3 execution") \
    .getOrCreate()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
666,application_1761923966900_0678,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# Preparing the data
Crime_data_schema = StructType([
    StructField("DR_NO", IntegerType()),
    StructField("Date Rptd", StringType()),
    StructField("DATE OCC", StringType()),
    StructField("TIME OCC", IntegerType()),
    StructField("AREA", IntegerType()),
    StructField("AREA NAME", StringType()),
    StructField("Rpt Dist No", IntegerType()),
    StructField("Part 1-2", IntegerType()),
    StructField("Crm Cd", IntegerType()),
    StructField("Crm Cd Desc", StringType()),
    StructField("Mocodes", StringType()),
    StructField("Vict Age", IntegerType()),
    StructField("Vict Sex", StringType()),
    StructField("Vict Descent", StringType()),
    StructField("Premis Cd", IntegerType()),
    StructField("Premis Desc", StringType()),
    StructField("Weapon Used Cd", IntegerType()),
    StructField("Weapon Desc", StringType()),
    StructField("Status", StringType()),
    StructField("Status Desc", StringType()),
    StructField("Crm Cd 1", IntegerType()),
    StructField("Crm Cd 2", IntegerType()),
    StructField("Crm Cd 3", IntegerType()),
    StructField("Crm Cd 4", IntegerType()),
    StructField("LOCATION", StringType()),
    StructField("Cross Street", StringType()),
    StructField("LAT", FloatType()),
    StructField("LON", FloatType()),
])

Recent_crime_data_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv", \
                                      header = True, \
                                      schema = Crime_data_schema)
Older_crime_data_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv", \
                                     header = True, \
                                     schema = Crime_data_schema)
Crime_df = Recent_crime_data_df.union(Older_crime_data_df)
raw_mo = spark.read.text("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/MO_codes.txt")
# Here we make a list of two elements: the MO code as the first element and
# the MO description as the second
MO_codes_df = raw_mo.select(
    split(col("value"), " ", 2)[0].alias("MO_code"),
    split(col("value"), " ", 2)[1].alias("MO_desc")
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Implementation 1: DataFrame API
# Operations and Visualization
# Here we take the Mocodes column and break it into tuple arrays
# for each MO code we find. Then we explode the tuples to have a column
# that has one MO code in every row.
crime_mocodes_exploded = Crime_df.filter(col("Mocodes").isNotNull()) \
    .select(explode(split(col("Mocodes"), " ")).alias("MO_code")) \
    .filter(length(col("MO_code")) > 0)
# Now we count
crime_counts = crime_mocodes_exploded.groupBy("MO_code").count()
join_strategies = ["BROADCAST", "MERGE", "SHUFFLE_HASH", "SHUFFLE_REPLICATE_NL"]
for strategy in join_strategies:
    spark.catalog.clearCache()
    print(f"\n--- Testing Strategy: {strategy} ---")
    # Apply hint() to the MO_codes_df dataframe
    joined_df = crime_counts.join(
        MO_codes_df.hint(strategy),
        crime_counts["MO_code"] == MO_codes_df["MO_code"]
    )
    joined_df.explain()
    # Time Benchmarking
    start_time = time.time()
    count_result = joined_df.count()
    end_time = time.time()
    print(f"Time taken for {strategy}: {end_time - start_time:.4f} seconds")
final_result_df = crime_counts \
    .join(MO_codes_df.hint("BROADCAST"), "MO_code") \
    .select("MO_code", "MO_desc", "count") \
    .orderBy(col("count").desc())
final_result_df.show(20, truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…


--- Testing Strategy: BROADCAST ---
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- BroadcastHashJoin [MO_code#159], [MO_code#153], Inner, BuildRight, false
   :- HashAggregate(keys=[MO_code#159], functions=[count(1)], schema specialized)
   :  +- Exchange hashpartitioning(MO_code#159, 1000), ENSURE_REQUIREMENTS, [plan_id=57]
   :     +- HashAggregate(keys=[MO_code#159], functions=[partial_count(1)], schema specialized)
   :        +- Filter (length(MO_code#159) > 0)
   :           +- Generate explode(split(Mocodes#10,  , -1)), false, [MO_code#159]
   :              +- Union
   :                 :- Filter isnotnull(Mocodes#10)
   :                 :  +- FileScan csv [Mocodes#10] Batched: false, DataFilters: [isnotnull(Mocodes#10)], Format: CSV, Location: InMemoryFileIndex(1 paths)[s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_C..., PartitionFilters: [], PushedFilters: [IsNotNull(Mocodes)], ReadSchema: struct<Mocodes:string>
   :                 +- Fil

In [7]:
# Implementation 2: RDD API
spark.catalog.clearCache()
# We take only the Mocodes column from Crime_df
crime_rdd = Crime_df.select("Mocodes").rdd.filter(lambda x: x[0] is not None)
# We take both the code and the description from the MO_codes_df
mo_rdd = MO_codes_df.rdd.map(lambda x: (x[0], x[1]))

# Map & Reduce
# flatMap: We break the string "0416 0334" in a list [0416, 0334]
# and then many records
counts_rdd = crime_rdd \
    .flatMap(lambda row: row[0].split(" ")) \
    .filter(lambda code: len(code) > 0) \
    .map(lambda code: (code, 1)) \
    .reduceByKey(lambda a, b: a + b)

# Join with RDDs: Requires (Key, Value), where the Key
# in both our RDDs is the MO code
joined_rdd = counts_rdd.join(mo_rdd)
# Sorting
# The result of the join is (Code, (Count, Description))
# and we want sorting based on Count (1st element of value tuple)
final_rdd = joined_rdd.sortBy(lambda x: x[1][0], ascending=False)

# Time Benchmarking
start_time = time.time()
total_rows = final_rdd.count()
end_time = time.time()
print(f"RDD Execution Time: {end_time - start_time:.4f} seconds")

# Visualization
print("\n--- RDD Results ---")
for row in final_rdd.take(20):
    print(f"Code: {row[0]}, Count: {row[1][0]}, Desc: {row[1][1]}")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

RDD Execution Time: 38.0870 seconds

--- RDD Results ---
Code: 0344, Count: 1002900, Desc: Removes vict property
Code: 1822, Count: 548422, Desc: Stranger
Code: 0416, Count: 404773, Desc: Hit-Hit w/ weapon
Code: 0329, Count: 377536, Desc: Vandalized
Code: 0913, Count: 278618, Desc: Victim knew Suspect
Code: 2000, Count: 256188, Desc: Domestic violence
Code: 1300, Count: 219082, Desc: Vehicle involved
Code: 0400, Count: 213165, Desc: Force used
Code: 1402, Count: 177470, Desc: Evidence Booked (any crime)
Code: 1609, Count: 131229, Desc: Smashed
Code: 1309, Count: 122108, Desc: Susp uses vehicle
Code: 1202, Count: 120238, Desc: Victim was aged (60 & over) or blind/physically disabled/unable to care for self
Code: 0325, Count: 120159, Desc: Took merchandise
Code: 1814, Count: 118073, Desc: Susp is/was current/former boyfriend/girlfriend
Code: 0444, Count: 116763, Desc: Pushed
Code: 1501, Count: 115589, Desc: Other MO (see rpt)
Code: 1307, Count: 113609, Desc: Breaks window
Code: 0334, Cou