In [1]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "4",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1"
    }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
501,application_1765289937462_0495,pyspark,idle,Link,Link,,
507,application_1765289937462_0501,pyspark,idle,Link,Link,,


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType
from pyspark.sql.functions import col, to_date, year, count, desc, rank, sum as _sum, round
import time

# Obtain the SparkSession for this notebook (an existing session is reused,
# otherwise a new one is built) under the application name below.
spark = (
    SparkSession.builder
    .appName("Query 1 execution")
    .getOrCreate()
)

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
509,application_1765289937462_0503,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
# Preparing the data
# Explicit schema for the LA crime CSV files, written as (column name, Spark type)
# pairs. The pairs are listed in the order the columns are declared in the schema.
Crime_data_schema = StructType([
    StructField(column_name, column_type())
    for column_name, column_type in [
        ("DR_NO", IntegerType),
        ("Date Rptd", StringType),
        ("DATE OCC", StringType),
        ("TIME OCC", IntegerType),
        ("AREA", IntegerType),
        ("AREA NAME", StringType),
        ("Rpt Dist No", IntegerType),
        ("Part 1-2", IntegerType),
        ("Crm Cd", IntegerType),
        ("Crm Cd Desc", StringType),
        ("Mocodes", StringType),
        ("Vict Age", IntegerType),
        ("Vict Sex", StringType),
        ("Vict Descent", StringType),
        ("Premis Cd", IntegerType),
        ("Premis Desc", StringType),
        ("Weapon Used Cd", IntegerType),
        ("Weapon Desc", StringType),
        ("Status", StringType),
        ("Status Desc", StringType),
        ("Crm Cd 1", IntegerType),
        ("Crm Cd 2", IntegerType),
        ("Crm Cd 3", IntegerType),
        ("Crm Cd 4", IntegerType),
        ("LOCATION", StringType),
        ("Cross Street", StringType),
        ("LAT", FloatType),
        ("LON", FloatType),
    ]
])

# Each file is read with its first line treated as a header and the schema
# above applied instead of schema inference.
Recent_crime_data_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv",
    header=True,
    schema=Crime_data_schema,
)
Older_crime_data_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv",
    header=True,
    schema=Crime_data_schema,
)

# Stack both periods into a single DataFrame (union matches columns by position;
# both inputs share the same schema).
Crime_df = Recent_crime_data_df.union(Older_crime_data_df)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Implementation 1: Dataframe API (no UDF)

from pyspark.sql import functions as F
import time

# Empty Spark's in-memory cache before building the query.
spark.catalog.clearCache()

# Filter only incidents that contain "aggravated assault"
assault = Crime_df.filter(
    F.lower(F.col("Crm Cd Desc")).contains("aggravated assault")
)

# Ensure the age column is integer
assault = assault.withColumn("Vict Age", F.col("Vict Age").cast("int"))

# Create the age groups.
# No .otherwise() is supplied, so a row whose age is null matches no branch
# and gets a null age_group (it is then counted as its own group below).
assault_with_group = assault.withColumn(
    "age_group",
    F.when(F.col("Vict Age") < 18, "Children")
     .when((F.col("Vict Age") >= 18) & (F.col("Vict Age") <= 24), "Young Adults")
     .when((F.col("Vict Age") >= 25) & (F.col("Vict Age") <= 64), "Adults")
     .when(F.col("Vict Age") > 64, "Elderly")
)

# Count incidents per age group and sort in descending order
query_1 = (
    assault_with_group
    .groupBy("age_group")
    .count()
    .orderBy(F.desc("count"))
)

# Measure execution time.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
result = query_1.collect()   # the action that actually runs the query
end_time = time.perf_counter()

print(f"\nExecution Time: {end_time - start_time:.4f} seconds\n")

# Display the results from the rows that were already collected.
# Calling query_1.show() here would execute the whole query a second time.
spark.createDataFrame(result, schema=query_1.schema).show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…


Execution Time: 11.0353 seconds

+------------+------+
|   age_group| count|
+------------+------+
|      Adults|121660|
|Young Adults| 33758|
|    Children| 16014|
|     Elderly|  6011|
+------------+------+

In [4]:
# Implementation 2: Dataframe API (UDF)

import time

from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Empty Spark's in-memory cache before the UDF-based query is built.
spark.catalog.clearCache()

def age_group_func(age):
    """Map a victim age to its age-group label.

    Args:
        age: The age as an int, or any value that ``int()`` can convert.

    Returns:
        ``"Children"`` (< 18), ``"Young Adults"`` (18-24), ``"Adults"`` (25-64)
        or ``"Elderly"`` (> 64). ``None`` when ``age`` is ``None`` or cannot be
        converted to an integer.
    """
    if age is None:
        return None
    try:
        age = int(age)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are treated as "missing age"; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate instead of being
        # silently swallowed.
        return None

    # NOTE(review): zero and negative ages also fall into "Children" --
    # confirm that is intended for this dataset.
    if age < 18:
        return "Children"
    if age <= 24:
        return "Young Adults"
    if age <= 64:
        return "Adults"
    return "Elderly"

# Wrap the Python function as a Spark UDF that returns a string column.
age_group = udf(age_group_func, StringType())

# Filter aggravated assault incidents
assault = Crime_df.filter(
    F.lower(F.col("Crm Cd Desc")).contains("aggravated assault")
)

# Ensure the age column is integer before it is handed to the UDF
assault = assault.withColumn("Vict Age", F.col("Vict Age").cast("int"))

# Apply UDF to create age_group column
assault_group = assault.withColumn(
    "age_group",
    age_group(F.col("Vict Age"))
)

# Count incidents per age group and sort in descending order
query_1 = (
    assault_group
    .groupBy("age_group")
    .count()
    .orderBy(F.desc("count"))
)

# Measure execution time.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

result = query_1.collect()   # the action that actually runs the query

end_time = time.perf_counter()

print(f"\nExecution Time: {end_time - start_time:.4f} seconds\n")

# Display result from the rows that were already collected.
# Calling query_1.show() here would execute the whole query a second time.
spark.createDataFrame(result, schema=query_1.schema).show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…


Execution Time: 15.1313 seconds

+------------+------+
|   age_group| count|
+------------+------+
|      Adults|121660|
|Young Adults| 33758|
|    Children| 16014|
|     Elderly|  6011|
+------------+------+

In [4]:
# Implementation 3: RDD API

import time

# Empty Spark's in-memory cache before running the RDD variant.
spark.catalog.clearCache()

# Keep only the two columns the query needs and expose them as an RDD of Rows.
crime_rdd = (
    Crime_df
    .select("Crm Cd Desc", "Vict Age")
    .rdd
)

def map_to_age_group(row):
    """Return the age-group label for an aggravated-assault record.

    Args:
        row: A mapping-like record that supports ``row["Crm Cd Desc"]`` and
            ``row["Vict Age"]``.

    Returns:
        ``"Children"`` (< 18), ``"Young Adults"`` (18-24), ``"Adults"`` (25-64)
        or ``"Elderly"`` (> 64). ``None`` when the description is missing or
        does not contain "aggravated assault" (case-insensitive), or when the
        age is missing or cannot be converted to an integer.
    """
    # Named `description` so the local does not shadow the `desc` function
    # imported from pyspark.sql.functions earlier in the notebook.
    description = row["Crm Cd Desc"]
    age = row["Vict Age"]

    # Filter aggravated assaults
    if description is None or "aggravated assault" not in description.lower():
        return None

    # Validate age
    if age is None:
        return None

    try:
        age = int(age)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "invalid age"; other exceptions
        # (e.g. KeyboardInterrupt) propagate instead of being swallowed.
        return None

    # Assign age group
    # NOTE(review): zero and negative ages also fall into "Children" --
    # confirm that is intended for this dataset.
    if age < 18:
        return "Children"
    if age <= 24:
        return "Young Adults"
    if age <= 64:
        return "Adults"
    return "Elderly"

# RDD pipeline (map → filter → reduce)
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

query_1_rdd = (
    crime_rdd
    .map(map_to_age_group)               # Compute age group or None
    .filter(lambda x: x is not None)     # Keep only valid records
    .map(lambda g: (g, 1))               # Prepare for reduction
    .reduceByKey(lambda a, b: a + b)     # Count per group
    .sortBy(lambda x: -x[1])             # Sort descending by count
    .collect()                           # Force computation
)

end_time = time.perf_counter()

print(f"\nExecution Time: {end_time - start_time:.4f} seconds\n")

# Display results.
# The loop variables are deliberately not called `count`: at notebook top level
# that would overwrite the `count` function imported from pyspark.sql.functions.
for group_name, incident_count in query_1_rdd:
    print(f"{group_name:15}  {incident_count}")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…


Execution Time: 22.8356 seconds

Adults           121660
Young Adults     33758
Children         16014
Elderly          6011