In [1]:
import time
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import col, year, count, when, desc, sum, to_timestamp, row_number, regexp_replace, expr, asc
from pyspark.sql.types import DecimalType
from pyspark.sql import functions as F

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3690,application_1732639283265_3636,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
### QUERY 1
# Count aggravated-assault victims per age group with the DataFrame API and
# time the query so it can be compared with the RDD implementation.
APP_NAME = "Crime Victim Age Analysis"
SPARK_EXECUTORS = 4
# NOTE(review): the notebook output reports that a SparkSession is already
# available as 'spark', so getOrCreate() presumably returns that session and
# "spark.executor.instances" may not take effect here -- confirm how the
# executor count is actually configured for this cluster.
spark = SparkSession.builder.appName(APP_NAME).config("spark.executor.instances", SPARK_EXECUTORS).getOrCreate()

# crime data
d1 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv",
    header=True, inferSchema=True)
d2 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv",
    header=True, inferSchema=True)
# union() matches columns by position, so both files must share the same column order.
crime_data = d1.union(d2)

# DATAFRAME API BEGIN #
start = time.time()
# Keep only the rows whose crime description mentions aggravated assault.
filtered_df = crime_data.filter(col("Crm Cd Desc").contains("AGGRAVATED ASSAULT"))
vict_age = col("Vict Age")
# Bucket every victim into an age group. The branches are evaluated in order,
# so each upper bound implicitly excludes the ranges handled before it.
# A missing age matches none of the comparisons; label it "NULL" explicitly so
# the result agrees with the RDD implementation instead of yielding a null group.
grouped_df = filtered_df.withColumn(
    "AGE GROUP",
    when(vict_age <= 0, "Unknown")
    .when(vict_age < 18, "Children")
    .when(vict_age <= 24, "Young adults")
    .when(vict_age <= 64, "Adults")
    .when(vict_age > 64, "Elderly")
    .otherwise("NULL"),
)
# Count rows per age group, largest group first. collect() is the action that
# actually runs the query, so it has to stay inside the timed region.
categories_df = grouped_df.groupBy("AGE GROUP").count().orderBy(desc("count")).collect()
end = time.time()
dataframe_time = end - start
print("DATAFRAME performance:", dataframe_time)
print("DATAFRAME results:", categories_df)
# DATAFRAME API END #

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DATAFRAME performance: 7.735968351364136
DATAFRAME results: [Row(AGE GROUP='Adults', count=121093), Row(AGE GROUP='Young adults', count=33605), Row(AGE GROUP='Children', count=10830), Row(AGE GROUP='Elderly', count=5985), Row(AGE GROUP='Unknown', count=5098)]

In [3]:
# Reload both crime CSV exports (2010-2019 and 2020-present) and stack them
# into a single DataFrame for the RDD benchmark.
crime_csv_paths = [
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv",
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv",
]
# Each file is read on its own, with a header row and an inferred schema.
d1, d2 = [
    spark.read.csv(csv_path, header=True, inferSchema=True)
    for csv_path in crime_csv_paths
]
crime_data = d1.union(d2)

def get_category_by_age(age):
    """Map a victim age to the label of its age group.

    Args:
        age: The victim's age, or None when the value is missing. Any
            non-None value is converted with ``int()`` before bucketing.

    Returns:
        "NULL" for a missing age, "Unknown" for ages <= 0, "Children" for
        1-17, "Young adults" for 18-24, "Adults" for 25-64 and "Elderly"
        for anything above 64.
    """
    if age is None:
        return "NULL"

    years = int(age)
    if years <= 0:
        return 'Unknown'
    if years < 18:
        return "Children"
    if years <= 24:
        return "Young adults"
    if years <= 64:
        return "Adults"
    return "Elderly"

# RDD API BEGIN #
# Same aggravated-assault age-group count as the DataFrame query, expressed
# with the RDD API and timed for comparison.
start = time.time()
crime_rdd = crime_data.rdd # convert dataframe to rdd
# A missing description would make the `in` test raise TypeError; skip such
# rows, which is also what the DataFrame `contains` filter does with nulls.
filtered_rdd = crime_rdd.filter(
    lambda row: row["Crm Cd Desc"] is not None and "AGGRAVATED ASSAULT" in row["Crm Cd Desc"]
)
# Emit (age group, 1) pairs so the groups can be counted with reduceByKey.
grouped_rdd = filtered_rdd.map(lambda row: (get_category_by_age(row["Vict Age"]), 1))
# Sum the ones per group and sort by descending count (negated key).
# collect() is the action that runs the job, so it stays inside the timed region.
categories_rdd = grouped_rdd.reduceByKey(lambda x,y: x+y).sortBy(lambda tup: -tup[1]).collect()
end = time.time()
rdd_time = end - start
print("RDD performance:", rdd_time)
print("RDD results:", categories_rdd)
# RDD API END #

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

RDD performance: 16.28496217727661
RDD results: [('Adults', 121093), ('Young adults', 33605), ('Children', 10830), ('Elderly', 5985), ('Unknown', 5098)]

In [4]:
# How many times faster the DataFrame query ran than the RDD one.
speedup = rdd_time / dataframe_time
print("Dataframe Speedup:", speedup)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Dataframe Speedup: 2.1050967943017724