# Query 1

In [1]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one; otherwise create it under the name "Query 1".
spark = (
    SparkSession.builder
    .appName("Query 1")
    .getOrCreate()
)

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
125,application_1738075734771_0126,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "4"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
126,application_1738075734771_0127,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
123,application_1738075734771_0124,pyspark,idle,Link,Link,,
124,application_1738075734771_0125,pyspark,idle,Link,Link,,
126,application_1738075734771_0127,pyspark,idle,Link,Link,,✔


In [3]:
# Report the cluster manager and executor count the session is actually running with.
spark_context = spark.sparkContext
master = spark_context.master
conf = spark_context.getConf()
print(f"Spark Master: {master}")
print("Executor Instances:", conf.get("spark.executor.instances"))

# S3 locations of the two crime-data CSV exports (2010-2019 and 2020-present).
data_path_1 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
data_path_2 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Spark Master: yarn
Executor Instances: 4

### DataFrame API

In [12]:
import time
# Start the wall-clock timer for the DataFrame run. The matching end_time is
# read in a later cell, so the reported duration covers every cell run in between.
start_time = time.time()


def _read_age_and_crime(csv_path):
    """Read one crime CSV and keep only the victim-age and crime-description columns."""
    full_frame = spark.read.csv(
        path=csv_path,
        header=True,       # first line holds the column names
        inferSchema=True,  # let Spark work out the column types
    )
    return full_frame.select("Vict Age", "Crm Cd Desc")


data_1 = _read_age_and_crime(data_path_1)
data_2 = _read_age_and_crime(data_path_2)

# Preview the first rows of each dataset.
for preview_frame in (data_1, data_2):
    preview_frame.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [13]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, when

def count_aggravated_assaults_by_age_group(df: DataFrame) -> DataFrame:
    """
    Count aggravated-assault records per victim age group.

    Keeps the rows whose 'Crm Cd Desc' contains the text 'AGGRAVATED ASSAULT',
    labels each one with an age bracket derived from 'Vict Age', and counts the
    rows in every bracket.

    Parameters:
    df (DataFrame): PySpark DataFrame with 'Crm Cd Desc' and 'Vict Age' columns.

    Returns:
    DataFrame: One row per 'Age Group' with its 'count', largest count first.
    """
    victim_age = col("Vict Age")

    # Brackets are tested top to bottom; a row matching none of them
    # (for example a null age) is labelled "Unknown".
    # NOTE(review): every age below 18, including 0, lands in "Children (<18)";
    # confirm that 0 is a real age in this dataset and not a placeholder.
    age_group = (
        when(victim_age < 18, "Children (<18)")
        .when(victim_age.between(18, 24), "Young Adults (18-24)")
        .when((victim_age > 24) & (victim_age < 65), "Adults (25-64)")
        .when(victim_age >= 65, "Elderly (65+)")
        .otherwise("Unknown")
    )

    # The LIKE pattern acts as a case-sensitive "contains" match.
    return (
        df.filter(col("Crm Cd Desc").like("%AGGRAVATED ASSAULT%"))
        .withColumn("Age Group", age_group)
        .groupBy("Age Group")
        .count()
        .orderBy(col("count").desc())
    )


print("Ok")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Ok

In [14]:
# Aggregate each dataset separately with the shared helper.
result_1, result_2 = [
    count_aggravated_assaults_by_age_group(dataset) for dataset in (data_1, data_2)
]

# Display the per-dataset counts.
for per_dataset_counts in (result_1, result_2):
    per_dataset_counts.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [15]:
import time

# Stack the per-dataset counts; each age group can now appear once per dataset.
combined_df = result_1.union(result_2)
combined_df.show()

# Collapse the duplicates by summing the counts per age group, largest total first.
summed_counts = combined_df.groupBy("Age Group").sum("count")
result = (
    summed_counts
    .withColumnRenamed("sum(count)", "total_count")
    .orderBy("total_count", ascending=False)
)

print("Final Result:")
result.show()

# Stop the timer that was started in the data-loading cell and report the elapsed time.
end_time = time.time()
print(f"Execution time: {end_time - start_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+-----+
|           Age Group|count|
+--------------------+-----+
|      Adults (25-64)|72610|
|Young Adults (18-24)|23472|
|      Children (<18)|10724|
|       Elderly (65+)| 3099|
|      Adults (25-64)|48483|
|Young Adults (18-24)|10133|
|      Children (<18)| 5204|
|       Elderly (65+)| 2886|
+--------------------+-----+

Final Result:
+--------------------+-----------+
|           Age Group|total_count|
+--------------------+-----------+
|      Adults (25-64)|     121093|
|Young Adults (18-24)|      33605|
|      Children (<18)|      15928|
|       Elderly (65+)|       5985|
+--------------------+-----------+

Execution time: 23.36 seconds

### RDD API

Python's built-in `csv` module is used to parse each line because:
- It correctly interprets commas inside quoted fields, treating them as part of the field rather than as delimiters. (We ran into this with the crime description field, which is quoted and contains commas.)

- It handles cases where some rows might have missing or extra fields.

- It supports escaped double quotes inside a quoted field; with the default dialect a quote is escaped by doubling it ("field, with ""quotes"" and commas").

In [8]:
# Load the data
import csv
import time

sc = spark.sparkContext

# Start the wall-clock timer for the RDD run; it is read again in the final RDD cell.
start_time = time.time()

# Columns the query needs, identified by header name instead of by position.
AGE_COLUMN = "Vict Age"
CRIME_COLUMN = "Crm Cd Desc"


def _parse_csv_line(line):
    """Split one CSV line into fields, keeping commas inside quoted fields intact."""
    return list(csv.reader([line]))[0]


def load_age_and_crime(path):
    """
    Load one crime CSV as an RDD of [age, crime_description] rows.

    The file is read line by line, so a quoted field that spans several
    lines is not supported.

    Parameters:
    path (str): Location of the CSV file; its first line must be the header.

    Returns:
    tuple: (header, rdd) where header is the list of column names and rdd holds
    one [int age, str crime description] list per data row.

    Raises:
    ValueError: If the header lacks the 'Vict Age' or 'Crm Cd Desc' column.
    """
    parsed = sc.textFile(path).map(_parse_csv_line)
    header = parsed.first()

    # Resolve the positions from the header so a reordered file fails loudly
    # (or keeps working) instead of silently reading the wrong columns.
    age_idx = header.index(AGE_COLUMN)
    crime_idx = header.index(CRIME_COLUMN)

    rows = (
        parsed
        .filter(lambda row: row != header)  # drop the header line
        .map(lambda row: [int(row[age_idx]), row[crime_idx]])
    )
    return header, rows


header_1, rdd_1 = load_age_and_crime(data_path_1)
header_2, rdd_2 = load_age_and_crime(data_path_2)

# print(rdd_1.take(10))
# print(rdd_2.take(10))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[[48, 'VIOLATION OF COURT ORDER'], [0, 'VANDALISM - FELONY ($400 & OVER, ALL CHURCH VANDALISMS)'], [0, 'OTHER MISCELLANEOUS CRIME'], [47, 'VIOLATION OF COURT ORDER'], [47, 'RAPE, ATTEMPTED'], [23, 'SHOPLIFTING - PETTY THEFT ($950 & UNDER)'], [46, 'BURGLARY FROM VEHICLE'], [51, 'ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT'], [30, 'ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT'], [55, 'THEFT-GRAND ($950.01 & OVER)EXCPT,GUNS,FOWL,LIVESTK,PROD']]
[[0, 'VEHICLE - STOLEN'], [47, 'BURGLARY FROM VEHICLE'], [19, 'BIKE - STOLEN'], [19, 'SHOPLIFTING-GRAND THEFT ($950.01 & OVER)'], [28, 'THEFT OF IDENTITY'], [41, 'THEFT OF IDENTITY'], [25, 'THEFT OF IDENTITY'], [27, 'THEFT OF IDENTITY'], [24, 'THEFT OF IDENTITY'], [26, 'BATTERY - SIMPLE ASSAULT']]

In [5]:
from pyspark.sql import SparkSession

def process_rdd_for_aggravated_assaults(rdd):
    """
    Count aggravated assaults per victim age group.

    Parameters:
    rdd (RDD): Rows of [age, crime_description].

    Returns:
    RDD: (age_group_label, count) pairs, one per age group present in the input.
    """

    def is_aggravated_assault(row):
        # Case-insensitive substring match on the crime description.
        return "aggravated assault" in row[1].lower()

    def to_age_group_pair(row):
        # NOTE(review): every age below 18, including 0, is labelled
        # "Children (<18)"; confirm that 0 is a real age in this dataset.
        if row[0] < 18:
            label = "Children (<18)"
        elif 18 <= row[0] <= 24:
            label = "Young Adults (18-24)"
        elif 25 <= row[0] <= 64:
            label = "Adults (25-64)"
        else:
            label = "Elderly (65+)"
        # Emit a count of 1 per record so the reducer can add them up.
        return (label, 1)

    def add_counts(left, right):
        return left + right

    return (
        rdd.filter(is_aggravated_assault)
        .map(to_age_group_pair)
        .reduceByKey(add_counts)
    )


print("Ok")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Ok

In [6]:
# Build the per-dataset aggregation for each RDD with the shared helper.
result_rdd_1, result_rdd_2 = [
    process_rdd_for_aggravated_assaults(source_rdd) for source_rdd in (rdd_1, rdd_2)
]

# print(result_rdd_1.sortByKey().take(5))
# print(result_rdd_2.sortByKey().take(5))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
import time

# Merge the per-dataset totals. After the union an age group can appear once per
# dataset, so the counts are reduced by key again before sorting by count, descending.
merged_counts = result_rdd_1.union(result_rdd_2).reduceByKey(
    lambda left, right: left + right
)
combined_rdd = merged_counts.sortBy(lambda pair: pair[1], ascending=False)

print("Final Result:")
print(combined_rdd.collect())

# Stop the timer that was started in the RDD loading cell and report the elapsed time.
end_time = time.time()
print(f"Execution time: {end_time - start_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Final Result:
[('Adults (25-64)', 121093), ('Young Adults (18-24)', 33605), ('Children (<18)', 15928), ('Elderly (65+)', 5985)]
Execution time: 28.31 seconds