In [1]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "2",
        "spark.executor.memory": "8g",
        "spark.executor.cores": "4"
    }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3608,application_1732639283265_3564,pyspark,idle,Link,Link,,
3612,application_1732639283265_3568,pyspark,idle,Link,Link,,
3624,application_1732639283265_3580,pyspark,idle,Link,Link,,
3630,application_1732639283265_3586,pyspark,idle,Link,Link,,
3634,application_1732639283265_3590,pyspark,idle,Link,Link,,
3637,application_1732639283265_3593,pyspark,busy,Link,Link,,
3641,application_1732639283265_3597,pyspark,idle,Link,Link,,
3659,application_1732639283265_3605,pyspark,idle,Link,Link,,
3660,application_1732639283265_3606,pyspark,idle,Link,Link,,
3661,application_1732639283265_3607,pyspark,idle,Link,Link,,


In [2]:
import time
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import col, year, count, when, desc, sum, to_timestamp, row_number, regexp_replace, expr, asc
from pyspark.sql.types import DecimalType
from pyspark.sql import functions as F
from sedona.spark import *

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3774,application_1732639283265_3718,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
### Query 4
APP_NAME = "Crime Victime Race Analysis"
SPARK_EXECUTORS = 2

# getOrCreate() hands back the already-running session when one exists, in
# which case the builder options below are only requests, not guarantees.
spark = (
    SparkSession.builder
    .appName(APP_NAME)
    .config("spark.executor.instances", SPARK_EXECUTORS)
    .getOrCreate()
)

# The executor count is fixed when the application is launched. Compare the
# request with what the SparkContext was actually started with, so that a
# silently ignored setting shows up in the cell output instead of quietly
# invalidating the benchmark.
effective_executors = spark.sparkContext.getConf().get("spark.executor.instances")
if effective_executors != str(SPARK_EXECUTORS):
    print(
        f"WARNING: requested spark.executor.instances={SPARK_EXECUTORS}, "
        f"but the running application reports {effective_executors}. "
        "Set it in the %%configure cell before the session starts."
    )

# Register Sedona's spatial types/functions (ST_Point, ST_Within, ...) on the session.
sedona = SedonaContext.create(spark)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# crime data: read the CSV, discard NULL ISLAND (0,0) records, keep only the
# columns used later and swap LAT/LON for a point geometry
crime_data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv")
    .filter(~((col('LAT') == 0) & (col('LON') == 0)))
    .select('DR_NO', 'LAT', 'LON', 'Vict Descent', 'Date Rptd')
    .withColumn('geometry', ST_Point('LON', 'LAT'))
    .drop('LON', 'LAT')
    # parse the reported date text into a timestamp so year() can be applied
    .withColumn("Date Rptd", F.to_timestamp(col("Date Rptd"), "MM/dd/yyyy hh:mm:ss a"))
)

# crimes reported in 2015 that carry a victim descent code
crimes_2015 = crime_data.filter(
    (year(col("Date Rptd")) == 2015) & col("Vict Descent").isNotNull()
)

# median household income data: the income arrives as text such as "$12,345",
# so strip "$" and "," before casting, then drop the columns not used later
income_data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("s3://initial-notebook-data-bucket-dblab-905418150721//LA_income_2015.csv")
    .withColumn(
        "median_income",
        regexp_replace(col("Estimated Median Income"), "[$,]", "").cast(DecimalType()),
    )
    .drop('Estimated Median Income', 'Community')
)

# census block data: one row per GeoJSON feature
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
census_blocks = (
    sedona.read.format("geojson")
    .option("multiLine", "true")
    .load(geojson_path)
    .selectExpr("explode(features) as features")
    .select("features.*")
)

# promote every field of the `properties` struct to a top-level column next to
# `geometry`, then keep only the blocks whose CITY is Los Angeles
census_blocks = (
    census_blocks
    .select(
        *[
            F.col(f"properties.{field}").alias(field)
            for field in census_blocks.schema["properties"].dataType.fieldNames()
        ],
        "geometry",
    )
    .drop("properties", "type")
    .filter(col('CITY') == 'Los Angeles')
)

# race and ethnicity legend (lookup table for the victim descent codes)
descent_legend = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv")
)

### Query 3
# assign each crime to the census block containing its point, then count the
# crimes of every community (COMM)
comm_join_crimes = crime_data.join(
    census_blocks,
    ST_Within(crime_data['geometry'], census_blocks['geometry']),
)
count_comm_crimes = comm_join_crimes.groupby('COMM').agg(
    F.count('*').alias('crimes_count')
)

# population and housing units for every (zip code, community) pair
zip_comm_houses_pop = census_blocks.groupBy('ZCTA10', 'COMM').agg(
    F.sum('POP_2010').alias('population'),
    F.sum('HOUSING10').alias('houses'),
)

# attach the zip code's median household income (inner join: pairs whose zip
# code has no income row are dropped)
zip_comm_houses_pop_hincome = zip_comm_houses_pop.join(
    income_data,
    income_data['Zip Code'] == zip_comm_houses_pop['ZCTA10'],
).drop('Zip Code')

# estimated total income of the pair = median household income * housing units
zip_comm_total_income = zip_comm_houses_pop_hincome.withColumn(
    "zip_total_income", F.col('median_income') * F.col('houses')
)

# roll the pairs up to community level
comm_total_population_total_income = zip_comm_total_income.groupBy('COMM').agg(
    F.sum('population').alias('total_population'),
    F.sum('zip_total_income').alias('total_income'),
)

# right join: every community that has a crime count is kept, even when no
# population/income row exists for it
comm_crime_income = comm_total_population_total_income.join(
    count_comm_crimes, on='COMM', how='right'
)

# per-person figures; both fall back to 0 whenever the `total_population > 0`
# test does not hold (which includes a missing population)
comm_mincome_person = (
    comm_crime_income
    .withColumn(
        'average_income_per_person',
        F.when(
            F.col('total_population') > 0,
            F.col('total_income') / F.col('total_population'),
        ).otherwise(0),
    )
    .withColumn(
        'crime_rate_per_person',
        F.when(
            F.col('total_population') > 0,
            F.col('crimes_count') / F.col('total_population'),
        ).otherwise(0),
    )
    .drop('total_income', 'crimes_count', 'total_population')
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
def query4(crimes_2015, income_data, census_blocks, descent_legend, comm_mincome_person):
    """Print victim counts by descent for the 3 richest and 3 poorest communities.

    Communities are ranked by `average_income_per_person`. The crimes are
    spatially matched to census blocks, labelled with the full descent
    description and counted per description, once for the top three
    communities and once for the bottom three.

    Args:
        crimes_2015: crime rows with a `geometry` point and a `Vict Descent` code.
        income_data: not used by this function; kept so existing calls still work.
        census_blocks: census block rows with a `geometry` and a `COMM` column.
        descent_legend: lookup with `Vict Descent` and `Vict Descent Full` columns.
        comm_mincome_person: one row per `COMM` with `average_income_per_person`.

    Returns:
        None. Both result tables are printed with `show()`.
    """
    # A sorted limit picks the three extremes on each side directly. This
    # avoids an eager count() job while the plan is still being built, and a
    # window without partitioning that pulls every row into one partition.
    comm_bottom_3 = (
        comm_mincome_person
        .orderBy(F.asc("average_income_per_person"))
        .limit(3)
        .select("COMM")
    )
    comm_top_3 = (
        comm_mincome_person
        .orderBy(F.desc("average_income_per_person"))
        .limit(3)
        .select("COMM")
    )

    # match each crime to the census block (and so the community) containing it;
    # the condition uses the `crimes_2015` argument, not a notebook-level global
    crimes_with_blocks = crimes_2015.join(
        census_blocks,
        ST_Within(crimes_2015['geometry'], census_blocks['geometry'])
    )

    # join legend for descriptions; the left join keeps crimes whose code has
    # no legend entry (they end up with a null description)
    crimes_with_descriptions = crimes_with_blocks.join(
        descent_legend,
        crimes_with_blocks["Vict Descent"] == descent_legend["Vict Descent"],
        how="left"
    ).select(
        crimes_with_blocks["*"],
        descent_legend["Vict Descent Full"].alias("descent_description")
    )

    def victim_counts(selected_comms):
        """Count victims per descent description within the given communities."""
        return (
            crimes_with_descriptions
            .join(selected_comms, "COMM", "inner")
            .groupBy("descent_description")
            .agg(F.count("*").alias("victim_count"))
            .orderBy(F.desc("victim_count"))
        )

    print("Top COMM Victim Count:")
    victim_counts(comm_top_3).show()

    print("Bottom COMM Victim Count:")
    victim_counts(comm_bottom_3).show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# Time the whole query call, including the show() actions it triggers.
# perf_counter() is the standard-library clock intended for measuring elapsed
# durations; time.time() is wall-clock time and can jump if the system clock
# is adjusted during the run.
start = time.perf_counter()
query4(crimes_2015, income_data, census_blocks, descent_legend, comm_mincome_person)
end = time.perf_counter()

# Report the executor settings stored in the SparkContext configuration
# alongside the measured duration (in seconds).
conf = spark.sparkContext.getConf()
print("Executor Cores:", conf.get("spark.executor.cores"))
print("Executor Memory:", conf.get("spark.executor.memory"))
print("Processing time:", end-start)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Top COMM Victim Count:
+--------------------+------------+
| descent_description|victim_count|
+--------------------+------------+
|               White|         682|
|               Other|          81|
|Hispanic/Latin/Me...|          75|
|             Unknown|          50|
|               Black|          46|
|         Other Asian|          22|
|             Chinese|           1|
|American Indian/A...|           1|
+--------------------+------------+

Bottom COMM Victim Count:
+--------------------+------------+
| descent_description|victim_count|
+--------------------+------------+
|Hispanic/Latin/Me...|        3314|
|               Black|        1152|
|               White|         435|
|               Other|         250|
|         Other Asian|         136|
|             Unknown|          31|
|American Indian/A...|          22|
|             Chinese|           4|
|              Korean|           4|
|            Filipino|           3|
|         AsianIndian|           1|
|           Gu