In [None]:
from sedona.spark import *
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
import time

# Create spark Session
spark = SparkSession.builder \
    .appName("Query 3") \
    .getOrCreate()

Starting Spark application




In [None]:
# Create sedona context
sedona = SedonaContext.create(spark)

# Read the file from s3
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
blocks_df = sedona.read.format("geojson") \
            .option("multiLine", "true").load(geojson_path) \
            .selectExpr("explode(features) as features") \
            .select("features.*")
# Formatting magic
flattened_df = blocks_df.select( \
                [col(f"properties.{col_name}").alias(col_name) for col_name in \
                blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type")

# Print schema
flattened_df.printSchema()

In [12]:
from pyspark.sql.functions import regexp_replace, col, substring, year, to_date, expr, sum, avg

# Filter Los Angeles areas based on CITY column
LA_areas = (flattened_df
    .filter(col("CITY") == "Los Angeles")
    .groupBy("COMM", "ZCTA10", "POP_2010", "HOUSING10")
    .agg(ST_Union_Aggr("geometry").alias("geometry"))
)

# Load income data from the CSV file
file_path = 's3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv'
income_df = spark.read.csv(file_path, header=True, inferSchema=True)

# Clean and convert the "Estimated Median Income" column from string format "$33,887" to double format 33887.0
income_df = (income_df
    .withColumn("Estimated Median Income", 
        regexp_replace(col("Estimated Median Income"), "[$,]", "").cast("double")
    )
)

# Convert "Zip Code" column to double and rename it to "ZipCode"
income_df = income_df.withColumn("ZipCode", col("Zip Code").cast("double"))

# Convert the "ZCTA10" column in Los Angeles areas to double
LA_areas = LA_areas.withColumn("ZCTA10", col("ZCTA10").cast("double"))


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [13]:
# Join Los Angeles areas with income data based on ZCTA10
joined_df = (LA_areas.join(
    income_df,
    LA_areas.ZCTA10 == income_df.ZipCode, "inner"
).select("COMM", "ZCTA10", "POP_2010", "HOUSING10", *income_df.columns))

print("Joined_df Explain:")
joined_df.explain()

# Calculate total population and total number of households per ZIP Code
zip_totals = (joined_df.groupBy("ZCTA10").agg(
    sum("POP_2010").alias("TOTAL_ZIP_POP"),
    sum("HOUSING10").alias("TOTAL_ZIP_HOUSING")
))

# Join the total data (population and housing) with the main DataFrame
joined_with_totals = joined_df.join(zip_totals, "ZCTA10", "left")

print("joined_with_totals Explain:")
joined_with_totals.explain()

# Group by area (COMM) and calculate estimated income per person
result_df = (joined_with_totals.groupBy("COMM").agg(
    sum("TOTAL_ZIP_HOUSING").alias("TOTAL_HOUSING"),
    sum("TOTAL_ZIP_POP").alias("TOTAL_POP"),
    avg("Estimated Median Income").alias("AVG_MEDIAN_INCOME")
).withColumn("Estimated_Income_Per_Person",
    (col("AVG_MEDIAN_INCOME") * col("TOTAL_HOUSING")) / col("TOTAL_POP")
))

# Filter to remove areas with NULL values
result_df = result_df.filter(result_df.Estimated_Income_Per_Person.isNotNull())

# Display the results
result_df.select("COMM", "Estimated_Income_Per_Person").show(result_df.count(), truncate=False)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Joined_df Explain:
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- BroadcastHashJoin [knownfloatingpointnormalized(normalizenanandzero(ZCTA10#2143))], [knownfloatingpointnormalized(normalizenanandzero(ZipCode#2138))], Inner, BuildRight, false
   :- HashAggregate(keys=[COMM#49, ZCTA10#66, POP_2010#58L, HOUSING10#55L], functions=[], schema specialized)
   :  +- Exchange hashpartitioning(COMM#49, ZCTA10#66, POP_2010#58L, HOUSING10#55L, 1000), ENSURE_REQUIREMENTS, [plan_id=7290]
   :     +- HashAggregate(keys=[COMM#49, ZCTA10#66, POP_2010#58L, HOUSING10#55L], functions=[], schema specialized)
   :        +- Project [features#33.properties.COMM AS COMM#49, features#33.properties.HOUSING10 AS HOUSING10#55L, features#33.properties.POP_2010 AS POP_2010#58L, features#33.properties.ZCTA10 AS ZCTA10#66]
   :           +- Filter ((isnotnull(features#33.properties.CITY) AND (features#33.properties.CITY = Los Angeles)) AND isnotnull(cast(features#33.properties.ZCTA10 as double)))
   :    

In [18]:
# Load crime data from the specified CSV file
file_path = 's3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv'
crimeData_df = spark.read.csv(file_path, header=True, inferSchema=True)

# Extract the year from the "Date Rptd" column and filter for crimes reported in 2015
crimeData_df = crimeData_df.withColumn("year", year(to_date(col("Date Rptd"), "MM/dd/yyyy hh:mm:ss a")))
crimeData_df = crimeData_df.filter(col("year") == 2015)

# Create a geometry column using the latitude (LAT) and longitude (LON) values
crimeData_df = crimeData_df.withColumn("geom", expr("ST_Point(LON, LAT)"))

# Define a function to execute spatial joins and measure execution time for different join strategies
def join_strategy(strategy):
    print(f"\nExecuting join using the {strategy} strategy")
    start_time = time.time()

    # Perform a spatial join between crime data and Los Angeles area geometries
    crimeDataGEO_df = crimeData_df.hint(strategy).join(
        LA_areas.hint(strategy), expr("ST_Within(geom, geometry)"), "inner"
    )

    # Group by community (COMM) and calculate the total number of crimes per community
    community_crime_stats = crimeDataGEO_df.groupBy("COMM").agg(
        expr("count(*)").alias("Total_Crimes_Per_Community")
    )

    # Aggregate the total population for each community
    community_population = LA_areas.groupBy("COMM").agg(
        sum("POP_2010").alias("TOTAL_COMM_POP")
    )

    # Join the crime statistics with the population data on community (COMM)
    final_stats = community_crime_stats.hint(strategy).join(
        community_population.hint(strategy), "COMM", "inner"
    )

    # Compute the crime rate per person for each community
    final_stats = final_stats.withColumn(
        "Crimes_Per_Person", col("Total_Crimes_Per_Community") / col("TOTAL_COMM_POP")
    )

    # Calculate and display the execution time
    execution_time = time.time() - start_time
    print(f"Execution time using {strategy}: {execution_time:.4f} seconds")

    # Show the final computed statistics
    final_stats.show(final_stats.count())
    return execution_time


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [19]:
# Test the join function with four different strategies
strategies = ["BROADCAST", "SHUFFLE_HASH", "MERGE", "SHUFFLE_REPLICATE_NL"]
results = {}

# Execute the join for each strategy and store the execution times
for strategy in strategies:
    results[strategy] = join_strategy(strategy)

# Summarize the results and display execution times for each strategy
for strategy, time_taken in results.items():
    print(f"Having Join Strategy: {strategy},the execution time is : {time_taken:.4f} sec")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…


Executing join using the BROADCAST strategy
Execution time using BROADCAST: 0.0421 seconds
+--------------------+--------------------------+--------------+--------------------+
|                COMM|Total_Crimes_Per_Community|TOTAL_COMM_POP|   Crimes_Per_Person|
+--------------------+--------------------------+--------------+--------------------+
|       Granada Hills|                      2417|         52113| 0.04637998196227429|
|         North Hills|                      2579|         55437|0.046521276403845804|
|      Temple-Beaudry|                      1823|         36396| 0.05008792174964282|
|         Harbor City|                      1315|         26953| 0.04878863206322116|
|       Panorama City|                      3343|         68634| 0.04870763761401055|
|         Westchester|                      3741|         46196| 0.08098103731924843|
|   Wellington Square|                       276|          4606|0.059921841076856275|
|      Mt. Washington|                       809