In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, to_date, count, sum as _sum, corr, desc, lit
from pyspark.sql.types import IntegerType, StringType, StructType, StructField, DoubleType, FloatType

# Sedona Imports
from sedona.register import SedonaRegistrator
from sedona.utils import SedonaKryoRegistrator, KryoSerializer

# Build (or reuse) the Spark session for this query, configured to use Kryo
# serialization together with Sedona's Kryo registrator.
spark = (
    SparkSession.builder
    .appName("Query 5 execution")
    .config("spark.serializer", KryoSerializer.getName)
    .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
    # Sedona's documented SQL extension entry point. An extension class that
    # Spark cannot load is only reported as a warning, so a wrong name here
    # would go unnoticed.
    .config("spark.sql.extensions", "org.apache.sedona.sql.SedonaSqlExtensions")
    .getOrCreate()
)

# Register Sedona on the session explicitly as well: when getOrCreate() returns
# an already-running session, the builder's static configs are not re-applied.
SedonaRegistrator.registerAll(spark)

In [None]:
# Preparing the data
# Explicit schema for the LA crime CSV, handed to the reader instead of relying
# on type inference. Date columns are kept as raw strings here.
Crime_data_schema = StructType([
    StructField("DR_NO", IntegerType()),
    StructField("Date Rptd", StringType()),
    StructField("DATE OCC", StringType()),
    StructField("TIME OCC", IntegerType()),
    StructField("AREA", IntegerType()),
    StructField("AREA NAME", StringType()),
    StructField("Rpt Dist No", IntegerType()),
    StructField("Part 1-2", IntegerType()),
    StructField("Crm Cd", IntegerType()),
    StructField("Crm Cd Desc", StringType()),
    StructField("Mocodes", StringType()),
    StructField("Vict Age", IntegerType()),
    StructField("Vict Sex", StringType()),
    StructField("Vict Descent", StringType()),
    StructField("Premis Cd", IntegerType()),
    StructField("Premis Desc", StringType()),
    StructField("Weapon Used Cd", IntegerType()),
    StructField("Weapon Desc", StringType()),
    StructField("Status", StringType()),
    StructField("Status Desc", StringType()),
    StructField("Crm Cd 1", IntegerType()),
    StructField("Crm Cd 2", IntegerType()),
    StructField("Crm Cd 3", IntegerType()),
    StructField("Crm Cd 4", IntegerType()),
    StructField("LOCATION", StringType()),
    StructField("Cross Street", StringType()),
    # Coordinates are read as 64-bit doubles. A 32-bit FloatType holds only
    # about 7 significant digits, so a value parsed as float and later widened
    # to double (e.g. to build a point geometry) no longer matches the text in
    # the source file.
    StructField("LAT", DoubleType()),
    StructField("LON", DoubleType()),
])

# Schema for the income CSV. The median-income column is declared as a string,
# so whatever text the file contains is kept verbatim.
Income_schema = (
    StructType()
    .add("Zip Code", IntegerType())
    .add("Community", StringType())
    .add("Estimated Median Income", StringType())
)

# All three inputs are read from the same S3 prefix. Defining it once keeps the
# bucket name identical across the loads (the census path previously used a
# bucket suffix that differed from the other two by one digit).
PROJECT_DATA_ROOT = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data"

# Crime records: CSV with a header row, parsed with the explicit crime schema.
Crime_df = spark.read.csv(
    f"{PROJECT_DATA_ROOT}/LA_Crime_Data/LA_Crime_Data_2020_2025.csv",
    header=True,
    schema=Crime_data_schema,
)

# Census blocks: loaded through the "geojson" data source.
# NOTE(review): "dropMalformed" is passed through unchanged; confirm the geojson
# reader actually honours this option name (Spark's JSON reader uses
# mode=DROPMALFORMED) and whether multiLine is required for this file.
census_df = (
    spark.read.format("geojson")
    .option("dropMalformed", "true")
    .load(f"{PROJECT_DATA_ROOT}/LA_Census_Blocks_2020.geojson")
)

# Income table: semicolon-separated CSV with a header row and explicit schema.
income_df = spark.read.csv(
    f"{PROJECT_DATA_ROOT}/LA_income_2021.csv",
    header=True,
    schema=Income_schema,
    sep=";",
)