In [13]:
# Import pandas and findspark, then initialise findspark.
# This runs before the pyspark imports in the following cells.
import pandas as pd
import findspark
findspark.init()

In [14]:
# Import pandas and findspark, then initialise findspark.
# NOTE(review): this cell repeats the previous cell line for line; it looks
# safe to delete one of the two copies - confirm before removing.
import pandas as pd
import findspark
findspark.init()

In [15]:
# Import packages
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import StructType,StructField,StringType, DateType,IntegerType



In [16]:
# Spark Analysis

In [17]:
# Read the combined arrest records from the project's S3 bucket into pandas.
df = pd.read_csv(
    'https://profilingbucket.s3.us-east-2.amazonaws.com/combined_data.csv'
)

# Wrap the loaded frame under the name that is handed to Spark below.
profiling_arrest_analysis = pd.DataFrame(df)

# Start a Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("pandasToSpark")
    .getOrCreate()
)

# Convert the pandas frame into a Spark DataFrame and preview five rows.
spark_df = spark.createDataFrame(profiling_arrest_analysis)
spark_df.show(n=5)

  if should_localize and is_datetime64tz_dtype(s.dtype) and s.dt.tz is not None:


+-----------+-------------------+---+------------------+--------+--------------------+--------------------+------------+--------------------+------------------+------------------+
|arrest_date|        arrest_time|sex|              race|searched|      reason_stopped|     search_based_on|search_found|          race_known|               lng|               lat|
+-----------+-------------------+---+------------------+--------+--------------------+--------------------+------------+--------------------+------------------+------------------+
| 2015-01-01|2015-01-01 00:02:00|  M|HISPANIC OR LATINO| YES = 1|    CALL FOR SERVICE|INCIDENTAL TO ARREST|     NOTHING|NO - RACE OR ETHN...|-97.73419151256908| 30.26646917024065|
| 2015-01-01|2015-01-01 03:17:00|  M|             WHITE| YES = 1|VIOLATION OF TRAN...|INCIDENTAL TO ARREST|       DRUGS|NO - RACE OR ETHN...| -97.7776889200335|30.227662552230814|
| 2015-01-01|2015-01-01 03:17:00|  F|             WHITE| YES = 1|VIOLATION OF TRAN...|INCIDENTAL TO 

In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when

# Obtain the Spark session (an already-running session is reused).
spark = SparkSession.builder.appName("GroupSearch").getOrCreate()

# Collapse the raw 'searched' codes into a two-valued 'search_grouped' column:
# "YES" and "YES = 1" both become "Yes"; every other value becomes "No".
# Note that `df` is rebound here to a Spark DataFrame derived from `spark_df`.
df = spark_df.withColumn(
    "search_grouped",
    when(spark_df["searched"].isin("YES", "YES = 1"), "Yes").otherwise("No"),
)

# Preview the DataFrame with the new column.
df.show()


+-----------+-------------------+---+------------------+--------+--------------------+--------------------+------------+--------------------+------------------+------------------+--------------+
|arrest_date|        arrest_time|sex|              race|searched|      reason_stopped|     search_based_on|search_found|          race_known|               lng|               lat|search_grouped|
+-----------+-------------------+---+------------------+--------+--------------------+--------------------+------------+--------------------+------------------+------------------+--------------+
| 2015-01-01|2015-01-01 00:02:00|  M|HISPANIC OR LATINO| YES = 1|    CALL FOR SERVICE|INCIDENTAL TO ARREST|     NOTHING|NO - RACE OR ETHN...|-97.73419151256908| 30.26646917024065|           Yes|
| 2015-01-01|2015-01-01 03:17:00|  M|             WHITE| YES = 1|VIOLATION OF TRAN...|INCIDENTAL TO ARREST|       DRUGS|NO - RACE OR ETHN...| -97.7776889200335|30.227662552230814|           Yes|
| 2015-01-01|2015-01-01 0

In [19]:
# Register the Spark DataFrame as a temporary view named "arrest_data" so the
# spark.sql(...) queries in other cells can refer to it by that name.
spark_df.createOrReplaceTempView("arrest_data")

In [20]:
# `count` is imported in this cell so it runs on a fresh kernel; elsewhere in
# the notebook it is only imported by cells that appear further down.
from pyspark.sql.functions import count

# Group by race and count the arrests recorded for each race.
arrests_by_race = df.groupBy("race").agg(count("race").alias("arrest_count"))

# Total number of rows (arrests) in the DataFrame.
total_arrests = df.count()

# Each race's share of all arrests: arrest_count / total_arrests.
# This is a share of the dataset, not a per-capita rate.
arrests_by_race = arrests_by_race.withColumn("arrest_rate", arrests_by_race["arrest_count"] / total_arrests)

# Show the result
arrests_by_race.show()

+--------------------+------------+--------------------+
|                race|arrest_count|         arrest_rate|
+--------------------+------------+--------------------+
|               WHITE|       13584|  0.3279337566086474|
|               BLACK|        9905| 0.23911836419380536|
|AMERICAN INDIAN/A...|          20|4.828235521328730...|
|  HISPANIC OR LATINO|       17212| 0.41551794896555055|
|             UNKNOWN|         112|0.002703811891944089|
|      MIDDLE EASTERN|         138|0.003331482509716824|
|HAWAIIAN/PACIFIC ...|          22|5.311059073461604E-4|
|               ASIAN|         427| 0.01030828283803684|
|          not_listed|           3|7.242353281993095E-5|
+--------------------+------------+--------------------+



In [31]:
from pyspark.sql.functions import year, count

# Add a 'year' column extracted from 'arrest_date'.
df = df.withColumn("year", year("arrest_date"))

# Yearly totals: number of arrests recorded in each year.
total_arrests_year = df.groupBy("year").agg(count("race").alias("total_arrests_year"))

# Arrest counts per (race, year), joined to the yearly totals on 'year'.
arrests_by_race_year = (
    df.groupBy("race", "year")
    .agg(count("race").alias("arrest_count"))
    .join(total_arrests_year, "year")
)

# Each race's share of that year's arrests.
arrests_by_race_year = arrests_by_race_year.withColumn(
    "arrest_rate",
    arrests_by_race_year["arrest_count"] / arrests_by_race_year["total_arrests_year"],
)

# Display the combined table.
arrests_by_race_year.show()


+----+--------------------+------------+------------------+--------------------+
|year|                race|arrest_count|total_arrests_year|         arrest_rate|
+----+--------------------+------------+------------------+--------------------+
|2015|  HISPANIC OR LATINO|        3735|              9209|  0.4055814963622543|
|2015|               ASIAN|          99|              9209|0.010750352915626017|
|2015|AMERICAN INDIAN/A...|           5|              9209| 5.42947116950809E-4|
|2015|HAWAIIAN/PACIFIC ...|           2|              9209|2.171788467803236E-4|
|2015|      MIDDLE EASTERN|          28|              9209|0.003040503854924...|
|2015|               WHITE|        3147|              9209| 0.34173091540883915|
|2015|               BLACK|        2164|              9209| 0.23498751221631012|
|2015|             UNKNOWN|          29|              9209|0.003149093278314692|
|2016|AMERICAN INDIAN/A...|           5|              9184|5.444250871080139E-4|
|2016|               BLACK| 

In [35]:
# Collect the per-race, per-year arrest table to the driver as a pandas frame.
pandas_df = arrests_by_race_year.toPandas()

# Print the collected frame as plain text.
print(pandas_df)

# Write the frame to a CSV file in the working directory, without the index.
output_path_pandas = "arrests_by_race_year_pandas.csv"
pandas_df.to_csv(path_or_buf=output_path_pandas, index=False)

    year                            race  arrest_count  total_arrests_year  \
0   2015              HISPANIC OR LATINO          3735                9209   
1   2015                           ASIAN            99                9209   
2   2015  AMERICAN INDIAN/ALASKAN NATIVE             5                9209   
3   2015       HAWAIIAN/PACIFIC ISLANDER             2                9209   
4   2015                  MIDDLE EASTERN            28                9209   
5   2015                           WHITE          3147                9209   
6   2015                           BLACK          2164                9209   
7   2015                         UNKNOWN            29                9209   
8   2016  AMERICAN INDIAN/ALASKAN NATIVE             5                9184   
9   2016                           BLACK          2152                9184   
10  2016                           ASIAN            93                9184   
11  2016                  MIDDLE EASTERN            35          

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count

# Number of rows in each 'search_grouped' category.
value_counts = (
    df.groupBy("search_grouped")
    .agg(count("*").alias("count"))
)

# Display the per-category counts.
value_counts.show()

+--------------+-----+
|search_grouped|count|
+--------------+-----+
|            No| 7550|
|           Yes|33873|
+--------------+-----+



In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Count the rows whose 'searched' value is the literal string 'not_listed'.
not_listed_count = df.where(col("searched") == "not_listed").count()

# Report the count.
print("Count of 'not_listed' in the 'searched' column:", not_listed_count)


Count of 'not_listed' in the 'searched' column: 516


In [12]:
# Aggregate by race and count the number of arrests.
# DataFrame.show() returns None, so the query result is stored first and shown
# afterwards; `race_aggregation` then holds the DataFrame rather than None.
race_aggregation = spark.sql("SELECT race,COUNT(*) as arrest_count FROM arrest_data GROUP BY race")
race_aggregation.show()


+--------------------+------------+
|                race|arrest_count|
+--------------------+------------+
|               WHITE|       13584|
|               BLACK|        9905|
|AMERICAN INDIAN/A...|          20|
|  HISPANIC OR LATINO|       17212|
|             UNKNOWN|         112|
|      MIDDLE EASTERN|         138|
|HAWAIIAN/PACIFIC ...|          22|
|               ASIAN|         427|
|          not_listed|           3|
+--------------------+------------+



In [35]:
#AGGS for TABLEAU
#Arrest Counts by Race:
# DataFrame.show() returns None, so the query result is stored first and shown
# afterwards; `race_aggregation` then holds the DataFrame rather than None.
race_aggregation = spark.sql("SELECT race, COUNT(*) as arrest_count FROM arrest_data GROUP BY race")
race_aggregation.show()


+--------------------+------------+
|                race|arrest_count|
+--------------------+------------+
|               WHITE|       13584|
|               BLACK|        9905|
|AMERICAN INDIAN/A...|          20|
|  HISPANIC OR LATINO|       17212|
|             UNKNOWN|         112|
|      MIDDLE EASTERN|         138|
|HAWAIIAN/PACIFIC ...|          22|
|               ASIAN|         427|
|          not_listed|           3|
+--------------------+------------+



In [36]:
#Reasons for Stops:
# Number of rows per 'reason_stopped' value. The DataFrame is stored before
# .show() is called, because .show() itself returns None.
reason_aggregation = spark.sql("SELECT reason_stopped, COUNT(*) as reason_count FROM arrest_data GROUP BY reason_stopped")
reason_aggregation.show()


+--------------------+------------+
|      reason_stopped|reason_count|
+--------------------+------------+
|    CALL FOR SERVICE|         841|
|PRE-EXISTING KNOW...|         348|
|VIOLATION OF PENA...|         126|
|    WATER SAFETY ACT|         201|
|VIOLATION OF TRAN...|       13632|
|SUSPICIOUS PERSON...|         297|
|  CONSENSUAL CONTACT|          36|
|          not_listed|        2387|
|               OTHER|         253|
|VIOLATION OF CITY...|         229|
|MOTOR VEHICLE DRIVER|          43|
|Pre-existing know...|         743|
|Violation of law ...|        6859|
|Moving Traffic Vi...|       15428|
+--------------------+------------+



In [37]:
#Overall Stop/Arrest Rate:
# Despite the name, this holds the total row count of arrest_data (COUNT(*)).
overall_rate = spark.sql("SELECT COUNT(*) as total_stops FROM arrest_data").first()["total_stops"]


In [38]:
#Group-Specific Stop/Arrest Rates:
# Number of rows per race. The DataFrame is stored before .show() is called,
# because .show() itself returns None.
group_rates = spark.sql("SELECT race, COUNT(*) as group_stops FROM arrest_data GROUP BY race")
group_rates.show()


+--------------------+-----------+
|                race|group_stops|
+--------------------+-----------+
|               WHITE|      13584|
|               BLACK|       9905|
|AMERICAN INDIAN/A...|         20|
|  HISPANIC OR LATINO|      17212|
|             UNKNOWN|        112|
|      MIDDLE EASTERN|        138|
|HAWAIIAN/PACIFIC ...|         22|
|               ASIAN|        427|
|          not_listed|          3|
+--------------------+-----------+



In [52]:
#Search and Found Percentages:
from pyspark.sql.functions import col

# Total number of rows (arrests).
total_arrests = spark_df.count()

# Rows where a search was conducted. Both "YES" and "YES = 1" count as a
# search, matching how the 'search_grouped' column is derived in this notebook.
searches_conducted = spark_df.filter(col("searched").isin("YES", "YES = 1")).count()

# Searched rows whose 'search_found' value is one of the listed item types.
searches_found = spark_df.filter(
    col("searched").isin("YES", "YES = 1")
    & col("search_found").isin("WEAPONS", "CASH", "ALCOHOL", "DRUGS")
).count()


In [48]:
#Arrest Counts by Year:
from pyspark.sql.functions import year

# 'arrest_year' is derived here from 'arrest_date' so the cell does not depend
# on another cell having already added that column to spark_df. The DataFrame
# is stored before .show() is called, because .show() itself returns None.
arrests_by_year = (
    spark_df.withColumn("arrest_year", year("arrest_date"))
    .groupBy("arrest_year")
    .count()
    .orderBy("arrest_year")
)
arrests_by_year.show()


+-----------+-----+
|arrest_year|count|
+-----------+-----+
|       2015| 9209|
|       2016| 9184|
|       2018|11214|
|       2019| 7747|
|       2020| 4069|
+-----------+-----+



In [56]:
#Arrest Counts by Race and Gender:
# Row counts per (race, sex), ordered by race then sex. The original ended in
# `.show` without parentheses, which displayed nothing and stored a bound
# method; the DataFrame is now stored and then actually shown.
arrests_by_race_gender = spark_df.groupBy("race", "sex").count().orderBy("race", "sex")
arrests_by_race_gender.show()


In [50]:
#Reasons for Stops and Search Outcomes:
# Row counts per (reason_stopped, search_found) pair, largest count first.
# The result is only stored here; nothing is displayed by this cell.
reasons_for_stops = (
    spark_df
    .groupBy("reason_stopped", "search_found")
    .count()
    .sort("count", ascending=False)
)


In [43]:
#Geospatial Analysis:
from pyspark.sql.functions import col, concat, lit

# 'location' ("lng,lat" as a string) is built here so the cell does not depend
# on another cell having already added that column to spark_df.
arrests_by_location = (
    spark_df.withColumn(
        "location",
        concat(col("lng"), lit(","), col("lat")).cast("string"),
    )
    .groupBy("location")
    .count()
)


In [44]:
#Temporal Trends:
# Arrest counts per (sex, race, calendar month of arrest_date), read from the
# "arrest_data" temporary view. 'arrest_month' is the alias defined in the
# SELECT list and reused in GROUP BY / ORDER BY. Rows are ordered by month and
# then by descending count. The result is stored only; this cell shows nothing.
result_by_month = spark.sql("""
    SELECT
        sex,
        race,
        month(arrest_date) AS arrest_month,
        COUNT(*) AS arrest_count
    FROM
        arrest_data
    GROUP BY
        sex, race, arrest_month
    ORDER BY
        arrest_month, arrest_count DESC
""")

In [45]:
#Demographic Analysis:
# Arrest counts per (sex, race) from the "arrest_data" temporary view, ordered
# by descending count. The result is stored only; this cell shows nothing.
result_demographic_analysis = spark.sql("""
    SELECT
        sex,
        race,
        COUNT(*) AS arrest_count
    FROM
        arrest_data
    GROUP BY
        sex, race
    ORDER BY
        arrest_count DESC
""")

In [46]:
#Aggregate Location Data:
# Average lng / lat and arrest count per (sex, race) from the "arrest_data"
# temporary view, ordered by descending count. The result is stored only; this
# cell shows nothing.
result_location = spark.sql("""
    SELECT
        sex,
        race,
        AVG(lng) AS avg_lng,
        AVG(lat) AS avg_lat,
        COUNT(*) AS arrest_count
    FROM
        arrest_data
    GROUP BY
        sex, race
    ORDER BY
        arrest_count DESC
""")


In [21]:
# Aggregate by reason_stopped and find the count of each reason.
# The DataFrame is stored before .show() is called, because .show() itself
# returns None.
reason_aggregation = spark.sql("SELECT reason_stopped, COUNT(*) as reason_count FROM arrest_data GROUP BY reason_stopped")
reason_aggregation.show()


+--------------------+------------+
|      reason_stopped|reason_count|
+--------------------+------------+
|    CALL FOR SERVICE|         841|
|PRE-EXISTING KNOW...|         348|
|VIOLATION OF PENA...|         126|
|    WATER SAFETY ACT|         201|
|VIOLATION OF TRAN...|       13632|
|SUSPICIOUS PERSON...|         297|
|  CONSENSUAL CONTACT|          36|
|          not_listed|        2387|
|               OTHER|         253|
|VIOLATION OF CITY...|         229|
|MOTOR VEHICLE DRIVER|          43|
|Pre-existing know...|         743|
|Violation of law ...|        6859|
|Moving Traffic Vi...|       15428|
+--------------------+------------+



In [22]:
# Total number of rows in arrest_data. Despite the "rate" naming, the value
# is the COUNT(*) result, i.e. a count.
overall_rate = spark.sql("SELECT COUNT(*) as total_stops FROM arrest_data").first()["total_stops"]

# Print the value.
print("Overall Stop/Arrest Rate:", overall_rate)


Overall Stop/Arrest Rate: 41423


In [23]:
#Calculate Group-Specific Stop/Arrest Rates:
#Count the stops/arrests recorded for each racial or ethnic group.

# The DataFrame is stored before .show() is called, because .show() itself
# returns None.
group_rates = spark.sql("SELECT race, COUNT(*) as group_stops FROM arrest_data GROUP BY race")
group_rates.show()


+--------------------+-----------+
|                race|group_stops|
+--------------------+-----------+
|               WHITE|      13584|
|               BLACK|       9905|
|AMERICAN INDIAN/A...|         20|
|  HISPANIC OR LATINO|      17212|
|             UNKNOWN|        112|
|      MIDDLE EASTERN|        138|
|HAWAIIAN/PACIFIC ...|         22|
|               ASIAN|        427|
|          not_listed|          3|
+--------------------+-----------+



In [24]:
#Calculate Disparity Index:
# As computed here, 'disparity_index' is each group's share of all stops:
# group_stops divided by the sum of group_stops over every race.

# Per-race stop counts from the "arrest_data" view.
group_rates = spark.sql("SELECT race, COUNT(*) as group_stops FROM arrest_data GROUP BY race")

# Bring the small aggregated table to the driver as a pandas DataFrame.
group_rates_pd = group_rates.toPandas()

# Sum of the per-race counts (the name says "rate", the value is a total).
overall_rate = group_rates_pd["group_stops"].sum()

# Copy the table and add the share column alongside the counts.
disparity_index = group_rates_pd.copy()
disparity_index["disparity_index"] = disparity_index["group_stops"] / overall_rate

# Print the resulting table.
print(disparity_index)


                             race  group_stops  disparity_index
0                           WHITE        13584         0.327934
1                           BLACK         9905         0.239118
2  AMERICAN INDIAN/ALASKAN NATIVE           20         0.000483
3              HISPANIC OR LATINO        17212         0.415518
4                         UNKNOWN          112         0.002704
5                  MIDDLE EASTERN          138         0.003331
6       HAWAIIAN/PACIFIC ISLANDER           22         0.000531
7                           ASIAN          427         0.010308
8                      not_listed            3         0.000072


In [25]:
# Total number of rows in arrest_data. Despite the "rate" naming, the value
# is the COUNT(*) result, i.e. a count.
overall_rate = spark.sql("SELECT COUNT(*) as total_stops FROM arrest_data").first()["total_stops"]

# Print the value.
print("Overall Stop/Arrest Rate:", overall_rate)


Overall Stop/Arrest Rate: 41423


In [None]:
from pyspark.sql.functions import count, col, when

# Count the null entries in every column: `when` yields a non-null value only
# for rows where the column is null, and `count` counts the non-null values.
spark_df.select(
    [
        count(when(col(column_name).isNull(), column_name)).alias(column_name)
        for column_name in spark_df.columns
    ]
).show()


In [None]:
# Count every row in the Spark DataFrame and print the total.
record_count = spark_df.count()
print("Total number of records:", record_count)

In [None]:
# Build the describe() summary of spark_df and display it.
summary_stats = spark_df.describe()
summary_stats.show()

In [None]:
# Number of rows for each distinct 'race' value.
race_counts = spark_df.groupBy("race").count()
race_counts.show()

In [None]:
# Count the rows where a search was conducted. Both "YES" and "YES = 1" count
# as a search, matching how the 'search_grouped' column is derived in this
# notebook; filtering on "YES = 1" alone would miss the rows coded "YES".
total_searches = spark_df.filter(spark_df["searched"].isin("YES", "YES = 1")).count()
print("Total number of searches:", total_searches)

In [None]:
from pyspark.sql.functions import year

# Add (or overwrite) an 'arrest_year' column taken from 'arrest_date'.
# spark_df is rebound, so cells that run afterwards see the extra column.
spark_df = spark_df.withColumn("arrest_year", year(spark_df["arrest_date"]))

# Row counts per year, in ascending year order.
arrests_by_year = (
    spark_df
    .groupBy("arrest_year")
    .count()
    .sort("arrest_year")
)

# Display the yearly counts.
arrests_by_year.show()


In [None]:
# Row counts per (race, sex) pair, sorted by race and then by sex.
arrests_by_race_gender = (
    spark_df
    .groupBy("race", "sex")
    .count()
    .sort("race", "sex")
)

# Display the counts.
arrests_by_race_gender.show()


In [None]:
# Row counts per (reason_stopped, search_found) pair, largest count first.
reasons_for_stops = (
    spark_df
    .groupBy("reason_stopped", "search_found")
    .count()
    .sort("count", ascending=False)
)

# Display the counts.
reasons_for_stops.show()


In [None]:
from pyspark.sql.functions import col, concat, lit

# Total number of rows (arrests).
total_arrests = spark_df.count()

# Rows where a search was conducted. Both "YES" and "YES = 1" count as a
# search, matching how the 'search_grouped' column is derived in this notebook.
searches_conducted = spark_df.filter(col("searched").isin("YES", "YES = 1")).count()

# Searched rows whose 'search_found' value is one of the listed item types.
searches_found = spark_df.filter(
    col("searched").isin("YES", "YES = 1")
    & col("search_found").isin("WEAPONS", "CASH", "ALCOHOL", "DRUGS")
).count()

# Percentages, guarded so an empty table or zero searches gives 0.0 instead of
# raising ZeroDivisionError.
search_percentage = (searches_conducted / total_arrests) * 100 if total_arrests else 0.0
found_percentage = (searches_found / searches_conducted) * 100 if searches_conducted else 0.0

# Show the results
print("Percentage of arrests where searches were conducted:", search_percentage)
print("Percentage of searches where something was found:", found_percentage)


In [None]:
from pyspark.sql.functions import col

# Total number of arrests for each race.
arrests_by_race = spark_df.groupBy("race").count()

# Number of searches conducted for each race. Both "YES" and "YES = 1" count
# as a search, matching how the 'search_grouped' column is derived in this
# notebook.
searches_by_race = spark_df.filter(col("searched").isin("YES", "YES = 1")).groupBy("race").count()

# Number of searches per race whose 'search_found' value is one of the listed
# item types.
found_by_race = spark_df.filter(
    col("searched").isin("YES", "YES = 1")
    & col("search_found").isin("WEAPONS", "CASH", "ALCOHOL", "DRUGS")
).groupBy("race").count()

# Join on 'race' to compute percentages. These are inner joins, so a race with
# no searches (or no finds) does not appear in the corresponding table.
search_percentage_by_race = searches_by_race.join(
    arrests_by_race.withColumnRenamed("count", "total_arrests"), "race"
).withColumn("search_percentage", (col("count") / col("total_arrests")) * 100)

found_percentage_by_race = found_by_race.join(
    searches_by_race.withColumnRenamed("count", "total_searches"), "race"
).withColumn("found_percentage", (col("count") / col("total_searches")) * 100)

# Show the results
print("Search Percentage by Race:")
search_percentage_by_race.show()

print("Found Percentage by Race:")
found_percentage_by_race.show()


In [None]:
# Import the functions this cell uses. `concat` and `lit` are imported here so
# the cell does not depend on another cell having imported them first.
from pyspark.sql.functions import col, concat, lit
from pyspark.sql.types import StringType

# Create a 'location' column holding "lng,lat" as a string.
# spark_df is rebound, so cells that run afterwards see the extra column.
spark_df = spark_df.withColumn("location", concat(col("lng"), lit(","), col("lat")).cast(StringType()))

# Aggregate the count of arrests by location
arrests_by_location = spark_df.groupBy("location").count()

# Show the results
arrests_by_location.show()


In [None]:
import matplotlib.pyplot as plt

# Collect the full Spark DataFrame to the driver as a pandas DataFrame.
# Note that this rebinds the name `pandas_df`.
pandas_df = spark_df.toPandas()

# Bar chart of the number of rows per race.
race_distribution = pandas_df["race"].value_counts()
race_distribution.plot.bar()
plt.gca().set(title="Race Distribution", xlabel="Race", ylabel="Count")
plt.show()


### Analysis:
#### 1. Aggregations by Month or Day:

The query on arrests by month provides insights into the distribution of arrests across different demographic groups over each month. Here are some findings based on the provided result:

Monthly Distribution:

The data shows the distribution of arrests across different months.
Each row represents a combination of sex, race, and the respective month.
Highest Arrest Months:

January (month 1) appears to have higher arrest counts across various demographic groups.
For example, Hispanic or Latino Males (sex=M, race=HISPANIC OR LATINO), White Males (sex=M, race=WHITE), and Black Males (sex=M, race=BLACK) have relatively high arrest counts in January.
Variation Across Demographic Groups:

Arrest counts vary across sex and race categories for each month.
Different demographic groups may have distinct patterns in terms of arrests, potentially influenced by various factors.
Low Arrest Counts:

Some demographic groups, especially in minority categories, may have lower arrest counts in certain months.
Data Exploration:

Further exploration could involve visualizing the monthly trends using line charts or other time series visualizations.
Analyzing whether certain events or seasons contribute to the observed patterns.
Consideration for Seasonality:

Patterns in arrests could be influenced by seasonal factors, holidays, or specific events that occur at different times of the year.
It's important to note that while these findings provide insights into the monthly distribution of arrests, a more detailed analysis, possibly with visualizations, could reveal additional patterns and context. Domain knowledge and understanding the context of the dataset would contribute to a more comprehensive interpretation of the findings.

#### Notes for findings and Visualizations:
- This analysis aims to understand the patterns of arrests by examining the demographic attributes of gender (sex), racial or ethnic background (race), and the temporal aspect represented by monthly trends.
- 

In [None]:
# Register the DataFrame as a temporary SQL view named "arrest_data" (creating
# it, or replacing an existing view of that name) so the spark.sql(...) queries
# below can refer to it.
spark_df.createOrReplaceTempView("arrest_data")

In [None]:
# NOTE(review): `month` and `dayofmonth` are imported but not referenced in
# this cell; the month(...) call below is inside the SQL text.
from pyspark.sql.functions import month, dayofmonth

# Arrest counts per (sex, race, calendar month of arrest_date), read from the
# "arrest_data" view. 'arrest_month' is the alias defined in the SELECT list
# and reused in GROUP BY / ORDER BY. Rows are ordered by month and then by
# descending count.
result_by_month = spark.sql("""
    SELECT
        sex,
        race,
        month(arrest_date) AS arrest_month,
        COUNT(*) AS arrest_count
    FROM
        arrest_data
    GROUP BY
        sex, race, arrest_month
    ORDER BY
        arrest_month, arrest_count DESC
""")
# Print a description of the query, then show the rows without truncating
# long cell values.
print('This analysis aims to understand the patterns of arrests by examining the demographic attributes of gender (sex), racial or ethnic background (race), and the temporal aspect represented by monthly trends.')
result_by_month.show(truncate=False)

#### 2. Temporal Trends:

#### Notes for findings and Visualizations:

The query on temporal trends analyzes the distribution of arrests over time across different demographic groups (combinations of sex and race). Here are some findings based on the provided result:

Temporal Distribution:

The data shows the temporal trends in arrests from January 1, 2015, onward.
Arrests on Specific Dates:

The table reveals the number of arrests on specific dates for different demographic groups.
For example, on January 1, 2015, there were arrests for Hispanic or Latino Males (sex=M, race=HISPANIC OR LATINO), Black Males (sex=M, race=BLACK), White Males (sex=M, race=WHITE), and White Females (sex=F, race=WHITE).
Variation Across Dates:

The count of arrests varies across different dates and demographic groups.
Temporal Patterns:

Patterns in arrests may emerge over time, reflecting factors such as day of the week, holidays, or specific events.
Data Exploration:

Further exploration of the data could involve visualizing temporal trends using line charts or other time series visualizations.
Identifying spikes or patterns on specific dates may lead to insights or hypotheses about the reasons behind those trends.
Consideration for Time of Day:

It might be beneficial to explore temporal patterns not only by date but also by the time of day to uncover patterns related to specific hours.
It's important to note that a more detailed analysis, possibly with visualizations, could provide a clearer understanding of the temporal trends in arrests for different demographic groups. Additionally, domain knowledge and context about the dataset could contribute to a more comprehensive interpretation of the findings.







In [None]:
# Arrest counts per (sex, race, arrest_date) from the "arrest_data" view,
# ordered by date and then by descending count.
result_temporal_trends = spark.sql("""
    SELECT
        sex,
        race,
        arrest_date,
        COUNT(*) AS arrest_count
    FROM
        arrest_data
    GROUP BY
        sex, race, arrest_date
    ORDER BY
        arrest_date, arrest_count DESC
""")
# Print a description of the query, then show the rows without truncating
# long cell values.
print('This query analyzes temporal trends by providing the count for each combination of sex, race, and arrest date.')
result_temporal_trends.show(truncate=False)

#### 3. Reasons for Arrest:

#### Notes for findings and Visualizations:


The query on reasons for arrest provides insights into the distribution of different reasons for arrests across various demographic groups (combinations of sex and race). Here are some findings based on the provided result:

Top Reasons for Arrest:

For Hispanic or Latino Males (sex=M, race=HISPANIC OR LATINO), the most common reasons for arrest are "Moving Traffic Violation" (5,231 arrests) and "VIOLATION OF TRANSPORTATION CODE/VEHICLE LAWS" (4,367 arrests).
For Black Males and White Males, "Moving Traffic Violation" and "VIOLATION OF TRANSPORTATION CODE/VEHICLE LAWS" are also among the top reasons.
Traffic Violations:

"Moving Traffic Violation" and "VIOLATION OF TRANSPORTATION CODE/VEHICLE LAWS" appear to be prominent reasons for arrest across different demographic groups.
Violation of Law Other Than Traffic:

Another category is "Violation of law other than traffic," which contributes to a significant number of arrests for Hispanic or Latino Males and White Males.
Not Listed Reasons:

Some records have reasons for arrest marked as "not_listed," indicating that the specific reason is not provided.
Gender and Race Dynamics:

The reasons for arrest may vary based on both gender and race, and the analysis can help identify patterns and potential areas for further investigation.
Law Enforcement Practices:

The findings may reflect law enforcement practices, and further analysis could involve exploring the context behind the arrests in each category.
As with any analysis, the interpretation may benefit from additional context, domain knowledge, and visualizations. Consider visualizing the data to better understand the patterns and relationships between different demographic groups and the reasons for their arrests.








In [None]:
# Arrest counts per (sex, race, reason_stopped) from the "arrest_data" view,
# ordered by descending count.
result_reasons_for_arrest = spark.sql("""
    SELECT
        sex,
        race,
        reason_stopped,
        COUNT(*) AS arrest_count
    FROM
        arrest_data
    GROUP BY
        sex, race, reason_stopped
    ORDER BY
        arrest_count DESC
""")
# Print a description of the query, then show the rows without truncating
# long cell values.
print('This query explores the reasons for arrest, providing the count for each combination of sex, race, and reason_stopped.')
result_reasons_for_arrest.show(truncate=False)


#### 4. Search Outcomes:

#### Notes for findings and Visualizations:

The query on search outcomes provides insights into the distribution of different search outcomes for various demographic groups (combinations of sex and race). Here are some findings based on the provided result:

Common Search Outcomes:

The column search_found represents the outcomes of searches, such as "NOTHING," "not_listed," "DRUGS," "OTHER," "CASH," and so on.
The majority of searches across all demographic groups resulted in "NOTHING."
Demographic Patterns:

Hispanic or Latino Males (sex=M, race=HISPANIC OR LATINO) have the highest count of searches resulting in "NOTHING" (7,561), followed by White Males and Black Males.
Different demographic groups may exhibit varying patterns in search outcomes.
Variation in Outcomes:

There is a variety of outcomes, including "not_listed," "DRUGS," "OTHER," and "CASH." These outcomes may provide insights into the reasons for searches and the subsequent results.
Potential Areas of Interest:

Analyzing the outcomes of searches can be crucial for understanding law enforcement practices and identifying any disparities in treatment based on demographic factors.
Further Analysis:

It would be valuable to explore the reasons behind different search outcomes, especially for cases where searches did not result in findings ("NOTHING") or when outcomes are not explicitly listed ("not_listed").
As with any analysis, the interpretation may be enhanced with additional context, domain knowledge, and visualizations. Consider exploring visual representations of these search outcomes to better understand the patterns and identify any areas that require closer examination.

In [None]:
# Arrest counts per (sex, race, search_found) from the "arrest_data" view,
# ordered by descending count.
result_search_outcomes = spark.sql("""
    SELECT
        sex,
        race,
        search_found,
        COUNT(*) AS arrest_count
    FROM
        arrest_data
    GROUP BY
        sex, race, search_found
    ORDER BY
        arrest_count DESC
""")
# Print a description of the query, then show the rows without truncating
# long cell values.
print('This query investigates outcomes of searches by providing the count for each combination of sex, race, and search_found.')
result_search_outcomes.show(truncate=False)

#### 5. Geospatial Analysis:

#### Notes for findings and Visualizations:

The geospatial analysis query provides insights into the average geographical locations (longitude and latitude) and arrest counts for different demographic groups (combinations of sex and race). Here are the findings based on the provided result:

Average Geographical Locations:

The columns avg_lng and avg_lat represent the average longitude and latitude, respectively, for each combination of sex and race.
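For example, Hispanic or Latino Males (sex=M, race=HISPANIC OR LATINO) have an average longitude of approximately 7.40 and an average latitude of approximately 85.78. Note that these averages are inconsistent with the individual coordinates shown in the data preview (longitude ≈ -97.7, latitude ≈ 30.3), which suggests that some rows contain invalid or differently encoded coordinates; the lng/lat values should be validated before these averages are interpreted.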
Arrest Counts:

The column arrest_count represents the total number of arrests for each combination of sex and race.
For example, Hispanic or Latino Males have the highest arrest count with 13,613 arrests, followed by White Males with 9,408 arrests and Black Males with 7,713 arrests.
Variation in Geographical Locations:

There is variation in the average geographical locations across different demographic groups. This could be indicative of different patterns of arrests in different areas.
Demographic Distribution:

The table provides a breakdown of arrests, considering both demographic factors (sex and race) and geographical factors (average longitude and latitude).
Potential Insights:

Further analysis could involve visualizing these geospatial patterns on a map to identify clusters or trends in arrests for specific demographic groups.
Keep in mind that these findings are based on the provided data, and the interpretation may vary based on the context and domain knowledge.

In [None]:
# Average lng / lat and arrest count per (sex, race) from the "arrest_data"
# view, ordered by descending count.
result_geospatial_analysis = spark.sql("""
    SELECT
        sex,
        race,
        AVG(lng) AS avg_lng,
        AVG(lat) AS avg_lat,
        COUNT(*) AS arrest_count
    FROM
        arrest_data
    GROUP BY
        sex, race
    ORDER BY
        arrest_count DESC
""")
# Print a description of the query, then show the rows without truncating
# long cell values.
print('This query performs geospatial analysis by providing the average longitude, latitude, and count for each combination of sex and race.')
result_geospatial_analysis.show(truncate=False)

#### 6. Searches Based On:

#### Notes for findings and Visualizations:

Result Explanation:

The result provides a tabular representation of the distribution of different reasons for conducting searches, broken down by gender, race, and search reason.
Each row represents a unique combination of gender, race, and search reason, and the corresponding count of arrests for that combination.

Example Insights (Sample Data):

The table might show, for instance, that there are a certain number of arrests for searches conducted based on "Probable Cause" for Hispanic or Latino Males (sex=M, race=HISPANIC OR LATINO).
Another row might indicate the count of arrests where searches were conducted based on "Consent" for White Females (sex=F, race=WHITE).
This analysis allows for an understanding of how searches are distributed across different demographic groups and the reasons for conducting those searches within the dataset.

In [None]:
# Search-reason breakdown: count rows for every (sex, race, search_based_on)
# combination and list the most frequent combinations first.
# NOTE(review): assumes `arrest_data` is a table/view registered with this
# Spark session in an earlier cell - confirm.
result_searches_based_on = spark.sql(
    """
    SELECT
        sex,
        race,
        search_based_on,
        COUNT(*) AS arrest_count
    FROM
        arrest_data
    GROUP BY
        sex, race, search_based_on
    ORDER BY
        arrest_count DESC
"""
)

print("This query analyzes the distribution of different reasons for conducting searches, providing the count for each combination of sex, race, and search_based_on.")

# truncate=False prints full cell values instead of clipping long strings.
result_searches_based_on.show(truncate=False)

#### 7. Demographic Analysis:

#### Notes for findings and Visualizations:

Observations:

The table provides a detailed breakdown of arrests, allowing for an analysis of law enforcement interactions with individuals based on their gender and race.
The counts represent the frequency of arrests for each demographic group.

Example Insights (Sample Data):

Hispanic or Latino Males (sex=M, race=HISPANIC OR LATINO):
Arrest Count: 13,613.
White Females (sex=F, race=WHITE):
Arrest Count: 4,176.
Black Males (sex=M, race=BLACK):
Arrest Count: 7,713.
Considerations:

Use this data for demographic profiling and understanding the distribution of arrests across different groups.
Identify any disparities in arrest counts based on gender and race.
This dataset can be a starting point for further analysis, such as examining arrest rates, trends over time, or geographic patterns.
Limitations:

The table provides a count of arrests but does not include additional contextual information.
Demographic analysis should be approached with caution to avoid perpetuating stereotypes or biases.

In [None]:
# Demographic breakdown: number of rows per (sex, race) pair, sorted from the
# most to the least frequent pair.
# NOTE(review): assumes `arrest_data` is a table/view registered with this
# Spark session in an earlier cell - confirm.
result_demographic_analysis = spark.sql(
    """
    SELECT
        sex,
        race,
        COUNT(*) AS arrest_count
    FROM
        arrest_data
    GROUP BY
        sex, race
    ORDER BY
        arrest_count DESC
"""
)

print("This query performs demographic analysis by providing the count for each combination of sex and race.")

# truncate=False prints full cell values instead of clipping long strings.
result_demographic_analysis.show(truncate=False)


#### 8. Average Arrest Location

#### Notes for findings and Visualizations:
Observations:
Geographical Insights:

The table allows you to compare the average arrest locations for different demographic groups.
For example, Hispanic or Latino males (sex=M, race=HISPANIC OR LATINO) have an average arrest location with a longitude of approximately 7.40 and a latitude of approximately 85.78.
Temporal Insights:

The maximum and minimum arrest dates provide insights into the temporal distribution of arrests within each demographic group.
For example, the data suggests that arrests for Hispanic or Latino males span from January 1, 2015, to December 31, 2020.
Arrest Volumes:

The total number of records indicates the arrest volume for each demographic group.
For example, Hispanic or Latino males have the highest total arrest count, with approximately 13,613 records.

Interpretation

Hispanic or Latino Males (sex=M, race=HISPANIC OR LATINO):

Average arrest location: Longitude 7.40, Latitude 85.78.
Arrest date range: January 1, 2015, to December 31, 2020.
Total arrest count: 13,613.
White Females (sex=F, race=WHITE):

Average arrest location: [Average values for longitude and latitude].
Arrest date range: [Date range].
Total arrest count: [Count].
Considerations:
Comparative Analysis:

Use the table to compare arrest patterns between different demographic groups.
Temporal Trends:

Explore how arrest patterns have evolved over time within each demographic category.
Geographical Disparities:

Investigate if there are geographical disparities in arrests across different gender and race groups.
Limitations:
Geographical Precision:

The average longitude and latitude represent central points and may not capture the full spatial variability within a demographic category.
Temporal Resolution:

The date range provides an overview but might not capture finer temporal details.
This table serves as a valuable tool for understanding the spatial and temporal dimensions of arrests within different demographic groups, enabling further analysis and insights into law enforcement activities.





613 records.

In [None]:

# Average arrest location, arrest-date range and record count per (sex, race).
#
# The grouping keys are included in the SELECT list so that every output row
# is labelled with the demographic group it describes; without them the
# aggregates cannot be attributed to any group. Rows are ordered by
# total_records (largest first) so the output is deterministic and matches
# the ordering used by the other per-group queries in this notebook.
# NOTE(review): assumes `arrest_data` is a table/view registered with this
# Spark session in an earlier cell - confirm.
result = spark.sql("""
    SELECT
        sex,
        race,
        AVG(lng) AS avg_lng,
        AVG(lat) AS avg_lat,
        MAX(arrest_date) AS max_arrest_date,
        MIN(arrest_date) AS min_arrest_date,
        COUNT(*) AS total_records
    FROM
        arrest_data
    GROUP BY
        sex, race
    ORDER BY
        total_records DESC
""")
print('The table allows us to compare and contrast different demographic groups based on their average geographical location, arrest date ranges, and arrest volumes.')
# Show the result
result.show()

### 9. Geographical Distribution 

#### Notes for findings and Visualizations:


Key Insights:
Location Information:

The table includes the average longitude and latitude values for each gender and racial or ethnic category, providing an indication of the central location associated with arrests.
Arrest Counts (arrest_count):

Indicates the total number of arrests for each combination of gender and race.
The table is ordered by the arrest count in descending order.
Observations:
Dominant Contributors to Arrests:

The table shows which gender and racial or ethnic categories contribute the most to the total arrest count.
In this example, Hispanic or Latino males have the highest arrest count, followed by White males and Black males.
Geographical Context:

The average longitude and latitude values provide an approximate central location associated with arrests for each demographic category.

Example Interpretation:
Hispanic or Latino Males (sex=M, race=HISPANIC OR LATINO):

Highest average arrest count.
Associated with a specific geographical location, characterized by an average longitude of approximately 7.40 and an average latitude of approximately 85.78.
White Males (sex=M, race=WHITE):

Second-highest average arrest count.
Associated with a different geographical location, characterized by an average longitude of approximately 7.41 and an average latitude of approximately 85.80.
Black Males (sex=M, race=BLACK):

Third-highest average arrest count.
Associated with another geographical location, characterized by an average longitude of approximately 7.77 and an average latitude of approximately 85.78.
Considerations:
Spatial Distribution:

The table allows for an exploration of the spatial distribution of arrests, highlighting areas where certain demographic groups are more frequently arrested.
Potential Hotspots:

Areas with higher average arrest counts may indicate potential hotspots for law enforcement activities related to specific demographic categories.
Limitations:
Geographical Precision:

The provided averages represent central points and might not accurately capture the spatial variability within a demographic category.
Context and External Factors:

The analysis is based solely on arrest counts and geographical averages, and external factors influencing arrest locations are not considered here.
This table serves as a tool for understanding the geographical distribution of arrests across different demographic categories, offering insights into potential patterns and disparities in law enforcement activities.

In [None]:
# Geographical distribution: mean longitude/latitude and row count for each
# (sex, race) pair, with the largest groups listed first.
# NOTE(review): assumes `arrest_data` is a table/view registered with this
# Spark session in an earlier cell - confirm.
result_location = spark.sql(
    """
    SELECT
        sex,
        race,
        AVG(lng) AS avg_lng,
        AVG(lat) AS avg_lat,
        COUNT(*) AS arrest_count
    FROM
        arrest_data
    GROUP BY
        sex, race
    ORDER BY
        arrest_count DESC
"""
)

# Default show(): prints the leading rows with long values clipped.
result_location.show()

### 10. Highlighting the Count and Percentage of Arrests Based on 'sex' and 'race'

#### Notes for findings and Visualizations:
Hispanic or Latino Males:

Males of Hispanic or Latino ethnicity contribute the highest percentage (32.86%) to the total arrest count, indicating a significant presence in the dataset.
White and Black Males:

White males (22.71%) and black males (18.62%) are the next two significant contributors to the total arrest count.
Gender Disparities:

The table provides insights into potential gender disparities in arrests, with a breakdown of contributions from both males and females.
Considerations:
Limitations:

The analysis is based on the available data and may not capture the entire context of arrests. Further exploration and context-specific knowledge may be needed.
Policy and Social Implications:

Discussions around the percentages can lead to considerations of policy implications and potential areas for further examination of law enforcement practices.
This table serves as a summary of the distribution of arrests across different demographic categories, offering insights into the relative contributions of each group to the overall arrest count.

In [None]:
# Share of arrests per (sex, race) pair.
# COUNT(*) is the per-group row count; SUM(COUNT(*)) OVER () is a window over
# all grouped rows, i.e. the grand total, so arrest_percentage is each group's
# share of every row in the table, expressed in percent.
# NOTE(review): assumes `arrest_data` is a table/view registered with this
# Spark session in an earlier cell - confirm.
result_with_percentages = spark.sql(
    """
    SELECT
        sex,
        race,
        COUNT(*) AS arrest_count,
        (COUNT(*) / SUM(COUNT(*)) OVER ()) * 100 AS arrest_percentage
    FROM
        arrest_data
    GROUP BY
        sex, race
    ORDER BY
        arrest_count DESC
"""
)

# Default show(): prints the leading rows with long values clipped.
result_with_percentages.show()

#### 11. Percentage by year

#### Notes for findings and Visualizations:

This table serves as a valuable tool for understanding how arrests are distributed across gender and racial or ethnic categories over multiple years, offering insights into potential trends or shifts in law enforcement practices.


Yearly Distribution:

The table is segmented by arrest year, providing insights into changes in arrest patterns over time.
Dominant Contributors Each Year:

For each year, the table shows which gender and racial or ethnic categories contribute the most to the total arrest count.

##### Considerations:
Yearly Trends:

The table facilitates the examination of trends in arrests over time, providing insights into whether certain demographic categories consistently contribute more to arrests or if there are variations.
Potential Changes in Enforcement:

Substantial changes in percentages from one year to the next might indicate shifts in law enforcement priorities, policies, or practices.
Intersectionality:

The intersection of gender and race allows for a nuanced understanding of arrest patterns, considering the unique experiences of different demographic groups.

In [None]:
# Yearly share of arrests per (sex, race) pair.
# Rows are grouped by sex, race and the year extracted from arrest_date (the
# GROUP BY refers to it through the arrest_year alias). The window sum is
# partitioned by that same year, so arrest_percentage is each group's share of
# the rows recorded in its own year, not of the whole table.
# Output is sorted by year ascending, then by arrest_count descending.
# NOTE(review): assumes `arrest_data` is a table/view registered with this
# Spark session in an earlier cell - confirm.
result_by_year = spark.sql(
    """
    SELECT
        sex,
        race,
        YEAR(arrest_date) AS arrest_year,
        COUNT(*) AS arrest_count,
        (COUNT(*) / SUM(COUNT(*)) OVER (PARTITION BY YEAR(arrest_date))) * 100 AS arrest_percentage
    FROM
        arrest_data
    GROUP BY
        sex, race, arrest_year
    ORDER BY
        arrest_year, arrest_count DESC
"""
)

# truncate=False prints full cell values instead of clipping long strings.
result_by_year.show(truncate=False)

In [28]:
# Export the arrest records to a JSON Lines file.
#
# The data-loading cell binds `df` to a pandas DataFrame (pd.read_csv), which
# has no toPandas() method; a later cell may rebind `df` to a Spark DataFrame.
# Convert only when the object actually offers toPandas(), so this cell works
# in either case instead of raising AttributeError on a pandas frame.
pandas_df = df.toPandas() if hasattr(df, "toPandas") else df

# Save Pandas DataFrame as JSON file
# orient='records' together with lines=True writes one JSON object per row,
# one row per line.
pandas_df.to_json('austinArrests.json', orient='records', lines=True)
