In [0]:
from pyspark.sql.functions import to_date, col, desc, count, lower, lit

# Load the 2017 crimes CSV (first row is the header, column types are inferred)
# and replace the textual 'Date' column with a proper date parsed from the
# 'MM/dd/yyyy hh:mm:ss a' timestamp format; the time-of-day part is dropped.
data = (
    spark.read
    .csv('dbfs:/FileStore/crimes_2017.csv', header=True, inferSchema=True)
    .withColumn('Date', to_date(col('Date'), 'MM/dd/yyyy hh:mm:ss a'))
)

# Preview the first five rows to sanity-check the load and the date parsing
data.show(5)


+--------+-----------+----------+--------------------+----+-------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+
|      ID|Case Number|      Date|               Block|IUCR|       Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|Latitude|Longitude|Location|
+--------+-----------+----------+--------------------+----+-------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+
|11227634|   JB147599|2017-08-26| 001XX W RANDOLPH ST|0281|CRIM SEXUAL ASSAULT|      NON-AGGRAVATED|         HOTEL/MOTEL| false|   false| 122|       1|  42|            32|      02|        null|        null|2017|02/11/

In [0]:
# Number of reported crimes per calendar day, busiest day first
top_dates = (
    data
    .groupBy('Date')
    .count()
    .orderBy('count', ascending=False)
)

# Show the five days with the highest crime counts
top_dates.show(5)


+----------+-----+
|      Date|count|
+----------+-----+
|2017-01-01| 1256|
|2017-08-01|  965|
|2017-07-01|  937|
|2017-08-05|  928|
|2017-08-04|  926|
+----------+-----+
only showing top 5 rows



In [0]:
# Pick the single busiest day. Rows whose 'Date' is null are skipped first:
# a null date can never satisfy the equality filter below (null == null is
# not true in Spark SQL), so selecting it would silently give an empty result.
busiest_day_row = top_dates.filter(col('Date').isNotNull()).first()

# first() yields None when there is no row at all; fail with a clear message
# instead of an opaque "'NoneType' object is not subscriptable".
if busiest_day_row is None:
    raise ValueError("No rows with a non-null 'Date' found; cannot determine the busiest day")

most_crimes_date = busiest_day_row['Date']

# Crime counts by 'Primary Type' for that day only, most frequent first
top_crimes = (
    data
    .filter(col('Date') == lit(most_crimes_date))
    .groupBy('Primary Type')
    .count()
    .orderBy(desc('count'))
)

# Display top 3 crimes on the most crime-heavy day
top_crimes.show(3)


+------------------+-----+
|      Primary Type|count|
+------------------+-----+
|DECEPTIVE PRACTICE|  208|
|           BATTERY|  207|
|             THEFT|  187|
+------------------+-----+
only showing top 3 rows



In [0]:
from pyspark.sql.functions import month

# Crimes per month: group directly on the month number derived from 'Date'
# (exposed as column 'Month') and sort so the busiest month comes first
crimes_by_month = (
    data
    .groupBy(month(col('Date')).alias('Month'))
    .count()
    .orderBy(desc('count'))
)

# Only the first row is shown, i.e. the month with the highest crime count
crimes_by_month.show(1)


+-----+-----+
|Month|count|
+-----+-----+
|    7|24889|
+-----+-----+
only showing top 1 row



In [0]:
from pyspark.sql.functions import lower

# Keep rows whose Description contains the substring 'gun', case-insensitively
# (the column is lower-cased before matching, so 'HANDGUN' and 'GUN OFFENDER'
# both qualify).
# NOTE(review): this is a plain substring match; descriptions worded differently
# (e.g. using 'FIREARM') would not be counted - confirm that is intended.
gun_crimes = data.filter(lower(col('Description')).contains('gun'))

# Row counts for the whole dataset and for the gun-related subset
total_crimes = data.count()
gun_crimes_count = gun_crimes.count()

# Share of gun-related crimes as a percentage. An empty dataset is reported as
# 0.0 instead of raising ZeroDivisionError.
gun_crime_percentage = (gun_crimes_count / total_crimes) * 100 if total_crimes else 0.0

# List the distinct Description values that matched, untruncated, so the
# substring match can be checked by eye
gun_crimes.select('Description').distinct().show(truncate=False)

# Output percentage of crimes involving a gun
print(f"Percentage of crimes involving a gun: {gun_crime_percentage:.2f}%")


+--------------------------------------------------+
|Description                                       |
+--------------------------------------------------+
|AGGRAVATED POLICE OFFICER - HANDGUN               |
|UNLAWFUL SALE HANDGUN                             |
|UNLAWFUL POSS OF HANDGUN                          |
|ATTEMPT ARMED - HANDGUN                           |
|ARMED - HANDGUN                                   |
|AGGRAVATED: HANDGUN                               |
|GUN OFFENDER: DUTY TO REGISTER                    |
|ARMED: HANDGUN                                    |
|AGGRAVATED - HANDGUN                              |
|UNLAWFUL USE HANDGUN                              |
|AGGRAVATED PO: HANDGUN                            |
|GUN OFFENDER NOTIFICATION-NO CONTACT              |
|ATTEMPT: ARMED-HANDGUN                            |
|GUN OFFENDER: ANNUAL REGISTRATION                 |
|AGGRAVATED DOMESTIC BATTERY: HANDGUN              |
|AGG PRO.EMP: HANDGUN                         