# Chicago Crimes

In [2]:
import pyspark.sql.functions as F

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [7]:
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

### Load data

In [10]:
data_path = '/home/lorenzo/Desktop/chicago_crimes_parquet'
df = spark.read.format('parquet').load(data_path)

In [11]:
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Case_Number: string (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- Block: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary_Type: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location_Description: string (nullable = true)
 |-- Arrest: boolean (nullable = true)
 |-- Domestic: boolean (nullable = true)
 |-- Beat: integer (nullable = true)
 |-- District: integer (nullable = true)
 |-- Ward: integer (nullable = true)
 |-- Community_Area: integer (nullable = true)
 |-- FBI_Code: string (nullable = true)
 |-- X_Coordinate: integer (nullable = true)
 |-- Y_Coordinate: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Updated_On: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Location: string (nullable = true)



In [12]:
df.show(5)

+-------+-----------+-------------------+------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|     ID|Case_Number|               Date|             Block|IUCR|Primary_Type|         Description|Location_Description|Arrest|Domestic|Beat|District|Ward|Community_Area|FBI_Code|X_Coordinate|Y_Coordinate|Year|          Updated_On|    Latitude|    Longitude|            Location|
+-------+-----------+-------------------+------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|1734630|    G539147|2001-09-08 16:25:00|035XX W CHICAGO AV|0313|     ROBBERY|ARMED: OTHER DANG...|              STREET| false|   false|1121|      11|null|     

### Crimes EDA

In [14]:
df.groupBy('Primary_Type').count().orderBy('count', ascending=False).show(10)

+-------------------+-------+
|       Primary_Type|  count|
+-------------------+-------+
|              THEFT|1418315|
|            BATTERY|1231958|
|    CRIMINAL DAMAGE| 771382|
|          NARCOTICS| 711616|
|      OTHER OFFENSE| 418811|
|            ASSAULT| 418479|
|           BURGLARY| 388008|
|MOTOR VEHICLE THEFT| 314113|
| DECEPTIVE PRACTICE| 265487|
|            ROBBERY| 255565|
+-------------------+-------+
only showing top 10 rows



In [17]:
df.groupBy('Location_Description').count().orderBy('count', ascending=False).show(10)

+--------------------+-------+
|Location_Description|  count|
+--------------------+-------+
|              STREET|1770352|
|           RESIDENCE|1144515|
|           APARTMENT| 698074|
|            SIDEWALK| 665439|
|               OTHER| 256867|
|PARKING LOT/GARAG...| 193788|
|               ALLEY| 150887|
|SCHOOL, PUBLIC, B...| 142327|
|    RESIDENCE-GARAGE| 131628|
|  SMALL RETAIL STORE| 119267|
+--------------------+-------+
only showing top 10 rows



In [24]:
arrest_count = df.filter(df.Arrest == 'true').count()
no_arrest_count = df.filter(df.Arrest == 'false').count()

arrest_ratio = arrest_count / (no_arrest_count + no_arrest_count)
print(f'Arrest ratio: {arrest_ratio}')

Arrest ratio: 0.1920836784229473


In [20]:
df.groupBy('Year').count().orderBy('Year', ascending=False).show(10)

+----+------+
|Year| count|
+----+------+
|2018|232731|
|2017|268517|
|2016|269286|
|2015|264325|
|2014|275442|
|2013|307200|
|2012|336061|
|2011|351824|
|2010|370317|
|2009|392715|
+----+------+
only showing top 10 rows

