# Chicago Crimes Data
This dataset reflects reported incidents of crime (with the exception of murders where data exists for each victim) that occurred in the City of Chicago from 2001 to present, minus the most recent seven days.

Link to download: https://data.cityofchicago.org/Public-Safety/Crimes-2001-to-present/ijzp-q8t2

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
# Obtain the notebook's Spark entry point; getOrCreate() returns the session
# produced by the builder, reusing an existing one when available.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [2]:
# Location of the raw CSV export downloaded from the Chicago data portal.
data_path = '/home/lorenzo/Desktop/chicago_crimes.csv'

# Load the CSV: the first line supplies the column names, and column types
# are inferred from the data rather than declared up front.
df = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load(data_path)
)

In [3]:
# Source columns whose names contain a space. Each one is renamed with the
# space replaced by an underscore (e.g. 'Case Number' -> 'Case_Number').
# NOTE(review): presumably needed because the Parquet writer used later can
# reject field names containing spaces -- confirm for the Spark version in use.
columns_with_spaces = [
    'Case Number',
    'Primary Type',
    'Location Description',
    'Community Area',
    'FBI Code',
    'X Coordinate',
    'Y Coordinate',
    'Updated On',
]
for old_name in columns_with_spaces:
    df = df.withColumnRenamed(old_name, old_name.replace(' ', '_'))

# Parse the textual 'Date' column into a timestamp. The pattern is
# month/day/year followed by a 12-hour clock time with an AM/PM marker.
df = df.withColumn('Date', F.to_timestamp(F.col('Date'), 'MM/dd/yyyy hh:mm:ss a'))

# Keep incidents dated up to and including 2018-11-11.
# The previous filter compared the timestamp column with the date-only string
# '2018-11-11' using <=. That relies on implicit string/timestamp coercion
# (whose rules differ between Spark 2.4 and 3.0) and, under either rule, drops
# every incident that happened after midnight on 2018-11-11. An explicit
# timestamp bound with a half-open comparison keeps the whole final day and
# behaves the same on every Spark version.
cutoff_exclusive = F.lit('2018-11-12').cast('timestamp')
df = df.filter(F.col('Date') < cutoff_exclusive)

In [4]:
# Print the column names, types and nullability of the cleaned DataFrame
# to verify the renames and the parsed 'Date' timestamp.
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Case_Number: string (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- Block: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary_Type: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location_Description: string (nullable = true)
 |-- Arrest: boolean (nullable = true)
 |-- Domestic: boolean (nullable = true)
 |-- Beat: integer (nullable = true)
 |-- District: integer (nullable = true)
 |-- Ward: integer (nullable = true)
 |-- Community_Area: integer (nullable = true)
 |-- FBI_Code: string (nullable = true)
 |-- X_Coordinate: integer (nullable = true)
 |-- Y_Coordinate: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Updated_On: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Location: string (nullable = true)



In [5]:
# data_path is rebound here: it now names the Parquet output directory
# instead of the CSV input file.
data_path = '/home/lorenzo/Desktop/chicago_crimes_parquet'

# Persist the cleaned DataFrame as Parquet, replacing any previous output
# that already exists at this location.
parquet_writer = df.write.mode('overwrite')
parquet_writer.parquet(data_path)

In [6]:
# Read the Parquet dataset back from data_path into a second DataFrame.
df2 = spark.read.load(data_path, format='parquet')

In [7]:
# Print the schema of the DataFrame loaded from Parquet so it can be
# compared with the schema of the DataFrame that was written.
df2.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Case_Number: string (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- Block: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary_Type: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location_Description: string (nullable = true)
 |-- Arrest: boolean (nullable = true)
 |-- Domestic: boolean (nullable = true)
 |-- Beat: integer (nullable = true)
 |-- District: integer (nullable = true)
 |-- Ward: integer (nullable = true)
 |-- Community_Area: integer (nullable = true)
 |-- FBI_Code: string (nullable = true)
 |-- X_Coordinate: integer (nullable = true)
 |-- Y_Coordinate: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Updated_On: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Location: string (nullable = true)



In [10]:
# Fetch the first row as a one-element list of Row objects for a quick
# visual sanity check of the reloaded data.
df2.take(1)

[Row(ID=1734630, Case_Number='G539147', Date=datetime.datetime(2001, 9, 8, 16, 25), Block='035XX W CHICAGO AV', IUCR='0313', Primary_Type='ROBBERY', Description='ARMED: OTHER DANGEROUS WEAPON', Location_Description='STREET', Arrest=False, Domestic=False, Beat=1121, District=11, Ward=None, Community_Area=None, FBI_Code='03', X_Coordinate=1152805, Y_Coordinate=1905150, Year=2001, Updated_On='08/17/2015 03:03:40 PM', Latitude=41.895587138, Longitude=-87.714231305, Location='(41.895587138, -87.714231305)')]