# Chicago Crimes Analysis

Copyright 2022 Google LLC

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.

### 1) Create a Spark session

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) a Hive-enabled Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("Chicago Crimes Analysis")
    .enableHiveSupport()
    .getOrCreate()
)

In [None]:
# Echo the session object as the cell's output
spark

### 2) Load data from a public dataset into a dataframe

In [None]:
# Load the public Chicago crime table through the 'bigquery' data source.
baseDF = (
    spark.read
    .format('bigquery')
    .load('bigquery-public-data.chicago_crime.crime')
)

In [None]:
# Print the column names and types of the loaded table
baseDF.printSchema()

In [None]:
# Preview the first three rows without truncating wide column values.
# show() prints the rows itself and returns None, so it is not wrapped in
# display(), which would only render that None.
baseDF.show(3, truncate=False)

In [None]:
# Register the dataframe as a temp view so it can be queried with spark.sql
baseDF.createOrReplaceTempView("chicago_crimes_raw")

### 3) Transform the dataset

#### 3.1. Deduplicate

In [None]:
# 3.1.a. Read raw crimes into a dataframe (all columns of the chicago_crimes_raw view)
rawDF=spark.sql("select * from chicago_crimes_raw")

In [None]:
# 3.1.b. Dedupe the data: dropDuplicates() is called without a column subset,
# so only rows that repeat in their entirety are removed
dedupedRawDF=rawDF.dropDuplicates()

In [None]:
# Print the schema of the deduplicated dataframe
dedupedRawDF.printSchema()

#### 3.2. Augment the data with temporal attributes

In [None]:
# 3.2.a. Create a UDF to return day of the week
from pyspark.sql.types import *
from pyspark.sql.functions import *

def getDayNameFromWeekdayNbr(weekday):
    """Return the English day name for a Spark ``dayofweek`` number.

    The notebook feeds this function the ``case_day_of_week_nbr`` column,
    which is produced by Spark SQL's ``dayofweek``; that function numbers
    days from 1 (Sunday) through 7 (Saturday), not from 0 (Monday).

    Args:
        weekday: Day-of-week number in the 1..7 range, or None.

    Returns:
        The day name, or None when ``weekday`` is None or outside 1..7.
    """
    day_names = {
        1: "Sunday",
        2: "Monday",
        3: "Tuesday",
        4: "Wednesday",
        5: "Thursday",
        6: "Friday",
        7: "Saturday",
    }
    # .get() keeps the original "no match -> None" behaviour, which Spark
    # turns into a null in the resulting string column.
    return day_names.get(weekday)

# Wrap the Python function as a Spark UDF that returns a string column
udf_getDayNameFromWeekdayNbr = udf(getDayNameFromWeekdayNbr, StringType())

In [None]:
# 3.2.b. Add a column case_timestamp holding the `date` column converted to a timestamp.
# Build on dedupedRawDF (step 3.1.b) so the deduplication carries into the analysis
# instead of being discarded by re-reading the raw view.
# NOTE(review): in Spark datetime patterns 'hh' is the 12-hour clock field; if `date`
# arrives as a 24-hour string the pattern needs 'HH' -- confirm against the printed schema.
augmentedDF = dedupedRawDF.withColumn(
    "case_timestamp",
    to_timestamp("date", "MM/dd/yyyy hh:mm:ss"),
)

In [None]:
# Expose the dataframe with case_timestamp to Spark SQL under a temp view name
augmentedDF.createOrReplaceTempView("crimes_raw_temp_with_timestamp")

In [None]:
# Preview two rows to check the new case_timestamp column
augmentedDF.show(2)

In [None]:
# Print the schema, which now includes case_timestamp
augmentedDF.printSchema()

In [None]:
# 3.2.c. Add some temporal attributes using Spark date features:
# case_month, case_day_of_month, case_hour and case_day_of_week_nbr.
# Spark SQL's dayofweek() numbers days from 1 (Sunday) to 7 (Saturday).
curatedInitialDF = spark.sql("select *, month(case_timestamp) as case_month,dayofmonth(case_timestamp) as case_day_of_month, hour(case_timestamp) as case_hour, dayofweek(case_timestamp) as case_day_of_week_nbr from crimes_raw_temp_with_timestamp")

In [None]:
# Preview two rows with the derived temporal columns
curatedInitialDF.show(2)

In [None]:
# 3.2.d. Let's use the UDF we created to add the day of the week name
# (new column case_day_of_week_name, derived from case_day_of_week_nbr)
curatedDF=curatedInitialDF.withColumn("case_day_of_week_name",udf_getDayNameFromWeekdayNbr("case_day_of_week_nbr"))

In [None]:
# Print the schema of the curated dataframe, including case_day_of_week_name
curatedDF.printSchema()

In [None]:
# Preview two curated rows without truncating column values
curatedDF.show(2,truncate=False)

In [None]:
# Register the curated dataframe as the view queried by the analysis cells below
curatedDF.createOrReplaceTempView("chicago_crimes_temp_view")

### 4) Analyze crimes

#### 4.1. Crimes by year

In [None]:
# Count crimes per year, ordered by year, and print the result
crimesByYearDF=spark.sql("SELECT year,count(*) AS crime_count FROM chicago_crimes_temp_view GROUP BY year ORDER BY year;")
crimesByYearDF.show()

In [None]:
# Convert results to a Pandas dataframe for visualization
# (toPandas() collects the aggregated rows to the driver)
crimesByYearPDF = crimesByYearDF.toPandas()

In [None]:
# Echo the pandas dataframe as the cell's output
crimesByYearPDF

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Bar chart of crime_count per year; rot=0 keeps the x tick labels horizontal
crimesByYearPDF.plot.bar(x='year', y='crime_count', rot=0,figsize=(12, 12))

#### 4.2. Crimes count by day

In [None]:

# Count crimes per day-of-week name; rows come back sorted by that name
# (i.e. alphabetically, not in calendar order), then print the result
crimesByDayDF=spark.sql("SELECT case_day_of_week_name as day,count(*) AS crime_count FROM chicago_crimes_temp_view GROUP BY case_day_of_week_name ORDER BY case_day_of_week_name;")
crimesByDayDF.show()

In [None]:
# Convert results to a Pandas dataframe for visualization
# (toPandas() collects the aggregated rows to the driver)
crimesByDayPDF = crimesByDayDF.toPandas()

In [None]:
# Pie chart of the share of crimes per day of week.
# Slice labels are taken from the `day` column of the query result so each
# label always matches its own count; a hard-coded Monday..Sunday list is
# applied positionally and mislabels slices whenever the row order differs
# (the query sorts day names alphabetically).
# The former x='day' argument is dropped: pandas does not use x for pie charts.
crimesByDayPDF.plot.pie(
    y='crime_count',
    labels=crimesByDayPDF['day'].tolist(),
    figsize=(12, 12),
    title='Crimes by day of week',
    autopct='%1.0f%%',
)