# Challenge

## Download and install Spark

In [None]:
# List the contents of the current working directory.
!ls

In [None]:
# One-time environment setup, left commented out; uncomment the lines below to run it.
# The commands refresh apt, install a headless OpenJDK 8, download and unpack the
# Spark 2.3.1 (Hadoop 2.7 build) archive, and install findspark.
#!apt-get update
#!apt-get install openjdk-8-jdk-headless -qq > /dev/null
#!wget -q http://archive.apache.org/dist/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz
#!tar xf spark-2.3.1-bin-hadoop2.7.tgz
#!pip install -q findspark

## Setup environment

In [None]:
import os

# Point the kernel at the JDK and at the unpacked Spark distribution.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.3.1-bin-hadoop2.7",
})

# findspark.init() runs after the environment variables are set and before any
# pyspark import, preserving the order of the original setup.
import findspark
findspark.init()

import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession

# getOrCreate() reuses an already-running context/session instead of failing on re-run.
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()
spark

## Downloading and preprocessing Chicago's Reported Crime Data

In [None]:
# Download of the Chicago reported-crimes CSV export, left commented out;
# uncomment to fetch the file and list the directory afterwards.
#!wget https://data.cityofchicago.org/api/views/ijzp-q8t2/rows.csv?accessType=DOWNLOAD
#!ls -l

In [None]:
# Rename the downloaded file (saved under its URL query-string name) to
# reported-crimes.csv, the name the read step below expects. Left commented out.
#!mv rows.csv\?accessType\=DOWNLOAD reported-crimes.csv
#!ls -l

In [None]:
from pyspark.sql.functions import to_timestamp,col,lit

# Read the CSV using its header row, parse the 'Date' strings into timestamps,
# and keep only rows whose Date is <= the '2018-11-11' literal.
rc = (
    spark.read.csv('reported-crimes.csv', header=True)
    .withColumn('Date', to_timestamp(col('Date'), 'MM/dd/yyyy hh:mm:ss a'))
    .filter(col('Date') <= lit('2018-11-11'))
)
rc.show(5)

## Challenge questions

**What is the most frequently reported non-criminal activity?**

In [None]:
# Mark rc for caching and immediately run count() on it; the row count is the cell output.
rc.cache().count()

In [None]:
# Preview the first five rows of the crime data.
rc.show(5)

In [None]:
# Number of distinct values in the 'Primary Type' column.
rc.select('Primary Type').distinct().count()

In [None]:
# List the distinct 'Primary Type' values sorted by value; print up to 35 rows untruncated.
(
    rc.select('Primary Type')
    .distinct()
    .orderBy('Primary Type')
    .show(35, truncate=False)
)

In [None]:
# Keep the rows whose 'Primary Type' matches any of the three non-criminal label variants.
nc = rc.filter(
    col('Primary Type').isin(
        'NON - CRIMINAL',
        'NON-CRIMINAL',
        'NON-CRIMINAL (SUBJECT SPECIFIED)',
    )
)
nc.show(50)

In [None]:
# Count rows per 'Description' (groupBy needs an aggregation, here count())
# and print the most frequent descriptions first, untruncated.
(
    nc.groupBy('Description')
    .count()
    .orderBy(col('count').desc())
    .show(truncate=False)
)

**Using a bar chart, plot which day of the week has the highest number of reported crimes.**

In [None]:
from pyspark.sql.functions import dayofweek

In [None]:
# Print the documentation for dayofweek(col), which extracts the day of the week
# of a given date as an integer.
help(dayofweek)

In [None]:
# Preview five rows again; the 'Date' column is what the following cells work with.
rc.show(5)

In [None]:
from pyspark.sql.functions import date_format

# dayofweek() gives the weekday as an integer. To get the day name instead, see the
# Java SimpleDateFormat patterns, where 'E' stands for "day name in week":
# https://docs.oracle.com/javase/8/docs/api/java/text/SimpleDateFormat.html
rc.select('Date', dayofweek('Date')).show(5)

In [None]:
# Print the documentation for date_format(date, format), which converts a
# date/timestamp/string to a string value in the specified format.
help(date_format)


In [None]:
# Show each timestamp next to its weekday number and its day name ('E' is the day-name pattern).
(
    rc.select(
        'Date',
        dayofweek('Date'),
        date_format('Date', 'E'),
    )
    .show(5)
)

In [None]:
# Count reported crimes per day name, most frequent day first.
# orderBy() only builds a lazy DataFrame; without an action such as show() the cell
# would display just the DataFrame's schema repr rather than the actual counts.
rc.groupBy(date_format(col('Date'), 'E')).count().orderBy('count', ascending=False).show()

In [None]:
# collect() brings the grouped result back to the driver as a list of Row objects,
# which can then be indexed like tuples.
(
    rc.groupBy(date_format(col('Date'), 'E'))
    .count()
    .collect()
)

In [None]:
# Day names: the first field of every row in the grouped (day name, count) result.
dow = [
    day_row[0]
    for day_row in rc.groupBy(date_format(col('Date'), 'E')).count().collect()
]
dow

In [None]:
# Crime counts, one per entry of `dow` and in the same order.
# `dow` was produced by a separate groupBy/collect job, and the row order of a grouped
# result is not guaranteed to be identical between two jobs. Pairing by position could
# therefore attach a count to the wrong day, so each count is looked up by day name.
count_by_day = {
    day_row[0]: day_row[1]
    for day_row in rc.groupBy(date_format(col('Date'), 'E')).count().collect()
}
cnt = [count_by_day[day] for day in dow]
cnt

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Combine the day names and their counts into a two-column pandas frame for plotting.
cp = pd.DataFrame(dict(Day_of_week=dow, Count=cnt))
cp.head(7)

In [None]:
# Bar chart of the per-day counts, with bars ordered from highest to lowest count.
# A single colour is passed explicitly; if color is not specified, each bar gets its own color.
ax = cp.sort_values('Count', ascending=False).plot(kind='bar', color='olive', x='Day_of_week', y='Count')
ax.set_xlabel('Day of the week')
ax.set_ylabel('Number of Reported Crimes')
ax.set_title('Number of reported crimes per day of week from 2001 to present')