In [1]:
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.sql.functions import *
from pyspark.sql.types import TimestampType
from datetime import datetime
import matplotlib.pyplot as plt

# Reuse the already-running SparkContext when this cell is executed again;
# a bare SparkContext() raises if a context already exists in the kernel.
sc = SparkContext.getOrCreate()
sqlcontext = SQLContext(sc)

# Crime records CSV on HDFS.
path = "hdfs://wolf.analytics.private/user/slx4192/data/crime/Crimes_-_2001_to_present.csv"

# header=True takes the column names from the first row. No schema inference
# is requested, so `Arrest` is compared below as the text "true".
mydata = sqlcontext.read.csv(path, header=True)

# Every later cell works only on incidents that resulted in an arrest.
mydata = mydata.filter(mydata.Arrest == "true")
mydata.show(5)

+--------+-----------+--------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+
|      ID|Case Number|                Date|               Block|IUCR|        Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|Latitude|Longitude|Location|
+--------+-----------+--------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+
|11042582|   JA377037|01/01/2011 12:01:...|054XX S CALIFORNI...|1754|OFFENSE INVOLVING...|AGG SEX ASSLT OF ...|           APARTMENT|  true|    true|0923|     009|  14|            63|  

## Question 4:
Find patterns in crimes that resulted in an arrest with respect to time of day, day of the week, and month. Use whatever Spark method you like. (25 pts)

### Time of the day

In [50]:
# Spark datetime pattern matching the raw `Date` strings
# (month/day/year, 12-hour clock with an AM/PM marker).
DATE_TIME_PATTERN = "MM/dd/yyyy hh:mm:ss a"


def getDateTime(date_col):
    """Parse a `Date` string column into a timestamp column.

    Uses Spark's built-in ``to_timestamp`` instead of a Python UDF, so rows
    are not serialized through a Python worker and a null or unparseable
    value does not raise inside ``strptime``.

    Args:
        date_col: Column holding strings formatted like DATE_TIME_PATTERN.

    Returns:
        A timestamp Column.
    """
    return to_timestamp(date_col, DATE_TIME_PATTERN)


mydata_daytime = (
    mydata
    .withColumn("Date_time", getDateTime(col("Date")))
    .withColumn("Hour", hour(col("Date_time")))
    # Take the calendar date from the parsed timestamp rather than re-parsing
    # the full datetime string with a date-only pattern, which depends on the
    # parser tolerating the trailing time text.
    .withColumn("Newdate", to_date(col("Date_time")))
)

# Step 1: number of arrests for each (hour, calendar day) pair.
# Step 2: mean of those counts per hour.
# Only (hour, day) pairs with at least one arrest produce a row, so days with
# zero arrests in a given hour are not part of that hour's average.
arrests_per_hour_per_day = mydata_daytime.groupBy("Hour", "Newdate").count()
hourly_avg = (
    arrests_per_hour_per_day
    .select("Hour", "count")
    .groupBy("Hour")
    .avg("count")
    .orderBy("Hour")
)
hourly_avg.show(24)

+----+------------------+
|Hour|        avg(count)|
+----+------------------+
|   0|11.356300610882228|
|   1|  8.56985556985557|
|   2| 6.946610416060518|
|   3| 5.399343381584838|
|   4|3.9454717875770506|
|   5| 2.935792349726776|
|   6|3.3753855644663786|
|   7| 4.269413864921154|
|   8| 6.444428468727534|
|   9| 8.267645800655925|
|  10|11.450830140485312|
|  11|14.860102156640181|
|  12|15.446509648127128|
|  13|14.933314415437003|
|  14|14.678449744463373|
|  15|14.273294568146362|
|  16|13.621809415768576|
|  17|13.202723790608596|
|  18|15.406750815487165|
|  19|19.293250141803743|
|  20|19.395233366434955|
|  21|18.708954164892862|
|  22| 17.33427922814983|
|  23|14.329455164585697|
+----+------------------+



In [51]:
# Bring the 24 aggregated rows to the driver and draw them as a bar chart.
hourly_pdf = hourly_avg.toPandas()
hourly_ax = hourly_pdf.plot.bar(x="Hour", y="avg(count)")
hourly_ax.set_title('Average Number of Crime with Arrest by Hour of the Day')
# Write the figure that owns these axes to a PNG in the working directory.
hourly_ax.get_figure().savefig("slx4192_4_Hour_of_Day.png")

### Day of the week

In [2]:
# Calendar date of each arrest, parsed from the leading MM/dd/yyyy part of `Date`.
temp = mydata.withColumn("newdate", to_date(mydata["Date"], "MM/dd/yyyy"))

# "u" yields the day-of-week number used only for ordering the result;
# "E" yields the abbreviated day name shown in the output.
# NOTE(review): the "u" pattern may not be accepted by newer Spark releases —
# confirm against the cluster's Spark version before upgrading.
mydata_day_of_week = (
    temp
    .withColumn("Day_of_week_number", date_format(col("newdate"), "u"))
    .withColumn("Day_of_week", date_format(col("newdate"), "E"))
)

# Arrests per calendar date (tagged with its weekday), then the mean of those
# daily totals for each weekday, ordered by the weekday number.
arrests_per_date = (
    mydata_day_of_week
    .groupBy("Day_of_week", "Day_of_week_number", "newdate")
    .count()
    .select("Day_of_week", "Day_of_week_number", "count")
)
day_of_week_avg = (
    arrests_per_date
    .groupBy("Day_of_week", "Day_of_week_number")
    .avg("count")
    .orderBy("Day_of_week_number")
    .select("Day_of_week", "avg(count)")
)
day_of_week_avg.show(7)

+-----------+------------------+
|Day_of_week|        avg(count)|
+-----------+------------------+
|        Mon| 263.4424603174603|
|        Tue|  284.140873015873|
|        Wed| 287.6746031746032|
|        Thu| 283.6309523809524|
|        Fri| 290.1140873015873|
|        Sat| 272.7083333333333|
|        Sun|251.28670634920636|
+-----------+------------------+



In [None]:
# Bring the seven aggregated rows to the driver and draw them as a bar chart.
day_of_week_pdf = day_of_week_avg.toPandas()
day_of_week_ax = day_of_week_pdf.plot.bar(x="Day_of_week", y="avg(count)")
day_of_week_ax.set_title('Average Number of Crime with Arrest by Day_of_week')
# Write the figure that owns these axes to a PNG in the working directory.
day_of_week_ax.get_figure().savefig("slx4192_4_Day_of_week.png")

### Month of Year

In [3]:
# `Date` strings start with the two-digit month. Column.substr is 1-based, so
# start at position 1 instead of relying on position 0 being treated as 1.
mydata_month = mydata.withColumn("Month", mydata["Date"].substr(1, 2))

# Arrests per (month, year), then the mean across years for each month.
# The column to average is named explicitly rather than relying on a bare
# .avg() picking up every numeric column.
arrests_per_month_per_year = mydata_month.groupBy("Month", "Year").count()
monthly_avg = (
    arrests_per_month_per_year
    .select("Month", "count")
    .groupBy("Month")
    .avg("count")
    .orderBy("Month")
)
monthly_avg.show(12)

+-----+------------------+
|Month|        avg(count)|
+-----+------------------+
|   01|           8246.15|
|   02|           7632.65|
|   03|            8618.4|
|   04|           8206.35|
|   05| 9010.631578947368|
|   06|            8652.0|
|   07| 8945.263157894737|
|   08|  9028.21052631579|
|   09| 8586.315789473685|
|   10| 8620.631578947368|
|   11| 7929.736842105263|
|   12|7353.0526315789475|
+-----+------------------+



In [None]:
# Bring the twelve aggregated rows to the driver and draw them as a bar chart.
monthly_pdf = monthly_avg.toPandas()
monthly_ax = monthly_pdf.plot.bar(x="Month", y="avg(count)")
monthly_ax.set_title('Average Number of Crime with Arrest by Month')
# Write the figure that owns these axes to a PNG in the working directory.
monthly_ax.get_figure().savefig("slx4192_4_Month_of_Year.png")