In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Columns of the crime CSV as (name, Spark type) pairs, in the order the
# schema declares them. Every field is nullable.
_crime_columns = [
    ("INCIDENT_NUMBER", StringType()),
    ("OFFENSE_CODE", StringType()),
    ("OFFENSE_CODE_GROUP", StringType()),
    ("OFFENSE_DESCRIPTION", StringType()),
    ("DISTRICT", StringType()),
    ("REPORTING_AREA", StringType()),
    ("SHOOTING", StringType()),
    ("OCCURRED_ON_DATE", TimestampType()),
    ("YEAR", IntegerType()),
    ("MONTH", IntegerType()),
    ("DAY_OF_WEEK", StringType()),
    ("HOUR", IntegerType()),
    ("UCR_PART", StringType()),
    ("STREET", StringType()),
    ("Lat", StringType()),
    ("Long", StringType()),
    ("Location", StringType()),
]
crimes_schema = StructType(
    [StructField(column_name, column_type, True)
     for column_name, column_type in _crime_columns]
)

# Reuse the active Spark session if one exists, otherwise create it.
spark = SparkSession.builder.appName("Crimes in boston").getOrCreate()

# Load the CSV with the explicit schema; the first line of the file is a header.
crimes = (
    spark.read
    .schema(crimes_schema)
    .option("header", True)
    .csv("./data/BostenCrime2.csv")
)

In [None]:
# Echo the `crimes` DataFrame object as this cell's output.
crimes

In [None]:
# List the distinct MONTH values present in the data, in ascending order
# (at most 12 rows are printed).
distinct_months = crimes.select("MONTH").distinct()
distinct_months.sort("MONTH").show(12)

## Crimes per month for a selected hour of the day

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

def show_crime_per_hour_perMonth(hour):
    """Draw a bar chart of the number of crimes per month for one hour of the day.

    Parameters
    ----------
    hour : str or int
        Hour to select. It is converted with ``int`` and compared against the
        ``HOUR`` column of the ``crimes`` DataFrame.

    Returns
    -------
    None
        The chart is displayed as a side effect.

    Raises
    ------
    ValueError
        If ``hour`` cannot be converted to an integer.
    """
    # All twelve months, January through December.
    months = list(range(1, 13))

    # Compare with a column expression instead of concatenating the value
    # into a SQL string, so that both '12' and 12 are accepted.
    rows = (
        crimes.filter(crimes["HOUR"] == int(hour))
        .groupBy("MONTH")
        .count()
        .collect()
    )

    # Look counts up by month number rather than by row position: a month
    # with no rows for this hour is drawn as 0 instead of shifting the
    # following months, and a null MONTH group is ignored.
    count_by_month = {row[0]: row[1] for row in rows}
    y_axis = [count_by_month.get(month, 0) for month in months]

    fig1 = plt.figure(figsize=(7, 7))
    fig1.suptitle('Crimes per month per within a hour', fontsize=14)
    ax = fig1.add_subplot(111)
    ax.set_xlabel('month')
    ax.set_ylabel('Crime')
    ax.bar(months, y_axis, fc='darksalmon', align='center')
    ax.set_xticks(months)  # one tick per month
    plt.show()

In [None]:
from IPython.display import HTML

# Hour used by the plotting cell until the slider is moved; it matches the
# slider's initial value below. The slider overwrites this variable.
time = '12'

# Range slider (0-23) that calls the JavaScript set_vars() on every change.
form = """
<div style='background-color:gainsboro; border:solid black; width:300px; padding:20px;'>
Hour : <input id='hour' type='range' min='0' max='23' step='1' value='12' onchange='set_vars()'/>
</div>"""

# The script needs the `IPython.notebook` JavaScript object; when that object
# is missing the handler logs a warning and does nothing.
# NOTE(review): `kernelCell` is a position-dependent cell index. In the cell
# order visible in this file, index 5 looks like this form cell rather than
# the plotting cell -- confirm against the real notebook layout.
javascript = """
<script type="text/Javascript">
    function set_vars(){
        // Kept inside the function so that re-running this cell does not
        // re-declare a top-level constant.
        var kernelCell = 5;

        if (typeof IPython === 'undefined' || !IPython.notebook) {
            console.warn('set_vars: IPython.notebook is not available in this front-end');
            return;
        }

        var time = document.getElementById('hour').value;
        var kernel = IPython.notebook.kernel;

        kernel.execute('time = "' + time + '"');
        var cell = IPython.notebook.get_cell(kernelCell);
        cell.execute();
        cell.focus_cell(kernelCell);
    }
</script>
"""

HTML(form + javascript)

In [None]:
# Report the currently selected hour, then draw the per-month chart for it.
print(f'Time: {time}')
show_crime_per_hour_perMonth(time)

In [None]:
from datetime import datetime
from pyspark.sql.functions import col,udf
def _parse_occurred_on(value):
    """Parse an 'MM/DD/YYYY HH:MM:SS AM/PM' string into a datetime.

    None and values that are already datetimes are returned unchanged, so the
    UDF does not raise on null cells or on an already-parsed timestamp column.
    """
    if value is None or isinstance(value, datetime):
        return value
    return datetime.strptime(value, '%m/%d/%Y %I:%M:%S %p')


# Kept under its original name in case other cells use it.
myfunc = udf(_parse_occurred_on, TimestampType())

# OCCURRED_ON_DATE is not needed for the yearly totals, so it is dropped.
# Replacing it with myfunc(...) first would be wasted work: the column is
# removed in the same expression, so the parsed value would never be kept.
df = crimes.drop('OCCURRED_ON_DATE')

# Number of crimes per year, ordered by year, as a list of (year, count) Rows.
data = df.groupBy('Year').count().sort('Year').collect()
year = [item[0] for item in data]
count = [item[1] for item in data]
data

In [None]:
# Build both axes directly from the yearly rows in `data`. The helper name
# `count` is reassigned by the per-street cells further down, so reading it
# here would give mismatched lengths when cells are re-run out of order.
# Rows whose year is null are skipped because they cannot be placed on the axis.
yearly_rows = [row for row in data if row[0] is not None]
x_axis = [row[0] for row in yearly_rows]
y_axis = [row[1] for row in yearly_rows]

fig1 = plt.figure(figsize=(8, 8))

plt.xlabel("Year", fontsize = 18)
plt.ylabel("Number of Crimes", fontsize = 18)
plt.title("Number of Crimes Per Year", fontsize = 28)
plt.bar(x_axis, y_axis, fc='darksalmon', align='center',alpha=0.5)
# Put exactly one tick on each plotted year; automatic tick placement may
# otherwise choose non-integer positions.
plt.xticks(x_axis, size = 18)
plt.yticks(size = 18)
plt.show()

In [None]:
# The 20 STREET values with the most rows, highest count first, collected
# to the driver as a list of (STREET, count) Rows.
street_counts = crimes.groupBy("STREET").count()
crime_location = street_counts.orderBy('count', ascending=False).limit(20).collect()
crime_location

In [None]:
# Split the (street, count) rows into two parallel lists.
location = [street for street, _total in crime_location]
count = [total for _street, total in crime_location]
location

In [None]:
# Read labels and counts directly from the (street, count) rows in
# `crime_location`. The helper name `count` is also assigned by the yearly
# cell, so relying on it would break when cells are re-run out of order.
# str() turns a null street into the label 'None'.
x_axis = [str(row[0]) for row in crime_location]
y_axis = [row[1] for row in crime_location]

fig = plt.figure(figsize=(20,20))

# The rows are grouped by STREET only, so the title does not mention years.
plt.title(f"Number of crimes per location (top {len(x_axis)} streets)", fontsize=36)
plt.xlabel("Number of Crimes", fontsize = 28)
plt.ylabel("Crime Location", fontsize = 28)
# Horizontal bars: street labels on the vertical axis, counts as bar length.
plt.barh(x_axis,y_axis,color = "darksalmon")
plt.xticks(size = 24)
plt.yticks(size = 24)
plt.show()

In [None]:
def _most_common(column, limit=20):
    """Return the `limit` most frequent values of `column` in `crimes`.

    The result is a list of (value, count) Rows sorted by count, highest first.
    """
    ranked = crimes.groupBy(column).count().sort('count', ascending=False)
    return ranked.limit(limit).collect()


# NOTE(review): latitudes and longitudes are ranked independently, so entry i
# of one list does not necessarily belong to the same place as entry i of the
# other.
crime_latitude = _most_common("Lat")
crime_longitude = _most_common("Long")
crime_coordinates = [crime_latitude, crime_longitude]


In [None]:
# Show the collected (Lat, count) rows.
crime_latitude

In [None]:
# Show the collected (Long, count) rows.
crime_longitude

In [None]:
# Each collected row is (coordinate value, count). Extract the coordinate
# values instead of merely copying the lists of Rows.
location_lat = [row[0] for row in crime_latitude]
location_long = [row[0] for row in crime_longitude]

# `crime_coordinates` is [latitude rows, longitude rows]. Keep that order and
# produce one list of counts per axis; indexing each list with [1] would only
# pick its second Row.
# NOTE(review): latitude and longitude were aggregated separately, so there
# is no single count per (lat, long) pair here -- confirm the intended shape.
location_count = [[row[1] for row in rows] for rows in crime_coordinates]

location_lat