In [None]:
#import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import  *
from pyspark.sql import SQLContext
# Explicit schema for the Chicago crimes CSV so every column gets a meaningful
# name and type (original Dutch note: without a schema the column names come
# out unclear, e.g. col1, col2, ...). Every field is declared nullable.
# NOTE(review): UpdatedOn is read as DateType while Date is kept as a string
# and parsed later - confirm the CSV values match Spark's date format.
chicago_crimes_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("ID", StringType()),
        ("CaseNumber", StringType()),
        ("Date", StringType()),
        ("Block", StringType()),
        ("IUCR", StringType()),
        ("PrimaryType", StringType()),
        ("Description", StringType()),
        ("LocationDescription", StringType()),
        ("Arrest", BooleanType()),
        ("Domestic", BooleanType()),
        ("Beat", StringType()),
        ("District", StringType()),
        ("Ward", StringType()),
        ("CommunityArea", StringType()),
        ("FBICode", StringType()),
        ("XCoordinate", DoubleType()),
        ("YCoordinate", DoubleType()),
        ("Year", IntegerType()),
        ("UpdatedOn", DateType()),
        ("Latitude", DoubleType()),
        ("Longitude", DoubleType()),
        ("Location", StringType()),
    )
])


# Explicit schema for the Boston crimes CSV; every field is declared nullable.
# NOTE(review): Lat/Long are read as strings here, whereas the Chicago schema
# uses DoubleType for its coordinates - confirm this difference is intended.
boston_crimes_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("INCIDENT_NUMBER", StringType()),
        ("OFFENSE_CODE", StringType()),
        ("OFFENSE_CODE_GROUP", StringType()),
        ("OFFENSE_DESCRIPTION", StringType()),
        ("DISTRICT", StringType()),
        ("REPORTING_AREA", StringType()),
        ("SHOOTING", StringType()),
        ("OCCURRED_ON_DATE", TimestampType()),
        ("YEAR", IntegerType()),
        ("MONTH", IntegerType()),
        ("DAY_OF_WEEK", StringType()),
        ("HOUR", IntegerType()),
        ("UCR_PART", StringType()),
        ("STREET", StringType()),
        ("Lat", StringType()),
        ("Long", StringType()),
        ("Location", StringType()),
    )
])


# Shared schema for both household-income CSVs (Boston and Chicago); every
# field is declared nullable. Note that Year is kept as a string here.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("ID Year", StringType()),
        ("Year", StringType()),
        ("Income", IntegerType()),
        ("Household Income by Race Moe", StringType()),
        ("Geography", StringType()),
        ("ID Geography", StringType()),
        ("Slug", StringType()),
    )
])


# Build ONE session up front. getOrCreate() hands back the already-running
# session on any later call, so a second builder cannot change the master or
# the application name - the original second builder was effectively a no-op.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Crimes in boston and chicago")
    # The original key "spark.execute.memory" is not a Spark setting; the
    # executor memory key is "spark.executor.memory".
    # NOTE(review): in local mode the work runs inside the driver JVM, so
    # spark.driver.memory may be the setting that actually matters - confirm.
    .config("spark.executor.memory", "1g")
    .getOrCreate()
)

# Crime data sets, read with the explicit schemas declared above.
bostoncrimes = spark.read.csv("./data/BostonData.csv", header=True, schema=boston_crimes_schema)
chicagocrimes = spark.read.csv("./data/Crimes_-_2001_to_present.csv", header=True, schema=chicago_crimes_schema)

# Household income data sets; both files share the same schema.
chicagoIncome = spark.read.csv("./data/ChicagoHouseholdIncome.csv", header=True, schema=schema)
bostonIncome = spark.read.csv("./data/BostonHouseholdIncome.csv", header=True, schema=schema)

# Legacy handles kept for cells that still expect them.
sc = spark.sparkContext
sqlContext = SQLContext(sc)

In [None]:
# Number of rows in the Chicago crimes DataFrame (count() is an action, so this runs a Spark job).
chicagocrimes.count()

In [None]:
# Column names of the Chicago crimes DataFrame, as declared in chicago_crimes_schema.
chicagocrimes.columns

In [None]:
# Number of rows in the Boston crimes DataFrame (count() is an action, so this runs a Spark job).
bostoncrimes.count()

In [None]:
# Column names of the Boston crimes DataFrame, as declared in boston_crimes_schema.
bostoncrimes.columns

In [None]:
# Show the first 10 raw Date strings without truncation, to inspect the format before parsing it.
chicagocrimes.select('Date').show(10,truncate=False)

In [None]:
from datetime import datetime
from pyspark.sql.functions import col,udf


def _parse_chicago_timestamp(value):
    """Parse a 'MM/DD/YYYY hh:mm:ss AM/PM' string into a datetime.

    Returns None for a null input so rows with a missing date become a null
    timestamp instead of failing the whole Spark job inside strptime.
    """
    if value is None:
        return None
    return datetime.strptime(value, '%m/%d/%Y %I:%M:%S %p')


myfunc = udf(_parse_chicago_timestamp, TimestampType())

# Chicago: replace the raw string column 'Date' with a parsed 'Date_time'.
df = chicagocrimes.withColumn('Date_time', myfunc(col('Date'))).drop('Date')
df.select(df["Date_time"]).show(5)

# Boston: OCCURRED_ON_DATE is already declared as TimestampType in the schema,
# so it needs no string parsing. The original overwrote the column with the
# string-parsing UDF and then dropped that same column, which only discarded
# the timestamp; keep the column instead.
dg = bostoncrimes

In [None]:
# Chicago crime totals per year, ordered by year and collected to the driver
# as a list of (Year, count) rows.
chicago_data = (
    df.groupBy('Year')
      .count()
      .sort('Year')
      .collect()
)
# Split the rows into two parallel lists for plotting.
chicago_year = [row[0] for row in chicago_data]
chicago_count = [row[1] for row in chicago_data]
chicago_data

In [None]:
# Boston crime totals per year, ordered by year and collected to the driver
# as a list of (year, count) rows.
boston_data = (
    dg.groupBy('Year')
      .count()
      .sort('Year')
      .collect()
)
# Split the rows into two parallel lists for plotting.
boston_year = [row[0] for row in boston_data]
boston_count = [row[1] for row in boston_data]
boston_data

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

# Years shown on the x-axis. Counts are looked up by year rather than sliced
# by position (the original used x_axis[11:]), so the two series stay aligned
# even when a dataset starts or ends in a different year; a year missing from
# a dataset is drawn as 0.
d = [2012,2013,2014,2015,2016,2017,2018,2019]
years = [str(year) for year in d]

chicago_by_year = dict(zip(chicago_year, chicago_count))
boston_by_year = dict(zip(boston_year, boston_count))
x_axis = [chicago_by_year.get(year, 0) for year in d]   # Chicago
y_axis = [boston_by_year.get(year, 0) for year in d]    # Boston
data = [x_axis, y_axis]

fig = plt.figure(figsize=(18,10))
plt.title('Comparing number of crimes per year in Boston and Chicago from 2012 to 2019', fontsize=24)

# Two bars per year, side by side.
X = np.arange(len(years))
plt.bar(X - 0.15, data[0], color = '#003f5c', width = 0.3,label='chicago')
plt.bar(X + 0.15, data[1], color = '#58508d', width = 0.3,label='boston')
ax = plt.gca()
ax.legend(shadow=True)
plt.xticks(X,(years))

# Show the major grid lines with dark grey lines.
# The on/off flag is passed positionally: the old `b=` keyword was renamed to
# `visible=` and is rejected by current Matplotlib releases.
plt.grid(True, which='major', color='#666666', linestyle='-')

# Show the minor grid lines with very faint and almost transparent grey lines
plt.minorticks_on()
plt.grid(True, which='minor', color='#999999', linestyle='-', alpha=0.2)

plt.show()

In [None]:
# Boston crime totals per calendar month, ordered by month and collected to
# the driver. Note: this rebinds boston_data, which earlier held the per-year rows.
boston_data = (
    dg.groupBy('MONTH')
      .count()
      .sort('MONTH')
      .collect()
)
# Split the rows into two parallel lists for plotting.
boston_month = [row[0] for row in boston_data]
boston_month_count = [row[1] for row in boston_data]
boston_data

In [None]:
from pyspark.sql.functions import month
# Derive the calendar month from the parsed timestamp.
chicago_monthdf = df.withColumn("Month",month('Date_time'))

# Filter on Year BEFORE narrowing to Month, and reference the column on the
# frame actually being filtered. The original selected only "Month" and then
# filtered on chicagocrimes['Year'] - a column of a different DataFrame that
# had already been projected away.
chicago_month_count = (
    chicago_monthdf
    .where(chicago_monthdf['Year'] >= 2015)
    .groupBy("Month")
    .count()
    .collect()
)
chicago_month_count

In [None]:
# Order the collected (Month, count) rows in place; rows compare like tuples,
# so this sorts by month first.
chicago_month_count.sort()
# Split the ordered rows into two parallel lists for plotting.
chicago_month_counts = [row[1] for row in chicago_month_count]
chicago_months = [row[0] for row in chicago_month_count]
chicago_month_count

In [None]:
x_axis = list(boston_month_count)     # Boston counts per month
y_axis = list(chicago_month_counts)   # Chicago counts per month
data = [x_axis, y_axis]

fig = plt.figure(figsize=(10,8))
# NOTE(review): the Chicago series is filtered to Year >= 2015 upstream while
# this title says 2012 to 2019 - confirm which range is intended.
plt.title('Comparing number of crimes per month in Boston and Chicago from 2012 to 2019', fontsize=24)

# Plot against the real month numbers collected with each series, so the
# x-axis reads 1..12 (the original used positions 0..11) and a series that
# lacks a month does not shift the remaining points.
plt.plot(boston_month, data[0], color = 'green',linewidth = 3, label='boston')
plt.plot(chicago_months, data[1], color = 'red',linewidth = 3, label='chicago')
plt.xticks(range(1, 13))
plt.xlabel("month")
plt.ylabel("number of crimes")
ax = plt.gca()
ax.legend(shadow=True)

# Show the major grid lines with dark grey lines.
# The on/off flag is passed positionally: the old `b=` keyword was renamed to
# `visible=` and is rejected by current Matplotlib releases.
plt.grid(True, which='major', color='#666666', linestyle='-')

# Show the minor grid lines with very faint and almost transparent grey lines
plt.minorticks_on()
plt.grid(True, which='minor', color='#999999', linestyle='-', alpha=0.2)

plt.show()

In [None]:
# Fresh list copies of the monthly counts (Boston on x_axis, Chicago on y_axis).
x_axis = list(boston_month_count)
y_axis = list(chicago_month_counts)

In [None]:
# The 20 Chicago location descriptions with the most recorded crimes,
# most frequent first, collected to the driver as (LocationDescription, count) rows.
location_counts = chicagocrimes.groupBy("LocationDescription").count()
top_locations = location_counts.sort('count', ascending=False).limit(20)
crime_location = top_locations.collect()
crime_location

In [None]:
# Split the (LocationDescription, count) rows into two parallel lists.
location = []
count = []
for row in crime_location:
    location.append(row[0])
    count.append(row[1])
location

In [None]:
# A null LocationDescription group would come back as None, which a
# categorical axis cannot draw; give it an explicit label instead.
x_axis = [name if name is not None else "UNKNOWN" for name in location]
y_axis = list(count)

fig = plt.figure(figsize=(20,20))
# Title fixed: the chart shows crime counts per location description, not per
# year, and "Chicago" was misspelled.
fig.suptitle('Chicago Number of Crimes per Location', fontsize=36)

# Horizontal bars: counts run along the x-axis, locations along the y-axis.
plt.xlabel("Number of Crimes", fontsize = 28)
plt.ylabel("Crimes Location", fontsize = 28)
plt.barh(x_axis,y_axis,color = "red")
plt.xticks(size = 24)
plt.yticks(size = 24)
plt.show()

In [None]:
# The 50 most frequent (Lat, Long) pairs in the Boston data, most frequent first.
crime_long_lat = (
    bostoncrimes.groupBy("Lat","Long")
    .count()
    .sort('count', ascending=False)
    .limit(50)
    .collect()
)
# Drop the top group, as the original did.
# NOTE(review): presumably the top group is the missing-coordinate bucket -
# confirm. `del ...[:1]` is used instead of pop(0) so an empty result does not raise.
del crime_long_lat[:1]

# Plain [lat, long, count] triples plus two lists derived from them.
map_location = [[item[0],item[1],item[2]] for item in crime_long_lat]
long_lat = [[item[0],item[1]] for item in map_location]
count = [item[2] for item in map_location]
# The original popped a second element from long_lat only, leaving it one entry
# shorter than - and shifted against - count and map_location. All three lists
# now stay parallel.
map_location

In [None]:
# The 50 most frequent (Latitude, Longitude) pairs in the Chicago data, most frequent first.
_long_lat = (
    chicagocrimes.groupBy("Latitude","Longitude")
    .count()
    .sort('count', ascending=False)
    .limit(50)
    .collect()
)
# Drop the top group, as the original did.
# NOTE(review): presumably the top group is the missing-coordinate bucket -
# confirm. `del ...[:1]` is used instead of pop(0) so an empty result does not raise.
del _long_lat[:1]

# Plain [lat, long, count] triples plus two lists derived from them.
_location = [[item[0],item[1],item[2]] for item in _long_lat]
coordinates = [[item[0],item[1]] for item in _location]
count = [item[2] for item in _location]
# The original popped a second element from coordinates only, leaving it one
# entry shorter than - and shifted against - count and _location. All three
# lists now stay parallel.
_location

In [None]:
import folium
from folium.plugins import HeatMap
def _heat_points(rows):
    """Turn (lat, long, count) rows into [float, float, float] heat-map points.

    Rows with a missing coordinate are skipped, and values are converted to
    float because the Boston schema reads Lat/Long as strings.
    """
    return [
        [float(lat), float(lon), float(weight)]
        for lat, lon, weight in rows
        if lat is not None and lon is not None
    ]


# One map, centred on Boston, with a heat layer per city.
first_map = folium.Map(location=[42.355300, -71.055280], zoom_start=5)

# `max_val` is no longer passed: it was 1.0 (the old default) and newer folium
# releases warn that the parameter is unnecessary. The other arguments that
# were dropped were being passed at their default values.
test = HeatMap(_heat_points(crime_long_lat), min_opacity=0.5, max_zoom=18, radius=25, blur=10).add_to(first_map)
test1 = HeatMap(_heat_points(_long_lat), min_opacity=0.5, max_zoom=18, radius=25, blur=10).add_to(first_map)
first_map

In [None]:
from datetime import datetime
from pyspark.sql.functions import col,udf


def _parse_income_year(value):
    """Parse the income 'Year' string into a datetime.

    The original format '%Y %I' is tried first so any value it accepted still
    parses the same way; a bare year such as '2017', which that format
    rejects, falls back to '%Y'. A null input yields None instead of raising.
    """
    if value is None:
        return None
    text = value.strip()
    try:
        return datetime.strptime(text, '%Y %I')
    except ValueError:
        return datetime.strptime(text, '%Y')


# Note: this rebinds myfunc, which an earlier cell defined for the crime timestamps.
myfunc = udf(_parse_income_year, TimestampType())

In [None]:
# Add a 'Date_time' column, computed by myfunc from 'Year', to both income frames.
db, dc = (
    income.withColumn('Date_time', myfunc(col('Year')))
    for income in (bostonIncome, chicagoIncome)
)
# NOTE(review): ddb gets the same derived column under an empty column name and
# is not used in the cells visible here - confirm whether it is still needed.
ddb = bostonIncome.withColumn('', myfunc(col('Year')))
db

In [None]:
# Boston income summed per year, ordered by year and collected to the driver.
boston_income_data = db.groupBy('Year').sum("Income").sort('Year').collect()

# Expose the frame to Spark SQL as the view "income".
# createOrReplaceTempView replaces the deprecated registerTempTable and can be
# re-run without failing because the view already exists.
db.createOrReplaceTempView("income")

# Split the (Year, sum) rows into two parallel lists.
boston_income_count = [item[1] for item in boston_income_data]
boston_income_year = [item[0] for item in boston_income_data]
# Misspelled name from the original cell, kept so existing references still work.
boston_incoem_year = boston_income_year
boston_income_data

In [None]:
# Chicago income summed per year, ordered by year and collected to the driver.
chicago_income_data = (
    dc.groupBy('Year')
      .sum('Income')
      .sort('Year')
      .collect()
)
# Split the (Year, sum) rows into two parallel lists.
chicago_income_year = [row[0] for row in chicago_income_data]
chicago_income_count = [row[1] for row in chicago_income_data]
chicago_income_data

In [None]:
# Display the per-year Chicago income sums collected above.
chicago_income_count

In [None]:
# Column names of the Chicago income DataFrame, as declared in the shared income schema.
chicagoIncome.columns

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

# Fresh list copies of the yearly income sums (Chicago on x_axis, Boston on y_axis).
x_axis = list(chicago_income_count)
y_axis = list(boston_income_count)
# Cell output: difference between the first Chicago and the first Boston yearly sum.
chicago_income_count[0] - boston_income_count[0]

In [None]:
# data[0] is the Chicago series, data[1] the Boston series.
data = [x_axis,y_axis]

fig = plt.figure(figsize=(18,10))
plt.title('Comparing income per year in Boston and Chicago from 2013 to 2017', fontsize=24)

# Names of group and bar positions
# NOTE(review): the labels are hard-coded; each series must contain exactly
# one value per label, in this order - confirm against the collected years.
names = ['2013','2014','2015','2016','2017']
X = np.arange(len(names))
plt.bar(X - 0.15, data[0], color = '#003f5c', width = 0.3,label='chicago')
plt.bar(X + 0.15, data[1], color = '#58508d', width = 0.3,label='boston')

ax = plt.gca()
# Start the axis at zero and leave 10% headroom above the tallest bar of
# EITHER city. The original limits were derived from differences between the
# two series, which clips the Boston bars (and gives a negative lower bound)
# whenever Boston's values exceed Chicago's.
tallest = max(max(data[0]), max(data[1]))
ax.set_ylim(0, tallest * 1.1)
ax.legend(shadow=True)

# Custom X axis
plt.xticks(X, names, fontweight='bold')
plt.xlabel("year")
plt.ylabel("income per year")

# Show the major grid lines with dark grey lines.
# The on/off flag is passed positionally: the old `b=` keyword was renamed to
# `visible=` and is rejected by current Matplotlib releases.
plt.grid(True, which='major', color='#666666', linestyle='-')

# Show the minor grid lines with very faint and almost transparent grey lines
plt.minorticks_on()
plt.grid(True, which='minor', color='#999999', linestyle='-', alpha=0.2)


# Show graphic
plt.show()