In [1]:
from pyspark import SparkContext
sc = SparkContext()

In [2]:
myRDD = sc.textFile("hdfs://wolf.analytics.private/user/slx4192/data/crime/Crimes_-_2001_to_present.csv") 
header = myRDD.first()
data = myRDD.filter(lambda row: row != header)

In [3]:
header.split(",")

['ID',
 'Case Number',
 'Date',
 'Block',
 'IUCR',
 'Primary Type',
 'Description',
 'Location Description',
 'Arrest',
 'Domestic',
 'Beat',
 'District',
 'Ward',
 'Community Area',
 'FBI Code',
 'X Coordinate',
 'Y Coordinate',
 'Year',
 'Updated On',
 'Latitude',
 'Longitude',
 'Location']

In [4]:
splitted_data = data.map(lambda line: line.split(","))

In [5]:
splitted_data.take(1)

[['11034701',
  'JA366925',
  '01/01/2001 11:00:00 AM',
  '016XX E 86TH PL',
  '1153',
  'DECEPTIVE PRACTICE',
  'FINANCIAL IDENTITY THEFT OVER $ 300',
  'RESIDENCE',
  'false',
  'false',
  '0412',
  '004',
  '8',
  '45',
  '11',
  '',
  '',
  '2001',
  '08/05/2017 03:50:08 PM',
  '',
  '',
  '']]

## Question 2:
By using plain Spark (RDDs): 
* (1) find the top 10 blocks in crime events in the last 3 years; 

In [6]:
top_10_blocks_last_3_years = splitted_data\
    .map(lambda x: (x[3][0:5], x[17]))\
    .filter(lambda x:  x[1] in ["2019","2018","2017"])\
    .map(lambda x: (x[0], 1))\
    .reduceByKey(lambda a, b: a+b)\
    .sortBy(lambda x: -x[1]).take(10)

In [7]:
for i in range(len(top_10_blocks_last_3_years)):
    print(str(top_10_blocks_last_3_years[i]) + "\n")

('0000X', 29735)

('001XX', 21553)

('002XX', 16943)

('003XX', 15281)

('008XX', 14520)

('015XX', 14150)

('006XX', 13384)

('007XX', 13255)

('014XX', 13218)

('011XX', 13165)



* (2) find the two beats that are adjacent with the highest correlation in the number of crime events (this will require you looking at the map to determine if the correlated beats are adjacent to each other) over the last 5 years 

In [80]:
beat_as_key = splitted_data\
    .map(lambda x: (x[10], x[17]))\
    .filter(lambda x:  x[1] in ["2019","2018","2017","2016","2015"])\
    .map(lambda x: (x[0]+"-"+x[1],1))\
    .reduceByKey(lambda a, b: a+b)\
    .map(lambda x: (x[0][0:4],x[0][5:], x[1]))\
    .sortBy(lambda x: (x[0], x[1]))\
    .map(lambda x: (x[0], [x[2]]))\
    .reduceByKey(lambda a,b: a+b)
beat_as_key.take(5)

[('0222', [977, 1038, 1154, 1026, 1011]),
 ('0323', [1257, 1207, 1199, 1183, 1073]),
 ('0332', [1201, 1125, 1255, 1240, 1197]),
 ('0412', [1169, 1159, 1080, 1002, 1032]),
 ('0414', [1736, 1570, 1536, 1482, 1512])]

In [81]:
dict = beat_as_key.collectAsMap()
import numpy as np
cor_m = np.corrcoef(list(dict.values()))
cor_m

array([[ 1.        ,  0.01643896,  0.43276865, ..., -0.13230473,
         0.99711788, -0.78387219],
       [ 0.01643896,  1.        , -0.03872042, ...,  0.16980257,
        -0.01132901,  0.01257553],
       [ 0.43276865, -0.03872042,  1.        , ..., -0.93076138,
         0.47903001, -0.06663072],
       ...,
       [-0.13230473,  0.16980257, -0.93076138, ...,  1.        ,
        -0.19425822, -0.09288897],
       [ 0.99711788, -0.01132901,  0.47903001, ..., -0.19425822,
         1.        , -0.79291696],
       [-0.78387219,  0.01257553, -0.06663072, ..., -0.09288897,
        -0.79291696,  1.        ]])

In [82]:
beat = list(dict.keys())
results = []
for j in range(len(beat)):
    for i in range(len(beat)):
        if i > j:
            results.append([cor_m[j][i],beat[j],beat[i]])
results.sort(key = lambda x: abs(x[0]),reverse = True)

In [83]:
print('\n'.join(map(str, results[0:50]))) 

[0.9990217951872251, '0932', '1712']
[-0.9989663097085922, '0722', '0524']
[-0.9989584071519785, '0423', '2233']
[-0.9984745766683734, '0823', '0114']
[0.9983682879794051, '1833', '0112']
[-0.9981285880478877, '0322', '0121']
[-0.9979341284804639, '0724', '1023']
[-0.9978705580636585, '1224', '0334']
[0.9977500996044635, '2012', '2022']
[-0.9975881001567053, '1234', '1114']
[-0.9975015396744252, '0631', '1712']
[0.9971178756977078, '0222', '2424']
[0.9969888556541437, '0323', '1633']
[-0.9969707138179038, '0133', '1935']
[-0.9968407957234459, '0932', '0631']
[0.9967070577227053, '0913', '0432']
[-0.9966681789381544, '1233', '0321']
[0.9966563036077083, '1134', '1624']
[-0.9966087733181834, '0622', '0522']
[0.996501504317359, '0726', '0522']
[0.9964135922410454, '1653', '0232']
[0.9963289779881659, '0123', '1232']
[0.9963015026442582, '0214', '1434']
[0.9960817471423913, '1231', '1922']
[0.9958957346998863, '0114', '1822']
[0.9957097166358225, '1125', '1511']
[-0.9956998887399396, '1233

* (3) establish if the number of crime events is different between Majors Daly and Emanuel at a granularity of your choice (not only at the city level). Find an explanation of results. (20 pts)

    * Rahm Emanuel (2011–2019):     - use 2015,2016,2017,2018,2019 for this problem 

    * Richard M. Daley (1989–2011): - use 2006,2007,2008,2009,2010 for this problem

In [17]:
Emanuel_year = ['2015','2016','2017','2018','2019']
Daley_year = ['2006','2007','2008','2009','2010']

# the average number of arrest in block 0000X during the last five years when Emanuel was in charge
Emanuel_avg = splitted_data\
    .map(lambda x: (x[3][0:5], x[8],x[17]))\
    .filter(lambda x: x[1] =='true')\
    .filter(lambda x:x[0]=="0000X")\
    .filter(lambda x:  x[2] in Emanuel_year)\
    .map(lambda x: (x[0], 1))\
    .reduceByKey(lambda a, b: a+b)\
    .map(lambda x: x[1]/60).take(1)
print(Emanuel_avg)
#[191.35]
    
# the average number of arrest in block 0000X during the last five years when Daley was in charge
Daley_avg = splitted_data\
    .map(lambda x: (x[3][0:5], x[8],x[17]))\
    .filter(lambda x: x[1] =='true')\
    .filter(lambda x:x[0]=="0000X")\
    .filter(lambda x:  x[2] in Daley_year)\
    .map(lambda x: (x[0], 1))\
    .reduceByKey(lambda a, b: a+b)\
    .map(lambda x: x[1]/60).take(1)
print(Daley_avg)
# [284.65]

[191.35]
[284.65]
