In [1]:
from pyspark import SparkContext, RDD
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StringType
from itertools import chain
import csv
import pandas as pd
import numpy as numpy
import json
import string

In [2]:
# Init PySpark context
# Build (or reuse) the session first and take the SparkContext from it.
# Calling SparkContext() directly fails when a context is already active
# (e.g. when this cell is re-run), and a context created before the builder
# runs does not pick up the application name configured here.
spark = (
    SparkSession.builder
    .appName("final_p3")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)
sc = spark.sparkContext

In [3]:
# Get Datasets (local version)
# Each file is named "<prefix><label>.csv": one label per year for 2004-2009,
# plus a single label for the file covering 2010 onwards.
dataset_prefix = "311_Service_Requests_"
dataset_label = ["for_%d" % year for year in range(2004, 2010)]
dataset_label.append("from_2010_to_Present")
dataset_names = ["".join((dataset_prefix, label, ".csv")) for label in dataset_label]

In [7]:
# Import Zipcode-Borough maps
# The zip map is relatively small (7kb), so it is read into a plain dict here.
# NOTE(review): the original note said a separate copy is stored in each node;
# this cell itself only builds the in-memory dict - confirm how it is shipped.

zip_borough_dir = "zip_borough.csv" # relatively small: 7kb
zip_borough_dict = {}
with open(zip_borough_dir, newline='') as zip_borough_csv:
    # Field names are passed explicitly, so the first line of the file is
    # read as data rather than as a header.
    zip_reader = csv.DictReader(zip_borough_csv, ['zip', 'borough'])
    for row in zip_reader:
        borough = row['borough']
        if borough is None:
            # DictReader fills missing fields with None; a row without a
            # borough column would otherwise crash on .upper().
            continue
        if borough == "Staten":
            # The file abbreviates this borough; expand it to the full name.
            zip_borough_dict[row['zip']] = "STATEN ISLAND"
        else:
            zip_borough_dict[row['zip']] = borough.upper()
# Sentinel entry so that a literal 'null' zip resolves to 'Unspecified'.
zip_borough_dict['null'] = 'Unspecified'
# test
print(json.dumps(zip_borough_dict))

{"10001": "MANHATTAN", "10451": "BRONX", "10002": "MANHATTAN", "10452": "BRONX", "10003": "MANHATTAN", "10453": "BRONX", "10004": "MANHATTAN", "10454": "BRONX", "10005": "MANHATTAN", "10455": "BRONX", "10006": "MANHATTAN", "10456": "BRONX", "10007": "MANHATTAN", "10457": "BRONX", "10009": "MANHATTAN", "10458": "BRONX", "10010": "MANHATTAN", "10459": "BRONX", "10011": "MANHATTAN", "10460": "BRONX", "10012": "MANHATTAN", "10461": "BRONX", "10013": "MANHATTAN", "10462": "BRONX", "10014": "MANHATTAN", "10463": "BRONX", "10015": "MANHATTAN", "10464": "BRONX", "10016": "MANHATTAN", "10465": "BRONX", "10017": "MANHATTAN", "10466": "BRONX", "10018": "MANHATTAN", "10467": "BRONX", "10019": "MANHATTAN", "10468": "BRONX", "10020": "MANHATTAN", "10469": "BRONX", "10021": "MANHATTAN", "10470": "BRONX", "10022": "MANHATTAN", "10471": "BRONX", "10023": "MANHATTAN", "10472": "BRONX", "10024": "MANHATTAN", "10473": "BRONX", "10025": "MANHATTAN", "10474": "BRONX", "10026": "MANHATTAN", "10475": "BRONX",

In [5]:
# Load main datasets
# df_full_list holds every yearly file with all of its columns;
# df_part_list holds the same files narrowed to the columns in part_columns.
part_columns = ["Created Date", "Complaint Type", "Incident Zip", "City", "Borough"]
df_full_list, df_part_list = [], []

for dataset in dataset_names:
    # header=True: the first line of each CSV supplies the column names.
    df = spark.read.csv(dataset, header=True)
    df_full_list.append(df)
    df_part = df.select(*part_columns)
    df_part_list.append(df_part)

# test
df_part_list[0].show(50)


+--------------------+--------------------+------------+-------------+-------------+
|        Created Date|      Complaint Type|Incident Zip|         City|      Borough|
+--------------------+--------------------+------------+-------------+-------------+
|04/02/2004 12:00:...|DCA / DOH New Lic...|        null|         null|  Unspecified|
|04/02/2004 12:00:...|DCA / DOH New Lic...|        null|         null|  Unspecified|
|04/06/2004 12:00:...|Street Sign - Dam...|       10306|STATEN ISLAND|STATEN ISLAND|
|04/06/2004 12:00:...|      Taxi Complaint|        null|     BROOKLYN|     BROOKLYN|
|04/06/2004 12:00:...|    Street Condition|        null|         null|     BROOKLYN|
|04/06/2004 12:00:...|Street Sign - Dam...|       11207|     BROOKLYN|     BROOKLYN|
|09/01/2004 12:00:...|Street Sign - Dam...|        null|         null|STATEN ISLAND|
|04/06/2004 12:00:...|    Street Condition|        null|         null|     BROOKLYN|
|04/09/2004 12:00:...|DCA / DOH New Lic...|        null|         

In [15]:
# Try to fill the borough column of each cell
# Builds a literal Spark map column (zip -> borough) from zip_borough_dict and
# adds a "Borough2" column to every frame in df_part_list:
#   - rows whose Borough is 'Unspecified' and that have a zip take the borough
#     looked up from the zip;
#   - all other rows keep their original Borough.
zip_brgh_map = create_map([lit(x) for x in chain(*zip_borough_dict.items())])

# Explicit NULL test: comparing a NULL zip with != yields NULL, not True/False.
needs_fill = col('Incident Zip').isNotNull() & (col('Borough') == 'Unspecified')
# A zip that is absent from the map looks up as NULL; fall back to the
# original Borough so such rows are not turned into NULL.
borough_from_zip = coalesce(zip_brgh_map[col('Incident Zip')], col('Borough'))

# Rebuild the list: withColumn returns a new DataFrame, so assigning to the
# loop variable alone would throw the result away.
df_part_list = [
    df.withColumn("Borough2", when(needs_fill, borough_from_zip).otherwise(col('Borough')))
    for df in df_part_list
]

+--------------------+--------------------+------------+-------------+-------------+-------------+
|        Created Date|      Complaint Type|Incident Zip|         City|      Borough|     Borough2|
+--------------------+--------------------+------------+-------------+-------------+-------------+
|04/02/2004 12:00:...|DCA / DOH New Lic...|        null|         null|  Unspecified|  Unspecified|
|04/02/2004 12:00:...|DCA / DOH New Lic...|        null|         null|  Unspecified|  Unspecified|
|04/06/2004 12:00:...|Street Sign - Dam...|       10306|STATEN ISLAND|STATEN ISLAND|STATEN ISLAND|
|04/06/2004 12:00:...|      Taxi Complaint|        null|     BROOKLYN|     BROOKLYN|     BROOKLYN|
|04/06/2004 12:00:...|    Street Condition|        null|         null|     BROOKLYN|     BROOKLYN|
|04/06/2004 12:00:...|Street Sign - Dam...|       11207|     BROOKLYN|     BROOKLYN|     BROOKLYN|
|09/01/2004 12:00:...|Street Sign - Dam...|        null|         null|STATEN ISLAND|STATEN ISLAND|
|04/06/200

In [24]:
# Aggregate values
# For every yearly frame, collect the five most frequent complaint types in
# each borough. Result shape:
#   df_agg_dic[year][borough_code] -> list of up to 5 Rows
#   (Borough, Complaint Type, count), highest count first.
# The last entry of `years` (2010) is paired with the last frame in
# df_part_list.
# NOTE(review): this counts the raw 'Borough' column, not the zip-filled
# 'Borough2' column - confirm which one is intended.
years = [2004, 2005, 2006, 2007, 2008, 2009, 2010]
borough_codes = {
    "bk": "BROOKLYN",
    "ma": "MANHATTAN",
    "qu": "QUEENS",
    "br": "BRONX",
    "si": "STATEN ISLAND",
}
df_agg_dic = {}
for df, year in zip(df_part_list, years):
    # Group once per year and cache the (much smaller) counts table, so the
    # source CSV is scanned once instead of once per borough.
    complaint_counts = df.groupBy('Borough', 'Complaint Type').count().cache()
    try:
        df_agg_dic[year] = {
            code: complaint_counts
            .filter(col('Borough') == borough)
            .orderBy('count', ascending=False)
            .take(5)
            for code, borough in borough_codes.items()
        }
    finally:
        # Release the cached counts even if the cell is interrupted.
        complaint_counts.unpersist()

# Test
print(df_agg_dic[2004])

KeyboardInterrupt: 