### MAST30034: Applied Data Science Project 1
---
# Preprocessing Part 2: Aggregating Data by MMWR Week
#### Xavier Travers (1178369)

Aggregate all the data by MMWR week (defined [here](https://ndc.services.cdc.gov/wp-content/uploads/MMWR_Week_overview.pdf)).
This means counting trips to and from each of the boroughs per week.
This is done for each of the taxi types.

In [1]:
# imports used throughout this notebook
from pyspark.sql import DataFrame
from pyspark.sql import functions as F
import os
import sys
import re
from itertools import chain

# add homemade helpers
sys.path.insert(1, '../scripts')
import helpers.aggregation_helpers as ah
import helpers.join_helpers as jh

# flag for debug printouts
# NOTE(review): not referenced by any cell shown in this notebook — confirm
# whether it is still needed
DEBUGGING = True

In [2]:
from pyspark.sql import SparkSession

# Create a spark session (which will run spark jobs).
# Every later cell reads and writes data through this `spark` object.
spark = (
    SparkSession.builder.appName('MAST30034 XT Project 1')
    # use UTC as the session time zone
    .config("spark.sql.session.timeZone", "Etc/UTC")
    # eager evaluation: a DataFrame left as the last expression of a cell
    # is rendered as a table
    .config('spark.sql.repl.eagerEval.enabled', True) 
    # NOTE(review): parquet metadata caching option — confirm it still has
    # an effect on the Spark version in use
    .config('spark.sql.parquet.cacheMetadata', 'true')
    # memory limits for the executor and driver processes
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

22/08/13 00:13:47 WARN Utils: Your hostname, Polaris resolves to a loopback address: 127.0.1.1; using 172.26.233.226 instead (on interface eth0)
22/08/13 00:13:47 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/13 00:13:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/08/13 00:13:49 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


### 1. Aggregating the TLC dataset

In [3]:
# taxi types covered by this notebook (only the yellow dataset is read below)
# NOTE(review): not referenced by any cell shown here — confirm it is still needed
TLC_NAMES = ['yellow']

In [4]:
# read in the cleaned yellow dataset
tlc_df = spark.read.parquet('../data/curated/tlc/cleaned/yellow')
# report the number of trips (this runs a count over the whole dataset)
print(f'{tlc_df.count()} ROWS')
# preview a few rows; as the last expression this is rendered as a table
tlc_df.limit(5)

                                                                                

107981362 ROWS


                                                                                

day,week_index,cdc_week,date,passengers,trip_distance,pu_location_id,do_location_id,hours_elapsed,pu_borough,do_borough,year,month
17,12,12,03/17/2019,1.0,1.08,144,231,0.1594444444444444,Manhattan,Manhattan,2019,3
5,10,10,03/05/2019,1.0,0.6,186,68,0.1677777777777777,Manhattan,Manhattan,2019,3
31,14,14,03/31/2019,3.0,1.3,211,79,0.1275,Manhattan,Manhattan,2019,3
5,10,10,03/05/2019,1.0,9.27,138,80,0.9866666666666668,Queens,Brooklyn,2019,3
17,12,12,03/17/2019,1.0,1.61,125,249,0.2036111111111111,Manhattan,Manhattan,2019,3


In [5]:
# Every TLC aggregation is keyed on the week index ...
TLC_COMMON_GROUP_COLUMNS = ['week_index']

# ... plus the borough at one end of the trip (pick-up or drop-off).
TLC_GROUP_BY_PU_COLUMNS = [*TLC_COMMON_GROUP_COLUMNS, 'pu_borough']
TLC_GROUP_BY_DO_COLUMNS = [*TLC_COMMON_GROUP_COLUMNS, 'do_borough']

# Maps each source column to the aggregations handed to
# ah.group_and_aggregate. '*' requests a row count (the number of trips);
# the three trip measures each get a total, a daily average and an average.
TLC_AGGREGATE_COLUMNS = {
    '*': ['count'],
    **{
        column: ['total', 'daily_average', 'average']
        for column in ('passengers', 'trip_distance', 'hours_elapsed')
    },
}

#### Group by pick-up location

In [6]:
# Aggregate the trips per week and pick-up borough.
# The aggregated rows are collected and rebuilt into a fresh DataFrame to
# force the result into memory; otherwise writing parquets results in a
# java executor out of memory error.
tlc_by_pu_df = spark.createDataFrame(
    ah.group_and_aggregate(
        tlc_df,
        TLC_GROUP_BY_PU_COLUMNS,
        TLC_AGGREGATE_COLUMNS,
    ).collect()
)

                                                                                

In [7]:
# sanity check: preview the pick-up aggregates with the lowest week_index
tlc_by_pu_df.orderBy('week_index').limit(5)

week_index,pu_borough,num_*,tot_passengers,daily_avg_passengers,avg_passengers,tot_trip_distance,daily_avg_trip_distance,avg_trip_distance,tot_hours_elapsed,daily_avg_hours_elapsed,avg_hours_elapsed
1,Manhattan,959765,1558724.0,222674.85714285716,1.6240683917417285,2196464.49000007,313780.64142858144,2.288544060264825,246687.7069444424,35241.1009920632,0.2570292800263006
1,Staten Island,46,54.0,7.714285714285714,1.173913043478261,717.2500000000001,102.46428571428574,15.592391304347828,26.336111111111105,3.7623015873015855,0.5725241545893719
1,Queens,73277,120878.0,17268.285714285714,1.6496035590976703,847635.0500000104,121090.72142857291,11.56754575105436,39884.57972222228,5697.797103174612,0.5442987529814578
1,Brooklyn,13995,21683.0,3097.5714285714284,1.549339049660593,64314.69999999993,9187.814285714276,4.595548410146476,4998.995277777792,714.1421825396845,0.3571986622206353
1,Bronx,2403,3417.0,488.1428571428572,1.4219725343320848,15782.659999999983,2254.665714285712,6.567898460258004,877.7744444444457,125.3963492063494,0.3652827484163319


In [8]:
# sanity check: the week/borough groups with the longest average trip distance
tlc_by_pu_df.orderBy(F.desc('avg_trip_distance')).limit(5)

week_index,pu_borough,num_*,tot_passengers,daily_avg_passengers,avg_passengers,tot_trip_distance,daily_avg_trip_distance,avg_trip_distance,tot_hours_elapsed,daily_avg_hours_elapsed,avg_hours_elapsed
82,Staten Island,21,21.0,3.0,1.0,763.6599999999999,109.09428571428568,36.3647619047619,18.80888888888889,2.6869841269841275,0.8956613756613757
83,Staten Island,29,32.0,4.571428571428571,1.103448275862069,962.67,137.52428571428572,33.195517241379314,23.1475,3.306785714285714,0.7981896551724138
90,Staten Island,37,37.0,5.285714285714286,1.0,1218.4499999999998,174.0642857142857,32.931081081081075,31.63388888888889,4.519126984126984,0.85496996996997
88,Staten Island,31,31.0,4.428571428571429,1.0,1007.67,143.95285714285714,32.505483870967744,24.21388888888889,3.4591269841269847,0.7810931899641578
107,Staten Island,56,57.0,8.142857142857142,1.0178571428571428,1811.5,258.7857142857143,32.348214285714285,50.38972222222223,7.198531746031748,0.8998164682539684


In [9]:
# save the pick-up aggregates, replacing any previous output at this path
tlc_by_pu_df.write.parquet(
    '../data/curated/tlc/aggregated/yellow/by_pu',
    mode='overwrite',
)

#### Group by drop-off location

In [10]:
# Aggregate the trips per week and drop-off borough.
# The aggregated rows are collected and rebuilt into a fresh DataFrame to
# force the result into memory; otherwise writing parquets results in a
# java executor out of memory error.
tlc_by_do_df = spark.createDataFrame(
    ah.group_and_aggregate(
        tlc_df,
        TLC_GROUP_BY_DO_COLUMNS,
        TLC_AGGREGATE_COLUMNS,
    ).collect()
)

                                                                                

In [11]:
# sanity check: preview the drop-off aggregates with the lowest week_index
tlc_by_do_df.orderBy('week_index').limit(5)

week_index,do_borough,num_*,tot_passengers,daily_avg_passengers,avg_passengers,tot_trip_distance,daily_avg_trip_distance,avg_trip_distance,tot_hours_elapsed,daily_avg_hours_elapsed,avg_hours_elapsed
1,Manhattan,942556,1532085.0,218869.2857142857,1.625457797732973,2226908.010000055,318129.7157142936,2.3626267404802,239174.98249999524,34167.854642856466,0.2537514826705206
1,Staten Island,351,531.0,75.85714285714286,1.5128205128205128,7289.039999999997,1041.2914285714282,20.76649572649572,325.4125,46.4875,0.9271011396011396
1,Queens,50336,82381.0,11768.714285714286,1.6366219008264462,435504.3900000012,62214.91285714303,8.651946718054695,24226.107499999995,3460.872499999999,0.4812878953432929
1,Brooklyn,46996,75273.0,10753.285714285714,1.601689505489829,367896.9299999991,52556.70428571416,7.828260490254471,23923.985555555464,3417.712222222209,0.509064293887894
1,Bronx,9247,14486.0,2069.428571428572,1.5665621282578133,87315.77999999968,12473.68285714281,9.44260625067586,4824.904444444457,689.2720634920653,0.5217805174050456


In [12]:
# sanity check: the week/borough groups with the longest average trip distance
tlc_by_do_df.orderBy(F.desc('avg_trip_distance')).limit(5)

week_index,do_borough,num_*,tot_passengers,daily_avg_passengers,avg_passengers,tot_trip_distance,daily_avg_trip_distance,avg_trip_distance,tot_hours_elapsed,daily_avg_hours_elapsed,avg_hours_elapsed
82,Staten Island,62,85.0,12.142857142857142,1.3709677419354838,1672.9699999999996,238.99571428571423,26.983387096774187,54.57611111111112,7.796587301587302,0.8802598566308245
109,Staten Island,105,132.0,18.857142857142858,1.2571428571428571,2774.84,396.4057142857142,26.427047619047617,80.17805555555556,11.454007936507937,0.7636005291005291
111,Staten Island,118,149.0,21.285714285714285,1.2627118644067796,3098.180000000001,442.597142857143,26.255762711864417,108.19055555555556,15.45579365079365,0.9168691148775894
108,Staten Island,114,148.0,21.142857142857142,1.2982456140350878,2988.390000000001,426.91285714285726,26.21394736842106,85.87972222222224,12.26853174603175,0.75333089668616
90,Staten Island,75,92.0,13.142857142857142,1.2266666666666666,1948.92,278.4171428571428,25.985599999999994,57.99083333333332,8.28440476190476,0.773211111111111


In [13]:
# save the drop-off aggregates, replacing any previous output at this path
# NOTE(review): the directory is named 'by_du' while the pick-up output uses
# 'by_pu' — 'by_do' may have been intended; check downstream readers before
# renaming
tlc_by_do_df.write.parquet(
    '../data/curated/tlc/aggregated/yellow/by_du',
    mode='overwrite',
)

### 2. Aggregating the COVID dataset

In [14]:
# read in the cleaned covid dataset (daily cases, as the path indicates)
covid_df = spark.read.parquet('../data/curated/virals/covid/cleaned/cases-by-day')
# preview a few rows; as the last expression this is rendered as a table
covid_df.limit(5)

date,year,cdc_week,week_index,cases,deaths,hospitalised,borough
02/29/2020,2020,9,61,0,0,1,Brooklyn
03/01/2020,2020,10,62,0,0,0,Brooklyn
03/02/2020,2020,10,62,0,0,2,Brooklyn
03/03/2020,2020,10,62,0,0,3,Brooklyn
03/04/2020,2020,10,62,1,0,1,Brooklyn


In [15]:
# Group the daily rows by week and borough; year and cdc_week are part of
# the key as well as week_index.
COVID_GROUP_COLUMNS = ['year', 'cdc_week', 'week_index', 'borough']

# Each daily measure gets a weekly total and a daily average.
COVID_AGGREGATE_COLUMNS = {
    column: ['total', 'daily_average']
    for column in ('cases', 'deaths', 'hospitalised')
}

In [16]:
# Aggregate the daily covid rows per week and borough.
# The aggregated rows are collected and rebuilt into a fresh DataFrame to
# force the result into memory; otherwise writing parquets results in a
# java executor out of memory error.
# NOTE: this rebinds covid_df from the daily data to the weekly aggregate,
# so re-run the cell that reads the covid dataset before re-running this one.
covid_df = spark.createDataFrame(
    ah.group_and_aggregate(
        covid_df,
        COVID_GROUP_COLUMNS,
        COVID_AGGREGATE_COLUMNS,
    ).collect()
)

In [17]:
# sanity check: preview the weekly covid aggregates with the lowest week_index
covid_df.orderBy('week_index').limit(5)

year,cdc_week,week_index,borough,tot_cases,daily_avg_cases,tot_deaths,daily_avg_deaths,tot_hospitalised,daily_avg_hospitalised
2020,9,61,Brooklyn,0.0,0.0,0.0,0.0,1.0,0.1428571428571428
2020,9,61,Queens,0.0,0.0,0.0,0.0,0.0,0.0
2020,9,61,Bronx,0.0,0.0,0.0,0.0,0.0,0.0
2020,9,61,Manhattan,1.0,0.1428571428571428,0.0,0.0,0.0,0.0
2020,9,61,Staten Island,0.0,0.0,0.0,0.0,0.0,0.0
2020,10,62,Brooklyn,7.0,1.0,0.0,0.0,13.0,1.8571428571428568
2020,10,62,Queens,7.0,1.0,0.0,0.0,12.0,1.7142857142857142
2020,10,62,Bronx,2.0,0.2857142857142857,0.0,0.0,5.0,0.7142857142857143
2020,10,62,Manhattan,6.0,0.8571428571428571,0.0,0.0,10.0,1.4285714285714286
2020,10,62,Staten Island,2.0,0.2857142857142857,0.0,0.0,2.0,0.2857142857142857


In [18]:
# save the weekly covid aggregates, replacing any previous output at this path
covid_df.write.parquet(
    '../data/curated/virals/covid/aggregated/cases-by-week',
    mode='overwrite',
)

### 3. Aggregating the Flu dataset
*The flu dataset is already grouped by MMWR week, so only daily_averages can be calculated*

In [19]:
# read in the flu dataset
flu_df = spark.read.parquet('../data/curated/virals/flu/cleaned/cases-by-week')
flu_df.limit(5)

year,month,day,week_index,cdc_week,date,borough,disease,cases
2019,2,16,7,7,02/16/2019,Staten Island,INFLUENZA_UNSPECI...,0
2021,12,25,156,51,12/25/2021,Queens,INFLUENZA_A,582
2020,12,12,102,50,12/12/2020,Staten Island,INFLUENZA_B,3
2019,10,19,42,42,10/19/2019,Queens,INFLUENZA_A,6
2019,10,12,41,41,10/12/2019,Manhattan,INFLUENZA_A,3


In [20]:
# the flu counts are weekly, so the daily average is the weekly count
# spread over the 7 days of the week
flu_df = flu_df.withColumn('daily_avg_cases', F.col('cases') / 7)

In [21]:
# save the flu data with its daily average, replacing any previous output
flu_df.write.parquet(
    '../data/curated/virals/flu/aggregated/cases-by-week',
    mode='overwrite',
)