### MAST30034: Applied Data Science Project 1
---
# Preprocessing Part 2: Aggregating Data by MMWR Week
#### Xavier Travers (1178369)

Aggregate all the data by MMWR week (defined [here](https://ndc.services.cdc.gov/wp-content/uploads/MMWR_Week_overview.pdf)).
This means counting trips to and from each of the boroughs per MMWR week.
This is done for each of the taxi types.

In [1]:
# imports used throughout this notebook
from pyspark.sql import DataFrame
from pyspark.sql import functions as F
import os
import sys
import re
from itertools import chain

# add homemade helpers
sys.path.insert(1, '../scripts')
import helpers.aggregation_helpers as ah
import helpers.join_helpers as jh

# Toggle for the optional sanity-check printouts guarded by
# `if INTERMEDIATE_OUTPUTS:` in later cells.
# Used for saving time (if you don't want sanity-check printouts)
INTERMEDIATE_OUTPUTS = False

In [2]:
from pyspark.sql import SparkSession

# Session options, applied to the builder in exactly this order
_spark_options = [
    ("spark.sql.session.timeZone", "Etc/UTC"),
    ('spark.sql.repl.eagerEval.enabled', True),
    ('spark.sql.parquet.cacheMetadata', 'true'),
    ("spark.executor.memory", "2g"),
    ("spark.driver.memory", "4g"),
]

_spark_builder = SparkSession.builder.appName('MAST30034 XT Project 1')
for _option_key, _option_value in _spark_options:
    _spark_builder = _spark_builder.config(_option_key, _option_value)

# Create (or reuse) the spark session which will run the spark jobs
spark = _spark_builder.getOrCreate()

22/08/13 18:53:46 WARN Utils: Your hostname, Polaris resolves to a loopback address: 127.0.1.1; using 172.26.235.73 instead (on interface eth0)
22/08/13 18:53:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/13 18:53:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/08/13 18:53:48 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


### 1. Aggregating the TLC dataset

In [3]:
# Names of the TLC taxi datasets handled here (only 'yellow' is listed).
# NOTE(review): not referenced by any other cell visible in this notebook.
TLC_NAMES = ['yellow']

In [4]:
# read in the cleaned yellow dataset
tlc_df = spark.read.parquet('../data/curated/tlc/cleaned/yellow')

# optional sanity check: row count plus a 5-row preview
if INTERMEDIATE_OUTPUTS:
    print(f'{tlc_df.count()} ROWS')
    # an expression nested inside an `if` is not auto-rendered by the notebook
    # (only a top-level last expression is), so display() is called explicitly
    display(tlc_df.limit(5))

In [5]:
# Grouping columns shared by the pick-up and the drop-off aggregations
TLC_COMMON_GROUP_COLUMNS = ['week_year', 'week_month', 'week_ending', 'timeline']

# Each grouping adds the borough column for its end of the trip
TLC_GROUP_BY_PU_COLUMNS = [*TLC_COMMON_GROUP_COLUMNS, 'pu_borough']
TLC_GROUP_BY_DO_COLUMNS = [*TLC_COMMON_GROUP_COLUMNS, 'do_borough']

# Maps each column to the list of aggregations requested for it; this dict is
# handed to ah.group_and_aggregate, which defines what each aggregation name does.
# NOTE(review): '*' -> 'count' presumably yields the number of trips - confirm in the helper.
TLC_AGGREGATE_COLUMNS = {
    '*': ['count'],
    **{
        measure: ['total', 'daily_average', 'average']
        for measure in ('passengers', 'trip_distance', 'hours_elapsed')
    },
}

#### Group by pick-up location

In [6]:
# aggregate the TLC trips over the pick-up grouping columns
# (the aggregation itself is implemented in helpers.aggregation_helpers)
tlc_by_pu_df = ah.group_and_aggregate(
    tlc_df,
    TLC_GROUP_BY_PU_COLUMNS,
    TLC_AGGREGATE_COLUMNS
)

# force this into memory
# otherwise writing parquets results in a java executor out of memory error.
# The aggregated schema is passed along explicitly so that column types are
# kept as-is (instead of being re-inferred from the collected rows) and an
# empty result does not fail schema inference.
tlc_by_pu_df = spark.createDataFrame(
    tlc_by_pu_df.collect(),
    schema=tlc_by_pu_df.schema
)

                                                                                

In [7]:
# optional sanity check: first 5 rows when ordered by week_index
if INTERMEDIATE_OUTPUTS:
    # an expression nested inside an `if` is not auto-rendered by the notebook,
    # so display() is called explicitly
    display(tlc_by_pu_df.sort('week_index').limit(5))

In [8]:
# optional sanity check: the 5 rows with the largest avg_trip_distance
if INTERMEDIATE_OUTPUTS:
    # an expression nested inside an `if` is not auto-rendered by the notebook,
    # so display() is called explicitly
    display(tlc_by_pu_df.sort('avg_trip_distance', ascending=False).limit(5))

In [9]:
# save the pick-up aggregation as parquet, overwriting any existing output at this path
tlc_by_pu_df.write.mode('overwrite').parquet('../data/curated/tlc/aggregated/yellow/by_pu')

                                                                                

#### Group by drop-off location

In [10]:
# aggregate the TLC trips over the drop-off grouping columns
# (the aggregation itself is implemented in helpers.aggregation_helpers)
tlc_by_do_df = ah.group_and_aggregate(
    tlc_df,
    TLC_GROUP_BY_DO_COLUMNS,
    TLC_AGGREGATE_COLUMNS
)

# force this into memory
# otherwise writing parquets results in a java executor out of memory error.
# The aggregated schema is passed along explicitly so that column types are
# kept as-is (instead of being re-inferred from the collected rows) and an
# empty result does not fail schema inference.
tlc_by_do_df = spark.createDataFrame(
    tlc_by_do_df.collect(),
    schema=tlc_by_do_df.schema
)

                                                                                

In [11]:
# optional sanity check: first 5 rows when ordered by week_index
if INTERMEDIATE_OUTPUTS:
    # an expression nested inside an `if` is not auto-rendered by the notebook,
    # so display() is called explicitly
    display(tlc_by_do_df.sort('week_index').limit(5))

In [12]:
# optional sanity check: the 5 rows with the largest avg_trip_distance
if INTERMEDIATE_OUTPUTS:
    # an expression nested inside an `if` is not auto-rendered by the notebook,
    # so display() is called explicitly
    display(tlc_by_do_df.sort('avg_trip_distance', ascending=False).limit(5))

In [13]:
# save the drop-off aggregation as parquet, overwriting any existing output at this path
tlc_by_do_df.write.mode('overwrite').parquet('../data/curated/tlc/aggregated/yellow/by_do')

### 2. Aggregating the COVID dataset

In [14]:
# read in the covid dataset
covid_df = spark.read.parquet('../data/curated/virals/covid/cleaned/cases-by-day')
# preview the first 5 rows (rendered as this cell's output)
covid_df.limit(5)

date,week_ending,week_year,week_month,timeline,cases,deaths,hospitalised,borough
02/29/2020,2020-02-29,2020,2,keep for graphing,0,0,1,Brooklyn
03/01/2020,2020-03-07,2020,3,post,0,0,0,Brooklyn
03/02/2020,2020-03-07,2020,3,post,0,0,2,Brooklyn
03/03/2020,2020-03-07,2020,3,post,0,0,3,Brooklyn
03/04/2020,2020-03-07,2020,3,post,1,0,1,Brooklyn


In [15]:
# Columns the COVID rows are grouped by
COVID_GROUP_COLUMNS = ['week_year', 'week_month', 'week_ending', 'timeline', 'borough']

# Maps each COVID measure to the list of aggregations requested for it; this dict
# is handed to ah.group_and_aggregate, which defines what each aggregation name does.
COVID_AGGREGATE_COLUMNS = {
    measure: ['total', 'daily_average']
    for measure in ('cases', 'deaths', 'hospitalised')
}

In [16]:
# aggregate the COVID rows over the grouping columns
# (the aggregation itself is implemented in helpers.aggregation_helpers).
# NOTE: this rebinds covid_df from the data read in above to the aggregated
# result, so re-run the read cell before re-running this one.
covid_df = ah.group_and_aggregate(
    covid_df,
    COVID_GROUP_COLUMNS,
    COVID_AGGREGATE_COLUMNS
)

# force this into memory
# otherwise writing parquets results in a java executor out of memory error.
# The aggregated schema is passed along explicitly so that column types are
# kept as-is (instead of being re-inferred from the collected rows) and an
# empty result does not fail schema inference.
covid_df = spark.createDataFrame(
    covid_df.collect(),
    schema=covid_df.schema
)

In [17]:
# optional sanity check: first 5 rows when ordered by week_index
if INTERMEDIATE_OUTPUTS:
    # an expression nested inside an `if` is not auto-rendered by the notebook,
    # so display() is called explicitly
    display(covid_df.sort('week_index').limit(5))

In [18]:
# save the aggregated covid data as parquet, overwriting any existing output at this path
covid_df.write.mode('overwrite').parquet('../data/curated/virals/covid/aggregated/cases-by-week')

### 3. Aggregating the Flu dataset
*The flu dataset is already grouped by MMWR week, so the only aggregate left to compute is the daily average number of cases.*

In [19]:
# read in the flu dataset
flu_df = spark.read.parquet('../data/curated/virals/flu/cleaned/cases-by-week')
# preview the first 5 rows (rendered as this cell's output)
flu_df.limit(5)

year,month,day,week_ending,week_year,week_month,timeline,date,borough,disease,cases
2018,1,20,2018-01-20,2018,1,neither,01/20/2018,Bronx,INFLUENZA_B,203
2018,5,5,2018-05-05,2018,5,pre,05/05/2018,Staten Island,INFLUENZA_UNSPECI...,0
2018,3,10,2018-03-10,2018,3,pre,03/10/2018,Manhattan,INFLUENZA_A,65
2018,3,3,2018-03-03,2018,2,neither,03/03/2018,Brooklyn,INFLUENZA_B,287
2018,5,12,2018-05-12,2018,5,pre,05/12/2018,Manhattan,INFLUENZA_B,10


In [20]:
# derive the daily average: the weekly case count spread over the 7 days of the week
flu_df = flu_df.withColumn('daily_avg_cases', F.col('cases') / 7)

In [21]:
# save the flu data (now including daily_avg_cases) as parquet, overwriting any existing output at this path
flu_df.write.mode('overwrite').parquet('../data/curated/virals/flu/aggregated/cases-by-week')