### MAST30034: Applied Data Science Project 1
---
# Preprocessing Part 2: Aggregating Data by MMWR Week
#### Xavier Travers (1178369)

Aggregate all the data by MMWR week (defined [here](https://ndc.services.cdc.gov/wp-content/uploads/MMWR_Week_overview.pdf)).
This means counting trips to and from each of the boroughs per MMWR week.
This is done for each of the taxi types.

In [1]:
# imports used throughout this notebook
from pyspark.sql import DataFrame
from pyspark.sql import functions as F
import os
import sys
import re
from itertools import chain

# make the project's own helper modules importable: `../../scripts` is put on
# the module search path (at index 1) so that the `helpers` package resolves
sys.path.insert(1, '../../scripts')
import helpers.aggregation_helpers as ah
import helpers.join_helpers as jh

# root folder of the data files; every read/write path below is built from it
# (relative path, so it is resolved against the kernel's working directory)
DATA_PATH = '../../data'

In [2]:
from pyspark.sql import SparkSession

# session settings, applied to the builder in the order listed
_SPARK_SETTINGS = [
    # interpret/render timestamps in UTC
    ("spark.sql.session.timeZone", "Etc/UTC"),
    # render DataFrames eagerly when they are the last expression of a cell
    ('spark.sql.repl.eagerEval.enabled', True),
    ('spark.sql.parquet.cacheMetadata', 'true'),
    # memory budgets for the executor and the driver
    ("spark.executor.memory", "2g"),
    ("spark.driver.memory", "4g"),
]

# create a spark session (which will run spark jobs)
_builder = SparkSession.builder.appName('MAST30034 XT Project 1')
for _key, _value in _SPARK_SETTINGS:
    _builder = _builder.config(_key, _value)

spark = _builder.getOrCreate()

22/08/20 16:18:57 WARN Utils: Your hostname, Polaris resolves to a loopback address: 127.0.1.1; using 172.18.201.167 instead (on interface eth0)
22/08/20 16:18:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/20 16:18:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/08/20 16:18:58 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# load the curated population table (one row per borough per year)
pop_df = (
    spark.read
    .parquet(f'{DATA_PATH}/curated/population_by_borough_by_year')
)

# preview up to 5 rows
pop_df.limit(5)

week_year,population,borough
2020,495522.0,Staten Island
2021,493494.0,Staten Island
2018,1629055.0,Manhattan
2019,1628706.0,Manhattan
2019,2253858.0,Queens


### 1. Aggregating the TLC dataset

In [4]:
# read in the cleaned yellow dataset
tlc_df = spark.read.parquet(f'{DATA_PATH}/curated/tlc/cleaned/yellow')

# report the raw number of rows; this must be printed explicitly because a
# bare expression that is not the last line of a cell is evaluated (here: a
# full count over the dataset) and then silently discarded
print(f'{tlc_df.count()} ROWS')

# preview up to 5 rows to check the formatting
tlc_df.limit(5)

                                                                                

year,month,day,week_ending,week_index,timeline,date,passengers,trip_distance,pu_location_id,do_location_id,hours_elapsed,pu_borough,do_borough,week_year,week_month
2018,10,14,2018-10-20,42,1,10/14/2018,1.0,2.37,43,163,0.1791666666666666,Manhattan,Manhattan,2018,10
2018,10,5,2018-10-06,40,1,10/05/2018,1.0,1.12,230,246,0.1625,Manhattan,Manhattan,2018,10
2018,11,3,2018-11-03,44,1,11/03/2018,1.0,1.0,170,48,0.1597222222222222,Manhattan,Manhattan,2018,10
2018,10,3,2018-10-06,40,1,10/03/2018,1.0,1.7,239,163,0.2166666666666666,Manhattan,Manhattan,2018,10
2018,10,14,2018-10-20,42,1,10/14/2018,1.0,1.64,163,239,0.1897222222222222,Manhattan,Manhattan,2018,10


In [5]:
# time-related columns shared by both TLC groupings
TLC_COMMON_GROUP_COLUMNS = [
    'week_year', 'week_month', 'week_ending', 'week_index', 'timeline',
]

# each grouping additionally splits by one side of the trip:
# pick-up borough or drop-off borough
TLC_GROUP_BY_PU_COLUMNS = [*TLC_COMMON_GROUP_COLUMNS, 'pu_borough']
TLC_GROUP_BY_DO_COLUMNS = [*TLC_COMMON_GROUP_COLUMNS, 'do_borough']

# maps a source column ('*' meaning the rows themselves) to the list of
# aggregations requested for it.
# Currently disabled: the total_per_capita / daily_average variants for
# `passengers`, every aggregation except 'average' for `trip_distance`,
# and all aggregations of `hours_elapsed`.
TLC_AGGREGATE_COLUMNS = {
    '*': ['count', 'count_per_capita', 'count_per_100k'],
    'passengers': ['total', 'average'],
    'trip_distance': ['average'],
}

#### Group by pick-up location

In [6]:
# group the trips by MMWR week and pick-up borough and aggregate them
# (`group_and_aggregate` comes from `helpers.aggregation_helpers`, found
# under `scripts/`)
tlc_by_pu_df = ah.group_and_aggregate(tlc_df, pop_df, TLC_GROUP_BY_PU_COLUMNS, 
    TLC_AGGREGATE_COLUMNS)

# force this into memory 
# otherwise writing parquets results in a java executor out of memory error
# (this is personal experience, your mileage may vary)
# The aggregated schema is passed along explicitly so that the rebuilt
# dataframe keeps the exact column types (instead of re-inferring them from
# the collected rows) and so that an empty result does not make schema
# inference fail.
tlc_by_pu_df = spark.createDataFrame(
    tlc_by_pu_df.collect(),
    schema=tlc_by_pu_df.schema,
)

                                                                                

In [7]:
# sanity check: preview up to 5 aggregated rows to confirm that the grouping
# columns and the aggregate columns are present
tlc_by_pu_df.limit(5)

week_year,week_month,week_ending,week_index,timeline,pu_borough,population,num_trips,num_pc_trips,num_p100k_trips,tot_passengers,avg_passengers,avg_trip_distance
2018,10,2018-10-06,40,1,Bronx,1432087.0,2859,0.001996387091007739,199.6387091007739,4478.0,1.5662819167541098,6.123228401539025
2018,10,2018-10-20,42,1,Brooklyn,2578074.0,27722,0.010752988471238607,1075.2988471238607,43129.0,1.555767982108073,4.208166438207927
2018,10,2018-11-03,44,1,Bronx,1432087.0,2966,0.002071103222080781,207.11032220807812,4690.0,1.581254214430209,6.578695212407289
2018,10,2018-10-20,42,1,Staten Island,476260.0,89,1.868727165833788...,18.68727165833788,117.0,1.3146067415730338,12.547752808988765
2018,10,2018-10-20,42,1,Bronx,1432087.0,2847,0.001988007711821977,198.8007711821977,4409.0,1.5486476993326308,6.572125043905856


In [8]:
# look at the 5 groups with the largest average trip distance to spot
# potential outliers (unlikely after cleaning)
tlc_by_pu_df.orderBy(F.desc('avg_trip_distance')).limit(5)

week_year,week_month,week_ending,week_index,timeline,pu_borough,population,num_trips,num_pc_trips,num_p100k_trips,tot_passengers,avg_passengers,avg_trip_distance
2021,3,2021-03-13,167,2,Staten Island,493494.0,51,1.033447215163710...,10.334472151637105,55.0,1.0784313725490196,32.103529411764704
2020,9,2020-09-19,142,2,Staten Island,495522.0,39,7.870488091346096E-5,7.870488091346096,39.0,1.0,31.24230769230769
2020,12,2021-01-02,157,2,Staten Island,495522.0,48,9.686754573964425E-5,9.686754573964423,48.0,1.0,31.226874999999996
2021,1,2021-01-16,159,2,Staten Island,493494.0,59,1.195556582248213...,11.955565822482138,60.0,1.0169491525423728,30.70338983050847
2021,6,2021-06-26,182,2,Staten Island,493494.0,42,8.510741771936437E-5,8.510741771936438,53.0,1.261904761904762,30.67214285714286


In [9]:
# keep only the rows that belong to the considered timeline (timeline > 0)
in_timeline = F.col('timeline') > 0
tlc_by_pu_df = tlc_by_pu_df.filter(in_timeline)

In [10]:
# write the by-pick-up aggregation to parquet, replacing any previous output
(
    tlc_by_pu_df.write
    .mode('overwrite')
    .parquet(f'{DATA_PATH}/curated/tlc/aggregated/yellow/by_pu')
)

#### Group by drop-off location

In [11]:
# group the trips by MMWR week and drop-off borough and aggregate them
# (`group_and_aggregate` comes from `helpers.aggregation_helpers`, found
# under `scripts/`)
tlc_by_do_df = ah.group_and_aggregate(tlc_df, pop_df, TLC_GROUP_BY_DO_COLUMNS, 
    TLC_AGGREGATE_COLUMNS)

# force this into memory 
# otherwise writing parquets results in a java executor out of memory error
# (this is personal experience, your mileage may vary)
# The aggregated schema is passed along explicitly so that the rebuilt
# dataframe keeps the exact column types (instead of re-inferring them from
# the collected rows) and so that an empty result does not make schema
# inference fail.
tlc_by_do_df = spark.createDataFrame(
    tlc_by_do_df.collect(),
    schema=tlc_by_do_df.schema,
)

                                                                                

In [12]:
# sanity check: preview up to 5 aggregated rows to confirm that the grouping
# columns and the aggregate columns are present
tlc_by_do_df.limit(5)

week_year,week_month,week_ending,week_index,timeline,do_borough,population,num_trips,num_pc_trips,num_p100k_trips,tot_passengers,avg_passengers,avg_trip_distance
2018,10,2018-10-06,40,1,Bronx,1432087.0,12889,0.009000151527106943,900.0151527106942,20403.0,1.582977732950578,9.301758864147724
2018,10,2018-10-20,42,1,Brooklyn,2578074.0,89060,0.03454516821472153,3454.5168214721534,139264.0,1.5637098585223446,6.961849315068433
2018,10,2018-11-03,44,1,Bronx,1432087.0,12699,0.008867478023332381,886.7478023332382,20035.0,1.5776832821482007,9.501114260965412
2018,10,2018-10-20,42,1,Staten Island,476260.0,515,0.001081342124049...,108.13421240498889,832.0,1.6155339805825242,19.770757281553397
2018,10,2018-10-20,42,1,Bronx,1432087.0,12280,0.008574898033429533,857.4898033429533,19045.0,1.5508957654723128,9.581476384364828


In [13]:
# look at the 5 groups with the largest average trip distance to spot
# potential outliers (unlikely after cleaning)
tlc_by_do_df.orderBy(F.desc('avg_trip_distance')).limit(5)

week_year,week_month,week_ending,week_index,timeline,do_borough,population,num_trips,num_pc_trips,num_p100k_trips,tot_passengers,avg_passengers,avg_trip_distance
2021,3,2021-03-13,167,2,Staten Island,493494.0,124,0.0002512695189809805,25.126951898098053,156.0,1.2580645161290325,24.661129032258067
2020,12,2020-12-05,153,2,Staten Island,495522.0,114,0.0002300604211316551,23.00604211316551,142.0,1.2456140350877194,24.417982456140347
2020,12,2021-01-02,157,2,Staten Island,495522.0,119,0.0002401507904795347,24.01507904795347,145.0,1.218487394957983,24.379663865546224
2021,6,2021-06-19,181,2,Staten Island,493494.0,169,0.0003424560379660138,34.24560379660138,264.0,1.5621301775147929,24.308165680473365
2021,2,2021-02-20,164,2,Staten Island,493494.0,123,0.0002492431518924242,24.92431518924242,164.0,1.3333333333333333,24.152195121951223


In [15]:
# filter for only the considered timeline, exactly as is done for the
# by-pick-up data before it is saved; without this step a clean
# "Restart & Run All" would write drop-off rows that the pick-up output
# excludes (the filter is idempotent, so re-applying it is harmless)
tlc_by_do_df = tlc_by_do_df.where(F.col('timeline') > 0)

# save the aggregated by dropoff data
tlc_by_do_df.write.mode('overwrite')\
    .parquet(f'{DATA_PATH}/curated/tlc/aggregated/yellow/by_do')

### 2. Aggregating the COVID dataset

In [16]:
# load the cleaned daily COVID case counts
covid_df = (
    spark.read
    .parquet(f'{DATA_PATH}/curated/virals/covid/cleaned/cases_by_day')
)

# preview up to 5 rows
covid_df.limit(5)

date,week_ending,week_year,week_month,week_index,timeline,cases,borough
02/29/2020,2020-02-29,2020,2,113,1,0,Brooklyn
03/01/2020,2020-03-07,2020,3,114,1,0,Brooklyn
03/02/2020,2020-03-07,2020,3,114,1,0,Brooklyn
03/03/2020,2020-03-07,2020,3,114,1,0,Brooklyn
03/04/2020,2020-03-07,2020,3,114,1,1,Brooklyn


In [17]:
# grouping keys: the MMWR week descriptors plus the borough
COVID_GROUP_COLUMNS = [
    'week_year', 'week_month', 'week_ending', 'week_index', 'timeline',
    'borough',
]

# maps a source column to the list of aggregations requested for it
# (the daily_average / daily_average_per_capita variants are currently disabled)
COVID_AGGREGATE_COLUMNS = {
    'cases': ['total', 'total_per_capita', 'total_per_100k'],
}

In [18]:
# group the daily cases by MMWR week and borough and aggregate them
# (`group_and_aggregate` comes from `helpers.aggregation_helpers`, found
# under `scripts/`)
# NOTE: this rebinds `covid_df` from the daily data to the weekly aggregate,
# so re-running this cell requires re-running the read cell first
covid_df = ah.group_and_aggregate(covid_df, pop_df, COVID_GROUP_COLUMNS, 
    COVID_AGGREGATE_COLUMNS)

# force this into memory 
# otherwise writing parquets results in a java executor out of memory error
# (this is personal experience, your mileage may vary)
# The aggregated schema is passed along explicitly so that the rebuilt
# dataframe keeps the exact column types (instead of re-inferring them from
# the collected rows) and so that an empty result does not make schema
# inference fail.
covid_df = spark.createDataFrame(
    covid_df.collect(),
    schema=covid_df.schema,
)

In [19]:
# sanity check: preview up to 5 aggregated rows to confirm that the grouping
# columns and the aggregate columns are present
covid_df.limit(5)

week_year,week_month,week_ending,week_index,timeline,borough,population,tot_cases,tot_pc_cases,tot_p100k_cases
2021,6,2021-07-03,183,2,Brooklyn,2641052.0,369.0,1.397170521443727...,13.971705214437277
2021,6,2021-06-12,180,2,Brooklyn,2641052.0,330.0,1.249502092348049E-4,12.495020923480492
2020,8,2020-08-29,139,2,Brooklyn,2727393.0,556.0,2.038576765431311E-4,20.38576765431311
2020,3,2020-03-28,117,1,Brooklyn,2727393.0,7792.0,0.002856940675582...,285.6940675582874
2020,8,2020-08-22,138,2,Brooklyn,2727393.0,447.0,1.638927723287403E-4,16.38927723287403


In [20]:
# write the weekly COVID aggregation to parquet, replacing any previous output
(
    covid_df.write
    .mode('overwrite')
    .parquet(f'{DATA_PATH}/curated/virals/covid/aggregated/cases_by_week')
)

### 3. Aggregating the Flu dataset
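*The flu dataset is already reported per MMWR week, so this step only totals the weekly cases for each borough (the source rows are split by disease type); daily averages are not calculated.*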

In [21]:
# load the cleaned weekly flu case counts
flu_df = (
    spark.read
    .parquet(f'{DATA_PATH}/curated/virals/flu/cleaned/cases_by_week')
)

# preview up to 5 rows
flu_df.limit(5)

year,month,day,week_ending,week_year,week_month,week_index,timeline,date,borough,disease,cases
2019,2,16,2019-02-16,2019,2,59,1,02/16/2019,Staten Island,INFLUENZA_UNSPECI...,0
2018,12,22,2018-12-22,2018,12,51,1,12/22/2018,Staten Island,INFLUENZA_B,2
2018,10,13,2018-10-13,2018,10,41,1,10/13/2018,Manhattan,INFLUENZA_B,0
2018,11,3,2018-11-03,2018,10,44,1,11/03/2018,Staten Island,INFLUENZA_A,0
2020,12,12,2020-12-12,2020,12,154,2,12/12/2020,Staten Island,INFLUENZA_B,3


In [22]:
# grouping keys: the MMWR week descriptors plus the borough
FLU_GROUP_COLUMNS = [
    'week_year', 'week_month', 'week_ending', 'week_index', 'timeline',
    'borough',
]

# maps a source column to the list of aggregations requested for it
FLU_AGGREGATE_COLUMNS = {
    'cases': ['total', 'total_per_capita', 'total_per_100k'],
}

In [23]:
# group the weekly flu cases by MMWR week and borough and aggregate them
# (`group_and_aggregate` comes from `helpers.aggregation_helpers`, found
# under `scripts/`)
# NOTE: this rebinds `flu_df` from the cleaned data to the aggregate,
# so re-running this cell requires re-running the read cell first
flu_df = ah.group_and_aggregate(flu_df, pop_df, FLU_GROUP_COLUMNS, 
    FLU_AGGREGATE_COLUMNS)

# force this into memory 
# otherwise writing parquets results in a java executor out of memory error
# (this is personal experience, your mileage may vary)
# The aggregated schema is passed along explicitly so that the rebuilt
# dataframe keeps the exact column types (instead of re-inferring them from
# the collected rows) and so that an empty result does not make schema
# inference fail.
flu_df = spark.createDataFrame(
    flu_df.collect(),
    schema=flu_df.schema,
)

In [24]:
# sanity check: preview up to 5 aggregated rows to confirm that the grouping
# columns and the aggregate columns are present
flu_df.limit(5)

week_year,week_month,week_ending,week_index,timeline,borough,population,tot_cases,tot_pc_cases,tot_p100k_cases
2020,11,2020-11-28,152,2,Staten Island,495522.0,10.0,2.018073869575922e-05,2.018073869575922
2020,10,2020-10-31,148,2,Staten Island,495522.0,3.0,6.054221608727766e-06,0.6054221608727766
2021,1,2021-01-30,161,2,Staten Island,493494.0,0.0,0.0,0.0
2021,1,2021-01-16,159,2,Staten Island,493494.0,2.0,4.052734177112589e-06,0.4052734177112589
2020,3,2020-03-28,117,1,Staten Island,495522.0,9.0,1.81626648261833e-05,1.8162664826183297


In [25]:
# write the weekly flu aggregation to parquet, replacing any previous output
(
    flu_df.write
    .mode('overwrite')
    .parquet(f'{DATA_PATH}/curated/virals/flu/aggregated/cases_by_week')
)