### MAST30034: Applied Data Science Project 1
---
# Preprocessing Part 2: Aggregating Data by MMWR Week
#### Xavier Travers (1178369)

Aggregate all the data by MMWR week (defined [here](https://ndc.services.cdc.gov/wp-content/uploads/MMWR_Week_overview.pdf))
and borough.
This means counting trips to and from each of the boroughs per MMWR week.
This is done for each of the taxi types.

In [1]:
# imports used throughout this notebook
import sys

from pyspark.sql import functions as F

# make the homemade helper package (under `scripts`) importable
sys.path.insert(1, '../../scripts')
import helpers.aggregation_helpers as ah

# root directory of the stored data files
DATA_PATH = '../../data'

In [2]:
from pyspark.sql import SparkSession

# session settings, applied to the builder in the order listed
SPARK_OPTIONS = [
    ("spark.sql.session.timeZone", "Etc/UTC"),
    ('spark.sql.repl.eagerEval.enabled', True),
    ('spark.sql.parquet.cacheMetadata', 'true'),
    ("spark.executor.memory", "2g"),
    ("spark.driver.memory", "4g"),
]

# Create (or reuse) the spark session which will run the spark jobs
session_builder = SparkSession.builder.appName('MAST30034 XT Project 1')
for option_name, option_value in SPARK_OPTIONS:
    session_builder = session_builder.config(option_name, option_value)
spark = session_builder.getOrCreate()

22/08/21 08:25:31 WARN Utils: Your hostname, Polaris resolves to a loopback address: 127.0.1.1; using 172.18.201.145 instead (on interface eth0)
22/08/21 08:25:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/21 08:25:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# read in the curated population data and preview a few rows
# (the preview shows the columns week_year, population and borough)
pop_df = spark.read.parquet(f'{DATA_PATH}/curated/population_by_borough_by_year')
pop_df.limit(5)

week_year,population,borough
2020,495522.0,Staten Island
2021,493494.0,Staten Island
2018,1629055.0,Manhattan
2019,1628706.0,Manhattan
2019,2253858.0,Queens


### 1. Aggregating the TLC dataset

In [4]:
# read in the cleaned yellow dataset
tlc_df = spark.read.parquet(f'{DATA_PATH}/curated/tlc/cleaned/yellow')

# print the raw number of rows: a bare expression in the middle of a cell is
# evaluated but never displayed, so the count has to be printed explicitly
print(f'{tlc_df.count()} ROWS')

# preview a few rows just to check the formatting
tlc_df.limit(5)

                                                                                

year,month,day,week_ending,week_index,date,trip_distance,pu_borough,hours_elapsed,week_year,week_month
2020,2,3,2020-02-08,6,02/03/2020,1.66,Bronx,0.0722222222222222,2020,2
2020,2,19,2020-02-22,8,02/19/2020,1.33,Manhattan,0.2091666666666666,2020,2
2020,2,3,2020-02-08,6,02/03/2020,0.99,Manhattan,0.0813888888888888,2020,2
2020,2,19,2020-02-22,8,02/19/2020,1.52,Manhattan,0.1111111111111111,2020,2
2020,2,3,2020-02-08,6,02/03/2020,2.9,Bronx,0.2211111111111111,2020,2


In [5]:
# week-identifying columns shared by every TLC grouping
TLC_COMMON_GROUP_COLUMNS = ['week_year', 'week_month', 'week_ending', 'week_index']

# each grouping is the shared week columns plus one borough column
TLC_GROUP_BY_PU_COLUMNS = [*TLC_COMMON_GROUP_COLUMNS, 'pu_borough']
TLC_GROUP_BY_DO_COLUMNS = [*TLC_COMMON_GROUP_COLUMNS, 'do_borough']

# aggregations to compute, keyed by the column they are computed from
# ('*' presumably stands for whole rows, i.e. trips -- confirm in the helper)
# left disabled: 'total' and 'total_per_capita' for 'trip_distance', and all
# of 'total' / 'total_per_capita' / 'average' for 'passengers' and
# 'hours_elapsed'
TLC_AGGREGATE_COLUMNS = {
    '*': ['count', 'count_per_capita', 'count_per_100k'],
    'trip_distance': ['average'],
}

#### Group by pick-up location

In [6]:
# group by week and pick-up borough, then aggregate
# (the helper function lives in `scripts/helpers`)
tlc_by_pu_rows = ah.group_and_aggregate(
    tlc_df, pop_df, TLC_GROUP_BY_PU_COLUMNS, TLC_AGGREGATE_COLUMNS
).collect()

# rebuild the dataframe from the collected rows to force it into memory;
# otherwise writing parquets results in a java executor out of memory error
# (this is personal experience, your mileage may vary)
tlc_by_pu_df = spark.createDataFrame(tlc_by_pu_rows)

                                                                                

In [7]:
# check that the aggregation was successful by previewing a few rows
# (the preview shows the group columns, population and the aggregated trip columns)
tlc_by_pu_df.limit(5)

week_year,week_month,week_ending,week_index,pu_borough,population,num_trips,num_pc_trips,num_p100k_trips,avg_trip_distance
2020,2,2020-02-08,6,Manhattan,1687834.0,1430265,0.8473967226634846,84739.67226634847,2.2641292977175858
2020,2,2020-02-08,6,Queens,2395791.0,87883,0.03668224815937617,3668.224815937617,10.88470034022497
2020,2,2020-02-22,8,Brooklyn,2727393.0,14297,0.005242002161038032,524.2002161038032,4.103749038259784
2020,2,2020-02-22,8,Bronx,1466438.0,2374,0.001618888763111...,161.88887631117032,6.130808761583833
2020,2,2020-02-08,6,Brooklyn,2727393.0,16986,0.006227925348492...,622.7925348492131,4.057908866125025


In [8]:
# list the groups with the largest average trip distance to spot
# potential outliers (unlikely after cleaning)
tlc_by_pu_df.orderBy(F.col('avg_trip_distance').desc()).limit(5)

week_year,week_month,week_ending,week_index,pu_borough,population,num_trips,num_pc_trips,num_p100k_trips,avg_trip_distance
2020,9,2020-09-19,38,Staten Island,495522.0,45,9.081332413091648e-05,9.08133241309165,31.347111111111115
2021,7,2021-07-31,83,Staten Island,493494.0,75,0.0001519775316417221,15.197753164172209,30.23546666666668
2020,8,2020-08-29,35,Staten Island,495522.0,40,8.072295478303688e-05,8.072295478303689,29.75225
2021,8,2021-08-14,85,Staten Island,493494.0,71,0.0001438720632874969,14.387206328749691,29.49450704225353
2021,6,2021-06-26,78,Staten Island,493494.0,54,0.0001094238227820399,10.94238227820399,29.341666666666665


In [9]:
# save the data aggregated by pick-up borough (replacing any earlier output)
(
    tlc_by_pu_df.write
    .mode('overwrite')
    .parquet(f'{DATA_PATH}/curated/tlc/aggregated/yellow/by_pu')
)

### 2. Aggregating the COVID dataset

In [10]:
# read in the cleaned covid dataset (cases by day) and preview a few rows
# NOTE(review): the preview below shows an empty `date` column and repeated
# identical rows -- confirm this is what the cleaning step is meant to produce
covid_df = spark.read.parquet(f'{DATA_PATH}/curated/virals/covid/cleaned/cases_by_day')
covid_df.limit(5)

date,week_ending,week_year,week_month,week_index,cases,borough
,2020-01-04,2020,1,1,0,Brooklyn
,2020-01-04,2020,1,1,0,Brooklyn
,2020-01-04,2020,1,1,0,Brooklyn
,2020-01-04,2020,1,1,0,Brooklyn
,2020-01-04,2020,1,1,0,Brooklyn


In [11]:
# the covid cases are grouped by week and borough
COVID_GROUP_COLUMNS = ['week_year', 'week_month', 'week_ending', 'week_index', 'borough']

# aggregations to compute, keyed by the column they are computed from
# ('daily_average' and 'daily_average_per_capita' are left disabled)
COVID_AGGREGATE_COLUMNS = {
    'cases': ['total', 'total_per_capita', 'total_per_100k'],
}

In [12]:
# group by week and borough, then aggregate
# (the helper function lives in `scripts/helpers`)
# NOTE: `covid_df` is overwritten with the aggregated result at the end of this
# cell, so the dataset must be read in again before this cell can be re-run
covid_weekly_rows = ah.group_and_aggregate(
    covid_df, pop_df, COVID_GROUP_COLUMNS, COVID_AGGREGATE_COLUMNS
).collect()

# rebuild the dataframe from the collected rows to force it into memory;
# otherwise writing parquets results in a java executor out of memory error
# (this is personal experience, your mileage may vary)
covid_df = spark.createDataFrame(covid_weekly_rows)

In [13]:
# check that the aggregation was successful by previewing a few rows
# (the preview shows the group columns, population and the aggregated case columns)
covid_df.limit(5)

week_year,week_month,week_ending,week_index,borough,population,tot_cases,tot_pc_cases,tot_p100k_cases
2020,12,2020-12-12,50,Brooklyn,2727393.0,28660,0.010508203254903125,1050.8203254903126
2021,1,2021-01-30,57,Brooklyn,2641052.0,45010,0.017042451265632027,1704.2451265632028
2021,9,2021-10-02,92,Brooklyn,2641052.0,13590,0.005145676798487876,514.5676798487875
2020,3,2020-03-21,12,Brooklyn,2727393.0,29070,0.010658529958828815,1065.8529958828815
2020,10,2020-10-17,42,Brooklyn,2727393.0,6350,0.002328230658361...,232.82306583613


In [14]:
# save the weekly covid aggregates (replacing any earlier output)
(
    covid_df.write
    .mode('overwrite')
    .parquet(f'{DATA_PATH}/curated/virals/covid/aggregated/cases_by_week')
)

### 3. Aggregating the Flu dataset
*The flu dataset is already reported by MMWR week, so this step only groups it by week and borough and adds the per-capita and per-100k case totals.*

In [15]:
# read in the cleaned flu dataset (cases by week) and preview a few rows
# NOTE(review): the preview below shows empty `date` and `disease` columns --
# confirm this is what the cleaning step is meant to produce
flu_df = spark.read.parquet(f'{DATA_PATH}/curated/virals/flu/cleaned/cases_by_week')
flu_df.limit(5)

year,month,day,week_ending,week_year,week_month,week_index,date,borough,disease,cases
2019,12,29,2020-01-04,2020,1,1,,Bronx,,0
2019,12,30,2020-01-04,2020,1,1,,Bronx,,0
2019,12,31,2020-01-04,2020,1,1,,Bronx,,0
2020,1,1,2020-01-04,2020,1,1,,Bronx,,0
2020,1,2,2020-01-04,2020,1,1,,Bronx,,0


In [16]:
# the flu cases are grouped by week and borough
FLU_GROUP_COLUMNS = ['week_year', 'week_month', 'week_ending', 'week_index', 'borough']

# aggregations to compute, keyed by the column they are computed from
FLU_AGGREGATE_COLUMNS = {
    'cases': ['total', 'total_per_capita', 'total_per_100k'],
}

In [17]:
# group by week and borough, then aggregate
# (the helper function lives in `scripts/helpers`)
# NOTE: `flu_df` is overwritten with the aggregated result at the end of this
# cell, so the dataset must be read in again before this cell can be re-run
flu_weekly_rows = ah.group_and_aggregate(
    flu_df, pop_df, FLU_GROUP_COLUMNS, FLU_AGGREGATE_COLUMNS
).collect()

# rebuild the dataframe from the collected rows to force it into memory;
# otherwise writing parquets results in a java executor out of memory error
# (this is personal experience, your mileage may vary)
flu_df = spark.createDataFrame(flu_weekly_rows)

In [18]:
# check that the aggregation was successful:
# show the 2020 weeks for Queens in week order (at most 20 rows)
(
    flu_df
    .filter((F.col('week_year') == 2020) & (F.col('borough') == 'Queens'))
    .orderBy('week_index')
    .limit(20)
)

week_year,week_month,week_ending,week_index,borough,population,tot_cases,tot_pc_cases,tot_p100k_cases
2020,1,2020-01-04,1,Queens,2395791.0,1471,6.13993457693096E-4,61.39934576930959
2020,1,2020-01-11,2,Queens,2395791.0,1821,7.600829955534519E-4,76.00829955534519
2020,1,2020-01-18,3,Queens,2395791.0,2117,8.836330047153529E-4,88.36330047153528
2020,1,2020-01-25,4,Queens,2395791.0,2250,9.391470291022882E-4,93.9147029102288
2020,1,2020-02-01,5,Queens,2395791.0,2425,0.001012191798032466,101.2191798032466
2020,2,2020-02-08,6,Queens,2395791.0,2235,9.328860489082729E-4,93.28860489082729
2020,2,2020-02-15,7,Queens,2395791.0,1746,7.287780945833755E-4,72.87780945833757
2020,2,2020-02-22,8,Queens,2395791.0,967,4.036245231741834E-4,40.36245231741834
2020,2,2020-02-29,9,Queens,2395791.0,756,3.155534017783688E-4,31.55534017783688
2020,3,2020-03-07,10,Queens,2395791.0,623,2.600393773914336E-4,26.00393773914336


In [19]:
# save the weekly flu aggregates (replacing any earlier output)
(
    flu_df.write
    .mode('overwrite')
    .parquet(f'{DATA_PATH}/curated/virals/flu/aggregated/cases_by_week')
)