### MAST30034: Applied Data Science Project 1
---
# Preprocessing Part 2: Aggregating Data by MMWR Week
#### Xavier Travers (1178369)

Aggregate all the data by MMWR week (defined [here](https://ndc.services.cdc.gov/wp-content/uploads/MMWR_Week_overview.pdf))
and borough.
This means counting trips to and from each of the boroughs per week.
This is done for each of the taxi types.

In [None]:
# imports used throughout this notebook
from pyspark.sql import functions as F
import sys

# make the project's `scripts` directory importable so the homemade helpers
# can be loaded; this must run before the `helpers` import just below
sys.path.insert(1, '../../scripts')
import helpers.aggregation_helpers as ah

# relative path to the root folder where the data files are stored
# (every parquet read/write in this notebook is built from it)
DATA_PATH = '../../data'

In [None]:
from pyspark.sql import SparkSession

# settings applied to the session builder, in this order
_SPARK_SETTINGS = {
    "spark.sql.session.timeZone": "Etc/UTC",
    'spark.sql.repl.eagerEval.enabled': True,
    'spark.sql.parquet.cacheMetadata': 'true',
    "spark.executor.memory": "2g",
    "spark.driver.memory": "4g",
}

# Build (or reuse) the spark session which will run the spark jobs
_builder = SparkSession.builder.appName('MAST30034 XT Project 1')
for _setting, _value in _SPARK_SETTINGS.items():
    _builder = _builder.config(_setting, _value)
spark = _builder.getOrCreate()

In [None]:
# load the curated borough population table (used for the per-capita values)
pop_df = (
    spark.read
    .parquet(f'{DATA_PATH}/curated/population_by_borough_by_year')
)

# preview a few rows
pop_df.limit(5)

### 1. Aggregating the TLC dataset

In [None]:
# read in the cleaned yellow dataset
tlc_df = spark.read.parquet(f'{DATA_PATH}/curated/tlc/cleaned/yellow')

# print the raw # of rows just to check the formatting
# (a bare expression that is not the last statement of a cell is evaluated
# but never displayed, so the count has to be printed explicitly)
print(f'{tlc_df.count()} ROWS')

# preview a few rows (displayed as the cell output)
tlc_df.limit(5)

In [None]:
# MMWR-week columns shared by every TLC grouping
TLC_COMMON_GROUP_COLUMNS = ['week_year', 'week_month', 'week_ending', 'week_index']

# one grouping per trip end: by pick-up borough and by drop-off borough
TLC_GROUP_BY_PU_COLUMNS = [*TLC_COMMON_GROUP_COLUMNS, 'pu_borough']
TLC_GROUP_BY_DO_COLUMNS = [*TLC_COMMON_GROUP_COLUMNS, 'do_borough']

# column -> list of aggregations to compute for it
# currently disabled options:
#   'passengers':    total, total_per_capita, average
#   'trip_distance': total, total_per_capita
#   'hours_elapsed': total, total_per_capita, average
TLC_AGGREGATE_COLUMNS = {
    '*': ['count', 'count_per_capita', 'count_per_100k'],
    'trip_distance': ['average'],
}

#### Group by pick-up location

In [None]:
# perform the grouping and aggregation (helper function in `scripts/helpers`)
tlc_by_pu_df = ah.group_and_aggregate(tlc_df, pop_df, TLC_GROUP_BY_PU_COLUMNS,
    TLC_AGGREGATE_COLUMNS)

# force this into memory 
# otherwise writing parquets results in a java executor out of memory error
# (this is personal experience, your mileage may vary)
# the aggregated schema is passed explicitly so spark does not re-infer it
# from the collected rows: inference fails on an empty result or on a column
# that is entirely null, and it can widen the column types
tlc_by_pu_df = spark.createDataFrame(
    tlc_by_pu_df.collect(), schema=tlc_by_pu_df.schema)

In [None]:
# sanity check: preview the first 5 rows of the pick-up aggregation
tlc_by_pu_df.limit(5)

In [None]:
# sanity check: look at the 5 rows with the latest week endings
tlc_by_pu_df.orderBy(F.desc('week_ending')).limit(5)

In [None]:
# look at the 5 largest average trip distances to spot potential outliers
# (unlikely after cleaning)
tlc_by_pu_df.orderBy(F.desc('avg_trip_distance')).limit(5)

In [None]:
# write the pick-up aggregation to parquet, replacing any previous output
(
    tlc_by_pu_df.write
    .mode('overwrite')
    .parquet(f'{DATA_PATH}/curated/tlc/aggregated/yellow/by_pu')
)

### 2. Aggregating the COVID dataset

In [None]:
# load the cleaned daily covid cases
covid_df = (
    spark.read
    .parquet(f'{DATA_PATH}/curated/virals/covid/cleaned/cases_by_day')
)

# preview a few rows
covid_df.limit(5)

In [None]:
# sanity check: look at the 5 rows with the highest week index
covid_df.orderBy(F.desc('week_index')).limit(5)

In [None]:
# MMWR-week columns plus the borough to group the covid cases by
COVID_GROUP_COLUMNS = [
    'week_year', 'week_month', 'week_ending', 'week_index',
    'borough',
]

# column -> list of aggregations to compute for it
# currently disabled options for 'cases':
#   daily_average, daily_average_per_capita
COVID_AGGREGATE_COLUMNS = {
    'cases': ['total', 'total_per_capita', 'total_per_100k'],
}

In [None]:
# perform the grouping and aggregation (helper function in `scripts/helpers`)
covid_df = ah.group_and_aggregate(covid_df, pop_df, COVID_GROUP_COLUMNS,
    COVID_AGGREGATE_COLUMNS)

# force this into memory 
# otherwise writing parquets results in a java executor out of memory error
# (this is personal experience, your mileage may vary)
# the aggregated schema is passed explicitly so spark does not re-infer it
# from the collected rows: inference fails on an empty result or on a column
# that is entirely null, and it can widen the column types
covid_df = spark.createDataFrame(covid_df.collect(), schema=covid_df.schema)

In [None]:
# sanity check: preview the first 5 rows of the weekly covid aggregation
covid_df.limit(5)

In [None]:
# sanity check: look at the 5 rows with the latest week endings
covid_df.orderBy(F.desc('week_ending')).limit(5)

In [None]:
# write the weekly covid aggregation to parquet, replacing any previous output
(
    covid_df.write
    .mode('overwrite')
    .parquet(f'{DATA_PATH}/curated/virals/covid/aggregated/cases_by_week')
)

### 3. Aggregating the Flu dataset
*The flu dataset is already grouped by MMWR week, so only the weekly totals (raw, per capita and per 100k) are aggregated here; no daily averages are computed*

In [None]:
# load the cleaned weekly flu cases
flu_df = (
    spark.read
    .parquet(f'{DATA_PATH}/curated/virals/flu/cleaned/cases_by_week')
)

# preview a few rows
flu_df.limit(5)

In [None]:
# MMWR-week columns plus the borough to group the flu cases by
FLU_GROUP_COLUMNS = [
    'week_year', 'week_month', 'week_ending', 'week_index',
    'borough',
]

# column -> list of aggregations to compute for it
FLU_AGGREGATE_COLUMNS = {
    'cases': ['total', 'total_per_capita', 'total_per_100k'],
}

In [None]:
# perform the grouping and aggregation (helper function in `scripts/helpers`)
flu_df = ah.group_and_aggregate(flu_df, pop_df, FLU_GROUP_COLUMNS,
    FLU_AGGREGATE_COLUMNS)

# force this into memory 
# otherwise writing parquets results in a java executor out of memory error
# (this is personal experience, your mileage may vary)
# the aggregated schema is passed explicitly so spark does not re-infer it
# from the collected rows: inference fails on an empty result or on a column
# that is entirely null, and it can widen the column types
flu_df = spark.createDataFrame(flu_df.collect(), schema=flu_df.schema)

In [None]:
# sanity check: preview the first 5 rows of the weekly flu aggregation
flu_df.limit(5)

In [None]:
# sanity check: look at the 5 rows with the latest week endings
flu_df.orderBy(F.desc('week_ending')).limit(5)

In [None]:
# write the weekly flu aggregation to parquet, replacing any previous output
(
    flu_df.write
    .mode('overwrite')
    .parquet(f'{DATA_PATH}/curated/virals/flu/aggregated/cases_by_week')
)