### MAST30034: Applied Data Science Project 1
---
# Preprocessing Part 3: Aggregating TLC Data
#### Xavier Travers (1178369)

Aggregate the TLC data by month.
This means counting the trips (and averaging their distance) between each pair of pickup and dropoff boroughs per month.
This is done separately for each of the taxi types.

In [1]:
# imports used throughout this notebook
from pyspark.sql import DataFrame
from pyspark.sql import functions as F
import os
import re
from itertools import chain

# toggles the progress / row-count printouts in the TLC aggregation loop below
DEBUGGING = True

In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) the spark session which runs every spark job in this notebook
spark = (
    SparkSession.builder.appName('MAST30034 XT Project 1')
    # eager evaluation: a DataFrame left as the last expression of a cell is
    # rendered as a table (e.g. `covid_df.limit(5)` further down)
    .config('spark.sql.repl.eagerEval.enabled', True) 
    # NOTE(review): presumably caches parquet schema metadata between reads
    # -- confirm against the Spark SQL configuration reference
    .config('spark.sql.parquet.cacheMetadata', 'true')
    .getOrCreate()
)

22/08/06 01:08:05 WARN Utils: Your hostname, Polaris resolves to a loopback address: 127.0.1.1; using 172.20.11.120 instead (on interface eth0)
22/08/06 01:08:05 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/06 01:08:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# read in the taxi zones dataset: the lookup table which the functions below
# join onto the trips (on LocationID) to get the pickup/dropoff borough
zones_df = spark.read.parquet('../data/raw/tlc_zones/zones')

In [4]:
def prefix_column_names(df: DataFrame, prefix: str) -> DataFrame:
    """ Add a prefix to every column name of a `DataFrame`.

    All columns are renamed positionally in one `toDF` call. Renaming them
    one at a time with `withColumnRenamed` is unsafe: that method renames
    every column carrying the old name, so a column that was already prefixed
    gets renamed a second time whenever `prefix + name` equals the name of
    another column (e.g. columns `a` and `xa` with prefix `x`). It also adds
    one projection to the query plan per column.

    Args:
    - df (`DataFrame`): The `DataFrame` for which to add prefixes
    - prefix (str): The prefix

    Returns:
        `DataFrame`: A new `DataFrame` with the same data and prefixed column names
    """

    return df.toDF(*[prefix + col for col in df.columns])

In [5]:
# names of the tlc datasets
TLC_NAMES = [
    'yellow',
    'green',
    'fhv',
    'fhvhv',
]

In [6]:
def add_borough_names(df: DataFrame) -> DataFrame:
    """ Attach the pickup and dropoff zone details to a trips `DataFrame`.

    The `LocationID`, `borough` and `zone` columns of the zones dataset are
    joined on twice: once prefixed with `PU` (matched on `PULocationID`) and
    once prefixed with `DO` (matched on `DOLocationID`).

    Args:
        df (DataFrame): Trips with `PULocationID` and `DOLocationID` columns

    Returns:
        DataFrame: The trips with the prefixed borough and zone columns added.
        Both joins are inner joins, so trips without a matching zone are dropped.
    """
    # the zone columns needed for the lookup (same for both trip ends)
    zone_lookup = zones_df.select(['LocationID', 'borough', 'zone'])

    enriched_df = df
    for trip_end in ['PU', 'DO']:
        end_lookup = prefix_column_names(zone_lookup, trip_end)
        enriched_df = enriched_df.join(
            end_lookup,
            on=trip_end + 'LocationID',
            how='inner'
        )

    return enriched_df


In [7]:
# the columns kept in the aggregated output (reduces the amount of columns stored)
OUT_COL_NAMES = [
    # --- grouping columns ---
    'year',
    'month',
    'type',  # taxi type
    'PUborough',  # pickup borough
    'DOborough',  # dropoff borough
    # sharing configuration
    # (binary value for fhvhv and #passengers for yellow/green)
    'shared',
    # --- aggregate values ---
    'total_trips',  # total trips in the group
    'avg_distance',  # avg distance of the trips in the group
]

In [8]:
def aggregate_trips_green_yellow(df: DataFrame, year:int, month:int, taxi_type:str) -> DataFrame:
    """ Group the trips from the green/yellow datasets by:
    - year
    - month
    - type
    - PUborough
    - DOborough
    - shared (the `passenger_count` of the trips)

    Args:
        df (DataFrame): Dataset to aggregate
        year (int): Year of df
        month (int): Month of df
        taxi_type (str): Taxi type of df

    Returns:
        DataFrame: Grouped dataframe restricted to `OUT_COL_NAMES`, i.e. the
        grouping columns above plus the aggregate values:
        - total_trips: # of trips in the above grouping
        - avg_distance: average length in miles travelled for this grouping
    """
    # filter for only needed columns, then attach the zone details
    # (inner joins: trips whose location ID is not in the zones dataset are dropped)
    join_selections = ['PULocationID', 'DOLocationID', 'passenger_count', 'trip_distance']
    joined_df = (
        df.select(join_selections)
        .join(prefix_column_names(zones_df, 'PU'), 'PULocationID', 'inner')
        .join(prefix_column_names(zones_df, 'DO'), 'DOLocationID', 'inner')
    )

    # group the dataset
    group_filters = ['PUborough', 'DOborough', 'passenger_count']
    grouped_agg_df = joined_df.groupBy(group_filters).agg(
        F.count('*').alias('total_trips'),
        # dropped by the final select unless OUT_COL_NAMES lists it
        F.sum('trip_distance').alias('total_distance'),
        F.avg('trip_distance').alias('avg_distance')
    )

    # force the aggregate into memory (~10000 rows max)
    # (otherwise the java executors seem to run out of memory).
    # The schema is taken from the aggregate itself: letting spark re-infer it
    # from the collected rows fails when there are no rows or when a column
    # only holds nulls.
    out_rows = grouped_agg_df.collect()
    out_df = spark.createDataFrame(out_rows, schema=grouped_agg_df.schema)

    return (
        out_df
        .withColumn('type', F.lit(taxi_type))
        .withColumn('year', F.lit(year))
        .withColumn('month', F.lit(month))
        .withColumnRenamed('passenger_count', 'shared')
        .select(OUT_COL_NAMES)
    )

In [9]:
def aggregate_trips_fhvhv(df: DataFrame, year:int, month:int) -> DataFrame:
    """ Group the trips from the fhvhv datasets by:
    - year
    - month
    - type (company name looked up from `hvfhs_license_num`)
    - PUborough
    - DOborough
    - shared (1.0 for a `shared_request_flag` of 'Y', 0.0 for 'N')

    Args:
        df (DataFrame): Dataset to aggregate
        year (int): Year of df
        month (int): Month of df

    Returns:
        DataFrame: Grouped dataframe restricted to `OUT_COL_NAMES`, i.e. the
        grouping columns above plus the aggregate values:
        - total_trips: # of trips in the above grouping
        - avg_distance: average length in miles travelled for this grouping
    """
    # filter for only needed columns, then attach the zone details
    # (inner joins: trips whose location ID is not in the zones dataset are dropped)
    join_selections = ['PULocationID', 'DOLocationID', 'shared_request_flag', 'trip_miles', 'hvfhs_license_num']
    joined_df = (
        df.select(join_selections)
        .join(prefix_column_names(zones_df, 'PU'), 'PULocationID', 'inner')
        .join(prefix_column_names(zones_df, 'DO'), 'DOLocationID', 'inner')
    )

    # group the dataset
    group_filters = ['PUborough', 'DOborough', 'hvfhs_license_num', 'shared_request_flag']
    grouped_agg_df = joined_df.groupBy(group_filters).agg(
        F.count('*').alias('total_trips'),
        # dropped by the final select unless OUT_COL_NAMES lists it
        F.sum('trip_miles').alias('total_distance'),
        F.avg('trip_miles').alias('avg_distance')
    )

    # force the aggregate into memory (~10000 rows max)
    # (otherwise the java executors run out of memory at write time).
    # The schema is taken from the aggregate itself: letting spark re-infer it
    # from the collected rows fails when there are no rows or when a column
    # only holds nulls.
    out_rows = grouped_agg_df.collect()
    out_df = spark.createDataFrame(out_rows, schema=grouped_agg_df.schema)

    # create maps to map columns into other types/values
    licenses_dict = {
        'HV0002': 'juno',
        'HV0003': 'uber',
        'HV0004': 'via',
        'HV0005': 'lyft'
    }
    flags_dict = {
        'Y': 1.0,
        'N': 0.0
    }
    # create_map takes a flat key, value, key, value, ... list of literals
    # from: https://stackoverflow.com/questions/42980704/pyspark-create-new-column-with-mapping-from-a-dict
    license_mapping_expr = F.create_map([F.lit(x) for x in chain(*licenses_dict.items())])
    flag_mapping_expr = F.create_map([F.lit(x) for x in chain(*flags_dict.items())])

    return (
        out_df
        .withColumn('year', F.lit(year))
        .withColumn('month', F.lit(month))
        .withColumn('type', license_mapping_expr[F.col('hvfhs_license_num')])
        .withColumn('shared', flag_mapping_expr[F.col('shared_request_flag')])
        .select(OUT_COL_NAMES)
    )


In [10]:
def aggregate_trips(df: DataFrame, year:int, month:int, taxi_type: str) -> DataFrame:
    """ Aggregate one month of trips with the function matching the dataset.

    Args:
        df (DataFrame): Dataset to aggregate
        year (int): Year of df
        month (int): Month of df
        taxi_type (str): Name of the tlc dataset which df comes from

    Returns:
        DataFrame: Result of `aggregate_trips_fhvhv` when `taxi_type` is
        'fhvhv', otherwise the result of `aggregate_trips_green_yellow`
    """
    if taxi_type != 'fhvhv':
        return aggregate_trips_green_yellow(df, year, month, taxi_type)
    return aggregate_trips_fhvhv(df, year, month)

In [11]:
# TLC_NAMES = ['green', 'yellow', 'fhvhv']
# aggregated_df = None

# for name in TLC_NAMES:
#     if DEBUGGING:
#         print(f'\nAGGREGATING "{name}" DATA')
#     aggregated_df = None
#     for filename in os.listdir(f'../data/raw/tlc/{name}'):
#         tlc_df = spark.read.parquet(f'../data/raw/tlc/{name}/{filename}')
#         filedata = re.split(r'[-.]', filename)
#         if aggregated_df == None:
#             aggregated_df = aggregate_trips(tlc_df, int(filedata[0]), int(filedata[1]), name)
#         else:
#             aggregated_df = aggregated_df.union(aggregate_trips(tlc_df, int(filedata[0]), int(filedata[1]), name))
#     print(aggregated_df.limit(5))

#     if DEBUGGING:
#         print(aggregated_df.count())

#     aggregated_df.write.mode('overwrite').parquet(f'../data/curated/tlc/aggregated/{name}')

In [12]:
# only these datasets are aggregated (this replaces the earlier TLC_NAMES, dropping 'fhv')
TLC_NAMES = ['green', 'yellow', 'fhvhv']
aggregated_df = None

for name in TLC_NAMES:
    if DEBUGGING:
        print(f'\nAGGREGATING "{name}" DATA')

    # union of the monthly aggregates of this dataset
    temp_df = None
    # sorted: os.listdir returns the files in arbitrary order
    for filename in sorted(os.listdir(f'../data/raw/tlc/{name}')):
        tlc_df = spark.read.parquet(f'../data/raw/tlc/{name}/{filename}')
        # the first two '-'/'.' separated parts of the filename are read as year and month
        filedata = re.split(r'[-.]', filename)
        monthly_df = aggregate_trips(tlc_df, int(filedata[0]), int(filedata[1]), name)
        temp_df = monthly_df if temp_df is None else temp_df.union(monthly_df)

    if DEBUGGING:
        print(temp_df.count())

    # append this dataset's aggregates to the overall result
    aggregated_df = temp_df if aggregated_df is None else aggregated_df.union(temp_df)

if DEBUGGING:
    print(aggregated_df.count())

# save the aggregated data
aggregated_df.write.mode('overwrite').parquet('../data/curated/tlc/aggregated')


AGGREGATING "green" DATA


                                                                                

3825

AGGREGATING "yellow" DATA


                                                                                

4980

AGGREGATING "fhvhv" DATA


                                                                                

3626


                                                                                

12431
22/08/06 01:10:10 WARN MemoryManager: Total allocation exceeds 95.00% (997,143,335 bytes) of heap memory
Scaling row group sizes to 92.87% for 8 writers
22/08/06 01:10:10 WARN MemoryManager: Total allocation exceeds 95.00% (997,143,335 bytes) of heap memory
Scaling row group sizes to 82.55% for 9 writers
22/08/06 01:10:10 WARN MemoryManager: Total allocation exceeds 95.00% (997,143,335 bytes) of heap memory
Scaling row group sizes to 74.29% for 10 writers
22/08/06 01:10:10 WARN MemoryManager: Total allocation exceeds 95.00% (997,143,335 bytes) of heap memory
Scaling row group sizes to 67.54% for 11 writers
22/08/06 01:10:10 WARN MemoryManager: Total allocation exceeds 95.00% (997,143,335 bytes) of heap memory
Scaling row group sizes to 61.91% for 12 writers
22/08/06 01:10:10 WARN MemoryManager: Total allocation exceeds 95.00% (997,143,335 bytes) of heap memory
Scaling row group sizes to 57.15% for 13 writers
22/08/06 01:10:10 WARN MemoryManager: Total allocation exceeds 95.00% (9



22/08/06 01:10:13 WARN MemoryManager: Total allocation exceeds 95.00% (997,143,335 bytes) of heap memory
Scaling row group sizes to 92.87% for 8 writers
22/08/06 01:10:13 WARN MemoryManager: Total allocation exceeds 95.00% (997,143,335 bytes) of heap memory
Scaling row group sizes to 82.55% for 9 writers
22/08/06 01:10:13 WARN MemoryManager: Total allocation exceeds 95.00% (997,143,335 bytes) of heap memory
Scaling row group sizes to 74.29% for 10 writers
22/08/06 01:10:13 WARN MemoryManager: Total allocation exceeds 95.00% (997,143,335 bytes) of heap memory
Scaling row group sizes to 67.54% for 11 writers
22/08/06 01:10:13 WARN MemoryManager: Total allocation exceeds 95.00% (997,143,335 bytes) of heap memory
Scaling row group sizes to 61.91% for 12 writers
22/08/06 01:10:13 WARN MemoryManager: Total allocation exceeds 95.00% (997,143,335 bytes) of heap memory
Scaling row group sizes to 57.15% for 13 writers
22/08/06 01:10:13 WARN MemoryManager: Total allocation exceeds 95.00% (997,143

                                                                                

### 2: Aggregating the COVID Data.

In [38]:
# read in the curated covid dataset (cases by day)
covid_df = spark.read.parquet('../data/curated/covid/cases-by-day')

In [39]:
# check that the data is read in correctly by previewing the first 5 rows
covid_df.limit(5)

year,month,date_of_interest,CASE_COUNT,PROBABLE_CASE_COUNT,HOSPITALIZED_COUNT,DEATH_COUNT,PROBABLE_DEATH_COUNT,CASE_COUNT_7DAY_AVG,ALL_CASE_COUNT_7DAY_AVG,HOSP_COUNT_7DAY_AVG,DEATH_COUNT_7DAY_AVG,ALL_DEATH_COUNT_7DAY_AVG,BX_CASE_COUNT,BX_PROBABLE_CASE_COUNT,BX_HOSPITALIZED_COUNT,BX_DEATH_COUNT,BX_PROBABLE_DEATH_COUNT,BX_CASE_COUNT_7DAY_AVG,BX_PROBABLE_CASE_COUNT_7DAY_AVG,BX_ALL_CASE_COUNT_7DAY_AVG,BX_HOSPITALIZED_COUNT_7DAY_AVG,BX_DEATH_COUNT_7DAY_AVG,BX_ALL_DEATH_COUNT_7DAY_AVG,BK_CASE_COUNT,BK_PROBABLE_CASE_COUNT,BK_HOSPITALIZED_COUNT,BK_DEATH_COUNT,BK_PROBABLE_DEATH_COUNT,BK_CASE_COUNT_7DAY_AVG,BK_PROBABLE_CASE_COUNT_7DAY_AVG,BK_ALL_CASE_COUNT_7DAY_AVG,BK_HOSPITALIZED_COUNT_7DAY_AVG,BK_DEATH_COUNT_7DAY_AVG,BK_ALL_DEATH_COUNT_7DAY_AVG,MN_CASE_COUNT,MN_PROBABLE_CASE_COUNT,MN_HOSPITALIZED_COUNT,MN_DEATH_COUNT,MN_PROBABLE_DEATH_COUNT,MN_CASE_COUNT_7DAY_AVG,MN_PROBABLE_CASE_COUNT_7DAY_AVG,MN_ALL_CASE_COUNT_7DAY_AVG,MN_HOSPITALIZED_COUNT_7DAY_AVG,MN_DEATH_COUNT_7DAY_AVG,MN_ALL_DEATH_COUNT_7DAY_AVG,QN_CASE_COUNT,QN_PROBABLE_CASE_COUNT,QN_HOSPITALIZED_COUNT,QN_DEATH_COUNT,QN_PROBABLE_DEATH_COUNT,QN_CASE_COUNT_7DAY_AVG,QN_PROBABLE_CASE_COUNT_7DAY_AVG,QN_ALL_CASE_COUNT_7DAY_AVG,QN_HOSPITALIZED_COUNT_7DAY_AVG,QN_DEATH_COUNT_7DAY_AVG,QN_ALL_DEATH_COUNT_7DAY_AVG,SI_CASE_COUNT,SI_PROBABLE_CASE_COUNT,SI_HOSPITALIZED_COUNT,SI_DEATH_COUNT,SI_PROBABLE_DEATH_COUNT,SI_CASE_COUNT_7DAY_AVG,SI_PROBABLE_CASE_COUNT_7DAY_AVG,SI_ALL_CASE_COUNT_7DAY_AVG,SI_HOSPITALIZED_COUNT_7DAY_AVG,SI_DEATH_COUNT_7DAY_AVG,SI_ALL_DEATH_COUNT_7DAY_AVG,INCOMPLETE
2020,2,02/29/2020,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2020,3,03/01/2020,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2020,3,03/02/2020,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2020,3,03/03/2020,1,0,7,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2020,3,03/04/2020,5,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [40]:
# maps the borough code which prefixes a covid column name to the borough name
BOROUGH_PREFIXES = {
    '': 'Overall',  # columns without a prefix
    'BX_': 'Bronx',
    'BK_': 'Brooklyn',
    'MN_': 'Manhattan',
    'QN_': 'Queens',
    'SI_': 'Staten Island',
}

In [41]:
# suffixes of the covid column names to keep (each one is combined with every borough prefix)
COL_SUFFIXES = ['CASE_COUNT', 'HOSPITALIZED_COUNT', 'DEATH_COUNT']

In [42]:
# build the names of the columns to aggregate: every borough prefix
# combined with every kept suffix (prefix-major order)
from itertools import product

COL_AGGREGATES = [
    pref + suff
    for pref, suff in product(BOROUGH_PREFIXES.keys(), COL_SUFFIXES)
]

In [46]:
# the columns which the covid data is grouped by
COL_GROUPS = ['year', 'month']

In [47]:
# columns kept from the covid dataset: the grouping columns
# followed by the count columns to aggregate
COL_CHOSEN = COL_GROUPS + COL_AGGREGATES

In [53]:
# select only these columns from the covid dataset
# (every column not listed in COL_CHOSEN is dropped)
covid_df = covid_df.select(COL_CHOSEN)

In [87]:
grouped_df = covid_df.groupBy(COL_GROUPS)

# one row per (year, month): the number of daily rows in the month plus the
# total and the daily average of every chosen count column
monthly_agg_df = grouped_df.agg(
    F.count('*').alias('num_days'),
    *[F.sum(chosen).alias(f'TOTAL_{chosen}') for chosen in COL_AGGREGATES],
    *[F.avg(chosen).alias(f'AVG_{chosen}') for chosen in COL_AGGREGATES]
)

# force the aggregate into memory and rebuild it with the aggregate's own
# schema (re-inferring the schema from the collected rows fails when there
# are no rows or when a column only holds nulls)
aggregated_rows = monthly_agg_df.collect()

aggregated_df = spark.createDataFrame(
    aggregated_rows, schema=monthly_agg_df.schema
).sort('year', 'month')
aggregated_df.limit(5)

year,month,num_days,TOTAL_CASE_COUNT,TOTAL_HOSPITALIZED_COUNT,TOTAL_DEATH_COUNT,TOTAL_BX_CASE_COUNT,TOTAL_BX_HOSPITALIZED_COUNT,TOTAL_BX_DEATH_COUNT,TOTAL_BK_CASE_COUNT,TOTAL_BK_HOSPITALIZED_COUNT,TOTAL_BK_DEATH_COUNT,TOTAL_MN_CASE_COUNT,TOTAL_MN_HOSPITALIZED_COUNT,TOTAL_MN_DEATH_COUNT,TOTAL_QN_CASE_COUNT,TOTAL_QN_HOSPITALIZED_COUNT,TOTAL_QN_DEATH_COUNT,TOTAL_SI_CASE_COUNT,TOTAL_SI_HOSPITALIZED_COUNT,TOTAL_SI_DEATH_COUNT,AVG_CASE_COUNT,AVG_HOSPITALIZED_COUNT,AVG_DEATH_COUNT,AVG_BX_CASE_COUNT,AVG_BX_HOSPITALIZED_COUNT,AVG_BX_DEATH_COUNT,AVG_BK_CASE_COUNT,AVG_BK_HOSPITALIZED_COUNT,AVG_BK_DEATH_COUNT,AVG_MN_CASE_COUNT,AVG_MN_HOSPITALIZED_COUNT,AVG_MN_DEATH_COUNT,AVG_QN_CASE_COUNT,AVG_QN_HOSPITALIZED_COUNT,AVG_QN_DEATH_COUNT,AVG_SI_CASE_COUNT,AVG_SI_HOSPITALIZED_COUNT,AVG_SI_DEATH_COUNT
2020,2,1,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020,3,31,65182,18428,2183,12681,3805,467,18516,5319,662,9538,2730,287,19983,6000,656,4461,750,111,2102.6451612903224,594.4516129032259,70.41935483870968,409.06451612903226,122.74193548387096,15.064516129032258,597.2903225806451,171.5806451612903,21.35483870967742,307.6774193548387,88.06451612903226,9.258064516129032,644.6129032258065,193.5483870967742,21.161290322580644,143.90322580645162,24.193548387096776,3.5806451612903225
2020,4,30,109296,27567,12712,26882,6235,2626,28741,7739,3857,11741,3863,1623,34120,8920,4041,7808,1263,565,3643.2,918.9,423.73333333333335,896.0666666666667,207.83333333333331,87.53333333333333,958.0333333333332,257.96666666666664,128.56666666666666,391.3666666666667,128.76666666666668,54.1,1137.3333333333333,297.3333333333333,134.7,260.26666666666665,42.1,18.83333333333333
2020,5,31,28417,3911,2816,6494,944,581,8654,1174,790,3795,545,391,8185,1079,900,1289,231,154,916.6774193548388,126.16129032258064,90.83870967741936,209.48387096774192,30.451612903225808,18.741935483870968,279.16129032258067,37.87096774193548,25.483870967741936,122.41935483870968,17.580645161290324,12.612903225806452,264.03225806451616,34.806451612903224,29.032258064516128,41.58064516129032,7.451612903225806,4.967741935483871
2020,6,30,10844,1528,675,2248,316,149,3111,549,179,1733,208,98,3219,417,213,533,55,36,361.4666666666666,50.93333333333333,22.5,74.93333333333334,10.533333333333331,4.966666666666667,103.7,18.3,5.966666666666667,57.766666666666666,6.933333333333334,3.2666666666666666,107.3,13.9,7.1,17.766666666666666,1.8333333333333333,1.2


In [88]:
# reshape from one column per (borough, statistic) to one row per
# (year, month, borough): each borough's columns are selected under
# borough-independent lowercase names and the per-borough frames are stacked
aggregated_borough_df = None
for prefix, borough_name in BOROUGH_PREFIXES.items():
    temp_df = aggregated_df\
        .select(
            [F.col(col_name) 
                for col_name in COL_GROUPS + ['num_days']] + 
            [F.col(f'TOTAL_{prefix}{suffix}').alias(f'TOTAL_{suffix}'.lower()) 
                for suffix in COL_SUFFIXES] + 
            [F.col(f'AVG_{prefix}{suffix}').alias(f'AVG_{suffix}'.lower()) 
                for suffix in COL_SUFFIXES])\
        .withColumn('borough', F.lit(borough_name))

    # `is None` is the identity check meant here
    if aggregated_borough_df is None:
        aggregated_borough_df = temp_df
    else:
        aggregated_borough_df = aggregated_borough_df.union(temp_df)

# save the per-borough monthly covid data
aggregated_borough_df.write.mode('overwrite').parquet('../data/curated/covid/cases-by-month')

+----+-----+--------+----------------+------------------------+-----------------+------------------+----------------------+------------------+-------------+
|year|month|num_days|total_case_count|total_hospitalized_count|total_death_count|    avg_case_count|avg_hospitalized_count|   avg_death_count|      borough|
+----+-----+--------+----------------+------------------------+-----------------+------------------+----------------------+------------------+-------------+
|2020|    2|       1|               0|                       0|                0|               0.0|                   0.0|               0.0|        Bronx|
|2020|    2|       1|               0|                       1|                0|               0.0|                   1.0|               0.0|     Brooklyn|
|2020|    2|       1|               1|                       0|                0|               1.0|                   0.0|               0.0|    Manhattan|
|2020|    2|       1|               1|                    