### MAST30034: Applied Data Science Project 1
---
# Preprocessing Part 1: Cleaning The Data
#### Xavier Travers (1178369)

This notebook cleans the datasets by removing null, inconsistent, or unnecessary values.
The cleaning is applied to both the TLC data and the COVID data.

In [1]:
# imports used throughout this notebook
from collections import defaultdict
from itertools import product
import os
import sys
from pyspark.sql import DataFrame, Column
from pyspark.sql.types import IntegerType
from pyspark.sql import functions as F

# add homemade helpers
sys.path.insert(1, '../scripts')
import helpers.cleaning_helpers as ch
import helpers.join_helpers as jh

# Toggle for the optional sanity-check printouts guarded by this flag below.
# Leave as False to skip them and save time when running the notebook.
INTERMEDIATE_OUTPUTS: bool = False

In [2]:
from pyspark.sql import SparkSession

# Settings applied to the Spark session, in the order they are set.
SPARK_SETTINGS = (
    ("spark.sql.session.timeZone", "Etc/UTC"),
    # eager evaluation lets a bare DataFrame expression render as a table
    ('spark.sql.repl.eagerEval.enabled', True),
    ('spark.sql.parquet.cacheMetadata', 'true'),
    ("spark.executor.memory", "2g"),
    ("spark.driver.memory", "4g"),
)

# Build (or reuse) the spark session which will run the spark jobs
spark_builder = SparkSession.builder.appName('MAST30034 XT Project 1')
for setting_name, setting_value in SPARK_SETTINGS:
    spark_builder = spark_builder.config(setting_name, setting_value)
spark = spark_builder.getOrCreate()

22/08/14 00:21:47 WARN Utils: Your hostname, Polaris resolves to a loopback address: 127.0.1.1; using 172.26.235.73 instead (on interface eth0)
22/08/14 00:21:47 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/14 00:21:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/08/14 00:21:49 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/08/14 00:21:49 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/08/14 00:21:49 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
22/08/14 00:21:49 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.


In [3]:
# Load the CDC (MMWR) week lookup table; it is passed to the cleaning helper
# later on so that dates can be converted to CDC weeks.
MMWR_WEEKS_PATH = '../data/raw/virals/mmwr_weeks.parquet'
mmwr_weeks_df = spark.read.parquet(MMWR_WEEKS_PATH)

# preview the first few rows of the lookup
mmwr_weeks_df.limit(5)

year,month,day,cdc_week,week_index,us_format,week_ending,week_month,week_year,timeline
2017,12,31,1,1,12/31/2017,2018-01-06,1,2018,neither
2018,1,1,1,1,01/01/2018,2018-01-06,1,2018,neither
2018,1,2,1,1,01/02/2018,2018-01-06,1,2018,neither
2018,1,3,1,1,01/03/2018,2018-01-06,1,2018,neither
2018,1,4,1,1,01/04/2018,2018-01-06,1,2018,neither


In [4]:
# Load the TLC zones lookup; the first row of the CSV holds the column names.
zones_df = (
    spark.read
    .option('header', True)
    .csv('../data/raw/tlc_zones/zones.csv')
)

### 1. Cleaning the TLC dataset(s)

In [5]:
if INTERMEDIATE_OUTPUTS:
    # Peek at a single raw month of yellow-taxi data to see what is being cleaned.
    example_df = spark.read.parquet('../data/raw/tlc/yellow/2019-07.parquet/')
    # Jupyter only auto-renders the LAST TOP-LEVEL expression of a cell; a bare
    # expression nested inside this `if` is silently discarded, so the preview
    # has to be displayed explicitly.
    display(example_df.limit(5))

In [6]:
if INTERMEDIATE_OUTPUTS:
    # Show the rows with the largest recorded trip distances in the example month.
    # display() is required: a bare expression nested inside an `if` is never
    # auto-rendered by Jupyter (only a cell's last top-level expression is).
    display(example_df.sort('trip_distance', ascending = False).limit(5))

In [7]:
# TLC datasets to clean. Only yellow taxis are used
# (I was originally planning on working on fhvhv and green as well).
TLC_NAMES = ['yellow']

# Columns to keep, mapped as raw column name -> cleaned column name.
# Insertion order is preserved.
TLC_KEEP_COLUMNS = dict(
    tpep_pickup_datetime='date',
    passenger_count='passengers',
    trip_distance='trip_distance',
    PULocationID='pu_location_id',
    DOLocationID='do_location_id',
    hours_elapsed='hours_elapsed',
    # the mappings below would only apply to fhvhv (currently unused):
    #   hvfhs_license_num   -> fhvhv_license
    #   pickup_datetime     -> date
    #   trip_miles          -> trip_distance
    #   shared_request_flag -> shared
)

# Filters required for each (renamed) column, keyed by cleaned column name.
# The location ids and passenger count only need to be non-null ...
TLC_CLEAN_COLUMNS = {
    column_name: [ch.non_null]
    for column_name in ('pu_location_id', 'do_location_id', 'passengers')
}
# ... while the trip distance must additionally be non-negative.
TLC_CLEAN_COLUMNS['trip_distance'] = [ch.non_null, ch.non_negative]
# (fhvhv only, currently unused: 'fhvhv_license' would need [ch.non_null])

In [8]:
# Read in the TLC trip data through the project's join helper.
# NOTE(review): the helper's source is not visible here; judging by the query plan
# recorded further down this notebook it appears to union many monthly parquet
# files into a single DataFrame -- confirm in helpers/join_helpers.py.
tlc_df = jh.read_stacked_tlc_df(spark)

In [9]:
if INTERMEDIATE_OUTPUTS:
    # Sanity check: preview the first rows of the stacked TLC data.
    # display() is required: a bare expression nested inside an `if` is never
    # auto-rendered by Jupyter (only a cell's last top-level expression is).
    display(tlc_df.limit(5))

In [10]:
# Derive extra values which are used to filter out invalid trips.
SECONDS_TO_HOURS = 1 / (60*60)

# pickup / dropoff timestamps cast to long so they can be subtracted
pickup_seconds = F.col('tpep_pickup_datetime').cast("long")
dropoff_seconds = F.col("tpep_dropoff_datetime").cast("long")

tlc_df = (
    tlc_df
    # trip duration in hours
    .withColumn('hours_elapsed', (dropoff_seconds - pickup_seconds) * SECONDS_TO_HOURS)
    # average speed over the trip (trip_distance per hour elapsed)
    .withColumn('mph', F.col('trip_distance') / F.col('hours_elapsed'))
)

In [11]:
if INTERMEDIATE_OUTPUTS:
    tlc_df.limit(5) 

In [12]:
# https://ypdcrime.com/vt/article30.php?zoom_highlight=fifty+five+miles+per+hour#t1180-a.
# As per: https://www.dot.ny.gov/divisions/operating/oom/transportation-systems/repository/TSMI-17-05.pdf
# the NYS maximum speed limit is 65 mph. filter out trips faster than legal.
NYS_MAX_SPEED_MPH = 65

tlc_df = tlc_df.where(
    # A trip must take a strictly positive amount of time. Without this check a
    # drop-off recorded BEFORE the pick-up yields a negative duration and hence a
    # negative mph, which would slip through the `<= limit` test below and leave
    # negative `hours_elapsed` values in the cleaned data.
    (F.col('hours_elapsed') > 0) &
    # drop rows where the speed could not be computed at all
    (F.col('mph').isNotNull()) &
    (F.col('mph') <= NYS_MAX_SPEED_MPH)
)

In [13]:
if INTERMEDIATE_OUTPUTS:
    # This one is time intensive: it sorts the whole filtered TLC frame to show
    # the longest remaining trips.
    # display() is required: a bare expression nested inside an `if` is never
    # auto-rendered by Jupyter (only a cell's last top-level expression is).
    display(tlc_df.sort('trip_distance', ascending = False).limit(5))

In [14]:
# Apply the column renames and per-column filters configured above, passing the
# MMWR week lookup along to the helper.
# NOTE(review): the output recorded for this cell is an AnalysisException
# ("Column 'week_index' does not exist"). In the recorded plan the final
# projection selects 'week_index' after an earlier projection has already
# dropped it, so the fix belongs in helpers/cleaning_helpers.py.
tlc_df = ch.perform_cleaning(
    tlc_df,
    mmwr_weeks_df,
    TLC_KEEP_COLUMNS,
    TLC_CLEAN_COLUMNS,
)

AnalysisException: Column 'week_index' does not exist. Did you mean one of the following? [week_ending, week_month, week_year, timeline, date, day, month, passengers, year, trip_distance, hours_elapsed, do_location_id, pu_location_id];
'Project [year#2470, month#2453, day#2461, week_ending#6, week_year#8L, week_month#7L, 'week_index, timeline#9, date#2446, passengers#2411, trip_distance#2418, pu_location_id#2425L, do_location_id#2432L, hours_elapsed#2439]
+- Project [week_ending#6, week_year#8L, week_month#7L, timeline#9, date#2446, passengers#2411, trip_distance#2418, pu_location_id#2425L, do_location_id#2432L, hours_elapsed#2439, month#2453, day#2461, year#2470]
   +- Project [day#2461, month#2453, year#2470, date#2446, passengers#2411, trip_distance#2418, pu_location_id#2425L, do_location_id#2432L, hours_elapsed#2439, cdc_week#3L, week_index#4L, us_format#5, week_ending#6, week_month#7L, week_year#8L, timeline#9]
      +- Join Inner, (((cast(day#2461 as bigint) = day#2L) AND (cast(month#2453 as bigint) = month#1L)) AND (cast(year#2470 as bigint) = year#0L))
         :- Project [date#2446, passengers#2411, trip_distance#2418, pu_location_id#2425L, do_location_id#2432L, hours_elapsed#2439, month#2453, day#2461, cast(substring(date#2446, 7, 4) as int) AS year#2470]
         :  +- Project [date#2446, passengers#2411, trip_distance#2418, pu_location_id#2425L, do_location_id#2432L, hours_elapsed#2439, month#2453, cast(substring(date#2446, 4, 2) as int) AS day#2461]
         :     +- Project [date#2446, passengers#2411, trip_distance#2418, pu_location_id#2425L, do_location_id#2432L, hours_elapsed#2439, cast(substring(date#2446, 1, 2) as int) AS month#2453]
         :        +- Project [date_format(date#2404, MM/dd/yyyy, Some(Etc/UTC)) AS date#2446, passengers#2411, trip_distance#2418, pu_location_id#2425L, do_location_id#2432L, hours_elapsed#2439]
         :           +- Filter (trip_distance#2418 >= cast(0 as double))
         :              +- Filter isnotnull(trip_distance#2418)
         :                 +- Filter isnotnull(passengers#2411)
         :                    +- Filter isnotnull(do_location_id#2432L)
         :                       +- Filter isnotnull(pu_location_id#2425L)
         :                          +- Project [date#2404, passengers#2411, trip_distance#2418, pu_location_id#2425L, do_location_id#2432L, hours_elapsed#2355 AS hours_elapsed#2439]
         :                             +- Project [date#2404, passengers#2411, trip_distance#2418, pu_location_id#2425L, DOLocationID#136L AS do_location_id#2432L, hours_elapsed#2355]
         :                                +- Project [date#2404, passengers#2411, trip_distance#2418, PULocationID#135L AS pu_location_id#2425L, DOLocationID#136L, hours_elapsed#2355]
         :                                   +- Project [date#2404, passengers#2411, trip_distance#132 AS trip_distance#2418, PULocationID#135L, DOLocationID#136L, hours_elapsed#2355]
         :                                      +- Project [date#2404, passenger_count#440 AS passengers#2411, trip_distance#132, PULocationID#135L, DOLocationID#136L, hours_elapsed#2355]
         :                                         +- Project [tpep_pickup_datetime#129 AS date#2404, passenger_count#440, trip_distance#132, PULocationID#135L, DOLocationID#136L, hours_elapsed#2355]
         :                                            +- Project [tpep_pickup_datetime#129, passenger_count#440, trip_distance#132, PULocationID#135L, DOLocationID#136L, hours_elapsed#2355]
         :                                               +- Filter (isnotnull(mph#2376) AND (mph#2376 <= cast(65 as double)))
         :                                                  +- Project [VendorID#128L, tpep_pickup_datetime#129, tpep_dropoff_datetime#130, passenger_count#440, trip_distance#132, RatecodeID#441, store_and_fwd_flag#134, PULocationID#135L, DOLocationID#136L, payment_type#137L, fare_amount#138, extra#139, mta_tax#140, tip_amount#141, tolls_amount#142, improvement_surcharge#143, total_amount#144, congestion_surcharge#375, airport_fee#1884, hours_elapsed#2355, (trip_distance#132 / hours_elapsed#2355) AS mph#2376]
         :                                                     +- Project [VendorID#128L, tpep_pickup_datetime#129, tpep_dropoff_datetime#130, passenger_count#440, trip_distance#132, RatecodeID#441, store_and_fwd_flag#134, PULocationID#135L, DOLocationID#136L, payment_type#137L, fare_amount#138, extra#139, mta_tax#140, tip_amount#141, tolls_amount#142, improvement_surcharge#143, total_amount#144, congestion_surcharge#375, airport_fee#1884, (cast((cast(tpep_dropoff_datetime#130 as bigint) - cast(tpep_pickup_datetime#129 as bigint)) as double) * 2.777777777777778E-4) AS hours_elapsed#2355]
         :                                                        +- Union false, false
         :                                                           :- Project [VendorID#128L, tpep_pickup_datetime#129, tpep_dropoff_datetime#130, cast(passenger_count#131L as double) AS passenger_count#440, trip_distance#132, cast(RatecodeID#133L as double) AS RatecodeID#441, store_and_fwd_flag#134, PULocationID#135L, DOLocationID#136L, payment_type#137L, fare_amount#138, extra#139, mta_tax#140, tip_amount#141, tolls_amount#142, improvement_surcharge#143, total_amount#144, cast(congestion_surcharge#145 as double) AS congestion_surcharge#375, cast(airport_fee#146 as double) AS airport_fee#1884]
         :                                                           :  +- Relation [VendorID#128L,tpep_pickup_datetime#129,tpep_dropoff_datetime#130,passenger_count#131L,trip_distance#132,RatecodeID#133L,store_and_fwd_flag#134,PULocationID#135L,DOLocationID#136L,payment_type#137L,fare_amount#138,extra#139,mta_tax#140,tip_amount#141,tolls_amount#142,improvement_surcharge#143,total_amount#144,congestion_surcharge#145,airport_fee#146] parquet
         :                                                           :- Project [VendorID#166L, tpep_pickup_datetime#167, tpep_dropoff_datetime#168, cast(passenger_count#169L as double) AS passenger_count#442, trip_distance#170, cast(RatecodeID#171L as double) AS RatecodeID#443, store_and_fwd_flag#172, PULocationID#173L, DOLocationID#174L, payment_type#175L, fare_amount#176, extra#177, mta_tax#178, tip_amount#179, tolls_amount#180, improvement_surcharge#181, total_amount#182, cast(congestion_surcharge#183 as double) AS congestion_surcharge#376, cast(airport_fee#184 as double) AS airport_fee#1885]
         :                                                           :  +- Relation [VendorID#166L,tpep_pickup_datetime#167,tpep_dropoff_datetime#168,passenger_count#169L,trip_distance#170,RatecodeID#171L,store_and_fwd_flag#172,PULocationID#173L,DOLocationID#174L,payment_type#175L,fare_amount#176,extra#177,mta_tax#178,tip_amount#179,tolls_amount#180,improvement_surcharge#181,total_amount#182,congestion_surcharge#183,airport_fee#184] parquet
         :                                                           :- Project [VendorID#223L, tpep_pickup_datetime#224, tpep_dropoff_datetime#225, cast(passenger_count#226L as double) AS passenger_count#444, trip_distance#227, cast(RatecodeID#228L as double) AS RatecodeID#445, store_and_fwd_flag#229, PULocationID#230L, DOLocationID#231L, payment_type#232L, fare_amount#233, extra#234, mta_tax#235, tip_amount#236, tolls_amount#237, improvement_surcharge#238, total_amount#239, cast(congestion_surcharge#240 as double) AS congestion_surcharge#377, cast(airport_fee#241 as double) AS airport_fee#1886]
         :                                                           :  +- Relation [VendorID#223L,tpep_pickup_datetime#224,tpep_dropoff_datetime#225,passenger_count#226L,trip_distance#227,RatecodeID#228L,store_and_fwd_flag#229,PULocationID#230L,DOLocationID#231L,payment_type#232L,fare_amount#233,extra#234,mta_tax#235,tip_amount#236,tolls_amount#237,improvement_surcharge#238,total_amount#239,congestion_surcharge#240,airport_fee#241] parquet
         :                                                           :- Project [VendorID#280L, tpep_pickup_datetime#281, tpep_dropoff_datetime#282, cast(passenger_count#283L as double) AS passenger_count#446, trip_distance#284, cast(RatecodeID#285L as double) AS RatecodeID#447, store_and_fwd_flag#286, PULocationID#287L, DOLocationID#288L, payment_type#289L, fare_amount#290, extra#291, mta_tax#292, tip_amount#293, tolls_amount#294, improvement_surcharge#295, total_amount#296, cast(congestion_surcharge#297 as double) AS congestion_surcharge#378, cast(airport_fee#298 as double) AS airport_fee#1887]
         :                                                           :  +- Relation [VendorID#280L,tpep_pickup_datetime#281,tpep_dropoff_datetime#282,passenger_count#283L,trip_distance#284,RatecodeID#285L,store_and_fwd_flag#286,PULocationID#287L,DOLocationID#288L,payment_type#289L,fare_amount#290,extra#291,mta_tax#292,tip_amount#293,tolls_amount#294,improvement_surcharge#295,total_amount#296,congestion_surcharge#297,airport_fee#298] parquet
         :                                                           :- Project [VendorID#337L, tpep_pickup_datetime#338, tpep_dropoff_datetime#339, cast(passenger_count#340L as double) AS passenger_count#448, trip_distance#341, cast(RatecodeID#342L as double) AS RatecodeID#449, store_and_fwd_flag#343, PULocationID#344L, DOLocationID#345L, payment_type#346L, fare_amount#347, extra#348, mta_tax#349, tip_amount#350, tolls_amount#351, improvement_surcharge#352, total_amount#353, congestion_surcharge#354, cast(airport_fee#355 as double) AS airport_fee#1888]
         :                                                           :  +- Relation [VendorID#337L,tpep_pickup_datetime#338,tpep_dropoff_datetime#339,passenger_count#340L,trip_distance#341,RatecodeID#342L,store_and_fwd_flag#343,PULocationID#344L,DOLocationID#345L,payment_type#346L,fare_amount#347,extra#348,mta_tax#349,tip_amount#350,tolls_amount#351,improvement_surcharge#352,total_amount#353,congestion_surcharge#354,airport_fee#355] parquet
         :                                                           :- Project [VendorID#402L, tpep_pickup_datetime#403, tpep_dropoff_datetime#404, passenger_count#405, trip_distance#406, RatecodeID#407, store_and_fwd_flag#408, PULocationID#409L, DOLocationID#410L, payment_type#411L, fare_amount#412, extra#413, mta_tax#414, tip_amount#415, tolls_amount#416, improvement_surcharge#417, total_amount#418, cast(congestion_surcharge#419 as double) AS congestion_surcharge#450, cast(airport_fee#420 as double) AS airport_fee#1889]
         :                                                           :  +- Relation [VendorID#402L,tpep_pickup_datetime#403,tpep_dropoff_datetime#404,passenger_count#405,trip_distance#406,RatecodeID#407,store_and_fwd_flag#408,PULocationID#409L,DOLocationID#410L,payment_type#411L,fare_amount#412,extra#413,mta_tax#414,tip_amount#415,tolls_amount#416,improvement_surcharge#417,total_amount#418,congestion_surcharge#419,airport_fee#420] parquet
         :                                                           :- Project [VendorID#472L, tpep_pickup_datetime#473, tpep_dropoff_datetime#474, passenger_count#475, trip_distance#476, RatecodeID#477, store_and_fwd_flag#478, PULocationID#479L, DOLocationID#480L, payment_type#481L, fare_amount#482, extra#483, mta_tax#484, tip_amount#485, tolls_amount#486, improvement_surcharge#487, total_amount#488, cast(congestion_surcharge#489 as double) AS congestion_surcharge#510, cast(airport_fee#490 as double) AS airport_fee#1890]
         :                                                           :  +- Relation [VendorID#472L,tpep_pickup_datetime#473,tpep_dropoff_datetime#474,passenger_count#475,trip_distance#476,RatecodeID#477,store_and_fwd_flag#478,PULocationID#479L,DOLocationID#480L,payment_type#481L,fare_amount#482,extra#483,mta_tax#484,tip_amount#485,tolls_amount#486,improvement_surcharge#487,total_amount#488,congestion_surcharge#489,airport_fee#490] parquet
         :                                                           :- Project [VendorID#531L, tpep_pickup_datetime#532, tpep_dropoff_datetime#533, passenger_count#534, trip_distance#535, RatecodeID#536, store_and_fwd_flag#537, PULocationID#538L, DOLocationID#539L, payment_type#540L, fare_amount#541, extra#542, mta_tax#543, tip_amount#544, tolls_amount#545, improvement_surcharge#546, total_amount#547, congestion_surcharge#548, cast(airport_fee#549 as double) AS airport_fee#1891]
         :                                                           :  +- Relation [VendorID#531L,tpep_pickup_datetime#532,tpep_dropoff_datetime#533,passenger_count#534,trip_distance#535,RatecodeID#536,store_and_fwd_flag#537,PULocationID#538L,DOLocationID#539L,payment_type#540L,fare_amount#541,extra#542,mta_tax#543,tip_amount#544,tolls_amount#545,improvement_surcharge#546,total_amount#547,congestion_surcharge#548,airport_fee#549] parquet
         :                                                           :- Project [VendorID#588L, tpep_pickup_datetime#589, tpep_dropoff_datetime#590, passenger_count#591, trip_distance#592, RatecodeID#593, store_and_fwd_flag#594, PULocationID#595L, DOLocationID#596L, payment_type#597L, fare_amount#598, extra#599, mta_tax#600, tip_amount#601, tolls_amount#602, improvement_surcharge#603, total_amount#604, cast(congestion_surcharge#605 as double) AS congestion_surcharge#626, cast(airport_fee#606 as double) AS airport_fee#1892]
         :                                                           :  +- Relation [VendorID#588L,tpep_pickup_datetime#589,tpep_dropoff_datetime#590,passenger_count#591,trip_distance#592,RatecodeID#593,store_and_fwd_flag#594,PULocationID#595L,DOLocationID#596L,payment_type#597L,fare_amount#598,extra#599,mta_tax#600,tip_amount#601,tolls_amount#602,improvement_surcharge#603,total_amount#604,congestion_surcharge#605,airport_fee#606] parquet
         :                                                           :- Project [VendorID#647L, tpep_pickup_datetime#648, tpep_dropoff_datetime#649, passenger_count#650, trip_distance#651, RatecodeID#652, store_and_fwd_flag#653, PULocationID#654L, DOLocationID#655L, payment_type#656L, fare_amount#657, extra#658, mta_tax#659, tip_amount#660, tolls_amount#661, improvement_surcharge#662, total_amount#663, congestion_surcharge#664, cast(airport_fee#665 as double) AS airport_fee#1893]
         :                                                           :  +- Relation [VendorID#647L,tpep_pickup_datetime#648,tpep_dropoff_datetime#649,passenger_count#650,trip_distance#651,RatecodeID#652,store_and_fwd_flag#653,PULocationID#654L,DOLocationID#655L,payment_type#656L,fare_amount#657,extra#658,mta_tax#659,tip_amount#660,tolls_amount#661,improvement_surcharge#662,total_amount#663,congestion_surcharge#664,airport_fee#665] parquet
         :                                                           :- Project [VendorID#704L, tpep_pickup_datetime#705, tpep_dropoff_datetime#706, passenger_count#707, trip_distance#708, RatecodeID#709, store_and_fwd_flag#710, PULocationID#711L, DOLocationID#712L, payment_type#713L, fare_amount#714, extra#715, mta_tax#716, tip_amount#717, tolls_amount#718, improvement_surcharge#719, total_amount#720, cast(congestion_surcharge#721 as double) AS congestion_surcharge#742, cast(airport_fee#722 as double) AS airport_fee#1894]
         :                                                           :  +- Relation [VendorID#704L,tpep_pickup_datetime#705,tpep_dropoff_datetime#706,passenger_count#707,trip_distance#708,RatecodeID#709,store_and_fwd_flag#710,PULocationID#711L,DOLocationID#712L,payment_type#713L,fare_amount#714,extra#715,mta_tax#716,tip_amount#717,tolls_amount#718,improvement_surcharge#719,total_amount#720,congestion_surcharge#721,airport_fee#722] parquet
         :                                                           :- Project [VendorID#763L, tpep_pickup_datetime#764, tpep_dropoff_datetime#765, passenger_count#766, trip_distance#767, RatecodeID#768, store_and_fwd_flag#769, PULocationID#770L, DOLocationID#771L, payment_type#772L, fare_amount#773, extra#774, mta_tax#775, tip_amount#776, tolls_amount#777, improvement_surcharge#778, total_amount#779, congestion_surcharge#780, cast(airport_fee#781 as double) AS airport_fee#1895]
         :                                                           :  +- Relation [VendorID#763L,tpep_pickup_datetime#764,tpep_dropoff_datetime#765,passenger_count#766,trip_distance#767,RatecodeID#768,store_and_fwd_flag#769,PULocationID#770L,DOLocationID#771L,payment_type#772L,fare_amount#773,extra#774,mta_tax#775,tip_amount#776,tolls_amount#777,improvement_surcharge#778,total_amount#779,congestion_surcharge#780,airport_fee#781] parquet
         :                                                           :- Project [VendorID#820L, tpep_pickup_datetime#821, tpep_dropoff_datetime#822, passenger_count#823, trip_distance#824, RatecodeID#825, store_and_fwd_flag#826, PULocationID#827L, DOLocationID#828L, payment_type#829L, fare_amount#830, extra#831, mta_tax#832, tip_amount#833, tolls_amount#834, improvement_surcharge#835, total_amount#836, congestion_surcharge#837, cast(airport_fee#838 as double) AS airport_fee#1896]
         :                                                           :  +- Relation [VendorID#820L,tpep_pickup_datetime#821,tpep_dropoff_datetime#822,passenger_count#823,trip_distance#824,RatecodeID#825,store_and_fwd_flag#826,PULocationID#827L,DOLocationID#828L,payment_type#829L,fare_amount#830,extra#831,mta_tax#832,tip_amount#833,tolls_amount#834,improvement_surcharge#835,total_amount#836,congestion_surcharge#837,airport_fee#838] parquet
         :                                                           :- Project [VendorID#877L, tpep_pickup_datetime#878, tpep_dropoff_datetime#879, passenger_count#880, trip_distance#881, RatecodeID#882, store_and_fwd_flag#883, PULocationID#884L, DOLocationID#885L, payment_type#886L, fare_amount#887, extra#888, mta_tax#889, tip_amount#890, tolls_amount#891, improvement_surcharge#892, total_amount#893, congestion_surcharge#894, cast(airport_fee#895 as double) AS airport_fee#1897]
         :                                                           :  +- Relation [VendorID#877L,tpep_pickup_datetime#878,tpep_dropoff_datetime#879,passenger_count#880,trip_distance#881,RatecodeID#882,store_and_fwd_flag#883,PULocationID#884L,DOLocationID#885L,payment_type#886L,fare_amount#887,extra#888,mta_tax#889,tip_amount#890,tolls_amount#891,improvement_surcharge#892,total_amount#893,congestion_surcharge#894,airport_fee#895] parquet
         :                                                           :- Project [VendorID#934L, tpep_pickup_datetime#935, tpep_dropoff_datetime#936, passenger_count#937, trip_distance#938, RatecodeID#939, store_and_fwd_flag#940, PULocationID#941L, DOLocationID#942L, payment_type#943L, fare_amount#944, extra#945, mta_tax#946, tip_amount#947, tolls_amount#948, improvement_surcharge#949, total_amount#950, congestion_surcharge#951, cast(airport_fee#952 as double) AS airport_fee#1898]
         :                                                           :  +- Relation [VendorID#934L,tpep_pickup_datetime#935,tpep_dropoff_datetime#936,passenger_count#937,trip_distance#938,RatecodeID#939,store_and_fwd_flag#940,PULocationID#941L,DOLocationID#942L,payment_type#943L,fare_amount#944,extra#945,mta_tax#946,tip_amount#947,tolls_amount#948,improvement_surcharge#949,total_amount#950,congestion_surcharge#951,airport_fee#952] parquet
         :                                                           :- Project [VendorID#991L, tpep_pickup_datetime#992, tpep_dropoff_datetime#993, passenger_count#994, trip_distance#995, RatecodeID#996, store_and_fwd_flag#997, PULocationID#998L, DOLocationID#999L, payment_type#1000L, fare_amount#1001, extra#1002, mta_tax#1003, tip_amount#1004, tolls_amount#1005, improvement_surcharge#1006, total_amount#1007, congestion_surcharge#1008, cast(airport_fee#1009 as double) AS airport_fee#1899]
         :                                                           :  +- Relation [VendorID#991L,tpep_pickup_datetime#992,tpep_dropoff_datetime#993,passenger_count#994,trip_distance#995,RatecodeID#996,store_and_fwd_flag#997,PULocationID#998L,DOLocationID#999L,payment_type#1000L,fare_amount#1001,extra#1002,mta_tax#1003,tip_amount#1004,tolls_amount#1005,improvement_surcharge#1006,total_amount#1007,congestion_surcharge#1008,airport_fee#1009] parquet
         :                                                           :- Project [VendorID#1048L, tpep_pickup_datetime#1049, tpep_dropoff_datetime#1050, passenger_count#1051, trip_distance#1052, RatecodeID#1053, store_and_fwd_flag#1054, PULocationID#1055L, DOLocationID#1056L, payment_type#1057L, fare_amount#1058, extra#1059, mta_tax#1060, tip_amount#1061, tolls_amount#1062, improvement_surcharge#1063, total_amount#1064, congestion_surcharge#1065, cast(airport_fee#1066 as double) AS airport_fee#1900]
         :                                                           :  +- Relation [VendorID#1048L,tpep_pickup_datetime#1049,tpep_dropoff_datetime#1050,passenger_count#1051,trip_distance#1052,RatecodeID#1053,store_and_fwd_flag#1054,PULocationID#1055L,DOLocationID#1056L,payment_type#1057L,fare_amount#1058,extra#1059,mta_tax#1060,tip_amount#1061,tolls_amount#1062,improvement_surcharge#1063,total_amount#1064,congestion_surcharge#1065,airport_fee#1066] parquet
         :                                                           :- Project [VendorID#1105L, tpep_pickup_datetime#1106, tpep_dropoff_datetime#1107, passenger_count#1108, trip_distance#1109, RatecodeID#1110, store_and_fwd_flag#1111, PULocationID#1112L, DOLocationID#1113L, payment_type#1114L, fare_amount#1115, extra#1116, mta_tax#1117, tip_amount#1118, tolls_amount#1119, improvement_surcharge#1120, total_amount#1121, congestion_surcharge#1122, cast(airport_fee#1123 as double) AS airport_fee#1901]
         :                                                           :  +- Relation [VendorID#1105L,tpep_pickup_datetime#1106,tpep_dropoff_datetime#1107,passenger_count#1108,trip_distance#1109,RatecodeID#1110,store_and_fwd_flag#1111,PULocationID#1112L,DOLocationID#1113L,payment_type#1114L,fare_amount#1115,extra#1116,mta_tax#1117,tip_amount#1118,tolls_amount#1119,improvement_surcharge#1120,total_amount#1121,congestion_surcharge#1122,airport_fee#1123] parquet
         :                                                           :- Project [VendorID#1162L, tpep_pickup_datetime#1163, tpep_dropoff_datetime#1164, passenger_count#1165, trip_distance#1166, RatecodeID#1167, store_and_fwd_flag#1168, PULocationID#1169L, DOLocationID#1170L, payment_type#1171L, fare_amount#1172, extra#1173, mta_tax#1174, tip_amount#1175, tolls_amount#1176, improvement_surcharge#1177, total_amount#1178, congestion_surcharge#1179, cast(airport_fee#1180 as double) AS airport_fee#1902]
         :                                                           :  +- Relation [VendorID#1162L,tpep_pickup_datetime#1163,tpep_dropoff_datetime#1164,passenger_count#1165,trip_distance#1166,RatecodeID#1167,store_and_fwd_flag#1168,PULocationID#1169L,DOLocationID#1170L,payment_type#1171L,fare_amount#1172,extra#1173,mta_tax#1174,tip_amount#1175,tolls_amount#1176,improvement_surcharge#1177,total_amount#1178,congestion_surcharge#1179,airport_fee#1180] parquet
         :                                                           :- Project [VendorID#1219L, tpep_pickup_datetime#1220, tpep_dropoff_datetime#1221, passenger_count#1222, trip_distance#1223, RatecodeID#1224, store_and_fwd_flag#1225, PULocationID#1226L, DOLocationID#1227L, payment_type#1228L, fare_amount#1229, extra#1230, mta_tax#1231, tip_amount#1232, tolls_amount#1233, improvement_surcharge#1234, total_amount#1235, congestion_surcharge#1236, cast(airport_fee#1237 as double) AS airport_fee#1903]
         :                                                           :  +- Relation [VendorID#1219L,tpep_pickup_datetime#1220,tpep_dropoff_datetime#1221,passenger_count#1222,trip_distance#1223,RatecodeID#1224,store_and_fwd_flag#1225,PULocationID#1226L,DOLocationID#1227L,payment_type#1228L,fare_amount#1229,extra#1230,mta_tax#1231,tip_amount#1232,tolls_amount#1233,improvement_surcharge#1234,total_amount#1235,congestion_surcharge#1236,airport_fee#1237] parquet
         :                                                           :- Project [VendorID#1276L, tpep_pickup_datetime#1277, tpep_dropoff_datetime#1278, passenger_count#1279, trip_distance#1280, RatecodeID#1281, store_and_fwd_flag#1282, PULocationID#1283L, DOLocationID#1284L, payment_type#1285L, fare_amount#1286, extra#1287, mta_tax#1288, tip_amount#1289, tolls_amount#1290, improvement_surcharge#1291, total_amount#1292, congestion_surcharge#1293, cast(airport_fee#1294 as double) AS airport_fee#1904]
         :                                                           :  +- Relation [VendorID#1276L,tpep_pickup_datetime#1277,tpep_dropoff_datetime#1278,passenger_count#1279,trip_distance#1280,RatecodeID#1281,store_and_fwd_flag#1282,PULocationID#1283L,DOLocationID#1284L,payment_type#1285L,fare_amount#1286,extra#1287,mta_tax#1288,tip_amount#1289,tolls_amount#1290,improvement_surcharge#1291,total_amount#1292,congestion_surcharge#1293,airport_fee#1294] parquet
         :                                                           :- Project [VendorID#1333L, tpep_pickup_datetime#1334, tpep_dropoff_datetime#1335, passenger_count#1336, trip_distance#1337, RatecodeID#1338, store_and_fwd_flag#1339, PULocationID#1340L, DOLocationID#1341L, payment_type#1342L, fare_amount#1343, extra#1344, mta_tax#1345, tip_amount#1346, tolls_amount#1347, improvement_surcharge#1348, total_amount#1349, congestion_surcharge#1350, cast(airport_fee#1351 as double) AS airport_fee#1905]
         :                                                           :  +- Relation [VendorID#1333L,tpep_pickup_datetime#1334,tpep_dropoff_datetime#1335,passenger_count#1336,trip_distance#1337,RatecodeID#1338,store_and_fwd_flag#1339,PULocationID#1340L,DOLocationID#1341L,payment_type#1342L,fare_amount#1343,extra#1344,mta_tax#1345,tip_amount#1346,tolls_amount#1347,improvement_surcharge#1348,total_amount#1349,congestion_surcharge#1350,airport_fee#1351] parquet
         :                                                           :- Project [VendorID#1390L, tpep_pickup_datetime#1391, tpep_dropoff_datetime#1392, passenger_count#1393, trip_distance#1394, RatecodeID#1395, store_and_fwd_flag#1396, PULocationID#1397L, DOLocationID#1398L, payment_type#1399L, fare_amount#1400, extra#1401, mta_tax#1402, tip_amount#1403, tolls_amount#1404, improvement_surcharge#1405, total_amount#1406, congestion_surcharge#1407, cast(airport_fee#1408 as double) AS airport_fee#1906]
         :                                                           :  +- Relation [VendorID#1390L,tpep_pickup_datetime#1391,tpep_dropoff_datetime#1392,passenger_count#1393,trip_distance#1394,RatecodeID#1395,store_and_fwd_flag#1396,PULocationID#1397L,DOLocationID#1398L,payment_type#1399L,fare_amount#1400,extra#1401,mta_tax#1402,tip_amount#1403,tolls_amount#1404,improvement_surcharge#1405,total_amount#1406,congestion_surcharge#1407,airport_fee#1408] parquet
         :                                                           :- Project [VendorID#1447L, tpep_pickup_datetime#1448, tpep_dropoff_datetime#1449, passenger_count#1450, trip_distance#1451, RatecodeID#1452, store_and_fwd_flag#1453, PULocationID#1454L, DOLocationID#1455L, payment_type#1456L, fare_amount#1457, extra#1458, mta_tax#1459, tip_amount#1460, tolls_amount#1461, improvement_surcharge#1462, total_amount#1463, congestion_surcharge#1464, cast(airport_fee#1465 as double) AS airport_fee#1907]
         :                                                           :  +- Relation [VendorID#1447L,tpep_pickup_datetime#1448,tpep_dropoff_datetime#1449,passenger_count#1450,trip_distance#1451,RatecodeID#1452,store_and_fwd_flag#1453,PULocationID#1454L,DOLocationID#1455L,payment_type#1456L,fare_amount#1457,extra#1458,mta_tax#1459,tip_amount#1460,tolls_amount#1461,improvement_surcharge#1462,total_amount#1463,congestion_surcharge#1464,airport_fee#1465] parquet
         :                                                           :- Project [VendorID#1504L, tpep_pickup_datetime#1505, tpep_dropoff_datetime#1506, passenger_count#1507, trip_distance#1508, RatecodeID#1509, store_and_fwd_flag#1510, PULocationID#1511L, DOLocationID#1512L, payment_type#1513L, fare_amount#1514, extra#1515, mta_tax#1516, tip_amount#1517, tolls_amount#1518, improvement_surcharge#1519, total_amount#1520, congestion_surcharge#1521, cast(airport_fee#1522 as double) AS airport_fee#1908]
         :                                                           :  +- Relation [VendorID#1504L,tpep_pickup_datetime#1505,tpep_dropoff_datetime#1506,passenger_count#1507,trip_distance#1508,RatecodeID#1509,store_and_fwd_flag#1510,PULocationID#1511L,DOLocationID#1512L,payment_type#1513L,fare_amount#1514,extra#1515,mta_tax#1516,tip_amount#1517,tolls_amount#1518,improvement_surcharge#1519,total_amount#1520,congestion_surcharge#1521,airport_fee#1522] parquet
         :                                                           :- Project [VendorID#1561L, tpep_pickup_datetime#1562, tpep_dropoff_datetime#1563, passenger_count#1564, trip_distance#1565, RatecodeID#1566, store_and_fwd_flag#1567, PULocationID#1568L, DOLocationID#1569L, payment_type#1570L, fare_amount#1571, extra#1572, mta_tax#1573, tip_amount#1574, tolls_amount#1575, improvement_surcharge#1576, total_amount#1577, congestion_surcharge#1578, cast(airport_fee#1579 as double) AS airport_fee#1909]
         :                                                           :  +- Relation [VendorID#1561L,tpep_pickup_datetime#1562,tpep_dropoff_datetime#1563,passenger_count#1564,trip_distance#1565,RatecodeID#1566,store_and_fwd_flag#1567,PULocationID#1568L,DOLocationID#1569L,payment_type#1570L,fare_amount#1571,extra#1572,mta_tax#1573,tip_amount#1574,tolls_amount#1575,improvement_surcharge#1576,total_amount#1577,congestion_surcharge#1578,airport_fee#1579] parquet
         :                                                           :- Project [VendorID#1618L, tpep_pickup_datetime#1619, tpep_dropoff_datetime#1620, passenger_count#1621, trip_distance#1622, RatecodeID#1623, store_and_fwd_flag#1624, PULocationID#1625L, DOLocationID#1626L, payment_type#1627L, fare_amount#1628, extra#1629, mta_tax#1630, tip_amount#1631, tolls_amount#1632, improvement_surcharge#1633, total_amount#1634, congestion_surcharge#1635, cast(airport_fee#1636 as double) AS airport_fee#1910]
         :                                                           :  +- Relation [VendorID#1618L,tpep_pickup_datetime#1619,tpep_dropoff_datetime#1620,passenger_count#1621,trip_distance#1622,RatecodeID#1623,store_and_fwd_flag#1624,PULocationID#1625L,DOLocationID#1626L,payment_type#1627L,fare_amount#1628,extra#1629,mta_tax#1630,tip_amount#1631,tolls_amount#1632,improvement_surcharge#1633,total_amount#1634,congestion_surcharge#1635,airport_fee#1636] parquet
         :                                                           :- Project [VendorID#1675L, tpep_pickup_datetime#1676, tpep_dropoff_datetime#1677, passenger_count#1678, trip_distance#1679, RatecodeID#1680, store_and_fwd_flag#1681, PULocationID#1682L, DOLocationID#1683L, payment_type#1684L, fare_amount#1685, extra#1686, mta_tax#1687, tip_amount#1688, tolls_amount#1689, improvement_surcharge#1690, total_amount#1691, congestion_surcharge#1692, cast(airport_fee#1693 as double) AS airport_fee#1911]
         :                                                           :  +- Relation [VendorID#1675L,tpep_pickup_datetime#1676,tpep_dropoff_datetime#1677,passenger_count#1678,trip_distance#1679,RatecodeID#1680,store_and_fwd_flag#1681,PULocationID#1682L,DOLocationID#1683L,payment_type#1684L,fare_amount#1685,extra#1686,mta_tax#1687,tip_amount#1688,tolls_amount#1689,improvement_surcharge#1690,total_amount#1691,congestion_surcharge#1692,airport_fee#1693] parquet
         :                                                           :- Project [VendorID#1732L, tpep_pickup_datetime#1733, tpep_dropoff_datetime#1734, passenger_count#1735, trip_distance#1736, RatecodeID#1737, store_and_fwd_flag#1738, PULocationID#1739L, DOLocationID#1740L, payment_type#1741L, fare_amount#1742, extra#1743, mta_tax#1744, tip_amount#1745, tolls_amount#1746, improvement_surcharge#1747, total_amount#1748, congestion_surcharge#1749, cast(airport_fee#1750 as double) AS airport_fee#1912]
         :                                                           :  +- Relation [VendorID#1732L,tpep_pickup_datetime#1733,tpep_dropoff_datetime#1734,passenger_count#1735,trip_distance#1736,RatecodeID#1737,store_and_fwd_flag#1738,PULocationID#1739L,DOLocationID#1740L,payment_type#1741L,fare_amount#1742,extra#1743,mta_tax#1744,tip_amount#1745,tolls_amount#1746,improvement_surcharge#1747,total_amount#1748,congestion_surcharge#1749,airport_fee#1750] parquet
         :                                                           :- Project [VendorID#1789L, tpep_pickup_datetime#1790, tpep_dropoff_datetime#1791, passenger_count#1792, trip_distance#1793, RatecodeID#1794, store_and_fwd_flag#1795, PULocationID#1796L, DOLocationID#1797L, payment_type#1798L, fare_amount#1799, extra#1800, mta_tax#1801, tip_amount#1802, tolls_amount#1803, improvement_surcharge#1804, total_amount#1805, congestion_surcharge#1806, cast(airport_fee#1807 as double) AS airport_fee#1913]
         :                                                           :  +- Relation [VendorID#1789L,tpep_pickup_datetime#1790,tpep_dropoff_datetime#1791,passenger_count#1792,trip_distance#1793,RatecodeID#1794,store_and_fwd_flag#1795,PULocationID#1796L,DOLocationID#1797L,payment_type#1798L,fare_amount#1799,extra#1800,mta_tax#1801,tip_amount#1802,tolls_amount#1803,improvement_surcharge#1804,total_amount#1805,congestion_surcharge#1806,airport_fee#1807] parquet
         :                                                           :- Relation [VendorID#1846L,tpep_pickup_datetime#1847,tpep_dropoff_datetime#1848,passenger_count#1849,trip_distance#1850,RatecodeID#1851,store_and_fwd_flag#1852,PULocationID#1853L,DOLocationID#1854L,payment_type#1855L,fare_amount#1856,extra#1857,mta_tax#1858,tip_amount#1859,tolls_amount#1860,improvement_surcharge#1861,total_amount#1862,congestion_surcharge#1863,airport_fee#1864] parquet
         :                                                           :- Relation [VendorID#1954L,tpep_pickup_datetime#1955,tpep_dropoff_datetime#1956,passenger_count#1957,trip_distance#1958,RatecodeID#1959,store_and_fwd_flag#1960,PULocationID#1961L,DOLocationID#1962L,payment_type#1963L,fare_amount#1964,extra#1965,mta_tax#1966,tip_amount#1967,tolls_amount#1968,improvement_surcharge#1969,total_amount#1970,congestion_surcharge#1971,airport_fee#1972] parquet
         :                                                           :- Project [VendorID#2011L, tpep_pickup_datetime#2012, tpep_dropoff_datetime#2013, passenger_count#2014, trip_distance#2015, RatecodeID#2016, store_and_fwd_flag#2017, PULocationID#2018L, DOLocationID#2019L, payment_type#2020L, fare_amount#2021, extra#2022, mta_tax#2023, tip_amount#2024, tolls_amount#2025, improvement_surcharge#2026, total_amount#2027, congestion_surcharge#2028, cast(airport_fee#2029 as double) AS airport_fee#2049]
         :                                                           :  +- Relation [VendorID#2011L,tpep_pickup_datetime#2012,tpep_dropoff_datetime#2013,passenger_count#2014,trip_distance#2015,RatecodeID#2016,store_and_fwd_flag#2017,PULocationID#2018L,DOLocationID#2019L,payment_type#2020L,fare_amount#2021,extra#2022,mta_tax#2023,tip_amount#2024,tolls_amount#2025,improvement_surcharge#2026,total_amount#2027,congestion_surcharge#2028,airport_fee#2029] parquet
         :                                                           :- Relation [VendorID#2070L,tpep_pickup_datetime#2071,tpep_dropoff_datetime#2072,passenger_count#2073,trip_distance#2074,RatecodeID#2075,store_and_fwd_flag#2076,PULocationID#2077L,DOLocationID#2078L,payment_type#2079L,fare_amount#2080,extra#2081,mta_tax#2082,tip_amount#2083,tolls_amount#2084,improvement_surcharge#2085,total_amount#2086,congestion_surcharge#2087,airport_fee#2088] parquet
         :                                                           :- Relation [VendorID#2127L,tpep_pickup_datetime#2128,tpep_dropoff_datetime#2129,passenger_count#2130,trip_distance#2131,RatecodeID#2132,store_and_fwd_flag#2133,PULocationID#2134L,DOLocationID#2135L,payment_type#2136L,fare_amount#2137,extra#2138,mta_tax#2139,tip_amount#2140,tolls_amount#2141,improvement_surcharge#2142,total_amount#2143,congestion_surcharge#2144,airport_fee#2145] parquet
         :                                                           :- Relation [VendorID#2184L,tpep_pickup_datetime#2185,tpep_dropoff_datetime#2186,passenger_count#2187,trip_distance#2188,RatecodeID#2189,store_and_fwd_flag#2190,PULocationID#2191L,DOLocationID#2192L,payment_type#2193L,fare_amount#2194,extra#2195,mta_tax#2196,tip_amount#2197,tolls_amount#2198,improvement_surcharge#2199,total_amount#2200,congestion_surcharge#2201,airport_fee#2202] parquet
         :                                                           :- Relation [VendorID#2241L,tpep_pickup_datetime#2242,tpep_dropoff_datetime#2243,passenger_count#2244,trip_distance#2245,RatecodeID#2246,store_and_fwd_flag#2247,PULocationID#2248L,DOLocationID#2249L,payment_type#2250L,fare_amount#2251,extra#2252,mta_tax#2253,tip_amount#2254,tolls_amount#2255,improvement_surcharge#2256,total_amount#2257,congestion_surcharge#2258,airport_fee#2259] parquet
         :                                                           +- Relation [VendorID#2298L,tpep_pickup_datetime#2299,tpep_dropoff_datetime#2300,passenger_count#2301,trip_distance#2302,RatecodeID#2303,store_and_fwd_flag#2304,PULocationID#2305L,DOLocationID#2306L,payment_type#2307L,fare_amount#2308,extra#2309,mta_tax#2310,tip_amount#2311,tolls_amount#2312,improvement_surcharge#2313,total_amount#2314,congestion_surcharge#2315,airport_fee#2316] parquet
         +- Relation [year#0L,month#1L,day#2L,cdc_week#3L,week_index#4L,us_format#5,week_ending#6,week_month#7L,week_year#8L,timeline#9] parquet


In [None]:
# next, filter out trips which do not start and/or end within the 5 boroughs
# NOTE(review): the filtering itself is presumably done inside
# `ch.extract_borough_name` (once for pick-up 'pu', once for drop-off 'do')
# -- confirm against the helper.
tlc_df = ch.extract_borough_name(tlc_df, zones_df, 'pu')
tlc_df = ch.extract_borough_name(tlc_df, zones_df, 'do')

if INTERMEDIATE_OUTPUTS:
    # Jupyter only auto-renders a cell's final *top-level* expression; inside
    # an `if` block the frame must be displayed explicitly or nothing is shown.
    display(tlc_df.sort('trip_distance', ascending=False).limit(5))

In [None]:
# sanity check: number of TLC rows at this stage of the cleaning
if INTERMEDIATE_OUTPUTS:
    # the count is computed inside an `if` block, so it has to be displayed
    # explicitly (the notebook would otherwise silently discard the value)
    display(tlc_df.count())

In [None]:
# sanity check: the 10 rows with the highest week_index
if INTERMEDIATE_OUTPUTS:
    # explicit display: expressions nested in an `if` are not auto-rendered
    display(tlc_df.sort('week_index', ascending=False).limit(10))

In [None]:
# sanity check: the 5 longest trips by trip_distance
if INTERMEDIATE_OUTPUTS:
    # explicit display: expressions nested in an `if` are not auto-rendered
    display(tlc_df.sort('trip_distance', ascending=False).limit(5))

In [None]:
# drop every row whose 'timeline' label is 'neither', i.e. anything outside
# of the time bounded by the timelines
# (the transitionary time (2019) is kept for time-series analysis)
inside_timelines = F.col('timeline') != 'neither'
tlc_df = tlc_df.filter(inside_timelines)

In [None]:
# save the stacked df by month (this will take a while)
# rows are ordered by the partition keys first, then written as parquet with
# one directory per (week_year, week_month), replacing any previous output
tlc_df = tlc_df.orderBy('week_year', 'week_month')
(
    tlc_df.write
    .partitionBy('week_year', 'week_month')
    .mode('overwrite')
    .parquet('../data/curated/tlc/cleaned/yellow')
)

### 2. Cleaning the COVID dataset

In [None]:
# read in the covid dataset (raw daily case counts); the first row of the
# csv is used as the column header
covid_df = (
    spark.read
    .option('header', True)
    .csv('../data/raw/virals/covid/cases-by-day.csv')
)
# preview the first few rows
covid_df.limit(5)

In [None]:
# sum the 'INCOMPLETE' column (ensure no incomplete values)
if INTERMEDIATE_OUTPUTS:
    # Python's builtin `sum` does not aggregate the rows of a Spark DataFrame,
    # so the total is computed with a Spark aggregation instead. The column is
    # cast to int explicitly before summing.
    # NOTE(review): assumes 'INCOMPLETE' holds numeric flags -- confirm
    # against the raw csv.
    display(
        covid_df.agg(
            F.sum(F.col('INCOMPLETE').cast('int')).alias('incomplete_total')
        )
    )

In [None]:
# Configuration for cleaning the covid dataset.

# raw column name -> cleaned column name; starts with the date column and is
# extended below with one entry per (borough, count type) pair
COVID_KEEP_COLUMNS = {'date_of_interest': 'date'}

# every column looked up here falls back to the `ch.non_negative` check
# NOTE(review): FLU_CLEAN_COLUMNS maps each column to a *list* of checks,
# while this default is a bare callable -- confirm which shape
# `ch.perform_cleaning` expects.
COVID_CLEAN_COLUMNS = defaultdict(lambda: ch.non_negative)

# raw borough column prefix -> borough name
COVID_BOROUGHS = {
    'BX_': 'Bronx',
    'BK_': 'Brooklyn',
    'MN_': 'Manhattan',
    'QN_': 'Queens',
    'SI_': 'Staten Island',
}

# raw count column suffix -> cleaned count name
COVID_COUNTS = {
    'CASE_COUNT': 'cases',
    'DEATH_COUNT': 'deaths',
    'HOSPITALIZED_COUNT': 'hospitalised',
}

# keep every borough/count combination, renamed by concatenating the borough
# name and the count name (e.g. 'BX_CASE_COUNT' -> 'Bronxcases')
COVID_KEEP_COLUMNS.update({
    f'{raw_prefix}{raw_suffix}': f'{borough}{count}'
    for (raw_prefix, borough), (raw_suffix, count)
    in product(COVID_BOROUGHS.items(), COVID_COUNTS.items())
})

In [None]:
# run the shared cleaning helper on the covid data, passing the MMWR weeks
# frame together with the keep/rename mapping and the per-column checks
covid_df = ch.perform_cleaning(
    covid_df,
    mmwr_weeks_df,
    COVID_KEEP_COLUMNS,
    COVID_CLEAN_COLUMNS,
)

In [None]:
# The data here is very wide, I'd rather just have a 'borough' column
# for homogeneity of all the data.

# date-related columns repeated for every borough
COVID_DATE_COLUMNS = [
    F.col('date'),
    F.col('week_ending'),
    F.col('week_year'),
    F.col('week_month'),
    F.col('week_index'),
    F.col('timeline'),
]

# one frame per borough: the date columns, that borough's count columns
# renamed to the bare count names, and a literal 'borough' column
borough_frames = [
    covid_df.select(
        COVID_DATE_COLUMNS
        + [
            F.col(f'{borough}{count}').alias(count)
            for count in COVID_COUNTS.values()
        ]
    ).withColumn('borough', F.lit(borough))
    for borough in COVID_BOROUGHS.values()
]

# stack the per-borough frames; `union` matches columns by position, which is
# safe here because every frame selects its columns in the same order
temp_df = borough_frames[0]
for borough_frame in borough_frames[1:]:
    temp_df = temp_df.union(borough_frame)

covid_df = temp_df

In [None]:
# sanity check: first rows of the covid data ordered by week_index and date
if INTERMEDIATE_OUTPUTS:
    # explicit display: expressions nested in an `if` are not auto-rendered
    display(covid_df.sort('week_index', 'date').limit(5))

In [None]:
# save the cleaned covid data as parquet, replacing any previous output
(
    covid_df.write
    .mode('overwrite')
    .parquet('../data/curated/virals/covid/cleaned/cases-by-day')
)

### 3. Cleaning the flu dataset

In [None]:
# read in the flu dataset (raw weekly case counts); the first row of the
# csv is used as the column header
flu_df = (
    spark.read
    .option('header', True)
    .csv('../data/raw/virals/flu/cases-by-week.csv')
)
# preview the first few rows
flu_df.limit(5)

In [None]:
# Configuration for cleaning the flu dataset.

# raw column name -> cleaned column name
FLU_KEEP_COLUMNS = {
    'Week Ending Date': 'date',
    'Region': 'region',
    'County': 'borough',
    'Disease': 'disease',
    'Count': 'cases',
}

# cleaned column name -> list of checks passed to `ch.perform_cleaning`
# (an empty list means no check for that column)
FLU_CLEAN_COLUMNS = {
    'date': [],
    # condition on the region being 'NYC'; the lambda's argument is ignored
    'region': [lambda _: F.col('region') == 'NYC'],
    'borough': [],
    'disease': [],
    'cases': [ch.non_negative],
}

In [None]:
# run the shared cleaning helper on the flu data, passing the MMWR weeks
# frame together with the keep/rename mapping and the per-column checks
flu_df: DataFrame = ch.perform_cleaning(
    flu_df,
    mmwr_weeks_df,
    FLU_KEEP_COLUMNS,
    FLU_CLEAN_COLUMNS,
)

In [None]:
if INTERMEDIATE_OUTPUTS:
    # get the list of distinct counties (column now called 'borough');
    # displayed explicitly because expressions nested in an `if` are not
    # auto-rendered by the notebook
    display(flu_df.select('borough').distinct().limit(5))

In [None]:
# county name (upper case, as keyed here) -> proper borough name
# from: https://portal.311.nyc.gov/article/?kanumber=KA-02877
# also from map dict
FLU_COUNTY_TO_BOROUGH = {
    'BRONX': 'Bronx',
    'KINGS': 'Brooklyn',         # Kings county is Brooklyn
    'NEW YORK': 'Manhattan',     # New York county is Manhattan
    'QUEENS': 'Queens',
    'RICHMOND': 'Staten Island', # Richmond county is Staten Island
}

In [None]:
# apply the county -> borough mapping to the 'borough' column of the flu df
flu_df = ch.replace_column_using_dict(flu_df, 'borough', FLU_COUNTY_TO_BOROUGH)

# also remove the 'region' column (not needed anymore); `drop` keeps every
# other column in its existing order
flu_df = flu_df.drop('region')

In [None]:
# sanity check: preview the cleaned flu data
if INTERMEDIATE_OUTPUTS:
    # explicit display: expressions nested in an `if` are not auto-rendered
    display(flu_df.limit(5))

In [None]:
# save the cleaned flu data as parquet, replacing any previous output
(
    flu_df.write
    .mode('overwrite')
    .parquet('../data/curated/virals/flu/cleaned/cases-by-week')
)