### MAST30034: Applied Data Science Project 1
---
# Preprocessing Part 1: Cleaning The Data
#### Xavier Travers (1178369)

Cleaning the datasets of null, inconsistent, or unnecessary values.
This is performed on the TLC data and COVID data.

In [1]:
# imports used throughout this notebook
from collections import defaultdict
from itertools import product
import os
import sys
from pyspark.sql import DataFrame, Column
from pyspark.sql.types import IntegerType
from pyspark.sql import functions as F
import geopandas

# add homemade helpers
sys.path.insert(1, '../scripts')
import helpers.cleaning_helpers as ch
import helpers.join_helpers as jh

# for printouts
DEBUGGING = False

In [2]:
from pyspark.sql import SparkSession

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName('MAST30034 XT Project 1')
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config('spark.sql.repl.eagerEval.enabled', True) 
    .config('spark.sql.parquet.cacheMetadata', 'true')
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

22/08/12 10:27:06 WARN Utils: Your hostname, Ganymede resolves to a loopback address: 127.0.1.1; using 172.29.200.237 instead (on interface eth0)
22/08/12 10:27:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/12 10:27:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# import the cdc week file to convert all dates to cdc weeks now
mmwr_weeks_df = spark.read.parquet('../data/raw/virals/mmwr_weeks.parquet')
mmwr_weeks_df.limit(5)

                                                                                

year,month,day,cdc_week,week_index,us_format
2018,12,30,1,1,12/30/2018
2018,12,31,1,1,12/31/2018
2019,1,1,1,1,01/01/2019
2019,1,2,1,1,01/02/2019
2019,1,3,1,1,01/03/2019


In [4]:
# import the zones dataset
zones_df = spark.read.csv('../data/raw/tlc_zones/zones.csv',
    header = True)

### 1. Cleaning the TLC dataset(s)

In [5]:
example_df = spark.read.parquet('../data/raw/tlc/yellow/2019-07.parquet/')
example_df.limit(5)
# TODO: commenting

                                                                                

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
2,2019-07-01 00:51:04,2019-07-01 00:51:33,1.0,0.0,1.0,N,193,193,1,2.5,0.5,0.5,1.14,0.0,0.3,4.94,0.0,
2,2019-07-01 00:46:04,2019-07-01 01:05:46,1.0,4.16,1.0,N,234,25,2,16.5,0.5,0.5,0.0,0.0,0.3,20.3,2.5,
1,2019-07-01 00:25:09,2019-07-01 01:00:56,1.0,18.8,2.0,N,132,42,1,52.0,0.0,0.5,11.75,6.12,0.3,70.67,0.0,
2,2019-07-01 00:33:32,2019-07-01 01:15:27,1.0,18.46,2.0,N,132,142,1,52.0,0.0,0.5,11.06,0.0,0.3,66.36,2.5,
1,2019-07-01 00:00:55,2019-07-01 00:13:05,0.0,1.7,1.0,N,107,114,1,9.5,3.0,0.5,2.0,0.0,0.3,15.3,2.5,


In [6]:
example_df.sort('trip_distance', ascending = False).limit(5)

                                                                                

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
2,2019-07-29 09:46:42,2019-07-29 15:12:31,1.0,311.56,4.0,N,68,265,2,1574.0,0.0,0.5,0.0,10.5,0.3,1587.8,2.5,
1,2019-07-17 13:42:23,2019-07-17 14:15:25,1.0,307.5,1.0,N,161,138,1,28.5,2.5,0.5,5.0,0.0,0.3,36.8,2.5,
2,2019-07-03 16:13:11,2019-07-03 20:09:21,2.0,180.09,5.0,N,93,265,1,400.0,0.0,0.0,0.0,57.12,0.3,457.42,0.0,
2,2019-07-19 07:01:46,2019-07-19 10:50:56,2.0,169.47,4.0,N,43,265,2,794.5,0.0,0.5,0.0,12.5,0.3,807.8,0.0,
2,2019-07-13 05:40:49,2019-07-13 08:32:15,4.0,168.44,4.0,N,132,265,2,796.5,0.5,0.5,0.0,0.0,0.3,797.8,0.0,


In [7]:
# names of the tlc datasets to clean 
# (I was originally planning on working on fhvhv and green as well)
TLC_NAMES = ['yellow']

# dictionary to rename all the columns I want to keep
TLC_KEEP_COLUMNS = {
    'tpep_pickup_datetime': 'date',
    'passenger_count': 'passengers',
    'trip_distance': 'trip_distance',
    'PULocationID': 'pu_location_id',
    'DOLocationID': 'do_location_id',
    'hours_elapsed': 'hours_elapsed'
    # #  below only apply to fhvhv
    # 'hvfhs_license_num': 'fhvhv_license',
    # 'pickup_datetime': 'date',
    # 'trip_miles': 'trip_distance',
    # 'shared_request_flag': 'shared'
}

# create a dictionary of the columns to keep and the required filters
TLC_CLEAN_COLUMNS = {
    'pu_location_id': [ch.non_null], 
    'do_location_id': [ch.non_null], 
    'passengers': [ch.non_null], 
    'trip_distance': [ch.non_null, ch.strictly_positive], 
    # 'fhvhv_license': [ch.non_null], 
}

In [8]:
# iterate through the TLC names/types (~5-10 mins)
# TODO: commenting
# stacked_tlc_df = None
# for name in TLC_NAMES:
#     # iterate through the downloaded files per taxi type
#     for filename in os.listdir(f'../data/raw/tlc/{name}'):

#         # read the parquet in
#         tlc_df = spark.read.parquet(f'../data/raw/tlc/{name}/{filename}')

#         # debug info
#         print(f'=== CLEANING "{name}/{filename}"')
    
#         if DEBUGGING:
#             print(f'STARTING WITH {tlc_df.count()} ROWS')

#         tlc_df = ch.perform_cleaning(tlc_df, mmwr_weeks_df, TLC_KEEP_COLUMNS, 
#             TLC_CLEAN_COLUMNS)

#         if stacked_tlc_df == None:
#             stacked_tlc_df = tlc_df
#         else:
#             stacked_tlc_df = stacked_tlc_df.union(tlc_df)

#         if DEBUGGING:
#             print(f'REDUCED TO {tlc_df.count()} ROWS')
        
        # # write to file system
        # tlc_df.write.mode('overwrite')\
        #     .parquet(f'../data/curated/tlc/cleaned/{name}/{filename}')

# stacked_tlc_rows = stacked_tlc_df.collect()
# stacked_tlc_df = spark.createDataFrame(stacked_tlc_rows)

# get the count of the elements
# count_rows = stacked_tlc_df.count()
# print(count_rows)

# # remove the top and bottom 5% of values by trip distance (removes outliers)
# stacked_tlc_df:DataFrame = stacked_tlc_df.sort('trip_distance')
# stacked_tlc_df = stacked_tlc_df.limit(int(count_rows * 0.95))
# stacked_tlc_df = stacked_tlc_df.sort('trip_distance', ascending = False)
# stacked_tlc_df = stacked_tlc_df.limit(int(count_rows * 0.95))

# # print(stacked_tlc_df.count())
# stacked_tlc_df = stacked_tlc_df.sort('year', 'month')
# stacked_tlc_df.write\
#     .partitionBy('year', 'month')\
#     .mode('overwrite')\
#     .parquet(f'../data/curated/tlc/cleaned/{name}.parquet')

In [9]:
# read in the tlc data
tlc_df = jh.read_stacked_tlc_df(spark)

In [10]:
# sanity check
tlc_df.limit(5)

                                                                                

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
1,2019-01-01 00:46:40,2019-01-01 00:53:20,1.0,1.5,1.0,N,151,239,1,7.0,0.5,0.5,1.65,0.0,0.3,9.95,,
1,2019-01-01 00:59:47,2019-01-01 01:18:59,1.0,2.6,1.0,N,239,246,1,14.0,0.5,0.5,1.0,0.0,0.3,16.3,,
2,2018-12-21 13:48:30,2018-12-21 13:52:40,3.0,0.0,1.0,N,236,236,1,4.5,0.5,0.5,0.0,0.0,0.3,5.8,,
2,2018-11-28 15:52:25,2018-11-28 15:55:45,5.0,0.0,1.0,N,193,193,2,3.5,0.5,0.5,0.0,0.0,0.3,7.55,,
2,2018-11-28 15:56:57,2018-11-28 15:58:33,5.0,0.0,2.0,N,193,193,2,52.0,0.0,0.5,0.0,0.0,0.3,55.55,,


In [11]:
# derive extra values which are used to filter out valid trips
SECONDS_TO_HOURS = 1 / (60*60)
tlc_df = tlc_df\
    .withColumn('hours_elapsed', 
        (
            (F.col("tpep_dropoff_datetime").cast("long")
            - F.col('tpep_pickup_datetime').cast("long")) 
            * SECONDS_TO_HOURS
        )
    )\
    .withColumn('mph', (F.col('trip_distance') / F.col('elapsed_time')))
tlc_df.limit(5) 

                                                                                

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,elapsed_time,mph
1,2019-01-01 00:46:40,2019-01-01 00:53:20,1.0,1.5,1.0,N,151,239,1,7.0,0.5,0.5,1.65,0.0,0.3,9.95,,,0.1111111111111111,13.5
1,2019-01-01 00:59:47,2019-01-01 01:18:59,1.0,2.6,1.0,N,239,246,1,14.0,0.5,0.5,1.0,0.0,0.3,16.3,,,0.32,8.125
2,2018-12-21 13:48:30,2018-12-21 13:52:40,3.0,0.0,1.0,N,236,236,1,4.5,0.5,0.5,0.0,0.0,0.3,5.8,,,0.0694444444444444,0.0
2,2018-11-28 15:52:25,2018-11-28 15:55:45,5.0,0.0,1.0,N,193,193,2,3.5,0.5,0.5,0.0,0.0,0.3,7.55,,,0.0555555555555555,0.0
2,2018-11-28 15:56:57,2018-11-28 15:58:33,5.0,0.0,2.0,N,193,193,2,52.0,0.0,0.5,0.0,0.0,0.3,55.55,,,0.0266666666666666,0.0


In [12]:
# https://ypdcrime.com/vt/article30.php?zoom_highlight=fifty+five+miles+per+hour#t1180-a.
# As per: https://www.dot.ny.gov/divisions/operating/oom/transportation-systems/repository/TSMI-17-05.pdf
# the NYS maximum speed limit is 65 mph. filter out trips faster than legal.
tlc_df = tlc_df.where(
    (F.col('mph').isNotNull()) &
    (F.col('mph') <= 65)
)

In [13]:
tlc_df.limit(5) 

                                                                                

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,elapsed_time,mph
1,2019-01-01 00:46:40,2019-01-01 00:53:20,1.0,1.5,1.0,N,151,239,1,7.0,0.5,0.5,1.65,0.0,0.3,9.95,,,0.1111111111111111,13.5
1,2019-01-01 00:59:47,2019-01-01 01:18:59,1.0,2.6,1.0,N,239,246,1,14.0,0.5,0.5,1.0,0.0,0.3,16.3,,,0.32,8.125
2,2018-12-21 13:48:30,2018-12-21 13:52:40,3.0,0.0,1.0,N,236,236,1,4.5,0.5,0.5,0.0,0.0,0.3,5.8,,,0.0694444444444444,0.0
2,2018-11-28 15:52:25,2018-11-28 15:55:45,5.0,0.0,1.0,N,193,193,2,3.5,0.5,0.5,0.0,0.0,0.3,7.55,,,0.0555555555555555,0.0
2,2018-11-28 15:56:57,2018-11-28 15:58:33,5.0,0.0,2.0,N,193,193,2,52.0,0.0,0.5,0.0,0.0,0.3,55.55,,,0.0266666666666666,0.0


In [14]:
tlc_df.sort('trip_distance', ascending = False).limit(5)

                                                                                

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,elapsed_time,mph
1,2020-06-02 06:36:15,2020-06-02 14:33:50,1.0,441.6,5.0,N,68,265,2,300.0,0.0,0.0,0.0,0.0,0.3,300.3,0.0,,7.959722222222222,55.479322980282674
1,2021-01-20 11:22:05,2021-01-20 19:47:56,1.0,427.7,1.0,Y,4,265,1,1128.5,2.5,0.5,1140.44,20.16,0.3,2292.4,2.5,,8.430833333333334,50.73045369180586
1,2020-07-30 15:10:02,2020-07-30 22:04:34,1.0,414.4,5.0,N,138,265,1,400.0,0.0,0.0,87.2,35.74,0.3,523.24,0.0,,6.908888888888889,59.98070119009328
2,2020-12-06 07:33:27,2020-12-06 14:01:29,1.0,407.78,5.0,N,264,265,1,200.0,0.0,0.0,1.0,11.75,0.3,215.55,2.5,,6.467222222222222,63.05334593248002
2,2019-12-25 20:44:07,2019-12-26 03:10:41,2.0,363.13,5.0,N,132,265,2,400.0,0.0,0.5,0.0,46.34,0.3,447.14,0.0,,6.442777777777778,56.362335086660345


In [15]:
tlc_df = ch.perform_cleaning(tlc_df, mmwr_weeks_df, TLC_KEEP_COLUMNS, 
    TLC_CLEAN_COLUMNS)

In [16]:
# next, filter out trips which do not start and/or end within the 5 boroughs 
# the inner join should filter it all out (except for airports)
tlc_df = ch.extract_borough_name(tlc_df, zones_df,  'pu')
tlc_df = ch.extract_borough_name(tlc_df, zones_df,  'do')
tlc_df.sort('trip_distance', ascending=False).limit(5)

                                                                                

year,month,day,week_index,cdc_week,date,passengers,trip_distance,pu_location_id,do_location_id,pu_borough,do_borough
2020,9,9,89,37,09/09/2020,1.0,358.33,186,69,Manhattan,Bronx
2021,1,12,107,2,01/12/2021,0.0,326.1,234,39,Manhattan,Brooklyn
2020,5,16,72,20,05/16/2020,2.0,305.1,4,79,Manhattan,Manhattan
2020,11,13,98,46,11/13/2020,1.0,277.8,79,4,Manhattan,Manhattan
2021,1,6,106,1,01/06/2021,1.0,271.4,229,39,Manhattan,Brooklyn


In [17]:
# get the IQR of the trip distances
q1, q3 = tlc_df.approxQuantile('trip_distance', [0.25, 0.75], 0.0001)
print(f'Q1: {q1}, Q3: {q3}')
iqr = q3 - q1
IQR_MULTIPLIER = 1.5



Q1: 1.0, Q3: 3.0


                                                                                

In [18]:
q1, q2 = tlc_df\
    .where(F.col('pu_borough') == 'Staten Island')\
    .approxQuantile('trip_distance', [0.25, 0.75], 10 ** -5)
print(q1, q2)



7.07 32.53


                                                                                

In [19]:
q025, q975 = tlc_df.approxQuantile('trip_distance', [0.01, 0.99999], 10 ** -5)
print(q025, q975)

                                                                                

0.3 358.33


In [20]:
# # remove the values that aren't within Q1 - 1.5 IQR and Q3 + 1.5 IQR
# stacked_tlc_df = stacked_tlc_df.where(F.col('trip_distance').between(
#     q1 - IQR_MULTIPLIER * iqr, q3 + IQR_MULTIPLIER * iqr
# ))

In [21]:
tlc_df.count()

                                                                                

95613333

In [22]:
tlc_df.sort('trip_distance', ascending=False).limit(5)

                                                                                

year,month,day,week_index,cdc_week,date,passengers,trip_distance,pu_location_id,do_location_id,pu_borough,do_borough
2020,9,9,89,37,09/09/2020,1.0,358.33,186,69,Manhattan,Bronx
2021,1,12,107,2,01/12/2021,0.0,326.1,234,39,Manhattan,Brooklyn
2020,5,16,72,20,05/16/2020,2.0,305.1,4,79,Manhattan,Manhattan
2020,11,13,98,46,11/13/2020,1.0,277.8,79,4,Manhattan,Manhattan
2021,1,6,106,1,01/06/2021,1.0,271.4,229,39,Manhattan,Brooklyn


In [23]:
# save the stacked df by month
tlc_df = tlc_df.sort('year', 'month')
tlc_df.write\
    .partitionBy('year', 'month')\
    .mode('overwrite')\
    .parquet(f'../data/curated/tlc/cleaned/yellow')

                                                                                

### 2. Cleaning the COVID dataset

In [24]:
# read in the covid dataset
covid_df = spark.read.csv('../data/raw/virals/covid/cases-by-day.csv',
    header = True)
covid_df.limit(5)
# TODO: commenting

22/08/12 10:54:16 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


date_of_interest,CASE_COUNT,PROBABLE_CASE_COUNT,HOSPITALIZED_COUNT,DEATH_COUNT,PROBABLE_DEATH_COUNT,CASE_COUNT_7DAY_AVG,ALL_CASE_COUNT_7DAY_AVG,HOSP_COUNT_7DAY_AVG,DEATH_COUNT_7DAY_AVG,ALL_DEATH_COUNT_7DAY_AVG,BX_CASE_COUNT,BX_PROBABLE_CASE_COUNT,BX_HOSPITALIZED_COUNT,BX_DEATH_COUNT,BX_PROBABLE_DEATH_COUNT,BX_CASE_COUNT_7DAY_AVG,BX_PROBABLE_CASE_COUNT_7DAY_AVG,BX_ALL_CASE_COUNT_7DAY_AVG,BX_HOSPITALIZED_COUNT_7DAY_AVG,BX_DEATH_COUNT_7DAY_AVG,BX_ALL_DEATH_COUNT_7DAY_AVG,BK_CASE_COUNT,BK_PROBABLE_CASE_COUNT,BK_HOSPITALIZED_COUNT,BK_DEATH_COUNT,BK_PROBABLE_DEATH_COUNT,BK_CASE_COUNT_7DAY_AVG,BK_PROBABLE_CASE_COUNT_7DAY_AVG,BK_ALL_CASE_COUNT_7DAY_AVG,BK_HOSPITALIZED_COUNT_7DAY_AVG,BK_DEATH_COUNT_7DAY_AVG,BK_ALL_DEATH_COUNT_7DAY_AVG,MN_CASE_COUNT,MN_PROBABLE_CASE_COUNT,MN_HOSPITALIZED_COUNT,MN_DEATH_COUNT,MN_PROBABLE_DEATH_COUNT,MN_CASE_COUNT_7DAY_AVG,MN_PROBABLE_CASE_COUNT_7DAY_AVG,MN_ALL_CASE_COUNT_7DAY_AVG,MN_HOSPITALIZED_COUNT_7DAY_AVG,MN_DEATH_COUNT_7DAY_AVG,MN_ALL_DEATH_COUNT_7DAY_AVG,QN_CASE_COUNT,QN_PROBABLE_CASE_COUNT,QN_HOSPITALIZED_COUNT,QN_DEATH_COUNT,QN_PROBABLE_DEATH_COUNT,QN_CASE_COUNT_7DAY_AVG,QN_PROBABLE_CASE_COUNT_7DAY_AVG,QN_ALL_CASE_COUNT_7DAY_AVG,QN_HOSPITALIZED_COUNT_7DAY_AVG,QN_DEATH_COUNT_7DAY_AVG,QN_ALL_DEATH_COUNT_7DAY_AVG,SI_CASE_COUNT,SI_PROBABLE_CASE_COUNT,SI_HOSPITALIZED_COUNT,SI_DEATH_COUNT,SI_PROBABLE_DEATH_COUNT,SI_CASE_COUNT_7DAY_AVG,SI_PROBABLE_CASE_COUNT_7DAY_AVG,SI_ALL_CASE_COUNT_7DAY_AVG,SI_HOSPITALIZED_COUNT_7DAY_AVG,SI_DEATH_COUNT_7DAY_AVG,SI_ALL_DEATH_COUNT_7DAY_AVG,INCOMPLETE
02/29/2020,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
03/01/2020,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
03/02/2020,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
03/03/2020,1,0,7,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
03/04/2020,5,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [25]:
# sum the number of incomplete datasets (ensure no incomplete values)
sum(covid_df.select('INCOMPLETE'))
# TODO: commenting

Column<'(INCOMPLETE + 0)'>

In [26]:
# TODO: commenting on covid cleaning
COVID_KEEP_COLUMNS = {
    'date_of_interest':'date'
}

COVID_CLEAN_COLUMNS = defaultdict(lambda: ch.non_negative)

COVID_BOROUGHS = {
    '': 'Overall',
    'BX_':'Bronx',
    'BK_':'Brooklyn',
    'MN_':'Manhattan',
    'QN_':'Queens',
    'SI_':'Staten Island',
}

COVID_COUNTS = {
    'CASE_COUNT': 'cases', 
    'DEATH_COUNT': 'deaths', 
    'HOSPITALIZED_COUNT': 'hospitalised'
}
# TODO: commenting
for prefix, new_prefix in COVID_BOROUGHS.items():
    for suffix, new_suffix in COVID_COUNTS.items():
        COVID_KEEP_COLUMNS[f'{prefix}{suffix}'] = f'{new_prefix}{new_suffix}'

In [27]:
covid_df = ch.perform_cleaning(covid_df, mmwr_weeks_df, COVID_KEEP_COLUMNS, 
    COVID_CLEAN_COLUMNS)
# TODO: commenting

In [28]:
from itertools import product

temp_df = None
# TODO: commenting
COVID_DATE_COLUMNS = [
    F.col('date'), F.col('year'), F.col('cdc_week'), F.col('week_index'),
]

# The data here is very wide, I'd rather just have a 'borough' column
# for homogeneity of all the data
for prefix in COVID_BOROUGHS.values():
    borough_columns = []
    for suffix in COVID_COUNTS.values():
        borough_columns.append(F.col(f'{prefix}{suffix}').alias(suffix))

    if temp_df == None:
        temp_df = covid_df.select(COVID_DATE_COLUMNS + borough_columns)\
            .withColumn('borough', F.lit(prefix))
    else:
        temp_df = temp_df\
            .union(
                covid_df.select(COVID_DATE_COLUMNS + borough_columns)\
                    .withColumn('borough', F.lit(prefix))
            )
    
covid_df = temp_df

In [29]:
covid_df.sort('week_index', 'date').limit(5)
# TODO: commenting

date,year,cdc_week,week_index,cases,deaths,hospitalised,borough
02/29/2020,2020,9,61,0,0,0,Bronx
02/29/2020,2020,9,61,0,0,1,Brooklyn
02/29/2020,2020,9,61,1,0,1,Overall
02/29/2020,2020,9,61,1,0,0,Manhattan
02/29/2020,2020,9,61,0,0,0,Queens


In [30]:
# save the cleaned covid data
# TODO: commenting
covid_df.write.mode('overwrite').parquet('../data/curated/virals/covid/cases-by-day')

### 3. Cleaning the flu dataset

In [31]:
# read in the flu dataset
# TODO: commenting
flu_df = spark.read.csv('../data/raw/virals/flu/cases-by-week.csv',
    header=True)
flu_df.limit(5)

Season,Region,County,CDC Week,Week Ending Date,Disease,Count,County Centroid,FIPS
2012-2013,NYC,RICHMOND,10,03/09/2013,INFLUENZA_A,0,"(40.5795, -74.1502)",36085
2011-2012,CAPITAL DISTRICT,ALBANY,10,03/10/2012,INFLUENZA_UNSPECI...,0,"(42.5882713, -73....",36001
2009-2010,CAPITAL DISTRICT,SCHENECTADY,41,10/17/2009,INFLUENZA_UNSPECI...,0,"(42.8175421, -74....",36093
2010-2011,WESTERN,CHAUTAUQUA,19,05/14/2011,INFLUENZA_B,0,"(42.3042159, -79....",36013
2013-2014,METRO,DUTCHESS,44,11/02/2013,INFLUENZA_A,0,"(41.7550085, -73....",36027


In [32]:
FLU_KEEP_COLUMNS = {
    'Week Ending Date': 'date',
    'Region': 'region',
    'County': 'borough',
    'Disease': 'disease',
    'Count': 'cases',
}
# TODO: commenting
FLU_CLEAN_COLUMNS = {
    'date': [],
    'region': [lambda _: F.col('region') == 'NYC'],
    'borough': [],
    'disease': [],
    'cases': [ch.non_negative]
}

In [33]:
# TODO: commenting
flu_df:DataFrame = ch.perform_cleaning(flu_df, mmwr_weeks_df, FLU_KEEP_COLUMNS, 
    FLU_CLEAN_COLUMNS)

In [34]:
# get the list of distinct counties (column now called 'borough')
flu_df.select('borough').distinct().limit(5)

borough
BRONX
RICHMOND
NEW YORK
KINGS
QUEENS


In [35]:
# map the boroughs to their proper names
# from: https://portal.311.nyc.gov/article/?kanumber=KA-02877
# also from map dict
FLU_COUNTY_TO_BOROUGH = {
    'BRONX': 'Bronx',
    'KINGS': 'Brooklyn',
    'NEW YORK': 'Manhattan',
    'QUEENS': 'Queens',
    'RICHMOND': 'Staten Island'
}

In [36]:
# apply the mapping to the flu df
flu_df = ch.replace_column_using_dict(flu_df, 'borough', FLU_COUNTY_TO_BOROUGH)

# also remove the regions column (not needed anymore)
columns_without_regions = flu_df.columns[:]
columns_without_regions.remove('region')
flu_df = flu_df.select(columns_without_regions)

In [37]:
flu_df.limit(5)

year,month,day,week_index,cdc_week,date,borough,disease,cases
2019,1,12,2,2,01/12/2019,Staten Island,INFLUENZA_UNSPECI...,0
2019,10,19,42,42,10/19/2019,Manhattan,INFLUENZA_B,0
2019,2,9,6,6,02/09/2019,Bronx,INFLUENZA_B,11
2019,2,16,7,7,02/16/2019,Staten Island,INFLUENZA_UNSPECI...,0
2021,12,25,156,51,12/25/2021,Queens,INFLUENZA_A,582


In [38]:
# save the cleaned flu data
flu_df.write.mode('overwrite').parquet('../data/curated/virals/flu/cases-by-week')