### MAST30034: Applied Data Science Project 1
---
# Preprocessing Part 1: Cleaning The Data
#### Xavier Travers (1178369)

Cleaning the datasets of null, inconsistent, or unnecessary values.
This is performed on the TLC data and COVID data.

In [1]:
# imports used throughout this notebook
from collections import defaultdict
from itertools import product
import os
import sys
from pyspark.sql import DataFrame, Column
from pyspark.sql.types import IntegerType
from pyspark.sql import functions as F

# add homemade helpers
sys.path.insert(1, '../scripts')
import helpers.cleaning_helpers as ch

# for printouts
DEBUGGING = False

In [2]:
from pyspark.sql import SparkSession

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName('MAST30034 XT Project 1')
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config('spark.sql.repl.eagerEval.enabled', True) 
    .config('spark.sql.parquet.cacheMetadata', 'true')
    .getOrCreate()
)

22/08/09 00:43:55 WARN Utils: Your hostname, Polaris resolves to a loopback address: 127.0.1.1; using 172.20.95.79 instead (on interface eth0)
22/08/09 00:43:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/09 00:43:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# import the cdc week file to convert all dates to cdc weeks now
mmwr_weeks_df = spark.read.parquet('../data/raw/virals/mmwr_weeks.parquet')
mmwr_weeks_df.limit(5)

year,month,day,cdc_week,week_index,us_format
2017,12,31,1,1,12/31/2017
2018,1,1,1,1,01/01/2018
2018,1,2,1,1,01/02/2018
2018,1,3,1,1,01/03/2018
2018,1,4,1,1,01/04/2018


### 1. Cleaning the TLC dataset(s)

In [4]:
example_df = spark.read.parquet('../data/raw/tlc/yellow/2019-07.parquet/')
example_df.limit(5)
# TODO: commenting

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
2,2019-07-01 00:51:04,2019-07-01 00:51:33,1.0,0.0,1.0,N,193,193,1,2.5,0.5,0.5,1.14,0.0,0.3,4.94,0.0,
2,2019-07-01 00:46:04,2019-07-01 01:05:46,1.0,4.16,1.0,N,234,25,2,16.5,0.5,0.5,0.0,0.0,0.3,20.3,2.5,
1,2019-07-01 00:25:09,2019-07-01 01:00:56,1.0,18.8,2.0,N,132,42,1,52.0,0.0,0.5,11.75,6.12,0.3,70.67,0.0,
2,2019-07-01 00:33:32,2019-07-01 01:15:27,1.0,18.46,2.0,N,132,142,1,52.0,0.0,0.5,11.06,0.0,0.3,66.36,2.5,
1,2019-07-01 00:00:55,2019-07-01 00:13:05,0.0,1.7,1.0,N,107,114,1,9.5,3.0,0.5,2.0,0.0,0.3,15.3,2.5,


In [5]:
# names of the tlc datasets to clean 
# (I was originally planning on working on fhvhv and green as well)
TLC_NAMES = ['yellow']

# dictionary to rename all the columns I want to keep
TLC_KEEP_COLUMNS = {
    'tpep_pickup_datetime': 'date',
    'passenger_count': 'passengers',
    'trip_distance': 'trip_distance',
    'PULocationID': 'pu_location_id',
    'DOLocationID': 'do_location_id',
    # below only apply to fhvhv
    # 'hvfhs_license_num': 'fhvhv_license',
    # 'pickup_datetime': 'date',
    # 'trip_miles': 'trip_distance',
    # 'shared_request_flag': 'shared'
}

# create a dictionary of the columns to keep and the required filters
TLC_CLEAN_COLUMNS = {
    'pu_location_id': [ch.non_null], 
    'do_location_id': [ch.non_null], 
    'passengers': [], 
    'trip_distance': [ch.non_null, ch.strictly_positive], 
    # 'fhvhv_license': [ch.non_null], 
}

In [6]:
# iterate through the TLC names/types (~5-10 mins)
# TODO: commenting
stacked_tlc_df = None
for name in TLC_NAMES:
    # iterate through the downloaded files per taxi type
    for filename in os.listdir(f'../data/raw/tlc/{name}'):

        # read the parquet in
        tlc_df = spark.read.parquet(f'../data/raw/tlc/{name}/{filename}')

        # debug info
        print(f'=== CLEANING "{name}/{filename}"')
    
        if DEBUGGING:
            print(f'STARTING WITH {tlc_df.count()} ROWS')

        tlc_df = ch.perform_cleaning(tlc_df, mmwr_weeks_df, TLC_KEEP_COLUMNS, 
            TLC_CLEAN_COLUMNS)

        if stacked_tlc_df == None:
            stacked_tlc_df = tlc_df
        else:
            stacked_tlc_df = stacked_tlc_df.union(tlc_df)

        if DEBUGGING:
            print(f'REDUCED TO {tlc_df.count()} ROWS')
        
        # write to file system
        tlc_df.write.mode('overwrite')\
            .parquet(f'../data/curated/tlc/cleaned/{name}/{filename}')

# stacked_tlc_df.write.mode('overwrite')\
#     .parquet(f'../data/curated/tlc/cleaned/{name}.parquet')

=== CLEANING "yellow/2018-01.parquet"
STARTING WITH 8760687 ROWS


                                                                                

REDUCED TO 8705271 ROWS


                                                                                

=== CLEANING "yellow/2018-02.parquet"
STARTING WITH 8492819 ROWS


                                                                                

REDUCED TO 8440477 ROWS


                                                                                

=== CLEANING "yellow/2018-03.parquet"
STARTING WITH 9431289 ROWS


                                                                                

REDUCED TO 9370177 ROWS


                                                                                

=== CLEANING "yellow/2018-04.parquet"
STARTING WITH 9306216 ROWS


                                                                                

REDUCED TO 9248518 ROWS


                                                                                

=== CLEANING "yellow/2018-05.parquet"
STARTING WITH 9224788 ROWS


                                                                                

REDUCED TO 9165539 ROWS


                                                                                

=== CLEANING "yellow/2018-06.parquet"
STARTING WITH 8714667 ROWS


                                                                                

REDUCED TO 8654613 ROWS


                                                                                

=== CLEANING "yellow/2018-07.parquet"
STARTING WITH 7851143 ROWS


                                                                                

REDUCED TO 7795245 ROWS


                                                                                

=== CLEANING "yellow/2018-08.parquet"
STARTING WITH 7855040 ROWS


                                                                                

REDUCED TO 7799781 ROWS


                                                                                

=== CLEANING "yellow/2018-09.parquet"
STARTING WITH 8049094 ROWS


                                                                                

REDUCED TO 7991308 ROWS


                                                                                

=== CLEANING "yellow/2018-10.parquet"
STARTING WITH 8834520 ROWS


                                                                                

REDUCED TO 8765266 ROWS


                                                                                

=== CLEANING "yellow/2018-11.parquet"
STARTING WITH 8155449 ROWS


                                                                                

REDUCED TO 8096223 ROWS


                                                                                

=== CLEANING "yellow/2018-12.parquet"
STARTING WITH 8195675 ROWS


                                                                                

REDUCED TO 8133847 ROWS


                                                                                

=== CLEANING "yellow/2019-01.parquet"
STARTING WITH 7696617 ROWS


                                                                                

REDUCED TO 7641460 ROWS


                                                                                

=== CLEANING "yellow/2019-02.parquet"
STARTING WITH 7049370 ROWS


                                                                                

REDUCED TO 6999501 ROWS


                                                                                

=== CLEANING "yellow/2019-03.parquet"
STARTING WITH 7866620 ROWS


                                                                                

REDUCED TO 7812753 ROWS


                                                                                

=== CLEANING "yellow/2019-04.parquet"
STARTING WITH 7475949 ROWS


                                                                                

REDUCED TO 7424917 ROWS


                                                                                

=== CLEANING "yellow/2019-05.parquet"
STARTING WITH 7598445 ROWS


                                                                                

REDUCED TO 7542053 ROWS


                                                                                

=== CLEANING "yellow/2019-06.parquet"
STARTING WITH 6971560 ROWS


                                                                                

REDUCED TO 6908437 ROWS


                                                                                

=== CLEANING "yellow/2019-07.parquet"
STARTING WITH 6310419 ROWS


                                                                                

REDUCED TO 6242700 ROWS


                                                                                

=== CLEANING "yellow/2019-08.parquet"
STARTING WITH 6073357 ROWS


                                                                                

REDUCED TO 6004160 ROWS


                                                                                

=== CLEANING "yellow/2019-09.parquet"
STARTING WITH 6567788 ROWS


                                                                                

REDUCED TO 6496575 ROWS


                                                                                

=== CLEANING "yellow/2019-10.parquet"
STARTING WITH 7213891 ROWS


                                                                                

REDUCED TO 7143933 ROWS


                                                                                

=== CLEANING "yellow/2019-11.parquet"
STARTING WITH 6878111 ROWS


                                                                                

REDUCED TO 6805094 ROWS


                                                                                

=== CLEANING "yellow/2019-12.parquet"
STARTING WITH 6896317 ROWS


                                                                                

REDUCED TO 6823493 ROWS


                                                                                

=== CLEANING "yellow/2020-01.parquet"
STARTING WITH 6405008 ROWS


                                                                                

REDUCED TO 6334779 ROWS


                                                                                

=== CLEANING "yellow/2020-02.parquet"
STARTING WITH 6299367 ROWS


                                                                                

REDUCED TO 6238857 ROWS


                                                                                

=== CLEANING "yellow/2020-03.parquet"
STARTING WITH 3007687 ROWS


                                                                                

REDUCED TO 2976275 ROWS


                                                                                

=== CLEANING "yellow/2020-04.parquet"
STARTING WITH 238073 ROWS
REDUCED TO 231848 ROWS
=== CLEANING "yellow/2020-05.parquet"
STARTING WITH 348415 ROWS
REDUCED TO 338185 ROWS


                                                                                

=== CLEANING "yellow/2020-06.parquet"
STARTING WITH 549797 ROWS


                                                                                

REDUCED TO 532356 ROWS


                                                                                

=== CLEANING "yellow/2020-07.parquet"
STARTING WITH 800412 ROWS


                                                                                

REDUCED TO 775434 ROWS


                                                                                

=== CLEANING "yellow/2020-08.parquet"
STARTING WITH 1007286 ROWS


                                                                                

REDUCED TO 979388 ROWS


                                                                                

=== CLEANING "yellow/2020-09.parquet"
STARTING WITH 1341017 ROWS


                                                                                

REDUCED TO 1318248 ROWS


                                                                                

=== CLEANING "yellow/2020-10.parquet"
STARTING WITH 1681132 ROWS


                                                                                

REDUCED TO 1656734 ROWS


                                                                                

=== CLEANING "yellow/2020-11.parquet"
STARTING WITH 1509000 ROWS


                                                                                

REDUCED TO 1489897 ROWS


                                                                                

=== CLEANING "yellow/2020-12.parquet"
STARTING WITH 1461898 ROWS


                                                                                

REDUCED TO 1444543 ROWS


                                                                                

=== CLEANING "yellow/2021-01.parquet"
STARTING WITH 1369769 ROWS


                                                                                

REDUCED TO 1349814 ROWS


                                                                                

=== CLEANING "yellow/2021-02.parquet"
STARTING WITH 1371709 ROWS


                                                                                

REDUCED TO 1352825 ROWS


                                                                                

=== CLEANING "yellow/2021-03.parquet"
STARTING WITH 1925152 ROWS


                                                                                

REDUCED TO 1900716 ROWS


                                                                                

=== CLEANING "yellow/2021-04.parquet"
STARTING WITH 2171187 ROWS


                                                                                

REDUCED TO 2139723 ROWS


                                                                                

=== CLEANING "yellow/2021-05.parquet"
STARTING WITH 2507109 ROWS


                                                                                

REDUCED TO 2474228 ROWS


                                                                                

=== CLEANING "yellow/2021-06.parquet"
STARTING WITH 2834264 ROWS


                                                                                

REDUCED TO 2800084 ROWS


                                                                                

=== CLEANING "yellow/2021-07.parquet"
STARTING WITH 2821746 ROWS


                                                                                

REDUCED TO 2784377 ROWS


                                                                                

=== CLEANING "yellow/2021-08.parquet"
STARTING WITH 2788757 ROWS


                                                                                

REDUCED TO 2751198 ROWS


                                                                                

=== CLEANING "yellow/2021-09.parquet"
STARTING WITH 2963793 ROWS


                                                                                

REDUCED TO 2907626 ROWS


                                                                                

=== CLEANING "yellow/2021-10.parquet"
STARTING WITH 3463504 ROWS


                                                                                

REDUCED TO 3420698 ROWS


                                                                                

=== CLEANING "yellow/2021-11.parquet"
STARTING WITH 3472949 ROWS


                                                                                

REDUCED TO 3437593 ROWS


                                                                                

=== CLEANING "yellow/2021-12.parquet"
STARTING WITH 3214369 ROWS


                                                                                

REDUCED TO 3177337 ROWS


                                                                                

### 2. Cleaning the COVID dataset

In [7]:
# read in the covid dataset
covid_df = spark.read.csv('../data/raw/virals/covid/cases-by-day.csv',
    header = True)
covid_df.limit(5)
# TODO: commenting

22/08/09 00:56:46 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


date_of_interest,CASE_COUNT,PROBABLE_CASE_COUNT,HOSPITALIZED_COUNT,DEATH_COUNT,PROBABLE_DEATH_COUNT,CASE_COUNT_7DAY_AVG,ALL_CASE_COUNT_7DAY_AVG,HOSP_COUNT_7DAY_AVG,DEATH_COUNT_7DAY_AVG,ALL_DEATH_COUNT_7DAY_AVG,BX_CASE_COUNT,BX_PROBABLE_CASE_COUNT,BX_HOSPITALIZED_COUNT,BX_DEATH_COUNT,BX_PROBABLE_DEATH_COUNT,BX_CASE_COUNT_7DAY_AVG,BX_PROBABLE_CASE_COUNT_7DAY_AVG,BX_ALL_CASE_COUNT_7DAY_AVG,BX_HOSPITALIZED_COUNT_7DAY_AVG,BX_DEATH_COUNT_7DAY_AVG,BX_ALL_DEATH_COUNT_7DAY_AVG,BK_CASE_COUNT,BK_PROBABLE_CASE_COUNT,BK_HOSPITALIZED_COUNT,BK_DEATH_COUNT,BK_PROBABLE_DEATH_COUNT,BK_CASE_COUNT_7DAY_AVG,BK_PROBABLE_CASE_COUNT_7DAY_AVG,BK_ALL_CASE_COUNT_7DAY_AVG,BK_HOSPITALIZED_COUNT_7DAY_AVG,BK_DEATH_COUNT_7DAY_AVG,BK_ALL_DEATH_COUNT_7DAY_AVG,MN_CASE_COUNT,MN_PROBABLE_CASE_COUNT,MN_HOSPITALIZED_COUNT,MN_DEATH_COUNT,MN_PROBABLE_DEATH_COUNT,MN_CASE_COUNT_7DAY_AVG,MN_PROBABLE_CASE_COUNT_7DAY_AVG,MN_ALL_CASE_COUNT_7DAY_AVG,MN_HOSPITALIZED_COUNT_7DAY_AVG,MN_DEATH_COUNT_7DAY_AVG,MN_ALL_DEATH_COUNT_7DAY_AVG,QN_CASE_COUNT,QN_PROBABLE_CASE_COUNT,QN_HOSPITALIZED_COUNT,QN_DEATH_COUNT,QN_PROBABLE_DEATH_COUNT,QN_CASE_COUNT_7DAY_AVG,QN_PROBABLE_CASE_COUNT_7DAY_AVG,QN_ALL_CASE_COUNT_7DAY_AVG,QN_HOSPITALIZED_COUNT_7DAY_AVG,QN_DEATH_COUNT_7DAY_AVG,QN_ALL_DEATH_COUNT_7DAY_AVG,SI_CASE_COUNT,SI_PROBABLE_CASE_COUNT,SI_HOSPITALIZED_COUNT,SI_DEATH_COUNT,SI_PROBABLE_DEATH_COUNT,SI_CASE_COUNT_7DAY_AVG,SI_PROBABLE_CASE_COUNT_7DAY_AVG,SI_ALL_CASE_COUNT_7DAY_AVG,SI_HOSPITALIZED_COUNT_7DAY_AVG,SI_DEATH_COUNT_7DAY_AVG,SI_ALL_DEATH_COUNT_7DAY_AVG,INCOMPLETE
02/29/2020,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
03/01/2020,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
03/02/2020,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
03/03/2020,1,0,7,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
03/04/2020,5,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
03/05/2020,3,0,14,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,3,0,3,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
03/06/2020,8,0,8,0,0,3,3,5,0,0,2,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,1,2,0,0,3,0,3,0,0,1,0,1,1,0,0,1,0,2,0,0,1,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
03/07/2020,7,0,8,0,0,3,3,6,0,0,0,0,1,0,0,0,0,0,1,0,0,2,0,3,0,0,1,0,1,2,0,0,1,0,0,0,0,1,0,1,1,0,0,3,0,4,0,0,1,0,1,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0
03/08/2020,21,0,18,0,0,6,6,8,0,0,3,0,5,0,0,1,0,1,1,0,0,5,0,8,0,0,2,0,2,3,0,0,6,0,1,0,0,2,0,2,2,0,0,6,0,4,0,0,2,0,2,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0
03/09/2020,57,0,37,0,0,15,15,13,0,0,4,0,7,0,0,1,0,1,2,0,0,16,0,11,0,0,4,0,4,4,0,0,24,0,10,0,0,5,0,5,3,0,0,10,0,8,0,0,3,0,3,3,0,0,3,0,2,0,0,1,0,1,1,0,0,0


In [8]:
# sum the number of incomplete datasets (ensure no incomplete values)
sum(covid_df.select('INCOMPLETE'))
# TODO: commenting

Column<'(INCOMPLETE + 0)'>

In [9]:
# TODO: commenting on covid cleaning
COVID_KEEP_COLUMNS = {
    'date_of_interest':'date'
}

COVID_CLEAN_COLUMNS = defaultdict(lambda: ch.non_negative)

COVID_BOROUGHS = {
    '': 'Overall',
    'BX_':'Bronx',
    'BK_':'Brooklyn',
    'MN_':'Manhattan',
    'QN_':'Queens',
    'SI_':'Staten Island',
}

COVID_COUNTS = {
    'CASE_COUNT': 'cases', 
    'DEATH_COUNT': 'deaths', 
    'HOSPITALIZED_COUNT': 'hospitalised'
}
# TODO: commenting
for prefix, new_prefix in COVID_BOROUGHS.items():
    for suffix, new_suffix in COVID_COUNTS.items():
        COVID_KEEP_COLUMNS[f'{prefix}{suffix}'] = f'{new_prefix}{new_suffix}'

In [10]:
covid_df = ch.perform_cleaning(covid_df, mmwr_weeks_df, COVID_KEEP_COLUMNS, 
    COVID_CLEAN_COLUMNS)
# TODO: commenting

In [11]:
from itertools import product

temp_df = None
# TODO: commenting
COVID_DATE_COLUMNS = [
    F.col('date'), F.col('year'), F.col('cdc_week'), F.col('week_index'),
]

# The data here is very wide, I'd rather just have a 'borough' column
# for homogeneity of all the data
for prefix in COVID_BOROUGHS.values():
    borough_columns = []
    for suffix in COVID_COUNTS.values():
        borough_columns.append(F.col(f'{prefix}{suffix}').alias(suffix))

    if temp_df == None:
        temp_df = covid_df.select(COVID_DATE_COLUMNS + borough_columns)\
            .withColumn('borough', F.lit(prefix))
    else:
        temp_df = temp_df\
            .union(
                covid_df.select(COVID_DATE_COLUMNS + borough_columns)\
                    .withColumn('borough', F.lit(prefix))
            )
    
covid_df = temp_df

In [12]:
covid_df.sort('week_index', 'date').limit(5)
# TODO: commenting

date,year,cdc_week,week_index,cases,deaths,hospitalised,borough
02/29/2020,2020,9,113,0,0,0,Bronx
02/29/2020,2020,9,113,0,0,1,Brooklyn
02/29/2020,2020,9,113,1,0,1,Overall
02/29/2020,2020,9,113,1,0,0,Manhattan
02/29/2020,2020,9,113,0,0,0,Queens
02/29/2020,2020,9,113,0,0,0,Staten Island
03/01/2020,2020,10,114,0,0,1,Overall
03/01/2020,2020,10,114,0,0,0,Queens
03/01/2020,2020,10,114,0,0,0,Manhattan
03/01/2020,2020,10,114,0,0,0,Brooklyn


In [13]:
# save the cleaned covid data
# TODO: commenting
covid_df.write.mode('overwrite').parquet('../data/curated/virals/covid/cases-by-day')

### 3. Cleaning the flu dataset

In [14]:
# read in the flu dataset
# TODO: commenting
flu_df = spark.read.csv('../data/raw/virals/flu/cases-by-week.csv',
    header=True)
flu_df.limit(5)

Season,Region,County,CDC Week,Week Ending Date,Disease,Count,County Centroid,FIPS
2012-2013,NYC,RICHMOND,10,03/09/2013,INFLUENZA_A,0,"(40.5795, -74.1502)",36085
2011-2012,CAPITAL DISTRICT,ALBANY,10,03/10/2012,INFLUENZA_UNSPECI...,0,"(42.5882713, -73....",36001
2009-2010,CAPITAL DISTRICT,SCHENECTADY,41,10/17/2009,INFLUENZA_UNSPECI...,0,"(42.8175421, -74....",36093
2010-2011,WESTERN,CHAUTAUQUA,19,05/14/2011,INFLUENZA_B,0,"(42.3042159, -79....",36013
2013-2014,METRO,DUTCHESS,44,11/02/2013,INFLUENZA_A,0,"(41.7550085, -73....",36027
2009-2010,CAPITAL DISTRICT,SCHENECTADY,10,03/13/2010,INFLUENZA_UNSPECI...,0,"(42.8175421, -74....",36093
2011-2012,CENTRAL,CORTLAND,20,05/19/2012,INFLUENZA_B,0,"(42.5938237, -76....",36023
2012-2013,CAPITAL DISTRICT,RENSSELAER,43,10/27/2012,INFLUENZA_A,0,"(42.7104206, -73....",36083
2012-2013,METRO,DUTCHESS,3,01/19/2013,INFLUENZA_A,62,"(41.7550085, -73....",36027
2011-2012,CENTRAL,CAYUGA,46,11/19/2011,INFLUENZA_A,0,"(43.0085456, -76....",36011


In [15]:
FLU_KEEP_COLUMNS = {
    'Week Ending Date': 'date',
    'Region': 'region',
    'County': 'borough',
    'Disease': 'disease',
    'Count': 'cases',
}
# TODO: commenting
FLU_CLEAN_COLUMNS = {
    'date': [],
    'region': [lambda _: F.col('region') == 'NYC'],
    'borough': [],
    'disease': [],
    'cases': [ch.non_negative]
}

In [16]:
# TODO: commenting
flu_df:DataFrame = ch.perform_cleaning(flu_df, mmwr_weeks_df, FLU_KEEP_COLUMNS, 
    FLU_CLEAN_COLUMNS)

In [17]:
# get the list of distinct counties (column now called 'borough')
flu_df.select('borough').distinct().limit(5)

borough
KINGS
QUEENS
BRONX
RICHMOND
NEW YORK


In [18]:
# map the boroughs to their proper names
# from: https://portal.311.nyc.gov/article/?kanumber=KA-02877
# also from map dict
FLU_COUNTY_TO_BOROUGH = {
    'BRONX': 'Bronx',
    'KINGS': 'Brooklyn',
    'NEW YORK': 'Manhattan',
    'QUEENS': 'Queens',
    'RICHMOND': 'Staten Island'
}

In [19]:
# apply the mapping to the flu df
flu_df = ch.replace_column_using_dict(flu_df, 'borough', FLU_COUNTY_TO_BOROUGH)

# also remove the regions column (not needed anymore)
columns_without_regions = flu_df.columns[:]
columns_without_regions.remove('region')
flu_df = flu_df.select(columns_without_regions)

In [20]:
flu_df.limit(5)

year,month,day,week_index,cdc_week,date,borough,disease,cases
2018,4,7,14,14,04/07/2018,Staten Island,INFLUENZA_B,34
2018,4,21,16,16,04/21/2018,Brooklyn,INFLUENZA_UNSPECI...,0
2018,4,14,15,15,04/14/2018,Bronx,INFLUENZA_A,31
2018,5,12,19,19,05/12/2018,Queens,INFLUENZA_B,24
2018,1,6,1,1,01/06/2018,Manhattan,INFLUENZA_A,117
2018,2,24,8,8,02/24/2018,Queens,INFLUENZA_A,762
2018,5,12,19,19,05/12/2018,Bronx,INFLUENZA_UNSPECI...,0
2018,4,28,17,17,04/28/2018,Brooklyn,INFLUENZA_A,19
2018,3,10,10,10,03/10/2018,Manhattan,INFLUENZA_UNSPECI...,3
2018,2,17,7,7,02/17/2018,Staten Island,INFLUENZA_UNSPECI...,1


In [21]:
# save the cleaned flu data
flu_df.write.mode('overwrite').parquet('../data/curated/virals/flu/cases-by-week')