### MAST30034: Applied Data Science Project 1
---
# Preprocessing Part 1: Cleaning The Data
#### Xavier Travers (1178369)

Cleaning the datasets of null, inconsistent, or unnecessary values.
This is performed on the TLC, COVID, and flu data.

In [1]:
# imports used throughout this notebook
from collections import defaultdict
from itertools import product
import os
import sys
from pyspark.sql import DataFrame, Column
from pyspark.sql.types import IntegerType
from pyspark.sql import functions as F

# add homemade helpers
sys.path.insert(1, '../scripts')
import helpers.cleaning_helpers as ch
import helpers.join_helpers as jh

# Toggle for the sanity-check printouts scattered through this notebook.
# Leave as False to skip them and save time: every cell that only previews
# rows or counts is guarded by `if INTERMEDIATE_OUTPUTS:`.
INTERMEDIATE_OUTPUTS = False

In [2]:
from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) the Spark session that runs every job in
# this notebook. Timestamps are handled in UTC, eager evaluation makes a
# DataFrame render as a table when it is the last expression of a cell, and
# the executor/driver memory sizes are pinned explicitly.
spark = SparkSession.builder\
    .appName('MAST30034 XT Project 1')\
    .config('spark.sql.session.timeZone', 'Etc/UTC')\
    .config('spark.sql.repl.eagerEval.enabled', True)\
    .config('spark.sql.parquet.cacheMetadata', 'true')\
    .config('spark.executor.memory', '2g')\
    .config('spark.driver.memory', '4g')\
    .getOrCreate()

22/08/14 02:23:07 WARN Utils: Your hostname, Polaris resolves to a loopback address: 127.0.1.1; using 172.26.235.73 instead (on interface eth0)
22/08/14 02:23:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/14 02:23:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/08/14 02:23:09 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/08/14 02:23:09 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/08/14 02:23:09 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
22/08/14 02:23:09 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.


In [3]:
# Load the CDC (MMWR) week lookup table, used to convert all dates to CDC weeks.
# As the preview below shows, each row is one calendar date (year, month, day)
# together with its cdc_week, week_index, week_ending, week_month, week_year
# and a `timeline` label. It is handed to ch.perform_cleaning for every dataset.
mmwr_weeks_df = spark.read.parquet('../data/raw/virals/mmwr_weeks.parquet')
mmwr_weeks_df.limit(5)

year,month,day,cdc_week,week_index,us_format,week_ending,week_month,week_year,timeline
2017,12,31,1,1,12/31/2017,2018-01-06,1,2018,neither
2018,1,1,1,1,01/01/2018,2018-01-06,1,2018,neither
2018,1,2,1,1,01/02/2018,2018-01-06,1,2018,neither
2018,1,3,1,1,01/03/2018,2018-01-06,1,2018,neither
2018,1,4,1,1,01/04/2018,2018-01-06,1,2018,neither


In [4]:
# Load the TLC taxi-zone table (the CSV's first row supplies the column names).
# It is passed to ch.extract_borough_name below to attach boroughs to trips.
zones_df = spark.read.csv('../data/raw/tlc_zones/zones.csv',
    header = True)

### 1. Cleaning the TLC dataset(s)

In [5]:
# Optional peek at one raw month of yellow-taxi data to see the raw columns.
if INTERMEDIATE_OUTPUTS:
    example_df = spark.read.parquet('../data/raw/tlc/yellow/2019-07.parquet/')
    # Jupyter only auto-displays the last *top-level* expression of a cell;
    # an expression nested inside `if` is silently discarded, so print it.
    example_df.limit(5).show()

In [6]:
# Optional check: the five longest trips in the example month (spots absurd
# distances in the raw data).
if INTERMEDIATE_OUTPUTS:
    # explicit .show(): a bare expression inside `if` is never displayed
    example_df.sort('trip_distance', ascending = False).limit(5).show()

In [7]:
# names of the tlc datasets to clean 
# (I was originally planning on working on fhvhv and green as well)
# NOTE(review): TLC_NAMES is not referenced by any other cell visible in this
# notebook -- confirm whether the helpers rely on it before removing.
TLC_NAMES = ['yellow']

# Maps each raw TLC column to keep -> the name it gets after cleaning.
# 'hours_elapsed' is not a raw column: it is derived in a later cell, before
# this mapping is handed to ch.perform_cleaning.
TLC_KEEP_COLUMNS = {
    'tpep_pickup_datetime': 'date',
    'passenger_count': 'passengers',
    'trip_distance': 'trip_distance',
    'PULocationID': 'pu_location_id',
    'DOLocationID': 'do_location_id',
    'hours_elapsed': 'hours_elapsed'
    # #  below only apply to fhvhv
    # 'hvfhs_license_num': 'fhvhv_license',
    # 'pickup_datetime': 'date',
    # 'trip_miles': 'trip_distance',
    # 'shared_request_flag': 'shared'
}

# Maps a (renamed) column -> the list of filter builders from cleaning_helpers
# that its values must satisfy. Columns absent from this dict ('date',
# 'hours_elapsed') get no filter from here.
TLC_CLEAN_COLUMNS = {
    'pu_location_id': [ch.non_null], 
    'do_location_id': [ch.non_null], 
    'passengers': [ch.non_null], 
    'trip_distance': [ch.non_null, ch.non_negative], 
    # 'fhvhv_license': [ch.non_null], 
}

In [8]:
# Read the raw TLC trip records into one DataFrame via the project helper.
# NOTE(review): which files are stacked is decided inside
# jh.read_stacked_tlc_df (only the Spark session is passed in) -- see
# helpers/join_helpers for the details.
tlc_df = jh.read_stacked_tlc_df(spark)

In [9]:
if INTERMEDIATE_OUTPUTS:
    # sanity check of the stacked raw data; printed explicitly because a bare
    # expression nested inside `if` is never auto-displayed by Jupyter
    tlc_df.limit(5).show()

In [10]:
# Derive two helper columns used to filter out invalid trips:
#   hours_elapsed - drop-off minus pickup time, converted from seconds to hours
#   mph           - average speed over the trip (distance / hours_elapsed)
SECONDS_TO_HOURS = 1 / (60*60)
tlc_df = (
    tlc_df
    .withColumn(
        'hours_elapsed',
        (
            F.col('tpep_dropoff_datetime').cast('long')
            - F.col('tpep_pickup_datetime').cast('long')
        ) * SECONDS_TO_HOURS
    )
    .withColumn('mph', F.col('trip_distance') / F.col('hours_elapsed'))
)

In [11]:
# Optional check that hours_elapsed and mph were derived as expected.
if INTERMEDIATE_OUTPUTS:
    # explicit .show(): a bare expression inside `if` is never displayed
    tlc_df.limit(5).show()

In [12]:
# Speed-limit references:
# https://ypdcrime.com/vt/article30.php?zoom_highlight=fifty+five+miles+per+hour#t1180-a.
# https://www.dot.ny.gov/divisions/operating/oom/transportation-systems/repository/TSMI-17-05.pdf
# The NYS maximum speed limit is 65 mph, so a trip whose *average* speed is
# above that is treated as a recording error and dropped.
MAX_LEGAL_MPH = 65

tlc_df = tlc_df.where(
    # mph is null when it could not be computed (missing distance/timestamps;
    # presumably also a zero-length duration under Spark's default settings)
    F.col('mph').isNotNull()
    # a drop-off at or before the pickup gives mph <= 0, which would
    # otherwise pass the upper-bound test below
    & (F.col('hours_elapsed') > 0)
    & (F.col('mph') <= MAX_LEGAL_MPH)
)

In [13]:
# Optional check: the five longest trips left after the speed filter.
if INTERMEDIATE_OUTPUTS:
    # this one is time intensive (full sort of the trip data);
    # explicit .show() because a bare expression inside `if` is never displayed
    tlc_df.sort('trip_distance', ascending = False).limit(5).show()

In [14]:
# Run the shared cleaning routine on the trips.
# NOTE(review): presumably this keeps/renames the TLC_KEEP_COLUMNS, applies
# the TLC_CLEAN_COLUMNS filters, and attaches the CDC-week fields from
# mmwr_weeks_df (later cells rely on week_index, week_year, week_month and
# timeline existing) -- confirm in helpers/cleaning_helpers.
tlc_df = ch.perform_cleaning(tlc_df, mmwr_weeks_df, TLC_KEEP_COLUMNS, 
    TLC_CLEAN_COLUMNS)

In [15]:
# Next, resolve the borough of each trip's pickup ('pu') and drop-off ('do')
# zone using the zones table. Per the original intent this is also where
# trips that do not start and/or end within the 5 boroughs are filtered out.
# NOTE(review): that filtering happens inside ch.extract_borough_name --
# confirm in helpers/cleaning_helpers.
tlc_df = ch.extract_borough_name(tlc_df, zones_df,  'pu')
tlc_df = ch.extract_borough_name(tlc_df, zones_df,  'do')

if INTERMEDIATE_OUTPUTS:
    # explicit .show(): a bare expression inside `if` is never displayed
    tlc_df.sort('trip_distance', ascending=False).limit(5).show()

In [16]:
# Optional check: number of trips remaining at this point.
if INTERMEDIATE_OUTPUTS:
    # print the result; otherwise the count job runs and its value is discarded
    print(tlc_df.count())

In [17]:
# Optional check: rows with the highest week_index values.
if INTERMEDIATE_OUTPUTS:
    # explicit .show(): a bare expression inside `if` is never displayed
    tlc_df.sort('week_index', ascending = False).limit(10).show()

In [18]:
# Optional check: the five longest trips after cleaning.
if INTERMEDIATE_OUTPUTS:
    # explicit .show(): a bare expression inside `if` is never displayed
    tlc_df.sort('trip_distance', ascending=False).limit(5).show()

In [19]:
# Drop every trip that falls outside the time spans covered by the timelines,
# i.e. rows whose `timeline` label is 'neither'.
# (the transitionary time (2019) is kept on purpose for time-series analysis)
tlc_df = tlc_df.filter(F.col('timeline') != 'neither')

In [20]:
# Save the cleaned trips as parquet, partitioned by week_year and week_month,
# replacing any previous output (this will take a while).
# The frame is sorted on the partition keys before writing.
tlc_df = tlc_df.sort('week_year', 'week_month')
(
    tlc_df.write
    .mode('overwrite')
    .partitionBy('week_year', 'week_month')
    .parquet('../data/curated/tlc/cleaned/yellow')
)

                                                                                

### 2. Cleaning the COVID dataset

In [21]:
# Read in the raw COVID dataset (daily counts; first CSV row holds the names).
# As the preview shows, the file is very wide: city-wide totals followed by
# the same set of counts repeated per borough prefix (BX_, BK_, MN_, QN_, SI_),
# plus an INCOMPLETE flag column.
covid_df = spark.read.csv('../data/raw/virals/covid/cases-by-day.csv',
    header = True)
covid_df.limit(5)

22/08/14 02:32:35 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


date_of_interest,CASE_COUNT,PROBABLE_CASE_COUNT,HOSPITALIZED_COUNT,DEATH_COUNT,PROBABLE_DEATH_COUNT,CASE_COUNT_7DAY_AVG,ALL_CASE_COUNT_7DAY_AVG,HOSP_COUNT_7DAY_AVG,DEATH_COUNT_7DAY_AVG,ALL_DEATH_COUNT_7DAY_AVG,BX_CASE_COUNT,BX_PROBABLE_CASE_COUNT,BX_HOSPITALIZED_COUNT,BX_DEATH_COUNT,BX_PROBABLE_DEATH_COUNT,BX_CASE_COUNT_7DAY_AVG,BX_PROBABLE_CASE_COUNT_7DAY_AVG,BX_ALL_CASE_COUNT_7DAY_AVG,BX_HOSPITALIZED_COUNT_7DAY_AVG,BX_DEATH_COUNT_7DAY_AVG,BX_ALL_DEATH_COUNT_7DAY_AVG,BK_CASE_COUNT,BK_PROBABLE_CASE_COUNT,BK_HOSPITALIZED_COUNT,BK_DEATH_COUNT,BK_PROBABLE_DEATH_COUNT,BK_CASE_COUNT_7DAY_AVG,BK_PROBABLE_CASE_COUNT_7DAY_AVG,BK_ALL_CASE_COUNT_7DAY_AVG,BK_HOSPITALIZED_COUNT_7DAY_AVG,BK_DEATH_COUNT_7DAY_AVG,BK_ALL_DEATH_COUNT_7DAY_AVG,MN_CASE_COUNT,MN_PROBABLE_CASE_COUNT,MN_HOSPITALIZED_COUNT,MN_DEATH_COUNT,MN_PROBABLE_DEATH_COUNT,MN_CASE_COUNT_7DAY_AVG,MN_PROBABLE_CASE_COUNT_7DAY_AVG,MN_ALL_CASE_COUNT_7DAY_AVG,MN_HOSPITALIZED_COUNT_7DAY_AVG,MN_DEATH_COUNT_7DAY_AVG,MN_ALL_DEATH_COUNT_7DAY_AVG,QN_CASE_COUNT,QN_PROBABLE_CASE_COUNT,QN_HOSPITALIZED_COUNT,QN_DEATH_COUNT,QN_PROBABLE_DEATH_COUNT,QN_CASE_COUNT_7DAY_AVG,QN_PROBABLE_CASE_COUNT_7DAY_AVG,QN_ALL_CASE_COUNT_7DAY_AVG,QN_HOSPITALIZED_COUNT_7DAY_AVG,QN_DEATH_COUNT_7DAY_AVG,QN_ALL_DEATH_COUNT_7DAY_AVG,SI_CASE_COUNT,SI_PROBABLE_CASE_COUNT,SI_HOSPITALIZED_COUNT,SI_DEATH_COUNT,SI_PROBABLE_DEATH_COUNT,SI_CASE_COUNT_7DAY_AVG,SI_PROBABLE_CASE_COUNT_7DAY_AVG,SI_ALL_CASE_COUNT_7DAY_AVG,SI_HOSPITALIZED_COUNT_7DAY_AVG,SI_DEATH_COUNT_7DAY_AVG,SI_ALL_DEATH_COUNT_7DAY_AVG,INCOMPLETE
02/29/2020,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
03/01/2020,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
03/02/2020,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
03/03/2020,1,0,7,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
03/04/2020,5,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [22]:
# Sum the INCOMPLETE flags to ensure there are no incomplete rows
# (a total of 0 means every day's figures are complete).
if INTERMEDIATE_OUTPUTS:
    # Python's builtin sum() cannot iterate a Spark DataFrame, so aggregate in
    # Spark. The CSV was read without a schema, hence the explicit cast.
    covid_df.agg(
        F.sum(F.col('INCOMPLETE').cast('int')).alias('incomplete_total')
    ).show()

In [23]:
# Raw COVID column -> cleaned column name. Only the date is listed here; the
# per-borough count columns are added by the loop at the end of this cell.
COVID_KEEP_COLUMNS = {'date_of_interest': 'date'}

# Default cleaning rule for the COVID columns: values must be non-negative.
# NOTE(review): the other *_CLEAN_COLUMNS dicts in this notebook map a column
# to a *list* of filters, whereas this factory returns the bare function, and
# a defaultdict only produces its default on a `[...]` lookup of a missing
# key (never when iterated). Confirm how ch.perform_cleaning consumes this.
COVID_CLEAN_COLUMNS = defaultdict(lambda: ch.non_negative)

# Raw column prefix -> borough name (used as the prefix of the renamed column)
COVID_BOROUGHS = {
    'BX_': 'Bronx',
    'BK_': 'Brooklyn',
    'MN_': 'Manhattan',
    'QN_': 'Queens',
    'SI_': 'Staten Island',
}

# Raw count suffix -> short name (used as the suffix of the renamed column)
COVID_COUNTS = {
    'CASE_COUNT': 'cases',
    'DEATH_COUNT': 'deaths',
    'HOSPITALIZED_COUNT': 'hospitalised',
}

# Register every borough/count combination,
# e.g. 'BX_CASE_COUNT' -> 'Bronxcases'.
for (prefix, new_prefix), (suffix, new_suffix) in product(
        COVID_BOROUGHS.items(), COVID_COUNTS.items()):
    COVID_KEEP_COLUMNS[prefix + suffix] = new_prefix + new_suffix

In [24]:
# Run the shared cleaning routine on the COVID data.
# NOTE(review): presumably this keeps/renames the COVID_KEEP_COLUMNS, applies
# COVID_CLEAN_COLUMNS, and attaches the CDC-week fields from mmwr_weeks_df
# (the next cell selects week_ending, week_year, week_month, week_index and
# timeline) -- confirm in helpers/cleaning_helpers.
covid_df = ch.perform_cleaning(covid_df, mmwr_weeks_df, COVID_KEEP_COLUMNS, 
    COVID_CLEAN_COLUMNS)

In [26]:
from itertools import product

# Reshape the COVID data from wide to long. The data here is very wide (one
# column per borough per count); a single 'borough' column is preferable for
# homogeneity with the rest of the data.

# date-related columns carried over unchanged into every borough's slice
COVID_DATE_COLUMNS = [
    F.col('date'),
    F.col('week_ending'),
    F.col('week_year'),
    F.col('week_month'),
    F.col('week_index'),
    F.col('timeline')
]

temp_df = None
for prefix in COVID_BOROUGHS.values():
    # strip the borough name off each count column, e.g. 'Bronxcases' -> 'cases'
    borough_columns = [
        F.col(f'{prefix}{suffix}').alias(suffix)
        for suffix in COVID_COUNTS.values()
    ]

    # this borough's slice, tagged with the borough name
    borough_df = covid_df.select(COVID_DATE_COLUMNS + borough_columns)\
        .withColumn('borough', F.lit(prefix))

    # stack the slices; `is None` is an identity test, whereas `== None`
    # depends on how the left-hand object implements equality
    temp_df = borough_df if temp_df is None else temp_df.union(borough_df)

covid_df = temp_df

In [27]:
# Optional check: earliest rows of the reshaped (long) COVID data.
if INTERMEDIATE_OUTPUTS:
    # explicit .show(): a bare expression inside `if` is never displayed
    covid_df.sort('week_index', 'date').limit(5).show()

In [28]:
# Save the cleaned covid data as parquet, replacing any previous output at
# this path.
covid_df.write.mode('overwrite').parquet('../data/curated/virals/covid/cleaned/cases-by-day')

### 3. Cleaning the flu dataset

In [29]:
# Read in the raw flu dataset (weekly counts; first CSV row holds the names).
# As the preview shows, it has one row per season / region / county /
# CDC week / disease, and covers counties outside NYC too.
flu_df = spark.read.csv('../data/raw/virals/flu/cases-by-week.csv',
    header=True)
flu_df.limit(5)

Season,Region,County,CDC Week,Week Ending Date,Disease,Count,County Centroid,FIPS
2012-2013,NYC,RICHMOND,10,03/09/2013,INFLUENZA_A,0,"(40.5795, -74.1502)",36085
2011-2012,CAPITAL DISTRICT,ALBANY,10,03/10/2012,INFLUENZA_UNSPECI...,0,"(42.5882713, -73....",36001
2009-2010,CAPITAL DISTRICT,SCHENECTADY,41,10/17/2009,INFLUENZA_UNSPECI...,0,"(42.8175421, -74....",36093
2010-2011,WESTERN,CHAUTAUQUA,19,05/14/2011,INFLUENZA_B,0,"(42.3042159, -79....",36013
2013-2014,METRO,DUTCHESS,44,11/02/2013,INFLUENZA_A,0,"(41.7550085, -73....",36027


In [30]:
# Raw flu column -> cleaned column name. The week-ending date serves as the
# row's date, and the county becomes the 'borough' column.
FLU_KEEP_COLUMNS = {
    'Week Ending Date': 'date',
    'Region': 'region',
    'County': 'borough',
    'Disease': 'disease',
    'Count': 'cases',
}
# Cleaned column -> list of filter builders applied to it (empty = no filter).
# The 'region' entry ignores its argument and always builds the condition
# region == 'NYC', i.e. only New York City rows are meant to be kept.
FLU_CLEAN_COLUMNS = {
    'date': [],
    'region': [lambda _: F.col('region') == 'NYC'],
    'borough': [],
    'disease': [],
    'cases': [ch.non_negative]
}

In [31]:
# Run the shared cleaning routine on the flu data (the annotation records that
# the helper hands back a Spark DataFrame).
# NOTE(review): presumably this keeps/renames the FLU_KEEP_COLUMNS, applies
# the FLU_CLEAN_COLUMNS filters, and attaches the CDC-week fields from
# mmwr_weeks_df -- confirm in helpers/cleaning_helpers.
flu_df:DataFrame = ch.perform_cleaning(flu_df, mmwr_weeks_df, FLU_KEEP_COLUMNS, 
    FLU_CLEAN_COLUMNS)

In [32]:
if INTERMEDIATE_OUTPUTS:
    # get the list of distinct counties (column now called 'borough');
    # explicit .show() because a bare expression inside `if` is never displayed
    flu_df.select('borough').distinct().limit(5).show()

In [33]:
# County name as it appears in the flu data -> the matching NYC borough name
# (the same borough spellings used for the COVID data).
# from: https://portal.311.nyc.gov/article/?kanumber=KA-02877
FLU_COUNTY_TO_BOROUGH = {
    'BRONX': 'Bronx',
    'KINGS': 'Brooklyn',
    'NEW YORK': 'Manhattan',
    'QUEENS': 'Queens',
    'RICHMOND': 'Staten Island'
}

In [34]:
# Apply the county -> borough mapping to the 'borough' column of the flu df
# (the replacement itself is done by the project helper).
flu_df = ch.replace_column_using_dict(flu_df, 'borough', FLU_COUNTY_TO_BOROUGH)

# Also remove the region column: it was only needed for the region == 'NYC'
# cleaning filter. DataFrame.drop keeps every other column in its existing order.
flu_df = flu_df.drop('region')

In [35]:
# Optional check: sample of the final flu data before it is saved.
if INTERMEDIATE_OUTPUTS:
    # explicit .show(): a bare expression inside `if` is never displayed
    flu_df.limit(5).show()

In [36]:
# Save the cleaned flu data as parquet, replacing any previous output at
# this path.
flu_df.write.mode('overwrite').parquet('../data/curated/virals/flu/cleaned/cases-by-week')