### MAST30034: Applied Data Science Project 1
---
# Preprocessing Part 1: Cleaning The Data
#### Xavier Travers (1178369)

Cleaning the datasets of null, inconsistent, or unnecessary values.
This is performed on the TLC data and COVID data.

In [1]:
# imports used throughout this notebook
from collections import defaultdict
from itertools import product
import os
import sys
from pyspark.sql import DataFrame, Column
from pyspark.sql.types import IntegerType
from pyspark.sql import functions as F
import geopandas

# add homemade helpers
sys.path.insert(1, '../scripts')
import helpers.cleaning_helpers as ch

# for printouts
DEBUGGING = False

In [2]:
from pyspark.sql import SparkSession

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName('MAST30034 XT Project 1')
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config('spark.sql.repl.eagerEval.enabled', True) 
    .config('spark.sql.parquet.cacheMetadata', 'true')
    .getOrCreate()
)

22/08/11 15:19:22 WARN Utils: Your hostname, Ganymede resolves to a loopback address: 127.0.1.1; using 172.29.200.206 instead (on interface eth0)
22/08/11 15:19:22 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/11 15:19:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# import the cdc week file to convert all dates to cdc weeks now
mmwr_weeks_df = spark.read.parquet('../data/raw/virals/mmwr_weeks.parquet')
mmwr_weeks_df.limit(5)

                                                                                

year,month,day,cdc_week,week_index,us_format
2017,12,31,1,1,12/31/2017
2018,1,1,1,1,01/01/2018
2018,1,2,1,1,01/02/2018
2018,1,3,1,1,01/03/2018
2018,1,4,1,1,01/04/2018


### 1. Cleaning the TLC dataset(s)

In [5]:
example_df = spark.read.parquet('../data/raw/tlc/yellow/2019-07.parquet/')
example_df.limit(5)
# TODO: commenting

                                                                                

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
2,2019-07-01 00:51:04,2019-07-01 00:51:33,1.0,0.0,1.0,N,193,193,1,2.5,0.5,0.5,1.14,0.0,0.3,4.94,0.0,
2,2019-07-01 00:46:04,2019-07-01 01:05:46,1.0,4.16,1.0,N,234,25,2,16.5,0.5,0.5,0.0,0.0,0.3,20.3,2.5,
1,2019-07-01 00:25:09,2019-07-01 01:00:56,1.0,18.8,2.0,N,132,42,1,52.0,0.0,0.5,11.75,6.12,0.3,70.67,0.0,
2,2019-07-01 00:33:32,2019-07-01 01:15:27,1.0,18.46,2.0,N,132,142,1,52.0,0.0,0.5,11.06,0.0,0.3,66.36,2.5,
1,2019-07-01 00:00:55,2019-07-01 00:13:05,0.0,1.7,1.0,N,107,114,1,9.5,3.0,0.5,2.0,0.0,0.3,15.3,2.5,


In [6]:
example_df.sort('trip_distance', ascending = False).limit(5)

                                                                                

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
2,2019-07-29 09:46:42,2019-07-29 15:12:31,1.0,311.56,4.0,N,68,265,2,1574.0,0.0,0.5,0.0,10.5,0.3,1587.8,2.5,
1,2019-07-17 13:42:23,2019-07-17 14:15:25,1.0,307.5,1.0,N,161,138,1,28.5,2.5,0.5,5.0,0.0,0.3,36.8,2.5,
2,2019-07-03 16:13:11,2019-07-03 20:09:21,2.0,180.09,5.0,N,93,265,1,400.0,0.0,0.0,0.0,57.12,0.3,457.42,0.0,
2,2019-07-19 07:01:46,2019-07-19 10:50:56,2.0,169.47,4.0,N,43,265,2,794.5,0.0,0.5,0.0,12.5,0.3,807.8,0.0,
2,2019-07-13 05:40:49,2019-07-13 08:32:15,4.0,168.44,4.0,N,132,265,2,796.5,0.5,0.5,0.0,0.0,0.3,797.8,0.0,


In [7]:
# names of the tlc datasets to clean 
# (I was originally planning on working on fhvhv and green as well)
TLC_NAMES = ['yellow']

# dictionary to rename all the columns I want to keep
TLC_KEEP_COLUMNS = {
    'tpep_pickup_datetime': 'date',
    'passenger_count': 'passengers',
    'trip_distance': 'trip_distance',
    'PULocationID': 'pu_location_id',
    'DOLocationID': 'do_location_id',
    # below only apply to fhvhv
    # 'hvfhs_license_num': 'fhvhv_license',
    # 'pickup_datetime': 'date',
    # 'trip_miles': 'trip_distance',
    # 'shared_request_flag': 'shared'
}

# create a dictionary of the columns to keep and the required filters
TLC_CLEAN_COLUMNS = {
    'pu_location_id': [ch.non_null], 
    'do_location_id': [ch.non_null], 
    'passengers': [ch.non_null], 
    'trip_distance': [ch.non_null, ch.strictly_positive], 
    # 'fhvhv_license': [ch.non_null], 
}

In [8]:
# iterate through the TLC names/types (~5-10 mins)
# TODO: commenting
stacked_tlc_df = None
for name in TLC_NAMES:
    # iterate through the downloaded files per taxi type
    for filename in os.listdir(f'../data/raw/tlc/{name}'):

        # read the parquet in
        tlc_df = spark.read.parquet(f'../data/raw/tlc/{name}/{filename}')

        # debug info
        print(f'=== CLEANING "{name}/{filename}"')
    
        if DEBUGGING:
            print(f'STARTING WITH {tlc_df.count()} ROWS')

        tlc_df = ch.perform_cleaning(tlc_df, mmwr_weeks_df, TLC_KEEP_COLUMNS, 
            TLC_CLEAN_COLUMNS)

        if stacked_tlc_df == None:
            stacked_tlc_df = tlc_df
        else:
            stacked_tlc_df = stacked_tlc_df.union(tlc_df)

        if DEBUGGING:
            print(f'REDUCED TO {tlc_df.count()} ROWS')
        
        # write to file system
        # tlc_df.write.mode('overwrite')\
        #     .parquet(f'../data/curated/tlc/cleaned/{name}/{filename}')


# get the count of the elements
count_rows = stacked_tlc_df.count()
print(count_rows)

# remove the top and bottom 5% of values by trip distance (removes outliers)
stacked_tlc_df:DataFrame = stacked_tlc_df.sort('trip_distance')
stacked_tlc_df = stacked_tlc_df.limit(int(count_rows * 0.95))
stacked_tlc_df = stacked_tlc_df.sort('trip_distance', ascending = False)
stacked_tlc_df = stacked_tlc_df.limit(int(count_rows * 0.95))

# print(stacked_tlc_df.count())
stacked_tlc_df = stacked_tlc_df.sort('year', 'month')
stacked_tlc_df.write\
    .partitionBy('year', 'month')\
    .mode('overwrite')\
    .parquet(f'../data/curated/tlc/cleaned/{name}.parquet')

=== CLEANING "yellow/2018-01.parquet"
=== CLEANING "yellow/2018-02.parquet"
=== CLEANING "yellow/2018-03.parquet"
=== CLEANING "yellow/2018-04.parquet"
=== CLEANING "yellow/2018-05.parquet"
=== CLEANING "yellow/2018-06.parquet"
=== CLEANING "yellow/2018-07.parquet"
=== CLEANING "yellow/2018-08.parquet"
=== CLEANING "yellow/2018-09.parquet"
=== CLEANING "yellow/2018-10.parquet"
=== CLEANING "yellow/2018-11.parquet"
=== CLEANING "yellow/2018-12.parquet"
=== CLEANING "yellow/2019-01.parquet"
=== CLEANING "yellow/2019-02.parquet"
=== CLEANING "yellow/2019-03.parquet"
=== CLEANING "yellow/2019-04.parquet"
=== CLEANING "yellow/2019-05.parquet"
=== CLEANING "yellow/2019-06.parquet"
=== CLEANING "yellow/2019-07.parquet"
=== CLEANING "yellow/2019-08.parquet"
=== CLEANING "yellow/2019-09.parquet"
=== CLEANING "yellow/2019-10.parquet"
=== CLEANING "yellow/2019-11.parquet"
=== CLEANING "yellow/2019-12.parquet"
=== CLEANING "yellow/2020-01.parquet"
=== CLEANING "yellow/2020-02.parquet"
=== CLEANING

                                                                                

238084819
22/08/11 15:22:58 WARN DAGScheduler: Broadcasting large task binary with size 1132.6 KiB


[Stage 71:=====>                                                 (37 + 8) / 348]

22/08/11 15:23:00 ERROR Executor: Exception in task 3.0 in stage 71.0 (TID 444)
java.lang.OutOfMemoryError: Java heap space
	at org.sparkproject.guava.collect.Ordering.leastOf(Ordering.java:657)
	at org.apache.spark.util.collection.Utils$.takeOrdered(Utils.scala:37)
	at org.apache.spark.sql.execution.TakeOrderedAndProjectExec.$anonfun$doExecute$4(limit.scala:231)
	at org.apache.spark.sql.execution.TakeOrderedAndProjectExec$$Lambda$3066/2141549804.apply(Unknown Source)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:855)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:855)
	at org.apache.spark.rdd.RDD$$Lambda$1524/83199108.apply(Unknown Source)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.comp



22/08/11 15:23:01 ERROR Executor: Exception in task 11.0 in stage 71.0 (TID 452)
java.lang.OutOfMemoryError: Java heap space
	at org.sparkproject.guava.collect.Ordering.leastOf(Ordering.java:657)
	at org.apache.spark.util.collection.Utils$.takeOrdered(Utils.scala:37)
	at org.apache.spark.sql.execution.TakeOrderedAndProjectExec.$anonfun$doExecute$4(limit.scala:231)
	at org.apache.spark.sql.execution.TakeOrderedAndProjectExec$$Lambda$3066/2141549804.apply(Unknown Source)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:855)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:855)
	at org.apache.spark.rdd.RDD$$Lambda$1524/83199108.apply(Unknown Source)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.com

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/digitaldata/.local/lib/python3.8/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/digitaldata/.local/lib/python3.8/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/digitaldata/.local/lib/python3.8/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/digitaldata/.local/lib/python3.8/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Ans

Py4JError: An error occurred while calling o5052.parquet

### 2. Cleaning the COVID dataset

In [None]:
# read in the covid dataset
covid_df = spark.read.csv('../data/raw/virals/covid/cases-by-day.csv',
    header = True)
covid_df.limit(5)
# TODO: commenting

In [None]:
# sum the number of incomplete datasets (ensure no incomplete values)
sum(covid_df.select('INCOMPLETE'))
# TODO: commenting

In [None]:
# TODO: commenting on covid cleaning
COVID_KEEP_COLUMNS = {
    'date_of_interest':'date'
}

COVID_CLEAN_COLUMNS = defaultdict(lambda: ch.non_negative)

COVID_BOROUGHS = {
    '': 'Overall',
    'BX_':'Bronx',
    'BK_':'Brooklyn',
    'MN_':'Manhattan',
    'QN_':'Queens',
    'SI_':'Staten Island',
}

COVID_COUNTS = {
    'CASE_COUNT': 'cases', 
    'DEATH_COUNT': 'deaths', 
    'HOSPITALIZED_COUNT': 'hospitalised'
}
# TODO: commenting
for prefix, new_prefix in COVID_BOROUGHS.items():
    for suffix, new_suffix in COVID_COUNTS.items():
        COVID_KEEP_COLUMNS[f'{prefix}{suffix}'] = f'{new_prefix}{new_suffix}'

In [None]:
covid_df = ch.perform_cleaning(covid_df, mmwr_weeks_df, COVID_KEEP_COLUMNS, 
    COVID_CLEAN_COLUMNS)
# TODO: commenting

In [None]:
from itertools import product

temp_df = None
# TODO: commenting
COVID_DATE_COLUMNS = [
    F.col('date'), F.col('year'), F.col('cdc_week'), F.col('week_index'),
]

# The data here is very wide, I'd rather just have a 'borough' column
# for homogeneity of all the data
for prefix in COVID_BOROUGHS.values():
    borough_columns = []
    for suffix in COVID_COUNTS.values():
        borough_columns.append(F.col(f'{prefix}{suffix}').alias(suffix))

    if temp_df == None:
        temp_df = covid_df.select(COVID_DATE_COLUMNS + borough_columns)\
            .withColumn('borough', F.lit(prefix))
    else:
        temp_df = temp_df\
            .union(
                covid_df.select(COVID_DATE_COLUMNS + borough_columns)\
                    .withColumn('borough', F.lit(prefix))
            )
    
covid_df = temp_df

In [None]:
covid_df.sort('week_index', 'date').limit(5)
# TODO: commenting

In [None]:
# save the cleaned covid data
# TODO: commenting
covid_df.write.mode('overwrite').parquet('../data/curated/virals/covid/cases-by-day')

### 3. Cleaning the flu dataset

In [None]:
# read in the flu dataset
# TODO: commenting
flu_df = spark.read.csv('../data/raw/virals/flu/cases-by-week.csv',
    header=True)
flu_df.limit(5)

In [None]:
FLU_KEEP_COLUMNS = {
    'Week Ending Date': 'date',
    'Region': 'region',
    'County': 'borough',
    'Disease': 'disease',
    'Count': 'cases',
}
# TODO: commenting
FLU_CLEAN_COLUMNS = {
    'date': [],
    'region': [lambda _: F.col('region') == 'NYC'],
    'borough': [],
    'disease': [],
    'cases': [ch.non_negative]
}

In [None]:
# TODO: commenting
flu_df:DataFrame = ch.perform_cleaning(flu_df, mmwr_weeks_df, FLU_KEEP_COLUMNS, 
    FLU_CLEAN_COLUMNS)

In [None]:
# get the list of distinct counties (column now called 'borough')
flu_df.select('borough').distinct().limit(5)

In [None]:
# map the boroughs to their proper names
# from: https://portal.311.nyc.gov/article/?kanumber=KA-02877
# also from map dict
FLU_COUNTY_TO_BOROUGH = {
    'BRONX': 'Bronx',
    'KINGS': 'Brooklyn',
    'NEW YORK': 'Manhattan',
    'QUEENS': 'Queens',
    'RICHMOND': 'Staten Island'
}

In [None]:
# apply the mapping to the flu df
flu_df = ch.replace_column_using_dict(flu_df, 'borough', FLU_COUNTY_TO_BOROUGH)

# also remove the regions column (not needed anymore)
columns_without_regions = flu_df.columns[:]
columns_without_regions.remove('region')
flu_df = flu_df.select(columns_without_regions)

In [None]:
flu_df.limit(5)

In [None]:
# save the cleaned flu data
flu_df.write.mode('overwrite').parquet('../data/curated/virals/flu/cases-by-week')