### MAST30034: Applied Data Science Project 1
---
# Preprocessing Part 2: Cleaning The Data
#### Xavier Travers (1178369)

Cleaning the datasets of null and inconsistent values.
This is performed on the TLC data and COVID data.

In [1]:
# imports used throughout this notebook
from pyspark.sql import DataFrame, Column
from pyspark.sql.types import IntegerType
from pyspark.sql import functions as F
import os

# When True, the cleaning cells print row counts before and after filtering.
# Each count is an extra Spark action, so leave this off for normal runs.
DEBUGGING = False

In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) the spark session which will run the spark jobs
session_builder = SparkSession.builder.appName('MAST30034 XT Project 1')
# render dataframes directly when they are the last expression of a cell
session_builder = session_builder.config('spark.sql.repl.eagerEval.enabled', True)
session_builder = session_builder.config('spark.sql.parquet.cacheMetadata', 'true')
spark = session_builder.getOrCreate()

22/07/31 00:59:11 WARN Utils: Your hostname, Polaris resolves to a loopback address: 127.0.1.1; using 192.168.153.180 instead (on interface eth0)
22/07/31 00:59:11 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/07/31 00:59:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Useful Functions

These helper functions build the column filters used to clean the different datasets.

In [3]:
def non_null(colname: str) -> Column:
    """ Builds the filter which keeps rows where a column is not null

    Args:
        colname (str): name of the column to check

    Returns:
        `Column`: boolean column expression usable in `where`/`filter`
    """
    target = F.col(colname)
    return target.isNotNull()

In [4]:
def non_negative(colname: str) -> Column:
    """ Builds the filter which keeps rows where a column is zero or greater

    Args:
        colname (str): name of the column to check

    Returns:
        `Column`: boolean column expression usable in `where`/`filter`
    """
    target = F.col(colname)
    return target >= 0

In [5]:
def strictly_positive(colname: str) -> Column:
    """ Builds the filter which keeps rows where a column is greater than zero

    Args:
        colname (str): name of the column to check

    Returns:
        `Column`: boolean column expression usable in `where`/`filter`
    """
    target = F.col(colname)
    return target > 0

### 1. Cleaning the TLC datasets

This involves removing null values where necessary.

In [6]:
# the TLC dataset types (sub-folders of the tlc data folder) to be cleaned
TLC_NAMES = [
    'green',
    'yellow',
    'fhvhv',
]

In [7]:
# map each column to keep onto the list of filter builders applied to it
# (columns which a given dataset does not have are ignored when cleaning)
TLC_CLEAN_COLS_DICT = dict(
    PULocationID=[non_null],
    DOLocationID=[non_null],
    shared_request_flag=[non_null],
    trip_miles=[non_null, strictly_positive],
    hvfhs_license_num=[non_null],
    # no filters: null passenger counts are kept for now
    passenger_count=[],
    trip_distance=[non_null, strictly_positive],
)

In [8]:
# Clean every downloaded TLC file: keep only the wanted columns, apply each
# column's filters from TLC_CLEAN_COLS_DICT, then write the result to the
# matching path under ../data/curated/tlc.
for name in TLC_NAMES:
    raw_dir = f'../data/raw/tlc/{name}'

    # iterate through the downloaded files per taxi type in a stable order
    for filename in sorted(os.listdir(raw_dir)):
        # skip hidden entries (e.g. '.DS_Store') which are not datasets
        if filename.startswith('.'):
            continue

        # read the parquet in
        tlc_df = spark.read.parquet(f'{raw_dir}/{filename}')

        # extract the possible wanted columns for this dataset, preserving the
        # source column order (a set would make the output order arbitrary)
        wanted_cols = [
            colname for colname in tlc_df.columns
            if colname in TLC_CLEAN_COLS_DICT
        ]
        tlc_df = tlc_df.select(wanted_cols)

        # debug info
        if DEBUGGING:
            print(f'=== CLEANING "{name}/{filename}"')
            print(f'STARTING WITH {tlc_df.count()} ROWS')

        # iterate through columns and perform necessary cleaning processes
        for colname in wanted_cols:
            for col_filter in TLC_CLEAN_COLS_DICT[colname]:
                tlc_df = tlc_df.where(col_filter(colname))

        if DEBUGGING:
            print(f'REDUCED TO {tlc_df.count()} ROWS')

        # write to file system
        tlc_df.write.mode('overwrite').parquet(f'../data/curated/tlc/{name}/{filename}')

                                                                                

### 2. Cleaning the COVID dataset

In [9]:
# load the raw COVID cases-by-day parquet into a dataframe
COVID_RAW_PATH = '../data/raw/covid/cases-by-day'
covid_df = spark.read.parquet(COVID_RAW_PATH)

In [10]:
# preview the first 10 rows of the raw COVID data
covid_df.limit(10)

22/07/31 01:04:39 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


date_of_interest,CASE_COUNT,PROBABLE_CASE_COUNT,HOSPITALIZED_COUNT,DEATH_COUNT,PROBABLE_DEATH_COUNT,CASE_COUNT_7DAY_AVG,ALL_CASE_COUNT_7DAY_AVG,HOSP_COUNT_7DAY_AVG,DEATH_COUNT_7DAY_AVG,ALL_DEATH_COUNT_7DAY_AVG,BX_CASE_COUNT,BX_PROBABLE_CASE_COUNT,BX_HOSPITALIZED_COUNT,BX_DEATH_COUNT,BX_PROBABLE_DEATH_COUNT,BX_CASE_COUNT_7DAY_AVG,BX_PROBABLE_CASE_COUNT_7DAY_AVG,BX_ALL_CASE_COUNT_7DAY_AVG,BX_HOSPITALIZED_COUNT_7DAY_AVG,BX_DEATH_COUNT_7DAY_AVG,BX_ALL_DEATH_COUNT_7DAY_AVG,BK_CASE_COUNT,BK_PROBABLE_CASE_COUNT,BK_HOSPITALIZED_COUNT,BK_DEATH_COUNT,BK_PROBABLE_DEATH_COUNT,BK_CASE_COUNT_7DAY_AVG,BK_PROBABLE_CASE_COUNT_7DAY_AVG,BK_ALL_CASE_COUNT_7DAY_AVG,BK_HOSPITALIZED_COUNT_7DAY_AVG,BK_DEATH_COUNT_7DAY_AVG,BK_ALL_DEATH_COUNT_7DAY_AVG,MN_CASE_COUNT,MN_PROBABLE_CASE_COUNT,MN_HOSPITALIZED_COUNT,MN_DEATH_COUNT,MN_PROBABLE_DEATH_COUNT,MN_CASE_COUNT_7DAY_AVG,MN_PROBABLE_CASE_COUNT_7DAY_AVG,MN_ALL_CASE_COUNT_7DAY_AVG,MN_HOSPITALIZED_COUNT_7DAY_AVG,MN_DEATH_COUNT_7DAY_AVG,MN_ALL_DEATH_COUNT_7DAY_AVG,QN_CASE_COUNT,QN_PROBABLE_CASE_COUNT,QN_HOSPITALIZED_COUNT,QN_DEATH_COUNT,QN_PROBABLE_DEATH_COUNT,QN_CASE_COUNT_7DAY_AVG,QN_PROBABLE_CASE_COUNT_7DAY_AVG,QN_ALL_CASE_COUNT_7DAY_AVG,QN_HOSPITALIZED_COUNT_7DAY_AVG,QN_DEATH_COUNT_7DAY_AVG,QN_ALL_DEATH_COUNT_7DAY_AVG,SI_CASE_COUNT,SI_PROBABLE_CASE_COUNT,SI_HOSPITALIZED_COUNT,SI_DEATH_COUNT,SI_PROBABLE_DEATH_COUNT,SI_CASE_COUNT_7DAY_AVG,SI_PROBABLE_CASE_COUNT_7DAY_AVG,SI_ALL_CASE_COUNT_7DAY_AVG,SI_HOSPITALIZED_COUNT_7DAY_AVG,SI_DEATH_COUNT_7DAY_AVG,SI_ALL_DEATH_COUNT_7DAY_AVG,INCOMPLETE
02/29/2020,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
03/01/2020,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
03/02/2020,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
03/03/2020,1,0,7,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
03/04/2020,5,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
03/05/2020,3,0,14,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,3,0,3,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
03/06/2020,8,0,8,0,0,3,3,5,0,0,2,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,1,2,0,0,3,0,3,0,0,1,0,1,1,0,0,1,0,2,0,0,1,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
03/07/2020,7,0,8,0,0,3,3,6,0,0,0,0,1,0,0,0,0,0,1,0,0,2,0,3,0,0,1,0,1,2,0,0,1,0,0,0,0,1,0,1,1,0,0,3,0,4,0,0,1,0,1,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0
03/08/2020,21,0,18,0,0,6,6,8,0,0,3,0,5,0,0,1,0,1,1,0,0,5,0,8,0,0,2,0,2,3,0,0,6,0,1,0,0,2,0,2,2,0,0,6,0,4,0,0,2,0,2,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0
03/09/2020,57,0,37,0,0,15,15,13,0,0,4,0,7,0,0,1,0,1,2,0,0,16,0,11,0,0,4,0,4,4,0,0,24,0,10,0,0,5,0,5,3,0,0,10,0,8,0,0,3,0,3,3,0,0,3,0,2,0,0,1,0,1,1,0,0,0


In [11]:
# Clean the COVID data: drop null rows, cast every count column to integer,
# drop negative counts, and derive integer 'year' and 'month' columns from the
# 'MM/dd/yyyy' date string. The result is saved to the curated data folder.
if DEBUGGING:
    print(covid_df.count())

DATE_COL = 'date_of_interest'
DERIVED_COLS = ['year', 'month']

# iterate through columns and clean them
for colname in covid_df.columns:

    # remove null values (there actually aren't any)
    covid_df = covid_df.where(non_null(colname))

    if colname != DATE_COL:
        # convert all values to integers and remove negative values
        covid_df = covid_df.withColumn(colname, F.col(colname).cast(IntegerType()))
        covid_df = covid_df.where(non_negative(colname))
    else:
        # extract the year and month from the date
        # (F.substring takes a 1-based start position and a length, so these
        # pick 'yyyy' and 'MM' out of 'MM/dd/yyyy')
        covid_df = covid_df.withColumn(
            'year', F.substring(colname, 7, 4).cast(IntegerType()))
        covid_df = covid_df.withColumn(
            'month', F.substring(colname, 1, 2).cast(IntegerType()))

if DEBUGGING:
    print(covid_df.count())

# rearrange the columns to move year and month to the left; ordering by name
# (rather than by position) keeps this cell safe to re-run
remaining_cols = [c for c in covid_df.columns if c not in DERIVED_COLS]
covid_df = covid_df.select(DERIVED_COLS + remaining_cols)

# save the cleaned covid data
covid_df.write.mode('overwrite').parquet('../data/curated/covid/cases-by-day')

# preview the cleaned data (must be the last expression to be rendered)
covid_df.limit(5)